├── .github
│   └── workflows
│       ├── genlocale.yml
│       ├── pull_format.yml
│       ├── push_format.yml
│       └── unitest.yml
├── .gitignore
├── README.md
├── RVC-GUI.bat
├── config.py
├── docs
│   ├── GUI.JPG
│   └── GUI20230508.JPG
├── infer
│   ├── infer-pm-index256.py
│   ├── train-index.py
│   └── trans_weights.py
├── infer_pack
│   ├── attentions.py
│   ├── commons.py
│   ├── models.py
│   ├── models_onnx.py
│   ├── models_onnx_moess.py
│   ├── modelsv2.py
│   ├── modules.py
│   └── transforms.py
├── my_utils.py
├── requirements.txt
├── rvcgui.py
├── setup.bat
├── trainset_preprocess_pipeline_print.py
└── vc_infer_pipeline.py
/.github/workflows/genlocale.yml:
--------------------------------------------------------------------------------
1 | name: genlocale
2 | on:
3 | push:
4 | branches:
5 | - main
6 | jobs:
7 | genlocale:
8 | name: genlocale
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Check out
12 | uses: actions/checkout@master
13 |
14 | - name: Run locale generation
15 | run: |
16 | python3 extract_locale.py
17 | cd i18n && python3 locale_diff.py
18 |
19 | - name: Commit back
20 | if: ${{ !github.head_ref }}
21 | continue-on-error: true
22 | run: |
23 | git config --local user.name 'github-actions[bot]'
24 | git config --local user.email '41898282+github-actions[bot]@users.noreply.github.com'
25 | git add --all
26 | git commit -m "🎨 Sync locale"
27 |
28 | - name: Create Pull Request
29 | if: ${{ !github.head_ref }}
30 | continue-on-error: true
31 | uses: peter-evans/create-pull-request@v4
32 |
33 |
--------------------------------------------------------------------------------
/.github/workflows/pull_format.yml:
--------------------------------------------------------------------------------
1 | name: pull format
2 |
3 | on: [pull_request]
4 |
5 | permissions:
6 | contents: write
7 | jobs:
8 | pull_format:
9 | runs-on: ubuntu-latest
10 | continue-on-error: true
11 | steps:
12 | - name: checkout
13 | continue-on-error: true
14 | uses: actions/checkout@v3
15 | with:
16 | ref: ${{ github.head_ref }}
17 | fetch-depth: 0
18 |
19 |
20 | - name: Set up Python
21 | uses: actions/setup-python@v4
22 | with:
23 | python-version: "3.10"
24 |
25 | - name: Install Black
26 | run: pip install black
27 |
28 | - name: Run Black
29 | # run: black $(git ls-files '*.py')
30 | run: black .
31 |
32 | - name: Commit Back
33 | uses: stefanzweifel/git-auto-commit-action@v4
34 | with:
35 | commit_message: Apply Code Formatter Change
36 |
--------------------------------------------------------------------------------
/.github/workflows/push_format.yml:
--------------------------------------------------------------------------------
1 | name: push format
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | permissions:
9 | contents: write
10 | pull-requests: write
11 | jobs:
12 | push_format:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v3
16 | with:
17 | ref: ${{github.ref_name}}
18 |
19 | - name: Set up Python
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: "3.10"
23 |
24 | - name: Install Black
25 | run: pip install black
26 |
27 | - name: Run Black
28 | # run: black $(git ls-files '*.py')
29 | run: black .
30 |
31 | - name: Commit Back
32 | continue-on-error: true
33 | id: commitback
34 | run: |
35 | git config --local user.email "github-actions[bot]@users.noreply.github.com"
36 | git config --local user.name "github-actions[bot]"
37 | git add --all
38 | git commit -m "Format code"
39 |
40 | - name: Create Pull Request
41 | if: steps.commitback.outcome == 'success'
42 | continue-on-error: true
43 | uses: peter-evans/create-pull-request@v4
44 | with:
45 | body: Apply Code Formatter Change
46 | commit-message: Automatic code format
47 |
--------------------------------------------------------------------------------
/.github/workflows/unitest.yml:
--------------------------------------------------------------------------------
1 | name: unitest
2 | on: [ push, pull_request ]
3 | jobs:
4 | build:
5 | runs-on: ${{ matrix.os }}
6 | strategy:
7 | matrix:
8 | python-version: ["3.8", "3.9", "3.10"]
9 | os: [ubuntu-latest]
10 | fail-fast: false
11 |
12 | steps:
13 | - uses: actions/checkout@master
14 | - name: Set up Python ${{ matrix.python-version }}
15 | uses: actions/setup-python@v4
16 | with:
17 | python-version: ${{ matrix.python-version }}
18 | - name: Install dependencies
19 | run: |
20 | sudo apt update
21 | sudo apt -y install ffmpeg
22 | sudo apt -y install -qq aria2
23 | aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d ./ -o hubert_base.pt
24 | python -m pip install --upgrade pip
25 | python -m pip install --upgrade setuptools
26 | python -m pip install --upgrade wheel
27 | pip install torch torchvision torchaudio
28 | pip install -r requirements.txt
29 | - name: Test step 1 & 2
30 | run: |
31 | mkdir -p logs/mi-test
32 | touch logs/mi-test/preprocess.log
33 | python trainset_preprocess_pipeline_print.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True
34 | touch logs/mi-test/extract_f0_feature.log
35 | python extract_f0_print.py logs/mi-test $(nproc) pm
36 | python extract_feature_print.py cpu 1 0 0 logs/mi-test
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | /TEMP
4 | *.pyd
5 | hubert_base.pt
6 | /logs
7 | models/
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # RVC GUI
4 |
5 | For audio file inference only
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | ## GUI
19 |
20 | 
21 |
22 |
23 | ## Direct setup for Windows users
24 | ## [Windows-pkg](https://github.com/Tiger14n/RVC-GUI/releases/tag/Windows-pkg)
25 |
26 |
27 | ## Preparing the environment
28 |
29 |
30 | * Install Python 3.8 or newer if you have not already
31 |
32 | * Execute these commands:
33 |
34 | Windows with Nvidia cards
35 | ```bash
36 | python -m pip install -U pip setuptools wheel
37 | pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118
38 | pip install -r requirements.txt
39 | ```
40 | Other
41 | ```
42 | python -m pip install -U pip setuptools wheel
43 | pip install -U torch torchaudio
44 | pip install -r requirements.txt
45 | ```
46 |
47 | Fix for Apple Silicon Macs
48 | ```
49 | pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
50 |
51 | export PYTORCH_ENABLE_MPS_FALLBACK=1
52 | ```
53 |
54 |
55 | * Download [hubert_base.pt](https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt) and place it in the root folder
56 |
57 |
58 |
59 | * Then use this command to start RVC GUI:
60 | ```bash
61 | python rvcgui.py
62 | ```
63 | Or run this file on Windows:
64 | ```
65 | RVC-GUI.bat
66 | ```
67 |
68 | # Loading models
69 | Use the import button to import a model from a zip file.
70 | * The .zip must contain the ".pth" weight file.
71 | * The .zip should also contain the ".index" feature-retrieval file (recommended).
72 |
73 | Or place the model manually in root/models
74 | ```
75 | models
76 | ├───Person1
77 | │ ├───xxxx.pth
78 | │ ├───xxxx.index
79 | │ └───xxxx.npy
80 | └───Person2
81 | ├───xxxx.pth
82 | ├───...
83 | └───...
84 | ```
85 |
86 |
87 |
88 |
89 |
90 | ### How to get models?
91 | * Join the [AI Hub](https://discord.gg/aihub) Discord
92 | * [Community Models on HuggingFace](https://huggingface.co/QuickWick/Music-AI-Voices/tree/main) by Wicked aka QuickWick
93 |
94 |
95 |
96 | K7#4523
97 |
98 |
99 |
--------------------------------------------------------------------------------
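As a companion to the "Loading models" section of the README above, here is a minimal sketch of how models laid out under `root/models` can be discovered. It is illustrative only; `scan_models` is a hypothetical helper, and rvcgui.py's actual loading logic may differ.

```python
# Hypothetical helper (not part of the repo): pair each voice's required
# .pth weights with its optional .index feature-retrieval file.
import os, glob

def scan_models(root="models"):
    found = {}
    for person in sorted(os.listdir(root)):
        folder = os.path.join(root, person)
        if not os.path.isdir(folder):
            continue
        pth = glob.glob(os.path.join(folder, "*.pth"))
        index = glob.glob(os.path.join(folder, "*.index"))
        if pth:  # the .pth weight file is mandatory
            found[person] = {"pth": pth[0], "index": index[0] if index else None}
    return found

print(scan_models())  # e.g. {'Person1': {'pth': 'models/Person1/xxxx.pth', ...}}
```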
/RVC-GUI.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | REM Get the path of the script's directory
4 | set "scriptDir=%~dp0"
5 |
6 | REM Set the path to the Python runtime folder
7 | set "runtimeFolder=%scriptDir%runtime"
8 |
9 | REM Check if the runtime folder exists
10 |
11 |
12 | if exist "%runtimeFolder%\python.exe" (
13 | REM Runtime folder exists, so run the file using the runtime Python
14 | echo Running with the runtime Python.
15 | "runtime/python.exe" rvcgui.py --pycmd "runtime/python.exe"
16 | pause
17 | ) else (
18 | REM Runtime folder does not exist, so run the file using the system Python
19 | echo Running with the system Python.
20 | python.exe rvcgui.py --pycmd python.exe
21 | pause
22 | )
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import sys
4 | import torch
5 | from multiprocessing import cpu_count
6 |
7 |
8 | class Config:
9 | def __init__(self):
10 | self.device = "cuda:0"
11 | self.is_half = True
12 | self.n_cpu = 0
13 | self.gpu_name = None
14 | self.gpu_mem = None
15 | (
16 | self.python_cmd,
17 | self.listen_port,
18 | self.iscolab,
19 | self.noparallel,
20 | self.noautoopen,
21 | self.use_gfloat,
22 | self.paperspace,
23 | ) = self.arg_parse()
24 |
25 | if self.use_gfloat:
26 | print("Using g_float instead of g_half")
27 | self.is_half = False
28 | self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
29 |
30 | def arg_parse(self) -> tuple:
31 | parser = argparse.ArgumentParser()
32 | parser.add_argument("--port", type=int, default=7865, help="Listen port")
33 | parser.add_argument(
34 | "--pycmd", type=str, default="python", help="Python command"
35 | )
36 | parser.add_argument("--colab", action="store_true", help="Launch in colab")
37 | parser.add_argument(
38 | "--noparallel", action="store_true", help="Disable parallel processing"
39 | )
40 | parser.add_argument(
41 | "--noautoopen",
42 | action="store_true",
43 | help="Do not open in browser automatically",
44 | )
45 | parser.add_argument( # this argument (if set to false) allows windows users to avoid the "slow_conv2d_cpu not implemented for 'Half'" exception
46 | "--use_gfloat", action="store_true", help="Will use g_float instead of g_half during voice conversion."
47 | )
48 | parser.add_argument( # Fork Feature. Paperspace integration for web UI
49 | "--paperspace", action="store_true", help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems."
50 | )
51 | cmd_opts = parser.parse_args()
52 |
53 | cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
54 |
55 | return (
56 | cmd_opts.pycmd,
57 | cmd_opts.port,
58 | cmd_opts.colab,
59 | cmd_opts.noparallel,
60 | cmd_opts.noautoopen,
61 | cmd_opts.use_gfloat,
62 | cmd_opts.paperspace,
63 | )
64 |
65 | def device_config(self) -> tuple:
66 | if torch.cuda.is_available():
67 | i_device = int(self.device.split(":")[-1])
68 | self.gpu_name = torch.cuda.get_device_name(i_device)
69 | if (
70 | ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
71 | or "P40" in self.gpu_name.upper()
72 | or "1060" in self.gpu_name
73 | or "1070" in self.gpu_name
74 | or "1080" in self.gpu_name
75 | ):
76 | print("16-series/10-series GPUs and P40 are forced to single precision")
77 | self.is_half = False
78 | with open("trainset_preprocess_pipeline_print.py", "r") as f:
79 | strr = f.read().replace("3.7", "3.0")
80 | with open("trainset_preprocess_pipeline_print.py", "w") as f:
81 | f.write(strr)
82 | else:
83 | self.gpu_name = None
84 | self.gpu_mem = int(
85 | torch.cuda.get_device_properties(i_device).total_memory
86 | / 1024
87 | / 1024
88 | / 1024
89 | + 0.4
90 | )
91 | if self.gpu_mem <= 4:
92 | with open("trainset_preprocess_pipeline_print.py", "r") as f:
93 | strr = f.read().replace("3.7", "3.0")
94 | with open("trainset_preprocess_pipeline_print.py", "w") as f:
95 | f.write(strr)
96 | elif torch.backends.mps.is_available():
97 | print("No supported Nvidia cards found, using MPS for inference ")
98 | self.device = "mps"
99 | else:
100 | print("No supported Nvidia cards found, using CPU for inference")
101 | self.device = "cpu"
102 | if not self.use_gfloat: # Fork Feature: force single precision here too (with --use_gfloat, is_half was already set False in __init__)
103 | self.is_half = False
104 |
105 | if self.n_cpu == 0:
106 | self.n_cpu = cpu_count()
107 |
108 | if self.is_half:
109 | # settings for 6 GB of VRAM
110 | x_pad = 3
111 | x_query = 10
112 | x_center = 60
113 | x_max = 65
114 | else:
115 | # settings for 5 GB of VRAM
116 | x_pad = 1
117 | x_query = 6
118 | x_center = 38
119 | x_max = 41
120 |
121 | if self.gpu_mem is not None and self.gpu_mem <= 4:
122 | x_pad = 1
123 | x_query = 5
124 | x_center = 30
125 | x_max = 32
126 |
127 | return x_pad, x_query, x_center, x_max
128 |
--------------------------------------------------------------------------------
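A quick usage sketch for the Config class above (hypothetical caller, not part of the repo; only the attributes shown in config.py are assumed):

```python
# Instantiate Config and inspect the resolved device/precision settings.
from config import Config

cfg = Config()  # parses --port/--pycmd/--use_gfloat/... from the command line
print(f"device={cfg.device} half={cfg.is_half} n_cpu={cfg.n_cpu}")

# x_pad/x_query/x_center/x_max shrink as available VRAM drops; downstream
# inference code can use them to size its processing windows.
print(cfg.x_pad, cfg.x_query, cfg.x_center, cfg.x_max)
```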
/docs/GUI.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tiger14n/RVC-GUI/0c2e2b158e0fdff0ed91a53d9fea2b0b3dc4752b/docs/GUI.JPG
--------------------------------------------------------------------------------
/docs/GUI20230508.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tiger14n/RVC-GUI/0c2e2b158e0fdff0ed91a53d9fea2b0b3dc4752b/docs/GUI20230508.JPG
--------------------------------------------------------------------------------
/infer/infer-pm-index256.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Perform retrieval on the source features
4 | """
5 | import torch, pdb, os, parselmouth
6 |
7 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8 | import numpy as np
9 | import soundfile as sf
10 |
11 | # from models import SynthesizerTrn256#hifigan_nonsf
12 | # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
13 | from infer_pack.models import (
14 | SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
15 | ) # hifigan_nsf
16 |
17 | # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
18 | # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
19 | # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
20 |
21 |
22 | from scipy.io import wavfile
23 | from fairseq import checkpoint_utils
24 |
25 | # import pyworld
26 | import librosa
27 | import torch.nn.functional as F
28 | import scipy.signal as signal
29 |
30 | # import torchcrepe
31 | from time import time as ttime
32 |
33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34 | model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" #
35 | print("load model(s) from {}".format(model_path))
36 | models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
37 | [model_path],
38 | suffix="",
39 | )
40 | model = models[0]
41 | model = model.to(device)
42 | model = model.half()
43 | model.eval()
44 |
45 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
46 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
47 | net_g = SynthesizerTrn256(
48 | 1025,
49 | 32,
50 | 192,
51 | 192,
52 | 768,
53 | 2,
54 | 6,
55 | 3,
56 | 0,
57 | "1",
58 | [3, 7, 11],
59 | [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
60 | [10, 10, 2, 2],
61 | 512,
62 | [16, 16, 4, 4],
63 | 183,
64 | 256,
65 | is_half=True,
66 | ) # hifigan#512#256#no_dropout
67 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
68 | # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
69 | #
70 | # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
71 | # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
72 |
73 | # weights=torch.load("infer/ft-mi_1k-noD.pt")
74 | # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
75 | # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
76 | # weights=torch.load("infer/ft-mi-sim1k.pt")
77 | weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
78 | print(net_g.load_state_dict(weights, strict=True))
79 |
80 | net_g.eval().to(device)
81 | net_g.half()
82 |
83 |
84 | def get_f0(x, p_len, f0_up_key=0):
85 | time_step = 160 / 16000 * 1000
86 | f0_min = 50
87 | f0_max = 1100
88 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
89 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
90 |
91 | f0 = (
92 | parselmouth.Sound(x, 16000)
93 | .to_pitch_ac(
94 | time_step=time_step / 1000,
95 | voicing_threshold=0.6,
96 | pitch_floor=f0_min,
97 | pitch_ceiling=f0_max,
98 | )
99 | .selected_array["frequency"]
100 | )
101 |
102 | pad_size = (p_len - len(f0) + 1) // 2
103 | if pad_size > 0 or p_len - len(f0) - pad_size > 0:
104 | f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
105 | f0 *= pow(2, f0_up_key / 12)
106 | f0bak = f0.copy()
107 |
108 | f0_mel = 1127 * np.log(1 + f0 / 700)
109 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
110 | f0_mel_max - f0_mel_min
111 | ) + 1
112 | f0_mel[f0_mel <= 1] = 1
113 | f0_mel[f0_mel > 255] = 255
114 | # f0_mel[f0_mel > 188] = 188
115 | f0_coarse = np.rint(f0_mel).astype(np.int64)  # np.int was removed in NumPy 1.24
116 | return f0_coarse, f0bak
117 |
118 |
119 | import faiss
120 |
121 | index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
122 | big_npy = np.load("infer/big_src_feature_mi.npy")
123 | ta0 = ta1 = ta2 = 0
124 | for idx, name in enumerate(
125 | [
126 | "冬之花clip1.wav",
127 | ]
128 | ): ##
129 | wav_path = "todo-songs/%s" % name #
130 | f0_up_key = -2 #
131 | audio, sampling_rate = sf.read(wav_path)
132 | if len(audio.shape) > 1:
133 | audio = librosa.to_mono(audio.transpose(1, 0))
134 | if sampling_rate != 16000:
135 | audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
136 |
137 | feats = torch.from_numpy(audio).float()
138 | if feats.dim() == 2: # double channels
139 | feats = feats.mean(-1)
140 | assert feats.dim() == 1, feats.dim()
141 | feats = feats.view(1, -1)
142 | padding_mask = torch.BoolTensor(feats.shape).fill_(False)
143 | inputs = {
144 | "source": feats.half().to(device),
145 | "padding_mask": padding_mask.to(device),
146 | "output_layer": 9, # layer 9
147 | }
148 | if torch.cuda.is_available():
149 | torch.cuda.synchronize()
150 | t0 = ttime()
151 | with torch.no_grad():
152 | logits = model.extract_features(**inputs)
153 | feats = model.final_proj(logits[0])
154 |
155 | #### optimize features via index retrieval
156 | npy = feats[0].cpu().numpy().astype("float32")
157 | D, I = index.search(npy, 1)
158 | feats = (
159 | torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
160 | )
161 |
162 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
163 | if torch.cuda.is_available():
164 | torch.cuda.synchronize()
165 | t1 = ttime()
166 | # p_len = min(feats.shape[1],10000,pitch.shape[0])  # too long would OOM the GPU
167 | p_len = min(feats.shape[1], 10000) #
168 | pitch, pitchf = get_f0(audio, p_len, f0_up_key)
169 | p_len = min(feats.shape[1], 10000, pitch.shape[0])  # too long would OOM the GPU
170 | if torch.cuda.is_available():
171 | torch.cuda.synchronize()
172 | t2 = ttime()
173 | feats = feats[:, :p_len, :]
174 | pitch = pitch[:p_len]
175 | pitchf = pitchf[:p_len]
176 | p_len = torch.LongTensor([p_len]).to(device)
177 | pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
178 | sid = torch.LongTensor([0]).to(device)
179 | pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
180 | with torch.no_grad():
181 | audio = (
182 | net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
183 | .data.cpu()
184 | .float()
185 | .numpy()
186 | ) # nsf
187 | if torch.cuda.is_available():
188 | torch.cuda.synchronize()
189 | t3 = ttime()
190 | ta0 += t1 - t0
191 | ta1 += t2 - t1
192 | ta2 += t3 - t2
193 | # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
194 | # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
195 | # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
196 | wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio) ##
197 |
198 |
199 | print(ta0, ta1, ta2) #
200 |
--------------------------------------------------------------------------------
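For reference, the coarse-F0 quantization inside get_f0 above, as a standalone worked example (constants copied from the function; the sample pitches are illustrative):

```python
import numpy as np

# f0 (Hz) -> mel scale -> integer bins 1..255 (unvoiced frames stay at bin 1)
f0_min, f0_max = 50, 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)  # ~77.8
f0_mel_max = 1127 * np.log(1 + f0_max / 700)  # ~1064.4

f0 = np.array([0.0, 110.0, 220.0, 440.0])  # one unvoiced frame, then A2/A3/A4
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
    f0_mel_max - f0_mel_min
) + 1
f0_coarse = np.rint(np.clip(f0_mel, 1, 255)).astype(np.int64)
print(f0_coarse)  # -> [  1  23  60 122] (approximately)
```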
/infer/train-index.py:
--------------------------------------------------------------------------------
1 | """
2 | Format: cid maps directly to the index's built-in position; aid does not fit, so it is looked up via a dict (there are only ~50k entries anyway)
3 | """
4 | import faiss, numpy as np, os
5 |
6 | # ########### if starting from raw features, save them first
7 | inp_root = r"E:\codes\py39\dataset\mi\2-co256"
8 | npys = []
9 | for name in sorted(list(os.listdir(inp_root))):
10 | phone = np.load("%s/%s" % (inp_root, name))
11 | npys.append(phone)
12 | big_npy = np.concatenate(npys, 0)
13 | print(big_npy.shape) # (6196072, 192)#fp32#4.43G
14 | np.save("infer/big_src_feature_mi.npy", big_npy)
15 |
16 | ##################train+add
17 | # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
18 | print(big_npy.shape)
19 | index = faiss.index_factory(256, "IVF512,Flat") # mi
20 | print("training")
21 | index_ivf = faiss.extract_index_ivf(index) #
22 | index_ivf.nprobe = 9
23 | index.train(big_npy)
24 | faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
25 | print("adding")
26 | index.add(big_npy)
27 | faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
28 | """
29 | Sizes (all FP32)
30 | big_src_feature 2.95G
31 | (3098036, 256)
32 | big_emb 4.43G
33 | (6196072, 192)
34 | big_emb is doubled because feature extraction repeats the features before adding pitch
35 |
36 | """
37 |
--------------------------------------------------------------------------------
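A short sketch of querying the index built by this script (file names taken from the script; k=1 and the query rows are illustrative):

```python
# Load the trained+populated IVF index and look up nearest source features.
import faiss
import numpy as np

index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
index.nprobe = 9  # probe 9 of the 512 IVF lists, matching the script above
big_npy = np.load("infer/big_src_feature_mi.npy")

query = big_npy[:4].astype("float32")  # stand-in for HuBERT feature frames
D, I = index.search(query, 1)          # distances and nearest-neighbor row ids
nearest = big_npy[I.squeeze()]         # the retrieved source features
print(D.shape, nearest.shape)
```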
/infer/trans_weights.py:
--------------------------------------------------------------------------------
1 | import torch, pdb
2 |
3 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
4 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
5 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
6 | # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
7 | a = torch.load(
8 | r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
9 | )[
10 | "model"
11 | ] # sim_nsf#
12 | for key in a.keys():
13 | a[key] = a[key].half()
14 | # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
15 | # torch.save(a,"ft-mi-sim1k.pt")#
16 | torch.save(a, "ft-mi-no_opt-no_dropout.pt") #
17 |
--------------------------------------------------------------------------------
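A small sanity check one might run after the conversion (hypothetical, not in the repo):

```python
# Verify every tensor in the exported checkpoint was cast to fp16.
import torch

w = torch.load("ft-mi-no_opt-no_dropout.pt", map_location="cpu")
assert all(v.dtype == torch.float16 for v in w.values()), "non-fp16 tensor found"
print(len(w), "tensors, all fp16")
```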
/infer_pack/attentions.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import torch
5 | from torch import nn
6 | from torch.nn import functional as F
7 |
8 | from infer_pack import commons
9 | from infer_pack import modules
10 | from infer_pack.modules import LayerNorm
11 |
12 |
13 | class Encoder(nn.Module):
14 | def __init__(
15 | self,
16 | hidden_channels,
17 | filter_channels,
18 | n_heads,
19 | n_layers,
20 | kernel_size=1,
21 | p_dropout=0.0,
22 | window_size=10,
23 | **kwargs
24 | ):
25 | super().__init__()
26 | self.hidden_channels = hidden_channels
27 | self.filter_channels = filter_channels
28 | self.n_heads = n_heads
29 | self.n_layers = n_layers
30 | self.kernel_size = kernel_size
31 | self.p_dropout = p_dropout
32 | self.window_size = window_size
33 |
34 | self.drop = nn.Dropout(p_dropout)
35 | self.attn_layers = nn.ModuleList()
36 | self.norm_layers_1 = nn.ModuleList()
37 | self.ffn_layers = nn.ModuleList()
38 | self.norm_layers_2 = nn.ModuleList()
39 | for i in range(self.n_layers):
40 | self.attn_layers.append(
41 | MultiHeadAttention(
42 | hidden_channels,
43 | hidden_channels,
44 | n_heads,
45 | p_dropout=p_dropout,
46 | window_size=window_size,
47 | )
48 | )
49 | self.norm_layers_1.append(LayerNorm(hidden_channels))
50 | self.ffn_layers.append(
51 | FFN(
52 | hidden_channels,
53 | hidden_channels,
54 | filter_channels,
55 | kernel_size,
56 | p_dropout=p_dropout,
57 | )
58 | )
59 | self.norm_layers_2.append(LayerNorm(hidden_channels))
60 |
61 | def forward(self, x, x_mask):
62 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
63 | x = x * x_mask
64 | for i in range(self.n_layers):
65 | y = self.attn_layers[i](x, x, attn_mask)
66 | y = self.drop(y)
67 | x = self.norm_layers_1[i](x + y)
68 |
69 | y = self.ffn_layers[i](x, x_mask)
70 | y = self.drop(y)
71 | x = self.norm_layers_2[i](x + y)
72 | x = x * x_mask
73 | return x
74 |
75 |
76 | class Decoder(nn.Module):
77 | def __init__(
78 | self,
79 | hidden_channels,
80 | filter_channels,
81 | n_heads,
82 | n_layers,
83 | kernel_size=1,
84 | p_dropout=0.0,
85 | proximal_bias=False,
86 | proximal_init=True,
87 | **kwargs
88 | ):
89 | super().__init__()
90 | self.hidden_channels = hidden_channels
91 | self.filter_channels = filter_channels
92 | self.n_heads = n_heads
93 | self.n_layers = n_layers
94 | self.kernel_size = kernel_size
95 | self.p_dropout = p_dropout
96 | self.proximal_bias = proximal_bias
97 | self.proximal_init = proximal_init
98 |
99 | self.drop = nn.Dropout(p_dropout)
100 | self.self_attn_layers = nn.ModuleList()
101 | self.norm_layers_0 = nn.ModuleList()
102 | self.encdec_attn_layers = nn.ModuleList()
103 | self.norm_layers_1 = nn.ModuleList()
104 | self.ffn_layers = nn.ModuleList()
105 | self.norm_layers_2 = nn.ModuleList()
106 | for i in range(self.n_layers):
107 | self.self_attn_layers.append(
108 | MultiHeadAttention(
109 | hidden_channels,
110 | hidden_channels,
111 | n_heads,
112 | p_dropout=p_dropout,
113 | proximal_bias=proximal_bias,
114 | proximal_init=proximal_init,
115 | )
116 | )
117 | self.norm_layers_0.append(LayerNorm(hidden_channels))
118 | self.encdec_attn_layers.append(
119 | MultiHeadAttention(
120 | hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121 | )
122 | )
123 | self.norm_layers_1.append(LayerNorm(hidden_channels))
124 | self.ffn_layers.append(
125 | FFN(
126 | hidden_channels,
127 | hidden_channels,
128 | filter_channels,
129 | kernel_size,
130 | p_dropout=p_dropout,
131 | causal=True,
132 | )
133 | )
134 | self.norm_layers_2.append(LayerNorm(hidden_channels))
135 |
136 | def forward(self, x, x_mask, h, h_mask):
137 | """
138 | x: decoder input
139 | h: encoder output
140 | """
141 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
142 | device=x.device, dtype=x.dtype
143 | )
144 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
145 | x = x * x_mask
146 | for i in range(self.n_layers):
147 | y = self.self_attn_layers[i](x, x, self_attn_mask)
148 | y = self.drop(y)
149 | x = self.norm_layers_0[i](x + y)
150 |
151 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
152 | y = self.drop(y)
153 | x = self.norm_layers_1[i](x + y)
154 |
155 | y = self.ffn_layers[i](x, x_mask)
156 | y = self.drop(y)
157 | x = self.norm_layers_2[i](x + y)
158 | x = x * x_mask
159 | return x
160 |
161 |
162 | class MultiHeadAttention(nn.Module):
163 | def __init__(
164 | self,
165 | channels,
166 | out_channels,
167 | n_heads,
168 | p_dropout=0.0,
169 | window_size=None,
170 | heads_share=True,
171 | block_length=None,
172 | proximal_bias=False,
173 | proximal_init=False,
174 | ):
175 | super().__init__()
176 | assert channels % n_heads == 0
177 |
178 | self.channels = channels
179 | self.out_channels = out_channels
180 | self.n_heads = n_heads
181 | self.p_dropout = p_dropout
182 | self.window_size = window_size
183 | self.heads_share = heads_share
184 | self.block_length = block_length
185 | self.proximal_bias = proximal_bias
186 | self.proximal_init = proximal_init
187 | self.attn = None
188 |
189 | self.k_channels = channels // n_heads
190 | self.conv_q = nn.Conv1d(channels, channels, 1)
191 | self.conv_k = nn.Conv1d(channels, channels, 1)
192 | self.conv_v = nn.Conv1d(channels, channels, 1)
193 | self.conv_o = nn.Conv1d(channels, out_channels, 1)
194 | self.drop = nn.Dropout(p_dropout)
195 |
196 | if window_size is not None:
197 | n_heads_rel = 1 if heads_share else n_heads
198 | rel_stddev = self.k_channels**-0.5
199 | self.emb_rel_k = nn.Parameter(
200 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
201 | * rel_stddev
202 | )
203 | self.emb_rel_v = nn.Parameter(
204 | torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205 | * rel_stddev
206 | )
207 |
208 | nn.init.xavier_uniform_(self.conv_q.weight)
209 | nn.init.xavier_uniform_(self.conv_k.weight)
210 | nn.init.xavier_uniform_(self.conv_v.weight)
211 | if proximal_init:
212 | with torch.no_grad():
213 | self.conv_k.weight.copy_(self.conv_q.weight)
214 | self.conv_k.bias.copy_(self.conv_q.bias)
215 |
216 | def forward(self, x, c, attn_mask=None):
217 | q = self.conv_q(x)
218 | k = self.conv_k(c)
219 | v = self.conv_v(c)
220 |
221 | x, self.attn = self.attention(q, k, v, mask=attn_mask)
222 |
223 | x = self.conv_o(x)
224 | return x
225 |
226 | def attention(self, query, key, value, mask=None):
227 | # reshape [b, d, t] -> [b, n_h, t, d_k]
228 | b, d, t_s, t_t = (*key.size(), query.size(2))
229 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
230 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
231 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
232 |
233 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
234 | if self.window_size is not None:
235 | assert (
236 | t_s == t_t
237 | ), "Relative attention is only available for self-attention."
238 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
239 | rel_logits = self._matmul_with_relative_keys(
240 | query / math.sqrt(self.k_channels), key_relative_embeddings
241 | )
242 | scores_local = self._relative_position_to_absolute_position(rel_logits)
243 | scores = scores + scores_local
244 | if self.proximal_bias:
245 | assert t_s == t_t, "Proximal bias is only available for self-attention."
246 | scores = scores + self._attention_bias_proximal(t_s).to(
247 | device=scores.device, dtype=scores.dtype
248 | )
249 | if mask is not None:
250 | scores = scores.masked_fill(mask == 0, -1e4)
251 | if self.block_length is not None:
252 | assert (
253 | t_s == t_t
254 | ), "Local attention is only available for self-attention."
255 | block_mask = (
256 | torch.ones_like(scores)
257 | .triu(-self.block_length)
258 | .tril(self.block_length)
259 | )
260 | scores = scores.masked_fill(block_mask == 0, -1e4)
261 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
262 | p_attn = self.drop(p_attn)
263 | output = torch.matmul(p_attn, value)
264 | if self.window_size is not None:
265 | relative_weights = self._absolute_position_to_relative_position(p_attn)
266 | value_relative_embeddings = self._get_relative_embeddings(
267 | self.emb_rel_v, t_s
268 | )
269 | output = output + self._matmul_with_relative_values(
270 | relative_weights, value_relative_embeddings
271 | )
272 | output = (
273 | output.transpose(2, 3).contiguous().view(b, d, t_t)
274 | ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
275 | return output, p_attn
276 |
277 | def _matmul_with_relative_values(self, x, y):
278 | """
279 | x: [b, h, l, m]
280 | y: [h or 1, m, d]
281 | ret: [b, h, l, d]
282 | """
283 | ret = torch.matmul(x, y.unsqueeze(0))
284 | return ret
285 |
286 | def _matmul_with_relative_keys(self, x, y):
287 | """
288 | x: [b, h, l, d]
289 | y: [h or 1, m, d]
290 | ret: [b, h, l, m]
291 | """
292 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
293 | return ret
294 |
295 | def _get_relative_embeddings(self, relative_embeddings, length):
296 | max_relative_position = 2 * self.window_size + 1
297 | # Pad first before slice to avoid using cond ops.
298 | pad_length = max(length - (self.window_size + 1), 0)
299 | slice_start_position = max((self.window_size + 1) - length, 0)
300 | slice_end_position = slice_start_position + 2 * length - 1
301 | if pad_length > 0:
302 | padded_relative_embeddings = F.pad(
303 | relative_embeddings,
304 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
305 | )
306 | else:
307 | padded_relative_embeddings = relative_embeddings
308 | used_relative_embeddings = padded_relative_embeddings[
309 | :, slice_start_position:slice_end_position
310 | ]
311 | return used_relative_embeddings
312 |
313 | def _relative_position_to_absolute_position(self, x):
314 | """
315 | x: [b, h, l, 2*l-1]
316 | ret: [b, h, l, l]
317 | """
318 | batch, heads, length, _ = x.size()
319 | # Concat columns of pad to shift from relative to absolute indexing.
320 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
321 |
322 | # Concat extra elements so to add up to shape (len+1, 2*len-1).
323 | x_flat = x.view([batch, heads, length * 2 * length])
324 | x_flat = F.pad(
325 | x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
326 | )
327 |
328 | # Reshape and slice out the padded elements.
329 | x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
330 | :, :, :length, length - 1 :
331 | ]
332 | return x_final
333 |
334 | def _absolute_position_to_relative_position(self, x):
335 | """
336 | x: [b, h, l, l]
337 | ret: [b, h, l, 2*l-1]
338 | """
339 | batch, heads, length, _ = x.size()
340 | # pad along column
341 | x = F.pad(
342 | x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
343 | )
344 | x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
345 | # add 0's in the beginning that will skew the elements after reshape
346 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
347 | x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
348 | return x_final
349 |
350 | def _attention_bias_proximal(self, length):
351 | """Bias for self-attention to encourage attention to close positions.
352 | Args:
353 | length: an integer scalar.
354 | Returns:
355 | a Tensor with shape [1, 1, length, length]
356 | """
357 | r = torch.arange(length, dtype=torch.float32)
358 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
359 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360 |
361 |
362 | class FFN(nn.Module):
363 | def __init__(
364 | self,
365 | in_channels,
366 | out_channels,
367 | filter_channels,
368 | kernel_size,
369 | p_dropout=0.0,
370 | activation=None,
371 | causal=False,
372 | ):
373 | super().__init__()
374 | self.in_channels = in_channels
375 | self.out_channels = out_channels
376 | self.filter_channels = filter_channels
377 | self.kernel_size = kernel_size
378 | self.p_dropout = p_dropout
379 | self.activation = activation
380 | self.causal = causal
381 |
382 | if causal:
383 | self.padding = self._causal_padding
384 | else:
385 | self.padding = self._same_padding
386 |
387 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
388 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
389 | self.drop = nn.Dropout(p_dropout)
390 |
391 | def forward(self, x, x_mask):
392 | x = self.conv_1(self.padding(x * x_mask))
393 | if self.activation == "gelu":
394 | x = x * torch.sigmoid(1.702 * x)
395 | else:
396 | x = torch.relu(x)
397 | x = self.drop(x)
398 | x = self.conv_2(self.padding(x * x_mask))
399 | return x * x_mask
400 |
401 | def _causal_padding(self, x):
402 | if self.kernel_size == 1:
403 | return x
404 | pad_l = self.kernel_size - 1
405 | pad_r = 0
406 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
407 | x = F.pad(x, commons.convert_pad_shape(padding))
408 | return x
409 |
410 | def _same_padding(self, x):
411 | if self.kernel_size == 1:
412 | return x
413 | pad_l = (self.kernel_size - 1) // 2
414 | pad_r = self.kernel_size // 2
415 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
416 | x = F.pad(x, commons.convert_pad_shape(padding))
417 | return x
418 |
--------------------------------------------------------------------------------
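A minimal smoke test for the Encoder defined above (shapes follow the forward() contract; the hyperparameters here are arbitrary):

```python
import torch
from infer_pack.attentions import Encoder

# hidden_channels must be divisible by n_heads; the default window_size=10
# enables the relative-position attention path.
enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=2, kernel_size=3, p_dropout=0.0)
x = torch.randn(1, 192, 100)    # [batch, hidden_channels, time]
x_mask = torch.ones(1, 1, 100)  # 1 = valid frame
y = enc(x, x_mask)
print(y.shape)                  # torch.Size([1, 192, 100])
```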
/infer_pack/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 |
8 | def init_weights(m, mean=0.0, std=0.01):
9 | classname = m.__class__.__name__
10 | if classname.find("Conv") != -1:
11 | m.weight.data.normal_(mean, std)
12 |
13 |
14 | def get_padding(kernel_size, dilation=1):
15 | return int((kernel_size * dilation - dilation) / 2)
16 |
17 |
18 | def convert_pad_shape(pad_shape):
19 | l = pad_shape[::-1]
20 | pad_shape = [item for sublist in l for item in sublist]
21 | return pad_shape
22 |
23 |
24 | def kl_divergence(m_p, logs_p, m_q, logs_q):
25 | """KL(P||Q)"""
26 | kl = (logs_q - logs_p) - 0.5
27 | kl += (
28 | 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
29 | )
30 | return kl
31 |
32 |
33 | def rand_gumbel(shape):
34 | """Sample from the Gumbel distribution, protect from overflows."""
35 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
36 | return -torch.log(-torch.log(uniform_samples))
37 |
38 |
39 | def rand_gumbel_like(x):
40 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
41 | return g
42 |
43 |
44 | def slice_segments(x, ids_str, segment_size=4):
45 | ret = torch.zeros_like(x[:, :, :segment_size])
46 | for i in range(x.size(0)):
47 | idx_str = ids_str[i]
48 | idx_end = idx_str + segment_size
49 | ret[i] = x[i, :, idx_str:idx_end]
50 | return ret
51 |
52 |
53 | def slice_segments2(x, ids_str, segment_size=4):
54 | ret = torch.zeros_like(x[:, :segment_size])
55 | for i in range(x.size(0)):
56 | idx_str = ids_str[i]
57 | idx_end = idx_str + segment_size
58 | ret[i] = x[i, idx_str:idx_end]
59 | return ret
60 |
61 |
62 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
63 | b, d, t = x.size()
64 | if x_lengths is None:
65 | x_lengths = t
66 | ids_str_max = x_lengths - segment_size + 1
67 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
68 | ret = slice_segments(x, ids_str, segment_size)
69 | return ret, ids_str
70 |
71 |
72 | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
73 | position = torch.arange(length, dtype=torch.float)
74 | num_timescales = channels // 2
75 | log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
76 | num_timescales - 1
77 | )
78 | inv_timescales = min_timescale * torch.exp(
79 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
80 | )
81 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
82 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
83 | signal = F.pad(signal, [0, 0, 0, channels % 2])
84 | signal = signal.view(1, channels, length)
85 | return signal
86 |
87 |
88 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
89 | b, channels, length = x.size()
90 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91 | return x + signal.to(dtype=x.dtype, device=x.device)
92 |
93 |
94 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
95 | b, channels, length = x.size()
96 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
97 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
98 |
99 |
100 | def subsequent_mask(length):
101 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
102 | return mask
103 |
104 |
105 | @torch.jit.script
106 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
107 | n_channels_int = n_channels[0]
108 | in_act = input_a + input_b
109 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
110 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
111 | acts = t_act * s_act
112 | return acts
113 |
114 |
119 |
120 |
121 | def shift_1d(x):
122 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
123 | return x
124 |
125 |
126 | def sequence_mask(length, max_length=None):
127 | if max_length is None:
128 | max_length = length.max()
129 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
130 | return x.unsqueeze(0) < length.unsqueeze(1)
131 |
132 |
133 | def generate_path(duration, mask):
134 | """
135 | duration: [b, 1, t_x]
136 | mask: [b, 1, t_y, t_x]
137 | """
138 | device = duration.device
139 |
140 | b, _, t_y, t_x = mask.shape
141 | cum_duration = torch.cumsum(duration, -1)
142 |
143 | cum_duration_flat = cum_duration.view(b * t_x)
144 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
145 | path = path.view(b, t_x, t_y)
146 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
147 | path = path.unsqueeze(1).transpose(2, 3) * mask
148 | return path
149 |
150 |
151 | def clip_grad_value_(parameters, clip_value, norm_type=2):
152 | if isinstance(parameters, torch.Tensor):
153 | parameters = [parameters]
154 | parameters = list(filter(lambda p: p.grad is not None, parameters))
155 | norm_type = float(norm_type)
156 | if clip_value is not None:
157 | clip_value = float(clip_value)
158 |
159 | total_norm = 0
160 | for p in parameters:
161 | param_norm = p.grad.data.norm(norm_type)
162 | total_norm += param_norm.item() ** norm_type
163 | if clip_value is not None:
164 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
165 | total_norm = total_norm ** (1.0 / norm_type)
166 | return total_norm
167 |
--------------------------------------------------------------------------------
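Two of the helpers above in action (standalone demo; the values are chosen for illustration):

```python
import torch
from infer_pack.commons import sequence_mask, convert_pad_shape

lengths = torch.tensor([3, 5])
print(sequence_mask(lengths, max_length=6))
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])

# convert_pad_shape flattens a per-dimension pad spec into F.pad order
# (last dimension first):
print(convert_pad_shape([[0, 0], [0, 0], [2, 3]]))  # -> [2, 3, 0, 0, 0, 0]
```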
/infer_pack/models_onnx.py:
--------------------------------------------------------------------------------
1 | import math, pdb, os
2 | from time import time as ttime
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 | from infer_pack import modules
7 | from infer_pack import attentions
8 | from infer_pack import commons
9 | from infer_pack.commons import init_weights, get_padding
10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13 | import numpy as np
15 |
16 |
17 | class TextEncoder256(nn.Module):
18 | def __init__(
19 | self,
20 | out_channels,
21 | hidden_channels,
22 | filter_channels,
23 | n_heads,
24 | n_layers,
25 | kernel_size,
26 | p_dropout,
27 | f0=True,
28 | ):
29 | super().__init__()
30 | self.out_channels = out_channels
31 | self.hidden_channels = hidden_channels
32 | self.filter_channels = filter_channels
33 | self.n_heads = n_heads
34 | self.n_layers = n_layers
35 | self.kernel_size = kernel_size
36 | self.p_dropout = p_dropout
37 | self.emb_phone = nn.Linear(256, hidden_channels)
38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39 | if f0:
40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41 | self.encoder = attentions.Encoder(
42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43 | )
44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45 |
46 | def forward(self, phone, pitch, lengths):
47 | if pitch is None:
48 | x = self.emb_phone(phone)
49 | else:
50 | x = self.emb_phone(phone) + self.emb_pitch(pitch)
51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52 | x = self.lrelu(x)
53 | x = torch.transpose(x, 1, -1) # [b, h, t]
54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55 | x.dtype
56 | )
57 | x = self.encoder(x * x_mask, x_mask)
58 | stats = self.proj(x) * x_mask
59 |
60 | m, logs = torch.split(stats, self.out_channels, dim=1)
61 | return m, logs, x_mask
62 |
63 |
64 | class TextEncoder768(nn.Module):
65 | def __init__(
66 | self,
67 | out_channels,
68 | hidden_channels,
69 | filter_channels,
70 | n_heads,
71 | n_layers,
72 | kernel_size,
73 | p_dropout,
74 | f0=True,
75 | ):
76 | super().__init__()
77 | self.out_channels = out_channels
78 | self.hidden_channels = hidden_channels
79 | self.filter_channels = filter_channels
80 | self.n_heads = n_heads
81 | self.n_layers = n_layers
82 | self.kernel_size = kernel_size
83 | self.p_dropout = p_dropout
84 | self.emb_phone = nn.Linear(768, hidden_channels)
85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86 | if f0:
87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88 | self.encoder = attentions.Encoder(
89 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90 | )
91 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92 |
93 | def forward(self, phone, pitch, lengths):
94 | if pitch is None:
95 | x = self.emb_phone(phone)
96 | else:
97 | x = self.emb_phone(phone) + self.emb_pitch(pitch)
98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99 | x = self.lrelu(x)
100 | x = torch.transpose(x, 1, -1) # [b, h, t]
101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102 | x.dtype
103 | )
104 | x = self.encoder(x * x_mask, x_mask)
105 | stats = self.proj(x) * x_mask
106 |
107 | m, logs = torch.split(stats, self.out_channels, dim=1)
108 | return m, logs, x_mask
109 |
110 |
111 | class ResidualCouplingBlock(nn.Module):
112 | def __init__(
113 | self,
114 | channels,
115 | hidden_channels,
116 | kernel_size,
117 | dilation_rate,
118 | n_layers,
119 | n_flows=4,
120 | gin_channels=0,
121 | ):
122 | super().__init__()
123 | self.channels = channels
124 | self.hidden_channels = hidden_channels
125 | self.kernel_size = kernel_size
126 | self.dilation_rate = dilation_rate
127 | self.n_layers = n_layers
128 | self.n_flows = n_flows
129 | self.gin_channels = gin_channels
130 |
131 | self.flows = nn.ModuleList()
132 | for i in range(n_flows):
133 | self.flows.append(
134 | modules.ResidualCouplingLayer(
135 | channels,
136 | hidden_channels,
137 | kernel_size,
138 | dilation_rate,
139 | n_layers,
140 | gin_channels=gin_channels,
141 | mean_only=True,
142 | )
143 | )
144 | self.flows.append(modules.Flip())
145 |
146 | def forward(self, x, x_mask, g=None, reverse=False):
147 | if not reverse:
148 | for flow in self.flows:
149 | x, _ = flow(x, x_mask, g=g, reverse=reverse)
150 | else:
151 | for flow in reversed(self.flows):
152 | x = flow(x, x_mask, g=g, reverse=reverse)
153 | return x
154 |
155 | def remove_weight_norm(self):
156 | for i in range(self.n_flows):
157 | self.flows[i * 2].remove_weight_norm()
158 |
159 |
160 | class PosteriorEncoder(nn.Module):
161 | def __init__(
162 | self,
163 | in_channels,
164 | out_channels,
165 | hidden_channels,
166 | kernel_size,
167 | dilation_rate,
168 | n_layers,
169 | gin_channels=0,
170 | ):
171 | super().__init__()
172 | self.in_channels = in_channels
173 | self.out_channels = out_channels
174 | self.hidden_channels = hidden_channels
175 | self.kernel_size = kernel_size
176 | self.dilation_rate = dilation_rate
177 | self.n_layers = n_layers
178 | self.gin_channels = gin_channels
179 |
180 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181 | self.enc = modules.WN(
182 | hidden_channels,
183 | kernel_size,
184 | dilation_rate,
185 | n_layers,
186 | gin_channels=gin_channels,
187 | )
188 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189 |
190 | def forward(self, x, x_lengths, g=None):
191 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192 | x.dtype
193 | )
194 | x = self.pre(x) * x_mask
195 | x = self.enc(x, x_mask, g=g)
196 | stats = self.proj(x) * x_mask
197 | m, logs = torch.split(stats, self.out_channels, dim=1)
198 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199 | return z, m, logs, x_mask
200 |
201 | def remove_weight_norm(self):
202 | self.enc.remove_weight_norm()
203 |
204 |
205 | class Generator(torch.nn.Module):
206 | def __init__(
207 | self,
208 | initial_channel,
209 | resblock,
210 | resblock_kernel_sizes,
211 | resblock_dilation_sizes,
212 | upsample_rates,
213 | upsample_initial_channel,
214 | upsample_kernel_sizes,
215 | gin_channels=0,
216 | ):
217 | super(Generator, self).__init__()
218 | self.num_kernels = len(resblock_kernel_sizes)
219 | self.num_upsamples = len(upsample_rates)
220 | self.conv_pre = Conv1d(
221 | initial_channel, upsample_initial_channel, 7, 1, padding=3
222 | )
223 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224 |
225 | self.ups = nn.ModuleList()
226 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227 | self.ups.append(
228 | weight_norm(
229 | ConvTranspose1d(
230 | upsample_initial_channel // (2**i),
231 | upsample_initial_channel // (2 ** (i + 1)),
232 | k,
233 | u,
234 | padding=(k - u) // 2,
235 | )
236 | )
237 | )
238 |
239 | self.resblocks = nn.ModuleList()
240 | for i in range(len(self.ups)):
241 | ch = upsample_initial_channel // (2 ** (i + 1))
242 | for j, (k, d) in enumerate(
243 | zip(resblock_kernel_sizes, resblock_dilation_sizes)
244 | ):
245 | self.resblocks.append(resblock(ch, k, d))
246 |
247 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248 | self.ups.apply(init_weights)
249 |
250 | if gin_channels != 0:
251 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252 |
253 | def forward(self, x, g=None):
254 | x = self.conv_pre(x)
255 | if g is not None:
256 | x = x + self.cond(g)
257 |
258 | for i in range(self.num_upsamples):
259 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
260 | x = self.ups[i](x)
261 | xs = None
262 | for j in range(self.num_kernels):
263 | if xs is None:
264 | xs = self.resblocks[i * self.num_kernels + j](x)
265 | else:
266 | xs += self.resblocks[i * self.num_kernels + j](x)
267 | x = xs / self.num_kernels
268 | x = F.leaky_relu(x)
269 | x = self.conv_post(x)
270 | x = torch.tanh(x)
271 |
272 | return x
273 |
274 | def remove_weight_norm(self):
275 | for l in self.ups:
276 | remove_weight_norm(l)
277 | for l in self.resblocks:
278 | l.remove_weight_norm()
279 |
280 |
281 | class SineGen(torch.nn.Module):
282 | """Definition of sine generator
283 | SineGen(samp_rate, harmonic_num = 0,
284 | sine_amp = 0.1, noise_std = 0.003,
285 | voiced_threshold = 0,
286 | flag_for_pulse=False)
287 | samp_rate: sampling rate in Hz
288 | harmonic_num: number of harmonic overtones (default 0)
289 | sine_amp: amplitude of sine waveform (default 0.1)
290 | noise_std: std of Gaussian noise (default 0.003)
291 | voiced_threshold: F0 threshold for U/V classification (default 0)
292 | flag_for_pulse: this SineGen is used inside PulseGen (default False)
293 | Note: when flag_for_pulse is True, the first time step of a voiced
294 | segment is always sin(np.pi) or cos(0)
295 | """
296 |
297 | def __init__(
298 | self,
299 | samp_rate,
300 | harmonic_num=0,
301 | sine_amp=0.1,
302 | noise_std=0.003,
303 | voiced_threshold=0,
304 | flag_for_pulse=False,
305 | ):
306 | super(SineGen, self).__init__()
307 | self.sine_amp = sine_amp
308 | self.noise_std = noise_std
309 | self.harmonic_num = harmonic_num
310 | self.dim = self.harmonic_num + 1
311 | self.sampling_rate = samp_rate
312 | self.voiced_threshold = voiced_threshold
313 |
314 | def _f02uv(self, f0):
315 | # generate uv signal
316 | uv = torch.ones_like(f0)
317 | uv = uv * (f0 > self.voiced_threshold)
318 | return uv
319 |
320 | def forward(self, f0, upp):
321 | """sine_tensor, uv = forward(f0)
322 | input F0: tensor(batchsize=1, length, dim=1)
323 | f0 for unvoiced steps should be 0
324 | output sine_tensor: tensor(batchsize=1, length, dim)
325 | output uv: tensor(batchsize=1, length, 1)
326 | """
327 | with torch.no_grad():
328 | f0 = f0[:, None].transpose(1, 2)
329 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330 | # fundamental component
331 | f0_buf[:, :, 0] = f0[:, :, 0]
332 | for idx in np.arange(self.harmonic_num):
333 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334 | idx + 2
335 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336 | rad_values = (f0_buf / self.sampling_rate) % 1  ### the % 1 means the n_har products cannot be optimized away in post-processing
337 | rand_ini = torch.rand(
338 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339 | )
340 | rand_ini[:, 0] = 0
341 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342 | tmp_over_one = torch.cumsum(rad_values, 1)  # % 1 ##### a % 1 here would make the later cumsum impossible to optimize
343 | tmp_over_one *= upp
344 | tmp_over_one = F.interpolate(
345 | tmp_over_one.transpose(2, 1),
346 | scale_factor=upp,
347 | mode="linear",
348 | align_corners=True,
349 | ).transpose(2, 1)
350 | rad_values = F.interpolate(
351 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352 | ).transpose(
353 | 2, 1
354 | ) #######
355 | tmp_over_one %= 1
356 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357 | cumsum_shift = torch.zeros_like(rad_values)
358 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359 | sine_waves = torch.sin(
360 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361 | )
362 | sine_waves = sine_waves * self.sine_amp
363 | uv = self._f02uv(f0)
364 | uv = F.interpolate(
365 | uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366 | ).transpose(2, 1)
367 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368 | noise = noise_amp * torch.randn_like(sine_waves)
369 | sine_waves = sine_waves * uv + noise
370 | return sine_waves, uv, noise
371 |
372 |
373 | class SourceModuleHnNSF(torch.nn.Module):
374 | """SourceModule for hn-nsf
375 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376 | add_noise_std=0.003, voiced_threshod=0)
377 | sampling_rate: sampling_rate in Hz
378 | harmonic_num: number of harmonic above F0 (default: 0)
379 | sine_amp: amplitude of sine source signal (default: 0.1)
380 | add_noise_std: std of additive Gaussian noise (default: 0.003)
381 | note that amplitude of noise in unvoiced is decided
382 | by sine_amp
383 | voiced_threshold: threshold to set U/V given F0 (default: 0)
384 | Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385 | F0_sampled (batchsize, length, 1)
386 | Sine_source (batchsize, length, 1)
387 | noise_source (batchsize, length 1)
388 | uv (batchsize, length, 1)
389 | """
390 |
391 | def __init__(
392 | self,
393 | sampling_rate,
394 | harmonic_num=0,
395 | sine_amp=0.1,
396 | add_noise_std=0.003,
397 | voiced_threshod=0,
398 | is_half=True,
399 | ):
400 | super(SourceModuleHnNSF, self).__init__()
401 |
402 | self.sine_amp = sine_amp
403 | self.noise_std = add_noise_std
404 | self.is_half = is_half
405 | # to produce sine waveforms
406 | self.l_sin_gen = SineGen(
407 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408 | )
409 |
410 | # to merge source harmonics into a single excitation
411 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412 | self.l_tanh = torch.nn.Tanh()
413 |
414 | def forward(self, x, upp=None):
415 | sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416 | if self.is_half:
417 | sine_wavs = sine_wavs.half()
418 | sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419 | return sine_merge, None, None # noise, uv
420 |
421 |
422 | class GeneratorNSF(torch.nn.Module):
423 | def __init__(
424 | self,
425 | initial_channel,
426 | resblock,
427 | resblock_kernel_sizes,
428 | resblock_dilation_sizes,
429 | upsample_rates,
430 | upsample_initial_channel,
431 | upsample_kernel_sizes,
432 | gin_channels,
433 | sr,
434 | is_half=False,
435 | ):
436 | super(GeneratorNSF, self).__init__()
437 | self.num_kernels = len(resblock_kernel_sizes)
438 | self.num_upsamples = len(upsample_rates)
439 |
440 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441 | self.m_source = SourceModuleHnNSF(
442 | sampling_rate=sr, harmonic_num=0, is_half=is_half
443 | )
444 | self.noise_convs = nn.ModuleList()
445 | self.conv_pre = Conv1d(
446 | initial_channel, upsample_initial_channel, 7, 1, padding=3
447 | )
448 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449 |
450 | self.ups = nn.ModuleList()
451 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452 | c_cur = upsample_initial_channel // (2 ** (i + 1))
453 | self.ups.append(
454 | weight_norm(
455 | ConvTranspose1d(
456 | upsample_initial_channel // (2**i),
457 | upsample_initial_channel // (2 ** (i + 1)),
458 | k,
459 | u,
460 | padding=(k - u) // 2,
461 | )
462 | )
463 | )
464 | if i + 1 < len(upsample_rates):
465 | stride_f0 = np.prod(upsample_rates[i + 1 :])
466 | self.noise_convs.append(
467 | Conv1d(
468 | 1,
469 | c_cur,
470 | kernel_size=stride_f0 * 2,
471 | stride=stride_f0,
472 | padding=stride_f0 // 2,
473 | )
474 | )
475 | else:
476 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477 |
478 | self.resblocks = nn.ModuleList()
479 | for i in range(len(self.ups)):
480 | ch = upsample_initial_channel // (2 ** (i + 1))
481 | for j, (k, d) in enumerate(
482 | zip(resblock_kernel_sizes, resblock_dilation_sizes)
483 | ):
484 | self.resblocks.append(resblock(ch, k, d))
485 |
486 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487 | self.ups.apply(init_weights)
488 |
489 | if gin_channels != 0:
490 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491 |
492 | self.upp = np.prod(upsample_rates)
493 |
494 | def forward(self, x, f0, g=None):
495 | har_source, noi_source, uv = self.m_source(f0, self.upp)
496 | har_source = har_source.transpose(1, 2)
497 | x = self.conv_pre(x)
498 | if g is not None:
499 | x = x + self.cond(g)
500 |
501 | for i in range(self.num_upsamples):
502 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
503 | x = self.ups[i](x)
504 | x_source = self.noise_convs[i](har_source)
505 | x = x + x_source
506 | xs = None
507 | for j in range(self.num_kernels):
508 | if xs is None:
509 | xs = self.resblocks[i * self.num_kernels + j](x)
510 | else:
511 | xs += self.resblocks[i * self.num_kernels + j](x)
512 | x = xs / self.num_kernels
513 | x = F.leaky_relu(x)
514 | x = self.conv_post(x)
515 | x = torch.tanh(x)
516 | return x
517 |
518 | def remove_weight_norm(self):
519 | for l in self.ups:
520 | remove_weight_norm(l)
521 | for l in self.resblocks:
522 | l.remove_weight_norm()
523 |
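# Worked example of the f0/noise alignment above (illustrative numbers): with
# upsample_rates = [10, 6, 2, 2, 2], the harmonic source is generated at audio
# rate (upp = 480) and each noise_convs[i] re-downsamples it to that stage's
# resolution using stride_f0 = prod(upsample_rates[i+1:]):
#     i = 0 -> stride 48, i = 1 -> stride 8, i = 2 -> stride 4, i = 3 -> stride 2,
# and the final stage keeps a kernel-1, stride-1 conv at full audio rate.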
524 |
525 | sr2sr = {
526 | "32k": 32000,
527 | "40k": 40000,
528 | "48k": 48000,
529 | }
530 |
531 |
532 | class SynthesizerTrnMsNSFsidM(nn.Module):
533 | def __init__(
534 | self,
535 | spec_channels,
536 | segment_size,
537 | inter_channels,
538 | hidden_channels,
539 | filter_channels,
540 | n_heads,
541 | n_layers,
542 | kernel_size,
543 | p_dropout,
544 | resblock,
545 | resblock_kernel_sizes,
546 | resblock_dilation_sizes,
547 | upsample_rates,
548 | upsample_initial_channel,
549 | upsample_kernel_sizes,
550 | spk_embed_dim,
551 | gin_channels,
552 | sr,
553 | **kwargs
554 | ):
555 | super().__init__()
556 |         if isinstance(sr, str):
557 | sr = sr2sr[sr]
558 | self.spec_channels = spec_channels
559 | self.inter_channels = inter_channels
560 | self.hidden_channels = hidden_channels
561 | self.filter_channels = filter_channels
562 | self.n_heads = n_heads
563 | self.n_layers = n_layers
564 | self.kernel_size = kernel_size
565 | self.p_dropout = p_dropout
566 | self.resblock = resblock
567 | self.resblock_kernel_sizes = resblock_kernel_sizes
568 | self.resblock_dilation_sizes = resblock_dilation_sizes
569 | self.upsample_rates = upsample_rates
570 | self.upsample_initial_channel = upsample_initial_channel
571 | self.upsample_kernel_sizes = upsample_kernel_sizes
572 | self.segment_size = segment_size
573 | self.gin_channels = gin_channels
574 | # self.hop_length = hop_length#
575 | self.spk_embed_dim = spk_embed_dim
576 |         if self.gin_channels == 256:  # select the 256- vs 768-dim phone encoder
577 | self.enc_p = TextEncoder256(
578 | inter_channels,
579 | hidden_channels,
580 | filter_channels,
581 | n_heads,
582 | n_layers,
583 | kernel_size,
584 | p_dropout,
585 | )
586 | else:
587 | self.enc_p = TextEncoder768(
588 | inter_channels,
589 | hidden_channels,
590 | filter_channels,
591 | n_heads,
592 | n_layers,
593 | kernel_size,
594 | p_dropout,
595 | )
596 | self.dec = GeneratorNSF(
597 | inter_channels,
598 | resblock,
599 | resblock_kernel_sizes,
600 | resblock_dilation_sizes,
601 | upsample_rates,
602 | upsample_initial_channel,
603 | upsample_kernel_sizes,
604 | gin_channels=gin_channels,
605 | sr=sr,
606 | is_half=kwargs["is_half"],
607 | )
608 | self.enc_q = PosteriorEncoder(
609 | spec_channels,
610 | inter_channels,
611 | hidden_channels,
612 | 5,
613 | 1,
614 | 16,
615 | gin_channels=gin_channels,
616 | )
617 | self.flow = ResidualCouplingBlock(
618 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
619 | )
620 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
621 | self.speaker_map = None
622 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
623 |
624 | def remove_weight_norm(self):
625 | self.dec.remove_weight_norm()
626 | self.flow.remove_weight_norm()
627 | self.enc_q.remove_weight_norm()
628 |
629 | def construct_spkmixmap(self, n_speaker):
630 | self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
631 | for i in range(n_speaker):
632 | self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
633 | self.speaker_map = self.speaker_map.unsqueeze(0)
634 |
635 | def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
636 | if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
637 | g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
638 | g = g * self.speaker_map # [N, S, B, 1, H]
639 | g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
640 | g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
641 | else:
642 | g = g.unsqueeze(0)
643 | g = self.emb_g(g).transpose(1, 2)
644 |
645 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
646 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
647 | z = self.flow(z_p, x_mask, g=g, reverse=True)
648 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
649 | return o
650 |
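# Speaker-mixing sketch (illustrative; the weight layout is an assumption
# based on the broadcasting in forward()): after construct_spkmixmap(n), the
# embedding table is frozen into speaker_map and g carries per-frame mixing
# weights over the n speakers instead of an integer id, e.g.
#     net.construct_spkmixmap(2)
#     g = torch.full((n_frames, 2), 0.5)   # blend speakers 0 and 1 equally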
651 |
652 | class MultiPeriodDiscriminator(torch.nn.Module):
653 | def __init__(self, use_spectral_norm=False):
654 | super(MultiPeriodDiscriminator, self).__init__()
655 | periods = [2, 3, 5, 7, 11, 17]
656 | # periods = [3, 5, 7, 11, 17, 23, 37]
657 |
658 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
659 | discs = discs + [
660 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
661 | ]
662 | self.discriminators = nn.ModuleList(discs)
663 |
664 | def forward(self, y, y_hat):
665 |         y_d_rs = []
666 | y_d_gs = []
667 | fmap_rs = []
668 | fmap_gs = []
669 | for i, d in enumerate(self.discriminators):
670 | y_d_r, fmap_r = d(y)
671 | y_d_g, fmap_g = d(y_hat)
672 | # for j in range(len(fmap_r)):
673 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
674 | y_d_rs.append(y_d_r)
675 | y_d_gs.append(y_d_g)
676 | fmap_rs.append(fmap_r)
677 | fmap_gs.append(fmap_g)
678 |
679 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
680 |
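# Consumption sketch (illustrative; the loss expression is the usual
# HiFi-GAN-style LSGAN objective, not something defined in this file):
#     y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(wave_real, wave_fake)
#     loss_d = sum(torch.mean((1 - r) ** 2) + torch.mean(g ** 2)
#                  for r, g in zip(y_d_rs, y_d_gs))
# fmap_rs/fmap_gs pair layer activations for a feature-matching loss.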
681 |
682 | class MultiPeriodDiscriminatorV2(torch.nn.Module):
683 | def __init__(self, use_spectral_norm=False):
684 | super(MultiPeriodDiscriminatorV2, self).__init__()
685 | # periods = [2, 3, 5, 7, 11, 17]
686 | periods = [2, 3, 5, 7, 11, 17, 23, 37]
687 |
688 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
689 | discs = discs + [
690 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
691 | ]
692 | self.discriminators = nn.ModuleList(discs)
693 |
694 | def forward(self, y, y_hat):
695 |         y_d_rs = []
696 | y_d_gs = []
697 | fmap_rs = []
698 | fmap_gs = []
699 | for i, d in enumerate(self.discriminators):
700 | y_d_r, fmap_r = d(y)
701 | y_d_g, fmap_g = d(y_hat)
702 | # for j in range(len(fmap_r)):
703 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
704 | y_d_rs.append(y_d_r)
705 | y_d_gs.append(y_d_g)
706 | fmap_rs.append(fmap_r)
707 | fmap_gs.append(fmap_g)
708 |
709 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
710 |
711 |
712 | class DiscriminatorS(torch.nn.Module):
713 | def __init__(self, use_spectral_norm=False):
714 | super(DiscriminatorS, self).__init__()
715 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
716 | self.convs = nn.ModuleList(
717 | [
718 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
719 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
720 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
721 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
722 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
723 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
724 | ]
725 | )
726 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
727 |
728 | def forward(self, x):
729 | fmap = []
730 |
731 | for l in self.convs:
732 | x = l(x)
733 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
734 | fmap.append(x)
735 | x = self.conv_post(x)
736 | fmap.append(x)
737 | x = torch.flatten(x, 1, -1)
738 |
739 | return x, fmap
740 |
741 |
742 | class DiscriminatorP(torch.nn.Module):
743 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
744 | super(DiscriminatorP, self).__init__()
745 | self.period = period
746 | self.use_spectral_norm = use_spectral_norm
747 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
748 | self.convs = nn.ModuleList(
749 | [
750 | norm_f(
751 | Conv2d(
752 | 1,
753 | 32,
754 | (kernel_size, 1),
755 | (stride, 1),
756 | padding=(get_padding(kernel_size, 1), 0),
757 | )
758 | ),
759 | norm_f(
760 | Conv2d(
761 | 32,
762 | 128,
763 | (kernel_size, 1),
764 | (stride, 1),
765 | padding=(get_padding(kernel_size, 1), 0),
766 | )
767 | ),
768 | norm_f(
769 | Conv2d(
770 | 128,
771 | 512,
772 | (kernel_size, 1),
773 | (stride, 1),
774 | padding=(get_padding(kernel_size, 1), 0),
775 | )
776 | ),
777 | norm_f(
778 | Conv2d(
779 | 512,
780 | 1024,
781 | (kernel_size, 1),
782 | (stride, 1),
783 | padding=(get_padding(kernel_size, 1), 0),
784 | )
785 | ),
786 | norm_f(
787 | Conv2d(
788 | 1024,
789 | 1024,
790 | (kernel_size, 1),
791 | 1,
792 | padding=(get_padding(kernel_size, 1), 0),
793 | )
794 | ),
795 | ]
796 | )
797 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
798 |
799 | def forward(self, x):
800 | fmap = []
801 |
802 | # 1d to 2d
803 | b, c, t = x.shape
804 | if t % self.period != 0: # pad first
805 | n_pad = self.period - (t % self.period)
806 | x = F.pad(x, (0, n_pad), "reflect")
807 | t = t + n_pad
808 | x = x.view(b, c, t // self.period, self.period)
809 |
810 | for l in self.convs:
811 | x = l(x)
812 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
813 | fmap.append(x)
814 | x = self.conv_post(x)
815 | fmap.append(x)
816 | x = torch.flatten(x, 1, -1)
817 |
818 | return x, fmap
819 |
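# Worked example of the 1d-to-2d fold above (illustrative): with period = 3
# and t = 100, n_pad = 2, so the signal is reflect-padded to t = 102 and
# viewed as (b, c, 34, 3); the (kernel, 1) convolutions then scan across
# same-phase samples, one column per position within the period.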
--------------------------------------------------------------------------------
/infer_pack/models_onnx_moess.py:
--------------------------------------------------------------------------------
1 | import math, pdb, os
2 | from time import time as ttime
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 | from infer_pack import modules
7 | from infer_pack import attentions
8 | from infer_pack import commons
9 | from infer_pack.commons import init_weights, get_padding
10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12 | from infer_pack.commons import init_weights
13 | import numpy as np
14 | from infer_pack import commons
15 |
16 |
17 | class TextEncoder256(nn.Module):
18 | def __init__(
19 | self,
20 | out_channels,
21 | hidden_channels,
22 | filter_channels,
23 | n_heads,
24 | n_layers,
25 | kernel_size,
26 | p_dropout,
27 | f0=True,
28 | ):
29 | super().__init__()
30 | self.out_channels = out_channels
31 | self.hidden_channels = hidden_channels
32 | self.filter_channels = filter_channels
33 | self.n_heads = n_heads
34 | self.n_layers = n_layers
35 | self.kernel_size = kernel_size
36 | self.p_dropout = p_dropout
37 | self.emb_phone = nn.Linear(256, hidden_channels)
38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39 |         if f0:
40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41 | self.encoder = attentions.Encoder(
42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43 | )
44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45 |
46 | def forward(self, phone, pitch, lengths):
47 |         if pitch is None:
48 | x = self.emb_phone(phone)
49 | else:
50 | x = self.emb_phone(phone) + self.emb_pitch(pitch)
51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52 | x = self.lrelu(x)
53 | x = torch.transpose(x, 1, -1) # [b, h, t]
54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55 | x.dtype
56 | )
57 | x = self.encoder(x * x_mask, x_mask)
58 | stats = self.proj(x) * x_mask
59 |
60 | m, logs = torch.split(stats, self.out_channels, dim=1)
61 | return m, logs, x_mask
62 |
63 |
64 | class TextEncoder256Sim(nn.Module):
65 | def __init__(
66 | self,
67 | out_channels,
68 | hidden_channels,
69 | filter_channels,
70 | n_heads,
71 | n_layers,
72 | kernel_size,
73 | p_dropout,
74 | f0=True,
75 | ):
76 | super().__init__()
77 | self.out_channels = out_channels
78 | self.hidden_channels = hidden_channels
79 | self.filter_channels = filter_channels
80 | self.n_heads = n_heads
81 | self.n_layers = n_layers
82 | self.kernel_size = kernel_size
83 | self.p_dropout = p_dropout
84 | self.emb_phone = nn.Linear(256, hidden_channels)
85 | self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86 |         if f0:
87 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88 | self.encoder = attentions.Encoder(
89 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90 | )
91 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
92 |
93 | def forward(self, phone, pitch, lengths):
94 |         if pitch is None:
95 | x = self.emb_phone(phone)
96 | else:
97 | x = self.emb_phone(phone) + self.emb_pitch(pitch)
98 | x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99 | x = self.lrelu(x)
100 | x = torch.transpose(x, 1, -1) # [b, h, t]
101 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102 | x.dtype
103 | )
104 | x = self.encoder(x * x_mask, x_mask)
105 | x = self.proj(x) * x_mask
106 | return x, x_mask
107 |
108 |
109 | class ResidualCouplingBlock(nn.Module):
110 | def __init__(
111 | self,
112 | channels,
113 | hidden_channels,
114 | kernel_size,
115 | dilation_rate,
116 | n_layers,
117 | n_flows=4,
118 | gin_channels=0,
119 | ):
120 | super().__init__()
121 | self.channels = channels
122 | self.hidden_channels = hidden_channels
123 | self.kernel_size = kernel_size
124 | self.dilation_rate = dilation_rate
125 | self.n_layers = n_layers
126 | self.n_flows = n_flows
127 | self.gin_channels = gin_channels
128 |
129 | self.flows = nn.ModuleList()
130 | for i in range(n_flows):
131 | self.flows.append(
132 | modules.ResidualCouplingLayer(
133 | channels,
134 | hidden_channels,
135 | kernel_size,
136 | dilation_rate,
137 | n_layers,
138 | gin_channels=gin_channels,
139 | mean_only=True,
140 | )
141 | )
142 | self.flows.append(modules.Flip())
143 |
144 | def forward(self, x, x_mask, g=None, reverse=False):
145 | if not reverse:
146 | for flow in self.flows:
147 | x, _ = flow(x, x_mask, g=g, reverse=reverse)
148 | else:
149 | for flow in reversed(self.flows):
150 | x = flow(x, x_mask, g=g, reverse=reverse)
151 | return x
152 |
153 | def remove_weight_norm(self):
154 | for i in range(self.n_flows):
155 | self.flows[i * 2].remove_weight_norm()
156 |
157 |
158 | class PosteriorEncoder(nn.Module):
159 | def __init__(
160 | self,
161 | in_channels,
162 | out_channels,
163 | hidden_channels,
164 | kernel_size,
165 | dilation_rate,
166 | n_layers,
167 | gin_channels=0,
168 | ):
169 | super().__init__()
170 | self.in_channels = in_channels
171 | self.out_channels = out_channels
172 | self.hidden_channels = hidden_channels
173 | self.kernel_size = kernel_size
174 | self.dilation_rate = dilation_rate
175 | self.n_layers = n_layers
176 | self.gin_channels = gin_channels
177 |
178 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
179 | self.enc = modules.WN(
180 | hidden_channels,
181 | kernel_size,
182 | dilation_rate,
183 | n_layers,
184 | gin_channels=gin_channels,
185 | )
186 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
187 |
188 | def forward(self, x, x_lengths, g=None):
189 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
190 | x.dtype
191 | )
192 | x = self.pre(x) * x_mask
193 | x = self.enc(x, x_mask, g=g)
194 | stats = self.proj(x) * x_mask
195 | m, logs = torch.split(stats, self.out_channels, dim=1)
196 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
197 | return z, m, logs, x_mask
198 |
199 | def remove_weight_norm(self):
200 | self.enc.remove_weight_norm()
201 |
202 |
203 | class Generator(torch.nn.Module):
204 | def __init__(
205 | self,
206 | initial_channel,
207 | resblock,
208 | resblock_kernel_sizes,
209 | resblock_dilation_sizes,
210 | upsample_rates,
211 | upsample_initial_channel,
212 | upsample_kernel_sizes,
213 | gin_channels=0,
214 | ):
215 | super(Generator, self).__init__()
216 | self.num_kernels = len(resblock_kernel_sizes)
217 | self.num_upsamples = len(upsample_rates)
218 | self.conv_pre = Conv1d(
219 | initial_channel, upsample_initial_channel, 7, 1, padding=3
220 | )
221 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
222 |
223 | self.ups = nn.ModuleList()
224 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
225 | self.ups.append(
226 | weight_norm(
227 | ConvTranspose1d(
228 | upsample_initial_channel // (2**i),
229 | upsample_initial_channel // (2 ** (i + 1)),
230 | k,
231 | u,
232 | padding=(k - u) // 2,
233 | )
234 | )
235 | )
236 |
237 | self.resblocks = nn.ModuleList()
238 | for i in range(len(self.ups)):
239 | ch = upsample_initial_channel // (2 ** (i + 1))
240 | for j, (k, d) in enumerate(
241 | zip(resblock_kernel_sizes, resblock_dilation_sizes)
242 | ):
243 | self.resblocks.append(resblock(ch, k, d))
244 |
245 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
246 | self.ups.apply(init_weights)
247 |
248 | if gin_channels != 0:
249 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
250 |
251 | def forward(self, x, g=None):
252 | x = self.conv_pre(x)
253 | if g is not None:
254 | x = x + self.cond(g)
255 |
256 | for i in range(self.num_upsamples):
257 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
258 | x = self.ups[i](x)
259 | xs = None
260 | for j in range(self.num_kernels):
261 | if xs is None:
262 | xs = self.resblocks[i * self.num_kernels + j](x)
263 | else:
264 | xs += self.resblocks[i * self.num_kernels + j](x)
265 | x = xs / self.num_kernels
266 | x = F.leaky_relu(x)
267 | x = self.conv_post(x)
268 | x = torch.tanh(x)
269 |
270 | return x
271 |
272 | def remove_weight_norm(self):
273 | for l in self.ups:
274 | remove_weight_norm(l)
275 | for l in self.resblocks:
276 | l.remove_weight_norm()
277 |
278 |
279 | class SineGen(torch.nn.Module):
280 | """Definition of sine generator
281 | SineGen(samp_rate, harmonic_num = 0,
282 | sine_amp = 0.1, noise_std = 0.003,
283 | voiced_threshold = 0,
284 | flag_for_pulse=False)
285 | samp_rate: sampling rate in Hz
286 | harmonic_num: number of harmonic overtones (default 0)
287 |     sine_amp: amplitude of sine waveform (default 0.1)
288 |     noise_std: std of Gaussian noise (default 0.003)
289 |     voiced_threshold: F0 threshold for U/V classification (default 0)
290 |     flag_for_pulse: this SineGen is used inside PulseGen (default False)
291 | Note: when flag_for_pulse is True, the first time step of a voiced
292 | segment is always sin(np.pi) or cos(0)
293 | """
294 |
295 | def __init__(
296 | self,
297 | samp_rate,
298 | harmonic_num=0,
299 | sine_amp=0.1,
300 | noise_std=0.003,
301 | voiced_threshold=0,
302 | flag_for_pulse=False,
303 | ):
304 | super(SineGen, self).__init__()
305 | self.sine_amp = sine_amp
306 | self.noise_std = noise_std
307 | self.harmonic_num = harmonic_num
308 | self.dim = self.harmonic_num + 1
309 | self.sampling_rate = samp_rate
310 | self.voiced_threshold = voiced_threshold
311 |
312 | def _f02uv(self, f0):
313 | # generate uv signal
314 | uv = torch.ones_like(f0)
315 | uv = uv * (f0 > self.voiced_threshold)
316 | return uv
317 |
318 | def forward(self, f0, upp):
319 | """sine_tensor, uv = forward(f0)
320 | input F0: tensor(batchsize=1, length, dim=1)
321 | f0 for unvoiced steps should be 0
322 | output sine_tensor: tensor(batchsize=1, length, dim)
323 | output uv: tensor(batchsize=1, length, 1)
324 | """
325 | with torch.no_grad():
326 | f0 = f0[:, None].transpose(1, 2)
327 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
328 | # fundamental component
329 | f0_buf[:, :, 0] = f0[:, :, 0]
330 | for idx in np.arange(self.harmonic_num):
331 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
332 | idx + 2
333 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
334 |             rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har harmonic products cannot be folded into later post-processing
335 | rand_ini = torch.rand(
336 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
337 | )
338 | rand_ini[:, 0] = 0
339 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
340 |             tmp_over_one = torch.cumsum(rad_values, 1)  # a % 1 here would prevent optimizing the cumsum below
341 | tmp_over_one *= upp
342 | tmp_over_one = F.interpolate(
343 | tmp_over_one.transpose(2, 1),
344 | scale_factor=upp,
345 | mode="linear",
346 | align_corners=True,
347 | ).transpose(2, 1)
348 | rad_values = F.interpolate(
349 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
350 | ).transpose(
351 | 2, 1
352 |             )
353 | tmp_over_one %= 1
354 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
355 | cumsum_shift = torch.zeros_like(rad_values)
356 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
357 | sine_waves = torch.sin(
358 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
359 | )
360 | sine_waves = sine_waves * self.sine_amp
361 | uv = self._f02uv(f0)
362 | uv = F.interpolate(
363 | uv.transpose(2, 1), scale_factor=upp, mode="nearest"
364 | ).transpose(2, 1)
365 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
366 | noise = noise_amp * torch.randn_like(sine_waves)
367 | sine_waves = sine_waves * uv + noise
368 | return sine_waves, uv, noise
369 |
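# How forward() above builds the sinusoid (informal summary): rad_values is
# the per-sample phase increment f0/sr in [0, 1) "turns"; its cumulative sum
# is the running phase. tmp_over_one tracks that phase modulo 1 so wrap
# points survive the linear upsampling, and cumsum_shift subtracts one full
# turn at each wrap, keeping the argument of sin() continuous. For a constant
# f0 = 100 Hz at sr = 40000, the phase advances 0.0025 turns per sample and
# wraps every 400 samples, i.e. one sine period.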
370 |
371 | class SourceModuleHnNSF(torch.nn.Module):
372 | """SourceModule for hn-nsf
373 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
374 | add_noise_std=0.003, voiced_threshod=0)
375 | sampling_rate: sampling_rate in Hz
376 |     harmonic_num: number of harmonics above F0 (default: 0)
377 |     sine_amp: amplitude of sine source signal (default: 0.1)
378 |     add_noise_std: std of additive Gaussian noise (default: 0.003)
379 |         note that the amplitude of noise in unvoiced segments is decided
380 |         by sine_amp
381 |     voiced_threshod: threshold to set U/V given F0 (default: 0)
382 |     Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
383 |     F0_sampled (batchsize, length, 1)
384 |     Sine_source (batchsize, length, 1)
385 |     noise_source (batchsize, length, 1)
386 | uv (batchsize, length, 1)
387 | """
388 |
389 | def __init__(
390 | self,
391 | sampling_rate,
392 | harmonic_num=0,
393 | sine_amp=0.1,
394 | add_noise_std=0.003,
395 | voiced_threshod=0,
396 | is_half=True,
397 | ):
398 | super(SourceModuleHnNSF, self).__init__()
399 |
400 | self.sine_amp = sine_amp
401 | self.noise_std = add_noise_std
402 | self.is_half = is_half
403 | # to produce sine waveforms
404 | self.l_sin_gen = SineGen(
405 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
406 | )
407 |
408 | # to merge source harmonics into a single excitation
409 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
410 | self.l_tanh = torch.nn.Tanh()
411 |
412 | def forward(self, x, upp=None):
413 | sine_wavs, uv, _ = self.l_sin_gen(x, upp)
414 | if self.is_half:
415 | sine_wavs = sine_wavs.half()
416 | sine_merge = self.l_tanh(self.l_linear(sine_wavs))
417 | return sine_merge, None, None # noise, uv
418 |
419 |
420 | class GeneratorNSF(torch.nn.Module):
421 | def __init__(
422 | self,
423 | initial_channel,
424 | resblock,
425 | resblock_kernel_sizes,
426 | resblock_dilation_sizes,
427 | upsample_rates,
428 | upsample_initial_channel,
429 | upsample_kernel_sizes,
430 | gin_channels,
431 | sr,
432 | is_half=False,
433 | ):
434 | super(GeneratorNSF, self).__init__()
435 | self.num_kernels = len(resblock_kernel_sizes)
436 | self.num_upsamples = len(upsample_rates)
437 |
438 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
439 | self.m_source = SourceModuleHnNSF(
440 | sampling_rate=sr, harmonic_num=0, is_half=is_half
441 | )
442 | self.noise_convs = nn.ModuleList()
443 | self.conv_pre = Conv1d(
444 | initial_channel, upsample_initial_channel, 7, 1, padding=3
445 | )
446 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
447 |
448 | self.ups = nn.ModuleList()
449 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
450 | c_cur = upsample_initial_channel // (2 ** (i + 1))
451 | self.ups.append(
452 | weight_norm(
453 | ConvTranspose1d(
454 | upsample_initial_channel // (2**i),
455 | upsample_initial_channel // (2 ** (i + 1)),
456 | k,
457 | u,
458 | padding=(k - u) // 2,
459 | )
460 | )
461 | )
462 | if i + 1 < len(upsample_rates):
463 | stride_f0 = np.prod(upsample_rates[i + 1 :])
464 | self.noise_convs.append(
465 | Conv1d(
466 | 1,
467 | c_cur,
468 | kernel_size=stride_f0 * 2,
469 | stride=stride_f0,
470 | padding=stride_f0 // 2,
471 | )
472 | )
473 | else:
474 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
475 |
476 | self.resblocks = nn.ModuleList()
477 | for i in range(len(self.ups)):
478 | ch = upsample_initial_channel // (2 ** (i + 1))
479 | for j, (k, d) in enumerate(
480 | zip(resblock_kernel_sizes, resblock_dilation_sizes)
481 | ):
482 | self.resblocks.append(resblock(ch, k, d))
483 |
484 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
485 | self.ups.apply(init_weights)
486 |
487 | if gin_channels != 0:
488 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
489 |
490 | self.upp = np.prod(upsample_rates)
491 |
492 | def forward(self, x, f0, g=None):
493 | har_source, noi_source, uv = self.m_source(f0, self.upp)
494 | har_source = har_source.transpose(1, 2)
495 | x = self.conv_pre(x)
496 | if g is not None:
497 | x = x + self.cond(g)
498 |
499 | for i in range(self.num_upsamples):
500 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
501 | x = self.ups[i](x)
502 | x_source = self.noise_convs[i](har_source)
503 | x = x + x_source
504 | xs = None
505 | for j in range(self.num_kernels):
506 | if xs is None:
507 | xs = self.resblocks[i * self.num_kernels + j](x)
508 | else:
509 | xs += self.resblocks[i * self.num_kernels + j](x)
510 | x = xs / self.num_kernels
511 | x = F.leaky_relu(x)
512 | x = self.conv_post(x)
513 | x = torch.tanh(x)
514 | return x
515 |
516 | def remove_weight_norm(self):
517 | for l in self.ups:
518 | remove_weight_norm(l)
519 | for l in self.resblocks:
520 | l.remove_weight_norm()
521 |
522 |
523 | sr2sr = {
524 | "32k": 32000,
525 | "40k": 40000,
526 | "48k": 48000,
527 | }
528 |
529 |
530 | class SynthesizerTrnMs256NSFsidM(nn.Module):
531 | def __init__(
532 | self,
533 | spec_channels,
534 | segment_size,
535 | inter_channels,
536 | hidden_channels,
537 | filter_channels,
538 | n_heads,
539 | n_layers,
540 | kernel_size,
541 | p_dropout,
542 | resblock,
543 | resblock_kernel_sizes,
544 | resblock_dilation_sizes,
545 | upsample_rates,
546 | upsample_initial_channel,
547 | upsample_kernel_sizes,
548 | spk_embed_dim,
549 | gin_channels,
550 | sr,
551 | **kwargs
552 | ):
553 | super().__init__()
554 |         if isinstance(sr, str):
555 | sr = sr2sr[sr]
556 | self.spec_channels = spec_channels
557 | self.inter_channels = inter_channels
558 | self.hidden_channels = hidden_channels
559 | self.filter_channels = filter_channels
560 | self.n_heads = n_heads
561 | self.n_layers = n_layers
562 | self.kernel_size = kernel_size
563 | self.p_dropout = p_dropout
564 | self.resblock = resblock
565 | self.resblock_kernel_sizes = resblock_kernel_sizes
566 | self.resblock_dilation_sizes = resblock_dilation_sizes
567 | self.upsample_rates = upsample_rates
568 | self.upsample_initial_channel = upsample_initial_channel
569 | self.upsample_kernel_sizes = upsample_kernel_sizes
570 | self.segment_size = segment_size
571 | self.gin_channels = gin_channels
572 | # self.hop_length = hop_length#
573 | self.spk_embed_dim = spk_embed_dim
574 | self.enc_p = TextEncoder256(
575 | inter_channels,
576 | hidden_channels,
577 | filter_channels,
578 | n_heads,
579 | n_layers,
580 | kernel_size,
581 | p_dropout,
582 | )
583 | self.dec = GeneratorNSF(
584 | inter_channels,
585 | resblock,
586 | resblock_kernel_sizes,
587 | resblock_dilation_sizes,
588 | upsample_rates,
589 | upsample_initial_channel,
590 | upsample_kernel_sizes,
591 | gin_channels=gin_channels,
592 | sr=sr,
593 | is_half=kwargs["is_half"],
594 | )
595 | self.enc_q = PosteriorEncoder(
596 | spec_channels,
597 | inter_channels,
598 | hidden_channels,
599 | 5,
600 | 1,
601 | 16,
602 | gin_channels=gin_channels,
603 | )
604 | self.flow = ResidualCouplingBlock(
605 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
606 | )
607 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
608 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
609 |
610 | def remove_weight_norm(self):
611 | self.dec.remove_weight_norm()
612 | self.flow.remove_weight_norm()
613 | self.enc_q.remove_weight_norm()
614 |
615 | def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
616 | g = self.emb_g(sid).unsqueeze(-1)
617 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
618 | z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
619 | z = self.flow(z_p, x_mask, g=g, reverse=True)
620 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
621 | return o
622 |
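# Export sketch (illustrative; the exact input set and file name are
# assumptions, not the project's official recipe). This forward() takes the
# noise tensor rnd explicitly, which keeps tracing deterministic:
#     dummy = (phone, phone_lengths, pitch, nsff0, sid, rnd)
#     torch.onnx.export(net, dummy, "rvc_moess.onnx",
#                       input_names=["phone", "phone_lengths", "pitch",
#                                    "nsff0", "sid", "rnd"],
#                       output_names=["audio"])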
623 |
624 | class SynthesizerTrnMs256NSFsid_sim(nn.Module):
625 | """
626 | Synthesizer for Training
627 | """
628 |
629 | def __init__(
630 | self,
631 | spec_channels,
632 | segment_size,
633 | inter_channels,
634 | hidden_channels,
635 | filter_channels,
636 | n_heads,
637 | n_layers,
638 | kernel_size,
639 | p_dropout,
640 | resblock,
641 | resblock_kernel_sizes,
642 | resblock_dilation_sizes,
643 | upsample_rates,
644 | upsample_initial_channel,
645 | upsample_kernel_sizes,
646 | spk_embed_dim,
647 | # hop_length,
648 | gin_channels=0,
649 | use_sdp=True,
650 | **kwargs
651 | ):
652 | super().__init__()
653 | self.spec_channels = spec_channels
654 | self.inter_channels = inter_channels
655 | self.hidden_channels = hidden_channels
656 | self.filter_channels = filter_channels
657 | self.n_heads = n_heads
658 | self.n_layers = n_layers
659 | self.kernel_size = kernel_size
660 | self.p_dropout = p_dropout
661 | self.resblock = resblock
662 | self.resblock_kernel_sizes = resblock_kernel_sizes
663 | self.resblock_dilation_sizes = resblock_dilation_sizes
664 | self.upsample_rates = upsample_rates
665 | self.upsample_initial_channel = upsample_initial_channel
666 | self.upsample_kernel_sizes = upsample_kernel_sizes
667 | self.segment_size = segment_size
668 | self.gin_channels = gin_channels
669 | # self.hop_length = hop_length#
670 | self.spk_embed_dim = spk_embed_dim
671 | self.enc_p = TextEncoder256Sim(
672 | inter_channels,
673 | hidden_channels,
674 | filter_channels,
675 | n_heads,
676 | n_layers,
677 | kernel_size,
678 | p_dropout,
679 | )
680 | self.dec = GeneratorNSF(
681 | inter_channels,
682 | resblock,
683 | resblock_kernel_sizes,
684 | resblock_dilation_sizes,
685 | upsample_rates,
686 | upsample_initial_channel,
687 | upsample_kernel_sizes,
688 | gin_channels=gin_channels,
689 |             sr=kwargs["sr"], is_half=kwargs["is_half"],  # sr is required by GeneratorNSF; assumed to be passed via **kwargs
690 | )
691 |
692 | self.flow = ResidualCouplingBlock(
693 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
694 | )
695 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
696 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
697 |
698 | def remove_weight_norm(self):
699 | self.dec.remove_weight_norm()
700 | self.flow.remove_weight_norm()
701 |         # this variant has no enc_q, so there is nothing else to strip
702 |
703 | def forward(
704 | self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
705 |     ):  # y (the spec input) is no longer needed here
706 |         g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis and broadcasts over t
707 | x, x_mask = self.enc_p(phone, pitch, phone_lengths)
708 | x = self.flow(x, x_mask, g=g, reverse=True)
709 | o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
710 | return o
711 |
712 |
713 | class MultiPeriodDiscriminator(torch.nn.Module):
714 | def __init__(self, use_spectral_norm=False):
715 | super(MultiPeriodDiscriminator, self).__init__()
716 | periods = [2, 3, 5, 7, 11, 17]
717 | # periods = [3, 5, 7, 11, 17, 23, 37]
718 |
719 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
720 | discs = discs + [
721 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
722 | ]
723 | self.discriminators = nn.ModuleList(discs)
724 |
725 | def forward(self, y, y_hat):
726 |         y_d_rs = []
727 | y_d_gs = []
728 | fmap_rs = []
729 | fmap_gs = []
730 | for i, d in enumerate(self.discriminators):
731 | y_d_r, fmap_r = d(y)
732 | y_d_g, fmap_g = d(y_hat)
733 | # for j in range(len(fmap_r)):
734 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
735 | y_d_rs.append(y_d_r)
736 | y_d_gs.append(y_d_g)
737 | fmap_rs.append(fmap_r)
738 | fmap_gs.append(fmap_g)
739 |
740 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
741 |
742 |
743 | class DiscriminatorS(torch.nn.Module):
744 | def __init__(self, use_spectral_norm=False):
745 | super(DiscriminatorS, self).__init__()
746 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
747 | self.convs = nn.ModuleList(
748 | [
749 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
750 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
751 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
752 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
753 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
754 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
755 | ]
756 | )
757 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
758 |
759 | def forward(self, x):
760 | fmap = []
761 |
762 | for l in self.convs:
763 | x = l(x)
764 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
765 | fmap.append(x)
766 | x = self.conv_post(x)
767 | fmap.append(x)
768 | x = torch.flatten(x, 1, -1)
769 |
770 | return x, fmap
771 |
772 |
773 | class DiscriminatorP(torch.nn.Module):
774 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
775 | super(DiscriminatorP, self).__init__()
776 | self.period = period
777 | self.use_spectral_norm = use_spectral_norm
778 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
779 | self.convs = nn.ModuleList(
780 | [
781 | norm_f(
782 | Conv2d(
783 | 1,
784 | 32,
785 | (kernel_size, 1),
786 | (stride, 1),
787 | padding=(get_padding(kernel_size, 1), 0),
788 | )
789 | ),
790 | norm_f(
791 | Conv2d(
792 | 32,
793 | 128,
794 | (kernel_size, 1),
795 | (stride, 1),
796 | padding=(get_padding(kernel_size, 1), 0),
797 | )
798 | ),
799 | norm_f(
800 | Conv2d(
801 | 128,
802 | 512,
803 | (kernel_size, 1),
804 | (stride, 1),
805 | padding=(get_padding(kernel_size, 1), 0),
806 | )
807 | ),
808 | norm_f(
809 | Conv2d(
810 | 512,
811 | 1024,
812 | (kernel_size, 1),
813 | (stride, 1),
814 | padding=(get_padding(kernel_size, 1), 0),
815 | )
816 | ),
817 | norm_f(
818 | Conv2d(
819 | 1024,
820 | 1024,
821 | (kernel_size, 1),
822 | 1,
823 | padding=(get_padding(kernel_size, 1), 0),
824 | )
825 | ),
826 | ]
827 | )
828 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
829 |
830 | def forward(self, x):
831 | fmap = []
832 |
833 | # 1d to 2d
834 | b, c, t = x.shape
835 | if t % self.period != 0: # pad first
836 | n_pad = self.period - (t % self.period)
837 | x = F.pad(x, (0, n_pad), "reflect")
838 | t = t + n_pad
839 | x = x.view(b, c, t // self.period, self.period)
840 |
841 | for l in self.convs:
842 | x = l(x)
843 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
844 | fmap.append(x)
845 | x = self.conv_post(x)
846 | fmap.append(x)
847 | x = torch.flatten(x, 1, -1)
848 |
849 | return x, fmap
850 |
--------------------------------------------------------------------------------
/infer_pack/modelsv2.py:
--------------------------------------------------------------------------------
1 | import math, pdb, os
2 | from time import time as ttime
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 | from infer_pack import modules
7 | from infer_pack import attentions
8 | from infer_pack import commons
9 | from infer_pack.commons import init_weights, get_padding
10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12 | from infer_pack.commons import init_weights
13 | import numpy as np
14 | from infer_pack import commons
15 |
16 |
17 | class TextEncoder256(nn.Module):
18 | def __init__(
19 | self,
20 | out_channels,
21 | hidden_channels,
22 | filter_channels,
23 | n_heads,
24 | n_layers,
25 | kernel_size,
26 | p_dropout,
27 | f0=True,
28 | ):
29 | super().__init__()
30 | self.out_channels = out_channels
31 | self.hidden_channels = hidden_channels
32 | self.filter_channels = filter_channels
33 | self.n_heads = n_heads
34 | self.n_layers = n_layers
35 | self.kernel_size = kernel_size
36 | self.p_dropout = p_dropout
37 | self.emb_phone = nn.Linear(256, hidden_channels)
38 | self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39 |         if f0:
40 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41 | self.encoder = attentions.Encoder(
42 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43 | )
44 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45 |
46 | def forward(self, phone, pitch, lengths):
47 |         if pitch is None:
48 | x = self.emb_phone(phone)
49 | else:
50 | x = self.emb_phone(phone) + self.emb_pitch(pitch)
51 | x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52 | x = self.lrelu(x)
53 | x = torch.transpose(x, 1, -1) # [b, h, t]
54 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55 | x.dtype
56 | )
57 | x = self.encoder(x * x_mask, x_mask)
58 | stats = self.proj(x) * x_mask
59 |
60 | m, logs = torch.split(stats, self.out_channels, dim=1)
61 | return m, logs, x_mask
62 | class TextEncoder768(nn.Module):
63 | def __init__(
64 | self,
65 | out_channels,
66 | hidden_channels,
67 | filter_channels,
68 | n_heads,
69 | n_layers,
70 | kernel_size,
71 | p_dropout,
72 | f0=True,
73 | ):
74 | super().__init__()
75 | self.out_channels = out_channels
76 | self.hidden_channels = hidden_channels
77 | self.filter_channels = filter_channels
78 | self.n_heads = n_heads
79 | self.n_layers = n_layers
80 | self.kernel_size = kernel_size
81 | self.p_dropout = p_dropout
82 | self.emb_phone = nn.Linear(768, hidden_channels)
83 | self.lrelu = nn.LeakyReLU(0.1, inplace=True)
84 |         if f0:
85 | self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
86 | self.encoder = attentions.Encoder(
87 | hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
88 | )
89 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
90 |
91 | def forward(self, phone, pitch, lengths):
92 |         if pitch is None:
93 | x = self.emb_phone(phone)
94 | else:
95 | x = self.emb_phone(phone) + self.emb_pitch(pitch)
96 | x = x * math.sqrt(self.hidden_channels) # [b, t, h]
97 | x = self.lrelu(x)
98 | x = torch.transpose(x, 1, -1) # [b, h, t]
99 | x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
100 | x.dtype
101 | )
102 | x = self.encoder(x * x_mask, x_mask)
103 | stats = self.proj(x) * x_mask
104 |
105 | m, logs = torch.split(stats, self.out_channels, dim=1)
106 | return m, logs, x_mask
107 |
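# Call sketch (illustrative argument values): the only difference from
# TextEncoder256 is the 768-wide phone projection for the wider content
# features used by the v2 models below:
#     enc = TextEncoder768(192, 192, 768, 2, 6, 3, 0.1)
#     m, logs, mask = enc(phone, pitch, lengths)
#     # phone: (b, t, 768) float, pitch: (b, t) long or None, lengths: (b,)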
108 | class ResidualCouplingBlock(nn.Module):
109 | def __init__(
110 | self,
111 | channels,
112 | hidden_channels,
113 | kernel_size,
114 | dilation_rate,
115 | n_layers,
116 | n_flows=4,
117 | gin_channels=0,
118 | ):
119 | super().__init__()
120 | self.channels = channels
121 | self.hidden_channels = hidden_channels
122 | self.kernel_size = kernel_size
123 | self.dilation_rate = dilation_rate
124 | self.n_layers = n_layers
125 | self.n_flows = n_flows
126 | self.gin_channels = gin_channels
127 |
128 | self.flows = nn.ModuleList()
129 | for i in range(n_flows):
130 | self.flows.append(
131 | modules.ResidualCouplingLayer(
132 | channels,
133 | hidden_channels,
134 | kernel_size,
135 | dilation_rate,
136 | n_layers,
137 | gin_channels=gin_channels,
138 | mean_only=True,
139 | )
140 | )
141 | self.flows.append(modules.Flip())
142 |
143 | def forward(self, x, x_mask, g=None, reverse=False):
144 | if not reverse:
145 | for flow in self.flows:
146 | x, _ = flow(x, x_mask, g=g, reverse=reverse)
147 | else:
148 | for flow in reversed(self.flows):
149 | x = flow(x, x_mask, g=g, reverse=reverse)
150 | return x
151 |
152 | def remove_weight_norm(self):
153 | for i in range(self.n_flows):
154 | self.flows[i * 2].remove_weight_norm()
155 |
156 |
157 | class PosteriorEncoder(nn.Module):
158 | def __init__(
159 | self,
160 | in_channels,
161 | out_channels,
162 | hidden_channels,
163 | kernel_size,
164 | dilation_rate,
165 | n_layers,
166 | gin_channels=0,
167 | ):
168 | super().__init__()
169 | self.in_channels = in_channels
170 | self.out_channels = out_channels
171 | self.hidden_channels = hidden_channels
172 | self.kernel_size = kernel_size
173 | self.dilation_rate = dilation_rate
174 | self.n_layers = n_layers
175 | self.gin_channels = gin_channels
176 |
177 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
178 | self.enc = modules.WN(
179 | hidden_channels,
180 | kernel_size,
181 | dilation_rate,
182 | n_layers,
183 | gin_channels=gin_channels,
184 | )
185 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
186 |
187 | def forward(self, x, x_lengths, g=None):
188 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
189 | x.dtype
190 | )
191 | x = self.pre(x) * x_mask
192 | x = self.enc(x, x_mask, g=g)
193 | stats = self.proj(x) * x_mask
194 | m, logs = torch.split(stats, self.out_channels, dim=1)
195 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
196 | return z, m, logs, x_mask
197 |
198 | def remove_weight_norm(self):
199 | self.enc.remove_weight_norm()
200 |
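# The sampling line in forward() above is the standard reparameterization
# trick: the posterior N(m, exp(logs)^2) is drawn as
#     z = m + exp(logs) * eps,   eps ~ N(0, I)
# which keeps the sample differentiable with respect to m and logs.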
201 |
202 | class Generator(torch.nn.Module):
203 | def __init__(
204 | self,
205 | initial_channel,
206 | resblock,
207 | resblock_kernel_sizes,
208 | resblock_dilation_sizes,
209 | upsample_rates,
210 | upsample_initial_channel,
211 | upsample_kernel_sizes,
212 | gin_channels=0,
213 | ):
214 | super(Generator, self).__init__()
215 | self.num_kernels = len(resblock_kernel_sizes)
216 | self.num_upsamples = len(upsample_rates)
217 | self.conv_pre = Conv1d(
218 | initial_channel, upsample_initial_channel, 7, 1, padding=3
219 | )
220 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
221 |
222 | self.ups = nn.ModuleList()
223 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
224 | self.ups.append(
225 | weight_norm(
226 | ConvTranspose1d(
227 | upsample_initial_channel // (2**i),
228 | upsample_initial_channel // (2 ** (i + 1)),
229 | k,
230 | u,
231 | padding=(k - u) // 2,
232 | )
233 | )
234 | )
235 |
236 | self.resblocks = nn.ModuleList()
237 | for i in range(len(self.ups)):
238 | ch = upsample_initial_channel // (2 ** (i + 1))
239 | for j, (k, d) in enumerate(
240 | zip(resblock_kernel_sizes, resblock_dilation_sizes)
241 | ):
242 | self.resblocks.append(resblock(ch, k, d))
243 |
244 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
245 | self.ups.apply(init_weights)
246 |
247 | if gin_channels != 0:
248 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
249 |
250 | def forward(self, x, g=None):
251 | x = self.conv_pre(x)
252 | if g is not None:
253 | x = x + self.cond(g)
254 |
255 | for i in range(self.num_upsamples):
256 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
257 | x = self.ups[i](x)
258 | xs = None
259 | for j in range(self.num_kernels):
260 | if xs is None:
261 | xs = self.resblocks[i * self.num_kernels + j](x)
262 | else:
263 | xs += self.resblocks[i * self.num_kernels + j](x)
264 | x = xs / self.num_kernels
265 | x = F.leaky_relu(x)
266 | x = self.conv_post(x)
267 | x = torch.tanh(x)
268 |
269 | return x
270 |
271 | def remove_weight_norm(self):
272 | for l in self.ups:
273 | remove_weight_norm(l)
274 | for l in self.resblocks:
275 | l.remove_weight_norm()
276 |
277 |
278 | class SineGen(torch.nn.Module):
279 | """Definition of sine generator
280 | SineGen(samp_rate, harmonic_num = 0,
281 | sine_amp = 0.1, noise_std = 0.003,
282 | voiced_threshold = 0,
283 | flag_for_pulse=False)
284 | samp_rate: sampling rate in Hz
285 | harmonic_num: number of harmonic overtones (default 0)
286 |     sine_amp: amplitude of sine waveform (default 0.1)
287 |     noise_std: std of Gaussian noise (default 0.003)
288 |     voiced_threshold: F0 threshold for U/V classification (default 0)
289 |     flag_for_pulse: this SineGen is used inside PulseGen (default False)
290 | Note: when flag_for_pulse is True, the first time step of a voiced
291 | segment is always sin(np.pi) or cos(0)
292 | """
293 |
294 | def __init__(
295 | self,
296 | samp_rate,
297 | harmonic_num=0,
298 | sine_amp=0.1,
299 | noise_std=0.003,
300 | voiced_threshold=0,
301 | flag_for_pulse=False,
302 | ):
303 | super(SineGen, self).__init__()
304 | self.sine_amp = sine_amp
305 | self.noise_std = noise_std
306 | self.harmonic_num = harmonic_num
307 | self.dim = self.harmonic_num + 1
308 | self.sampling_rate = samp_rate
309 | self.voiced_threshold = voiced_threshold
310 |
311 | def _f02uv(self, f0):
312 | # generate uv signal
313 | uv = torch.ones_like(f0)
314 | uv = uv * (f0 > self.voiced_threshold)
315 | return uv
316 |
317 | def forward(self, f0, upp):
318 | """sine_tensor, uv = forward(f0)
319 | input F0: tensor(batchsize=1, length, dim=1)
320 | f0 for unvoiced steps should be 0
321 | output sine_tensor: tensor(batchsize=1, length, dim)
322 | output uv: tensor(batchsize=1, length, 1)
323 | """
324 | with torch.no_grad():
325 | f0 = f0[:, None].transpose(1, 2)
326 | f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
327 | # fundamental component
328 | f0_buf[:, :, 0] = f0[:, :, 0]
329 | for idx in np.arange(self.harmonic_num):
330 | f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
331 | idx + 2
332 | ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
333 |             rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har harmonic products cannot be folded into later post-processing
334 | rand_ini = torch.rand(
335 | f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
336 | )
337 | rand_ini[:, 0] = 0
338 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
339 |             tmp_over_one = torch.cumsum(rad_values, 1)  # a % 1 here would prevent optimizing the cumsum below
340 | tmp_over_one *= upp
341 | tmp_over_one = F.interpolate(
342 | tmp_over_one.transpose(2, 1),
343 | scale_factor=upp,
344 | mode="linear",
345 | align_corners=True,
346 | ).transpose(2, 1)
347 | rad_values = F.interpolate(
348 | rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
349 | ).transpose(
350 | 2, 1
351 |             )
352 | tmp_over_one %= 1
353 | tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
354 | cumsum_shift = torch.zeros_like(rad_values)
355 | cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
356 | sine_waves = torch.sin(
357 | torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
358 | )
359 | sine_waves = sine_waves * self.sine_amp
360 | uv = self._f02uv(f0)
361 | uv = F.interpolate(
362 | uv.transpose(2, 1), scale_factor=upp, mode="nearest"
363 | ).transpose(2, 1)
364 | noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
365 | noise = noise_amp * torch.randn_like(sine_waves)
366 | sine_waves = sine_waves * uv + noise
367 | return sine_waves, uv, noise
368 |
369 |
370 | class SourceModuleHnNSF(torch.nn.Module):
371 | """SourceModule for hn-nsf
372 | SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
373 | add_noise_std=0.003, voiced_threshod=0)
374 | sampling_rate: sampling_rate in Hz
375 |     harmonic_num: number of harmonics above F0 (default: 0)
376 |     sine_amp: amplitude of sine source signal (default: 0.1)
377 |     add_noise_std: std of additive Gaussian noise (default: 0.003)
378 |         note that the amplitude of noise in unvoiced segments is decided
379 |         by sine_amp
380 |     voiced_threshod: threshold to set U/V given F0 (default: 0)
381 |     Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
382 |     F0_sampled (batchsize, length, 1)
383 |     Sine_source (batchsize, length, 1)
384 |     noise_source (batchsize, length, 1)
385 | uv (batchsize, length, 1)
386 | """
387 |
388 | def __init__(
389 | self,
390 | sampling_rate,
391 | harmonic_num=0,
392 | sine_amp=0.1,
393 | add_noise_std=0.003,
394 | voiced_threshod=0,
395 | is_half=True,
396 | ):
397 | super(SourceModuleHnNSF, self).__init__()
398 |
399 | self.sine_amp = sine_amp
400 | self.noise_std = add_noise_std
401 | self.is_half = is_half
402 | # to produce sine waveforms
403 | self.l_sin_gen = SineGen(
404 | sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
405 | )
406 |
407 | # to merge source harmonics into a single excitation
408 | self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
409 | self.l_tanh = torch.nn.Tanh()
410 |
411 | def forward(self, x, upp=None):
412 | sine_wavs, uv, _ = self.l_sin_gen(x, upp)
413 | if self.is_half:
414 | sine_wavs = sine_wavs.half()
415 | sine_merge = self.l_tanh(self.l_linear(sine_wavs))
416 | return sine_merge, None, None # noise, uv
417 |
418 |
419 | class GeneratorNSF(torch.nn.Module):
420 | def __init__(
421 | self,
422 | initial_channel,
423 | resblock,
424 | resblock_kernel_sizes,
425 | resblock_dilation_sizes,
426 | upsample_rates,
427 | upsample_initial_channel,
428 | upsample_kernel_sizes,
429 | gin_channels,
430 | sr,
431 | is_half=False,
432 | ):
433 | super(GeneratorNSF, self).__init__()
434 | self.num_kernels = len(resblock_kernel_sizes)
435 | self.num_upsamples = len(upsample_rates)
436 |
437 | self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
438 | self.m_source = SourceModuleHnNSF(
439 | sampling_rate=sr, harmonic_num=0, is_half=is_half
440 | )
441 | self.noise_convs = nn.ModuleList()
442 | self.conv_pre = Conv1d(
443 | initial_channel, upsample_initial_channel, 7, 1, padding=3
444 | )
445 | resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
446 |
447 | self.ups = nn.ModuleList()
448 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
449 | c_cur = upsample_initial_channel // (2 ** (i + 1))
450 | self.ups.append(
451 | weight_norm(
452 | ConvTranspose1d(
453 | upsample_initial_channel // (2**i),
454 | upsample_initial_channel // (2 ** (i + 1)),
455 | k,
456 | u,
457 | padding=(k - u) // 2,
458 | )
459 | )
460 | )
461 | if i + 1 < len(upsample_rates):
462 | stride_f0 = np.prod(upsample_rates[i + 1 :])
463 | self.noise_convs.append(
464 | Conv1d(
465 | 1,
466 | c_cur,
467 | kernel_size=stride_f0 * 2,
468 | stride=stride_f0,
469 | padding=stride_f0 // 2,
470 | )
471 | )
472 | else:
473 | self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
474 |
475 | self.resblocks = nn.ModuleList()
476 | for i in range(len(self.ups)):
477 | ch = upsample_initial_channel // (2 ** (i + 1))
478 | for j, (k, d) in enumerate(
479 | zip(resblock_kernel_sizes, resblock_dilation_sizes)
480 | ):
481 | self.resblocks.append(resblock(ch, k, d))
482 |
483 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
484 | self.ups.apply(init_weights)
485 |
486 | if gin_channels != 0:
487 | self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
488 |
489 | self.upp = np.prod(upsample_rates)
490 |
491 | def forward(self, x, f0, g=None):
492 | har_source, noi_source, uv = self.m_source(f0, self.upp)
493 | har_source = har_source.transpose(1, 2)
494 | x = self.conv_pre(x)
495 | if g is not None:
496 | x = x + self.cond(g)
497 |
498 | for i in range(self.num_upsamples):
499 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
500 | x = self.ups[i](x)
501 | x_source = self.noise_convs[i](har_source)
502 | x = x + x_source
503 | xs = None
504 | for j in range(self.num_kernels):
505 | if xs is None:
506 | xs = self.resblocks[i * self.num_kernels + j](x)
507 | else:
508 | xs += self.resblocks[i * self.num_kernels + j](x)
509 | x = xs / self.num_kernels
510 | x = F.leaky_relu(x)
511 | x = self.conv_post(x)
512 | x = torch.tanh(x)
513 | return x
514 |
515 | def remove_weight_norm(self):
516 | for l in self.ups:
517 | remove_weight_norm(l)
518 | for l in self.resblocks:
519 | l.remove_weight_norm()
520 |
521 |
522 | sr2sr = {
523 | "32k": 32000,
524 | "40k": 40000,
525 | "48k": 48000,
526 | }
527 |
528 |
529 | class SynthesizerTrnMs256NSFsid(nn.Module):
530 | def __init__(
531 | self,
532 | spec_channels,
533 | segment_size,
534 | inter_channels,
535 | hidden_channels,
536 | filter_channels,
537 | n_heads,
538 | n_layers,
539 | kernel_size,
540 | p_dropout,
541 | resblock,
542 | resblock_kernel_sizes,
543 | resblock_dilation_sizes,
544 | upsample_rates,
545 | upsample_initial_channel,
546 | upsample_kernel_sizes,
547 | spk_embed_dim,
548 | gin_channels,
549 | sr,
550 | **kwargs
551 | ):
552 | super().__init__()
553 |         if isinstance(sr, str):
554 | sr = sr2sr[sr]
555 | self.spec_channels = spec_channels
556 | self.inter_channels = inter_channels
557 | self.hidden_channels = hidden_channels
558 | self.filter_channels = filter_channels
559 | self.n_heads = n_heads
560 | self.n_layers = n_layers
561 | self.kernel_size = kernel_size
562 | self.p_dropout = p_dropout
563 | self.resblock = resblock
564 | self.resblock_kernel_sizes = resblock_kernel_sizes
565 | self.resblock_dilation_sizes = resblock_dilation_sizes
566 | self.upsample_rates = upsample_rates
567 | self.upsample_initial_channel = upsample_initial_channel
568 | self.upsample_kernel_sizes = upsample_kernel_sizes
569 | self.segment_size = segment_size
570 | self.gin_channels = gin_channels
571 | # self.hop_length = hop_length#
572 | self.spk_embed_dim = spk_embed_dim
573 | self.enc_p = TextEncoder256(
574 | inter_channels,
575 | hidden_channels,
576 | filter_channels,
577 | n_heads,
578 | n_layers,
579 | kernel_size,
580 | p_dropout,
581 | )
582 | self.dec = GeneratorNSF(
583 | inter_channels,
584 | resblock,
585 | resblock_kernel_sizes,
586 | resblock_dilation_sizes,
587 | upsample_rates,
588 | upsample_initial_channel,
589 | upsample_kernel_sizes,
590 | gin_channels=gin_channels,
591 | sr=sr,
592 | is_half=kwargs["is_half"],
593 | )
594 | self.enc_q = PosteriorEncoder(
595 | spec_channels,
596 | inter_channels,
597 | hidden_channels,
598 | 5,
599 | 1,
600 | 16,
601 | gin_channels=gin_channels,
602 | )
603 | self.flow = ResidualCouplingBlock(
604 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
605 | )
606 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
607 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
608 |
609 | def remove_weight_norm(self):
610 | self.dec.remove_weight_norm()
611 | self.flow.remove_weight_norm()
612 | self.enc_q.remove_weight_norm()
613 |
614 | def forward(
615 | self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
616 |     ):  # here ds is the speaker id, shape [bs, 1]
617 |         # print(1,pitch.shape)#[bs,t]
618 |         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis and broadcasts over t
619 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
620 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
621 | z_p = self.flow(z, y_mask, g=g)
622 | z_slice, ids_slice = commons.rand_slice_segments(
623 | z, y_lengths, self.segment_size
624 | )
625 | # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
626 | pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
627 | # print(-2,pitchf.shape,z_slice.shape)
628 | o = self.dec(z_slice, pitchf, g=g)
629 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
630 |
631 | def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
632 | g = self.emb_g(sid).unsqueeze(-1)
633 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
634 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
635 | z = self.flow(z_p, x_mask, g=g, reverse=True)
636 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
637 | return o, x_mask, (z, z_p, m_p, logs_p)
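# Note on the 0.66666 factor above (informal): inference scales the prior's
# std by roughly 2/3, the usual VITS-style temperature that trades sample
# diversity for stability; training (forward) samples the full posterior.
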
638 | class SynthesizerTrnMs768NSFsid(nn.Module):
639 | def __init__(
640 | self,
641 | spec_channels,
642 | segment_size,
643 | inter_channels,
644 | hidden_channels,
645 | filter_channels,
646 | n_heads,
647 | n_layers,
648 | kernel_size,
649 | p_dropout,
650 | resblock,
651 | resblock_kernel_sizes,
652 | resblock_dilation_sizes,
653 | upsample_rates,
654 | upsample_initial_channel,
655 | upsample_kernel_sizes,
656 | spk_embed_dim,
657 | gin_channels,
658 | sr,
659 | **kwargs
660 | ):
661 | super().__init__()
662 | if type(sr) == type("strr"):
663 | sr = sr2sr[sr]
664 | self.spec_channels = spec_channels
665 | self.inter_channels = inter_channels
666 | self.hidden_channels = hidden_channels
667 | self.filter_channels = filter_channels
668 | self.n_heads = n_heads
669 | self.n_layers = n_layers
670 | self.kernel_size = kernel_size
671 | self.p_dropout = p_dropout
672 | self.resblock = resblock
673 | self.resblock_kernel_sizes = resblock_kernel_sizes
674 | self.resblock_dilation_sizes = resblock_dilation_sizes
675 | self.upsample_rates = upsample_rates
676 | self.upsample_initial_channel = upsample_initial_channel
677 | self.upsample_kernel_sizes = upsample_kernel_sizes
678 | self.segment_size = segment_size
679 | self.gin_channels = gin_channels
680 | # self.hop_length = hop_length#
681 | self.spk_embed_dim = spk_embed_dim
682 | self.enc_p = TextEncoder768(
683 | inter_channels,
684 | hidden_channels,
685 | filter_channels,
686 | n_heads,
687 | n_layers,
688 | kernel_size,
689 | p_dropout,
690 | )
691 | self.dec = GeneratorNSF(
692 | inter_channels,
693 | resblock,
694 | resblock_kernel_sizes,
695 | resblock_dilation_sizes,
696 | upsample_rates,
697 | upsample_initial_channel,
698 | upsample_kernel_sizes,
699 | gin_channels=gin_channels,
700 | sr=sr,
701 | is_half=kwargs["is_half"],
702 | )
703 | self.enc_q = PosteriorEncoder(
704 | spec_channels,
705 | inter_channels,
706 | hidden_channels,
707 | 5,
708 | 1,
709 | 16,
710 | gin_channels=gin_channels,
711 | )
712 | self.flow = ResidualCouplingBlock(
713 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
714 | )
715 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
716 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
717 |
718 | def remove_weight_norm(self):
719 | self.dec.remove_weight_norm()
720 | self.flow.remove_weight_norm()
721 | self.enc_q.remove_weight_norm()
722 |
723 | def forward(
724 | self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
725 |     ):  # ds is the speaker id, shape [bs, 1]
726 | # print(1,pitch.shape)#[bs,t]
727 |         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the final 1 is the time axis, broadcast over t
728 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
729 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
730 | z_p = self.flow(z, y_mask, g=g)
731 | z_slice, ids_slice = commons.rand_slice_segments(
732 | z, y_lengths, self.segment_size
733 | )
734 | # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
735 | pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
736 | # print(-2,pitchf.shape,z_slice.shape)
737 | o = self.dec(z_slice, pitchf, g=g)
738 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
739 |
740 | def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
741 | g = self.emb_g(sid).unsqueeze(-1)
742 | m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
743 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
744 | z = self.flow(z_p, x_mask, g=g, reverse=True)
745 | o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
746 | return o, x_mask, (z, z_p, m_p, logs_p)
747 |
748 |
749 | class SynthesizerTrnMs256NSFsid_nono(nn.Module):
750 | def __init__(
751 | self,
752 | spec_channels,
753 | segment_size,
754 | inter_channels,
755 | hidden_channels,
756 | filter_channels,
757 | n_heads,
758 | n_layers,
759 | kernel_size,
760 | p_dropout,
761 | resblock,
762 | resblock_kernel_sizes,
763 | resblock_dilation_sizes,
764 | upsample_rates,
765 | upsample_initial_channel,
766 | upsample_kernel_sizes,
767 | spk_embed_dim,
768 | gin_channels,
769 | sr=None,
770 | **kwargs
771 | ):
772 | super().__init__()
773 | self.spec_channels = spec_channels
774 | self.inter_channels = inter_channels
775 | self.hidden_channels = hidden_channels
776 | self.filter_channels = filter_channels
777 | self.n_heads = n_heads
778 | self.n_layers = n_layers
779 | self.kernel_size = kernel_size
780 | self.p_dropout = p_dropout
781 | self.resblock = resblock
782 | self.resblock_kernel_sizes = resblock_kernel_sizes
783 | self.resblock_dilation_sizes = resblock_dilation_sizes
784 | self.upsample_rates = upsample_rates
785 | self.upsample_initial_channel = upsample_initial_channel
786 | self.upsample_kernel_sizes = upsample_kernel_sizes
787 | self.segment_size = segment_size
788 | self.gin_channels = gin_channels
789 | # self.hop_length = hop_length#
790 | self.spk_embed_dim = spk_embed_dim
791 | self.enc_p = TextEncoder256(
792 | inter_channels,
793 | hidden_channels,
794 | filter_channels,
795 | n_heads,
796 | n_layers,
797 | kernel_size,
798 | p_dropout,
799 | f0=False,
800 | )
801 | self.dec = Generator(
802 | inter_channels,
803 | resblock,
804 | resblock_kernel_sizes,
805 | resblock_dilation_sizes,
806 | upsample_rates,
807 | upsample_initial_channel,
808 | upsample_kernel_sizes,
809 | gin_channels=gin_channels,
810 | )
811 | self.enc_q = PosteriorEncoder(
812 | spec_channels,
813 | inter_channels,
814 | hidden_channels,
815 | 5,
816 | 1,
817 | 16,
818 | gin_channels=gin_channels,
819 | )
820 | self.flow = ResidualCouplingBlock(
821 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
822 | )
823 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
824 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
825 |
826 | def remove_weight_norm(self):
827 | self.dec.remove_weight_norm()
828 | self.flow.remove_weight_norm()
829 | self.enc_q.remove_weight_norm()
830 |
831 |     def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
832 |         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the final 1 is the time axis, broadcast over t
833 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
834 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
835 | z_p = self.flow(z, y_mask, g=g)
836 | z_slice, ids_slice = commons.rand_slice_segments(
837 | z, y_lengths, self.segment_size
838 | )
839 | o = self.dec(z_slice, g=g)
840 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
841 |
842 | def infer(self, phone, phone_lengths, sid, max_len=None):
843 | g = self.emb_g(sid).unsqueeze(-1)
844 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
845 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
846 | z = self.flow(z_p, x_mask, g=g, reverse=True)
847 | o = self.dec((z * x_mask)[:, :, :max_len], g=g)
848 | return o, x_mask, (z, z_p, m_p, logs_p)
849 | class SynthesizerTrnMs768NSFsid_nono(nn.Module):
850 | def __init__(
851 | self,
852 | spec_channels,
853 | segment_size,
854 | inter_channels,
855 | hidden_channels,
856 | filter_channels,
857 | n_heads,
858 | n_layers,
859 | kernel_size,
860 | p_dropout,
861 | resblock,
862 | resblock_kernel_sizes,
863 | resblock_dilation_sizes,
864 | upsample_rates,
865 | upsample_initial_channel,
866 | upsample_kernel_sizes,
867 | spk_embed_dim,
868 | gin_channels,
869 | sr=None,
870 | **kwargs
871 | ):
872 | super().__init__()
873 | self.spec_channels = spec_channels
874 | self.inter_channels = inter_channels
875 | self.hidden_channels = hidden_channels
876 | self.filter_channels = filter_channels
877 | self.n_heads = n_heads
878 | self.n_layers = n_layers
879 | self.kernel_size = kernel_size
880 | self.p_dropout = p_dropout
881 | self.resblock = resblock
882 | self.resblock_kernel_sizes = resblock_kernel_sizes
883 | self.resblock_dilation_sizes = resblock_dilation_sizes
884 | self.upsample_rates = upsample_rates
885 | self.upsample_initial_channel = upsample_initial_channel
886 | self.upsample_kernel_sizes = upsample_kernel_sizes
887 | self.segment_size = segment_size
888 | self.gin_channels = gin_channels
889 | # self.hop_length = hop_length#
890 | self.spk_embed_dim = spk_embed_dim
891 | self.enc_p = TextEncoder768(
892 | inter_channels,
893 | hidden_channels,
894 | filter_channels,
895 | n_heads,
896 | n_layers,
897 | kernel_size,
898 | p_dropout,
899 | f0=False,
900 | )
901 | self.dec = Generator(
902 | inter_channels,
903 | resblock,
904 | resblock_kernel_sizes,
905 | resblock_dilation_sizes,
906 | upsample_rates,
907 | upsample_initial_channel,
908 | upsample_kernel_sizes,
909 | gin_channels=gin_channels,
910 | )
911 | self.enc_q = PosteriorEncoder(
912 | spec_channels,
913 | inter_channels,
914 | hidden_channels,
915 | 5,
916 | 1,
917 | 16,
918 | gin_channels=gin_channels,
919 | )
920 | self.flow = ResidualCouplingBlock(
921 | inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
922 | )
923 | self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
924 | print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
925 |
926 | def remove_weight_norm(self):
927 | self.dec.remove_weight_norm()
928 | self.flow.remove_weight_norm()
929 | self.enc_q.remove_weight_norm()
930 |
931 |     def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
932 |         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the final 1 is the time axis, broadcast over t
933 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
934 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
935 | z_p = self.flow(z, y_mask, g=g)
936 | z_slice, ids_slice = commons.rand_slice_segments(
937 | z, y_lengths, self.segment_size
938 | )
939 | o = self.dec(z_slice, g=g)
940 | return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
941 |
942 | def infer(self, phone, phone_lengths, sid, max_len=None):
943 | g = self.emb_g(sid).unsqueeze(-1)
944 | m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
945 | z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
946 | z = self.flow(z_p, x_mask, g=g, reverse=True)
947 | o = self.dec((z * x_mask)[:, :, :max_len], g=g)
948 | return o, x_mask, (z, z_p, m_p, logs_p)
949 |
950 |
951 | class MultiPeriodDiscriminator(torch.nn.Module):
952 | def __init__(self, use_spectral_norm=False):
953 | super(MultiPeriodDiscriminator, self).__init__()
954 | periods = [2, 3, 5, 7, 11, 17]
955 | # periods = [3, 5, 7, 11, 17, 23, 37]
956 |
957 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
958 | discs = discs + [
959 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
960 | ]
961 | self.discriminators = nn.ModuleList(discs)
962 |
963 | def forward(self, y, y_hat):
964 | y_d_rs = [] #
965 | y_d_gs = []
966 | fmap_rs = []
967 | fmap_gs = []
968 | for i, d in enumerate(self.discriminators):
969 | y_d_r, fmap_r = d(y)
970 | y_d_g, fmap_g = d(y_hat)
971 | # for j in range(len(fmap_r)):
972 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
973 | y_d_rs.append(y_d_r)
974 | y_d_gs.append(y_d_g)
975 | fmap_rs.append(fmap_r)
976 | fmap_gs.append(fmap_g)
977 |
978 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
979 |
980 | class MultiPeriodDiscriminatorV2(torch.nn.Module):
981 | def __init__(self, use_spectral_norm=False):
982 | super(MultiPeriodDiscriminatorV2, self).__init__()
983 | # periods = [2, 3, 5, 7, 11, 17]
984 |         periods = [2, 3, 5, 7, 11, 17, 23, 37]
985 |
986 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
987 | discs = discs + [
988 | DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
989 | ]
990 | self.discriminators = nn.ModuleList(discs)
991 |
992 | def forward(self, y, y_hat):
993 | y_d_rs = [] #
994 | y_d_gs = []
995 | fmap_rs = []
996 | fmap_gs = []
997 | for i, d in enumerate(self.discriminators):
998 | y_d_r, fmap_r = d(y)
999 | y_d_g, fmap_g = d(y_hat)
1000 | # for j in range(len(fmap_r)):
1001 | # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1002 | y_d_rs.append(y_d_r)
1003 | y_d_gs.append(y_d_g)
1004 | fmap_rs.append(fmap_r)
1005 | fmap_gs.append(fmap_g)
1006 |
1007 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1008 |
1009 |
1010 | class DiscriminatorS(torch.nn.Module):
1011 | def __init__(self, use_spectral_norm=False):
1012 | super(DiscriminatorS, self).__init__()
1013 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
1014 | self.convs = nn.ModuleList(
1015 | [
1016 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1017 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1018 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1019 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1020 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1021 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1022 | ]
1023 | )
1024 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1025 |
1026 | def forward(self, x):
1027 | fmap = []
1028 |
1029 | for l in self.convs:
1030 | x = l(x)
1031 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
1032 | fmap.append(x)
1033 | x = self.conv_post(x)
1034 | fmap.append(x)
1035 | x = torch.flatten(x, 1, -1)
1036 |
1037 | return x, fmap
1038 |
1039 |
1040 | class DiscriminatorP(torch.nn.Module):
1041 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1042 | super(DiscriminatorP, self).__init__()
1043 | self.period = period
1044 | self.use_spectral_norm = use_spectral_norm
1045 |         norm_f = spectral_norm if use_spectral_norm else weight_norm
1046 | self.convs = nn.ModuleList(
1047 | [
1048 | norm_f(
1049 | Conv2d(
1050 | 1,
1051 | 32,
1052 | (kernel_size, 1),
1053 | (stride, 1),
1054 | padding=(get_padding(kernel_size, 1), 0),
1055 | )
1056 | ),
1057 | norm_f(
1058 | Conv2d(
1059 | 32,
1060 | 128,
1061 | (kernel_size, 1),
1062 | (stride, 1),
1063 | padding=(get_padding(kernel_size, 1), 0),
1064 | )
1065 | ),
1066 | norm_f(
1067 | Conv2d(
1068 | 128,
1069 | 512,
1070 | (kernel_size, 1),
1071 | (stride, 1),
1072 | padding=(get_padding(kernel_size, 1), 0),
1073 | )
1074 | ),
1075 | norm_f(
1076 | Conv2d(
1077 | 512,
1078 | 1024,
1079 | (kernel_size, 1),
1080 | (stride, 1),
1081 | padding=(get_padding(kernel_size, 1), 0),
1082 | )
1083 | ),
1084 | norm_f(
1085 | Conv2d(
1086 | 1024,
1087 | 1024,
1088 | (kernel_size, 1),
1089 | 1,
1090 | padding=(get_padding(kernel_size, 1), 0),
1091 | )
1092 | ),
1093 | ]
1094 | )
1095 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1096 |
1097 | def forward(self, x):
1098 | fmap = []
1099 |
1100 | # 1d to 2d
1101 | b, c, t = x.shape
1102 | if t % self.period != 0: # pad first
1103 | n_pad = self.period - (t % self.period)
1104 | x = F.pad(x, (0, n_pad), "reflect")
1105 | t = t + n_pad
1106 | x = x.view(b, c, t // self.period, self.period)
1107 |
1108 | for l in self.convs:
1109 | x = l(x)
1110 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
1111 | fmap.append(x)
1112 | x = self.conv_post(x)
1113 | fmap.append(x)
1114 | x = torch.flatten(x, 1, -1)
1115 |
1116 | return x, fmap
1117 |
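1118 |
1119 | # ---------------------------------------------------------------------
1120 | # Usage sketch (added for illustration; not part of the original file).
1121 | # Builds the no-F0 v1 synthesizer with made-up hyperparameters — the real
1122 | # values come from a checkpoint's cpt["config"] — and runs inference on
1123 | # random HuBERT-sized features. Run from the repo root so the infer_pack
1124 | # imports resolve.
1125 | if __name__ == "__main__":
1126 |     net_g = SynthesizerTrnMs256NSFsid_nono(
1127 |         spec_channels=1025, segment_size=32, inter_channels=192,
1128 |         hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6,
1129 |         kernel_size=3, p_dropout=0, resblock="1",
1130 |         resblock_kernel_sizes=[3, 7, 11],
1131 |         resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
1132 |         upsample_rates=[10, 10, 2, 2], upsample_initial_channel=512,
1133 |         upsample_kernel_sizes=[16, 16, 4, 4], spk_embed_dim=109,
1134 |         gin_channels=256,
1135 |     ).eval()
1136 |     phone = torch.randn(1, 50, 256)   # [batch, frames, 256] HuBERT-like features
1137 |     phone_lengths = torch.LongTensor([50])
1138 |     sid = torch.LongTensor([0])       # speaker id
1139 |     with torch.no_grad():
1140 |         audio, _, _ = net_g.infer(phone, phone_lengths, sid)
1141 |     print(audio.shape)  # [1, 1, 50 * prod(upsample_rates)] = [1, 1, 20000]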
--------------------------------------------------------------------------------
/infer_pack/modules.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import scipy
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 |
9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10 | from torch.nn.utils import weight_norm, remove_weight_norm
11 |
12 | from infer_pack import commons
13 | from infer_pack.commons import init_weights, get_padding
14 | from infer_pack.transforms import piecewise_rational_quadratic_transform
15 |
16 |
17 | LRELU_SLOPE = 0.1
18 |
19 |
20 | class LayerNorm(nn.Module):
21 | def __init__(self, channels, eps=1e-5):
22 | super().__init__()
23 | self.channels = channels
24 | self.eps = eps
25 |
26 | self.gamma = nn.Parameter(torch.ones(channels))
27 | self.beta = nn.Parameter(torch.zeros(channels))
28 |
29 | def forward(self, x):
30 | x = x.transpose(1, -1)
31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32 | return x.transpose(1, -1)
33 |
34 |
35 | class ConvReluNorm(nn.Module):
36 | def __init__(
37 | self,
38 | in_channels,
39 | hidden_channels,
40 | out_channels,
41 | kernel_size,
42 | n_layers,
43 | p_dropout,
44 | ):
45 | super().__init__()
46 | self.in_channels = in_channels
47 | self.hidden_channels = hidden_channels
48 | self.out_channels = out_channels
49 | self.kernel_size = kernel_size
50 | self.n_layers = n_layers
51 | self.p_dropout = p_dropout
52 |         assert n_layers > 1, "Number of layers should be larger than 1."
53 |
54 | self.conv_layers = nn.ModuleList()
55 | self.norm_layers = nn.ModuleList()
56 | self.conv_layers.append(
57 | nn.Conv1d(
58 | in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
59 | )
60 | )
61 | self.norm_layers.append(LayerNorm(hidden_channels))
62 | self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
63 | for _ in range(n_layers - 1):
64 | self.conv_layers.append(
65 | nn.Conv1d(
66 | hidden_channels,
67 | hidden_channels,
68 | kernel_size,
69 | padding=kernel_size // 2,
70 | )
71 | )
72 | self.norm_layers.append(LayerNorm(hidden_channels))
73 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
74 | self.proj.weight.data.zero_()
75 | self.proj.bias.data.zero_()
76 |
77 | def forward(self, x, x_mask):
78 | x_org = x
79 | for i in range(self.n_layers):
80 | x = self.conv_layers[i](x * x_mask)
81 | x = self.norm_layers[i](x)
82 | x = self.relu_drop(x)
83 | x = x_org + self.proj(x)
84 | return x * x_mask
85 |
86 |
87 | class DDSConv(nn.Module):
88 | """
89 |     Dilated and Depth-Separable Convolution
90 | """
91 |
92 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
93 | super().__init__()
94 | self.channels = channels
95 | self.kernel_size = kernel_size
96 | self.n_layers = n_layers
97 | self.p_dropout = p_dropout
98 |
99 | self.drop = nn.Dropout(p_dropout)
100 | self.convs_sep = nn.ModuleList()
101 | self.convs_1x1 = nn.ModuleList()
102 | self.norms_1 = nn.ModuleList()
103 | self.norms_2 = nn.ModuleList()
104 | for i in range(n_layers):
105 | dilation = kernel_size**i
106 | padding = (kernel_size * dilation - dilation) // 2
107 | self.convs_sep.append(
108 | nn.Conv1d(
109 | channels,
110 | channels,
111 | kernel_size,
112 | groups=channels,
113 | dilation=dilation,
114 | padding=padding,
115 | )
116 | )
117 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
118 | self.norms_1.append(LayerNorm(channels))
119 | self.norms_2.append(LayerNorm(channels))
120 |
121 | def forward(self, x, x_mask, g=None):
122 | if g is not None:
123 | x = x + g
124 | for i in range(self.n_layers):
125 | y = self.convs_sep[i](x * x_mask)
126 | y = self.norms_1[i](y)
127 | y = F.gelu(y)
128 | y = self.convs_1x1[i](y)
129 | y = self.norms_2[i](y)
130 | y = F.gelu(y)
131 | y = self.drop(y)
132 | x = x + y
133 | return x * x_mask
134 |
135 |
136 | class WN(torch.nn.Module):
137 | def __init__(
138 | self,
139 | hidden_channels,
140 | kernel_size,
141 | dilation_rate,
142 | n_layers,
143 | gin_channels=0,
144 | p_dropout=0,
145 | ):
146 | super(WN, self).__init__()
147 | assert kernel_size % 2 == 1
148 | self.hidden_channels = hidden_channels
149 |         self.kernel_size = kernel_size
150 | self.dilation_rate = dilation_rate
151 | self.n_layers = n_layers
152 | self.gin_channels = gin_channels
153 | self.p_dropout = p_dropout
154 |
155 | self.in_layers = torch.nn.ModuleList()
156 | self.res_skip_layers = torch.nn.ModuleList()
157 | self.drop = nn.Dropout(p_dropout)
158 |
159 | if gin_channels != 0:
160 | cond_layer = torch.nn.Conv1d(
161 | gin_channels, 2 * hidden_channels * n_layers, 1
162 | )
163 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
164 |
165 | for i in range(n_layers):
166 | dilation = dilation_rate**i
167 | padding = int((kernel_size * dilation - dilation) / 2)
168 | in_layer = torch.nn.Conv1d(
169 | hidden_channels,
170 | 2 * hidden_channels,
171 | kernel_size,
172 | dilation=dilation,
173 | padding=padding,
174 | )
175 | in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
176 | self.in_layers.append(in_layer)
177 |
178 | # last one is not necessary
179 | if i < n_layers - 1:
180 | res_skip_channels = 2 * hidden_channels
181 | else:
182 | res_skip_channels = hidden_channels
183 |
184 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
185 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
186 | self.res_skip_layers.append(res_skip_layer)
187 |
188 | def forward(self, x, x_mask, g=None, **kwargs):
189 | output = torch.zeros_like(x)
190 | n_channels_tensor = torch.IntTensor([self.hidden_channels])
191 |
192 | if g is not None:
193 | g = self.cond_layer(g)
194 |
195 | for i in range(self.n_layers):
196 | x_in = self.in_layers[i](x)
197 | if g is not None:
198 | cond_offset = i * 2 * self.hidden_channels
199 | g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
200 | else:
201 | g_l = torch.zeros_like(x_in)
202 |
203 | acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
204 | acts = self.drop(acts)
205 |
206 | res_skip_acts = self.res_skip_layers[i](acts)
207 | if i < self.n_layers - 1:
208 | res_acts = res_skip_acts[:, : self.hidden_channels, :]
209 | x = (x + res_acts) * x_mask
210 | output = output + res_skip_acts[:, self.hidden_channels :, :]
211 | else:
212 | output = output + res_skip_acts
213 | return output * x_mask
214 |
215 | def remove_weight_norm(self):
216 | if self.gin_channels != 0:
217 | torch.nn.utils.remove_weight_norm(self.cond_layer)
218 | for l in self.in_layers:
219 | torch.nn.utils.remove_weight_norm(l)
220 | for l in self.res_skip_layers:
221 | torch.nn.utils.remove_weight_norm(l)
222 |
223 |
224 | class ResBlock1(torch.nn.Module):
225 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
226 | super(ResBlock1, self).__init__()
227 | self.convs1 = nn.ModuleList(
228 | [
229 | weight_norm(
230 | Conv1d(
231 | channels,
232 | channels,
233 | kernel_size,
234 | 1,
235 | dilation=dilation[0],
236 | padding=get_padding(kernel_size, dilation[0]),
237 | )
238 | ),
239 | weight_norm(
240 | Conv1d(
241 | channels,
242 | channels,
243 | kernel_size,
244 | 1,
245 | dilation=dilation[1],
246 | padding=get_padding(kernel_size, dilation[1]),
247 | )
248 | ),
249 | weight_norm(
250 | Conv1d(
251 | channels,
252 | channels,
253 | kernel_size,
254 | 1,
255 | dilation=dilation[2],
256 | padding=get_padding(kernel_size, dilation[2]),
257 | )
258 | ),
259 | ]
260 | )
261 | self.convs1.apply(init_weights)
262 |
263 | self.convs2 = nn.ModuleList(
264 | [
265 | weight_norm(
266 | Conv1d(
267 | channels,
268 | channels,
269 | kernel_size,
270 | 1,
271 | dilation=1,
272 | padding=get_padding(kernel_size, 1),
273 | )
274 | ),
275 | weight_norm(
276 | Conv1d(
277 | channels,
278 | channels,
279 | kernel_size,
280 | 1,
281 | dilation=1,
282 | padding=get_padding(kernel_size, 1),
283 | )
284 | ),
285 | weight_norm(
286 | Conv1d(
287 | channels,
288 | channels,
289 | kernel_size,
290 | 1,
291 | dilation=1,
292 | padding=get_padding(kernel_size, 1),
293 | )
294 | ),
295 | ]
296 | )
297 | self.convs2.apply(init_weights)
298 |
299 | def forward(self, x, x_mask=None):
300 | for c1, c2 in zip(self.convs1, self.convs2):
301 | xt = F.leaky_relu(x, LRELU_SLOPE)
302 | if x_mask is not None:
303 | xt = xt * x_mask
304 | xt = c1(xt)
305 | xt = F.leaky_relu(xt, LRELU_SLOPE)
306 | if x_mask is not None:
307 | xt = xt * x_mask
308 | xt = c2(xt)
309 | x = xt + x
310 | if x_mask is not None:
311 | x = x * x_mask
312 | return x
313 |
314 | def remove_weight_norm(self):
315 | for l in self.convs1:
316 | remove_weight_norm(l)
317 | for l in self.convs2:
318 | remove_weight_norm(l)
319 |
320 |
321 | class ResBlock2(torch.nn.Module):
322 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
323 | super(ResBlock2, self).__init__()
324 | self.convs = nn.ModuleList(
325 | [
326 | weight_norm(
327 | Conv1d(
328 | channels,
329 | channels,
330 | kernel_size,
331 | 1,
332 | dilation=dilation[0],
333 | padding=get_padding(kernel_size, dilation[0]),
334 | )
335 | ),
336 | weight_norm(
337 | Conv1d(
338 | channels,
339 | channels,
340 | kernel_size,
341 | 1,
342 | dilation=dilation[1],
343 | padding=get_padding(kernel_size, dilation[1]),
344 | )
345 | ),
346 | ]
347 | )
348 | self.convs.apply(init_weights)
349 |
350 | def forward(self, x, x_mask=None):
351 | for c in self.convs:
352 | xt = F.leaky_relu(x, LRELU_SLOPE)
353 | if x_mask is not None:
354 | xt = xt * x_mask
355 | xt = c(xt)
356 | x = xt + x
357 | if x_mask is not None:
358 | x = x * x_mask
359 | return x
360 |
361 | def remove_weight_norm(self):
362 | for l in self.convs:
363 | remove_weight_norm(l)
364 |
365 |
366 | class Log(nn.Module):
367 | def forward(self, x, x_mask, reverse=False, **kwargs):
368 | if not reverse:
369 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
370 | logdet = torch.sum(-y, [1, 2])
371 | return y, logdet
372 | else:
373 | x = torch.exp(x) * x_mask
374 | return x
375 |
376 |
377 | class Flip(nn.Module):
378 | def forward(self, x, *args, reverse=False, **kwargs):
379 | x = torch.flip(x, [1])
380 | if not reverse:
381 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
382 | return x, logdet
383 | else:
384 | return x
385 |
386 |
387 | class ElementwiseAffine(nn.Module):
388 | def __init__(self, channels):
389 | super().__init__()
390 | self.channels = channels
391 | self.m = nn.Parameter(torch.zeros(channels, 1))
392 | self.logs = nn.Parameter(torch.zeros(channels, 1))
393 |
394 | def forward(self, x, x_mask, reverse=False, **kwargs):
395 | if not reverse:
396 | y = self.m + torch.exp(self.logs) * x
397 | y = y * x_mask
398 | logdet = torch.sum(self.logs * x_mask, [1, 2])
399 | return y, logdet
400 | else:
401 | x = (x - self.m) * torch.exp(-self.logs) * x_mask
402 | return x
403 |
404 |
405 | class ResidualCouplingLayer(nn.Module):
406 | def __init__(
407 | self,
408 | channels,
409 | hidden_channels,
410 | kernel_size,
411 | dilation_rate,
412 | n_layers,
413 | p_dropout=0,
414 | gin_channels=0,
415 | mean_only=False,
416 | ):
417 | assert channels % 2 == 0, "channels should be divisible by 2"
418 | super().__init__()
419 | self.channels = channels
420 | self.hidden_channels = hidden_channels
421 | self.kernel_size = kernel_size
422 | self.dilation_rate = dilation_rate
423 | self.n_layers = n_layers
424 | self.half_channels = channels // 2
425 | self.mean_only = mean_only
426 |
427 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
428 | self.enc = WN(
429 | hidden_channels,
430 | kernel_size,
431 | dilation_rate,
432 | n_layers,
433 | p_dropout=p_dropout,
434 | gin_channels=gin_channels,
435 | )
436 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
437 | self.post.weight.data.zero_()
438 | self.post.bias.data.zero_()
439 |
440 | def forward(self, x, x_mask, g=None, reverse=False):
441 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
442 | h = self.pre(x0) * x_mask
443 | h = self.enc(h, x_mask, g=g)
444 | stats = self.post(h) * x_mask
445 | if not self.mean_only:
446 | m, logs = torch.split(stats, [self.half_channels] * 2, 1)
447 | else:
448 | m = stats
449 | logs = torch.zeros_like(m)
450 |
451 | if not reverse:
452 | x1 = m + x1 * torch.exp(logs) * x_mask
453 | x = torch.cat([x0, x1], 1)
454 | logdet = torch.sum(logs, [1, 2])
455 | return x, logdet
456 | else:
457 | x1 = (x1 - m) * torch.exp(-logs) * x_mask
458 | x = torch.cat([x0, x1], 1)
459 | return x
460 |
461 | def remove_weight_norm(self):
462 | self.enc.remove_weight_norm()
463 |
464 |
465 | class ConvFlow(nn.Module):
466 | def __init__(
467 | self,
468 | in_channels,
469 | filter_channels,
470 | kernel_size,
471 | n_layers,
472 | num_bins=10,
473 | tail_bound=5.0,
474 | ):
475 | super().__init__()
476 | self.in_channels = in_channels
477 | self.filter_channels = filter_channels
478 | self.kernel_size = kernel_size
479 | self.n_layers = n_layers
480 | self.num_bins = num_bins
481 | self.tail_bound = tail_bound
482 | self.half_channels = in_channels // 2
483 |
484 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
485 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
486 | self.proj = nn.Conv1d(
487 | filter_channels, self.half_channels * (num_bins * 3 - 1), 1
488 | )
489 | self.proj.weight.data.zero_()
490 | self.proj.bias.data.zero_()
491 |
492 | def forward(self, x, x_mask, g=None, reverse=False):
493 | x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
494 | h = self.pre(x0)
495 | h = self.convs(h, x_mask, g=g)
496 | h = self.proj(h) * x_mask
497 |
498 | b, c, t = x0.shape
499 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
500 |
501 | unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
502 | unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
503 | self.filter_channels
504 | )
505 | unnormalized_derivatives = h[..., 2 * self.num_bins :]
506 |
507 | x1, logabsdet = piecewise_rational_quadratic_transform(
508 | x1,
509 | unnormalized_widths,
510 | unnormalized_heights,
511 | unnormalized_derivatives,
512 | inverse=reverse,
513 | tails="linear",
514 | tail_bound=self.tail_bound,
515 | )
516 |
517 | x = torch.cat([x0, x1], 1) * x_mask
518 | logdet = torch.sum(logabsdet * x_mask, [1, 2])
519 | if not reverse:
520 | return x, logdet
521 | else:
522 | return x
523 |
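524 |
525 | # ---------------------------------------------------------------------
526 | # Usage sketch (added for illustration; not part of the original file).
527 | # ResidualCouplingLayer is an invertible flow step: applying it forward
528 | # and then with reverse=True should reconstruct the input. Shapes and
529 | # hyperparameters below are arbitrary demo assumptions.
530 | if __name__ == "__main__":
531 |     layer = ResidualCouplingLayer(
532 |         channels=192, hidden_channels=192, kernel_size=5,
533 |         dilation_rate=1, n_layers=3, mean_only=True,
534 |     )
535 |     x = torch.randn(1, 192, 40)
536 |     x_mask = torch.ones(1, 1, 40)
537 |     y, logdet = layer(x, x_mask)                # forward: returns (y, logdet)
538 |     x_rec = layer(y, x_mask, reverse=True)      # inverse: returns x only
539 |     print(torch.allclose(x, x_rec, atol=1e-5))  # expected: True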
--------------------------------------------------------------------------------
/infer_pack/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(
13 | inputs,
14 | unnormalized_widths,
15 | unnormalized_heights,
16 | unnormalized_derivatives,
17 | inverse=False,
18 | tails=None,
19 | tail_bound=1.0,
20 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22 | min_derivative=DEFAULT_MIN_DERIVATIVE,
23 | ):
24 | if tails is None:
25 | spline_fn = rational_quadratic_spline
26 | spline_kwargs = {}
27 | else:
28 | spline_fn = unconstrained_rational_quadratic_spline
29 | spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30 |
31 | outputs, logabsdet = spline_fn(
32 | inputs=inputs,
33 | unnormalized_widths=unnormalized_widths,
34 | unnormalized_heights=unnormalized_heights,
35 | unnormalized_derivatives=unnormalized_derivatives,
36 | inverse=inverse,
37 | min_bin_width=min_bin_width,
38 | min_bin_height=min_bin_height,
39 | min_derivative=min_derivative,
40 | **spline_kwargs
41 | )
42 | return outputs, logabsdet
43 |
44 |
45 | def searchsorted(bin_locations, inputs, eps=1e-6):
46 | bin_locations[..., -1] += eps
47 | return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48 |
49 |
50 | def unconstrained_rational_quadratic_spline(
51 | inputs,
52 | unnormalized_widths,
53 | unnormalized_heights,
54 | unnormalized_derivatives,
55 | inverse=False,
56 | tails="linear",
57 | tail_bound=1.0,
58 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60 | min_derivative=DEFAULT_MIN_DERIVATIVE,
61 | ):
62 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63 | outside_interval_mask = ~inside_interval_mask
64 |
65 | outputs = torch.zeros_like(inputs)
66 | logabsdet = torch.zeros_like(inputs)
67 |
68 | if tails == "linear":
69 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70 | constant = np.log(np.exp(1 - min_derivative) - 1)
71 | unnormalized_derivatives[..., 0] = constant
72 | unnormalized_derivatives[..., -1] = constant
73 |
74 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
75 | logabsdet[outside_interval_mask] = 0
76 | else:
77 | raise RuntimeError("{} tails are not implemented.".format(tails))
78 |
79 | (
80 | outputs[inside_interval_mask],
81 | logabsdet[inside_interval_mask],
82 | ) = rational_quadratic_spline(
83 | inputs=inputs[inside_interval_mask],
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 | inverse=inverse,
88 | left=-tail_bound,
89 | right=tail_bound,
90 | bottom=-tail_bound,
91 | top=tail_bound,
92 | min_bin_width=min_bin_width,
93 | min_bin_height=min_bin_height,
94 | min_derivative=min_derivative,
95 | )
96 |
97 | return outputs, logabsdet
98 |
99 |
100 | def rational_quadratic_spline(
101 | inputs,
102 | unnormalized_widths,
103 | unnormalized_heights,
104 | unnormalized_derivatives,
105 | inverse=False,
106 | left=0.0,
107 | right=1.0,
108 | bottom=0.0,
109 | top=1.0,
110 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
111 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
112 | min_derivative=DEFAULT_MIN_DERIVATIVE,
113 | ):
114 | if torch.min(inputs) < left or torch.max(inputs) > right:
115 | raise ValueError("Input to a transform is not within its domain")
116 |
117 | num_bins = unnormalized_widths.shape[-1]
118 |
119 | if min_bin_width * num_bins > 1.0:
120 | raise ValueError("Minimal bin width too large for the number of bins")
121 | if min_bin_height * num_bins > 1.0:
122 | raise ValueError("Minimal bin height too large for the number of bins")
123 |
124 | widths = F.softmax(unnormalized_widths, dim=-1)
125 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
126 | cumwidths = torch.cumsum(widths, dim=-1)
127 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
128 | cumwidths = (right - left) * cumwidths + left
129 | cumwidths[..., 0] = left
130 | cumwidths[..., -1] = right
131 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
132 |
133 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
134 |
135 | heights = F.softmax(unnormalized_heights, dim=-1)
136 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
137 | cumheights = torch.cumsum(heights, dim=-1)
138 | cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
139 | cumheights = (top - bottom) * cumheights + bottom
140 | cumheights[..., 0] = bottom
141 | cumheights[..., -1] = top
142 | heights = cumheights[..., 1:] - cumheights[..., :-1]
143 |
144 | if inverse:
145 | bin_idx = searchsorted(cumheights, inputs)[..., None]
146 | else:
147 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
148 |
149 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
150 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
151 |
152 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
153 | delta = heights / widths
154 | input_delta = delta.gather(-1, bin_idx)[..., 0]
155 |
156 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
157 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
158 |
159 | input_heights = heights.gather(-1, bin_idx)[..., 0]
160 |
161 | if inverse:
162 | a = (inputs - input_cumheights) * (
163 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
164 | ) + input_heights * (input_delta - input_derivatives)
165 | b = input_heights * input_derivatives - (inputs - input_cumheights) * (
166 | input_derivatives + input_derivatives_plus_one - 2 * input_delta
167 | )
168 | c = -input_delta * (inputs - input_cumheights)
169 |
170 | discriminant = b.pow(2) - 4 * a * c
171 | assert (discriminant >= 0).all()
172 |
173 | root = (2 * c) / (-b - torch.sqrt(discriminant))
174 | outputs = root * input_bin_widths + input_cumwidths
175 |
176 | theta_one_minus_theta = root * (1 - root)
177 | denominator = input_delta + (
178 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
179 | * theta_one_minus_theta
180 | )
181 | derivative_numerator = input_delta.pow(2) * (
182 | input_derivatives_plus_one * root.pow(2)
183 | + 2 * input_delta * theta_one_minus_theta
184 | + input_derivatives * (1 - root).pow(2)
185 | )
186 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
187 |
188 | return outputs, -logabsdet
189 | else:
190 | theta = (inputs - input_cumwidths) / input_bin_widths
191 | theta_one_minus_theta = theta * (1 - theta)
192 |
193 | numerator = input_heights * (
194 | input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
195 | )
196 | denominator = input_delta + (
197 | (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
198 | * theta_one_minus_theta
199 | )
200 | outputs = input_cumheights + numerator / denominator
201 |
202 | derivative_numerator = input_delta.pow(2) * (
203 | input_derivatives_plus_one * theta.pow(2)
204 | + 2 * input_delta * theta_one_minus_theta
205 | + input_derivatives * (1 - theta).pow(2)
206 | )
207 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
208 |
209 | return outputs, logabsdet
210 |
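211 |
212 | # ---------------------------------------------------------------------
213 | # Usage sketch (added for illustration; not part of the original file).
214 | # The piecewise rational-quadratic spline is a monotonic bijection, so
215 | # running it forward and then with inverse=True should give back the
216 | # original inputs. Shapes below are arbitrary demo assumptions; note that
217 | # "linear" tails take num_bins - 1 unnormalized derivatives (padded to
218 | # num_bins + 1 internally).
219 | if __name__ == "__main__":
220 |     num_bins = 10
221 |     inputs = torch.randn(4, 8)
222 |     w = torch.randn(4, 8, num_bins)
223 |     h = torch.randn(4, 8, num_bins)
224 |     d = torch.randn(4, 8, num_bins - 1)
225 |     y, logdet = piecewise_rational_quadratic_transform(
226 |         inputs, w, h, d, inverse=False, tails="linear", tail_bound=5.0
227 |     )
228 |     x_rec, inv_logdet = piecewise_rational_quadratic_transform(
229 |         y, w, h, d, inverse=True, tails="linear", tail_bound=5.0
230 |     )
231 |     print(torch.allclose(inputs, x_rec, atol=1e-4))        # expected: True
232 |     print(torch.allclose(logdet, -inv_logdet, atol=1e-4))  # log-dets negate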
--------------------------------------------------------------------------------
/my_utils.py:
--------------------------------------------------------------------------------
1 | import ffmpeg
2 | import numpy as np
3 |
4 |
5 | def load_audio(file, sr):
6 | try:
7 | # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
8 | # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
9 | # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10 | file = (
11 | file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12 |         )  # guard against copied paths that carry stray spaces, quotes, or newlines
13 | out, _ = (
14 | ffmpeg.input(file, threads=0)
15 | .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17 | )
18 | except Exception as e:
19 | raise RuntimeError(f"Failed to load audio: {e}")
20 |
21 | return np.frombuffer(out, np.float32).flatten()
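22 |
23 |
24 | # ---------------------------------------------------------------------
25 | # Usage sketch (added for illustration; not part of the original file).
26 | # load_audio shells out to the ffmpeg CLI, so ffmpeg must be on PATH;
27 | # "input.wav" is a placeholder path. RVC feeds HuBERT at 16 kHz, hence
28 | # sr=16000 here.
29 | if __name__ == "__main__":
30 |     audio = load_audio("input.wav", 16000)
31 |     print(audio.shape, audio.dtype)  # (n_samples,) float32, mono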
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numba==0.56.4
2 | numpy==1.23.5
3 | scipy==1.9.3
4 | librosa==0.9.2
5 | llvmlite==0.39.0
6 | fairseq==0.12.2
7 | faiss-cpu==1.7.0; sys_platform == "darwin"
8 | faiss-cpu==1.7.2; sys_platform != "darwin"
9 | gradio
10 | Cython
11 | future>=0.18.3
12 | pydub>=0.25.1
13 | soundfile>=0.12.1
14 | ffmpeg-python>=0.2.0
15 | tensorboardX
16 | functorch>=2.0.0
17 | Jinja2>=3.1.2
18 | json5>=0.9.11
19 | Markdown
20 | matplotlib>=3.7.1
21 | matplotlib-inline>=0.1.6
22 | praat-parselmouth>=0.4.3
23 | Pillow>=9.1.1
24 | pyworld>=0.3.2
25 | resampy>=0.4.2
26 | scikit-learn>=1.2.2
27 | starlette>=0.26.1
28 | tensorboard
29 | tensorboard-data-server
30 | tensorboard-plugin-wit
31 | torchgen>=0.0.1
32 | tqdm>=4.65.0
33 | tornado>=6.2
34 | Werkzeug>=2.2.3
35 | uc-micro-py>=1.0.1
36 | sympy>=1.11.1
37 | tabulate>=0.9.0
38 | PyYAML>=6.0
39 | pyasn1>=0.4.8
40 | pyasn1-modules>=0.2.8
41 | fsspec>=2023.3.0
42 | absl-py>=1.4.0
43 | audioread
44 | uvicorn>=0.21.1
45 | colorama>=0.4.6
46 | customtkinter
47 | torchcrepe
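48 |
49 | # Install with: pip install -r requirements.txt
50 | # Note: torch itself is not pinned here; it is assumed to be installed
51 | # separately (e.g. a CUDA build matching your GPU), since the GUI imports it.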
--------------------------------------------------------------------------------
/rvcgui.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 | from tkinter import filedialog
4 | import soundfile as sf
5 | import tkinter as tk
6 | import customtkinter as ctk
7 |
8 | import os
9 | import sys
10 | import torch
11 | import warnings
12 |
13 |
14 | now_dir = os.getcwd()
15 | sys.path.append(now_dir)
16 | tmp = os.path.join(now_dir, "TEMP")
17 | os.makedirs(os.path.join(now_dir, "models"), exist_ok=True)
18 | os.makedirs(os.path.join(now_dir, "output"), exist_ok=True)
19 | os.environ["TEMP"] = tmp
20 | warnings.filterwarnings("ignore")
21 | torch.manual_seed(114514)
22 |
23 | from vc_infer_pipeline import VC
24 | from fairseq import checkpoint_utils
25 | from scipy.io import wavfile
26 | from my_utils import load_audio
27 | from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
28 | from infer_pack.modelsv2 import SynthesizerTrnMs768NSFsid_nono, SynthesizerTrnMs768NSFsid
29 | from multiprocessing import cpu_count
30 | import threading
31 | from time import sleep
32 |
33 | import traceback
34 | import numpy as np
35 | import subprocess
36 | import zipfile
37 | from config import Config
38 |
39 | config = Config()
40 |
41 |
42 |
43 | def extract_model_from_zip(zip_path, output_dir):
44 | # Extract the folder name from the zip file path
45 | folder_name = os.path.splitext(os.path.basename(zip_path))[0]
46 |
47 | # Create a folder with the same name as the zip file inside the output directory
48 | output_folder = os.path.join(output_dir, folder_name)
49 | os.makedirs(output_folder, exist_ok=True)
50 |
51 | with zipfile.ZipFile(zip_path, 'r') as zip_ref:
52 | for member in zip_ref.namelist():
53 |             if (member.endswith('.pth') and not (os.path.basename(member).startswith("G_") or os.path.basename(member).startswith("D_")) and zip_ref.getinfo(member).file_size < 200*(1024**2)) or (member.endswith('.index') and not (os.path.basename(member).startswith("trained"))):  # keep small final .pth weights (not G_/D_ training checkpoints) and non-"trained" .index files
54 | # Extract the file to the output folder
55 | zip_ref.extract(member, output_folder)
56 |
57 | # Move the file to the top level of the output folder
58 | file_path = os.path.join(output_folder, member)
59 | new_path = os.path.join(output_folder, os.path.basename(file_path))
60 | os.rename(file_path, new_path)
61 |
62 | print(f"Model files extracted to folder: {output_folder}")
63 |
64 |
65 | def play_audio(file_path):
66 | if sys.platform == 'win32':
67 | audio_file = os.path.abspath(file_path)
68 | subprocess.call(['start', '', audio_file], shell=True)
69 | elif sys.platform == 'darwin':
70 |         audio_file = os.path.abspath(file_path)
71 | subprocess.call(['open', audio_file])
72 | elif sys.platform == 'linux':
73 |         audio_file = os.path.abspath(file_path)
74 | subprocess.call(['xdg-open', audio_file])
75 |
76 | def get_full_path(path):
77 | return os.path.abspath(path)
78 |
79 | hubert_model = None
80 | device = config.device
81 | print(device)
82 | is_half = config.is_half
83 |
84 | def load_hubert():
85 | global hubert_model
86 | models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
87 | ["hubert_base.pt"],
88 | suffix="",
89 | )
90 | hubert_model = models[0]
91 | hubert_model = hubert_model.to(config.device)
92 | if is_half:
93 | hubert_model = hubert_model.half()
94 | else:
95 | hubert_model = hubert_model.float()
96 | hubert_model.eval()
97 |
98 |
99 | def vc_single(
100 | sid,
101 | input_audio,
102 | f0_up_key,
103 | f0_file,
104 | f0_method,
105 | file_index,
106 | index_rate,
107 | crepe_hop_length,
108 | output_path=None,
109 | ): # spk_item, input_audio0, vc_transform0,f0_file,f0method0
110 | global tgt_sr, net_g, vc, hubert_model
111 | if input_audio is None:
112 | return "You need to upload an audio", None
113 | f0_up_key = int(f0_up_key)
114 | try:
115 | audio = load_audio(input_audio, 16000)
116 | times = [0, 0, 0]
117 |         if hubert_model is None:
118 | load_hubert()
119 | if_f0 = cpt.get("f0", 1)
120 | file_index = (
121 | file_index.strip(" ")
122 | .strip('"')
123 | .strip("\n")
124 | .strip('"')
125 | .strip(" ")
126 | .replace("trained", "added")
127 |             )  # auto-correct a common user mistake: the "trained" index should be the "added" one
128 |
129 | audio_opt = vc.pipeline(
130 | hubert_model,
131 | net_g,
132 | sid,
133 | audio,
134 | times,
135 | f0_up_key,
136 | f0_method,
137 | file_index,
138 | # file_big_npy,
139 | index_rate,
140 | if_f0,
141 | version,
142 | crepe_hop_length,
143 | None,
144 | )
145 | print(
146 | "npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep=""
147 | )
148 |
149 | if output_path is not None:
150 | sf.write(output_path, audio_opt, tgt_sr, format='WAV')
151 |
152 | return "Success", (tgt_sr, audio_opt)
153 | except:
154 | info = traceback.format_exc()
155 | print(info)
156 | return info, (None, None)
157 |
158 |
159 | def vc_multi(
160 | sid,
161 | dir_path,
162 | opt_root,
163 | paths,
164 | f0_up_key,
165 | f0_method,
166 | file_index,
167 | index_rate,
168 | ):
169 | try:
170 | dir_path = (
171 | dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
172 |         )  # guard against copied paths that carry stray spaces, quotes, or newlines
173 | opt_root = opt_root.strip(" ").strip(
174 | '"').strip("\n").strip('"').strip(" ")
175 | os.makedirs(opt_root, exist_ok=True)
176 | try:
177 | if dir_path != "":
178 | paths = [os.path.join(dir_path, name)
179 | for name in os.listdir(dir_path)]
180 | else:
181 | paths = [path.name for path in paths]
182 | except:
183 | traceback.print_exc()
184 | paths = [path.name for path in paths]
185 | infos = []
186 | for path in paths:
187 | info, opt = vc_single(
188 | sid,
189 | path,
190 | f0_up_key,
191 | None,
192 | f0_method,
193 | file_index,
194 | index_rate,
195 |                 128)  # crepe_hop_length is required by vc_single; 128 matches the GUI default (2 * 64)
196 | if info == "Success":
197 | try:
198 | tgt_sr, audio_opt = opt
199 |                     wavfile.write(
200 |                         "%s/%s" % (opt_root, os.path.basename(path)),
201 |                         tgt_sr, audio_opt,
202 |                     )
203 | except:
204 | info = traceback.format_exc()
205 | infos.append("%s->%s" % (os.path.basename(path), info))
206 | yield "\n".join(infos)
207 | yield "\n".join(infos)
208 | except:
209 | yield traceback.format_exc()
210 |
211 |
212 | # globally, only one voice model can be active per tab
213 | def get_vc(weight_root, sid):
214 | global n_spk, tgt_sr, net_g, vc, cpt, version
215 | if sid == "" or sid == []:
216 | global hubert_model
217 |         if hubert_model is not None:  # polling case: sid may have switched from a loaded model to none
218 |             print("clean_empty_cache")
219 |             del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
220 |             hubert_model = net_g = n_spk = vc = tgt_sr = None
221 |             if torch.cuda.is_available():
222 |                 torch.cuda.empty_cache()
223 |             # the re-instantiate-then-delete dance below is needed to release the cache completely
224 | if_f0 = cpt.get("f0", 1)
225 | version = cpt.get("version", "v1")
226 | if version == "v1":
227 | if if_f0 == 1:
228 | net_g = SynthesizerTrnMs256NSFsid(
229 | *cpt["config"], is_half=config.is_half
230 | )
231 | else:
232 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
233 | elif version == "v2":
234 | if if_f0 == 1:
235 | net_g = SynthesizerTrnMs768NSFsid(
236 | *cpt["config"], is_half=config.is_half
237 | )
238 | else:
239 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
240 | del net_g, cpt
241 | if torch.cuda.is_available():
242 | torch.cuda.empty_cache()
243 | cpt = None
244 | return {"visible": False, "__type__": "update"}
245 |     person = weight_root
246 | print("loading %s" % person)
247 | cpt = torch.load(person, map_location="cpu")
248 | tgt_sr = cpt["config"][-1]
249 | cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
250 | if_f0 = cpt.get("f0", 1)
251 | version = cpt.get("version", "v1")
252 | if version == "v1":
253 | if if_f0 == 1:
254 | net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
255 | else:
256 | net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
257 | elif version == "v2":
258 | if if_f0 == 1:
259 | net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
260 | else:
261 | net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
262 | del net_g.enc_q
263 | print(net_g.load_state_dict(cpt["weight"], strict=False))
264 | net_g.eval().to(config.device)
265 | if config.is_half:
266 | net_g = net_g.half()
267 | else:
268 | net_g = net_g.float()
269 | vc = VC(tgt_sr, config)
270 | n_spk = cpt["config"][-3]
271 | return {"visible": True, "maximum": n_spk, "__type__": "update"}
272 |
273 |
274 | def clean():
275 | return {"value": "", "__type__": "update"}
276 |
277 |
278 | def if_done(done, p):
279 |     while True:
280 |         if p.poll() is None:
281 | sleep(0.5)
282 | else:
283 | break
284 | done[0] = True
285 |
286 |
287 | def if_done_multi(done, ps):
288 |     while True:
289 |         # p.poll() is None means that process has not finished yet
290 |         # keep waiting as long as any process is still running
291 |         flag = 1
292 |         for p in ps:
293 |             if p.poll() is None:
294 | flag = 0
295 | sleep(0.5)
296 | break
297 | if flag == 1:
298 | break
299 | done[0] = True
300 |
301 |
302 | # window
303 |
304 |
305 | def outputkey(length=5):
306 |     # pool of candidate characters
307 |     characters = string.ascii_letters + string.digits
308 |     # pick `length` characters at random and join them into a key
309 |     return ''.join(random.choices(characters, k=length))
310 |
311 | def refresh_model_list():
312 | global model_folders
313 | model_folders = [f for f in os.listdir(models_dir) if os.path.isdir(os.path.join(
314 | models_dir, f)) and any(f.endswith(".pth") for f in os.listdir(os.path.join(models_dir, f)))]
315 | model_list.configure(values=model_folders)
316 | model_list.update()
317 |
318 | def browse_zip():
319 | global zip_file
320 | zip_file = filedialog.askopenfilename(
321 | initialdir=os.getcwd(),
322 | title="Select file",
323 | filetypes=(("zip files", "*.zip"), ("all files", "*.*")),
324 | )
325 | extract_model_from_zip(zip_file, models_dir)
326 | refresh_model_list()
327 |
328 | def get_output_path(file_path):
329 |
330 |     if not os.path.exists(file_path):
331 |         # the path is not taken yet, so return it unchanged
332 |         return file_path
333 |
334 |
335 | # Split file path into directory, base filename, and extension
336 | dir_name, file_name = os.path.split(file_path)
337 | file_name, file_ext = os.path.splitext(file_name)
338 |
339 | # Initialize index to 1
340 | index = 1
341 |
342 | # Increment index until a new file path is found
343 | while True:
344 |         new_dir = os.path.join(dir_name, chosenOne)
345 | new_file_name = f"{file_name}_RVC_{index}{file_ext}"
346 | new_file_path = os.path.join(new_dir, new_file_name)
347 | if not os.path.exists(new_file_path):
348 | # change the file extension to .wav
349 | if not os.path.exists(new_dir):
350 | os.makedirs(new_dir)
351 | new_file_path = os.path.splitext(new_file_path)[0] + ".wav"
352 | return new_file_path # Found new file path, return it
353 | index += 1
354 |
355 | def on_button_click():
356 | output_audio_frame.pack_forget()
357 | result_state.pack_forget()
358 | run_button.configure(state="disabled")
359 |
360 | # Get values from user input widgets
361 | sid = sid_entry.get()
362 | input_audio = input_audio_entry.get()
363 | f0_pitch = round(f0_pitch_entry.get())
364 | crepe_hop_length = round((crepe_hop_length_entry.get()) * 64)
365 | f0_file = None
366 | f0_method = f0_method_entry.get()
367 | file_index = file_index_entry.get()
368 | # file_big_npy = file_big_npy_entry.get()
369 |     index_rate = round(index_rate_entry.get(), 2)
370 | global output_file
371 | output_file = get_output_path(input_audio)
372 |     print("sid: ", sid, "input_audio: ", input_audio, "f0_pitch: ", f0_pitch, "f0_file: ", f0_file, "f0_method: ", f0_method,
373 |           "file_index: ", file_index, "index_rate: ", index_rate, "output_file: ", output_file)
374 | # Call the vc_single function with the user input values
375 |     if model_loaded and os.path.isfile(input_audio):
376 | try:
377 | loading_frame.pack(padx=10, pady=10)
378 | loading_progress.start()
379 |
380 | result, audio_opt = vc_single(
381 |                 0, input_audio, f0_pitch, None, f0_method, file_index, index_rate, crepe_hop_length, output_file)
382 | # output_label.configure(text=result + "\n saved at" + output_file)
383 | print(os.path.join(output_file))
384 | if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
385 | print(output_file)
386 |
387 | run_button.configure(state="enabled")
388 | message = result
389 | result_state.configure(text_color="green")
390 | last_output_file.configure(text=output_file)
391 | output_audio_frame.pack(padx=10, pady=10)
392 | else:
393 | message = result
394 | result_state.configure(text_color="red")
395 |
396 | except Exception as e:
397 | print(e)
398 |             message = f"Voice conversion failed: {e}"
399 |
400 | # Update the output label with the result
401 | # output_label.configure(text=result + "\n saved at" + output_file)
402 |
403 | run_button.configure(state="enabled")
404 | else:
405 | message = "Please select a model and input audio file"
406 | run_button.configure(state="enabled")
407 | result_state.configure(text_color="red")
408 |
409 | loading_progress.stop()
410 | loading_frame.pack_forget()
411 | result_state.pack(padx=10, pady=10, side="top")
412 | result_state.configure(text=message)
413 |
414 |
415 | def browse_file():
416 |     filepath = filedialog.askopenfilename(
417 |         filetypes=[("Audio Files", ["*.mp3", "*.wav"])])
418 | filepath = os.path.normpath(filepath) # Normalize file path
419 | input_audio_entry.delete(0, tk.END)
420 | input_audio_entry.insert(0, filepath)
421 |
422 |
423 |
424 | def start_processing():
425 |
426 | t = threading.Thread(target=on_button_click)
427 | t.start()
428 |
429 |
430 | # Create tkinter window and widgets
431 | root = ctk.CTk()
432 | ctk.set_appearance_mode("dark")
433 | root.title("RVC GUI")
434 | # Get screen dimensions
435 | screen_width = root.winfo_screenwidth()
436 | screen_height = root.winfo_screenheight()
437 |
438 | # Set GUI dimensions as a percentage of screen size
439 |
440 | gui_height = int(screen_height * 0.85) # 80% of screen height
441 | gui_dimensions = f"800x{gui_height}"
442 |
443 | root.geometry(gui_dimensions)
444 | root.resizable(False, True)
445 |
446 | model_loaded = False
447 |
448 | def selected_model(choice):
449 | global chosenOne
450 | chosenOne = choice
451 | file_index_entry.delete(0, ctk.END)
452 | model_dir = os.path.join(models_dir, choice)
453 | pth_files = [f for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f))
454 | and f.endswith(".pth") and not (f.startswith("G_") or f.startswith("D_"))
455 | and os.path.getsize(os.path.join(model_dir, f)) < 200*(1024**2)]
456 |
457 | if pth_files:
458 | global pth_file_path
459 | pth_file_path = os.path.join(model_dir, pth_files[0])
460 | npy_files = [f for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f))
461 | and f.endswith(".index")]
462 | if npy_files:
463 | npy_files_dir = [os.path.join(model_dir, f) for f in npy_files]
464 | if len(npy_files_dir) == 1:
465 | index_file = npy_files_dir[0]
466 | print(f".pth file directory: {pth_file_path}")
467 | print(f".index file directory: {index_file}")
468 | file_index_entry.insert(0, os.path.normpath(index_file))
469 | else:
470 |             print(f"Expected exactly one .index file in {model_dir}, found {len(npy_files_dir)}")
471 | else:
472 | print(f"No .index files found in {model_dir}")
473 | get_vc(pth_file_path, 0)
474 | global model_loaded
475 | model_loaded = True
476 | else:
477 | print(f"No eligible .pth files found in {model_dir}")
478 |
479 |
480 | def index_slider_event(value):
481 | index_rate_label.configure(
482 | text='Feature retrieval rate: %s' % round(value, 2))
483 | # print(value)
484 |
485 |
486 | def pitch_slider_event(value):
487 | f0_pitch_label.configure(text='Pitch: %s' % round(value))
488 | # print(value)
489 |
490 | def crepe_hop_length_slider_event(value):
491 | crepe_hop_length_label.configure(text='crepe hop: %s' % round((value) * 64))
492 | # print(value)
493 |
494 |
495 | # hide crepe hop length slider if crepe is not selected
496 | def crepe_hop_length_slider_visibility(value):
497 | if value == "crepe" or value == "crepe-tiny":
498 | crepe_hop_length_label.grid(row=2, column=0, padx=10, pady=5, )
499 | crepe_hop_length_entry.grid(row=2, column=1, padx=10, pady=5, )
500 | else:
501 | crepe_hop_length_label.grid_remove()
502 | crepe_hop_length_entry.grid_remove()
503 |
504 | def update_config(selected):
505 |     global device, is_half  # update the module-level device/precision used by the pipeline
506 | if selected == "GPU":
507 | device = "cuda:0"
508 | # is_half = True
509 | else:
510 | if torch.backends.mps.is_available():
511 | device = "mps"
512 | # is_half = False
513 | else:
514 | device = "cpu"
515 | is_half = False
516 |
517 | config.device = device
518 | config.is_half = is_half
519 |
520 |
521 | if "pth_file_path" in globals():
522 | load_hubert()
523 | get_vc(pth_file_path, 0)
524 |
525 |
526 | models_dir = "./models"
527 | model_folders = [f for f in os.listdir(models_dir) if os.path.isdir(os.path.join(
528 | models_dir, f)) and any(f.endswith(".pth") for f in os.listdir(os.path.join(models_dir, f)))]
529 |
530 |
531 | master_frame = ctk.CTkFrame(master=root, height=500)
532 | master_frame.pack(padx=5, pady=5)
533 |
534 |
535 | left_frame = ctk.CTkFrame(master=master_frame, )
536 | left_frame.grid(row=0, column=0, padx=10, pady=10, sticky="nsew")
537 |
538 | right_frame = ctk.CTkFrame(master=master_frame, )
539 | right_frame.grid(row=0, column=1, pady=10, padx=10, sticky="nsew")
540 |
541 |
542 | inputpath_frame = ctk.CTkFrame(master=left_frame)
543 | inputpath_frame.grid(row=0, column=0, padx=15, pady=10, sticky="nsew")
544 |
545 |
546 | output_audio_frame = ctk.CTkFrame(master=root)
547 |
548 | select_model_frame = ctk.CTkFrame(left_frame)
549 | select_model_frame.grid(row=1, column=0, padx=15, pady=10, sticky="nsew")
550 |
551 | pitch_frame = ctk.CTkFrame(left_frame)
552 | pitch_frame.grid(row=3, column=0, padx=10, pady=5, sticky="nsew")
553 |
554 |
555 |
556 | # Get the list of .pth files in the models directory
557 |
558 |
559 |
560 | sid_label = ctk.CTkLabel(select_model_frame, text="Speaker ID:")
561 | sid_entry = ctk.CTkEntry(select_model_frame)
562 | sid_entry.insert(0, "0")
563 | sid_entry.configure(state="disabled")
564 |
565 | # Initializing model select widget
566 | select_model = ctk.StringVar(value="Select a model")
567 | model_list = ctk.CTkOptionMenu(select_model_frame, values=model_folders,
568 | command=selected_model,
569 | variable=select_model
570 | )
571 |
572 | # Initializing audio file input widget
573 | input_audio_label = ctk.CTkLabel(inputpath_frame, text="Input audio file:")
574 | browse_button = ctk.CTkButton(
575 | inputpath_frame, text="Browse", command=browse_file)
576 | input_audio_entry = ctk.CTkEntry(inputpath_frame)
577 |
578 | # Initializing pitch widget
579 | f0_pitch_label = ctk.CTkLabel(pitch_frame, text="Pitch: 0")
580 | f0_pitch_entry = ctk.CTkSlider(
581 | pitch_frame, from_=-20, to=20, number_of_steps=100, command=pitch_slider_event, )
582 | f0_pitch_entry.set(0)
583 |
584 | # Initializing crepe hop length widget
585 | crepe_hop_length_label = ctk.CTkLabel(pitch_frame, text="crepe hop: 128")
586 | crepe_hop_length_entry = ctk.CTkSlider(
587 | pitch_frame, from_=1, to=8, number_of_steps=7, command=crepe_hop_length_slider_event)
588 | crepe_hop_length_entry.set(2)
589 |
590 | # Initializing f0 file widget
591 | #f0_file_label = ctk.CTkLabel(right_frame, text="F0 file (Optional/Not Tested)")
592 | #f0_file_entry = ctk.CTkEntry(right_frame, width=250)
593 |
594 | # Initializing f0 method widget
595 | f0_method_label = ctk.CTkLabel(
596 | pitch_frame, text="F0 method")
597 | f0_method_entry = ctk.CTkSegmentedButton(
598 |     pitch_frame, height=40, values=["dio", "pm", "harvest", "crepe", "crepe-tiny"], command=crepe_hop_length_slider_visibility)
599 | f0_method_entry.set("dio")
600 |
601 | # Initializing index file widget
602 | file_index_label = ctk.CTkLabel(right_frame, text=".index File (Recommended)")
603 | file_index_entry = ctk.CTkEntry(right_frame, width=250)
604 |
605 | # (big npy file widget removed; the features are reconstructed from the .index file)
606 |
607 |
608 |
609 | # Initializing index rate widget
610 | index_rate_entry = ctk.CTkSlider(
611 | right_frame, from_=0, to=1, number_of_steps=20, command=index_slider_event, )
612 | index_rate_entry.set(0.4)
613 | index_rate_label = ctk.CTkLabel(
614 | right_frame, text="Feature retrieval rate: 0.4" )
615 |
616 | # Initializing run button widget
617 | run_button = ctk.CTkButton(
618 | left_frame, fg_color="green", hover_color="darkgreen", text="Convert", command=start_processing)
619 |
620 | # Initializing output label widget
621 | output_label = ctk.CTkLabel(right_frame, text="")
622 |
623 | # Initializing notes label widget
624 | notes_label = ctk.CTkLabel(left_frame, justify="left", text_color="#8A8A8A", text="Tips: \n 1. harvest and crepe are the highest quality, but also the slowest methods. \n 2. dio and pm are the lightest and fastest methods, but also the lowest quality.")
625 |
626 | # Initializing loading progress bar widget
627 |
628 | loading_frame = ctk.CTkFrame(master=root, width=200)
629 |
630 | loading_label = ctk.CTkLabel(loading_frame, text="Converting... If the window is not responding, please wait.")
631 | loading_label.pack(padx=10, pady=10)
632 | loading_progress = ctk.CTkProgressBar(master=loading_frame, width=200)
633 | loading_progress.configure(mode="indeterminate")
634 | loading_progress.pack(padx=10, pady=10)
635 |
636 | # Initializing result state label widget
637 | result_state = ctk.CTkLabel(
638 | root, text="", height=50, width=100, corner_radius=10)
639 |
640 | # Initializing change device widget
641 | change_device_label = ctk.CTkLabel(right_frame, text="Processing mode")
642 | change_device = ctk.CTkSegmentedButton(
643 | right_frame, command=lambda value: update_config(value))
644 | change_device.configure(
645 | values=["GPU", "CPU"])
646 |
647 | if "cpu" in device.lower():
648 | change_device.set("CPU")
649 | change_device.configure(state="disabled")
650 |
651 | else:
652 | change_device.set("GPU")
653 |
654 | # Initializing last output label & open output button widgets
655 | last_output_label = ctk.CTkLabel(output_audio_frame, text="Output path: ")
656 | last_output_file = ctk.CTkLabel(output_audio_frame, text="", text_color="green")
657 | open_output_button = ctk.CTkButton(output_audio_frame, text="Open", command=lambda: play_audio(output_file))
658 |
659 | # Initializing import models button widget
660 | import_models_button = ctk.CTkButton(right_frame, fg_color="darkred", hover_color="black", corner_radius=20, text="Import model from .zip", command=browse_zip)
661 |
662 |
663 |
664 | # button = ctk.CTkButton(root, text="Open Window", command=open_window)
665 | # button.pack()
666 |
667 |
668 |
669 | # Packing widgets into window
670 | notes_label.grid(row=5, column=0, padx=10, pady=10)
671 | change_device_label.grid(row=1, column=0, columnspan=2, padx=10, pady=5)
672 | change_device.grid(row=2, column=0, columnspan=2, padx=10, pady=5)
673 | last_output_label.grid(pady=10, row=0, column=0)
674 | last_output_file.grid(pady=10, row=0, column=1)
675 | open_output_button.grid(pady=10, row=1, column=0, columnspan=2)
676 | import_models_button.grid(padx=10, pady=10, row=0, column=0)
677 | model_list.grid(padx=10, pady=10, row=0, column=2)
678 | sid_label.grid(padx=10, pady=10, row=0, column=0)
679 | sid_entry.grid(padx=0, pady=10, row=0, column=1)
680 | browse_button.grid(padx=10, pady=10, row=0, column=2)
681 | input_audio_label.grid(padx=10, pady=10, row=0, column=0)
682 | input_audio_entry.grid(padx=10, pady=10, row=0, column=1)
683 | f0_method_label.grid(padx=10, pady=10, row=0, column=0)
684 | f0_method_entry.grid(padx=10, pady=10, row=0, column=1)
685 | #crepe_hop_length_label.grid(padx=10, pady=10, row=1, column=0)
686 | #crepe_hop_length_entry.grid(padx=10, pady=10, row=1, column=1)
687 | f0_pitch_label.grid(padx=10, pady=10, row=3, column=0)
688 | f0_pitch_entry.grid(padx=10, pady=10, row=3, column=1)
689 | #f0_file_label.grid(padx=10, pady=10)
690 | #f0_file_entry.grid(padx=10, pady=10)
691 | file_index_label.grid(padx=10, pady=10)
692 | file_index_entry.grid(padx=10, pady=10)
693 |
694 |
695 | index_rate_label.grid(padx=10, pady=10)
696 | index_rate_entry.grid(padx=10, pady=10)
697 | run_button.grid(padx=30, pady=30, row=4, column=0, columnspan=2)
698 | output_label.grid(padx=0, pady=10)
699 |
700 | root.mainloop()
701 |
--------------------------------------------------------------------------------
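
A note on the crepe hop slider defined above: the CTkSlider stores integer steps from 1 to 8, and the label callback displays value * 64, so the default of 2 matches the "crepe hop: 128" the label starts with. A minimal sketch of that mapping (the helper name is hypothetical, not part of rvcgui.py):

def slider_to_crepe_hop(value):
    # The slider runs from 1 to 8 in integer steps; the displayed
    # crepe hop length is value * 64 samples, i.e. 64..512.
    return round(value) * 64

assert slider_to_crepe_hop(2) == 128  # GUI default
assert slider_to_crepe_hop(8) == 512  # coarsest, fastest setting
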
/setup.bat:
--------------------------------------------------------------------------------
1 | python -m pip install -U pip setuptools wheel
2 | pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118
3 | pip install -r requirements.txt
--------------------------------------------------------------------------------
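
setup.bat installs PyTorch and torchaudio from the CUDA 11.8 wheel index before the rest of the requirements. A quick post-install sanity check (a sketch, not a file in this repo) to confirm the CUDA build was actually picked up:

import torch

# On a working CUDA 11.8 setup this prints a version tagged "+cu118"
# followed by True; if it prints False, the GUI's device selector will
# be locked to CPU mode.
print(torch.__version__)
print(torch.cuda.is_available())
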
/trainset_preprocess_pipeline_print.py:
--------------------------------------------------------------------------------
1 | import sys, os, multiprocessing
2 | from scipy import signal
3 |
4 | now_dir = os.getcwd()
5 | sys.path.append(now_dir)
6 |
7 | inp_root = sys.argv[1]
8 | sr = int(sys.argv[2])
9 | n_p = int(sys.argv[3])
10 | exp_dir = sys.argv[4]
11 | noparallel = sys.argv[5] == "True"
12 | import numpy as np, traceback
13 | from slicer2 import Slicer
14 | import librosa
15 | from scipy.io import wavfile
16 | 
17 | from my_utils import load_audio
18 |
19 | mutex = multiprocessing.Lock()
20 | f = open("%s/preprocess.log" % exp_dir, "a+")
21 |
22 |
23 | def println(strr):
24 | mutex.acquire()
25 | print(strr)
26 | f.write("%s\n" % strr)
27 | f.flush()
28 | mutex.release()
29 |
30 |
31 | class PreProcess:
32 | def __init__(self, sr, exp_dir):
33 | self.slicer = Slicer(
34 | sr=sr,
35 | threshold=-40,
36 | min_length=800,
37 | min_interval=400,
38 | hop_size=15,
39 | max_sil_kept=150,
40 | )
41 | self.sr = sr
42 | self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
43 | self.per = 3.0
44 | self.overlap = 0.3
45 | self.tail = self.per + self.overlap
46 | self.max = 0.95
47 | self.alpha = 0.8
48 | self.exp_dir = exp_dir
49 | self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
50 | self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
51 | os.makedirs(self.exp_dir, exist_ok=True)
52 | os.makedirs(self.gt_wavs_dir, exist_ok=True)
53 | os.makedirs(self.wavs16k_dir, exist_ok=True)
54 |
55 | def norm_write(self, tmp_audio, idx0, idx1):
56 | tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
57 | 1 - self.alpha
58 | ) * tmp_audio
59 | wavfile.write(
60 | "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
61 | self.sr,
62 | tmp_audio.astype(np.float32),
63 | )
64 | tmp_audio = librosa.resample(tmp_audio, orig_sr=self.sr, target_sr=16000)#, res_type="soxr_vhq"
65 | wavfile.write(
66 | "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
67 | 16000,
68 | tmp_audio.astype(np.float32),
69 | )
70 |
71 | def pipeline(self, path, idx0):
72 | try:
73 | audio = load_audio(path, self.sr)
74 |             # a zero-phase digital filter (filtfilt) causes pre-ringing noise, so use a causal lfilter instead
75 | # audio = signal.filtfilt(self.bh, self.ah, audio)
76 | audio = signal.lfilter(self.bh, self.ah, audio)
77 |
78 | idx1 = 0
79 | for audio in self.slicer.slice(audio):
80 | i = 0
81 |                 while True:
82 | start = int(self.sr * (self.per - self.overlap) * i)
83 | i += 1
84 | if len(audio[start:]) > self.tail * self.sr:
85 | tmp_audio = audio[start : start + int(self.per * self.sr)]
86 | self.norm_write(tmp_audio, idx0, idx1)
87 | idx1 += 1
88 | else:
89 | tmp_audio = audio[start:]
90 | idx1 += 1
91 | break
92 | self.norm_write(tmp_audio, idx0, idx1)
93 | println("%s->Suc." % path)
94 |         except Exception:
95 | println("%s->%s" % (path, traceback.format_exc()))
96 |
97 | def pipeline_mp(self, infos):
98 | for path, idx0 in infos:
99 | self.pipeline(path, idx0)
100 |
101 | def pipeline_mp_inp_dir(self, inp_root, n_p):
102 | try:
103 | infos = [
104 | ("%s/%s" % (inp_root, name), idx)
105 | for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
106 | ]
107 | if noparallel:
108 | for i in range(n_p):
109 | self.pipeline_mp(infos[i::n_p])
110 | else:
111 | ps = []
112 | for i in range(n_p):
113 | p = multiprocessing.Process(
114 | target=self.pipeline_mp, args=(infos[i::n_p],)
115 | )
116 | p.start()
117 | ps.append(p)
118 | for p in ps:
119 | p.join()
120 |         except Exception:
121 | println("Fail. %s" % traceback.format_exc())
122 |
123 |
124 | def preprocess_trainset(inp_root, sr, n_p, exp_dir):
125 | pp = PreProcess(sr, exp_dir)
126 | println("start preprocess")
127 | println(sys.argv)
128 | pp.pipeline_mp_inp_dir(inp_root, n_p)
129 | println("end preprocess")
130 |
131 |
132 | if __name__ == "__main__":
133 | preprocess_trainset(inp_root, sr, n_p, exp_dir)
134 |
--------------------------------------------------------------------------------
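
For reference, PreProcess above slices each recording into 3.0 s segments whose starts advance by per - overlap = 2.7 s, keeps any remainder shorter than per + overlap = 3.3 s as a final tail, and soft-normalizes every segment by blending a peak-normalized copy (weight alpha = 0.8, peak 0.95) with the raw signal. A condensed numpy sketch of both computations, assuming an illustrative 40 kHz sample rate (the real rate comes from sys.argv):

import numpy as np

SR = 40000              # illustrative; the script reads sr from sys.argv[2]
PER, OVERLAP = 3.0, 0.3
TAIL = PER + OVERLAP

def segment_starts(n_samples, sr=SR):
    # Mirrors PreProcess.pipeline: window starts advance by
    # (PER - OVERLAP) seconds; once fewer than TAIL seconds remain,
    # that remainder becomes the final, shorter segment.
    starts, i = [], 0
    while True:
        start = int(sr * (PER - OVERLAP) * i)
        starts.append(start)
        if n_samples - start <= TAIL * sr:
            return starts
        i += 1

def soft_normalize(x, peak=0.95, alpha=0.8):
    # Same arithmetic as PreProcess.norm_write: a peak-normalized copy
    # weighted by alpha plus the untouched signal weighted by 1 - alpha.
    return x / np.abs(x).max() * (peak * alpha) + (1 - alpha) * x
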
/vc_infer_pipeline.py:
--------------------------------------------------------------------------------
1 | import numpy as np, parselmouth, torch
2 | from time import time as ttime
3 | import torch.nn.functional as F
4 | import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
5 | import scipy.signal as signal
6 | import pyworld, os, traceback, faiss
7 | 
8 | from torch import Tensor # Fork Feature. Used for pitch prediction for the torchcrepe f0 inference computation
9 |
10 | bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
11 |
12 | class VC(object):
13 | def __init__(self, tgt_sr, config):
14 | self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
15 | config.x_pad,
16 | config.x_query,
17 | config.x_center,
18 | config.x_max,
19 | config.is_half,
20 | )
21 |         self.sr = 16000  # HuBERT input sample rate
22 |         self.window = 160  # samples per frame
23 |         self.t_pad = self.sr * self.x_pad  # padding added before/after each chunk
24 |         self.t_pad_tgt = tgt_sr * self.x_pad
25 |         self.t_pad2 = self.t_pad * 2
26 |         self.t_query = self.sr * self.x_query  # search window on each side of a cut point
27 |         self.t_center = self.sr * self.x_center  # spacing between candidate cut points
28 |         self.t_max = self.sr * self.x_max  # duration threshold below which no cut search is done
29 | self.device = config.device
30 |
31 | #region f0 Overhaul Region
32 |     # Fork Feature: pick the best torch device for f0 algorithms that need one. Returns a torch.device.
33 | def get_optimal_torch_device(self, index: int = 0) -> torch.device:
34 | # Get cuda device
35 | if torch.cuda.is_available():
36 | return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
37 | elif torch.backends.mps.is_available():
38 | return torch.device("mps")
39 |         # TODO: also check for "xla" devices here (requires the torch_xla.core.xla_model library)
40 |         # Otherwise fall back to the CPU.
41 | return torch.device("cpu")
42 |
43 | # Get the f0 via parselmouth computation
44 | def get_f0_pm_computation(self, x, time_step, f0_min, f0_max, p_len):
45 | f0 = (
46 | parselmouth.Sound(x, self.sr)
47 | .to_pitch_ac(
48 | time_step=time_step / 1000,
49 | voicing_threshold=0.6,
50 | pitch_floor=f0_min,
51 | pitch_ceiling=f0_max,
52 | )
53 | .selected_array["frequency"]
54 | )
55 | pad_size = (p_len - len(f0) + 1) // 2
56 | if pad_size > 0 or p_len - len(f0) - pad_size > 0:
57 | f0 = np.pad(
58 | f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
59 | )
60 | return f0
61 |
62 |     # Get the f0 via the pyworld computation. Fork Feature: adds dio alongside harvest
63 | def get_f0_pyworld_computation(self, x, f0_min, f0_max, f0_type):
64 | if f0_type == "harvest":
65 | f0, t = pyworld.harvest(
66 | x.astype(np.double),
67 | fs=self.sr,
68 | f0_ceil=f0_max,
69 | f0_floor=f0_min,
70 | frame_period=10,
71 | )
72 | elif f0_type == "dio":
73 | f0, t = pyworld.dio(
74 | x.astype(np.double),
75 | fs=self.sr,
76 | f0_ceil=f0_max,
77 | f0_floor=f0_min,
78 | frame_period=10,
79 | )
80 | f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
81 | f0 = signal.medfilt(f0, 3)
82 | return f0
83 |
84 | # Fork Feature: Get the f0 via the crepe algorithm from torchcrepe
85 | def get_f0_crepe_computation(
86 | self,
87 | x,
88 | f0_min,
89 | f0_max,
90 | p_len,
91 |         hop_length=128, # 512 before. The hop length controls how often the pitch is re-estimated; lower hop lengths mean more pitch accuracy but longer inference time.
92 | model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
93 | ):
94 |         x = x.astype(np.float32)  # fixes the F.conv2d exception: torchcrepe needs float32 input, not double
95 | x /= np.quantile(np.abs(x), 0.999)
96 | torch_device = self.get_optimal_torch_device()
97 | audio = torch.from_numpy(x).to(torch_device, copy=True)
98 | audio = torch.unsqueeze(audio, dim=0)
99 | if audio.ndim == 2 and audio.shape[0] > 1:
100 | audio = torch.mean(audio, dim=0, keepdim=True).detach()
101 | audio = audio.detach()
102 | print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
103 | pitch: Tensor = torchcrepe.predict(
104 | audio,
105 | self.sr,
106 | hop_length,
107 | f0_min,
108 | f0_max,
109 | model,
110 | batch_size=hop_length * 2,
111 | device=torch_device,
112 | pad=True
113 | )
114 | p_len = p_len or x.shape[0] // hop_length
115 | # Resize the pitch for final f0
116 |         source = pitch.squeeze(0).cpu().float().numpy()
117 | source[source < 0.001] = np.nan
118 | target = np.interp(
119 | np.arange(0, len(source) * p_len, len(source)) / p_len,
120 | np.arange(0, len(source)),
121 | source
122 | )
123 | f0 = np.nan_to_num(target)
124 | return f0 # Resized f0
125 |
126 | #endregion
127 |
128 | def get_f0(self, x, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0=None):
129 | time_step = self.window / self.sr * 1000
130 | f0_min = 50
131 | f0_max = 1100
132 | f0_mel_min = 1127 * np.log(1 + f0_min / 700)
133 | f0_mel_max = 1127 * np.log(1 + f0_max / 700)
134 | if f0_method == "pm":
135 | f0 = self.get_f0_pm_computation(x, time_step, f0_min, f0_max, p_len)
136 | elif f0_method == "harvest":
137 | f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "harvest")
138 | elif f0_method == "dio": # Fork Feature
139 | f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "dio")
140 | elif f0_method == "crepe": # Fork Feature: Adding a new f0 algorithm called crepe
141 | f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
142 |         elif f0_method == "crepe-tiny": # Fork Feature: add the crepe-tiny model
143 | f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")
144 |
145 | print("Using the following f0 method: " + f0_method)
146 | f0 *= pow(2, f0_up_key / 12)
147 | # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
148 |         tf0 = self.sr // self.window  # f0 points per second
149 | if inp_f0 is not None:
150 | delta_t = np.round(
151 | (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
152 | ).astype("int16")
153 | replace_f0 = np.interp(
154 | list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
155 | )
156 | shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
157 | f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
158 | :shape
159 | ]
160 | # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
161 | f0bak = f0.copy()
162 | f0_mel = 1127 * np.log(1 + f0 / 700)
163 | f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
164 | f0_mel_max - f0_mel_min
165 | ) + 1
166 | f0_mel[f0_mel <= 1] = 1
167 | f0_mel[f0_mel > 255] = 255
168 |         f0_coarse = np.rint(f0_mel).astype(np.int64)
169 |
170 | return f0_coarse, f0bak # 1-0
171 |
172 | def vc(
173 | self,
174 | model,
175 | net_g,
176 | sid,
177 | audio0,
178 | pitch,
179 | pitchf,
180 | times,
181 | index,
182 | big_npy,
183 | index_rate,
184 | version,
185 | ): # ,file_index,file_big_npy
186 | feats = torch.from_numpy(audio0)
187 | if self.is_half:
188 | feats = feats.half()
189 | else:
190 | feats = feats.float()
191 | if feats.dim() == 2: # double channels
192 | feats = feats.mean(-1)
193 | assert feats.dim() == 1, feats.dim()
194 | feats = feats.view(1, -1)
195 | padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
196 |
197 | inputs = {
198 | "source": feats.to(self.device),
199 | "padding_mask": padding_mask,
200 | "output_layer": 9 if version == "v1" else 12,
201 | }
202 | t0 = ttime()
203 | with torch.no_grad():
204 | logits = model.extract_features(**inputs)
205 | feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
206 |
207 |         if (
208 |             index is not None
209 |             and big_npy is not None
210 |             and index_rate != 0
211 |         ):
212 | npy = feats[0].cpu().numpy()
213 | if self.is_half:
214 | npy = npy.astype("float32")
215 |
216 | # _, I = index.search(npy, 1)
217 | # npy = big_npy[I.squeeze()]
218 |
219 | score, ix = index.search(npy, k=8)
220 | weight = np.square(1 / score)
221 | weight /= weight.sum(axis=1, keepdims=True)
222 | npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
223 |
224 | if self.is_half:
225 | npy = npy.astype("float16")
226 | feats = (
227 | torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
228 | + (1 - index_rate) * feats
229 | )
230 |
231 | feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
232 | t1 = ttime()
233 | p_len = audio0.shape[0] // self.window
234 | if feats.shape[1] < p_len:
235 | p_len = feats.shape[1]
236 |         if pitch is not None and pitchf is not None:
237 | pitch = pitch[:, :p_len]
238 | pitchf = pitchf[:, :p_len]
239 | p_len = torch.tensor([p_len], device=self.device).long()
240 | with torch.no_grad():
241 |             if pitch is not None and pitchf is not None:
242 | audio1 = (
243 | (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
244 | .data.cpu()
245 | .float()
246 | .numpy()
247 | )
248 | else:
249 | audio1 = (
250 | (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
251 | )
252 | del feats, p_len, padding_mask
253 | if torch.cuda.is_available():
254 | torch.cuda.empty_cache()
255 | t2 = ttime()
256 | times[0] += t1 - t0
257 | times[2] += t2 - t1
258 | return audio1
259 |
260 | def pipeline(
261 | self,
262 | model,
263 | net_g,
264 | sid,
265 | audio,
266 | times,
267 | f0_up_key,
268 | f0_method,
269 | file_index,
270 | # file_big_npy,
271 | index_rate,
272 | if_f0,
273 | version,
274 | crepe_hop_length,
275 | f0_file=None,
276 | ):
277 | if (
278 | file_index != ""
279 | # and file_big_npy != ""
280 | # and os.path.exists(file_big_npy) == True
281 |             and os.path.exists(file_index)
282 | and index_rate != 0
283 | ):
284 | try:
285 | index = faiss.read_index(file_index)
286 | # big_npy = np.load(file_big_npy)
287 | big_npy = index.reconstruct_n(0, index.ntotal)
288 |             except Exception:
289 | traceback.print_exc()
290 | index = big_npy = None
291 | else:
292 | index = big_npy = None
293 | audio = signal.filtfilt(bh, ah, audio)
294 | audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
295 | opt_ts = []
296 | if audio_pad.shape[0] > self.t_max:
297 | audio_sum = np.zeros_like(audio)
298 | for i in range(self.window):
299 | audio_sum += audio_pad[i : i - self.window]
300 | for t in range(self.t_center, audio.shape[0], self.t_center):
301 | opt_ts.append(
302 | t
303 | - self.t_query
304 | + np.where(
305 | np.abs(audio_sum[t - self.t_query : t + self.t_query])
306 | == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
307 | )[0][0]
308 | )
309 | s = 0
310 | audio_opt = []
311 | t = None
312 | t1 = ttime()
313 | audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
314 | p_len = audio_pad.shape[0] // self.window
315 | inp_f0 = None
316 |         if hasattr(f0_file, "name"):
317 | try:
318 | with open(f0_file.name, "r") as f:
319 | lines = f.read().strip("\n").split("\n")
320 | inp_f0 = []
321 | for line in lines:
322 | inp_f0.append([float(i) for i in line.split(",")])
323 | inp_f0 = np.array(inp_f0, dtype="float32")
324 |             except Exception:
325 | traceback.print_exc()
326 | sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
327 | pitch, pitchf = None, None
328 | if if_f0 == 1:
329 | pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0)
330 | pitch = pitch[:p_len]
331 | pitchf = pitchf[:p_len]
332 | pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
333 | pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float32).unsqueeze(0).float()
334 | t2 = ttime()
335 | times[1] += t2 - t1
336 | for t in opt_ts:
337 | t = t // self.window * self.window
338 | if if_f0 == 1:
339 | audio_opt.append(
340 | self.vc(
341 | model,
342 | net_g,
343 | sid,
344 | audio_pad[s : t + self.t_pad2 + self.window],
345 | pitch[:, s // self.window : (t + self.t_pad2) // self.window],
346 | pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
347 | times,
348 | index,
349 | big_npy,
350 | index_rate,
351 | version,
352 | )[self.t_pad_tgt : -self.t_pad_tgt]
353 | )
354 | else:
355 | audio_opt.append(
356 | self.vc(
357 | model,
358 | net_g,
359 | sid,
360 | audio_pad[s : t + self.t_pad2 + self.window],
361 | None,
362 | None,
363 | times,
364 | index,
365 | big_npy,
366 | index_rate,
367 | version,
368 | )[self.t_pad_tgt : -self.t_pad_tgt]
369 | )
370 | s = t
371 | if if_f0 == 1:
372 | audio_opt.append(
373 | self.vc(
374 | model,
375 | net_g,
376 | sid,
377 | audio_pad[t:],
378 | pitch[:, t // self.window :] if t is not None else pitch,
379 | pitchf[:, t // self.window :] if t is not None else pitchf,
380 | times,
381 | index,
382 | big_npy,
383 | index_rate,
384 | version,
385 | )[self.t_pad_tgt : -self.t_pad_tgt]
386 | )
387 | else:
388 | audio_opt.append(
389 | self.vc(
390 | model,
391 | net_g,
392 | sid,
393 | audio_pad[t:],
394 | None,
395 | None,
396 | times,
397 | index,
398 | big_npy,
399 | index_rate,
400 | version,
401 | )[self.t_pad_tgt : -self.t_pad_tgt]
402 | )
403 | audio_opt = np.concatenate(audio_opt)
404 | del pitch, pitchf, sid
405 | if torch.cuda.is_available():
406 | torch.cuda.empty_cache()
407 | return audio_opt
--------------------------------------------------------------------------------
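
When an .index file is supplied, VC.vc above replaces each HuBERT frame with an inverse-square-distance-weighted average of its k = 8 nearest neighbours from the training features, then mixes the retrieved frame back with the original at index_rate. A self-contained numpy/faiss sketch of the same arithmetic on toy data (shapes are illustrative; in this pipeline the feature width is 256 for v1 models and 768 for v2):

import faiss
import numpy as np

rng = np.random.default_rng(0)
dim = 256
big_npy = rng.standard_normal((1000, dim)).astype("float32")  # the pipeline recovers this via index.reconstruct_n(0, index.ntotal)

index = faiss.IndexFlatL2(dim)   # stand-in for faiss.read_index(file_index)
index.add(big_npy)

feats = rng.standard_normal((50, dim)).astype("float32")  # frames x dim
index_rate = 0.4

# Same steps as VC.vc: k-NN search, inverse-square-distance weights
# normalized per frame, weighted sum of neighbours, then a linear blend.
score, ix = index.search(feats, 8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
blended = index_rate * retrieved + (1 - index_rate) * feats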