├── utils ├── __init__.py ├── audio.py └── utils.py ├── egs └── visinger2 │ ├── __init__.py │ ├── bash │ └── train.sh │ ├── config.json │ ├── inference.py │ ├── dataset.py │ ├── train.py │ └── models.py ├── text └── npu │ ├── __init__.py │ ├── symbol_converter.py │ └── symbols.py ├── requirements_3090.txt ├── prepare_multispeaker.py ├── ds_inference.py ├── preprocess_multispeaker.py ├── README.md ├── modules ├── losses.py ├── commons.py ├── ddsp.py ├── transforms.py ├── modules.py ├── attentions.py └── stft.py ├── preprocess └── mel_processing.py ├── preprocess.py └── infer └── __init__.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egs/visinger2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text/npu/__init__.py: -------------------------------------------------------------------------------- 1 | from text.npu import symbols 2 | from text.npu.symbol_converter import * -------------------------------------------------------------------------------- /requirements_3090.txt: -------------------------------------------------------------------------------- 1 | ipython==8.8.0 2 | librosa==0.8.1 3 | matplotlib==3.3.2 4 | numpy==1.19.2 5 | pyworld==0.3.0 6 | scipy==1.5.2 7 | soundfile==0.11.0 8 | torch==1.8.1 9 | tqdm==4.50.2 10 | -------------------------------------------------------------------------------- /egs/visinger2/bash/train.sh: -------------------------------------------------------------------------------- 1 | 2 | num_gpu=$1 3 | 4 | cd $(dirname $(dirname $0)) 5 | exp_dir=$(pwd) 6 | base_dir=$(dirname $(dirname $exp_dir)) 7 | config=${exp_dir}/config.json 8 | 9 | export PYTHONPATH=$base_dir 10 | export PYTHONIOENCODING=UTF-8 11 | 12 | CUDA_VISIBLE_DEVICES=${num_gpu} python train.py -c config.json 13 | 14 | -------------------------------------------------------------------------------- /prepare_multispeaker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | for spk in os.listdir("data"): 5 | if os.path.isdir(f"data/{spk}"): 6 | if os.path.exists(f"data/{spk}/raw/wavs"): 7 | shutil.move(f"data/{spk}/raw/wavs", f"data/{spk}") 8 | shutil.move(f"data/{spk}/raw/transcriptions.txt", f"data/{spk}") 9 | shutil.rmtree(f"data/{spk}/raw") 10 | 11 | -------------------------------------------------------------------------------- /ds_inference.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import re 5 | 6 | import numpy as np 7 | import soundfile 8 | import torch 9 | import tqdm 10 | from scipy.interpolate import interp1d 11 | 12 | from utils import utils 13 | from egs.visinger2.models import SynthesizerTrn 14 | from infer import preprocess, cross_fade, infer_ds 15 | 16 | trans = -12 17 | speaker = "otto" 18 | ds_path = "infer/share.ds" 19 | config_json = "egs/visinger2/config.json" 20 | checkpoint_path = f"/Volumes/Extend/下载/G_157000.pth" 21 | file_name = os.path.splitext(os.path.basename(ds_path))[0] 22 | step = re.findall(r'G_(\d+)\.pth', checkpoint_path)[0] 23 | 24 | 25 | ds = json.load(open(ds_path)) 26 | hps = utils.get_hparams_from_file(config_json) 27 | net_g = SynthesizerTrn(hps) 28 | _ = net_g.eval() 29 | _ = utils.load_checkpoint(checkpoint_path, net_g, 
None) 30 | 31 | audio = infer_ds(net_g, hps, ds, speaker, trans) 32 | soundfile.write(f"samples/{speaker}_{file_name}_{step}step.wav", audio, 44100) 33 | -------------------------------------------------------------------------------- /preprocess_multispeaker.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | 4 | data_root = "data" 5 | 6 | 7 | transcriptions = glob.glob(f"{data_root}/*/transcriptions.txt") 8 | spk2id = {} 9 | spk_id = 0 10 | ms_transcriptions = open(f'{data_root}/transcriptions.txt', "w") 11 | ms_train_set = open(f'{data_root}/train.list', "w") 12 | ms_test_set = open(f'{data_root}/test.list', "w") 13 | for transcription in transcriptions: 14 | spk = transcription.split("/")[-2] 15 | spk2id[spk] = spk_id 16 | spk_id += 1 17 | for line in open(transcription).readlines(): 18 | ms_transcriptions.write(f"{spk}/{line}") 19 | for line in open(transcription.replace("transcriptions.txt", "train.list")): 20 | ms_train_set.write(f"{spk}/{line}") 21 | for line in open(transcription.replace("transcriptions.txt", "test.list")): 22 | ms_test_set.write(f"{spk}/{line}") 23 | 24 | ms_transcriptions.close() 25 | ms_train_set.close() 26 | ms_test_set.close() 27 | print("Please manually paste the speaker-to-id mapping into the config file") 28 | print(json.dumps(spk2id)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VISinger2 2 | 3 | This repository connects VISinger2 to the DiffSinger community: it is compatible with the DiffSinger community's nomidi-format datasets and ds project files. Compared with DiffSinger, this model synthesizes much faster, but it trains relatively slowly when no pretrained model is used, and its upper bound on audio quality is lower than DiffSinger's. 4 | 5 | The training and inference code is not yet very easy to use; it will be improved step by step. 6 | 7 | ## Dataset preparation 8 | First build a dataset in the DiffSinger nomidi format and place it under the data directory. 9 | + For making a high-quality dataset, refer to the [DiffSinger dataset tutorial](https://www.yuque.com/sunsa-i3ayc/sivu7h/dx9xof9k1dg305aq) 10 | 11 | [//]: # (+ For low-quality data, if you want to save effort you can use the [automated dataset preparation scripts](https://github.com/innnky/audio-preprocessing-scripts) (currently everything except the MFA step can basically be done in one click)) 12 | ```shell 13 | data 14 | ├───speaker0 15 | │ └───raw 16 | │ ├──wavs 17 | │ └──transcriptions.txt 18 | └───speaker1 19 | └───raw 20 | ├──wavs 21 | └──transcriptions.txt 22 | ``` 23 | Then run the following in order 24 | ```shell 25 | # adjust the folder structure 26 | python prepare_multispeaker.py 27 | # generate mel and pitch 28 | python preprocess.py 29 | # generate the multi-speaker configuration 30 | python preprocess_multispeaker.py 31 | # then paste the spk2id generated in the previous step into the config file egs/visinger2/config.json 32 | ``` 33 | ## Training 34 | ```shell 35 | cd egs/visinger2 36 | bash bash/train.sh 0 37 | ``` 38 | ## Inference 39 | Modify the ds project file, speaker, and model path in ds_inference.py 40 | 41 | python ds_inference.py 42 | -------------------------------------------------------------------------------- /text/npu/symbol_converter.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | from text.npu.symbols import * 4 | import os 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _ttsing_phone_to_id = {p: i for i, p in enumerate(ttsing_phone_set)} 8 | _ttsing_pitch_to_id = {p: i for i, p in enumerate(ttsing_pitch_set)} 9 | _ttsing_slur_to_id = {s: i for i, s in enumerate(ttsing_slur_set)} 10 | 11 | ttsing_phone_to_int = {} 12 | int_to_ttsing_phone = {} 13 | for idx, item in enumerate(ttsing_phone_set): 14 | ttsing_phone_to_int[item] = idx 15 | int_to_ttsing_phone[idx] = item 16 | 17 | ttsing_pitch_to_int = {} 18 | int_to_ttsing_pitch = {} 19 | for idx, item in enumerate(ttsing_pitch_set): 20 | ttsing_pitch_to_int[item] = idx 21 | int_to_ttsing_pitch[idx] = item 22 | 23 | # opencpop 24 | ttsing_opencpop_pitch_to_int = {} 25 | for 
idx, item in enumerate(ttsing_opencpop_pitch_set): 26 | ttsing_opencpop_pitch_to_int[item] = idx 27 | 28 | ttsing_slur_to_int = {} 29 | int_to_ttsing_slur = {} 30 | for idx, item in enumerate(ttsing_slur_set): 31 | ttsing_slur_to_int[item] = idx 32 | int_to_ttsing_slur[idx] = item 33 | 34 | 35 | -------------------------------------------------------------------------------- /modules/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import modules.commons 5 | import math 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1-dr)**2) 26 | g_loss = torch.mean(dg**2) 27 | loss += (r_loss + g_loss) 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1-dg)**2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. 
* logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | -------------------------------------------------------------------------------- /egs/visinger2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "port": 8001, 7 | "epochs": 10000, 8 | "learning_rate": 2e-4, 9 | "betas": [0.8, 0.99], 10 | "eps": 1e-9, 11 | "batch_size": 8, 12 | "accumulation_steps": 1, 13 | "fp16_run": false, 14 | "lr_decay": 0.998, 15 | "segment_size": 10240, 16 | "init_lr_ratio": 1, 17 | "warmup_epochs": 0, 18 | "c_mel": 45, 19 | "save_dir": "logdir/visinger2" 20 | }, 21 | "data": { 22 | "data_dir":"../../data", 23 | "dataset_type": "SingDataset", 24 | "collate_type": "SingCollate", 25 | "training_filelist":"train.list", 26 | "training_labellist":"transcriptions.txt", 27 | "validation_filelist":"test.list", 28 | "validation_labellist":"transcriptions.txt", 29 | "max_wav_value": 32768.0, 30 | "sample_rate": 44100, 31 | "n_fft": 2048, 32 | "fmin": 0, 33 | "fmax": 22050, 34 | "hop_size": 512, 35 | "win_size": 2048, 36 | "acoustic_dim": 80, 37 | "min_level_db": -115, 38 | "ref_level_db": 20, 39 | "min_db": -115, 40 | "max_abs_value": 4.0, 41 | "n_speakers": 200, 42 | "spk2id": {"opencpop": 0, "taffy": 1, "otto": 2, "nanami": 3} 43 | }, 44 | "model": { 45 | "hidden_channels": 192, 46 | "spk_channels": 192, 47 | "filter_channels": 768, 48 | "n_heads": 2, 49 | "n_layers": 4, 50 | "kernel_size": 3, 51 | "p_dropout": 0.1, 52 | "prior_hidden_channels": 192, 53 | "prior_filter_channels": 768, 54 | "prior_n_heads": 2, 55 | "prior_n_layers": 4, 56 | "prior_kernel_size": 3, 57 | "prior_p_dropout": 0.1, 58 | "resblock": "1", 59 | "use_spectral_norm": false, 60 | "resblock_kernel_sizes": [3,7,11], 61 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 62 | "upsample_rates": [8,8,4,2], 63 | "upsample_initial_channel": 256, 64 | "upsample_kernel_sizes": [16,16,8,4], 65 | "n_harmonic": 64, 66 | "n_bands": 65 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /text/npu/symbols.py: -------------------------------------------------------------------------------- 1 | 2 | ttsing_phone_set = ['_'] + [ 3 | "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", 4 | "s", "sh", "t", "x", "z", "zh", "a", "ai", "an", "ang", "ao", "e", "ei", 5 | "en", "eng", "er", "iii", "ii", "i", "ia", "ian", "iang", "iao", "ie", "in", 6 | "ing", "iong", "iou", "o", "ong", "ou", "u", "ua", "uai", "uan", "uang", 7 | "uei", "uen", "ueng", "uo", "v", "van", "ve", "vn", "AH", "AA", "AO", "ER", 8 | "IH", "IY", "UH", "UW", "EH", "AE", "AY", "EY", "OY", "AW", "OW", "P", "B", 9 | "T", "D", "K", "G", "M", "N", "NG", "L", "S", "Z", "Y", "TH", "DH", "SH", 10 | "ZH", "CH", "JH", "V", "W", "F", "R", "HH", "AH0", "AA0", "AO0", "ER0", 11 | "IH0", "IY0", "UH0", "UW0", "EH0", "AE0", "AY0", "EY0", "OY0", "AW0", "OW0", 12 | "AH1", "AA1", "AO1", "ER1", "IH1", "IY1", "UH1", "UW1", "EH1", "AE1", "AY1", 13 | "EY1", "OY1", "AW1", "OW1", "AH2", "AA2", "AO2", "ER2", "IH2", "IY2", "UH2", 14 | "UW2", "EH2", "AE2", "AY2", "EY2", "OY2", "AW2", "OW2", "AH3", "AA3", "AO3", 15 | "ER3", "IH3", "IY3", "UH3", "UW3", "EH3", "AE3", "AY3", "EY3", "OY3", "AW3", 16 | "OW3", "D-1", "T-1", "P*", "B*", "T*", "D*", "K*", "G*", "M*", "N*", "NG*", 17 | "L*", "S*", "Z*", "Y*", "TH*", "DH*", "SH*", "ZH*", "CH*", "JH*", "V*", 18 | "W*", "F*", "R*", "HH*", 
"sp", "sil", "or", "ar", "aor", "our", "angr", 19 | "eir", "engr", "air", "ianr", "iaor", "ir", "ingr", "ur", "iiir", "uar", 20 | "uangr", "uenr", "iir", "ongr", "uor", "ueir", "iar", "iangr", "inr", 21 | "iour", "vr", "uanr", "ruai", "TR", "rest", 22 | # opencpop 23 | 'w', 'SP', 'AP', 'un', 'y', 'ui', 'iu', 24 | "iour", "vr", "uanr", "ruai", "TR", "rest", 25 | # opencpop 26 | 'w', 'SP', 'AP', 'un', 'y', 'ui', 'iu', 27 | # opencpop-strict 28 | 'i0', 'E', 'En' 29 | ] 30 | 31 | ttsing_pitch_set = ['_'] + [ 32 | "C0", "C1", "C2", "C3", "C4", "C5", "C6", "C#/Db0", "C#/Db1", "C#/Db2", 33 | "C#/Db3", "C#/Db4", "C#/Db5", "C#/Db6", "D0", "D1", "D2", "D3", "D4", "D5", 34 | "D6", "D#/Eb0", "D#/Eb1", "D#/Eb2", "D#/Eb3", "D#/Eb4", "D#/Eb5", "D#/Eb6", 35 | "E0", "E1", "E2", "E3", "E4", "E5", "E6", "F0", "F1", "F2", "F3", "F4", 36 | "F5", "F6", "F#/Gb0", "F#/Gb1", "F#/Gb2", "F#/Gb3", "F#/Gb4", "F#/Gb5", 37 | "F#/Gb6", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G#/Ab0", "G#/Ab1", 38 | "G#/Ab2", "G#/Ab3", "G#/Ab4", "G#/Ab5", "G#/Ab6", "A0", "A1", "A2", "A3", 39 | "A4", "A5", "A6", "A#/Bb0", "A#/Bb1", "A#/Bb2", "A#/Bb3", "A#/Bb4", 40 | "A#/Bb5", "A#/Bb6", "B0", "B1", "B2", "B3", "B4", "B5", "B6", "RestRest" 41 | ] 42 | 43 | ttsing_opencpop_pitch_set = ['_'] + [ 44 | "C0", "C1", "C2", "C3", "C4", "C5", "C6", 45 | "C#0/Db0", "C#1/Db1", "C#2/Db2", "C#3/Db3", "C#4/Db4", "C#5/Db5", "C#6/Db6", 46 | "D0", "D1", "D2", "D3", "D4", "D5", "D6", 47 | "D#0/Eb0", "D#1/Eb1", "D#2/Eb2", "D#3/Eb3", "D#4/Eb4", "D#5/Eb5", "D#6/Eb6", 48 | "E0", "E1", "E2", "E3", "E4", "E5", "E6", 49 | "F0", "F1", "F2", "F3", "F4", "F5", "F6", 50 | "F#0/Gb0", "F#1/Gb1", "F#2/Gb2", "F#3/Gb3", "F#4/Gb4", "F#5/Gb5", "F#6/Gb6", 51 | "G0", "G1", "G2", "G3", "G4", "G5", "G6", 52 | "G#0/Ab0", "G#1/Ab1", "G#2/Ab2", "G#3/Ab3", "G#4/Ab4", "G#5/Ab5", "G#6/Ab6", 53 | "A0", "A1", "A2", "A3", "A4", "A5", "A6", 54 | "A#0/Bb0", "A#1/Bb1", "A#2/Bb2", "A#3/Bb3", "A#4/Bb4", "A#5/Bb5", "A#6/Bb6", 55 | "B0", "B1", "B2", "B3", "B4", "B5", "B6", 56 | "RestRest", "rest" 57 | ] 58 | 59 | ttsing_slur_set = ['_'] + ['0', '1'] 60 | 61 | 62 | -------------------------------------------------------------------------------- /utils/audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | import librosa 4 | from scipy.io import wavfile 5 | import soundfile as sf 6 | import librosa.filters 7 | 8 | 9 | def load_wav(wav_path, raw_sr, target_sr=16000, win_size=800, hop_size=200): 10 | audio = librosa.core.load(wav_path, sr=raw_sr)[0] 11 | if raw_sr != target_sr: 12 | audio = librosa.core.resample(audio, 13 | raw_sr, 14 | target_sr, 15 | res_type='kaiser_best') 16 | target_length = (audio.size // hop_size + 17 | win_size // hop_size) * hop_size 18 | pad_len = (target_length - audio.size) // 2 19 | if audio.size % 2 == 0: 20 | audio = np.pad(audio, (pad_len, pad_len), mode='reflect') 21 | else: 22 | audio = np.pad(audio, (pad_len, pad_len + 1), mode='reflect') 23 | return audio 24 | 25 | 26 | def save_wav(wav, path, sample_rate, norm=False): 27 | if norm: 28 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 29 | wavfile.write(path, sample_rate, wav.astype(np.int16)) 30 | else: 31 | sf.write(path, wav, sample_rate) 32 | 33 | 34 | _mel_basis = None 35 | _inv_mel_basis = None 36 | 37 | 38 | def _build_mel_basis(hparams): 39 | assert hparams.fmax <= hparams.sample_rate // 2 40 | return librosa.filters.mel(hparams.sample_rate, 41 | hparams.n_fft, 42 | n_mels=hparams.acoustic_dim, 43 | 
fmin=hparams.fmin, 44 | fmax=hparams.fmax) 45 | 46 | 47 | def _linear_to_mel(spectogram, hparams): 48 | global _mel_basis 49 | if _mel_basis is None: 50 | _mel_basis = _build_mel_basis(hparams) 51 | return np.dot(_mel_basis, spectogram) 52 | 53 | 54 | def _mel_to_linear(mel_spectrogram, hparams): 55 | global _inv_mel_basis 56 | if _inv_mel_basis is None: 57 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 58 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 59 | 60 | 61 | def _stft(y, hparams): 62 | return librosa.stft(y=y, 63 | n_fft=hparams.n_fft, 64 | hop_length=hparams.hop_size, 65 | win_length=hparams.win_size) 66 | 67 | 68 | def _amp_to_db(x, hparams): 69 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 70 | return 20 * np.log10(np.maximum(min_level, x)) 71 | 72 | def _normalize(S, hparams): 73 | return hparams.max_abs_value * np.clip(((S - hparams.min_db) / 74 | (-hparams.min_db)), 0, 1) 75 | 76 | def _db_to_amp(x): 77 | return np.power(10.0, (x) * 0.05) 78 | 79 | 80 | def _stft(y, hparams): 81 | return librosa.stft(y=y, 82 | n_fft=hparams.n_fft, 83 | hop_length=hparams.hop_size, 84 | win_length=hparams.win_size) 85 | 86 | 87 | def _istft(y, hparams): 88 | return librosa.istft(y, 89 | hop_length=hparams.hop_size, 90 | win_length=hparams.win_size) 91 | 92 | 93 | def melspectrogram(wav, hparams): 94 | D = _stft(wav, hparams) 95 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), 96 | hparams) - hparams.ref_level_db 97 | return _normalize(S, hparams) 98 | 99 | 100 | -------------------------------------------------------------------------------- /preprocess/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | 53 | global hann_window 54 | dtype_device = str(y.dtype) + '_' + str(y.device) 55 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 56 | if wnsize_dtype_device not in hann_window: 57 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 58 | 59 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, 
window=hann_window[wnsize_dtype_device], 63 | center=center, pad_mode='reflect', normalized=False, onesided=True) 64 | 65 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 66 | return spec 67 | 68 | 69 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 70 | global mel_basis 71 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 72 | fmax_dtype_device = str(fmax) + '_' + dtype_device 73 | if fmax_dtype_device not in mel_basis: 74 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 75 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 76 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 77 | spec = spectral_normalize_torch(spec) 78 | return spec 79 | 80 | 81 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 82 | 83 | global mel_basis, hann_window 84 | dtype_device = str(y.dtype) + '_' + str(y.device) 85 | fmax_dtype_device = str(fmax) + '_' + dtype_device 86 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 87 | if fmax_dtype_device not in mel_basis: 88 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 89 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 90 | if wnsize_dtype_device not in hann_window: 91 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 92 | 93 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 94 | y = y.squeeze(1) 95 | 96 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 97 | center=center, pad_mode='reflect', normalized=False, onesided=True) 98 | 99 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 100 | 101 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 102 | spec = spectral_normalize_torch(spec) 103 | 104 | return spec 105 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import sys 4 | import argparse 5 | import numpy as np 6 | from multiprocessing import cpu_count 7 | from concurrent.futures import ProcessPoolExecutor 8 | from functools import partial 9 | from utils import audio 10 | import utils.utils as utils 11 | from tqdm import tqdm 12 | import pyworld as pw 13 | from random import shuffle 14 | 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def extract_mel(wav, hparams): 19 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 20 | return mel_spectrogram.T, wav 21 | 22 | def extract_pitch(wav, hps): 23 | # rapt may be better 24 | f0, _ = pw.harvest(wav.astype(np.float64), 25 | hps.sample_rate, 26 | frame_period=hps.hop_size / hps.sample_rate * 1000) 27 | return f0 28 | 29 | def process_utterance(hps, data_root, item): 30 | out_dir = data_root 31 | 32 | wav_path = os.path.join(data_root, "wavs", 33 | "{}.wav".format(item)) 34 | wav = audio.load_wav(wav_path, 35 | raw_sr=hps.data.sample_rate, 36 | target_sr=hps.data.sample_rate, 37 | win_size=hps.data.win_size, 38 | hop_size=hps.data.hop_size) 39 | 40 | mel, _ = extract_mel(wav, hps.data) 41 | out_mel_dir = os.path.join(out_dir, "mels") 42 | os.makedirs(out_mel_dir, exist_ok=True) 43 | mel_path = os.path.join(out_mel_dir, item) 44 | np.save(mel_path, mel) 45 | 46 | pitch = extract_pitch(wav, hps.data) 47 | 
out_pitch_dir = os.path.join(out_dir, "pitch") 48 | os.makedirs(out_pitch_dir, exist_ok=True) 49 | pitch_path = os.path.join(out_pitch_dir, item) 50 | np.save(pitch_path, pitch) 51 | 52 | 53 | def process(args, hps, data_dir): 54 | print(os.path.join(data_dir, "wavs")) 55 | if(not os.path.exists(os.path.join(data_dir, "file.list"))): 56 | with open(os.path.join(data_dir, "file.list") , "w") as out_file: 57 | files = os.listdir(os.path.join(data_dir, "wavs")) 58 | files = [i for i in files if i.endswith(".wav")] 59 | for f in files: 60 | out_file.write(f.strip().split(".")[0] + '\n') 61 | metadata = [ 62 | item.strip() for item in open( 63 | os.path.join(data_dir, "file.list")).readlines() 64 | ] 65 | executor = ProcessPoolExecutor(max_workers=args.num_workers) 66 | results = [] 67 | for item in metadata: 68 | results.append(executor.submit(partial(process_utterance, hps, data_dir, item))) 69 | return [result.result() for result in tqdm(results)] 70 | 71 | def split_dataset(data_dir): 72 | metadata = [ 73 | item.strip() for item in open( 74 | os.path.join(data_dir, "file.list")).readlines() 75 | ] 76 | shuffle(metadata) 77 | train_set = metadata[:-2] 78 | test_set = metadata[-2:] 79 | with open(os.path.join(data_dir, "train.list"), "w") as ts: 80 | for item in train_set: 81 | ts.write(item+"\n") 82 | with open(os.path.join(data_dir, "test.list"), "w") as ts: 83 | for item in test_set: 84 | ts.write(item+"\n") 85 | 86 | def main(): 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--config', 89 | default='egs/visinger2/config.json', 90 | help='json files for configurations.') 91 | parser.add_argument('--num_workers', type=int, default=int(cpu_count()) // 2) 92 | 93 | args = parser.parse_args() 94 | hps = utils.get_hparams_from_file(args.config) 95 | spklist = [spk for spk in os.listdir("data") if os.path.isdir(f"data/{spk}") and not os.path.exists(f"data/{spk}/test.list")] 96 | for spk in tqdm(spklist): 97 | print(f"preprocessing {spk}") 98 | data_dir = f"data/{spk}" 99 | process(args, hps, data_dir) 100 | split_dataset(data_dir) 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /egs/visinger2/inference.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import IPython.display as ipd 3 | 4 | import sys 5 | import os 6 | import json 7 | import math 8 | import torch 9 | from torch import nn 10 | from torch.nn import functional as F 11 | from torch.utils.data import DataLoader 12 | 13 | import modules.commons as commons 14 | import utils.utils as utils 15 | from models import SynthesizerTrn 16 | from text import npu 17 | from scipy.io.wavfile import write 18 | from tqdm import tqdm 19 | import numpy as np 20 | import time 21 | import argparse 22 | 23 | def parse_label(hps, pho, pitchid, dur, slur, gtdur): 24 | phos = [] 25 | pitchs = [] 26 | durs = [] 27 | slurs = [] 28 | gtdurs = [] 29 | 30 | for index in range(len(pho.split())): 31 | phos.append(npu.symbol_converter.ttsing_phone_to_int[pho.strip().split()[index]]) 32 | pitchs.append(npu.symbol_converter.ttsing_opencpop_pitch_to_int[pitchid.strip().split()[index]]) 33 | durs.append(float(dur.strip().split()[index])) 34 | slurs.append(int(slur.strip().split()[index])) 35 | gtdurs.append(float(gtdur.strip().split()[index])) 36 | 37 | phos = np.asarray(phos, dtype=np.int32) 38 | pitchs = np.asarray(pitchs, dtype=np.int32) 39 | durs = np.asarray(durs, dtype=np.float32) 40 
| slurs = np.asarray(slurs, dtype=np.int32) 41 | gtdurs = np.asarray(gtdurs, dtype=np.float32) 42 | gtdurs = np.ceil(gtdurs / (hps.data.hop_size / hps.data.sample_rate)) 43 | 44 | phos = torch.LongTensor(phos) 45 | pitchs = torch.LongTensor(pitchs) 46 | durs = torch.FloatTensor(durs) 47 | slurs = torch.LongTensor(slurs) 48 | gtdurs = torch.LongTensor(gtdurs) 49 | return phos, pitchs, durs, slurs, gtdurs 50 | 51 | def load_model(model_dir): 52 | 53 | # load config and model 54 | model_path = utils.latest_checkpoint_path(model_dir) 55 | config_path = os.path.join(model_dir, "config.json") 56 | 57 | hps = utils.get_hparams_from_file(config_path) 58 | 59 | print("Load model from : ", model_path) 60 | print("config: ", config_path) 61 | 62 | net_g = SynthesizerTrn(hps) 63 | _ = net_g.eval() 64 | _ = utils.load_checkpoint(model_path, net_g, None) 65 | return net_g, hps 66 | 67 | def inference_label2wav(net_g, label_list_path, output_dir, hps, cuda_id=None): 68 | 69 | id2label = {} 70 | with open(label_list_path, "r") as in_file: 71 | for line in in_file.readlines(): 72 | fileid, txt, phones, pitchid, dur, gtdur, slur = line.split('|') 73 | id2label[fileid] = [phones, pitchid, dur, slur, gtdur] 74 | 75 | for file_name in tqdm(id2label.keys()): 76 | pho, pitchid, dur, slur, gtdur = id2label[file_name] 77 | pho, pitchid, dur, slur, gtdur = parse_label(hps, pho, pitchid, dur, slur, gtdur) 78 | 79 | with torch.no_grad(): 80 | 81 | # data 82 | pho_lengths = torch.LongTensor([pho.size(0)]) 83 | pho = pho.unsqueeze(0) 84 | pitchid = pitchid.unsqueeze(0) 85 | dur = dur.unsqueeze(0) 86 | slur = slur.unsqueeze(0) 87 | 88 | if(cuda_id != None): 89 | net_g = net_g.cuda(0) 90 | pho = pho.cuda(0) 91 | pho_lengths = pho_lengths.cuda(0) 92 | pitchid = pitchid.cuda(0) 93 | dur = dur.cuda(0) 94 | slur = slur.cuda(0) 95 | 96 | # infer 97 | o, _, _ = net_g.infer(pho, pho_lengths, pitchid, dur, slur) 98 | audio = o[0,0].data.cpu().float().numpy() 99 | audio = audio * 32768 #hps.data.max_wav_value 100 | audio = audio.astype(np.int16) 101 | 102 | # save 103 | write(os.path.join(output_dir, file_name.split('.')[0] + '.wav' ), hps.data.sample_rate, audio) 104 | 105 | if __name__ == "__main__": 106 | 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('-model_dir', '--model_dir', type=str, required=True) 109 | parser.add_argument('-input_dir', '--input_dir', type=str, required=True) 110 | parser.add_argument('-output_dir', '--output_dir', type=str, required=True) 111 | args = parser.parse_args() 112 | 113 | model_dir = args.model_dir 114 | input_dir = args.input_dir 115 | output_dir = args.output_dir 116 | 117 | model, hps = load_model(model_dir) 118 | if(not os.path.exists(output_dir)): 119 | os.makedirs(output_dir) 120 | print("load model end!") 121 | 122 | inference_label2wav(model, input_dir, output_dir, hps, cuda_id=0) 123 | 124 | -------------------------------------------------------------------------------- /modules/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size*dilation - dilation)/2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for 
sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def intersperse(lst, item): 25 | result = [item] * (len(lst) * 2 + 1) 26 | result[1::2] = lst 27 | return result 28 | 29 | 30 | def kl_divergence(m_p, logs_p, m_q, logs_q): 31 | """KL(P||Q)""" 32 | kl = (logs_q - logs_p) - 0.5 33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | # print("ret shape: ",ret.shape, ids_str) 51 | for i in range(x.size(0)): 52 | idx_str = ids_str[i] 53 | idx_end = idx_str + segment_size 54 | ret[i] = x[i, :, idx_str:idx_end] 55 | return ret 56 | 57 | 58 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 59 | b, d, t = x.size() 60 | if x_lengths is None: 61 | x_lengths = t 62 | ids_str_max = x_lengths - segment_size - 1 63 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 64 | ret = slice_segments(x, ids_str, segment_size) 65 | return ret, ids_str 66 | 67 | 68 | def get_timing_signal_1d( 69 | length, channels, min_timescale=1.0, max_timescale=1.0e4): 70 | position = torch.arange(length, dtype=torch.float) 71 | num_timescales = channels // 2 72 | log_timescale_increment = ( 73 | math.log(float(max_timescale) / float(min_timescale)) / 74 | (num_timescales - 1)) 75 | inv_timescales = min_timescale * torch.exp( 76 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) 77 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 78 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 79 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 80 | signal = signal.view(1, channels, length) 81 | return signal 82 | 83 | 84 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 85 | b, channels, length = x.size() 86 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 87 | return x + signal.to(dtype=x.dtype, device=x.device) 88 | 89 | 90 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 91 | b, channels, length = x.size() 92 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 93 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 94 | 95 | 96 | def subsequent_mask(length): 97 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 98 | return mask 99 | 100 | 101 | @torch.jit.script 102 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 103 | n_channels_int = n_channels[0] 104 | in_act = input_a + input_b 105 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 106 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 107 | acts = t_act * s_act 108 | return acts 109 | 110 | 111 | def convert_pad_shape(pad_shape): 112 | l = pad_shape[::-1] 113 | pad_shape = [item for sublist in l for item in sublist] 114 | return pad_shape 115 | 116 | 117 | def shift_1d(x): 118 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 119 | return x 120 | 121 | 122 | def sequence_mask(length, max_length=None): 123 | if max_length is None: 124 | max_length = 
length.max() 125 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 126 | return x.unsqueeze(0) < length.unsqueeze(1) 127 | 128 | 129 | def generate_path(duration, mask): 130 | """ 131 | duration: [b, 1, t_x] 132 | mask: [b, 1, t_y, t_x] 133 | """ 134 | device = duration.device 135 | 136 | b, _, t_y, t_x = mask.shape 137 | cum_duration = torch.cumsum(duration, -1) 138 | 139 | cum_duration_flat = cum_duration.view(b * t_x) 140 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 141 | path = path.view(b, t_x, t_y) 142 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 143 | path = path.unsqueeze(1).transpose(2,3) * mask 144 | return path 145 | 146 | 147 | def clip_grad_value_(parameters, clip_value, norm_type=2): 148 | if isinstance(parameters, torch.Tensor): 149 | parameters = [parameters] 150 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 151 | norm_type = float(norm_type) 152 | if clip_value is not None: 153 | clip_value = float(clip_value) 154 | 155 | total_norm = 0 156 | for p in parameters: 157 | param_norm = p.grad.data.norm(norm_type) 158 | total_norm += param_norm.item() ** norm_type 159 | if clip_value is not None: 160 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 161 | total_norm = total_norm ** (1. / norm_type) 162 | return total_norm 163 | -------------------------------------------------------------------------------- /modules/ddsp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import torch.fft as fft 5 | import numpy as np 6 | import librosa as li 7 | import math 8 | from scipy.signal import get_window 9 | 10 | def safe_log(x): 11 | return torch.log(x + 1e-7) 12 | 13 | 14 | @torch.no_grad() 15 | def mean_std_loudness(dataset): 16 | mean = 0 17 | std = 0 18 | n = 0 19 | for _, _, l in dataset: 20 | n += 1 21 | mean += (l.mean().item() - mean) / n 22 | std += (l.std().item() - std) / n 23 | return mean, std 24 | 25 | 26 | def multiscale_fft(signal, scales, overlap): 27 | stfts = [] 28 | for s in scales: 29 | S = torch.stft( 30 | signal, 31 | s, 32 | int(s * (1 - overlap)), 33 | s, 34 | torch.hann_window(s).to(signal), 35 | True, 36 | normalized=True, 37 | return_complex=True, 38 | ).abs() 39 | stfts.append(S) 40 | return stfts 41 | 42 | 43 | def resample(x, factor: int): 44 | batch, frame, channel = x.shape 45 | x = x.permute(0, 2, 1).reshape(batch * channel, 1, frame) 46 | 47 | window = torch.hann_window( 48 | factor * 2, 49 | dtype=x.dtype, 50 | device=x.device, 51 | ).reshape(1, 1, -1) 52 | y = torch.zeros(x.shape[0], x.shape[1], factor * x.shape[2]).to(x) 53 | y[..., ::factor] = x 54 | y[..., -1:] = x[..., -1:] 55 | y = torch.nn.functional.pad(y, [factor, factor]) 56 | y = torch.nn.functional.conv1d(y, window)[..., :-1] 57 | 58 | y = y.reshape(batch, channel, factor * frame).permute(0, 2, 1) 59 | 60 | return y 61 | 62 | 63 | def upsample(signal, factor): 64 | signal = signal.permute(0, 2, 1) 65 | signal = nn.functional.interpolate(signal, size=signal.shape[-1] * factor) 66 | return signal.permute(0, 2, 1) 67 | 68 | 69 | def remove_above_nyquist(amplitudes, pitch, sampling_rate): 70 | n_harm = amplitudes.shape[-1] 71 | pitches = pitch * torch.arange(1, n_harm + 1).to(pitch) 72 | aa = (pitches < sampling_rate / 2).float() + 1e-4 73 | return amplitudes * aa 74 | 75 | 76 | def scale_function(x): 77 | return 2 * torch.sigmoid(x)**(math.log(10)) + 1e-7 78 | 79 | 80 
| def extract_loudness(signal, sampling_rate, block_size, n_fft=2048): 81 | S = li.stft( 82 | signal, 83 | n_fft=n_fft, 84 | hop_length=block_size, 85 | win_length=n_fft, 86 | center=True, 87 | ) 88 | S = np.log(abs(S) + 1e-7) 89 | f = li.fft_frequencies(sampling_rate, n_fft) 90 | a_weight = li.A_weighting(f) 91 | 92 | S = S + a_weight.reshape(-1, 1) 93 | 94 | S = np.mean(S, 0)[..., :-1] 95 | 96 | return S 97 | 98 | 99 | def extract_pitch(signal, sampling_rate, block_size): 100 | length = signal.shape[-1] // block_size 101 | f0 = crepe.predict( 102 | signal, 103 | sampling_rate, 104 | step_size=int(1000 * block_size / sampling_rate), 105 | verbose=1, 106 | center=True, 107 | viterbi=True, 108 | ) 109 | f0 = f0[1].reshape(-1)[:-1] 110 | 111 | if f0.shape[-1] != length: 112 | f0 = np.interp( 113 | np.linspace(0, 1, length, endpoint=False), 114 | np.linspace(0, 1, f0.shape[-1], endpoint=False), 115 | f0, 116 | ) 117 | 118 | return f0 119 | 120 | 121 | def mlp(in_size, hidden_size, n_layers): 122 | channels = [in_size] + (n_layers) * [hidden_size] 123 | net = [] 124 | for i in range(n_layers): 125 | net.append(nn.Linear(channels[i], channels[i + 1])) 126 | net.append(nn.LayerNorm(channels[i + 1])) 127 | net.append(nn.LeakyReLU()) 128 | return nn.Sequential(*net) 129 | 130 | 131 | def gru(n_input, hidden_size): 132 | return nn.GRU(n_input * hidden_size, hidden_size, batch_first=True) 133 | 134 | 135 | def harmonic_synth(pitch, amplitudes, sampling_rate): 136 | n_harmonic = amplitudes.shape[-1] 137 | omega = torch.cumsum(2 * math.pi * pitch / sampling_rate, 1) 138 | omegas = omega * torch.arange(1, n_harmonic + 1).to(omega) 139 | signal = (torch.sin(omegas) * amplitudes).sum(-1, keepdim=True) 140 | return signal 141 | 142 | 143 | def amp_to_impulse_response(amp, target_size): 144 | amp = torch.stack([amp, torch.zeros_like(amp)], -1) 145 | amp = torch.view_as_complex(amp) 146 | amp = fft.irfft(amp) 147 | 148 | filter_size = amp.shape[-1] 149 | 150 | amp = torch.roll(amp, filter_size // 2, -1) 151 | win = torch.hann_window(filter_size, dtype=amp.dtype, device=amp.device) 152 | 153 | amp = amp * win 154 | 155 | amp = nn.functional.pad(amp, (0, int(target_size) - int(filter_size))) 156 | amp = torch.roll(amp, -filter_size // 2, -1) 157 | 158 | return amp 159 | 160 | 161 | def fft_convolve(signal, kernel): 162 | signal = nn.functional.pad(signal, (0, signal.shape[-1])) 163 | kernel = nn.functional.pad(kernel, (kernel.shape[-1], 0)) 164 | 165 | output = fft.irfft(fft.rfft(signal) * fft.rfft(kernel)) 166 | output = output[..., output.shape[-1] // 2:] 167 | 168 | return output 169 | 170 | 171 | def init_kernels(win_len, win_inc, fft_len, win_type=None, invers=False): 172 | if win_type == 'None' or win_type is None: 173 | window = np.ones(win_len) 174 | else: 175 | window = get_window(win_type, win_len, fftbins=True)#**0.5 176 | 177 | N = fft_len 178 | fourier_basis = np.fft.rfft(np.eye(N))[:win_len] 179 | real_kernel = np.real(fourier_basis) 180 | imag_kernel = np.imag(fourier_basis) 181 | kernel = np.concatenate([real_kernel, imag_kernel], 1).T 182 | 183 | if invers : 184 | kernel = np.linalg.pinv(kernel).T 185 | 186 | kernel = kernel*window 187 | kernel = kernel[:, None, :] 188 | return torch.from_numpy(kernel.astype(np.float32)), torch.from_numpy(window[None,:,None].astype(np.float32)) 189 | 190 | -------------------------------------------------------------------------------- /modules/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 
from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise 
ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | 
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import sys 4 | import argparse 5 | import logging 6 | import json 7 | import subprocess 8 | import numpy as np 9 | from scipy.io.wavfile import read 10 | import torch 11 | 12 | MATPLOTLIB_FLAG = False 13 | 14 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 15 | logger = logging 16 | 17 | 18 | def load_checkpoint(checkpoint_path, model, optimizer=None): 19 | assert os.path.isfile(checkpoint_path) 20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 21 | iteration = checkpoint_dict['iteration'] 22 | learning_rate = checkpoint_dict['learning_rate'] 23 | if optimizer is not None: 24 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 25 | saved_state_dict = checkpoint_dict['model'] 26 | if hasattr(model, 'module'): 27 | state_dict = model.module.state_dict() 28 | else: 29 | state_dict = model.state_dict() 30 | new_state_dict = {} 31 | for k, v in state_dict.items(): 32 | try: 33 | new_state_dict[k] = saved_state_dict[k] 34 | assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape) 35 | except: 36 | print("error, %s is not in the checkpoint" % k) 37 | logger.info("%s is not in the checkpoint" % k) 38 | new_state_dict[k] = v 39 | if hasattr(model, 'module'): 40 | model.module.load_state_dict(new_state_dict) 41 | else: 42 | model.load_state_dict(new_state_dict) 43 | print("load ") 44 | logger.info("Loaded checkpoint '{}' (iteration {})".format( 45 | checkpoint_path, iteration)) 46 | return model, optimizer, learning_rate, iteration 47 | 48 | 49 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path, val_steps): 50 | ckptname = checkpoint_path.split(os.sep)[-1] 51 | newest_step = int(ckptname.split(".")[0].split("_")[1]) 52 | last_ckptname = checkpoint_path.replace(str(newest_step), str(newest_step - val_steps * 2)) 53 | if newest_step >= val_steps * 2: 54 | os.system(f"rm {last_ckptname}") 55 | 56 | logger.info("Saving model and optimizer state at iteration {} to {}".format( 57 | iteration, checkpoint_path)) 58 | if hasattr(model, 'module'): 59 | state_dict = model.module.state_dict() 60 | else: 61 | state_dict = model.state_dict() 62 | torch.save({'model': state_dict, 63 | 'iteration': iteration, 64 | 'optimizer': optimizer.state_dict(), 65 | 'learning_rate': learning_rate}, checkpoint_path) 66 | 67 | 68 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): 69 | for k, v in scalars.items(): 70 | writer.add_scalar(k, v, global_step) 71 | for k, v in histograms.items(): 72 | writer.add_histogram(k, v, global_step) 73 | for k, v in images.items(): 74 | writer.add_image(k, v, global_step, dataformats='HWC') 75 | for k, v in audios.items(): 76 | writer.add_audio(k, v, global_step, audio_sampling_rate) 77 | 78 | 79 | def latest_checkpoint_path(dir_path, regex="G_*.pth"): 80 | f_list = glob.glob(os.path.join(dir_path, regex)) 81 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) 82 | x = f_list[-1] 83 | print(x) 84 | return x 85 | 86 | 87 | def plot_spectrogram_to_numpy(spectrogram): 88 | global MATPLOTLIB_FLAG 89 | if not MATPLOTLIB_FLAG: 90 | import matplotlib 91 | matplotlib.use("Agg") 92 | MATPLOTLIB_FLAG = True 
93 | mpl_logger = logging.getLogger('matplotlib') 94 | mpl_logger.setLevel(logging.WARNING) 95 | import matplotlib.pylab as plt 96 | import numpy as np 97 | 98 | fig, ax = plt.subplots(figsize=(10, 2)) 99 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 100 | interpolation='none') 101 | plt.colorbar(im, ax=ax) 102 | plt.xlabel("Frames") 103 | plt.ylabel("Channels") 104 | plt.tight_layout() 105 | 106 | fig.canvas.draw() 107 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 108 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 109 | plt.close() 110 | return data 111 | 112 | 113 | def plot_alignment_to_numpy(alignment, info=None): 114 | global MATPLOTLIB_FLAG 115 | if not MATPLOTLIB_FLAG: 116 | import matplotlib 117 | matplotlib.use("Agg") 118 | MATPLOTLIB_FLAG = True 119 | mpl_logger = logging.getLogger('matplotlib') 120 | mpl_logger.setLevel(logging.WARNING) 121 | import matplotlib.pylab as plt 122 | import numpy as np 123 | 124 | fig, ax = plt.subplots(figsize=(6, 4)) 125 | im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', 126 | interpolation='none') 127 | fig.colorbar(im, ax=ax) 128 | xlabel = 'Decoder timestep' 129 | if info is not None: 130 | xlabel += '\n\n' + info 131 | plt.xlabel(xlabel) 132 | plt.ylabel('Encoder timestep') 133 | plt.tight_layout() 134 | 135 | fig.canvas.draw() 136 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 137 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 138 | plt.close() 139 | return data 140 | 141 | 142 | def load_wav_to_torch(full_path): 143 | sampling_rate, data = read(full_path) 144 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 145 | 146 | 147 | def load_filepaths_and_text(filename, split="|"): 148 | with open(filename, encoding='utf-8') as f: 149 | filepaths_and_text = [line.strip().split(split) for line in f] 150 | return filepaths_and_text 151 | 152 | 153 | def get_hparams(init=True): 154 | parser = argparse.ArgumentParser() 155 | parser.add_argument('-c', '--config', type=str, default="./configs/base.json", 156 | help='JSON file for configuration') 157 | # parser.add_argument('-m', '--model', type=str, required=True, 158 | # help='Model name') 159 | 160 | args = parser.parse_args() 161 | 162 | config_path = args.config 163 | with open(config_path, "r") as f: 164 | data = f.read() 165 | config = json.loads(data) 166 | 167 | hparams = HParams(**config) 168 | # hparams.model_dir = model_dir 169 | model_dir = hparams.train.save_dir 170 | config_save_path = os.path.join(model_dir, "config.json") 171 | 172 | if not os.path.exists(model_dir): 173 | os.makedirs(model_dir) 174 | 175 | with open(config_save_path, "w") as f: 176 | f.write(data) 177 | return hparams 178 | 179 | 180 | def get_hparams_from_dir(model_dir): 181 | config_save_path = os.path.join(model_dir, "config.json") 182 | with open(config_save_path, "r") as f: 183 | data = f.read() 184 | config = json.loads(data) 185 | 186 | hparams = HParams(**config) 187 | hparams.model_dir = model_dir 188 | return hparams 189 | 190 | 191 | def get_hparams_from_file(config_path): 192 | with open(config_path, "r") as f: 193 | data = f.read() 194 | config = json.loads(data) 195 | 196 | hparams = HParams(**config) 197 | return hparams 198 | 199 | 200 | def check_git_hash(model_dir): 201 | source_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 202 | if not os.path.exists(os.path.join(source_dir, ".git")): 203 | logger.warn("{} is not a git repository, therefore 
hash value comparison will be ignored.".format( 204 | source_dir 205 | )) 206 | return 207 | 208 | cur_hash = subprocess.getoutput("git rev-parse HEAD") 209 | 210 | path = os.path.join(model_dir, "githash") 211 | if os.path.exists(path): 212 | saved_hash = open(path).read() 213 | if saved_hash != cur_hash: 214 | logger.warn("git hash values are different. {}(saved) != {}(current)".format( 215 | saved_hash[:8], cur_hash[:8])) 216 | else: 217 | open(path, "w").write(cur_hash) 218 | 219 | 220 | def get_logger(model_dir, filename="train.log"): 221 | global logger 222 | logger = logging.getLogger(os.path.basename(model_dir)) 223 | logger.setLevel(logging.DEBUG) 224 | 225 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") 226 | if not os.path.exists(model_dir): 227 | os.makedirs(model_dir) 228 | h = logging.FileHandler(os.path.join(model_dir, filename)) 229 | h.setLevel(logging.DEBUG) 230 | h.setFormatter(formatter) 231 | logger.addHandler(h) 232 | return logger 233 | 234 | 235 | def count_parameters(model): 236 | return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6 237 | 238 | 239 | class HParams(): 240 | def __init__(self, **kwargs): 241 | for k, v in kwargs.items(): 242 | if type(v) == dict: 243 | v = HParams(**v) 244 | self[k] = v 245 | 246 | def keys(self): 247 | return self.__dict__.keys() 248 | 249 | def items(self): 250 | return self.__dict__.items() 251 | 252 | def values(self): 253 | return self.__dict__.values() 254 | 255 | def __len__(self): 256 | return len(self.__dict__) 257 | 258 | def __getitem__(self, key): 259 | return getattr(self, key) 260 | 261 | def __setitem__(self, key, value): 262 | return setattr(self, key, value) 263 | 264 | def __contains__(self, key): 265 | return key in self.__dict__ 266 | 267 | def __repr__(self): 268 | return self.__dict__.__repr__() 269 | -------------------------------------------------------------------------------- /egs/visinger2/dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import string 5 | import random 6 | import numpy as np 7 | import math 8 | import json 9 | from torch.utils.data import DataLoader 10 | import torch 11 | 12 | sys.path.append('../..') 13 | from utils.audio import load_wav 14 | from text import npu 15 | 16 | class BaseDataset(torch.utils.data.Dataset): 17 | 18 | def __init__(self, hparams, fileid_list_path): 19 | self.hparams = hparams 20 | self.fileid_list = self.get_fileid_list(fileid_list_path) 21 | random.seed(hparams.train.seed) 22 | random.shuffle(self.fileid_list) 23 | if(hparams.data.n_speakers > 0): 24 | self.spk2id = hparams.data.spk2id 25 | 26 | def get_fileid_list(self, fileid_list_path): 27 | fileid_list = [] 28 | with open(fileid_list_path, 'r') as f: 29 | for line in f.readlines(): 30 | fileid_list.append(line.strip()) 31 | 32 | return fileid_list 33 | 34 | def __len__(self): 35 | return len(self.fileid_list) 36 | 37 | class SingDataset(BaseDataset): 38 | def __init__(self, hparams, data_dir, fileid_list_path, label_list_path): 39 | BaseDataset.__init__(self, hparams, os.path.join(data_dir, fileid_list_path)) 40 | self.hps = hparams 41 | 42 | with open(os.path.join(data_dir, label_list_path), "r") as in_file: 43 | self.id2label = {} 44 | for line in in_file.readlines(): 45 | fileid, txt, phones, pitchid, dur, gtdur, slur = line.split('|') 46 | self.id2label[fileid] = [phones, pitchid, dur, slur, gtdur] 47 | 48 | self.data_dir = data_dir 49 | # self.__filter__() 50 | 51 | 
def __filter__(self): 52 | new_fileid_list = [] 53 | print("before filter: ", len(self.fileid_list)) 54 | for file_id in self.fileid_list: 55 | _is_qualified = True 56 | if(not os.path.exists(os.path.join(self.label_dir, self.fileid_list[index] + '.lab')) or 57 | not os.path.exists(os.path.join(self.dur_dir, self.fileid_list[index] + '.lab')) or 58 | not os.path.exists(os.path.join(self.mel_dir, self.fileid_list[index] + '.npy')) or 59 | not os.path.exists(os.path.join(self.pitch_dir, self.fileid_list[index] + '.npy'))): 60 | _is_qualified = False 61 | if(_is_qualified): 62 | new_fileid_list.append(file_id) 63 | self.fileid_list = new_fileid_list 64 | print("after filter: ", len(self.fileid_list)) 65 | 66 | def interpolate_f0(self, data): 67 | ''' 68 | 对F0进行插值处理 69 | ''' 70 | data = np.reshape(data, (data.size, 1)) 71 | 72 | vuv_vector = np.zeros((data.size, 1),dtype=np.float32) 73 | vuv_vector[data > 0.0] = 1.0 74 | vuv_vector[data <= 0.0] = 0.0 75 | 76 | ip_data = data 77 | 78 | frame_number = data.size 79 | last_value = 0.0 80 | for i in range(frame_number): 81 | if data[i] <= 0.0: 82 | j = i + 1 83 | for j in range(i + 1, frame_number): 84 | if data[j] > 0.0: 85 | break 86 | if j < frame_number - 1: 87 | if last_value > 0.0: 88 | step = (data[j] - data[i - 1]) / float(j - i) 89 | for k in range(i, j): 90 | ip_data[k] = data[i - 1] + step * (k - i + 1) 91 | else: 92 | for k in range(i, j): 93 | ip_data[k] = data[j] 94 | else: 95 | for k in range(i, frame_number): 96 | ip_data[k] = last_value 97 | else: 98 | ip_data[i] = data[i] 99 | last_value = data[i] 100 | 101 | return ip_data, vuv_vector 102 | 103 | def parse_label(self, pho, pitchid, dur, slur, gtdur): 104 | phos = [] 105 | pitchs = [] 106 | durs = [] 107 | slurs = [] 108 | gtdurs = [] 109 | 110 | for index in range(len(pho.split())): 111 | phos.append(npu.symbol_converter.ttsing_phone_to_int[pho.strip().split()[index]]) 112 | pitchs.append(0) 113 | durs.append(0) 114 | slurs.append(0) 115 | gtdurs.append(float(gtdur.strip().split()[index])) 116 | 117 | phos = np.asarray(phos, dtype=np.int32) 118 | pitchs = np.asarray(pitchs, dtype=np.int32) 119 | durs = np.asarray(durs, dtype=np.float32) 120 | slurs = np.asarray(slurs, dtype=np.int32) 121 | gtdurs = np.asarray(gtdurs, dtype=np.float32) 122 | 123 | acc_duration = np.cumsum(gtdurs) 124 | acc_duration = np.pad(acc_duration, (1, 0), 'constant', constant_values=(0,)) 125 | acc_duration_frames = np.ceil(acc_duration / (self.hps.data.hop_size / self.hps.data.sample_rate)) 126 | gtdurs = acc_duration_frames[1:] - acc_duration_frames[:-1] 127 | 128 | phos = torch.LongTensor(phos) 129 | pitchs = torch.LongTensor(pitchs) 130 | durs = torch.FloatTensor(durs) 131 | slurs = torch.LongTensor(slurs) 132 | gtdurs = torch.LongTensor(gtdurs) 133 | return phos, pitchs, durs, slurs, gtdurs 134 | 135 | def __getitem__(self, index): 136 | 137 | pho, pitchid, dur, slur, gtdur = self.id2label[self.fileid_list[index]] 138 | pho, pitchid, dur, slur, gtdur = self.parse_label(pho, pitchid, dur, slur, gtdur) 139 | sum_dur = gtdur.sum() 140 | spk, fileid = self.fileid_list[index].split("/") 141 | spkid = self.spk2id[spk] 142 | mel = np.load(os.path.join(self.data_dir, spk, "mels", fileid + '.npy')) 143 | if mel.shape[0] <150: 144 | print("drop short audio:", self.fileid_list[index]) 145 | return None 146 | assert mel.shape[1] == 80 147 | if(mel.shape[0] != sum_dur): 148 | if(abs(mel.shape[0] - sum_dur) > 3): 149 | print("dataset error mel: ",mel.shape, sum_dur) 150 | return None 151 | if(mel.shape[0] > 
sum_dur): 152 | mel = mel[:sum_dur] 153 | else: 154 | mel = np.concatenate([mel, mel.min() * np.ones([sum_dur - mel.shape[0], self.hps.data.acoustic_dim])], axis=0) 155 | mel = torch.FloatTensor(mel).transpose(0, 1) 156 | 157 | f0 = np.load(os.path.join(self.data_dir, spk, "pitch", fileid + '.npy')).reshape([-1]) 158 | f0, _ = self.interpolate_f0(f0) 159 | f0 = f0.reshape([-1]) 160 | if(f0.shape[0] != sum_dur): 161 | if(abs(f0.shape[0] - sum_dur) > 3): 162 | print("dataset error f0 : ",f0.shape, sum_dur) 163 | return None 164 | if(f0.shape[0] > sum_dur): 165 | f0 = f0[:sum_dur] 166 | else: 167 | f0 = np.concatenate([f0, np.zeros([sum_dur - f0.shape[0]])], axis=0) 168 | f0 = torch.FloatTensor(f0).reshape([1, -1]) 169 | 170 | wav = load_wav(os.path.join(self.data_dir, spk, "wavs", fileid + '.wav'), 171 | raw_sr=self.hparams.data.sample_rate, 172 | target_sr=self.hparams.data.sample_rate, 173 | win_size=self.hparams.data.win_size, 174 | hop_size=self.hparams.data.hop_size) 175 | wav = wav.reshape(-1) 176 | if(wav.shape[0] != sum_dur * self.hparams.data.hop_size): 177 | if(abs(wav.shape[0] - sum_dur * self.hparams.data.hop_size) > 3 * self.hparams.data.hop_size): 178 | print("dataset error wav : ", wav.shape, sum_dur) 179 | return None 180 | if(wav.shape[0] > sum_dur * self.hparams.data.hop_size): 181 | wav = wav[:sum_dur * self.hparams.data.hop_size] 182 | else: 183 | wav = np.concatenate([wav, np.zeros([sum_dur * self.hparams.data.hop_size - wav.shape[0]])], axis=0) 184 | wav = torch.FloatTensor(wav).reshape([1, -1]) 185 | 186 | return pho, pitchid, dur, slur, gtdur, mel, f0, wav, spkid 187 | 188 | 189 | class SingCollate(): 190 | 191 | def __init__(self, hparams): 192 | self.hparams = hparams 193 | self.mel_dim = self.hparams.data.acoustic_dim 194 | 195 | def __call__(self, batch): 196 | 197 | batch = [b for b in batch if b is not None] 198 | 199 | input_lengths, ids_sorted_decreasing = torch.sort( 200 | torch.LongTensor([len(x[0]) for x in batch]), 201 | dim=0, descending=True) 202 | 203 | max_phone_len = max([len(x[0]) for x in batch]) 204 | max_pitchid_len = max([len(x[1]) for x in batch]) 205 | max_dur_len = max([len(x[2]) for x in batch]) 206 | max_slur_len = max([len(x[3]) for x in batch]) 207 | max_gtdur_len = max([len(x[4]) for x in batch]) 208 | max_mel_len = max([x[5].size(1) for x in batch]) 209 | max_f0_len = max([x[6].size(1) for x in batch]) 210 | max_wav_len = max([x[7].size(1) for x in batch]) 211 | 212 | phone_lengths = torch.LongTensor(len(batch)) 213 | pitchid_lengths = torch.LongTensor(len(batch)) 214 | dur_lengths = torch.LongTensor(len(batch)) 215 | slur_lengths = torch.LongTensor(len(batch)) 216 | gtdur_lengths = torch.LongTensor(len(batch)) 217 | mel_lengths = torch.LongTensor(len(batch)) 218 | f0_lengths = torch.LongTensor(len(batch)) 219 | wav_lengths = torch.LongTensor(len(batch)) 220 | 221 | phone_padded = torch.LongTensor(len(batch), max_phone_len) 222 | pitchid_padded = torch.LongTensor(len(batch), max_pitchid_len) 223 | dur_padded = torch.FloatTensor(len(batch), max_dur_len) 224 | slur_padded = torch.LongTensor(len(batch), max_slur_len) 225 | gtdur_padded = torch.LongTensor(len(batch), 1, max_gtdur_len) 226 | mel_padded = torch.FloatTensor(len(batch), self.hparams.data.acoustic_dim, max_mel_len) 227 | f0_padded = torch.FloatTensor(len(batch), 1, max_f0_len) 228 | wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) 229 | spkids = torch.LongTensor(len(batch)) 230 | 231 | phone_padded.zero_() 232 | pitchid_padded.zero_() 233 | dur_padded.zero_() 234 | 
slur_padded.zero_() 235 | gtdur_padded.zero_() 236 | mel_padded.zero_() 237 | f0_padded.zero_() 238 | wav_padded.zero_() 239 | 240 | for i in range(len(ids_sorted_decreasing)): 241 | row = batch[ids_sorted_decreasing[i]] 242 | 243 | phone = row[0] 244 | phone_padded[i, :phone.size(0)] = phone 245 | phone_lengths[i] = phone.size(0) 246 | 247 | pitchid = row[1] 248 | pitchid_padded[i, :pitchid.size(0)] = pitchid 249 | pitchid_lengths[i] = pitchid.size(0) 250 | 251 | dur = row[2] 252 | dur_padded[i, :dur.size(0)] = dur 253 | dur_lengths[i] = dur.size(0) 254 | 255 | slur = row[3] 256 | slur_padded[i, :slur.size(0)] = slur 257 | slur_lengths[i] = slur.size(0) 258 | 259 | gtdur = row[4] 260 | gtdur_padded[i, :, :gtdur.size(0)] = gtdur 261 | gtdur_lengths[i] = gtdur.size(0) 262 | 263 | mel = row[5] 264 | mel_padded[i, :, :mel.size(1)] = mel 265 | mel_lengths[i] = mel.size(1) 266 | 267 | f0 = row[6] 268 | f0_padded[i, :, :f0.size(1)] = f0 269 | f0_lengths[i] = f0.size(1) 270 | 271 | wav = row[7] 272 | wav_padded[i, :, :wav.size(1)] = wav 273 | wav_lengths[i] = wav.size(1) 274 | 275 | spkids[i] = row[8] 276 | 277 | data_dict = {} 278 | data_dict["phone"] = phone_padded 279 | data_dict["phone_lengths"] = phone_lengths 280 | data_dict["pitchid"] = pitchid_padded 281 | data_dict["dur"] = dur_padded 282 | data_dict["slur"] = slur_padded 283 | data_dict["gtdur"] = gtdur_padded 284 | data_dict["mel"] = mel_padded 285 | data_dict["f0"] = f0_padded 286 | data_dict["wav"] = wav_padded 287 | 288 | data_dict["mel_lengths"] = mel_lengths 289 | data_dict["f0_lengths"] = f0_lengths 290 | data_dict["wav_lengths"] = wav_lengths 291 | data_dict["spkid"] = spkids 292 | 293 | return data_dict 294 | 295 | 296 | class DatasetConstructor(): 297 | 298 | def __init__(self, hparams, num_replicas=1, rank=1): 299 | self.hparams = hparams 300 | self.num_replicas = num_replicas 301 | self.rank = rank 302 | self.dataset_function = {"SingDataset": SingDataset} 303 | self.collate_function = {"SingCollate": SingCollate} 304 | self._get_components() 305 | 306 | def _get_components(self): 307 | self._init_datasets() 308 | self._init_collate() 309 | self._init_data_loaders() 310 | 311 | def _init_datasets(self): 312 | self._train_dataset = self.dataset_function[self.hparams.data.dataset_type](self.hparams, self.hparams.data.data_dir, self.hparams.data.training_filelist, self.hparams.data.training_labellist) 313 | self._valid_dataset = self.dataset_function[self.hparams.data.dataset_type](self.hparams, self.hparams.data.data_dir, self.hparams.data.validation_filelist, self.hparams.data.validation_labellist) 314 | 315 | def _init_collate(self): 316 | self._collate_fn = self.collate_function[self.hparams.data.collate_type](self.hparams) 317 | 318 | def _init_data_loaders(self): 319 | train_sampler = torch.utils.data.distributed.DistributedSampler(self._train_dataset, num_replicas=self.num_replicas, rank=self.rank, shuffle=True) 320 | 321 | self.train_loader = DataLoader(self._train_dataset, num_workers=4, shuffle=False, 322 | batch_size=self.hparams.train.batch_size, pin_memory=True, 323 | drop_last=True, collate_fn=self._collate_fn, sampler=train_sampler) 324 | 325 | self.valid_loader = DataLoader(self._valid_dataset, num_workers=1, shuffle=False, 326 | batch_size=1, pin_memory=True, 327 | drop_last=True, collate_fn=self._collate_fn) 328 | 329 | def get_train_loader(self): 330 | return self.train_loader 331 | 332 | def get_valid_loader(self): 333 | return self.valid_loader 334 | 335 | 
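
The `interpolate_f0` helper above fills unvoiced (zero) frames by linear interpolation and also returns a voiced/unvoiced mask. A simplified, self-contained sketch of the same idea (not the exact implementation, which walks the frames with an explicit loop):

```python
import numpy as np

def interpolate_f0_simple(f0: np.ndarray):
    """Fill f0 == 0 frames by linear interpolation; also return a voiced mask."""
    f0 = f0.astype(np.float32).copy()
    vuv = (f0 > 0).astype(np.float32)
    voiced_idx = np.where(f0 > 0)[0]
    if len(voiced_idx) == 0:
        return f0, vuv                       # nothing to interpolate
    interp = np.interp(np.arange(len(f0)), voiced_idx, f0[voiced_idx])
    return interp.astype(np.float32), vuv

f0 = np.array([0., 220., 0., 0., 233., 0.])
print(interpolate_f0_simple(f0)[0])          # [220. 220. 224.33 228.67 233. 233.]
```
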
-------------------------------------------------------------------------------- /modules/modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | from torch.autograd import Function 9 | from typing import Any, Optional, Tuple 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm 13 | 14 | import modules.commons as commons 15 | import modules.attentions as attentions 16 | from modules.commons import init_weights, get_padding 17 | from modules.transforms import piecewise_rational_quadratic_transform 18 | 19 | 20 | LRELU_SLOPE = 0.1 21 | 22 | 23 | class LayerNorm(nn.Module): 24 | def __init__(self, channels, eps=1e-5): 25 | super().__init__() 26 | self.channels = channels 27 | self.eps = eps 28 | 29 | self.gamma = nn.Parameter(torch.ones(channels)) 30 | self.beta = nn.Parameter(torch.zeros(channels)) 31 | 32 | def forward(self, x): 33 | x = x.transpose(1, -1) 34 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 35 | return x.transpose(1, -1) 36 | 37 | 38 | class ConvReluNorm(nn.Module): 39 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 40 | super().__init__() 41 | self.in_channels = in_channels 42 | self.hidden_channels = hidden_channels 43 | self.out_channels = out_channels 44 | self.kernel_size = kernel_size 45 | self.n_layers = n_layers 46 | self.p_dropout = p_dropout 47 | assert n_layers > 1, "Number of layers should be larger than 0." 48 | 49 | self.conv_layers = nn.ModuleList() 50 | self.norm_layers = nn.ModuleList() 51 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.relu_drop = nn.Sequential( 54 | nn.ReLU(), 55 | nn.Dropout(p_dropout)) 56 | for _ in range(n_layers-1): 57 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 58 | self.norm_layers.append(LayerNorm(hidden_channels)) 59 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 60 | self.proj.weight.data.zero_() 61 | self.proj.bias.data.zero_() 62 | 63 | def forward(self, x, x_mask): 64 | x_org = x 65 | for i in range(self.n_layers): 66 | x = self.conv_layers[i](x * x_mask) 67 | x = self.norm_layers[i](x) 68 | x = self.relu_drop(x) 69 | x = x_org + self.proj(x) 70 | return x * x_mask 71 | 72 | 73 | class DDSConv(nn.Module): 74 | """ 75 | Dialted and Depth-Separable Convolution 76 | """ 77 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 78 | super().__init__() 79 | self.channels = channels 80 | self.kernel_size = kernel_size 81 | self.n_layers = n_layers 82 | self.p_dropout = p_dropout 83 | 84 | self.drop = nn.Dropout(p_dropout) 85 | self.convs_sep = nn.ModuleList() 86 | self.convs_1x1 = nn.ModuleList() 87 | self.norms_1 = nn.ModuleList() 88 | self.norms_2 = nn.ModuleList() 89 | for i in range(n_layers): 90 | dilation = kernel_size ** i 91 | padding = (kernel_size * dilation - dilation) // 2 92 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 93 | groups=channels, dilation=dilation, padding=padding 94 | )) 95 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 96 | self.norms_1.append(LayerNorm(channels)) 97 | self.norms_2.append(LayerNorm(channels)) 98 | 99 
| def forward(self, x, x_mask, g=None): 100 | if g is not None: 101 | x = x + g 102 | for i in range(self.n_layers): 103 | y = self.convs_sep[i](x * x_mask) 104 | y = self.norms_1[i](y) 105 | y = F.gelu(y) 106 | y = self.convs_1x1[i](y) 107 | y = self.norms_2[i](y) 108 | y = F.gelu(y) 109 | y = self.drop(y) 110 | x = x + y 111 | return x * x_mask 112 | 113 | 114 | class WN(torch.nn.Module): 115 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=0, spk_channels=0, p_dropout=0): 116 | super(WN, self).__init__() 117 | assert(kernel_size % 2 == 1) 118 | self.hidden_channels =hidden_channels 119 | self.kernel_size = kernel_size, 120 | self.dilation_rate = dilation_rate 121 | self.n_layers = n_layers 122 | self.n_speakers = n_speakers 123 | self.spk_channels = spk_channels 124 | self.p_dropout = p_dropout 125 | 126 | self.in_layers = torch.nn.ModuleList() 127 | self.res_skip_layers = torch.nn.ModuleList() 128 | self.drop = nn.Dropout(p_dropout) 129 | 130 | if n_speakers > 0: 131 | cond_layer = torch.nn.Conv1d(spk_channels, 2*hidden_channels*n_layers, 1) 132 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 133 | 134 | for i in range(n_layers): 135 | dilation = dilation_rate ** i 136 | padding = int((kernel_size * dilation - dilation) / 2) 137 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 138 | dilation=dilation, padding=padding) 139 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 140 | self.in_layers.append(in_layer) 141 | 142 | # last one is not necessary 143 | if i < n_layers - 1: 144 | res_skip_channels = 2 * hidden_channels 145 | else: 146 | res_skip_channels = hidden_channels 147 | 148 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 149 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 150 | self.res_skip_layers.append(res_skip_layer) 151 | 152 | def forward(self, x, x_mask, g=None, **kwargs): 153 | output = torch.zeros_like(x) 154 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 155 | 156 | if g is not None: 157 | g = self.cond_layer(g) 158 | 159 | for i in range(self.n_layers): 160 | x_in = self.in_layers[i](x) 161 | if g is not None: 162 | cond_offset = i * 2 * self.hidden_channels 163 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 164 | else: 165 | g_l = torch.zeros_like(x_in) 166 | 167 | acts = commons.fused_add_tanh_sigmoid_multiply( 168 | x_in, 169 | g_l, 170 | n_channels_tensor) 171 | acts = self.drop(acts) 172 | 173 | res_skip_acts = self.res_skip_layers[i](acts) 174 | if i < self.n_layers - 1: 175 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 176 | x = (x + res_acts) * x_mask 177 | output = output + res_skip_acts[:,self.hidden_channels:,:] 178 | else: 179 | output = output + res_skip_acts 180 | return output * x_mask 181 | 182 | def remove_weight_norm(self): 183 | if self.n_speakers > 0: 184 | torch.nn.utils.remove_weight_norm(self.cond_layer) 185 | for l in self.in_layers: 186 | torch.nn.utils.remove_weight_norm(l) 187 | for l in self.res_skip_layers: 188 | torch.nn.utils.remove_weight_norm(l) 189 | 190 | 191 | class ResBlock1(torch.nn.Module): 192 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 193 | super(ResBlock1, self).__init__() 194 | self.convs1 = nn.ModuleList([ 195 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 196 | padding=get_padding(kernel_size, dilation[0]))), 197 | weight_norm(Conv1d(channels, channels, kernel_size, 1, 
dilation=dilation[1], 198 | padding=get_padding(kernel_size, dilation[1]))), 199 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 200 | padding=get_padding(kernel_size, dilation[2]))) 201 | ]) 202 | self.convs1.apply(init_weights) 203 | 204 | self.convs2 = nn.ModuleList([ 205 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 206 | padding=get_padding(kernel_size, 1))), 207 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 208 | padding=get_padding(kernel_size, 1))), 209 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 210 | padding=get_padding(kernel_size, 1))) 211 | ]) 212 | self.convs2.apply(init_weights) 213 | 214 | def forward(self, x, x_mask=None): 215 | for c1, c2 in zip(self.convs1, self.convs2): 216 | xt = F.leaky_relu(x, LRELU_SLOPE) 217 | if x_mask is not None: 218 | xt = xt * x_mask 219 | xt = c1(xt) 220 | xt = F.leaky_relu(xt, LRELU_SLOPE) 221 | if x_mask is not None: 222 | xt = xt * x_mask 223 | xt = c2(xt) 224 | x = xt + x 225 | if x_mask is not None: 226 | x = x * x_mask 227 | return x 228 | 229 | def remove_weight_norm(self): 230 | for l in self.convs1: 231 | remove_weight_norm(l) 232 | for l in self.convs2: 233 | remove_weight_norm(l) 234 | 235 | 236 | class ResBlock2(torch.nn.Module): 237 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 238 | super(ResBlock2, self).__init__() 239 | self.convs = nn.ModuleList([ 240 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 241 | padding=get_padding(kernel_size, dilation[0]))), 242 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 243 | padding=get_padding(kernel_size, dilation[1]))) 244 | ]) 245 | self.convs.apply(init_weights) 246 | 247 | def forward(self, x, x_mask=None): 248 | for c in self.convs: 249 | xt = F.leaky_relu(x, LRELU_SLOPE) 250 | if x_mask is not None: 251 | xt = xt * x_mask 252 | xt = c(xt) 253 | x = xt + x 254 | if x_mask is not None: 255 | x = x * x_mask 256 | return x 257 | 258 | def remove_weight_norm(self): 259 | for l in self.convs: 260 | remove_weight_norm(l) 261 | 262 | 263 | class Log(nn.Module): 264 | def forward(self, x, x_mask, reverse=False, **kwargs): 265 | if not reverse: 266 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 267 | logdet = torch.sum(-y, [1, 2]) 268 | return y, logdet 269 | else: 270 | x = torch.exp(x) * x_mask 271 | return x 272 | 273 | 274 | class Flip(nn.Module): 275 | def forward(self, x, *args, reverse=False, **kwargs): 276 | x = torch.flip(x, [1]) 277 | if not reverse: 278 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 279 | return x, logdet 280 | else: 281 | return x 282 | 283 | 284 | class ElementwiseAffine(nn.Module): 285 | def __init__(self, channels): 286 | super().__init__() 287 | self.channels = channels 288 | self.m = nn.Parameter(torch.zeros(channels,1)) 289 | self.logs = nn.Parameter(torch.zeros(channels,1)) 290 | 291 | def forward(self, x, x_mask, reverse=False, **kwargs): 292 | if not reverse: 293 | y = self.m + torch.exp(self.logs) * x 294 | y = y * x_mask 295 | logdet = torch.sum(self.logs * x_mask, [1,2]) 296 | return y, logdet 297 | else: 298 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 299 | return x 300 | 301 | 302 | class ResidualCouplingLayer(nn.Module): 303 | def __init__(self, 304 | channels, 305 | hidden_channels, 306 | kernel_size, 307 | dilation_rate, 308 | n_layers, 309 | p_dropout=0, 310 | n_speakers=0, 311 | spk_channels=0, 312 | mean_only=False): 313 | assert 
channels % 2 == 0, "channels should be divisible by 2" 314 | super().__init__() 315 | self.channels = channels 316 | self.hidden_channels = hidden_channels 317 | self.kernel_size = kernel_size 318 | self.dilation_rate = dilation_rate 319 | self.n_layers = n_layers 320 | self.half_channels = channels // 2 321 | self.mean_only = mean_only 322 | 323 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 324 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, n_speakers=n_speakers, spk_channels=spk_channels) 325 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 326 | self.post.weight.data.zero_() 327 | self.post.bias.data.zero_() 328 | 329 | def forward(self, x, x_mask, g=None, reverse=False): 330 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 331 | h = self.pre(x0) * x_mask 332 | h = self.enc(h, x_mask, g=g) 333 | stats = self.post(h) * x_mask 334 | if not self.mean_only: 335 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 336 | else: 337 | m = stats 338 | logs = torch.zeros_like(m) 339 | 340 | if not reverse: 341 | x1 = m + x1 * torch.exp(logs) * x_mask 342 | x = torch.cat([x0, x1], 1) 343 | logdet = torch.sum(logs, [1,2]) 344 | return x, logdet 345 | else: 346 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 347 | x = torch.cat([x0, x1], 1) 348 | return x 349 | 350 | class ResidualCouplingBlock(nn.Module): 351 | def __init__(self, 352 | channels, 353 | hidden_channels, 354 | kernel_size, 355 | dilation_rate, 356 | n_layers, 357 | n_flows=4, 358 | n_speakers=0, 359 | gin_channels=0): 360 | super().__init__() 361 | self.channels = channels 362 | self.hidden_channels = hidden_channels 363 | self.kernel_size = kernel_size 364 | self.dilation_rate = dilation_rate 365 | self.n_layers = n_layers 366 | self.n_flows = n_flows 367 | self.gin_channels = gin_channels 368 | 369 | self.flows = nn.ModuleList() 370 | for i in range(n_flows): 371 | self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=n_speakers, spk_channels=gin_channels, mean_only=True)) 372 | self.flows.append(Flip()) 373 | 374 | def forward(self, x, x_mask, g=None, reverse=False): 375 | if not reverse: 376 | for flow in self.flows: 377 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 378 | else: 379 | for flow in reversed(self.flows): 380 | x = flow(x, x_mask, g=g, reverse=reverse) 381 | return x 382 | 383 | 384 | class ConvFlow(nn.Module): 385 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 386 | super().__init__() 387 | self.in_channels = in_channels 388 | self.filter_channels = filter_channels 389 | self.kernel_size = kernel_size 390 | self.n_layers = n_layers 391 | self.num_bins = num_bins 392 | self.tail_bound = tail_bound 393 | self.half_channels = in_channels // 2 394 | 395 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 396 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 397 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 398 | self.proj.weight.data.zero_() 399 | self.proj.bias.data.zero_() 400 | 401 | def forward(self, x, x_mask, g=None, reverse=False): 402 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 403 | h = self.pre(x0) 404 | h = self.convs(h, x_mask, g=g) 405 | h = self.proj(h) * x_mask 406 | 407 | b, c, t = x0.shape 408 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
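A minimal usage sketch for the coupling layer defined above, checking that the reverse pass inverts the forward pass; it assumes the repository root is on `PYTHONPATH` and uses toy shapes:

```python
import torch
from modules.modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=4, hidden_channels=8, kernel_size=5,
                              dilation_rate=1, n_layers=2, mean_only=True)
x = torch.randn(1, 4, 16)            # [batch, channels, frames]
x_mask = torch.ones(1, 1, 16)        # all frames valid

z, logdet = layer(x, x_mask)         # forward direction returns (z, log|det J|)
x_rec = layer(z, x_mask, reverse=True)
print(torch.allclose(x, x_rec, atol=1e-5))   # True: the flow step is invertible
```
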
409 | 410 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 411 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 412 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 413 | 414 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 415 | unnormalized_widths, 416 | unnormalized_heights, 417 | unnormalized_derivatives, 418 | inverse=reverse, 419 | tails='linear', 420 | tail_bound=self.tail_bound 421 | ) 422 | 423 | x = torch.cat([x0, x1], 1) * x_mask 424 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 425 | if not reverse: 426 | return x, logdet 427 | else: 428 | return x 429 | 430 | 431 | class ResStack(nn.Module): 432 | def __init__(self, channel, kernel_size=3, base=3, nums=4): 433 | super(ResStack, self).__init__() 434 | 435 | self.layers = nn.ModuleList([ 436 | nn.Sequential( 437 | nn.LeakyReLU(), 438 | nn.utils.weight_norm(nn.Conv1d(channel, channel, 439 | kernel_size=kernel_size, dilation=base**i, padding=base**i)), 440 | nn.LeakyReLU(), 441 | nn.utils.weight_norm(nn.Conv1d(channel, channel, 442 | kernel_size=kernel_size, dilation=1, padding=1)), 443 | ) 444 | for i in range(nums) 445 | ]) 446 | 447 | def forward(self, x): 448 | for layer in self.layers: 449 | x = x + layer(x) 450 | return x 451 | -------------------------------------------------------------------------------- /modules/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import modules.commons as commons 9 | 10 | 11 | class LayerNorm(nn.Module): 12 | def __init__(self, channels, eps=1e-5): 13 | super().__init__() 14 | self.channels = channels 15 | self.eps = eps 16 | 17 | self.gamma = nn.Parameter(torch.ones(channels)) 18 | self.beta = nn.Parameter(torch.zeros(channels)) 19 | 20 | def forward(self, x): 21 | x = x.transpose(1, -1) 22 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 23 | return x.transpose(1, -1) 24 | 25 | 26 | class Encoder(nn.Module): 27 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 28 | super().__init__() 29 | self.hidden_channels = hidden_channels 30 | self.filter_channels = filter_channels 31 | self.n_heads = n_heads 32 | self.n_layers = n_layers 33 | self.kernel_size = kernel_size 34 | self.p_dropout = p_dropout 35 | self.window_size = window_size 36 | 37 | self.drop = nn.Dropout(p_dropout) 38 | self.attn_layers = nn.ModuleList() 39 | self.norm_layers_1 = nn.ModuleList() 40 | self.ffn_layers = nn.ModuleList() 41 | self.norm_layers_2 = nn.ModuleList() 42 | for i in range(self.n_layers): 43 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 44 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 45 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 46 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 47 | 48 | def forward(self, x, x_mask): 49 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 50 | x = x * x_mask 51 | for i in range(self.n_layers): 52 | y = self.attn_layers[i](x, x, attn_mask) 53 | y = self.drop(y) 54 | x = self.norm_layers_1[i](x + y) 55 | 56 | y = self.ffn_layers[i](x, x_mask) 57 | y = self.drop(y) 58 | x = self.norm_layers_2[i](x + y) 59 | x 
= x * x_mask 60 | return x 61 | 62 | class Decoder(nn.Module): 63 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 64 | super().__init__() 65 | self.hidden_channels = hidden_channels 66 | self.filter_channels = filter_channels 67 | self.n_heads = n_heads 68 | self.n_layers = n_layers 69 | self.kernel_size = kernel_size 70 | self.p_dropout = p_dropout 71 | self.proximal_bias = proximal_bias 72 | self.proximal_init = proximal_init 73 | 74 | self.drop = nn.Dropout(p_dropout) 75 | self.self_attn_layers = nn.ModuleList() 76 | self.norm_layers_0 = nn.ModuleList() 77 | self.encdec_attn_layers = nn.ModuleList() 78 | self.norm_layers_1 = nn.ModuleList() 79 | self.ffn_layers = nn.ModuleList() 80 | self.norm_layers_2 = nn.ModuleList() 81 | for i in range(self.n_layers): 82 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 83 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 84 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 85 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 86 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 87 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 88 | 89 | def forward(self, x, x_mask, h, h_mask): 90 | """ 91 | x: decoder input 92 | h: encoder output 93 | """ 94 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 95 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 96 | x = x * x_mask 97 | for i in range(self.n_layers): 98 | y = self.self_attn_layers[i](x, x, self_attn_mask) 99 | y = self.drop(y) 100 | x = self.norm_layers_0[i](x + y) 101 | 102 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 103 | y = self.drop(y) 104 | x = self.norm_layers_1[i](x + y) 105 | 106 | y = self.ffn_layers[i](x, x_mask) 107 | y = self.drop(y) 108 | x = self.norm_layers_2[i](x + y) 109 | x = x * x_mask 110 | return x 111 | 112 | class FFT(nn.Module): 113 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 114 | super().__init__() 115 | self.hidden_channels = hidden_channels 116 | self.filter_channels = filter_channels 117 | self.n_heads = n_heads 118 | self.n_layers = n_layers 119 | self.kernel_size = kernel_size 120 | self.p_dropout = p_dropout 121 | self.proximal_bias = proximal_bias 122 | self.proximal_init = proximal_init 123 | 124 | self.drop = nn.Dropout(p_dropout) 125 | self.self_attn_layers = nn.ModuleList() 126 | self.norm_layers_0 = nn.ModuleList() 127 | self.ffn_layers = nn.ModuleList() 128 | self.norm_layers_1 = nn.ModuleList() 129 | for i in range(self.n_layers): 130 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 131 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 132 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 133 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 134 | 135 | def forward(self, x, x_mask): 136 | """ 137 | x: decoder input 138 | h: encoder output 139 | """ 140 | self_attn_mask = 
commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 141 | x = x * x_mask 142 | for i in range(self.n_layers): 143 | y = self.self_attn_layers[i](x, x, self_attn_mask) 144 | y = self.drop(y) 145 | x = self.norm_layers_0[i](x + y) 146 | 147 | y = self.ffn_layers[i](x, x_mask) 148 | y = self.drop(y) 149 | x = self.norm_layers_1[i](x + y) 150 | x = x * x_mask 151 | return x 152 | 153 | 154 | class FFNs(nn.Module): 155 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 156 | super().__init__() 157 | self.hidden_channels = hidden_channels 158 | self.filter_channels = filter_channels 159 | self.n_heads = n_heads 160 | self.n_layers = n_layers 161 | self.kernel_size = kernel_size 162 | self.p_dropout = p_dropout 163 | self.proximal_bias = proximal_bias 164 | self.proximal_init = proximal_init 165 | 166 | self.drop = nn.Dropout(p_dropout) 167 | #self.self_attn_layers = nn.ModuleList() 168 | #self.norm_layers_0 = nn.ModuleList() 169 | self.ffn_layers = nn.ModuleList() 170 | self.norm_layers_1 = nn.ModuleList() 171 | for i in range(self.n_layers): 172 | #self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 173 | #self.norm_layers_0.append(LayerNorm(hidden_channels)) 174 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 175 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 176 | 177 | def forward(self, x, x_mask): 178 | """ 179 | x: decoder input 180 | h: encoder output 181 | """ 182 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 183 | x = x * x_mask 184 | for i in range(self.n_layers): 185 | #y = self.self_attn_layers[i](x, x, self_attn_mask) 186 | #y = self.drop(y) 187 | #x = self.norm_layers_0[i](x + y) 188 | 189 | y = self.ffn_layers[i](x, x_mask) 190 | y = self.drop(y) 191 | x = self.norm_layers_1[i](x + y) 192 | x = x * x_mask 193 | return x 194 | 195 | class MultiHeadAttention(nn.Module): 196 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 197 | super().__init__() 198 | assert channels % n_heads == 0 199 | 200 | self.channels = channels 201 | self.out_channels = out_channels 202 | self.n_heads = n_heads 203 | self.p_dropout = p_dropout 204 | self.window_size = window_size 205 | self.heads_share = heads_share 206 | self.block_length = block_length 207 | self.proximal_bias = proximal_bias 208 | self.proximal_init = proximal_init 209 | self.attn = None 210 | 211 | self.k_channels = channels // n_heads 212 | self.conv_q = nn.Conv1d(channels, channels, 1) 213 | self.conv_k = nn.Conv1d(channels, channels, 1) 214 | self.conv_v = nn.Conv1d(channels, channels, 1) 215 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 216 | self.drop = nn.Dropout(p_dropout) 217 | 218 | if window_size is not None: 219 | n_heads_rel = 1 if heads_share else n_heads 220 | rel_stddev = self.k_channels**-0.5 221 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 222 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 223 | 224 | nn.init.xavier_uniform_(self.conv_q.weight) 225 | nn.init.xavier_uniform_(self.conv_k.weight) 226 
| nn.init.xavier_uniform_(self.conv_v.weight) 227 | if proximal_init: 228 | with torch.no_grad(): 229 | self.conv_k.weight.copy_(self.conv_q.weight) 230 | self.conv_k.bias.copy_(self.conv_q.bias) 231 | 232 | def forward(self, x, c, attn_mask=None): 233 | q = self.conv_q(x) 234 | k = self.conv_k(c) 235 | v = self.conv_v(c) 236 | 237 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 238 | 239 | x = self.conv_o(x) 240 | return x 241 | 242 | def attention(self, query, key, value, mask=None): 243 | # reshape [b, d, t] -> [b, n_h, t, d_k] 244 | b, d, t_s, t_t = (*key.size(), query.size(2)) 245 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 246 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 247 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 248 | 249 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 250 | if self.window_size is not None: 251 | assert t_s == t_t, "Relative attention is only available for self-attention." 252 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 253 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 254 | scores_local = self._relative_position_to_absolute_position(rel_logits) 255 | scores = scores + scores_local 256 | if self.proximal_bias: 257 | assert t_s == t_t, "Proximal bias is only available for self-attention." 258 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 259 | if mask is not None: 260 | scores = scores.masked_fill(mask == 0, -1e4) 261 | if self.block_length is not None: 262 | assert t_s == t_t, "Local attention is only available for self-attention." 263 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 264 | scores = scores.masked_fill(block_mask == 0, -1e4) 265 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 266 | p_attn = self.drop(p_attn) 267 | output = torch.matmul(p_attn, value) 268 | if self.window_size is not None: 269 | relative_weights = self._absolute_position_to_relative_position(p_attn) 270 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 271 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 272 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 273 | return output, p_attn 274 | 275 | def _matmul_with_relative_values(self, x, y): 276 | """ 277 | x: [b, h, l, m] 278 | y: [h or 1, m, d] 279 | ret: [b, h, l, d] 280 | """ 281 | ret = torch.matmul(x, y.unsqueeze(0)) 282 | return ret 283 | 284 | def _matmul_with_relative_keys(self, x, y): 285 | """ 286 | x: [b, h, l, d] 287 | y: [h or 1, m, d] 288 | ret: [b, h, l, m] 289 | """ 290 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 291 | return ret 292 | 293 | def _get_relative_embeddings(self, relative_embeddings, length): 294 | max_relative_position = 2 * self.window_size + 1 295 | # Pad first before slice to avoid using cond ops. 
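
A minimal usage sketch for the windowed relative-position attention in this file (the `Encoder` defined earlier wraps `MultiHeadAttention` and `FFN`); the shapes are illustrative and the repository root is assumed to be on `PYTHONPATH`. With `window_size=4` and 50 frames, the relative-embedding padding branch below is exercised:

```python
import torch
from modules.attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=4, kernel_size=3, p_dropout=0.1, window_size=4)
x = torch.randn(2, 192, 50)          # [batch, hidden_channels, frames]
x_mask = torch.ones(2, 1, 50)        # 1 = valid frame, 0 = padding
y = enc(x, x_mask)                   # output keeps the input shape: [2, 192, 50]
print(y.shape)
```
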
296 | pad_length = max(length - (self.window_size + 1), 0) 297 | slice_start_position = max((self.window_size + 1) - length, 0) 298 | slice_end_position = slice_start_position + 2 * length - 1 299 | if pad_length > 0: 300 | padded_relative_embeddings = F.pad( 301 | relative_embeddings, 302 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 303 | else: 304 | padded_relative_embeddings = relative_embeddings 305 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 306 | return used_relative_embeddings 307 | 308 | def _relative_position_to_absolute_position(self, x): 309 | """ 310 | x: [b, h, l, 2*l-1] 311 | ret: [b, h, l, l] 312 | """ 313 | batch, heads, length, _ = x.size() 314 | # Concat columns of pad to shift from relative to absolute indexing. 315 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 316 | 317 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 318 | x_flat = x.view([batch, heads, length * 2 * length]) 319 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 320 | 321 | # Reshape and slice out the padded elements. 322 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 323 | return x_final 324 | 325 | def _absolute_position_to_relative_position(self, x): 326 | """ 327 | x: [b, h, l, l] 328 | ret: [b, h, l, 2*l-1] 329 | """ 330 | batch, heads, length, _ = x.size() 331 | # padd along column 332 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 333 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 334 | # add 0's in the beginning that will skew the elements after reshape 335 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 336 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 337 | return x_final 338 | 339 | def _attention_bias_proximal(self, length): 340 | """Bias for self-attention to encourage attention to close positions. 341 | Args: 342 | length: an integer scalar. 
343 | Returns: 344 | a Tensor with shape [1, 1, length, length] 345 | """ 346 | r = torch.arange(length, dtype=torch.float32) 347 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 348 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 349 | 350 | 351 | class FFN(nn.Module): 352 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 353 | super().__init__() 354 | self.in_channels = in_channels 355 | self.out_channels = out_channels 356 | self.filter_channels = filter_channels 357 | self.kernel_size = kernel_size 358 | self.p_dropout = p_dropout 359 | self.activation = activation 360 | self.causal = causal 361 | 362 | if causal: 363 | self.padding = self._causal_padding 364 | else: 365 | self.padding = self._same_padding 366 | 367 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 368 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 369 | self.drop = nn.Dropout(p_dropout) 370 | 371 | def forward(self, x, x_mask): 372 | x = self.conv_1(self.padding(x * x_mask)) 373 | if self.activation == "gelu": 374 | x = x * torch.sigmoid(1.702 * x) 375 | else: 376 | x = torch.relu(x) 377 | x = self.drop(x) 378 | x = self.conv_2(self.padding(x * x_mask)) 379 | return x * x_mask 380 | 381 | def _causal_padding(self, x): 382 | if self.kernel_size == 1: 383 | return x 384 | pad_l = self.kernel_size - 1 385 | pad_r = 0 386 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 387 | x = F.pad(x, commons.convert_pad_shape(padding)) 388 | return x 389 | 390 | def _same_padding(self, x): 391 | if self.kernel_size == 1: 392 | return x 393 | pad_l = (self.kernel_size - 1) // 2 394 | pad_r = self.kernel_size // 2 395 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 396 | x = F.pad(x, commons.convert_pad_shape(padding)) 397 | return x 398 | -------------------------------------------------------------------------------- /egs/visinger2/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import itertools 6 | import math 7 | import time 8 | import logging 9 | 10 | import torch 11 | from torch import nn, optim 12 | from torch.nn import functional as F 13 | from torch.utils.data import DataLoader 14 | from torch.utils.tensorboard import SummaryWriter 15 | import torch.multiprocessing as mp 16 | import torch.distributed as dist 17 | from torch.nn.parallel import DistributedDataParallel as DDP 18 | from torch.cuda.amp import autocast, GradScaler 19 | 20 | sys.path.append('../..') 21 | import modules.commons as commons 22 | import utils.utils as utils 23 | 24 | from dataset import DatasetConstructor 25 | 26 | from models import ( 27 | SynthesizerTrn, 28 | Discriminator 29 | ) 30 | 31 | from modules.losses import ( 32 | generator_loss, 33 | discriminator_loss, 34 | feature_loss, 35 | kl_loss, 36 | ) 37 | from preprocess.mel_processing import mel_spectrogram_torch, spec_to_mel_torch, spectrogram_torch 38 | 39 | torch.backends.cudnn.benchmark = True 40 | global_step = 0 41 | use_cuda = torch.cuda.is_available() 42 | print("use_cuda, ", use_cuda) 43 | 44 | numba_logger = logging.getLogger('numba') 45 | numba_logger.setLevel(logging.WARNING) 46 | 47 | 48 | def main(): 49 | """Assume Single Node Multi GPUs Training Only""" 50 | 51 | hps = utils.get_hparams() 52 | os.environ['MASTER_ADDR'] = 'localhost' 53 | os.environ['MASTER_PORT'] = str(hps.train.port) 54 | 55 | if (torch.cuda.is_available()): 56 | 
n_gpus = torch.cuda.device_count() 57 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) 58 | else: 59 | cpurun(0, 1, hps) 60 | 61 | 62 | def run(rank, n_gpus, hps): 63 | global global_step 64 | if rank == 0: 65 | logger = utils.get_logger(hps.train.save_dir) 66 | logger.info(hps.train) 67 | logger.info(hps.data) 68 | logger.info(hps.model) 69 | utils.check_git_hash(hps.train.save_dir) 70 | writer = SummaryWriter(log_dir=hps.train.save_dir) 71 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.train.save_dir, "eval")) 72 | 73 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) 74 | torch.manual_seed(hps.train.seed) 75 | torch.cuda.set_device(rank) 76 | dataset_constructor = DatasetConstructor(hps, num_replicas=n_gpus, rank=rank) 77 | 78 | train_loader = dataset_constructor.get_train_loader() 79 | if rank == 0: 80 | valid_loader = dataset_constructor.get_valid_loader() 81 | 82 | net_g = SynthesizerTrn(hps).cuda(rank) 83 | net_d = Discriminator(hps, hps.model.use_spectral_norm).cuda(rank) 84 | 85 | optim_g = torch.optim.AdamW( 86 | net_g.parameters(), 87 | hps.train.learning_rate, 88 | betas=hps.train.betas, 89 | eps=hps.train.eps) 90 | optim_d = torch.optim.AdamW( 91 | net_d.parameters(), 92 | hps.train.learning_rate, 93 | betas=hps.train.betas, 94 | eps=hps.train.eps) 95 | net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) 96 | net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) 97 | try: 98 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "G_*.pth"), net_g, 99 | optim_g) 100 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "D_*.pth"), net_d, 101 | optim_d) 102 | global_step = (epoch_str - 1) * len(train_loader) 103 | except: 104 | epoch_str = 1 105 | global_step = 0 106 | 107 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 108 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 109 | 110 | for epoch in range(epoch_str, hps.train.epochs + 1): 111 | if rank == 0: 112 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], 113 | [train_loader, valid_loader], logger, [writer, writer_eval]) 114 | else: 115 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], 116 | [train_loader, None], None, None) 117 | scheduler_g.step() 118 | scheduler_d.step() 119 | 120 | 121 | def cpurun(rank, n_gpus, hps): 122 | global global_step 123 | if rank == 0: 124 | logger = utils.get_logger(hps.train.save_dir) 125 | logger.info(hps.train) 126 | logger.info(hps.data) 127 | logger.info(hps.model) 128 | utils.check_git_hash(hps.train.save_dir) 129 | writer = SummaryWriter(log_dir=hps.train.save_dir) 130 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.train.save_dir, "eval")) 131 | torch.manual_seed(hps.train.seed) 132 | dataset_constructor = DatasetConstructor(hps, num_replicas=n_gpus, rank=rank) 133 | 134 | train_loader = dataset_constructor.get_train_loader() 135 | if rank == 0: 136 | valid_loader = dataset_constructor.get_valid_loader() 137 | 138 | net_g = SynthesizerTrn(hps) 139 | net_d = Discriminator(hps, hps.model.use_spectral_norm) 140 | 141 | optim_g = torch.optim.AdamW( 142 | net_g.parameters(), 143 | hps.train.learning_rate, 144 | betas=hps.train.betas, 145 | eps=hps.train.eps) 146 | optim_d = 
torch.optim.AdamW( 147 | net_d.parameters(), 148 | hps.train.learning_rate, 149 | betas=hps.train.betas, 150 | eps=hps.train.eps) 151 | try: 152 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "G_*.pth"), net_g, 153 | optim_g) 154 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "D_*.pth"), net_g, 155 | optim_g) 156 | global_step = (epoch_str - 1) * len(train_loader) 157 | except: 158 | epoch_str = 1 159 | global_step = 0 160 | 161 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 162 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 163 | 164 | for epoch in range(epoch_str, hps.train.epochs + 1): 165 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], 166 | [train_loader, valid_loader], logger, [writer, writer_eval]) 167 | 168 | scheduler_g.step() 169 | scheduler_d.step() 170 | 171 | 172 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, loaders, logger, writers): 173 | net_g, net_d = nets 174 | optim_g, optim_d = optims 175 | scheduler_g, scheduler_d = schedulers 176 | train_loader, eval_loader = loaders 177 | if writers is not None: 178 | writer, writer_eval = writers 179 | 180 | train_loader.sampler.set_epoch(epoch) 181 | global global_step 182 | 183 | net_g.train() 184 | net_d.train() 185 | for batch_idx, data_dict in enumerate(train_loader): 186 | 187 | phone = data_dict["phone"] 188 | pitchid = data_dict["pitchid"] 189 | dur = data_dict["dur"] 190 | slur = data_dict["slur"] 191 | gtdur = data_dict["gtdur"] 192 | mel = data_dict["mel"] 193 | f0 = data_dict["f0"] 194 | wav = data_dict["wav"] 195 | spkid = data_dict["spkid"] 196 | 197 | phone_lengths = data_dict["phone_lengths"] 198 | mel_lengths = data_dict["mel_lengths"] 199 | wav_lengths = data_dict["wav_lengths"] 200 | f0_lengths = data_dict["f0_lengths"] 201 | 202 | # data 203 | if (use_cuda): 204 | phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True) 205 | pitchid = pitchid.cuda(rank, non_blocking=True) 206 | dur = dur.cuda(rank, non_blocking=True) 207 | slur = slur.cuda(rank, non_blocking=True) 208 | gtdur = gtdur.cuda(rank, non_blocking=True) 209 | mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) 210 | wav, wav_lengths = wav.cuda(rank, non_blocking=True), wav_lengths.cuda(rank, non_blocking=True) 211 | f0, f0_lengths = f0.cuda(rank, non_blocking=True), f0_lengths.cuda(rank, non_blocking=True) 212 | spkid = spkid.cuda(rank, non_blocking=True) 213 | 214 | # forward 215 | y_hat, ids_slice, LF0, y_ddsp, kl_div, predict_mel, mask = net_g(phone, phone_lengths, pitchid, dur, slur, 216 | gtdur, f0, mel, mel_lengths, spk_id=spkid) 217 | y_ddsp = y_ddsp.unsqueeze(1) 218 | 219 | # Discriminator 220 | y = commons.slice_segments(wav, ids_slice * hps.data.hop_size, hps.train.segment_size) # slice 221 | y_ddsp_mel = mel_spectrogram_torch( 222 | y_ddsp.squeeze(1), 223 | hps.data.n_fft, 224 | hps.data.acoustic_dim, 225 | hps.data.sample_rate, 226 | hps.data.hop_size, 227 | hps.data.win_size, 228 | hps.data.fmin, 229 | hps.data.fmax 230 | ) 231 | 232 | y_logspec = torch.log(spectrogram_torch( 233 | y.squeeze(1), 234 | hps.data.n_fft, 235 | hps.data.sample_rate, 236 | hps.data.hop_size, 237 | hps.data.win_size 238 | ) + 1e-7) 239 | 240 | y_ddsp_logspec = 
torch.log(spectrogram_torch( 241 | y_ddsp.squeeze(1), 242 | hps.data.n_fft, 243 | hps.data.sample_rate, 244 | hps.data.hop_size, 245 | hps.data.win_size 246 | ) + 1e-7) 247 | 248 | y_mel = mel_spectrogram_torch( 249 | y.squeeze(1), 250 | hps.data.n_fft, 251 | hps.data.acoustic_dim, 252 | hps.data.sample_rate, 253 | hps.data.hop_size, 254 | hps.data.win_size, 255 | hps.data.fmin, 256 | hps.data.fmax 257 | ) 258 | y_hat_mel = mel_spectrogram_torch( 259 | y_hat.squeeze(1), 260 | hps.data.n_fft, 261 | hps.data.acoustic_dim, 262 | hps.data.sample_rate, 263 | hps.data.hop_size, 264 | hps.data.win_size, 265 | hps.data.fmin, 266 | hps.data.fmax 267 | ) 268 | 269 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) 270 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) 271 | loss_disc_all = loss_disc 272 | 273 | optim_d.zero_grad() 274 | loss_disc_all.backward() 275 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) 276 | optim_d.step() 277 | 278 | # loss 279 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) 280 | 281 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * 45 282 | loss_mel_dsp = F.l1_loss(y_mel, y_ddsp_mel) * 45 283 | loss_spec_dsp = F.l1_loss(y_logspec, y_ddsp_logspec) * 45 284 | 285 | loss_mel_am = F.mse_loss(mel * mask, predict_mel * mask) # * 10 286 | 287 | loss_fm = feature_loss(fmap_r, fmap_g) 288 | loss_gen, losses_gen = generator_loss(y_d_hat_g) 289 | 290 | loss_fm = loss_fm / 2 291 | loss_gen = loss_gen / 2 292 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_mel_dsp + kl_div + loss_mel_am + loss_spec_dsp 293 | 294 | loss_gen_all = loss_gen_all / hps.train.accumulation_steps 295 | 296 | loss_gen_all.backward() 297 | if ((global_step + 1) % hps.train.accumulation_steps == 0): 298 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) 299 | optim_g.step() 300 | optim_g.zero_grad() 301 | 302 | if rank == 0: 303 | if (global_step + 1) % (hps.train.accumulation_steps * 10) == 0: 304 | logger.info(["step&time", global_step, time.asctime(time.localtime(time.time()))]) 305 | logger.info(["mel&mel_dsp&spec_dsp: ", loss_mel, loss_mel_dsp, loss_spec_dsp]) 306 | logger.info(["adv&fm: ", loss_gen, loss_fm]) 307 | logger.info(["kl: ", kl_div]) 308 | logger.info(["am&dur: ", loss_mel_am]) 309 | 310 | if global_step % hps.train.log_interval == 0: 311 | lr = optim_g.param_groups[0]['lr'] 312 | losses = [loss_gen_all, loss_mel] 313 | logger.info('Train Epoch: {} [{:.0f}%]'.format( 314 | epoch, 315 | 100. 
* batch_idx / len(train_loader))) 316 | logger.info([x.item() for x in losses] + [global_step, lr]) 317 | 318 | scalar_dict = {"loss/total": loss_gen_all, 319 | "loss/mel": loss_mel, 320 | "loss/adv": loss_gen, 321 | "loss/fm": loss_fm, 322 | "loss/mel_ddsp": loss_mel_dsp, 323 | "loss/spec_ddsp": loss_spec_dsp, 324 | "loss/mel_am": loss_mel_am, 325 | "loss/kl_div": kl_div, 326 | "learning_rate": lr} 327 | 328 | utils.summarize( 329 | writer=writer, 330 | global_step=global_step, 331 | scalars=scalar_dict) 332 | 333 | if global_step % hps.train.eval_interval == 0: 334 | logger.info(['All training params(G): ', utils.count_parameters(net_g), ' M']) 335 | # print('Sub training params(G): ', \ 336 | # 'text_encoder: ', utils.count_parameters(net_g.module.text_encoder), ' M, ', \ 337 | # 'decoder: ', utils.count_parameters(net_g.module.decoder), ' M, ', \ 338 | # 'mel_decoder: ', utils.count_parameters(net_g.module.mel_decoder), ' M, ', \ 339 | # 'dec: ', utils.count_parameters(net_g.module.dec), ' M, ', \ 340 | # 'dec_harm: ', utils.count_parameters(net_g.module.dec_harm), ' M, ', \ 341 | # 'dec_noise: ', utils.count_parameters(net_g.module.dec_noise), ' M, ', \ 342 | # 'posterior: ', utils.count_parameters(net_g.module.posterior_encoder), ' M, ', \ 343 | # ) 344 | 345 | evaluate(hps, net_g, eval_loader, writer_eval) 346 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, 347 | os.path.join(hps.train.save_dir, "G_{}.pth".format(global_step)), hps.train.eval_interval) 348 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, 349 | os.path.join(hps.train.save_dir, "D_{}.pth".format(global_step)), hps.train.eval_interval) 350 | net_g.train() 351 | global_step += 1 352 | 353 | if rank == 0: 354 | logger.info('====> Epoch: {}'.format(epoch)) 355 | 356 | 357 | def evaluate(hps, generator, eval_loader, writer_eval): 358 | generator.eval() 359 | image_dict = {} 360 | audio_dict = {} 361 | with torch.no_grad(): 362 | for batch_idx, data_dict in enumerate(eval_loader): 363 | if batch_idx == 4: 364 | break 365 | phone = data_dict["phone"] 366 | pitchid = data_dict["pitchid"] 367 | dur = data_dict["dur"] 368 | slur = data_dict["slur"] 369 | gtdur = data_dict["gtdur"] 370 | mel = data_dict["mel"] 371 | f0 = data_dict["f0"] 372 | wav = data_dict["wav"] 373 | spkid = data_dict["spkid"] 374 | 375 | phone_lengths = data_dict["phone_lengths"] 376 | mel_lengths = data_dict["mel_lengths"] 377 | wav_lengths = data_dict["wav_lengths"] 378 | f0_lengths = data_dict["f0_lengths"] 379 | 380 | # data 381 | if (use_cuda): 382 | phone, phone_lengths = phone.cuda(0), phone_lengths.cuda(0) 383 | pitchid = pitchid.cuda(0) 384 | dur = dur.cuda(0) 385 | slur = slur.cuda(0) 386 | wav = wav.cuda(0) 387 | mel = mel.cuda(0) 388 | f0 = f0.cuda(0) 389 | gtdur = gtdur.cuda(0) 390 | spkid = spkid.cuda(0) 391 | # remove else 392 | phone = phone[:1] 393 | phone_lengths = phone_lengths[:1] 394 | pitchid = pitchid[:1] 395 | dur = dur[:1] 396 | slur = slur[:1] 397 | wav = wav[:1] 398 | mel = mel[:1] 399 | f0 = f0[:1] 400 | gtdur = gtdur[:1] 401 | spkid = spkid[:1] 402 | 403 | y_hat, y_harm, y_noise = generator.module.infer(phone, phone_lengths, pitchid, dur, slur, gtdur=gtdur, F0=f0, 404 | spk_id=spkid) 405 | spec = spectrogram_torch( 406 | wav.squeeze(1), 407 | hps.data.n_fft, 408 | hps.data.sample_rate, 409 | hps.data.hop_size, 410 | hps.data.win_size 411 | ) 412 | 413 | y_mel = mel_spectrogram_torch( 414 | wav.squeeze(1), 415 | hps.data.n_fft, 416 | hps.data.acoustic_dim, 417 | hps.data.sample_rate, 
418 | hps.data.hop_size, 419 | hps.data.win_size, 420 | hps.data.fmin, 421 | hps.data.fmax 422 | ) 423 | y_hat_mel = mel_spectrogram_torch( 424 | y_hat.squeeze(1), 425 | hps.data.n_fft, 426 | hps.data.acoustic_dim, 427 | hps.data.sample_rate, 428 | hps.data.hop_size, 429 | hps.data.win_size, 430 | hps.data.fmin, 431 | hps.data.fmax 432 | ) 433 | image_dict.update({ 434 | f"gen/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()), 435 | }) 436 | audio_dict.update( { 437 | f"gen/audio_{batch_idx}": y_hat[0, :, :], 438 | f"gen/harm_{batch_idx}": y_harm[0, :, :], 439 | "gen/noise": y_noise[0, :, :] 440 | }) 441 | # if global_step == 0: 442 | image_dict.update({f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) 443 | audio_dict.update({f"gt/audio_{batch_idx}": wav[0, :, :wav_lengths[0]]}) 444 | 445 | utils.summarize( 446 | writer=writer_eval, 447 | global_step=global_step, 448 | images=image_dict, 449 | audios=audio_dict, 450 | audio_sampling_rate=hps.data.sample_rate 451 | ) 452 | generator.train() 453 | 454 | 455 | if __name__ == "__main__": 456 | main() 457 | -------------------------------------------------------------------------------- /modules/stft.py: -------------------------------------------------------------------------------- 1 | from librosa.util import pad_center, tiny 2 | from scipy.signal import get_window 3 | from torch import Tensor 4 | from torch.autograd import Variable 5 | from typing import Optional, Tuple 6 | 7 | import librosa 8 | import librosa.util as librosa_util 9 | import math 10 | import numpy as np 11 | import scipy 12 | import torch 13 | import torch.nn.functional as F 14 | import warnings 15 | 16 | 17 | def create_fb_matrix( 18 | n_freqs: int, 19 | f_min: float, 20 | f_max: float, 21 | n_mels: int, 22 | sample_rate: int, 23 | norm: Optional[str] = None 24 | ) -> Tensor: 25 | r"""Create a frequency bin conversion matrix. 26 | 27 | Args: 28 | n_freqs (int): Number of frequencies to highlight/apply 29 | f_min (float): Minimum frequency (Hz) 30 | f_max (float): Maximum frequency (Hz) 31 | n_mels (int): Number of mel filterbanks 32 | sample_rate (int): Sample rate of the audio waveform 33 | norm (Optional[str]): If 'slaney', divide the triangular mel weights by the width of the mel band 34 | (area normalization). (Default: ``None``) 35 | 36 | Returns: 37 | Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) 38 | meaning number of frequencies to highlight/apply to x the number of filterbanks. 39 | Each column is a filterbank so that assuming there is a matrix A of 40 | size (..., ``n_freqs``), the applied result would be 41 | ``A * create_fb_matrix(A.size(-1), ...)``. 42 | """ 43 | 44 | if norm is not None and norm != "slaney": 45 | raise ValueError("norm must be one of None or 'slaney'") 46 | 47 | # freq bins 48 | # Equivalent filterbank construction by Librosa 49 | all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) 50 | 51 | # calculate mel freq bins 52 | # hertz to mel(f) is 2595. * math.log10(1. + (f / 700.)) 53 | m_min = 2595.0 * math.log10(1.0 + (f_min / 700.0)) 54 | m_max = 2595.0 * math.log10(1.0 + (f_max / 700.0)) 55 | m_pts = torch.linspace(m_min, m_max, n_mels + 2) 56 | # mel to hertz(mel) is 700. * (10**(mel / 2595.) - 1.) 
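# (editorial sketch, not part of the original file) sanity check of the HTK-style
# formulas in the comments above: 2595 * log10(1 + 1000/700) ≈ 1000, and
# 700 * (10 ** (1000 / 2595.) - 1.) ≈ 1000, i.e. 1000 Hz maps to ≈ 1000 mel and back.
# Illustrative use of the returned filterbank, assuming a magnitude spectrogram
# `spec` of shape (n_freqs, time) at the project's 44100 Hz sample rate (n_mels=80 is
# only an example value here):
#   fb = create_fb_matrix(n_freqs=1025, f_min=0.0, f_max=22050.0, n_mels=80, sample_rate=44100)
#   mel = torch.matmul(spec.transpose(0, 1), fb).transpose(0, 1)   # -> (n_mels, time)
# (this is exactly how MelScale.forward applies `fb` further down in this file)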
57 | f_pts = 700.0 * (10 ** (m_pts / 2595.0) - 1.0) 58 | # calculate the difference between each mel point and each stft freq point in hertz 59 | f_diff = f_pts[1:] - f_pts[:-1] # (n_mels + 1) 60 | slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_mels + 2) 61 | # create overlapping triangles 62 | down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_mels) 63 | up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_mels) 64 | fb = torch.min(down_slopes, up_slopes) 65 | fb = torch.clamp(fb, 1e-6, 1) 66 | 67 | if norm is not None and norm == "slaney": 68 | # Slaney-style mel is scaled to be approx constant energy per channel 69 | enorm = 2.0 / (f_pts[2:n_mels + 2] - f_pts[:n_mels]) 70 | fb *= enorm.unsqueeze(0) 71 | return fb 72 | 73 | 74 | def lfilter( 75 | waveform: Tensor, 76 | a_coeffs: Tensor, 77 | b_coeffs: Tensor, 78 | clamp: bool = True, 79 | ) -> Tensor: 80 | r"""Perform an IIR filter by evaluating difference equation. 81 | 82 | Args: 83 | waveform (Tensor): audio waveform of dimension of ``(..., time)``. Must be normalized to -1 to 1. 84 | a_coeffs (Tensor): denominator coefficients of difference equation of dimension of ``(n_order + 1)``. 85 | Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. 86 | Must be same size as b_coeffs (pad with 0's as necessary). 87 | b_coeffs (Tensor): numerator coefficients of difference equation of dimension of ``(n_order + 1)``. 88 | Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. 89 | Must be same size as a_coeffs (pad with 0's as necessary). 90 | clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) 91 | 92 | Returns: 93 | Tensor: Waveform with dimension of ``(..., time)``. 94 | """ 95 | # pack batch 96 | shape = waveform.size() 97 | waveform = waveform.reshape(-1, shape[-1]) 98 | 99 | assert (a_coeffs.size(0) == b_coeffs.size(0)) 100 | assert (len(waveform.size()) == 2) 101 | assert (waveform.device == a_coeffs.device) 102 | assert (b_coeffs.device == a_coeffs.device) 103 | 104 | device = waveform.device 105 | dtype = waveform.dtype 106 | n_channel, n_sample = waveform.size() 107 | n_order = a_coeffs.size(0) 108 | n_sample_padded = n_sample + n_order - 1 109 | assert (n_order > 0) 110 | 111 | # Pad the input and create output 112 | padded_waveform = torch.zeros(n_channel, n_sample_padded, dtype=dtype, device=device) 113 | padded_waveform[:, (n_order - 1):] = waveform 114 | padded_output_waveform = torch.zeros(n_channel, n_sample_padded, dtype=dtype, device=device) 115 | 116 | # Set up the coefficients matrix 117 | # Flip coefficients' order 118 | a_coeffs_flipped = a_coeffs.flip(0) 119 | b_coeffs_flipped = b_coeffs.flip(0) 120 | 121 | # calculate windowed_input_signal in parallel 122 | # create indices of original with shape (n_channel, n_order, n_sample) 123 | window_idxs = torch.arange(n_sample, device=device).unsqueeze(0) + torch.arange(n_order, device=device).unsqueeze(1) 124 | window_idxs = window_idxs.repeat(n_channel, 1, 1) 125 | window_idxs += (torch.arange(n_channel, device=device).unsqueeze(-1).unsqueeze(-1) * n_sample_padded) 126 | window_idxs = window_idxs.long() 127 | # (n_order, ) matmul (n_channel, n_order, n_sample) -> (n_channel, n_sample) 128 | input_signal_windows = torch.matmul(b_coeffs_flipped, torch.take(padded_waveform, window_idxs)) 129 | 130 | input_signal_windows.div_(a_coeffs[0]) 131 | a_coeffs_flipped.div_(a_coeffs[0]) 132 | for i_sample, o0 in enumerate(input_signal_windows.t()): 133 | windowed_output_signal 
= padded_output_waveform[:, i_sample:(i_sample + n_order)] 134 | o0.addmv_(windowed_output_signal, a_coeffs_flipped, alpha=-1) 135 | padded_output_waveform[:, i_sample + n_order - 1] = o0 136 | 137 | output = padded_output_waveform[:, (n_order - 1):] 138 | 139 | if clamp: 140 | output = torch.clamp(output, min=-1., max=1.) 141 | 142 | # unpack batch 143 | output = output.reshape(shape[:-1] + output.shape[-1:]) 144 | 145 | return output 146 | 147 | 148 | 149 | def biquad( 150 | waveform: Tensor, 151 | b0: float, 152 | b1: float, 153 | b2: float, 154 | a0: float, 155 | a1: float, 156 | a2: float 157 | ) -> Tensor: 158 | r"""Perform a biquad filter of input tensor. Initial conditions set to 0. 159 | https://en.wikipedia.org/wiki/Digital_biquad_filter 160 | 161 | Args: 162 | waveform (Tensor): audio waveform of dimension of `(..., time)` 163 | b0 (float): numerator coefficient of current input, x[n] 164 | b1 (float): numerator coefficient of input one time step ago x[n-1] 165 | b2 (float): numerator coefficient of input two time steps ago x[n-2] 166 | a0 (float): denominator coefficient of current output y[n], typically 1 167 | a1 (float): denominator coefficient of current output y[n-1] 168 | a2 (float): denominator coefficient of current output y[n-2] 169 | 170 | Returns: 171 | Tensor: Waveform with dimension of `(..., time)` 172 | """ 173 | 174 | device = waveform.device 175 | dtype = waveform.dtype 176 | 177 | output_waveform = lfilter( 178 | waveform, 179 | torch.tensor([a0, a1, a2], dtype=dtype, device=device), 180 | torch.tensor([b0, b1, b2], dtype=dtype, device=device) 181 | ) 182 | return output_waveform 183 | 184 | 185 | 186 | def _dB2Linear(x: float) -> float: 187 | return math.exp(x * math.log(10) / 20.0) 188 | 189 | 190 | def highpass_biquad( 191 | waveform: Tensor, 192 | sample_rate: int, 193 | cutoff_freq: float, 194 | Q: float = 0.707 195 | ) -> Tensor: 196 | r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation. 197 | 198 | Args: 199 | waveform (Tensor): audio waveform of dimension of `(..., time)` 200 | sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) 201 | cutoff_freq (float): filter cutoff frequency 202 | Q (float, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) 203 | 204 | Returns: 205 | Tensor: Waveform dimension of `(..., time)` 206 | """ 207 | w0 = 2 * math.pi * cutoff_freq / sample_rate 208 | alpha = math.sin(w0) / 2. / Q 209 | 210 | b0 = (1 + math.cos(w0)) / 2 211 | b1 = -1 - math.cos(w0) 212 | b2 = b0 213 | a0 = 1 + alpha 214 | a1 = -2 * math.cos(w0) 215 | a2 = 1 - alpha 216 | return biquad(waveform, b0, b1, b2, a0, a1, a2) 217 | 218 | 219 | 220 | def lowpass_biquad( 221 | waveform: Tensor, 222 | sample_rate: int, 223 | cutoff_freq: float, 224 | Q: float = 0.707 225 | ) -> Tensor: 226 | r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation. 227 | 228 | Args: 229 | waveform (torch.Tensor): audio waveform of dimension of `(..., time)` 230 | sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) 231 | cutoff_freq (float): filter cutoff frequency 232 | Q (float, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) 233 | 234 | Returns: 235 | Tensor: Waveform of dimension of `(..., time)` 236 | """ 237 | w0 = 2 * math.pi * cutoff_freq / sample_rate 238 | alpha = math.sin(w0) / 2 / Q 239 | 240 | b0 = (1 - math.cos(w0)) / 2 241 | b1 = 1 - math.cos(w0) 242 | b2 = b0 243 | a0 = 1 + alpha 244 | a1 = -2 * math.cos(w0) 245 | a2 = 1 - alpha 246 | return biquad(waveform, b0, b1, b2, a0, a1, a2) 247 | 248 | 249 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 250 | n_fft=800, dtype=np.float32, norm=None): 251 | """ 252 | # from librosa 0.6 253 | Compute the sum-square envelope of a window function at a given hop length. 254 | 255 | This is used to estimate modulation effects induced by windowing 256 | observations in short-time fourier transforms. 257 | 258 | Parameters 259 | ---------- 260 | window : string, tuple, number, callable, or list-like 261 | Window specification, as in `get_window` 262 | 263 | n_frames : int > 0 264 | The number of analysis frames 265 | 266 | hop_length : int > 0 267 | The number of samples to advance between frames 268 | 269 | win_length : [optional] 270 | The length of the window function. By default, this matches `n_fft`. 271 | 272 | n_fft : int > 0 273 | The length of each analysis frame. 274 | 275 | dtype : np.dtype 276 | The data type of the output 277 | 278 | Returns 279 | ------- 280 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 281 | The sum-squared envelope of the window function 282 | """ 283 | if win_length is None: 284 | win_length = n_fft 285 | 286 | n = n_fft + hop_length * (n_frames - 1) 287 | x = np.zeros(n, dtype=dtype) 288 | 289 | # Compute the squared window at the desired length 290 | win_sq = get_window(window, win_length, fftbins=True) 291 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 292 | win_sq = librosa_util.pad_center(win_sq, n_fft) 293 | 294 | # Fill the envelope 295 | for i in range(n_frames): 296 | sample = i * hop_length 297 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 298 | return x 299 | 300 | 301 | class MelScale(torch.nn.Module): 302 | r"""Turn a normal STFT into a mel frequency STFT, using a conversion 303 | matrix. This uses triangular filter banks. 304 | 305 | User can control which device the filter bank (`fb`) is (e.g. fb.to(spec_f.device)). 306 | 307 | Args: 308 | n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) 309 | sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) 310 | f_min (float, optional): Minimum frequency. (Default: ``0.``) 311 | f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) 312 | n_stft (int, optional): Number of bins in STFT. Calculated from first input 313 | if None is given. See ``n_fft`` in :class:`Spectrogram`. 
(Default: ``None``) 314 | """ 315 | __constants__ = ['n_mels', 'sample_rate', 'f_min', 'f_max'] 316 | 317 | def __init__(self, 318 | n_mels: int = 128, 319 | sample_rate: int = 24000, 320 | f_min: float = 0., 321 | f_max: Optional[float] = None, 322 | n_stft: Optional[int] = None) -> None: 323 | super(MelScale, self).__init__() 324 | self.n_mels = n_mels 325 | self.sample_rate = sample_rate 326 | self.f_max = f_max if f_max is not None else float(sample_rate // 2) 327 | self.f_min = f_min 328 | 329 | assert f_min <= self.f_max, 'Require f_min: %f < f_max: %f' % (f_min, self.f_max) 330 | 331 | fb = torch.empty(0) if n_stft is None else create_fb_matrix( 332 | n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate) 333 | self.register_buffer('fb', fb) 334 | 335 | def forward(self, specgram: Tensor) -> Tensor: 336 | r""" 337 | Args: 338 | specgram (Tensor): A spectrogram STFT of dimension (..., freq, time). 339 | 340 | Returns: 341 | Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). 342 | """ 343 | 344 | # pack batch 345 | shape = specgram.size() 346 | specgram = specgram.reshape(-1, shape[-2], shape[-1]) 347 | 348 | if self.fb.numel() == 0: 349 | tmp_fb = create_fb_matrix(specgram.size(1), self.f_min, self.f_max, self.n_mels, self.sample_rate) 350 | # Attributes cannot be reassigned outside __init__ so workaround 351 | self.fb.resize_(tmp_fb.size()) 352 | self.fb.copy_(tmp_fb) 353 | 354 | # (channel, frequency, time).transpose(...) dot (frequency, n_mels) 355 | # -> (channel, time, n_mels).transpose(...) 356 | mel_specgram = torch.matmul(specgram.transpose(1, 2), self.fb).transpose(1, 2) 357 | 358 | # unpack batch 359 | mel_specgram = mel_specgram.reshape(shape[:-2] + mel_specgram.shape[-2:]) 360 | 361 | return mel_specgram 362 | 363 | 364 | class TorchSTFT(torch.nn.Module): 365 | def __init__(self, fft_size, hop_size, win_size, 366 | normalized=False, domain='linear', 367 | mel_scale=False, ref_level_db=20, min_level_db=-100): 368 | super().__init__() 369 | self.fft_size = fft_size 370 | self.hop_size = hop_size 371 | self.win_size = win_size 372 | self.ref_level_db = ref_level_db 373 | self.min_level_db = min_level_db 374 | self.window = torch.hann_window(win_size) 375 | self.normalized = normalized 376 | self.domain = domain 377 | self.mel_scale = MelScale(n_mels=(fft_size // 2 + 1), 378 | n_stft=(fft_size // 2 + 1)) if mel_scale else None 379 | 380 | def transform(self, x): 381 | x_stft = torch.stft(x, self.fft_size, self.hop_size, self.win_size, 382 | self.window.type_as(x), normalized=self.normalized) 383 | real = x_stft[..., 0] 384 | imag = x_stft[..., 1] 385 | mag = torch.clamp(real ** 2 + imag ** 2, min=1e-7) 386 | mag = torch.sqrt(mag) 387 | phase = torch.atan2(imag, real) 388 | 389 | if self.mel_scale is not None: 390 | mag = self.mel_scale(mag) 391 | 392 | if self.domain == 'log': 393 | mag = 20 * torch.log10(mag) - self.ref_level_db 394 | mag = torch.clamp((mag - self.min_level_db) / -self.min_level_db, 0, 1) 395 | return mag, phase 396 | elif self.domain == 'linear': 397 | return mag, phase 398 | elif self.domain == 'double': 399 | log_mag = 20 * torch.log10(mag) - self.ref_level_db 400 | log_mag = torch.clamp((log_mag - self.min_level_db) / -self.min_level_db, 0, 1) 401 | return torch.cat((mag, log_mag), dim=1), phase 402 | 403 | def complex(self, x): 404 | x_stft = torch.stft(x, self.fft_size, self.hop_size, self.win_size, 405 | self.window.type_as(x), normalized=self.normalized) 406 | real = x_stft[..., 0] 407 | imag = x_stft[..., 1] 408 | return 
real, imag 409 | 410 | 411 | 412 | class STFT(torch.nn.Module): 413 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 414 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 415 | window='hann'): 416 | super(STFT, self).__init__() 417 | self.filter_length = filter_length 418 | self.hop_length = hop_length 419 | self.win_length = win_length 420 | self.window = window 421 | self.forward_transform = None 422 | scale = self.filter_length / self.hop_length 423 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 424 | 425 | cutoff = int((self.filter_length / 2 + 1)) 426 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 427 | np.imag(fourier_basis[:cutoff, :])]) 428 | 429 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 430 | inverse_basis = torch.FloatTensor( 431 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 432 | 433 | if window is not None: 434 | assert(filter_length >= win_length) 435 | # get window and zero center pad it to filter_length 436 | fft_window = get_window(window, win_length, fftbins=True) 437 | fft_window = pad_center(fft_window, filter_length) 438 | fft_window = torch.from_numpy(fft_window).float() 439 | 440 | # window the bases 441 | forward_basis *= fft_window 442 | inverse_basis *= fft_window 443 | 444 | self.register_buffer('forward_basis', forward_basis.float()) 445 | self.register_buffer('inverse_basis', inverse_basis.float()) 446 | 447 | def transform(self, input_data): 448 | num_batches = input_data.size(0) 449 | num_samples = input_data.size(1) 450 | 451 | self.num_samples = num_samples 452 | 453 | # similar to librosa, reflect-pad the input 454 | input_data = input_data.view(num_batches, 1, num_samples) 455 | input_data = F.pad( 456 | input_data.unsqueeze(1), 457 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 458 | mode='reflect') 459 | input_data = input_data.squeeze(1) 460 | 461 | forward_transform = F.conv1d( 462 | input_data, 463 | Variable(self.forward_basis, requires_grad=False), 464 | stride=self.hop_length, 465 | padding=0) 466 | 467 | cutoff = int((self.filter_length / 2) + 1) 468 | real_part = forward_transform[:, :cutoff, :] 469 | imag_part = forward_transform[:, cutoff:, :] 470 | 471 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 472 | phase = torch.autograd.Variable( 473 | torch.atan2(imag_part.data, real_part.data)) 474 | 475 | return magnitude, phase 476 | 477 | def inverse(self, magnitude, phase): 478 | recombine_magnitude_phase = torch.cat( 479 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 480 | 481 | inverse_transform = F.conv_transpose1d( 482 | recombine_magnitude_phase, 483 | Variable(self.inverse_basis, requires_grad=False), 484 | stride=self.hop_length, 485 | padding=0) 486 | 487 | if self.window is not None: 488 | window_sum = window_sumsquare( 489 | self.window, magnitude.size(-1), hop_length=self.hop_length, 490 | win_length=self.win_length, n_fft=self.filter_length, 491 | dtype=np.float32) 492 | # remove modulation effects 493 | approx_nonzero_indices = torch.from_numpy( 494 | np.where(window_sum > tiny(window_sum))[0]) 495 | window_sum = torch.autograd.Variable( 496 | torch.from_numpy(window_sum), requires_grad=False) 497 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 498 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 499 | 500 | # scale by hop ratio 501 | inverse_transform *= float(self.filter_length) / self.hop_length 502 | 503 | 
inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 504 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 505 | 506 | return inverse_transform 507 | 508 | def forward(self, input_data): 509 | self.magnitude, self.phase = self.transform(input_data) 510 | reconstruction = self.inverse(self.magnitude, self.phase) 511 | return reconstruction 512 | 513 | -------------------------------------------------------------------------------- /infer/__init__.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import librosa 4 | import numpy as np 5 | import torch 6 | import tqdm 7 | from text import npu 8 | 9 | def resize2d_f0(x, target_len): 10 | source = np.array(x) 11 | source[source < 0.001] = np.nan 12 | target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)), 13 | source) 14 | res = np.nan_to_num(target) 15 | return res 16 | 17 | 18 | def preprocess(ds): 19 | note_list = ds["note_seq"] 20 | midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0 21 | for x in note_list.split(" ")] 22 | f0_seq = None 23 | if ds["f0_seq"] is not None: 24 | f0_seq = [float(i.strip()) for i in ds["f0_seq"].split(" ")] 25 | f0_seq = np.array(f0_seq) 26 | phseq = ds["ph_seq"].split(" ") 27 | newphseq = [] 28 | for ph in phseq: 29 | newphseq.append(npu.ttsing_phone_to_int[ph]) 30 | phseq = newphseq 31 | phseq = np.array(phseq) 32 | pitch = 440 * (2 ** ((np.array(midis) - 69) / 12)) 33 | durations = [float(i) for i in ds["ph_dur"].split(" ")] 34 | accu_dur = 0 35 | accu_durs = [] 36 | for dur in durations: 37 | accu_dur += dur 38 | accu_durs.append(accu_dur) 39 | accu_durs = np.array(accu_durs) 40 | accu_durs = (accu_durs * 44100 // 512).astype(int) 41 | sub_durs = np.zeros_like(accu_durs) 42 | sub_durs[1:accu_durs.shape[0]] = accu_durs[:accu_durs.shape[0]-1] 43 | durations = accu_durs-sub_durs 44 | f0_seq = resize2d_f0(f0_seq, sum(durations)) 45 | pos = 0 46 | for i, d in enumerate(durations): 47 | if phseq[i] == 0: 48 | f0_seq[pos:pos + d] = 0 49 | pos += d 50 | 51 | return f0_seq,pitch, phseq, durations 52 | 53 | if __name__ == '__main__': 54 | inp = { 55 | "text": "SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP", 56 | "ph_seq": "SP x ing z ou z ai w ei x ian b ian y van s i0 y i d e g uai d ao SP z i0 y ou d e t iao zh e zh ir j ian sh ang d e w u d ao SP q ing y ing d e x iang an y ing zh ong c ang f u d e b o s i0 m ao d eng d ai x ia y i g e m u u b iao SP", 57 | "note_seq": "rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 A#4 A#4 A4 A4 G4 G4 D4 D4 G4 G4 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 C5 C5 C5 C5 G5 G5 C5 C5 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 C5 C5 B4 B4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 G4 G4 D4 D4 G4 G4 F4 F4 G4 G4 A#4 A#4 C5 C5 C#5 D5 D5 rest", 58 | "note_dur_seq": "0.6 0.136 0.136 0.137 0.137 0.545 0.545 0.546 0.546 0.2720001 0.2720001 0.273 0.273 0.273 0.273 0.2719998 0.2719998 0.546 0.546 0.5450001 0.5450001 0.2730002 0.2730002 0.4089999 0.4089999 0.1370001 0.1359997 0.1359997 0.1360002 0.1360002 0.546 0.546 0.5450001 0.5450001 0.2729998 0.2729998 0.2730002 0.2730002 0.2719998 0.2719998 0.546 0.546 0.2730002 0.2730002 0.5449996 0.5449996 0.6820002 0.6820002 0.1359997 0.1370001 0.1370001 0.1360006 0.1360006 0.5450001 0.5450001 0.5459995 0.5459995 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.3640003 0.3640003 0.1809998 0.1809998 0.3640003 0.3640003 0.1820002 0.1820002 
0.3639994 0.3639994 0.1810007 0.1810007 0.3639994 0.3639994 0.1820002 0.1820002 0.4090004 0.4090004 0.4089994 0.4089994 0.2729998 0.2729998 0.2720003 0.2720003 0.5460005 0.8179989 0.8179989 0.5", 59 | "is_slur_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0", 60 | "ph_dur": "0.3875 0.2125 0.070091 0.065909 0.082455 0.054545 0.474545 0.070455 0.339182 0.206818 0.244727 0.027273 0.207091 0.065909 0.163909 0.109091 0.272 0 0.442591 0.103409 0.447273 0.097727 0.224137 0.048864 0.409 0.088136 0.048864 0.070091 0.065909 0.081455 0.054545 0.452818 0.093182 0.37 0.175 0.103682 0.169318 0.115046 0.157955 0.1845 0.0875 0.475545 0.070455 0.273 0 0.506363 0.038636 0.682 0.054182 0.081818 0.076773 0.060227 0.097364 0.038636 0.354091 0.190909 0.546 0.202545 0.070455 0.168591 0.103409 0.218454 0.054545 0.2765 0.0875 0.148045 0.032955 0.325364 0.038636 0.067227 0.114773 0.270818 0.093182 0.148046 0.032955 0.286727 0.077273 0.057 0.125 0.409 0 0.381727 0.027273 0.152545 0.120455 0.272 0.441653 0.104348 0.817999 0.5", 61 | "f0_timestep": "0.005", 62 | "f0_seq": "587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.2 587.0 586.9 586.7 586.1 585.4 584.8 584.1 583.4 582.9 582.5 582.3 582.5 582.9 583.4 584.1 584.9 585.5 586.1 586.7 587.0 587.3 587.6 587.9 588.0 588.1 588.4 588.7 588.7 588.7 588.0 586.4 584.1 580.8 575.8 568.7 560.8 552.0 540.9 531.0 522.2 513.8 506.6 501.7 497.9 495.0 493.8 493.0 492.6 492.6 492.7 492.7 492.7 492.7 492.7 492.5 492.6 493.2 494.1 495.6 498.7 502.5 507.6 515.5 523.9 532.9 543.2 553.7 562.4 570.3 577.2 581.7 584.6 586.9 588.2 588.7 588.7 588.6 588.3 588.1 588.0 587.8 587.5 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.2 586.9 586.7 587.0 587.0 587.0 587.0 587.5 588.7 590.8 594.1 599.0 607.7 617.7 630.6 647.9 667.1 686.3 706.4 727.1 743.0 755.2 765.1 773.3 778.6 781.6 783.4 784.4 784.4 784.4 784.4 784.7 784.7 784.3 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.1 784.5 784.9 784.4 784.4 784.4 784.4 783.8 782.3 779.9 775.1 768.7 759.5 747.9 731.5 712.9 
694.2 674.0 652.5 636.1 622.4 610.1 601.9 596.0 591.8 589.1 587.8 587.0 587.0 587.0 587.0 586.8 586.8 587.1 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.6 587.9 588.0 588.1 588.5 589.1 589.4 589.4 589.1 588.4 586.8 584.5 581.2 575.9 570.6 564.1 556.0 548.8 542.3 536.2 531.1 527.3 524.8 522.6 521.9 521.5 521.4 521.6 521.9 522.4 522.6 522.6 522.9 523.2 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.6 523.9 524.1 524.4 524.8 525.4 525.8 526.0 526.2 525.7 524.9 523.3 521.1 518.6 515.3 511.3 507.6 504.0 499.9 497.3 495.0 493.1 492.0 491.4 491.1 491.4 491.6 492.1 492.6 492.9 493.2 493.4 493.7 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 494.1 494.3 494.5 494.8 495.1 495.6 496.1 496.4 496.6 496.5 495.8 494.7 493.2 491.0 487.9 484.7 481.2 477.3 473.8 470.9 468.4 466.2 464.8 464.1 463.6 463.7 463.9 464.2 464.7 465.1 465.4 465.6 465.8 466.1 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.4 466.7 466.9 467.2 467.5 468.0 468.4 468.6 468.9 468.3 467.6 466.4 464.4 462.0 459.3 456.0 452.2 449.0 446.0 443.1 441.0 439.5 438.5 437.9 437.5 437.7 437.9 438.4 438.8 439.1 439.3 439.6 439.8 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.3 440.5 440.5 440.7 441.0 441.4 441.5 441.5 441.3 440.6 439.1 437.0 434.2 430.6 426.3 420.5 415.3 410.1 404.6 400.5 397.2 394.5 392.6 391.4 390.9 390.6 390.6 390.8 391.1 391.4 391.5 391.6 391.8 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.2 392.4 392.3 392.2 392.2 392.2 392.1 391.5 390.6 388.6 385.6 381.6 375.9 368.3 360.1 351.0 339.3 329.8 321.3 313.1 306.8 302.4 298.9 296.3 294.9 294.1 293.7 293.5 293.5 293.5 293.5 293.4 293.5 293.6 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.6 293.5 293.4 293.5 293.5 293.5 293.5 293.7 294.3 295.4 297.0 299.5 303.8 308.9 315.3 323.9 333.6 343.2 353.2 363.5 371.5 377.6 382.5 386.6 389.3 390.8 391.7 392.2 392.2 392.2 392.2 392.4 392.3 392.1 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 391.8 391.8 391.6 391.3 390.9 390.5 390.1 389.5 389.1 388.9 389.1 389.6 390.3 391.2 392.8 394.5 397.0 400.6 405.3 411.1 419.5 431.0 443.7 458.9 479.8 497.9 515.2 532.6 546.7 
557.1 565.4 571.7 575.6 577.8 579.1 580.0 580.4 580.8 581.5 582.7 582.9 583.5 584.4 585.1 585.6 586.2 586.8 587.0 587.3 587.7 588.0 588.0 588.2 588.5 588.7 588.7 588.5 587.7 586.3 583.3 579.0 573.7 567.1 558.7 548.3 538.6 529.1 519.2 511.5 505.6 500.7 496.9 494.8 493.6 492.7 492.5 492.6 492.7 492.7 492.7 492.7 492.7 492.5 492.7 493.3 494.5 496.5 499.4 503.7 510.1 517.2 525.5 536.3 546.3 555.5 564.6 572.6 578.1 582.6 585.6 587.3 588.3 588.7 588.7 588.6 588.3 588.0 588.0 587.7 587.4 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.1 586.8 586.8 587.0 587.0 587.0 587.0 587.8 589.1 591.4 595.5 601.9 609.7 619.7 636.1 652.5 670.9 692.6 712.9 730.2 745.9 759.5 768.7 775.1 779.9 782.3 783.8 784.4 784.4 784.4 784.4 784.8 784.5 784.1 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.2 784.6 784.7 784.4 784.4 784.4 784.4 783.5 781.7 778.6 773.8 766.5 755.2 743.0 727.1 706.4 686.3 667.1 649.2 632.8 617.7 607.7 600.4 594.3 590.8 588.9 587.6 587.0 587.0 587.0 587.0 586.7 586.9 587.2 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.7 588.0 588.0 588.2 588.7 589.1 589.4 589.3 589.0 588.2 586.1 583.4 579.6 574.8 569.0 561.3 554.4 547.5 540.1 534.7 530.2 526.6 524.1 522.5 521.7 521.4 521.4 521.6 522.1 522.5 522.6 522.7 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.4 523.7 523.9 524.2 524.5 525.0 525.6 525.9 526.1 526.1 525.5 524.5 522.9 520.7 517.6 514.2 510.6 506.6 502.6 499.4 496.7 494.2 492.7 491.9 491.3 491.2 491.4 491.7 492.3 492.7 493.0 493.2 493.5 493.7 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.6 493.4 493.1 492.9 492.6 492.1 491.6 491.3 491.1 491.5 492.2 493.3 495.2 497.8 500.6 504.0 508.4 512.0 515.6 518.9 521.6 523.6 524.9 525.8 526.3 526.0 525.8 525.3 524.8 524.4 524.1 523.8 523.6 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.5 523.8 524.0 524.4 525.0 525.5 526.0 526.9 527.4 527.7 527.8 527.5 527.0 526.4 525.5 524.5 523.5 522.4 521.3 520.4 519.7 519.2 518.7 518.7 519.0 519.5 520.2 520.8 521.4 521.9 522.4 522.6 522.9 523.2 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 
523.2 522.9 522.6 523.3 523.3 523.5 523.9 524.8 526.3 529.0 533.6 539.5 548.4 560.5 577.8 598.5 620.8 646.5 675.9 700.1 720.9 741.4 755.4 765.4 773.0 778.1 781.1 782.6 783.5 783.9 784.0 784.5 784.7 784.3 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.1 784.5 784.9 784.1 784.0 783.7 783.1 782.0 780.0 775.7 770.5 762.0 748.5 731.9 712.5 688.4 660.8 635.5 611.2 586.0 569.0 555.3 543.8 535.9 531.0 527.6 525.2 524.2 523.7 523.3 523.3 522.9 522.7 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.2 522.9 522.9 522.7 522.2 521.7 521.1 520.5 519.8 519.3 519.0 519.0 519.2 519.7 520.5 521.5 522.7 524.5 526.0 528.0 530.9 534.3 538.4 543.6 549.7 555.5 561.5 568.0 572.4 575.9 578.8 580.8 581.9 582.6 582.6 582.6 582.3 582.0 581.9 582.3 582.7 583.1 583.8 584.6 585.2 585.8 586.4 586.8 587.1 587.4 587.7 588.0 588.0 588.3 588.6 588.7 588.7 588.3 587.3 585.1 582.6 578.1 572.6 564.6 555.5 546.3 536.3 525.5 517.2 510.1 503.7 499.4 496.5 494.5 493.3 492.7 492.5 492.7 492.7 492.7 492.7 492.7 492.6 492.5 492.9 493.6 494.8 497.3 501.0 505.6 511.5 519.2 529.1 538.6 548.3 558.7 567.1 573.7 579.4 583.6 586.0 587.7 588.7 588.7 588.7 588.5 588.2 588.0 587.9 587.6 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.0 586.7 586.9 587.0 587.0 587.0 587.2 588.0 589.4 592.4 597.0 603.3 612.5 625.1 639.3 655.8 678.6 698.1 716.6 735.3 750.3 761.4 770.3 777.1 780.8 782.7 784.0 784.4 784.4 784.4 784.5 784.8 784.4 784.1 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.4 784.7 784.7 784.4 784.4 784.4 784.4 783.1 781.2 777.8 771.8 763.2 752.8 739.1 720.2 702.1 682.4 663.3 643.9 627.9 615.7 605.9 598.0 593.1 590.3 588.3 587.3 587.0 587.0 587.0 586.9 586.7 587.0 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 
587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.4 587.7 588.0 588.0 588.3 588.8 589.2 589.4 589.3 588.8 587.6 585.6 582.8 578.7 573.1 566.7 559.9 552.7 544.8 538.6 533.7 528.8 525.8 523.7 522.3 521.6 521.4 521.4 521.7 522.2 522.5 522.6 522.8 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.5 523.7 524.0 524.3 524.6 525.2 525.7 525.9 526.2 525.9 525.2 524.2 522.4 519.7 516.9 513.5 509.2 505.4 501.9 498.6 495.9 493.9 492.5 491.6 491.1 491.2 491.5 491.9 492.4 492.8 493.1 493.3 493.6 493.8 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 494.2 494.5 494.7 494.9 495.4 495.9 496.3 496.5 496.6 496.2 495.5 493.9 491.9 489.5 486.4 482.6 479.1 475.7 471.9 469.4 467.2 465.5 464.4 463.8 463.5 463.8 464.0 464.5 465.0 465.3 465.5 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.8 467.1 467.5 468.0 468.5 469.1 469.7 470.0 470.2 470.1 469.7 469.2 468.5 467.6 466.7 465.7 464.7 463.9 463.2 462.7 462.2 462.1 462.3 462.7 463.2 463.9 464.4 464.8 465.3 465.5 465.8 466.0 466.2 466.2 466.2 466.4 466.7 466.9 467.3 467.8 468.3 468.9 469.5 469.9 470.2 470.2 469.9 469.4 468.7 468.0 467.0 466.0 465.1 464.2 463.4 462.9 462.4 462.1 462.2 462.5 462.9 463.6 464.2 464.6 465.1 465.5 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.7 467.0 467.4 467.9 468.4 469.0 469.6 470.0 470.2 470.2 469.7 469.2 468.6 467.7 466.7 465.8 464.9 463.9 463.2 462.8 462.3 462.1 462.3 462.6 463.1 463.8 464.3 464.7 465.2 465.5 465.8 466.0 466.2 466.2 466.2 466.4 466.7 466.9 467.3 467.8 468.2 468.8 469.5 469.9 470.2 470.2 469.9 469.4 468.9 468.1 467.1 466.2 465.3 464.2 463.5 462.9 462.5 462.1 462.2 462.4 462.9 463.6 464.1 464.6 465.1 465.4 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.7 466.7 466.9 467.2 467.2 467.2 467.0 466.1 464.4 462.4 458.9 454.5 448.1 440.9 433.6 425.7 417.1 410.5 404.8 399.8 396.4 394.0 392.4 391.5 391.1 391.1 391.1 391.3 391.5 391.5 391.7 391.8 392.0 392.0 392.0 392.2 392.4 392.3 392.2 392.2 392.2 392.1 391.5 390.6 388.6 385.6 381.6 375.9 368.3 360.1 351.0 339.3 329.8 321.3 313.1 306.8 302.4 298.9 296.3 294.9 294.1 293.7 293.5 293.5 293.5 293.5 293.4 293.5 293.6 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.6 293.5 293.3 293.5 293.5 293.5 293.5 293.7 294.2 295.1 296.6 299.0 303.0 307.8 313.9 322.0 331.7 341.2 351.0 362.0 369.9 376.4 382.2 386.3 388.9 390.6 391.7 392.2 392.2 392.2 392.2 392.3 392.4 392.2 392.2 392.0 392.2 392.2 392.4 392.4 392.5 392.8 393.2 393.4 393.4 393.2 392.7 391.5 389.7 387.4 384.2 380.0 375.6 370.9 366.3 361.5 357.5 354.4 351.9 350.0 348.8 348.3 348.1 348.0 348.1 348.4 348.7 348.8 348.8 349.0 349.2 349.2 349.2 349.2 349.2 
349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.0 348.9 348.8 348.7 348.4 348.2 348.0 348.1 348.2 348.7 350.0 351.6 353.9 356.8 360.5 365.4 370.0 374.7 379.8 383.6 386.8 389.5 391.4 392.6 393.1 393.4 393.4 393.2 392.9 392.6 392.4 392.4 392.2 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 391.7 391.5 391.5 391.4 391.2 391.1 391.1 391.2 392.0 393.1 394.8 398.1 402.1 407.1 413.7 421.4 429.0 436.7 445.1 451.4 456.3 460.7 463.6 465.4 466.6 467.2 467.2 467.2 467.0 466.8 466.7 466.6 466.4 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.0 465.8 465.6 465.6 465.3 464.8 464.6 464.6 464.7 465.2 466.4 468.4 470.9 474.6 479.8 485.4 491.4 498.4 504.8 510.2 514.9 519.2 521.7 523.5 524.6 525.0 525.1 525.0 524.6 524.1 523.9 523.9 523.6 523.4 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.2 522.9 522.7 522.4 522.1 521.7 521.2 520.7 520.5 520.2 520.8 521.6 522.9 525.2 528.0 531.1 534.9 539.4 543.3 546.9 550.5 553.1 555.0 556.3 557.1 557.5 557.3 557.0 556.4 555.9 555.5 555.2 554.9 554.6 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.0 553.8 553.5 553.2 552.8 552.2 551.7 551.4 551.2 551.7 552.6 554.0 556.1 558.8 562.0 566.5 570.6 574.7 579.3 582.7 585.5 587.8 589.4 590.2 590.7 590.4 590.1 589.6 589.0 588.5 588.2 587.9 587.6 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.1 587.0 586.9 586.4 585.9 585.3 584.6 583.8 583.2 582.7 582.4 582.4 582.7 583.0 583.6 584.5 585.1 585.7 586.3 586.8 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 
587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0", 63 | "input_type": "phoneme", 64 | "offset": 72.491 65 | } 66 | res = preprocess(inp) 67 | print(res) 68 | print([float(i) for i in res[0]]) 69 | 70 | def cross_fade(a: np.ndarray, b: np.ndarray, idx: int): 71 | result = np.zeros(idx + b.shape[0]) 72 | fade_len = a.shape[0] - idx 73 | np.copyto(dst=result[:idx], src=a[:idx]) 74 | k = np.linspace(0, 1.0, num=fade_len, endpoint=True) 75 | result[idx: a.shape[0]] = (1 - k) * a[idx:] + k * b[: fade_len] 76 | np.copyto(dst=result[a.shape[0]:], src=b[fade_len:]) 77 | return result 78 | 79 | 80 | def infer_ds(model, hps, ds, speaker, trans): 81 | 82 | sample_rate = 44100 83 | 84 | result = np.zeros(0) 85 | current_length = 0 86 | for inp in tqdm.tqdm(ds): 87 | spkid = hps.data.spk2id[speaker] 88 | f0_seq, pitch, phseq, durations = preprocess(inp) 89 | 90 | f0 = torch.FloatTensor(f0_seq).unsqueeze(0) 91 | 92 | text_norm = torch.LongTensor(phseq) 93 | x_tst = text_norm.unsqueeze(0) 94 | x_tst_lengths = torch.LongTensor([text_norm.size(0)]) 95 | spk = torch.LongTensor([spkid]) 96 | manual_f0 = torch.FloatTensor(f0).unsqueeze(0) 97 | manual_dur = torch.LongTensor(durations).unsqueeze(0) 98 | t1 = time.time() 99 | with torch.no_grad(): 100 | infer_res = model.infer(x_tst, x_tst_lengths, None, None, 101 | None, gtdur=manual_dur, spk_id=spk, 102 | F0=manual_f0 * 2 ** (trans / 12)) 103 | seg_audio = infer_res[0][0, 0].data.float().numpy() 104 | try: 105 | offset_ = inp['offset'] 106 | except: 107 | offset_ = 0 108 | silent_length = round(offset_ * sample_rate) - current_length 109 | if silent_length >= 0: 110 | result = np.append(result, np.zeros(silent_length)) 111 | result = np.append(result, seg_audio) 112 | else: 113 | result = cross_fade(result, seg_audio, current_length + silent_length) 114 | current_length = current_length + silent_length + seg_audio.shape[0] 115 | print("infer time:", time.time() - t1) 116 | return result 117 | 118 | 119 | 120 | 121 | # 122 | # midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0 123 | # for x in note_lst] -------------------------------------------------------------------------------- /egs/visinger2/models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | import math 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 8 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 9 | 10 | sys.path.append('../..') 11 | import modules.commons as commons 12 | import modules.modules as modules 13 | import modules.attentions as attentions 14 | 15 | from modules.commons import init_weights, get_padding 16 | from text.npu.symbols import ttsing_phone_set, ttsing_opencpop_pitch_set, ttsing_slur_set 17 | 18 | from modules.ddsp import mlp, gru, scale_function, remove_above_nyquist, upsample 19 | from modules.ddsp import harmonic_synth, amp_to_impulse_response, fft_convolve 20 | from modules.ddsp import resample 21 | 22 | from modules.stft import TorchSTFT 23 | 24 | import torch.distributions as D 25 | 26 | from modules.losses import ( 27 | generator_loss, 28 | discriminator_loss, 29 | feature_loss, 30 | kl_loss 31 | ) 32 | 33 | LRELU_SLOPE = 0.1 34 | 35 | 36 | class DurationPredictor(nn.Module): 37 | def __init__(self, in_channels, 
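# (editorial note) a plain 3-layer Conv1d + LayerNorm + dropout stack; the final
# projection (self.proj below) emits 2 values per input position, and when
# n_speakers != 0 a speaker embedding is mixed in through self.cond in forward().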
filter_channels, kernel_size, p_dropout, n_speakers=0, spk_channels=0): 38 | super().__init__() 39 | 40 | self.in_channels = in_channels 41 | self.filter_channels = filter_channels 42 | self.kernel_size = kernel_size 43 | self.p_dropout = p_dropout 44 | self.spk_channels = spk_channels 45 | 46 | self.drop = nn.Dropout(p_dropout) 47 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) 48 | self.norm_1 = modules.LayerNorm(filter_channels) 49 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) 50 | self.norm_2 = modules.LayerNorm(filter_channels) 51 | self.conv_3 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) 52 | self.norm_3 = modules.LayerNorm(filter_channels) 53 | self.proj = nn.Conv1d(filter_channels, 2, 1) 54 | 55 | if n_speakers != 0: 56 | self.cond = nn.Conv1d(spk_channels, in_channels, 1) 57 | 58 | def forward(self, x, x_mask, spk_emb=None): 59 | # x = torch.detach(x) 60 | if spk_emb is not None: 61 | spk_emb = torch.detach(spk_emb) 62 | x = x + self.cond(spk_emb) 63 | 64 | x = self.conv_1(x * x_mask) 65 | x = torch.relu(x) 66 | x = self.norm_1(x) 67 | x = self.drop(x) 68 | 69 | x = self.conv_2(x * x_mask) 70 | x = torch.relu(x) 71 | x = self.norm_2(x) 72 | x = self.drop(x) 73 | 74 | x = self.conv_3(x * x_mask) 75 | x = torch.relu(x) 76 | x = self.norm_3(x) 77 | x = self.drop(x) 78 | 79 | x = self.proj(x * x_mask) 80 | return x * x_mask 81 | 82 | 83 | class TextEncoder(nn.Module): 84 | def __init__(self, 85 | n_vocab, 86 | out_channels, 87 | hidden_channels, 88 | filter_channels, 89 | n_heads, 90 | n_layers, 91 | kernel_size, 92 | p_dropout): 93 | super().__init__() 94 | self.n_vocab = n_vocab 95 | self.out_channels = out_channels 96 | self.hidden_channels = hidden_channels 97 | self.filter_channels = filter_channels 98 | self.n_heads = n_heads 99 | self.n_layers = n_layers 100 | self.kernel_size = kernel_size 101 | self.p_dropout = p_dropout 102 | 103 | self.emb_phone = nn.Embedding(len(ttsing_phone_set), 256) 104 | nn.init.normal_(self.emb_phone.weight, 0.0, 256 ** -0.5) 105 | 106 | self.pre_net = torch.nn.Linear(256, hidden_channels) 107 | 108 | self.encoder = attentions.Encoder( 109 | hidden_channels, 110 | filter_channels, 111 | n_heads, 112 | n_layers, 113 | kernel_size, 114 | p_dropout) 115 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 116 | 117 | def forward(self, phone, phone_lengths, pitchid, dur, slur): 118 | phone_end = self.emb_phone(phone) * math.sqrt(256) 119 | x = phone_end 120 | 121 | x = self.pre_net(x) 122 | x = torch.transpose(x, 1, -1) # [b, h, t] 123 | 124 | x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to(x.dtype) 125 | 126 | x = self.encoder(x * x_mask, x_mask) 127 | x = self.proj(x) * x_mask 128 | 129 | return x, x_mask 130 | 131 | 132 | def pad_v2(input_ele, mel_max_length=None): 133 | if mel_max_length: 134 | max_len = mel_max_length 135 | else: 136 | max_len = max([input_ele[i].size(0) for i in range(len(input_ele))]) 137 | 138 | out_list = list() 139 | for i, batch in enumerate(input_ele): 140 | if len(batch.shape) == 1: 141 | one_batch_padded = F.pad( 142 | batch, (0, max_len - batch.size(0)), "constant", 0.0 143 | ) 144 | elif len(batch.shape) == 2: 145 | one_batch_padded = F.pad( 146 | batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0 147 | ) 148 | out_list.append(one_batch_padded) 149 | out_padded = torch.stack(out_list) 150 | return out_padded 151 | 152 | 153 | class 
LengthRegulator(nn.Module): 154 | """ Length Regulator """ 155 | 156 | def __init__(self): 157 | super(LengthRegulator, self).__init__() 158 | 159 | def LR(self, x, duration, max_len): 160 | x = torch.transpose(x, 1, 2) 161 | output = list() 162 | mel_len = list() 163 | for batch, expand_target in zip(x, duration): 164 | expanded = self.expand(batch, expand_target) 165 | output.append(expanded) 166 | mel_len.append(expanded.shape[0]) 167 | 168 | if max_len is not None: 169 | output = pad_v2(output, max_len) 170 | else: 171 | output = pad_v2(output) 172 | output = torch.transpose(output, 1, 2) 173 | return output, torch.LongTensor(mel_len) 174 | 175 | def expand(self, batch, predicted): 176 | predicted = torch.squeeze(predicted) 177 | out = list() 178 | 179 | for i, vec in enumerate(batch): 180 | expand_size = predicted[i].item() 181 | state_info_index = torch.unsqueeze(torch.arange(0, expand_size), 1).float() 182 | state_info_length = torch.unsqueeze(torch.Tensor([expand_size] * expand_size), 1).float() 183 | state_info = torch.cat([state_info_index, state_info_length], 1).to(vec.device) 184 | new_vec = vec.expand(max(int(expand_size), 0), -1) 185 | new_vec = torch.cat([new_vec, state_info], 1) 186 | out.append(new_vec) 187 | out = torch.cat(out, 0) 188 | return out 189 | 190 | def forward(self, x, duration, max_len): 191 | output, mel_len = self.LR(x, duration, max_len) 192 | return output, mel_len 193 | 194 | 195 | class PriorDecoder(nn.Module): 196 | def __init__(self, 197 | out_bn_channels, 198 | hidden_channels, 199 | filter_channels, 200 | n_heads, 201 | n_layers, 202 | kernel_size, 203 | p_dropout, 204 | n_speakers=0, 205 | spk_channels=0): 206 | super().__init__() 207 | self.out_bn_channels = out_bn_channels 208 | self.hidden_channels = hidden_channels 209 | self.filter_channels = filter_channels 210 | self.n_heads = n_heads 211 | self.n_layers = n_layers 212 | self.kernel_size = kernel_size 213 | self.p_dropout = p_dropout 214 | self.spk_channels = spk_channels 215 | 216 | self.prenet = nn.Conv1d(hidden_channels + 2, hidden_channels, 3, padding=1) 217 | self.decoder = attentions.FFT( 218 | hidden_channels, 219 | filter_channels, 220 | n_heads, 221 | n_layers, 222 | kernel_size, 223 | p_dropout) 224 | self.proj = nn.Conv1d(hidden_channels, out_bn_channels, 1) 225 | 226 | if n_speakers != 0: 227 | self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) 228 | 229 | def forward(self, x, x_lengths, spk_emb=None): 230 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 231 | 232 | x = self.prenet(x) * x_mask 233 | 234 | if (spk_emb is not None): 235 | x = x + self.cond(spk_emb) 236 | 237 | x = self.decoder(x * x_mask, x_mask) 238 | 239 | bn = self.proj(x) * x_mask 240 | 241 | return bn, x_mask 242 | 243 | 244 | class Decoder(nn.Module): 245 | def __init__(self, 246 | out_channels, 247 | hidden_channels, 248 | filter_channels, 249 | n_heads, 250 | n_layers, 251 | kernel_size, 252 | p_dropout, 253 | n_speakers=0, 254 | spk_channels=0): 255 | super().__init__() 256 | self.out_channels = out_channels 257 | self.hidden_channels = hidden_channels 258 | self.filter_channels = filter_channels 259 | self.n_heads = n_heads 260 | self.n_layers = n_layers 261 | self.kernel_size = kernel_size 262 | self.p_dropout = p_dropout 263 | self.spk_channels = spk_channels 264 | 265 | self.prenet = nn.Conv1d(hidden_channels + 2, hidden_channels, 3, padding=1) 266 | self.decoder = attentions.FFT( 267 | hidden_channels, 268 | filter_channels, 269 | n_heads, 270 | n_layers, 271 
| kernel_size, 272 | p_dropout) 273 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 274 | 275 | if n_speakers != 0: 276 | self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) 277 | 278 | def forward(self, x, x_lengths, spk_emb=None): 279 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 280 | 281 | x = self.prenet(x) * x_mask 282 | 283 | if (spk_emb is not None): 284 | x = x + self.cond(spk_emb) 285 | 286 | x = self.decoder(x * x_mask, x_mask) 287 | 288 | x = self.proj(x) * x_mask 289 | 290 | return x, x_mask 291 | 292 | 293 | class ConvReluNorm(nn.Module): 294 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 295 | super().__init__() 296 | self.in_channels = in_channels 297 | self.hidden_channels = hidden_channels 298 | self.out_channels = out_channels 299 | self.kernel_size = kernel_size 300 | self.n_layers = n_layers 301 | self.p_dropout = p_dropout 302 | assert n_layers > 1, "Number of layers should be larger than 0." 303 | 304 | self.conv_layers = nn.ModuleList() 305 | self.norm_layers = nn.ModuleList() 306 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) 307 | self.norm_layers.append(LayerNorm(hidden_channels)) 308 | self.relu_drop = nn.Sequential( 309 | nn.ReLU(), 310 | nn.Dropout(p_dropout)) 311 | for _ in range(n_layers - 1): 312 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) 313 | self.norm_layers.append(LayerNorm(hidden_channels)) 314 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 315 | self.proj.weight.data.zero_() 316 | self.proj.bias.data.zero_() 317 | 318 | def forward(self, x): 319 | x = self.conv_layers[0](x) 320 | x = self.norm_layers[0](x) 321 | x = self.relu_drop(x) 322 | 323 | for i in range(1, self.n_layers): 324 | x_ = self.conv_layers[i](x) 325 | x_ = self.norm_layers[i](x_) 326 | x_ = self.relu_drop(x_) 327 | x = (x + x_) / 2 328 | x = self.proj(x) 329 | return x 330 | 331 | 332 | class PosteriorEncoder(nn.Module): 333 | def __init__(self, 334 | hps, 335 | in_channels, 336 | out_channels, 337 | hidden_channels, 338 | kernel_size, 339 | dilation_rate, 340 | n_layers): 341 | super().__init__() 342 | self.in_channels = in_channels 343 | self.out_channels = out_channels 344 | self.hidden_channels = hidden_channels 345 | self.kernel_size = kernel_size 346 | self.dilation_rate = dilation_rate 347 | self.n_layers = n_layers 348 | 349 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 350 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=hps.data.n_speakers, spk_channels=hps.model.spk_channels) 351 | # self.enc = ConvReluNorm(hidden_channels, 352 | # hidden_channels, 353 | # hidden_channels, 354 | # kernel_size, 355 | # n_layers, 356 | # 0.1) 357 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 358 | 359 | def forward(self, x, x_lengths, g=None): 360 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 361 | x = self.pre(x) * x_mask 362 | x = self.enc(x, x_mask, g=g) 363 | stats = self.proj(x) * x_mask 364 | return stats, x_mask 365 | 366 | 367 | class ResBlock3(torch.nn.Module): 368 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 369 | super(ResBlock3, self).__init__() 370 | self.convs = nn.ModuleList([ 371 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 372 | padding=get_padding(kernel_size, dilation[0]))) 
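# (editorial note) a single dilated conv per block; the residual add (x = xt + x)
# is applied in forward() below, making this a lighter variant of the other
# ResBlock modules used elsewhere in this repo.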
373 | ]) 374 | self.convs.apply(init_weights) 375 | 376 | def forward(self, x, x_mask=None): 377 | for c in self.convs: 378 | xt = F.leaky_relu(x, LRELU_SLOPE) 379 | if x_mask is not None: 380 | xt = xt * x_mask 381 | xt = c(xt) 382 | x = xt + x 383 | if x_mask is not None: 384 | x = x * x_mask 385 | return x 386 | 387 | def remove_weight_norm(self): 388 | for l in self.convs: 389 | remove_weight_norm(l) 390 | 391 | 392 | class Generator_Harm(torch.nn.Module): 393 | def __init__(self, hps): 394 | super(Generator_Harm, self).__init__() 395 | self.hps = hps 396 | 397 | self.prenet = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1) 398 | 399 | self.net = ConvReluNorm(hps.model.hidden_channels, 400 | hps.model.hidden_channels, 401 | hps.model.hidden_channels, 402 | hps.model.kernel_size, 403 | 8, 404 | hps.model.p_dropout) 405 | 406 | # self.rnn = nn.LSTM(input_size=hps.model.hidden_channels, 407 | # hidden_size=hps.model.hidden_channels, 408 | # num_layers=1, 409 | # bias=True, 410 | # batch_first=True, 411 | # dropout=0.5, 412 | # bidirectional=True) 413 | self.postnet = Conv1d(hps.model.hidden_channels, hps.model.n_harmonic + 1, 3, padding=1) 414 | 415 | def forward(self, f0, harm, mask): 416 | pitch = f0.transpose(1, 2) 417 | harm = self.prenet(harm) 418 | 419 | harm = self.net(harm) * mask 420 | # harm = harm.transpose(1, 2) 421 | # harm, (hs, hc) = self.rnn(harm) 422 | # harm = harm.transpose(1, 2) 423 | 424 | harm = self.postnet(harm) 425 | harm = harm.transpose(1, 2) 426 | param = harm 427 | 428 | param = scale_function(param) 429 | total_amp = param[..., :1] 430 | amplitudes = param[..., 1:] 431 | amplitudes = remove_above_nyquist( 432 | amplitudes, 433 | pitch, 434 | self.hps.data.sample_rate, 435 | ) 436 | amplitudes /= amplitudes.sum(-1, keepdim=True) 437 | amplitudes *= total_amp 438 | 439 | amplitudes = upsample(amplitudes, self.hps.data.hop_size) 440 | pitch = upsample(pitch, self.hps.data.hop_size) 441 | 442 | n_harmonic = amplitudes.shape[-1] 443 | omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1) 444 | omegas = omega * torch.arange(1, n_harmonic + 1).to(omega) 445 | signal_harmonics = (torch.sin(omegas) * amplitudes) 446 | signal_harmonics = signal_harmonics.transpose(1, 2) 447 | return signal_harmonics 448 | 449 | 450 | class Generator(torch.nn.Module): 451 | def __init__(self, hps, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, 452 | upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, spk_channels=0): 453 | super(Generator, self).__init__() 454 | self.num_kernels = len(resblock_kernel_sizes) 455 | self.num_upsamples = len(upsample_rates) 456 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 457 | self.upsample_rates = upsample_rates 458 | self.n_speakers = n_speakers 459 | 460 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 461 | 462 | self.downs = nn.ModuleList() 463 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 464 | i = len(upsample_rates) - 1 - i 465 | u = upsample_rates[i] 466 | k = upsample_kernel_sizes[i] 467 | # print("down: ",upsample_initial_channel//(2**(i+1))," -> ", upsample_initial_channel//(2**i)) 468 | self.downs.append(weight_norm( 469 | Conv1d(hps.model.n_harmonic + 2, hps.model.n_harmonic + 2, 470 | k, u, padding=k // 2))) 471 | 472 | self.resblocks_downs = nn.ModuleList() 473 | for i in range(len(self.downs)): 474 | j = len(upsample_rates) - 1 - i 475 |
self.resblocks_downs.append(ResBlock3(hps.model.n_harmonic + 2, 3, (1, 3))) 476 | 477 | self.concat_pre = Conv1d(upsample_initial_channel + hps.model.n_harmonic + 2, upsample_initial_channel, 3, 1, 478 | padding=1) 479 | self.concat_conv = nn.ModuleList() 480 | for i in range(len(upsample_rates)): 481 | ch = upsample_initial_channel // (2 ** (i + 1)) 482 | self.concat_conv.append(Conv1d(ch + hps.model.n_harmonic + 2, ch, 3, 1, padding=1, bias=False)) 483 | 484 | self.ups = nn.ModuleList() 485 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 486 | self.ups.append(weight_norm( 487 | ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)), 488 | k, u, padding=(k - u) // 2))) 489 | 490 | self.resblocks = nn.ModuleList() 491 | for i in range(len(self.ups)): 492 | ch = upsample_initial_channel // (2 ** (i + 1)) 493 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 494 | self.resblocks.append(resblock(ch, k, d)) 495 | 496 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 497 | self.ups.apply(init_weights) 498 | 499 | if self.n_speakers != 0: 500 | self.cond = nn.Conv1d(spk_channels, upsample_initial_channel, 1) 501 | 502 | def forward(self, x, ddsp, g=None): 503 | 504 | x = self.conv_pre(x) 505 | 506 | if g is not None: 507 | x = x + self.cond(g) 508 | 509 | se = ddsp 510 | res_features = [se] 511 | for i in range(self.num_upsamples): 512 | in_size = se.size(2) 513 | se = self.downs[i](se) 514 | se = self.resblocks_downs[i](se) 515 | up_rate = self.upsample_rates[self.num_upsamples - 1 - i] 516 | se = se[:, :, : in_size // up_rate] 517 | res_features.append(se) 518 | 519 | x = torch.cat([x, se], 1) 520 | x = self.concat_pre(x) 521 | 522 | for i in range(self.num_upsamples): 523 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 524 | in_size = x.size(2) 525 | x = self.ups[i](x) 526 | # keep the output length consistent: trim the extra samples produced by the transposed conv 527 | x = x[:, :, : in_size * self.upsample_rates[i]] 528 | 529 | x = torch.cat([x, res_features[self.num_upsamples - 1 - i]], 1) 530 | x = self.concat_conv[i](x) 531 | 532 | xs = None 533 | for j in range(self.num_kernels): 534 | if xs is None: 535 | xs = self.resblocks[i * self.num_kernels + j](x) 536 | else: 537 | xs += self.resblocks[i * self.num_kernels + j](x) 538 | x = xs / self.num_kernels 539 | 540 | x = F.leaky_relu(x) 541 | x = self.conv_post(x) 542 | x = torch.tanh(x) 543 | 544 | return x 545 | 546 | def remove_weight_norm(self): 547 | print('Removing weight norm...') 548 | for l in self.ups: 549 | remove_weight_norm(l) 550 | for l in self.resblocks: 551 | l.remove_weight_norm() 552 | 553 | 554 | class Generator_Noise(torch.nn.Module): 555 | def __init__(self, hps): 556 | super(Generator_Noise, self).__init__() 557 | self.hps = hps 558 | self.win_size = hps.data.win_size 559 | self.hop_size = hps.data.hop_size 560 | self.fft_size = hps.data.n_fft 561 | self.istft_pre = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1) 562 | 563 | self.net = ConvReluNorm(hps.model.hidden_channels, 564 | hps.model.hidden_channels, 565 | hps.model.hidden_channels, 566 | hps.model.kernel_size, 567 | 8, 568 | hps.model.p_dropout) 569 | 570 | self.istft_amplitude = torch.nn.Conv1d(hps.model.hidden_channels, self.fft_size // 2 + 1, 1, 1) 571 | self.window = torch.hann_window(self.win_size) 572 | 573 | def forward(self, x, mask): 574 | istft_x = x 575 | istft_x = self.istft_pre(istft_x) 576 | 577 | istft_x = self.net(istft_x) * mask 578 | 579 | amp =
self.istft_amplitude(istft_x).unsqueeze(-1) 580 | phase = (torch.rand(amp.shape) * 2 * 3.14 - 3.14).to(amp) 581 | 582 | real = amp * torch.cos(phase) 583 | imag = amp * torch.sin(phase) 584 | spec = torch.cat([real, imag], 3) 585 | istft_x = torch.istft(spec, self.fft_size, self.hop_size, self.win_size, self.window.to(amp), True, 586 | length=x.shape[2] * self.hop_size, return_complex=False) 587 | 588 | return istft_x.unsqueeze(1) 589 | 590 | 591 | class LayerNorm(nn.Module): 592 | def __init__(self, channels, eps=1e-5): 593 | super().__init__() 594 | self.channels = channels 595 | self.eps = eps 596 | 597 | self.gamma = nn.Parameter(torch.ones(channels)) 598 | self.beta = nn.Parameter(torch.zeros(channels)) 599 | 600 | def forward(self, x): 601 | x = x.transpose(1, -1) 602 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 603 | return x.transpose(1, -1) 604 | 605 | 606 | class DiscriminatorP(torch.nn.Module): 607 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 608 | super(DiscriminatorP, self).__init__() 609 | self.period = period 610 | self.use_spectral_norm = use_spectral_norm 611 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 612 | self.convs = nn.ModuleList([ 613 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 614 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 615 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 616 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 617 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 618 | ]) 619 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 620 | 621 | def forward(self, x): 622 | fmap = [] 623 | 624 | # 1d to 2d 625 | b, c, t = x.shape 626 | if t % self.period != 0: # pad first 627 | n_pad = self.period - (t % self.period) 628 | x = F.pad(x, (0, n_pad), "reflect") 629 | t = t + n_pad 630 | x = x.view(b, c, t // self.period, self.period) 631 | 632 | for l in self.convs: 633 | x = l(x) 634 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 635 | fmap.append(x) 636 | x = self.conv_post(x) 637 | fmap.append(x) 638 | x = torch.flatten(x, 1, -1) 639 | 640 | return x, fmap 641 | 642 | 643 | class DiscriminatorS(torch.nn.Module): 644 | def __init__(self, use_spectral_norm=False): 645 | super(DiscriminatorS, self).__init__() 646 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 647 | self.convs = nn.ModuleList([ 648 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 649 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 650 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 651 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 652 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 653 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 654 | ]) 655 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 656 | 657 | def forward(self, x): 658 | fmap = [] 659 | 660 | for l in self.convs: 661 | x = l(x) 662 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 663 | fmap.append(x) 664 | x = self.conv_post(x) 665 | fmap.append(x) 666 | x = torch.flatten(x, 1, -1) 667 | 668 | return x, fmap 669 | 670 | 671 | class MultiFrequencyDiscriminator(nn.Module): 672 | def __init__(self, 673 | hop_lengths=[128, 256, 512], 674 | hidden_channels=[256, 512, 512], 675 | domain='double', mel_scale=True): 676 
| super(MultiFrequencyDiscriminator, self).__init__() 677 | 678 | self.stfts = nn.ModuleList([ 679 | TorchSTFT(fft_size=x * 4, hop_size=x, win_size=x * 4, 680 | normalized=True, domain=domain, mel_scale=mel_scale) 681 | for x in hop_lengths]) 682 | 683 | self.domain = domain 684 | if domain == 'double': 685 | self.discriminators = nn.ModuleList([ 686 | BaseFrequenceDiscriminator(2, c) 687 | for x, c in zip(hop_lengths, hidden_channels)]) 688 | else: 689 | self.discriminators = nn.ModuleList([ 690 | BaseFrequenceDiscriminator(1, c) 691 | for x, c in zip(hop_lengths, hidden_channels)]) 692 | 693 | def forward(self, x): 694 | scores, feats = list(), list() 695 | for stft, layer in zip(self.stfts, self.discriminators): 696 | # print(stft) 697 | mag, phase = stft.transform(x.squeeze()) 698 | if self.domain == 'double': 699 | mag = torch.stack(torch.chunk(mag, 2, dim=1), dim=1) 700 | else: 701 | mag = mag.unsqueeze(1) 702 | 703 | score, feat = layer(mag) 704 | scores.append(score) 705 | feats.append(feat) 706 | return scores, feats 707 | 708 | 709 | class BaseFrequenceDiscriminator(nn.Module): 710 | def __init__(self, in_channels, hidden_channels=512): 711 | super(BaseFrequenceDiscriminator, self).__init__() 712 | 713 | self.discriminator = nn.ModuleList() 714 | self.discriminator += [ 715 | nn.Sequential( 716 | nn.ReflectionPad2d((1, 1, 1, 1)), 717 | nn.utils.weight_norm(nn.Conv2d( 718 | in_channels, hidden_channels // 32, 719 | kernel_size=(3, 3), stride=(1, 1))) 720 | ), 721 | nn.Sequential( 722 | nn.LeakyReLU(0.2, True), 723 | nn.ReflectionPad2d((1, 1, 1, 1)), 724 | nn.utils.weight_norm(nn.Conv2d( 725 | hidden_channels // 32, hidden_channels // 16, 726 | kernel_size=(3, 3), stride=(2, 2))) 727 | ), 728 | nn.Sequential( 729 | nn.LeakyReLU(0.2, True), 730 | nn.ReflectionPad2d((1, 1, 1, 1)), 731 | nn.utils.weight_norm(nn.Conv2d( 732 | hidden_channels // 16, hidden_channels // 8, 733 | kernel_size=(3, 3), stride=(1, 1))) 734 | ), 735 | nn.Sequential( 736 | nn.LeakyReLU(0.2, True), 737 | nn.ReflectionPad2d((1, 1, 1, 1)), 738 | nn.utils.weight_norm(nn.Conv2d( 739 | hidden_channels // 8, hidden_channels // 4, 740 | kernel_size=(3, 3), stride=(2, 2))) 741 | ), 742 | nn.Sequential( 743 | nn.LeakyReLU(0.2, True), 744 | nn.ReflectionPad2d((1, 1, 1, 1)), 745 | nn.utils.weight_norm(nn.Conv2d( 746 | hidden_channels // 4, hidden_channels // 2, 747 | kernel_size=(3, 3), stride=(1, 1))) 748 | ), 749 | nn.Sequential( 750 | nn.LeakyReLU(0.2, True), 751 | nn.ReflectionPad2d((1, 1, 1, 1)), 752 | nn.utils.weight_norm(nn.Conv2d( 753 | hidden_channels // 2, hidden_channels, 754 | kernel_size=(3, 3), stride=(2, 2))) 755 | ), 756 | nn.Sequential( 757 | nn.LeakyReLU(0.2, True), 758 | nn.ReflectionPad2d((1, 1, 1, 1)), 759 | nn.utils.weight_norm(nn.Conv2d( 760 | hidden_channels, 1, 761 | kernel_size=(3, 3), stride=(1, 1))) 762 | ) 763 | ] 764 | 765 | def forward(self, x): 766 | hiddens = [] 767 | for layer in self.discriminator: 768 | x = layer(x) 769 | hiddens.append(x) 770 | return x, hiddens[-1] 771 | 772 | 773 | class Discriminator(torch.nn.Module): 774 | def __init__(self, hps, use_spectral_norm=False): 775 | super(Discriminator, self).__init__() 776 | periods = [2, 3, 5, 7, 11] 777 | 778 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 779 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 780 | self.discriminators = nn.ModuleList(discs) 781 | self.disc_multfrequency = MultiFrequencyDiscriminator(hop_lengths=[int(hps.data.sample_rate * 2.5 / 1000), 782 | 
int(hps.data.sample_rate * 5 / 1000), 783 | int(hps.data.sample_rate * 7.5 / 1000), 784 | int(hps.data.sample_rate * 10 / 1000), 785 | int(hps.data.sample_rate * 12.5 / 1000), 786 | int(hps.data.sample_rate * 15 / 1000)], 787 | hidden_channels=[256, 256, 256, 256, 256]) 788 | 789 | def forward(self, y, y_hat): 790 | y_d_rs = [] 791 | y_d_gs = [] 792 | fmap_rs = [] 793 | fmap_gs = [] 794 | for i, d in enumerate(self.discriminators): 795 | y_d_r, fmap_r = d(y) 796 | y_d_g, fmap_g = d(y_hat) 797 | y_d_rs.append(y_d_r) 798 | y_d_gs.append(y_d_g) 799 | fmap_rs.append(fmap_r) 800 | fmap_gs.append(fmap_g) 801 | scores_r, fmaps_r = self.disc_multfrequency(y) 802 | scores_g, fmaps_g = self.disc_multfrequency(y_hat) 803 | for i in range(len(scores_r)): 804 | y_d_rs.append(scores_r[i]) 805 | y_d_gs.append(scores_g[i]) 806 | fmap_rs.append(fmaps_r[i]) 807 | fmap_gs.append(fmaps_g[i]) 808 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 809 | 810 | 811 | class SynthesizerTrn(nn.Module): 812 | """ 813 | Model 814 | """ 815 | 816 | def __init__(self, hps): 817 | super().__init__() 818 | self.hps = hps 819 | 820 | self.text_encoder = TextEncoder( 821 | len(ttsing_phone_set), 822 | hps.model.prior_hidden_channels, 823 | hps.model.prior_hidden_channels, 824 | hps.model.prior_filter_channels, 825 | hps.model.prior_n_heads, 826 | hps.model.prior_n_layers, 827 | hps.model.prior_kernel_size, 828 | hps.model.prior_p_dropout) 829 | 830 | self.decoder = PriorDecoder( 831 | hps.model.hidden_channels * 2, 832 | hps.model.prior_hidden_channels, 833 | hps.model.prior_filter_channels, 834 | hps.model.prior_n_heads, 835 | hps.model.prior_n_layers, 836 | hps.model.prior_kernel_size, 837 | hps.model.prior_p_dropout, 838 | n_speakers=hps.data.n_speakers, 839 | spk_channels=hps.model.spk_channels 840 | ) 841 | 842 | self.f0_decoder = Decoder( 843 | 1, 844 | hps.model.prior_hidden_channels, 845 | hps.model.prior_filter_channels, 846 | hps.model.prior_n_heads, 847 | hps.model.prior_n_layers, 848 | hps.model.prior_kernel_size, 849 | hps.model.prior_p_dropout, 850 | n_speakers=hps.data.n_speakers, 851 | spk_channels=hps.model.spk_channels 852 | ) 853 | 854 | self.mel_decoder = Decoder( 855 | hps.data.acoustic_dim, 856 | hps.model.prior_hidden_channels, 857 | hps.model.prior_filter_channels, 858 | hps.model.prior_n_heads, 859 | hps.model.prior_n_layers, 860 | hps.model.prior_kernel_size, 861 | hps.model.prior_p_dropout, 862 | n_speakers=hps.data.n_speakers, 863 | spk_channels=hps.model.spk_channels 864 | ) 865 | 866 | self.posterior_encoder = PosteriorEncoder( 867 | hps, 868 | hps.data.acoustic_dim, 869 | hps.model.hidden_channels, 870 | hps.model.hidden_channels, 3, 1, 8) 871 | 872 | self.dropout = nn.Dropout(0.2) 873 | 874 | self.duration_predictor = DurationPredictor( 875 | hps.model.prior_hidden_channels, 876 | hps.model.prior_hidden_channels, 877 | 3, 878 | 0.5, 879 | n_speakers=hps.data.n_speakers, 880 | spk_channels=hps.model.spk_channels) 881 | self.LR = LengthRegulator() 882 | 883 | self.dec = Generator(hps, 884 | hps.model.hidden_channels, 885 | hps.model.resblock, 886 | hps.model.resblock_kernel_sizes, 887 | hps.model.resblock_dilation_sizes, 888 | hps.model.upsample_rates, 889 | hps.model.upsample_initial_channel, 890 | hps.model.upsample_kernel_sizes, 891 | n_speakers=hps.data.n_speakers, 892 | spk_channels=hps.model.spk_channels) 893 | 894 | self.dec_harm = Generator_Harm(hps) 895 | 896 | self.dec_noise = Generator_Noise(hps) 897 | 898 | self.f0_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels + 2, 3, padding=1) 899 
| self.energy_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels + 2, 3, padding=1) 900 | self.mel_prenet = nn.Conv1d(hps.data.acoustic_dim, hps.model.prior_hidden_channels + 2, 3, padding=1) 901 | 902 | if hps.data.n_speakers > 1: 903 | self.emb_spk = nn.Embedding(hps.data.n_speakers, hps.model.spk_channels) 904 | self.flow = modules.ResidualCouplingBlock(hps.model.prior_hidden_channels, hps.model.hidden_channels, 5, 1, 4,n_speakers=hps.data.n_speakers, gin_channels=hps.model.spk_channels) 905 | 906 | def forward(self, phone, phone_lengths, pitchid, dur, slur, gtdur, F0, mel, bn_lengths, spk_id=None): 907 | if self.hps.data.n_speakers > 0: 908 | g = self.emb_spk(spk_id).unsqueeze(-1) # [b, h, 1] 909 | else: 910 | g = None 911 | 912 | # Encoder 913 | x, x_mask = self.text_encoder(phone, phone_lengths, pitchid, dur, slur) 914 | 915 | # LR 916 | decoder_input, mel_len = self.LR(x, gtdur, None) 917 | 918 | LF0 = 2595. * torch.log10(1. + F0 / 700.) 919 | LF0 = LF0 / 500 920 | 921 | # aam 922 | predict_mel, predict_bn_mask = self.mel_decoder(decoder_input + self.f0_prenet(LF0), bn_lengths, spk_emb=g) 923 | 924 | predict_energy = predict_mel.detach().sum(1).unsqueeze(1) / self.hps.data.acoustic_dim 925 | 926 | decoder_input = decoder_input + \ 927 | self.f0_prenet(LF0) + \ 928 | self.energy_prenet(predict_energy) + \ 929 | self.mel_prenet(predict_mel.detach()) 930 | decoder_output, predict_bn_mask = self.decoder(decoder_input, bn_lengths, spk_emb=g) 931 | 932 | prior_info = decoder_output 933 | m_p = prior_info[:, :self.hps.model.hidden_channels, :] 934 | logs_p = prior_info[:, self.hps.model.hidden_channels:, :] 935 | 936 | # posterior 937 | posterior, y_mask = self.posterior_encoder(mel, bn_lengths,g=g) 938 | m_q = posterior[:, :self.hps.model.hidden_channels, :] 939 | logs_q = posterior[:, self.hps.model.hidden_channels:, :] 940 | z = (m_q + torch.randn_like(m_q) * torch.exp(logs_q)) * y_mask 941 | z_p = self.flow(z, y_mask, g=g) 942 | 943 | # kl loss 944 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, y_mask) 945 | 946 | p_z = z 947 | p_z = self.dropout(p_z) 948 | 949 | pitch = upsample(F0.transpose(1, 2), self.hps.data.hop_size) 950 | omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1) 951 | sin = torch.sin(omega).transpose(1, 2) 952 | 953 | # dsp synthesize 954 | noise_x = self.dec_noise(p_z, y_mask) 955 | harm_x = self.dec_harm(F0, p_z, y_mask) 956 | 957 | # dsp waveform 958 | dsp_o = torch.cat([harm_x, noise_x], axis=1) 959 | 960 | decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1) 961 | 962 | # dsp based HiFiGAN vocoder 963 | x_slice, ids_slice = commons.rand_slice_segments(p_z, bn_lengths, 964 | self.hps.train.segment_size // self.hps.data.hop_size) 965 | F0_slice = commons.slice_segments(F0, ids_slice, self.hps.train.segment_size // self.hps.data.hop_size) 966 | dsp_slice = commons.slice_segments(dsp_o, ids_slice * self.hps.data.hop_size, self.hps.train.segment_size) 967 | condition_slice = commons.slice_segments(decoder_condition, ids_slice * self.hps.data.hop_size, 968 | self.hps.train.segment_size) 969 | o = self.dec(x_slice, condition_slice.detach(), g=g) 970 | 971 | return o, ids_slice, LF0 * predict_bn_mask, dsp_slice.sum(1), loss_kl, predict_mel, predict_bn_mask 972 | 973 | def infer(self, phone, phone_lengths, pitchid, dur, slur, gtdur=None, spk_id=None, length_scale=1., F0=None, noise_scale=0.8): 974 | 975 | if self.hps.data.n_speakers > 0: 976 | g = self.emb_spk(spk_id).unsqueeze(-1) # [b, h, 1] 977 | else: 978 | g = None 979 | 980 | # 
Encoder 981 | x, x_mask = self.text_encoder(phone, phone_lengths, pitchid, dur, slur) 982 | 983 | # dur 984 | y_lengths = torch.clamp_min(torch.sum(gtdur.squeeze(1), [1]), 1).long() 985 | LF0 = 2595. * torch.log10(1. + F0 / 700.) 986 | LF0 = LF0 / 500 987 | # LR 988 | decoder_input, mel_len = self.LR(x, gtdur, None) 989 | 990 | # aam 991 | predict_mel, predict_bn_mask = self.mel_decoder(decoder_input + self.f0_prenet(LF0), y_lengths, spk_emb=g) 992 | 993 | predict_energy = predict_mel.sum(1).unsqueeze(1) / self.hps.data.acoustic_dim 994 | 995 | decoder_input = decoder_input + \ 996 | self.f0_prenet(LF0) + \ 997 | self.energy_prenet(predict_energy) + \ 998 | self.mel_prenet(predict_mel) 999 | decoder_output, y_mask = self.decoder(decoder_input, y_lengths, spk_emb=g) 1000 | 1001 | prior_info = decoder_output 1002 | 1003 | m_p = prior_info[:, :self.hps.model.hidden_channels, :] 1004 | logs_p = prior_info[:, self.hps.model.hidden_channels:, :] 1005 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 1006 | z = self.flow(z_p, y_mask, g=g, reverse=True) 1007 | 1008 | prior_z = z 1009 | 1010 | noise_x = self.dec_noise(prior_z, y_mask) 1011 | 1012 | harm_x = self.dec_harm(F0, prior_z, y_mask) 1013 | 1014 | pitch = upsample(F0.transpose(1, 2), self.hps.data.hop_size) 1015 | omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1) 1016 | sin = torch.sin(omega).transpose(1, 2) 1017 | 1018 | decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1) 1019 | 1020 | # dsp based HiFiGAN vocoder 1021 | o = self.dec(prior_z, decoder_condition, g=g) 1022 | 1023 | return o, harm_x.sum(1).unsqueeze(1), noise_x 1024 | --------------------------------------------------------------------------------
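A minimal sketch of driving `SynthesizerTrn.infer` directly, based only on the constructor and `infer` signature above. The tensor shapes, dummy input values, and the checkpoint path are assumptions for illustration, not part of the repository; in practice these tensors are built from a ds project file rather than by hand.

```python
# Sketch only: shapes follow infer() above; values and the checkpoint path are placeholders.
import torch

from utils import utils
from egs.visinger2.models import SynthesizerTrn

hps = utils.get_hparams_from_file("egs/visinger2/config.json")
net_g = SynthesizerTrn(hps).eval()
utils.load_checkpoint("path/to/G_xxx.pth", net_g, None)  # placeholder checkpoint

# One item with three phonemes; gtdur holds frames per phoneme, f0 is a frame-level curve in Hz.
phone = torch.LongTensor([[1, 2, 3]])             # [B, T_phone] phoneme ids (dummy)
phone_lengths = torch.LongTensor([3])             # [B]
pitchid = torch.LongTensor([[40, 40, 42]])        # [B, T_phone] note-pitch ids (dummy)
dur = torch.FloatTensor([[0.2, 0.3, 0.5]])        # [B, T_phone] note durations in seconds (dummy)
slur = torch.LongTensor([[0, 0, 1]])              # [B, T_phone] slur flags
gtdur = torch.LongTensor([[[20, 30, 50]]])        # [B, 1, T_phone] frames per phoneme
f0 = torch.full((1, 1, int(gtdur.sum())), 220.)   # [B, 1, T_frame] F0 in Hz
spk_id = torch.LongTensor([0])                    # speaker id from spk2id in config.json

with torch.no_grad():
    wav, harm, noise = net_g.infer(phone, phone_lengths, pitchid, dur, slur,
                                   gtdur=gtdur, spk_id=spk_id, F0=f0)
# wav: [B, 1, T_frame * hop_size] waveform; harm / noise are the DDSP harmonic and noise branches.
```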