├── utils ├── __init__.py ├── audio.py └── utils.py ├── egs └── visinger2 │ ├── __init__.py │ ├── bash │ └── train.sh │ ├── config.json │ ├── inference.py │ ├── dataset.py │ ├── train.py │ └── models.py ├── text └── npu │ ├── __init__.py │ ├── symbol_converter.py │ └── symbols.py ├── requirements_3090.txt ├── prepare_multispeaker.py ├── ds_inference.py ├── preprocess_multispeaker.py ├── README.md ├── modules ├── losses.py ├── commons.py ├── ddsp.py ├── transforms.py ├── modules.py ├── attentions.py └── stft.py ├── preprocess └── mel_processing.py ├── preprocess.py └── infer └── __init__.py /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /egs/visinger2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text/npu/__init__.py: -------------------------------------------------------------------------------- 1 | from text.npu import symbols 2 | from text.npu.symbol_converter import * -------------------------------------------------------------------------------- /requirements_3090.txt: -------------------------------------------------------------------------------- 1 | ipython==8.8.0 2 | librosa==0.8.1 3 | matplotlib==3.3.2 4 | numpy==1.19.2 5 | pyworld==0.3.0 6 | scipy==1.5.2 7 | soundfile==0.11.0 8 | torch==1.8.1 9 | tqdm==4.50.2 10 | -------------------------------------------------------------------------------- /egs/visinger2/bash/train.sh: -------------------------------------------------------------------------------- 1 | 2 | num_gpu=$1 3 | 4 | cd $(dirname $(dirname $0)) 5 | exp_dir=$(pwd) 6 | base_dir=$(dirname $(dirname $exp_dir)) 7 | config=${exp_dir}/config.json 8 | 9 | export PYTHONPATH=$base_dir 10 | export PYTHONIOENCODING=UTF-8 11 | 12 | CUDA_VISIBLE_DEVICES=${num_gpu} python train.py -c config.json 13 | 14 | -------------------------------------------------------------------------------- /prepare_multispeaker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | for spk in os.listdir("data"): 5 | if os.path.isdir(f"data/{spk}"): 6 | if os.path.exists(f"data/{spk}/raw/wavs"): 7 | shutil.move(f"data/{spk}/raw/wavs", f"data/{spk}") 8 | shutil.move(f"data/{spk}/raw/transcriptions.txt", f"data/{spk}") 9 | shutil.rmtree(f"data/{spk}/raw") 10 | 11 | -------------------------------------------------------------------------------- /ds_inference.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | import re 5 | 6 | import numpy as np 7 | import soundfile 8 | import torch 9 | import tqdm 10 | from scipy.interpolate import interp1d 11 | 12 | from utils import utils 13 | from egs.visinger2.models import SynthesizerTrn 14 | from infer import preprocess, cross_fade, infer_ds 15 | 16 | trans = -12 17 | speaker = "otto" 18 | ds_path = "infer/share.ds" 19 | config_json = "egs/visinger2/config.json" 20 | checkpoint_path = f"/Volumes/Extend/下载/G_157000.pth" 21 | file_name = os.path.splitext(os.path.basename(ds_path))[0] 22 | step = re.findall(r'G_(\d+)\.pth', checkpoint_path)[0] 23 | 24 | 25 | ds = json.load(open(ds_path)) 26 | hps = utils.get_hparams_from_file(config_json) 27 | net_g = SynthesizerTrn(hps) 28 | _ = net_g.eval() 29 | _ = utils.load_checkpoint(checkpoint_path, net_g, 
None) 30 | 31 | audio = infer_ds(net_g, hps, ds, speaker, trans) 32 | soundfile.write(f"samples/{speaker}_{file_name}_{step}step.wav", audio, 44100) 33 | -------------------------------------------------------------------------------- /preprocess_multispeaker.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | 4 | data_root = "data" 5 | 6 | 7 | transcriptions = glob.glob(f"{data_root}/*/transcriptions.txt") 8 | spk2id = {} 9 | spk_id = 0 10 | ms_transcriptions = open(f'{data_root}/transcriptions.txt', "w") 11 | ms_train_set = open(f'{data_root}/train.list', "w") 12 | ms_test_set = open(f'{data_root}/test.list', "w") 13 | for transcription in transcriptions: 14 | spk = transcription.split("/")[-2] 15 | spk2id[spk] = spk_id 16 | spk_id += 1 17 | for line in open(transcription).readlines(): 18 | ms_transcriptions.write(f"{spk}/{line}") 19 | for line in open(transcription.replace("transcriptions.txt", "train.list")): 20 | ms_train_set.write(f"{spk}/{line}") 21 | for line in open(transcription.replace("transcriptions.txt", "test.list")): 22 | ms_test_set.write(f"{spk}/{line}") 23 | 24 | ms_transcriptions.close() 25 | ms_train_set.close() 26 | ms_test_set.close() 27 | print("Please manually paste the speaker-to-id mapping into the config file") 28 | print(json.dumps(spk2id)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VISinger2 2 | 3 | This repository connects VISinger2 to the DiffSinger community: it is compatible with the DiffSinger community's nomidi-format datasets and ds project files. Compared with DiffSinger, this model synthesizes much faster, but it trains relatively slowly when no pretrained model is used, and its upper bound on audio quality is lower than DiffSinger's. 4 | 5 | The training and inference code is not yet very easy to use; it will be improved step by step. 6 | 7 | ## Dataset preparation 8 | First build a dataset in the DiffSinger nomidi format and place it under the data directory. 9 | + For making a high-quality dataset, refer to the [DiffSinger dataset tutorial](https://www.yuque.com/sunsa-i3ayc/sivu7h/dx9xof9k1dg305aq) 10 | 11 | [//]: # (+ For low-quality data, if you want to save effort you can use the [automated dataset preparation scripts](https://github.com/innnky/audio-preprocessing-scripts) (currently everything except the MFA step can basically be done in one click)) 12 | ```shell 13 | data 14 | ├───speaker0 15 | │ └───raw 16 | │ ├──wavs 17 | │ └──transcriptions.txt 18 | └───speaker1 19 | └───raw 20 | ├──wavs 21 | └──transcriptions.txt 22 | ``` 23 | Then run the following in order 24 | ```shell 25 | # adjust the folder structure 26 | python prepare_multispeaker.py 27 | # generate mel and pitch 28 | python preprocess.py 29 | # generate the multi-speaker configuration 30 | python preprocess_multispeaker.py 31 | # then paste the spk2id generated in the previous step into the config file egs/visinger2/config.json 32 | ``` 33 | ## Training 34 | ```shell 35 | cd egs/visinger2 36 | bash bash/train.sh 0 37 | ``` 38 | ## Inference 39 | Modify the ds project file, speaker, and model path in ds_inference.py 40 | 41 | python ds_inference.py 42 | -------------------------------------------------------------------------------- /text/npu/symbol_converter.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | from text.npu.symbols import * 4 | import os 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _ttsing_phone_to_id = {p: i for i, p in enumerate(ttsing_phone_set)} 8 | _ttsing_pitch_to_id = {p: i for i, p in enumerate(ttsing_pitch_set)} 9 | _ttsing_slur_to_id = {s: i for i, s in enumerate(ttsing_slur_set)} 10 | 11 | ttsing_phone_to_int = {} 12 | int_to_ttsing_phone = {} 13 | for idx, item in enumerate(ttsing_phone_set): 14 | ttsing_phone_to_int[item] = idx 15 | int_to_ttsing_phone[idx] = item 16 | 17 | ttsing_pitch_to_int = {} 18 | int_to_ttsing_pitch = {} 19 | for idx, item in enumerate(ttsing_pitch_set): 20 | ttsing_pitch_to_int[item] = idx 21 | int_to_ttsing_pitch[idx] = item 22 | 23 | # opencpop 24 | ttsing_opencpop_pitch_to_int = {} 25 | for 
idx, item in enumerate(ttsing_opencpop_pitch_set): 26 | ttsing_opencpop_pitch_to_int[item] = idx 27 | 28 | ttsing_slur_to_int = {} 29 | int_to_ttsing_slur = {} 30 | for idx, item in enumerate(ttsing_slur_set): 31 | ttsing_slur_to_int[item] = idx 32 | int_to_ttsing_slur[idx] = item 33 | 34 | 35 | -------------------------------------------------------------------------------- /modules/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import modules.commons 5 | import math 6 | 7 | def feature_loss(fmap_r, fmap_g): 8 | loss = 0 9 | for dr, dg in zip(fmap_r, fmap_g): 10 | for rl, gl in zip(dr, dg): 11 | rl = rl.float().detach() 12 | gl = gl.float() 13 | loss += torch.mean(torch.abs(rl - gl)) 14 | 15 | return loss * 2 16 | 17 | 18 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 19 | loss = 0 20 | r_losses = [] 21 | g_losses = [] 22 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 23 | dr = dr.float() 24 | dg = dg.float() 25 | r_loss = torch.mean((1-dr)**2) 26 | g_loss = torch.mean(dg**2) 27 | loss += (r_loss + g_loss) 28 | r_losses.append(r_loss.item()) 29 | g_losses.append(g_loss.item()) 30 | 31 | return loss, r_losses, g_losses 32 | 33 | 34 | def generator_loss(disc_outputs): 35 | loss = 0 36 | gen_losses = [] 37 | for dg in disc_outputs: 38 | dg = dg.float() 39 | l = torch.mean((1-dg)**2) 40 | gen_losses.append(l) 41 | loss += l 42 | 43 | return loss, gen_losses 44 | 45 | 46 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 47 | """ 48 | z_p, logs_q: [b, h, t_t] 49 | m_p, logs_p: [b, h, t_t] 50 | """ 51 | z_p = z_p.float() 52 | logs_q = logs_q.float() 53 | m_p = m_p.float() 54 | logs_p = logs_p.float() 55 | z_mask = z_mask.float() 56 | 57 | kl = logs_p - logs_q - 0.5 58 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. 
* logs_p) 59 | kl = torch.sum(kl * z_mask) 60 | l = kl / torch.sum(z_mask) 61 | return l 62 | 63 | -------------------------------------------------------------------------------- /egs/visinger2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "port": 8001, 7 | "epochs": 10000, 8 | "learning_rate": 2e-4, 9 | "betas": [0.8, 0.99], 10 | "eps": 1e-9, 11 | "batch_size": 8, 12 | "accumulation_steps": 1, 13 | "fp16_run": false, 14 | "lr_decay": 0.998, 15 | "segment_size": 10240, 16 | "init_lr_ratio": 1, 17 | "warmup_epochs": 0, 18 | "c_mel": 45, 19 | "save_dir": "logdir/visinger2" 20 | }, 21 | "data": { 22 | "data_dir":"../../data", 23 | "dataset_type": "SingDataset", 24 | "collate_type": "SingCollate", 25 | "training_filelist":"train.list", 26 | "training_labellist":"transcriptions.txt", 27 | "validation_filelist":"test.list", 28 | "validation_labellist":"transcriptions.txt", 29 | "max_wav_value": 32768.0, 30 | "sample_rate": 44100, 31 | "n_fft": 2048, 32 | "fmin": 0, 33 | "fmax": 22050, 34 | "hop_size": 512, 35 | "win_size": 2048, 36 | "acoustic_dim": 80, 37 | "min_level_db": -115, 38 | "ref_level_db": 20, 39 | "min_db": -115, 40 | "max_abs_value": 4.0, 41 | "n_speakers": 200, 42 | "spk2id": {"opencpop": 0, "taffy": 1, "otto": 2, "nanami": 3} 43 | }, 44 | "model": { 45 | "hidden_channels": 192, 46 | "spk_channels": 192, 47 | "filter_channels": 768, 48 | "n_heads": 2, 49 | "n_layers": 4, 50 | "kernel_size": 3, 51 | "p_dropout": 0.1, 52 | "prior_hidden_channels": 192, 53 | "prior_filter_channels": 768, 54 | "prior_n_heads": 2, 55 | "prior_n_layers": 4, 56 | "prior_kernel_size": 3, 57 | "prior_p_dropout": 0.1, 58 | "resblock": "1", 59 | "use_spectral_norm": false, 60 | "resblock_kernel_sizes": [3,7,11], 61 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 62 | "upsample_rates": [8,8,4,2], 63 | "upsample_initial_channel": 256, 64 | "upsample_kernel_sizes": [16,16,8,4], 65 | "n_harmonic": 64, 66 | "n_bands": 65 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /text/npu/symbols.py: -------------------------------------------------------------------------------- 1 | 2 | ttsing_phone_set = ['_'] + [ 3 | "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", 4 | "s", "sh", "t", "x", "z", "zh", "a", "ai", "an", "ang", "ao", "e", "ei", 5 | "en", "eng", "er", "iii", "ii", "i", "ia", "ian", "iang", "iao", "ie", "in", 6 | "ing", "iong", "iou", "o", "ong", "ou", "u", "ua", "uai", "uan", "uang", 7 | "uei", "uen", "ueng", "uo", "v", "van", "ve", "vn", "AH", "AA", "AO", "ER", 8 | "IH", "IY", "UH", "UW", "EH", "AE", "AY", "EY", "OY", "AW", "OW", "P", "B", 9 | "T", "D", "K", "G", "M", "N", "NG", "L", "S", "Z", "Y", "TH", "DH", "SH", 10 | "ZH", "CH", "JH", "V", "W", "F", "R", "HH", "AH0", "AA0", "AO0", "ER0", 11 | "IH0", "IY0", "UH0", "UW0", "EH0", "AE0", "AY0", "EY0", "OY0", "AW0", "OW0", 12 | "AH1", "AA1", "AO1", "ER1", "IH1", "IY1", "UH1", "UW1", "EH1", "AE1", "AY1", 13 | "EY1", "OY1", "AW1", "OW1", "AH2", "AA2", "AO2", "ER2", "IH2", "IY2", "UH2", 14 | "UW2", "EH2", "AE2", "AY2", "EY2", "OY2", "AW2", "OW2", "AH3", "AA3", "AO3", 15 | "ER3", "IH3", "IY3", "UH3", "UW3", "EH3", "AE3", "AY3", "EY3", "OY3", "AW3", 16 | "OW3", "D-1", "T-1", "P*", "B*", "T*", "D*", "K*", "G*", "M*", "N*", "NG*", 17 | "L*", "S*", "Z*", "Y*", "TH*", "DH*", "SH*", "ZH*", "CH*", "JH*", "V*", 18 | "W*", "F*", "R*", "HH*", 
"sp", "sil", "or", "ar", "aor", "our", "angr", 19 | "eir", "engr", "air", "ianr", "iaor", "ir", "ingr", "ur", "iiir", "uar", 20 | "uangr", "uenr", "iir", "ongr", "uor", "ueir", "iar", "iangr", "inr", 21 | "iour", "vr", "uanr", "ruai", "TR", "rest", 22 | # opencpop 23 | 'w', 'SP', 'AP', 'un', 'y', 'ui', 'iu', 24 | "iour", "vr", "uanr", "ruai", "TR", "rest", 25 | # opencpop 26 | 'w', 'SP', 'AP', 'un', 'y', 'ui', 'iu', 27 | # opencpop-strict 28 | 'i0', 'E', 'En' 29 | ] 30 | 31 | ttsing_pitch_set = ['_'] + [ 32 | "C0", "C1", "C2", "C3", "C4", "C5", "C6", "C#/Db0", "C#/Db1", "C#/Db2", 33 | "C#/Db3", "C#/Db4", "C#/Db5", "C#/Db6", "D0", "D1", "D2", "D3", "D4", "D5", 34 | "D6", "D#/Eb0", "D#/Eb1", "D#/Eb2", "D#/Eb3", "D#/Eb4", "D#/Eb5", "D#/Eb6", 35 | "E0", "E1", "E2", "E3", "E4", "E5", "E6", "F0", "F1", "F2", "F3", "F4", 36 | "F5", "F6", "F#/Gb0", "F#/Gb1", "F#/Gb2", "F#/Gb3", "F#/Gb4", "F#/Gb5", 37 | "F#/Gb6", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G#/Ab0", "G#/Ab1", 38 | "G#/Ab2", "G#/Ab3", "G#/Ab4", "G#/Ab5", "G#/Ab6", "A0", "A1", "A2", "A3", 39 | "A4", "A5", "A6", "A#/Bb0", "A#/Bb1", "A#/Bb2", "A#/Bb3", "A#/Bb4", 40 | "A#/Bb5", "A#/Bb6", "B0", "B1", "B2", "B3", "B4", "B5", "B6", "RestRest" 41 | ] 42 | 43 | ttsing_opencpop_pitch_set = ['_'] + [ 44 | "C0", "C1", "C2", "C3", "C4", "C5", "C6", 45 | "C#0/Db0", "C#1/Db1", "C#2/Db2", "C#3/Db3", "C#4/Db4", "C#5/Db5", "C#6/Db6", 46 | "D0", "D1", "D2", "D3", "D4", "D5", "D6", 47 | "D#0/Eb0", "D#1/Eb1", "D#2/Eb2", "D#3/Eb3", "D#4/Eb4", "D#5/Eb5", "D#6/Eb6", 48 | "E0", "E1", "E2", "E3", "E4", "E5", "E6", 49 | "F0", "F1", "F2", "F3", "F4", "F5", "F6", 50 | "F#0/Gb0", "F#1/Gb1", "F#2/Gb2", "F#3/Gb3", "F#4/Gb4", "F#5/Gb5", "F#6/Gb6", 51 | "G0", "G1", "G2", "G3", "G4", "G5", "G6", 52 | "G#0/Ab0", "G#1/Ab1", "G#2/Ab2", "G#3/Ab3", "G#4/Ab4", "G#5/Ab5", "G#6/Ab6", 53 | "A0", "A1", "A2", "A3", "A4", "A5", "A6", 54 | "A#0/Bb0", "A#1/Bb1", "A#2/Bb2", "A#3/Bb3", "A#4/Bb4", "A#5/Bb5", "A#6/Bb6", 55 | "B0", "B1", "B2", "B3", "B4", "B5", "B6", 56 | "RestRest", "rest" 57 | ] 58 | 59 | ttsing_slur_set = ['_'] + ['0', '1'] 60 | 61 | 62 | -------------------------------------------------------------------------------- /utils/audio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | import librosa 4 | from scipy.io import wavfile 5 | import soundfile as sf 6 | import librosa.filters 7 | 8 | 9 | def load_wav(wav_path, raw_sr, target_sr=16000, win_size=800, hop_size=200): 10 | audio = librosa.core.load(wav_path, sr=raw_sr)[0] 11 | if raw_sr != target_sr: 12 | audio = librosa.core.resample(audio, 13 | raw_sr, 14 | target_sr, 15 | res_type='kaiser_best') 16 | target_length = (audio.size // hop_size + 17 | win_size // hop_size) * hop_size 18 | pad_len = (target_length - audio.size) // 2 19 | if audio.size % 2 == 0: 20 | audio = np.pad(audio, (pad_len, pad_len), mode='reflect') 21 | else: 22 | audio = np.pad(audio, (pad_len, pad_len + 1), mode='reflect') 23 | return audio 24 | 25 | 26 | def save_wav(wav, path, sample_rate, norm=False): 27 | if norm: 28 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 29 | wavfile.write(path, sample_rate, wav.astype(np.int16)) 30 | else: 31 | sf.write(path, wav, sample_rate) 32 | 33 | 34 | _mel_basis = None 35 | _inv_mel_basis = None 36 | 37 | 38 | def _build_mel_basis(hparams): 39 | assert hparams.fmax <= hparams.sample_rate // 2 40 | return librosa.filters.mel(hparams.sample_rate, 41 | hparams.n_fft, 42 | n_mels=hparams.acoustic_dim, 43 | 
fmin=hparams.fmin, 44 | fmax=hparams.fmax) 45 | 46 | 47 | def _linear_to_mel(spectogram, hparams): 48 | global _mel_basis 49 | if _mel_basis is None: 50 | _mel_basis = _build_mel_basis(hparams) 51 | return np.dot(_mel_basis, spectogram) 52 | 53 | 54 | def _mel_to_linear(mel_spectrogram, hparams): 55 | global _inv_mel_basis 56 | if _inv_mel_basis is None: 57 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 58 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 59 | 60 | 61 | def _stft(y, hparams): 62 | return librosa.stft(y=y, 63 | n_fft=hparams.n_fft, 64 | hop_length=hparams.hop_size, 65 | win_length=hparams.win_size) 66 | 67 | 68 | def _amp_to_db(x, hparams): 69 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 70 | return 20 * np.log10(np.maximum(min_level, x)) 71 | 72 | def _normalize(S, hparams): 73 | return hparams.max_abs_value * np.clip(((S - hparams.min_db) / 74 | (-hparams.min_db)), 0, 1) 75 | 76 | def _db_to_amp(x): 77 | return np.power(10.0, (x) * 0.05) 78 | 79 | 80 | def _stft(y, hparams): 81 | return librosa.stft(y=y, 82 | n_fft=hparams.n_fft, 83 | hop_length=hparams.hop_size, 84 | win_length=hparams.win_size) 85 | 86 | 87 | def _istft(y, hparams): 88 | return librosa.istft(y, 89 | hop_length=hparams.hop_size, 90 | win_length=hparams.win_size) 91 | 92 | 93 | def melspectrogram(wav, hparams): 94 | D = _stft(wav, hparams) 95 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), 96 | hparams) - hparams.ref_level_db 97 | return _normalize(S, hparams) 98 | 99 | 100 | -------------------------------------------------------------------------------- /preprocess/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | 53 | global hann_window 54 | dtype_device = str(y.dtype) + '_' + str(y.device) 55 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 56 | if wnsize_dtype_device not in hann_window: 57 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 58 | 59 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 60 | y = y.squeeze(1) 61 | 62 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, 
window=hann_window[wnsize_dtype_device], 63 | center=center, pad_mode='reflect', normalized=False, onesided=True) 64 | 65 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 66 | return spec 67 | 68 | 69 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 70 | global mel_basis 71 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 72 | fmax_dtype_device = str(fmax) + '_' + dtype_device 73 | if fmax_dtype_device not in mel_basis: 74 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 75 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 76 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 77 | spec = spectral_normalize_torch(spec) 78 | return spec 79 | 80 | 81 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 82 | 83 | global mel_basis, hann_window 84 | dtype_device = str(y.dtype) + '_' + str(y.device) 85 | fmax_dtype_device = str(fmax) + '_' + dtype_device 86 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 87 | if fmax_dtype_device not in mel_basis: 88 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 89 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 90 | if wnsize_dtype_device not in hann_window: 91 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 92 | 93 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 94 | y = y.squeeze(1) 95 | 96 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 97 | center=center, pad_mode='reflect', normalized=False, onesided=True) 98 | 99 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 100 | 101 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 102 | spec = spectral_normalize_torch(spec) 103 | 104 | return spec 105 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import sys 4 | import argparse 5 | import numpy as np 6 | from multiprocessing import cpu_count 7 | from concurrent.futures import ProcessPoolExecutor 8 | from functools import partial 9 | from utils import audio 10 | import utils.utils as utils 11 | from tqdm import tqdm 12 | import pyworld as pw 13 | from random import shuffle 14 | 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | def extract_mel(wav, hparams): 19 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 20 | return mel_spectrogram.T, wav 21 | 22 | def extract_pitch(wav, hps): 23 | # rapt may be better 24 | f0, _ = pw.harvest(wav.astype(np.float64), 25 | hps.sample_rate, 26 | frame_period=hps.hop_size / hps.sample_rate * 1000) 27 | return f0 28 | 29 | def process_utterance(hps, data_root, item): 30 | out_dir = data_root 31 | 32 | wav_path = os.path.join(data_root, "wavs", 33 | "{}.wav".format(item)) 34 | wav = audio.load_wav(wav_path, 35 | raw_sr=hps.data.sample_rate, 36 | target_sr=hps.data.sample_rate, 37 | win_size=hps.data.win_size, 38 | hop_size=hps.data.hop_size) 39 | 40 | mel, _ = extract_mel(wav, hps.data) 41 | out_mel_dir = os.path.join(out_dir, "mels") 42 | os.makedirs(out_mel_dir, exist_ok=True) 43 | mel_path = os.path.join(out_mel_dir, item) 44 | np.save(mel_path, mel) 45 | 46 | pitch = extract_pitch(wav, hps.data) 47 | 
out_pitch_dir = os.path.join(out_dir, "pitch") 48 | os.makedirs(out_pitch_dir, exist_ok=True) 49 | pitch_path = os.path.join(out_pitch_dir, item) 50 | np.save(pitch_path, pitch) 51 | 52 | 53 | def process(args, hps, data_dir): 54 | print(os.path.join(data_dir, "wavs")) 55 | if(not os.path.exists(os.path.join(data_dir, "file.list"))): 56 | with open(os.path.join(data_dir, "file.list") , "w") as out_file: 57 | files = os.listdir(os.path.join(data_dir, "wavs")) 58 | files = [i for i in files if i.endswith(".wav")] 59 | for f in files: 60 | out_file.write(f.strip().split(".")[0] + '\n') 61 | metadata = [ 62 | item.strip() for item in open( 63 | os.path.join(data_dir, "file.list")).readlines() 64 | ] 65 | executor = ProcessPoolExecutor(max_workers=args.num_workers) 66 | results = [] 67 | for item in metadata: 68 | results.append(executor.submit(partial(process_utterance, hps, data_dir, item))) 69 | return [result.result() for result in tqdm(results)] 70 | 71 | def split_dataset(data_dir): 72 | metadata = [ 73 | item.strip() for item in open( 74 | os.path.join(data_dir, "file.list")).readlines() 75 | ] 76 | shuffle(metadata) 77 | train_set = metadata[:-2] 78 | test_set = metadata[-2:] 79 | with open(os.path.join(data_dir, "train.list"), "w") as ts: 80 | for item in train_set: 81 | ts.write(item+"\n") 82 | with open(os.path.join(data_dir, "test.list"), "w") as ts: 83 | for item in test_set: 84 | ts.write(item+"\n") 85 | 86 | def main(): 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--config', 89 | default='egs/visinger2/config.json', 90 | help='json files for configurations.') 91 | parser.add_argument('--num_workers', type=int, default=int(cpu_count()) // 2) 92 | 93 | args = parser.parse_args() 94 | hps = utils.get_hparams_from_file(args.config) 95 | spklist = [spk for spk in os.listdir("data") if os.path.isdir(f"data/{spk}") and not os.path.exists(f"data/{spk}/test.list")] 96 | for spk in tqdm(spklist): 97 | print(f"preprocessing {spk}") 98 | data_dir = f"data/{spk}" 99 | process(args, hps, data_dir) 100 | split_dataset(data_dir) 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /egs/visinger2/inference.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import IPython.display as ipd 3 | 4 | import sys 5 | import os 6 | import json 7 | import math 8 | import torch 9 | from torch import nn 10 | from torch.nn import functional as F 11 | from torch.utils.data import DataLoader 12 | 13 | import modules.commons as commons 14 | import utils.utils as utils 15 | from models import SynthesizerTrn 16 | from text import npu 17 | from scipy.io.wavfile import write 18 | from tqdm import tqdm 19 | import numpy as np 20 | import time 21 | import argparse 22 | 23 | def parse_label(hps, pho, pitchid, dur, slur, gtdur): 24 | phos = [] 25 | pitchs = [] 26 | durs = [] 27 | slurs = [] 28 | gtdurs = [] 29 | 30 | for index in range(len(pho.split())): 31 | phos.append(npu.symbol_converter.ttsing_phone_to_int[pho.strip().split()[index]]) 32 | pitchs.append(npu.symbol_converter.ttsing_opencpop_pitch_to_int[pitchid.strip().split()[index]]) 33 | durs.append(float(dur.strip().split()[index])) 34 | slurs.append(int(slur.strip().split()[index])) 35 | gtdurs.append(float(gtdur.strip().split()[index])) 36 | 37 | phos = np.asarray(phos, dtype=np.int32) 38 | pitchs = np.asarray(pitchs, dtype=np.int32) 39 | durs = np.asarray(durs, dtype=np.float32) 40 
| slurs = np.asarray(slurs, dtype=np.int32) 41 | gtdurs = np.asarray(gtdurs, dtype=np.float32) 42 | gtdurs = np.ceil(gtdurs / (hps.data.hop_size / hps.data.sample_rate)) 43 | 44 | phos = torch.LongTensor(phos) 45 | pitchs = torch.LongTensor(pitchs) 46 | durs = torch.FloatTensor(durs) 47 | slurs = torch.LongTensor(slurs) 48 | gtdurs = torch.LongTensor(gtdurs) 49 | return phos, pitchs, durs, slurs, gtdurs 50 | 51 | def load_model(model_dir): 52 | 53 | # load config and model 54 | model_path = utils.latest_checkpoint_path(model_dir) 55 | config_path = os.path.join(model_dir, "config.json") 56 | 57 | hps = utils.get_hparams_from_file(config_path) 58 | 59 | print("Load model from : ", model_path) 60 | print("config: ", config_path) 61 | 62 | net_g = SynthesizerTrn(hps) 63 | _ = net_g.eval() 64 | _ = utils.load_checkpoint(model_path, net_g, None) 65 | return net_g, hps 66 | 67 | def inference_label2wav(net_g, label_list_path, output_dir, hps, cuda_id=None): 68 | 69 | id2label = {} 70 | with open(label_list_path, "r") as in_file: 71 | for line in in_file.readlines(): 72 | fileid, txt, phones, pitchid, dur, gtdur, slur = line.split('|') 73 | id2label[fileid] = [phones, pitchid, dur, slur, gtdur] 74 | 75 | for file_name in tqdm(id2label.keys()): 76 | pho, pitchid, dur, slur, gtdur = id2label[file_name] 77 | pho, pitchid, dur, slur, gtdur = parse_label(hps, pho, pitchid, dur, slur, gtdur) 78 | 79 | with torch.no_grad(): 80 | 81 | # data 82 | pho_lengths = torch.LongTensor([pho.size(0)]) 83 | pho = pho.unsqueeze(0) 84 | pitchid = pitchid.unsqueeze(0) 85 | dur = dur.unsqueeze(0) 86 | slur = slur.unsqueeze(0) 87 | 88 | if(cuda_id != None): 89 | net_g = net_g.cuda(0) 90 | pho = pho.cuda(0) 91 | pho_lengths = pho_lengths.cuda(0) 92 | pitchid = pitchid.cuda(0) 93 | dur = dur.cuda(0) 94 | slur = slur.cuda(0) 95 | 96 | # infer 97 | o, _, _ = net_g.infer(pho, pho_lengths, pitchid, dur, slur) 98 | audio = o[0,0].data.cpu().float().numpy() 99 | audio = audio * 32768 #hps.data.max_wav_value 100 | audio = audio.astype(np.int16) 101 | 102 | # save 103 | write(os.path.join(output_dir, file_name.split('.')[0] + '.wav' ), hps.data.sample_rate, audio) 104 | 105 | if __name__ == "__main__": 106 | 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('-model_dir', '--model_dir', type=str, required=True) 109 | parser.add_argument('-input_dir', '--input_dir', type=str, required=True) 110 | parser.add_argument('-output_dir', '--output_dir', type=str, required=True) 111 | args = parser.parse_args() 112 | 113 | model_dir = args.model_dir 114 | input_dir = args.input_dir 115 | output_dir = args.output_dir 116 | 117 | model, hps = load_model(model_dir) 118 | if(not os.path.exists(output_dir)): 119 | os.makedirs(output_dir) 120 | print("load model end!") 121 | 122 | inference_label2wav(model, input_dir, output_dir, hps, cuda_id=0) 123 | 124 | -------------------------------------------------------------------------------- /modules/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size*dilation - dilation)/2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for 
sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def intersperse(lst, item): 25 | result = [item] * (len(lst) * 2 + 1) 26 | result[1::2] = lst 27 | return result 28 | 29 | 30 | def kl_divergence(m_p, logs_p, m_q, logs_q): 31 | """KL(P||Q)""" 32 | kl = (logs_q - logs_p) - 0.5 33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | # print("ret shape: ",ret.shape, ids_str) 51 | for i in range(x.size(0)): 52 | idx_str = ids_str[i] 53 | idx_end = idx_str + segment_size 54 | ret[i] = x[i, :, idx_str:idx_end] 55 | return ret 56 | 57 | 58 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 59 | b, d, t = x.size() 60 | if x_lengths is None: 61 | x_lengths = t 62 | ids_str_max = x_lengths - segment_size - 1 63 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 64 | ret = slice_segments(x, ids_str, segment_size) 65 | return ret, ids_str 66 | 67 | 68 | def get_timing_signal_1d( 69 | length, channels, min_timescale=1.0, max_timescale=1.0e4): 70 | position = torch.arange(length, dtype=torch.float) 71 | num_timescales = channels // 2 72 | log_timescale_increment = ( 73 | math.log(float(max_timescale) / float(min_timescale)) / 74 | (num_timescales - 1)) 75 | inv_timescales = min_timescale * torch.exp( 76 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) 77 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 78 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 79 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 80 | signal = signal.view(1, channels, length) 81 | return signal 82 | 83 | 84 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 85 | b, channels, length = x.size() 86 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 87 | return x + signal.to(dtype=x.dtype, device=x.device) 88 | 89 | 90 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 91 | b, channels, length = x.size() 92 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 93 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 94 | 95 | 96 | def subsequent_mask(length): 97 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 98 | return mask 99 | 100 | 101 | @torch.jit.script 102 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 103 | n_channels_int = n_channels[0] 104 | in_act = input_a + input_b 105 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 106 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 107 | acts = t_act * s_act 108 | return acts 109 | 110 | 111 | def convert_pad_shape(pad_shape): 112 | l = pad_shape[::-1] 113 | pad_shape = [item for sublist in l for item in sublist] 114 | return pad_shape 115 | 116 | 117 | def shift_1d(x): 118 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 119 | return x 120 | 121 | 122 | def sequence_mask(length, max_length=None): 123 | if max_length is None: 124 | max_length = 
length.max() 125 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 126 | return x.unsqueeze(0) < length.unsqueeze(1) 127 | 128 | 129 | def generate_path(duration, mask): 130 | """ 131 | duration: [b, 1, t_x] 132 | mask: [b, 1, t_y, t_x] 133 | """ 134 | device = duration.device 135 | 136 | b, _, t_y, t_x = mask.shape 137 | cum_duration = torch.cumsum(duration, -1) 138 | 139 | cum_duration_flat = cum_duration.view(b * t_x) 140 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 141 | path = path.view(b, t_x, t_y) 142 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 143 | path = path.unsqueeze(1).transpose(2,3) * mask 144 | return path 145 | 146 | 147 | def clip_grad_value_(parameters, clip_value, norm_type=2): 148 | if isinstance(parameters, torch.Tensor): 149 | parameters = [parameters] 150 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 151 | norm_type = float(norm_type) 152 | if clip_value is not None: 153 | clip_value = float(clip_value) 154 | 155 | total_norm = 0 156 | for p in parameters: 157 | param_norm = p.grad.data.norm(norm_type) 158 | total_norm += param_norm.item() ** norm_type 159 | if clip_value is not None: 160 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 161 | total_norm = total_norm ** (1. / norm_type) 162 | return total_norm 163 | -------------------------------------------------------------------------------- /modules/ddsp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import torch.fft as fft 5 | import numpy as np 6 | import librosa as li 7 | import math 8 | from scipy.signal import get_window 9 | 10 | def safe_log(x): 11 | return torch.log(x + 1e-7) 12 | 13 | 14 | @torch.no_grad() 15 | def mean_std_loudness(dataset): 16 | mean = 0 17 | std = 0 18 | n = 0 19 | for _, _, l in dataset: 20 | n += 1 21 | mean += (l.mean().item() - mean) / n 22 | std += (l.std().item() - std) / n 23 | return mean, std 24 | 25 | 26 | def multiscale_fft(signal, scales, overlap): 27 | stfts = [] 28 | for s in scales: 29 | S = torch.stft( 30 | signal, 31 | s, 32 | int(s * (1 - overlap)), 33 | s, 34 | torch.hann_window(s).to(signal), 35 | True, 36 | normalized=True, 37 | return_complex=True, 38 | ).abs() 39 | stfts.append(S) 40 | return stfts 41 | 42 | 43 | def resample(x, factor: int): 44 | batch, frame, channel = x.shape 45 | x = x.permute(0, 2, 1).reshape(batch * channel, 1, frame) 46 | 47 | window = torch.hann_window( 48 | factor * 2, 49 | dtype=x.dtype, 50 | device=x.device, 51 | ).reshape(1, 1, -1) 52 | y = torch.zeros(x.shape[0], x.shape[1], factor * x.shape[2]).to(x) 53 | y[..., ::factor] = x 54 | y[..., -1:] = x[..., -1:] 55 | y = torch.nn.functional.pad(y, [factor, factor]) 56 | y = torch.nn.functional.conv1d(y, window)[..., :-1] 57 | 58 | y = y.reshape(batch, channel, factor * frame).permute(0, 2, 1) 59 | 60 | return y 61 | 62 | 63 | def upsample(signal, factor): 64 | signal = signal.permute(0, 2, 1) 65 | signal = nn.functional.interpolate(signal, size=signal.shape[-1] * factor) 66 | return signal.permute(0, 2, 1) 67 | 68 | 69 | def remove_above_nyquist(amplitudes, pitch, sampling_rate): 70 | n_harm = amplitudes.shape[-1] 71 | pitches = pitch * torch.arange(1, n_harm + 1).to(pitch) 72 | aa = (pitches < sampling_rate / 2).float() + 1e-4 73 | return amplitudes * aa 74 | 75 | 76 | def scale_function(x): 77 | return 2 * torch.sigmoid(x)**(math.log(10)) + 1e-7 78 | 79 | 80 
| def extract_loudness(signal, sampling_rate, block_size, n_fft=2048): 81 | S = li.stft( 82 | signal, 83 | n_fft=n_fft, 84 | hop_length=block_size, 85 | win_length=n_fft, 86 | center=True, 87 | ) 88 | S = np.log(abs(S) + 1e-7) 89 | f = li.fft_frequencies(sampling_rate, n_fft) 90 | a_weight = li.A_weighting(f) 91 | 92 | S = S + a_weight.reshape(-1, 1) 93 | 94 | S = np.mean(S, 0)[..., :-1] 95 | 96 | return S 97 | 98 | 99 | def extract_pitch(signal, sampling_rate, block_size): 100 | length = signal.shape[-1] // block_size 101 | f0 = crepe.predict( 102 | signal, 103 | sampling_rate, 104 | step_size=int(1000 * block_size / sampling_rate), 105 | verbose=1, 106 | center=True, 107 | viterbi=True, 108 | ) 109 | f0 = f0[1].reshape(-1)[:-1] 110 | 111 | if f0.shape[-1] != length: 112 | f0 = np.interp( 113 | np.linspace(0, 1, length, endpoint=False), 114 | np.linspace(0, 1, f0.shape[-1], endpoint=False), 115 | f0, 116 | ) 117 | 118 | return f0 119 | 120 | 121 | def mlp(in_size, hidden_size, n_layers): 122 | channels = [in_size] + (n_layers) * [hidden_size] 123 | net = [] 124 | for i in range(n_layers): 125 | net.append(nn.Linear(channels[i], channels[i + 1])) 126 | net.append(nn.LayerNorm(channels[i + 1])) 127 | net.append(nn.LeakyReLU()) 128 | return nn.Sequential(*net) 129 | 130 | 131 | def gru(n_input, hidden_size): 132 | return nn.GRU(n_input * hidden_size, hidden_size, batch_first=True) 133 | 134 | 135 | def harmonic_synth(pitch, amplitudes, sampling_rate): 136 | n_harmonic = amplitudes.shape[-1] 137 | omega = torch.cumsum(2 * math.pi * pitch / sampling_rate, 1) 138 | omegas = omega * torch.arange(1, n_harmonic + 1).to(omega) 139 | signal = (torch.sin(omegas) * amplitudes).sum(-1, keepdim=True) 140 | return signal 141 | 142 | 143 | def amp_to_impulse_response(amp, target_size): 144 | amp = torch.stack([amp, torch.zeros_like(amp)], -1) 145 | amp = torch.view_as_complex(amp) 146 | amp = fft.irfft(amp) 147 | 148 | filter_size = amp.shape[-1] 149 | 150 | amp = torch.roll(amp, filter_size // 2, -1) 151 | win = torch.hann_window(filter_size, dtype=amp.dtype, device=amp.device) 152 | 153 | amp = amp * win 154 | 155 | amp = nn.functional.pad(amp, (0, int(target_size) - int(filter_size))) 156 | amp = torch.roll(amp, -filter_size // 2, -1) 157 | 158 | return amp 159 | 160 | 161 | def fft_convolve(signal, kernel): 162 | signal = nn.functional.pad(signal, (0, signal.shape[-1])) 163 | kernel = nn.functional.pad(kernel, (kernel.shape[-1], 0)) 164 | 165 | output = fft.irfft(fft.rfft(signal) * fft.rfft(kernel)) 166 | output = output[..., output.shape[-1] // 2:] 167 | 168 | return output 169 | 170 | 171 | def init_kernels(win_len, win_inc, fft_len, win_type=None, invers=False): 172 | if win_type == 'None' or win_type is None: 173 | window = np.ones(win_len) 174 | else: 175 | window = get_window(win_type, win_len, fftbins=True)#**0.5 176 | 177 | N = fft_len 178 | fourier_basis = np.fft.rfft(np.eye(N))[:win_len] 179 | real_kernel = np.real(fourier_basis) 180 | imag_kernel = np.imag(fourier_basis) 181 | kernel = np.concatenate([real_kernel, imag_kernel], 1).T 182 | 183 | if invers : 184 | kernel = np.linalg.pinv(kernel).T 185 | 186 | kernel = kernel*window 187 | kernel = kernel[:, None, :] 188 | return torch.from_numpy(kernel.astype(np.float32)), torch.from_numpy(window[None,:,None].astype(np.float32)) 189 | 190 | -------------------------------------------------------------------------------- /modules/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 
from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise 
ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * (input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | 
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import sys 4 | import argparse 5 | import logging 6 | import json 7 | import subprocess 8 | import numpy as np 9 | from scipy.io.wavfile import read 10 | import torch 11 | 12 | MATPLOTLIB_FLAG = False 13 | 14 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 15 | logger = logging 16 | 17 | 18 | def load_checkpoint(checkpoint_path, model, optimizer=None): 19 | assert os.path.isfile(checkpoint_path) 20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 21 | iteration = checkpoint_dict['iteration'] 22 | learning_rate = checkpoint_dict['learning_rate'] 23 | if optimizer is not None: 24 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 25 | saved_state_dict = checkpoint_dict['model'] 26 | if hasattr(model, 'module'): 27 | state_dict = model.module.state_dict() 28 | else: 29 | state_dict = model.state_dict() 30 | new_state_dict = {} 31 | for k, v in state_dict.items(): 32 | try: 33 | new_state_dict[k] = saved_state_dict[k] 34 | assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape) 35 | except: 36 | print("error, %s is not in the checkpoint" % k) 37 | logger.info("%s is not in the checkpoint" % k) 38 | new_state_dict[k] = v 39 | if hasattr(model, 'module'): 40 | model.module.load_state_dict(new_state_dict) 41 | else: 42 | model.load_state_dict(new_state_dict) 43 | print("load ") 44 | logger.info("Loaded checkpoint '{}' (iteration {})".format( 45 | checkpoint_path, iteration)) 46 | return model, optimizer, learning_rate, iteration 47 | 48 | 49 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path, val_steps): 50 | ckptname = checkpoint_path.split(os.sep)[-1] 51 | newest_step = int(ckptname.split(".")[0].split("_")[1]) 52 | last_ckptname = checkpoint_path.replace(str(newest_step), str(newest_step - val_steps * 2)) 53 | if newest_step >= val_steps * 2: 54 | os.system(f"rm {last_ckptname}") 55 | 56 | logger.info("Saving model and optimizer state at iteration {} to {}".format( 57 | iteration, checkpoint_path)) 58 | if hasattr(model, 'module'): 59 | state_dict = model.module.state_dict() 60 | else: 61 | state_dict = model.state_dict() 62 | torch.save({'model': state_dict, 63 | 'iteration': iteration, 64 | 'optimizer': optimizer.state_dict(), 65 | 'learning_rate': learning_rate}, checkpoint_path) 66 | 67 | 68 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): 69 | for k, v in scalars.items(): 70 | writer.add_scalar(k, v, global_step) 71 | for k, v in histograms.items(): 72 | writer.add_histogram(k, v, global_step) 73 | for k, v in images.items(): 74 | writer.add_image(k, v, global_step, dataformats='HWC') 75 | for k, v in audios.items(): 76 | writer.add_audio(k, v, global_step, audio_sampling_rate) 77 | 78 | 79 | def latest_checkpoint_path(dir_path, regex="G_*.pth"): 80 | f_list = glob.glob(os.path.join(dir_path, regex)) 81 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) 82 | x = f_list[-1] 83 | print(x) 84 | return x 85 | 86 | 87 | def plot_spectrogram_to_numpy(spectrogram): 88 | global MATPLOTLIB_FLAG 89 | if not MATPLOTLIB_FLAG: 90 | import matplotlib 91 | matplotlib.use("Agg") 92 | MATPLOTLIB_FLAG = True 
93 | mpl_logger = logging.getLogger('matplotlib') 94 | mpl_logger.setLevel(logging.WARNING) 95 | import matplotlib.pylab as plt 96 | import numpy as np 97 | 98 | fig, ax = plt.subplots(figsize=(10, 2)) 99 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 100 | interpolation='none') 101 | plt.colorbar(im, ax=ax) 102 | plt.xlabel("Frames") 103 | plt.ylabel("Channels") 104 | plt.tight_layout() 105 | 106 | fig.canvas.draw() 107 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 108 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 109 | plt.close() 110 | return data 111 | 112 | 113 | def plot_alignment_to_numpy(alignment, info=None): 114 | global MATPLOTLIB_FLAG 115 | if not MATPLOTLIB_FLAG: 116 | import matplotlib 117 | matplotlib.use("Agg") 118 | MATPLOTLIB_FLAG = True 119 | mpl_logger = logging.getLogger('matplotlib') 120 | mpl_logger.setLevel(logging.WARNING) 121 | import matplotlib.pylab as plt 122 | import numpy as np 123 | 124 | fig, ax = plt.subplots(figsize=(6, 4)) 125 | im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', 126 | interpolation='none') 127 | fig.colorbar(im, ax=ax) 128 | xlabel = 'Decoder timestep' 129 | if info is not None: 130 | xlabel += '\n\n' + info 131 | plt.xlabel(xlabel) 132 | plt.ylabel('Encoder timestep') 133 | plt.tight_layout() 134 | 135 | fig.canvas.draw() 136 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 137 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 138 | plt.close() 139 | return data 140 | 141 | 142 | def load_wav_to_torch(full_path): 143 | sampling_rate, data = read(full_path) 144 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 145 | 146 | 147 | def load_filepaths_and_text(filename, split="|"): 148 | with open(filename, encoding='utf-8') as f: 149 | filepaths_and_text = [line.strip().split(split) for line in f] 150 | return filepaths_and_text 151 | 152 | 153 | def get_hparams(init=True): 154 | parser = argparse.ArgumentParser() 155 | parser.add_argument('-c', '--config', type=str, default="./configs/base.json", 156 | help='JSON file for configuration') 157 | # parser.add_argument('-m', '--model', type=str, required=True, 158 | # help='Model name') 159 | 160 | args = parser.parse_args() 161 | 162 | config_path = args.config 163 | with open(config_path, "r") as f: 164 | data = f.read() 165 | config = json.loads(data) 166 | 167 | hparams = HParams(**config) 168 | # hparams.model_dir = model_dir 169 | model_dir = hparams.train.save_dir 170 | config_save_path = os.path.join(model_dir, "config.json") 171 | 172 | if not os.path.exists(model_dir): 173 | os.makedirs(model_dir) 174 | 175 | with open(config_save_path, "w") as f: 176 | f.write(data) 177 | return hparams 178 | 179 | 180 | def get_hparams_from_dir(model_dir): 181 | config_save_path = os.path.join(model_dir, "config.json") 182 | with open(config_save_path, "r") as f: 183 | data = f.read() 184 | config = json.loads(data) 185 | 186 | hparams = HParams(**config) 187 | hparams.model_dir = model_dir 188 | return hparams 189 | 190 | 191 | def get_hparams_from_file(config_path): 192 | with open(config_path, "r") as f: 193 | data = f.read() 194 | config = json.loads(data) 195 | 196 | hparams = HParams(**config) 197 | return hparams 198 | 199 | 200 | def check_git_hash(model_dir): 201 | source_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 202 | if not os.path.exists(os.path.join(source_dir, ".git")): 203 | logger.warn("{} is not a git repository, therefore 
hash value comparison will be ignored.".format( 204 | source_dir 205 | )) 206 | return 207 | 208 | cur_hash = subprocess.getoutput("git rev-parse HEAD") 209 | 210 | path = os.path.join(model_dir, "githash") 211 | if os.path.exists(path): 212 | saved_hash = open(path).read() 213 | if saved_hash != cur_hash: 214 | logger.warn("git hash values are different. {}(saved) != {}(current)".format( 215 | saved_hash[:8], cur_hash[:8])) 216 | else: 217 | open(path, "w").write(cur_hash) 218 | 219 | 220 | def get_logger(model_dir, filename="train.log"): 221 | global logger 222 | logger = logging.getLogger(os.path.basename(model_dir)) 223 | logger.setLevel(logging.DEBUG) 224 | 225 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") 226 | if not os.path.exists(model_dir): 227 | os.makedirs(model_dir) 228 | h = logging.FileHandler(os.path.join(model_dir, filename)) 229 | h.setLevel(logging.DEBUG) 230 | h.setFormatter(formatter) 231 | logger.addHandler(h) 232 | return logger 233 | 234 | 235 | def count_parameters(model): 236 | return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6 237 | 238 | 239 | class HParams(): 240 | def __init__(self, **kwargs): 241 | for k, v in kwargs.items(): 242 | if type(v) == dict: 243 | v = HParams(**v) 244 | self[k] = v 245 | 246 | def keys(self): 247 | return self.__dict__.keys() 248 | 249 | def items(self): 250 | return self.__dict__.items() 251 | 252 | def values(self): 253 | return self.__dict__.values() 254 | 255 | def __len__(self): 256 | return len(self.__dict__) 257 | 258 | def __getitem__(self, key): 259 | return getattr(self, key) 260 | 261 | def __setitem__(self, key, value): 262 | return setattr(self, key, value) 263 | 264 | def __contains__(self, key): 265 | return key in self.__dict__ 266 | 267 | def __repr__(self): 268 | return self.__dict__.__repr__() 269 | -------------------------------------------------------------------------------- /egs/visinger2/dataset.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import string 5 | import random 6 | import numpy as np 7 | import math 8 | import json 9 | from torch.utils.data import DataLoader 10 | import torch 11 | 12 | sys.path.append('../..') 13 | from utils.audio import load_wav 14 | from text import npu 15 | 16 | class BaseDataset(torch.utils.data.Dataset): 17 | 18 | def __init__(self, hparams, fileid_list_path): 19 | self.hparams = hparams 20 | self.fileid_list = self.get_fileid_list(fileid_list_path) 21 | random.seed(hparams.train.seed) 22 | random.shuffle(self.fileid_list) 23 | if(hparams.data.n_speakers > 0): 24 | self.spk2id = hparams.data.spk2id 25 | 26 | def get_fileid_list(self, fileid_list_path): 27 | fileid_list = [] 28 | with open(fileid_list_path, 'r') as f: 29 | for line in f.readlines(): 30 | fileid_list.append(line.strip()) 31 | 32 | return fileid_list 33 | 34 | def __len__(self): 35 | return len(self.fileid_list) 36 | 37 | class SingDataset(BaseDataset): 38 | def __init__(self, hparams, data_dir, fileid_list_path, label_list_path): 39 | BaseDataset.__init__(self, hparams, os.path.join(data_dir, fileid_list_path)) 40 | self.hps = hparams 41 | 42 | with open(os.path.join(data_dir, label_list_path), "r") as in_file: 43 | self.id2label = {} 44 | for line in in_file.readlines(): 45 | fileid, txt, phones, pitchid, dur, gtdur, slur = line.split('|') 46 | self.id2label[fileid] = [phones, pitchid, dur, slur, gtdur] 47 | 48 | self.data_dir = data_dir 49 | # self.__filter__() 50 | 51 | 
def __filter__(self): 52 | new_fileid_list = [] 53 | print("before filter: ", len(self.fileid_list)) 54 | for file_id in self.fileid_list: 55 | _is_qualified = True 56 | if(not os.path.exists(os.path.join(self.label_dir, self.fileid_list[index] + '.lab')) or 57 | not os.path.exists(os.path.join(self.dur_dir, self.fileid_list[index] + '.lab')) or 58 | not os.path.exists(os.path.join(self.mel_dir, self.fileid_list[index] + '.npy')) or 59 | not os.path.exists(os.path.join(self.pitch_dir, self.fileid_list[index] + '.npy'))): 60 | _is_qualified = False 61 | if(_is_qualified): 62 | new_fileid_list.append(file_id) 63 | self.fileid_list = new_fileid_list 64 | print("after filter: ", len(self.fileid_list)) 65 | 66 | def interpolate_f0(self, data): 67 | ''' 68 | 对F0进行插值处理 69 | ''' 70 | data = np.reshape(data, (data.size, 1)) 71 | 72 | vuv_vector = np.zeros((data.size, 1),dtype=np.float32) 73 | vuv_vector[data > 0.0] = 1.0 74 | vuv_vector[data <= 0.0] = 0.0 75 | 76 | ip_data = data 77 | 78 | frame_number = data.size 79 | last_value = 0.0 80 | for i in range(frame_number): 81 | if data[i] <= 0.0: 82 | j = i + 1 83 | for j in range(i + 1, frame_number): 84 | if data[j] > 0.0: 85 | break 86 | if j < frame_number - 1: 87 | if last_value > 0.0: 88 | step = (data[j] - data[i - 1]) / float(j - i) 89 | for k in range(i, j): 90 | ip_data[k] = data[i - 1] + step * (k - i + 1) 91 | else: 92 | for k in range(i, j): 93 | ip_data[k] = data[j] 94 | else: 95 | for k in range(i, frame_number): 96 | ip_data[k] = last_value 97 | else: 98 | ip_data[i] = data[i] 99 | last_value = data[i] 100 | 101 | return ip_data, vuv_vector 102 | 103 | def parse_label(self, pho, pitchid, dur, slur, gtdur): 104 | phos = [] 105 | pitchs = [] 106 | durs = [] 107 | slurs = [] 108 | gtdurs = [] 109 | 110 | for index in range(len(pho.split())): 111 | phos.append(npu.symbol_converter.ttsing_phone_to_int[pho.strip().split()[index]]) 112 | pitchs.append(0) 113 | durs.append(0) 114 | slurs.append(0) 115 | gtdurs.append(float(gtdur.strip().split()[index])) 116 | 117 | phos = np.asarray(phos, dtype=np.int32) 118 | pitchs = np.asarray(pitchs, dtype=np.int32) 119 | durs = np.asarray(durs, dtype=np.float32) 120 | slurs = np.asarray(slurs, dtype=np.int32) 121 | gtdurs = np.asarray(gtdurs, dtype=np.float32) 122 | 123 | acc_duration = np.cumsum(gtdurs) 124 | acc_duration = np.pad(acc_duration, (1, 0), 'constant', constant_values=(0,)) 125 | acc_duration_frames = np.ceil(acc_duration / (self.hps.data.hop_size / self.hps.data.sample_rate)) 126 | gtdurs = acc_duration_frames[1:] - acc_duration_frames[:-1] 127 | 128 | phos = torch.LongTensor(phos) 129 | pitchs = torch.LongTensor(pitchs) 130 | durs = torch.FloatTensor(durs) 131 | slurs = torch.LongTensor(slurs) 132 | gtdurs = torch.LongTensor(gtdurs) 133 | return phos, pitchs, durs, slurs, gtdurs 134 | 135 | def __getitem__(self, index): 136 | 137 | pho, pitchid, dur, slur, gtdur = self.id2label[self.fileid_list[index]] 138 | pho, pitchid, dur, slur, gtdur = self.parse_label(pho, pitchid, dur, slur, gtdur) 139 | sum_dur = gtdur.sum() 140 | spk, fileid = self.fileid_list[index].split("/") 141 | spkid = self.spk2id[spk] 142 | mel = np.load(os.path.join(self.data_dir, spk, "mels", fileid + '.npy')) 143 | if mel.shape[0] <150: 144 | print("drop short audio:", self.fileid_list[index]) 145 | return None 146 | assert mel.shape[1] == 80 147 | if(mel.shape[0] != sum_dur): 148 | if(abs(mel.shape[0] - sum_dur) > 3): 149 | print("dataset error mel: ",mel.shape, sum_dur) 150 | return None 151 | if(mel.shape[0] > 
sum_dur): 152 | mel = mel[:sum_dur] 153 | else: 154 | mel = np.concatenate([mel, mel.min() * np.ones([sum_dur - mel.shape[0], self.hps.data.acoustic_dim])], axis=0) 155 | mel = torch.FloatTensor(mel).transpose(0, 1) 156 | 157 | f0 = np.load(os.path.join(self.data_dir, spk, "pitch", fileid + '.npy')).reshape([-1]) 158 | f0, _ = self.interpolate_f0(f0) 159 | f0 = f0.reshape([-1]) 160 | if(f0.shape[0] != sum_dur): 161 | if(abs(f0.shape[0] - sum_dur) > 3): 162 | print("dataset error f0 : ",f0.shape, sum_dur) 163 | return None 164 | if(f0.shape[0] > sum_dur): 165 | f0 = f0[:sum_dur] 166 | else: 167 | f0 = np.concatenate([f0, np.zeros([sum_dur - f0.shape[0]])], axis=0) 168 | f0 = torch.FloatTensor(f0).reshape([1, -1]) 169 | 170 | wav = load_wav(os.path.join(self.data_dir, spk, "wavs", fileid + '.wav'), 171 | raw_sr=self.hparams.data.sample_rate, 172 | target_sr=self.hparams.data.sample_rate, 173 | win_size=self.hparams.data.win_size, 174 | hop_size=self.hparams.data.hop_size) 175 | wav = wav.reshape(-1) 176 | if(wav.shape[0] != sum_dur * self.hparams.data.hop_size): 177 | if(abs(wav.shape[0] - sum_dur * self.hparams.data.hop_size) > 3 * self.hparams.data.hop_size): 178 | print("dataset error wav : ", wav.shape, sum_dur) 179 | return None 180 | if(wav.shape[0] > sum_dur * self.hparams.data.hop_size): 181 | wav = wav[:sum_dur * self.hparams.data.hop_size] 182 | else: 183 | wav = np.concatenate([wav, np.zeros([sum_dur * self.hparams.data.hop_size - wav.shape[0]])], axis=0) 184 | wav = torch.FloatTensor(wav).reshape([1, -1]) 185 | 186 | return pho, pitchid, dur, slur, gtdur, mel, f0, wav, spkid 187 | 188 | 189 | class SingCollate(): 190 | 191 | def __init__(self, hparams): 192 | self.hparams = hparams 193 | self.mel_dim = self.hparams.data.acoustic_dim 194 | 195 | def __call__(self, batch): 196 | 197 | batch = [b for b in batch if b is not None] 198 | 199 | input_lengths, ids_sorted_decreasing = torch.sort( 200 | torch.LongTensor([len(x[0]) for x in batch]), 201 | dim=0, descending=True) 202 | 203 | max_phone_len = max([len(x[0]) for x in batch]) 204 | max_pitchid_len = max([len(x[1]) for x in batch]) 205 | max_dur_len = max([len(x[2]) for x in batch]) 206 | max_slur_len = max([len(x[3]) for x in batch]) 207 | max_gtdur_len = max([len(x[4]) for x in batch]) 208 | max_mel_len = max([x[5].size(1) for x in batch]) 209 | max_f0_len = max([x[6].size(1) for x in batch]) 210 | max_wav_len = max([x[7].size(1) for x in batch]) 211 | 212 | phone_lengths = torch.LongTensor(len(batch)) 213 | pitchid_lengths = torch.LongTensor(len(batch)) 214 | dur_lengths = torch.LongTensor(len(batch)) 215 | slur_lengths = torch.LongTensor(len(batch)) 216 | gtdur_lengths = torch.LongTensor(len(batch)) 217 | mel_lengths = torch.LongTensor(len(batch)) 218 | f0_lengths = torch.LongTensor(len(batch)) 219 | wav_lengths = torch.LongTensor(len(batch)) 220 | 221 | phone_padded = torch.LongTensor(len(batch), max_phone_len) 222 | pitchid_padded = torch.LongTensor(len(batch), max_pitchid_len) 223 | dur_padded = torch.FloatTensor(len(batch), max_dur_len) 224 | slur_padded = torch.LongTensor(len(batch), max_slur_len) 225 | gtdur_padded = torch.LongTensor(len(batch), 1, max_gtdur_len) 226 | mel_padded = torch.FloatTensor(len(batch), self.hparams.data.acoustic_dim, max_mel_len) 227 | f0_padded = torch.FloatTensor(len(batch), 1, max_f0_len) 228 | wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) 229 | spkids = torch.LongTensor(len(batch)) 230 | 231 | phone_padded.zero_() 232 | pitchid_padded.zero_() 233 | dur_padded.zero_() 234 | 
slur_padded.zero_() 235 | gtdur_padded.zero_() 236 | mel_padded.zero_() 237 | f0_padded.zero_() 238 | wav_padded.zero_() 239 | 240 | for i in range(len(ids_sorted_decreasing)): 241 | row = batch[ids_sorted_decreasing[i]] 242 | 243 | phone = row[0] 244 | phone_padded[i, :phone.size(0)] = phone 245 | phone_lengths[i] = phone.size(0) 246 | 247 | pitchid = row[1] 248 | pitchid_padded[i, :pitchid.size(0)] = pitchid 249 | pitchid_lengths[i] = pitchid.size(0) 250 | 251 | dur = row[2] 252 | dur_padded[i, :dur.size(0)] = dur 253 | dur_lengths[i] = dur.size(0) 254 | 255 | slur = row[3] 256 | slur_padded[i, :slur.size(0)] = slur 257 | slur_lengths[i] = slur.size(0) 258 | 259 | gtdur = row[4] 260 | gtdur_padded[i, :, :gtdur.size(0)] = gtdur 261 | gtdur_lengths[i] = gtdur.size(0) 262 | 263 | mel = row[5] 264 | mel_padded[i, :, :mel.size(1)] = mel 265 | mel_lengths[i] = mel.size(1) 266 | 267 | f0 = row[6] 268 | f0_padded[i, :, :f0.size(1)] = f0 269 | f0_lengths[i] = f0.size(1) 270 | 271 | wav = row[7] 272 | wav_padded[i, :, :wav.size(1)] = wav 273 | wav_lengths[i] = wav.size(1) 274 | 275 | spkids[i] = row[8] 276 | 277 | data_dict = {} 278 | data_dict["phone"] = phone_padded 279 | data_dict["phone_lengths"] = phone_lengths 280 | data_dict["pitchid"] = pitchid_padded 281 | data_dict["dur"] = dur_padded 282 | data_dict["slur"] = slur_padded 283 | data_dict["gtdur"] = gtdur_padded 284 | data_dict["mel"] = mel_padded 285 | data_dict["f0"] = f0_padded 286 | data_dict["wav"] = wav_padded 287 | 288 | data_dict["mel_lengths"] = mel_lengths 289 | data_dict["f0_lengths"] = f0_lengths 290 | data_dict["wav_lengths"] = wav_lengths 291 | data_dict["spkid"] = spkids 292 | 293 | return data_dict 294 | 295 | 296 | class DatasetConstructor(): 297 | 298 | def __init__(self, hparams, num_replicas=1, rank=1): 299 | self.hparams = hparams 300 | self.num_replicas = num_replicas 301 | self.rank = rank 302 | self.dataset_function = {"SingDataset": SingDataset} 303 | self.collate_function = {"SingCollate": SingCollate} 304 | self._get_components() 305 | 306 | def _get_components(self): 307 | self._init_datasets() 308 | self._init_collate() 309 | self._init_data_loaders() 310 | 311 | def _init_datasets(self): 312 | self._train_dataset = self.dataset_function[self.hparams.data.dataset_type](self.hparams, self.hparams.data.data_dir, self.hparams.data.training_filelist, self.hparams.data.training_labellist) 313 | self._valid_dataset = self.dataset_function[self.hparams.data.dataset_type](self.hparams, self.hparams.data.data_dir, self.hparams.data.validation_filelist, self.hparams.data.validation_labellist) 314 | 315 | def _init_collate(self): 316 | self._collate_fn = self.collate_function[self.hparams.data.collate_type](self.hparams) 317 | 318 | def _init_data_loaders(self): 319 | train_sampler = torch.utils.data.distributed.DistributedSampler(self._train_dataset, num_replicas=self.num_replicas, rank=self.rank, shuffle=True) 320 | 321 | self.train_loader = DataLoader(self._train_dataset, num_workers=4, shuffle=False, 322 | batch_size=self.hparams.train.batch_size, pin_memory=True, 323 | drop_last=True, collate_fn=self._collate_fn, sampler=train_sampler) 324 | 325 | self.valid_loader = DataLoader(self._valid_dataset, num_workers=1, shuffle=False, 326 | batch_size=1, pin_memory=True, 327 | drop_last=True, collate_fn=self._collate_fn) 328 | 329 | def get_train_loader(self): 330 | return self.train_loader 331 | 332 | def get_valid_loader(self): 333 | return self.valid_loader 334 | 335 | 
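
The `interpolate_f0` helper above fills unvoiced (zero) frames by linear interpolation and also returns a voiced/unvoiced mask. A simplified, self-contained sketch of the same idea (not the exact implementation, which walks the frames with an explicit loop):

```python
import numpy as np

def interpolate_f0_simple(f0: np.ndarray):
    """Fill f0 == 0 frames by linear interpolation; also return a voiced mask."""
    f0 = f0.astype(np.float32).copy()
    vuv = (f0 > 0).astype(np.float32)
    voiced_idx = np.where(f0 > 0)[0]
    if len(voiced_idx) == 0:
        return f0, vuv                       # nothing to interpolate
    interp = np.interp(np.arange(len(f0)), voiced_idx, f0[voiced_idx])
    return interp.astype(np.float32), vuv

f0 = np.array([0., 220., 0., 0., 233., 0.])
print(interpolate_f0_simple(f0)[0])          # [220. 220. 224.33 228.67 233. 233.]
```
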
-------------------------------------------------------------------------------- /modules/modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | from torch.autograd import Function 9 | from typing import Any, Optional, Tuple 10 | 11 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 12 | from torch.nn.utils import weight_norm, remove_weight_norm 13 | 14 | import modules.commons as commons 15 | import modules.attentions as attentions 16 | from modules.commons import init_weights, get_padding 17 | from modules.transforms import piecewise_rational_quadratic_transform 18 | 19 | 20 | LRELU_SLOPE = 0.1 21 | 22 | 23 | class LayerNorm(nn.Module): 24 | def __init__(self, channels, eps=1e-5): 25 | super().__init__() 26 | self.channels = channels 27 | self.eps = eps 28 | 29 | self.gamma = nn.Parameter(torch.ones(channels)) 30 | self.beta = nn.Parameter(torch.zeros(channels)) 31 | 32 | def forward(self, x): 33 | x = x.transpose(1, -1) 34 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 35 | return x.transpose(1, -1) 36 | 37 | 38 | class ConvReluNorm(nn.Module): 39 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 40 | super().__init__() 41 | self.in_channels = in_channels 42 | self.hidden_channels = hidden_channels 43 | self.out_channels = out_channels 44 | self.kernel_size = kernel_size 45 | self.n_layers = n_layers 46 | self.p_dropout = p_dropout 47 | assert n_layers > 1, "Number of layers should be larger than 0." 48 | 49 | self.conv_layers = nn.ModuleList() 50 | self.norm_layers = nn.ModuleList() 51 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 52 | self.norm_layers.append(LayerNorm(hidden_channels)) 53 | self.relu_drop = nn.Sequential( 54 | nn.ReLU(), 55 | nn.Dropout(p_dropout)) 56 | for _ in range(n_layers-1): 57 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 58 | self.norm_layers.append(LayerNorm(hidden_channels)) 59 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 60 | self.proj.weight.data.zero_() 61 | self.proj.bias.data.zero_() 62 | 63 | def forward(self, x, x_mask): 64 | x_org = x 65 | for i in range(self.n_layers): 66 | x = self.conv_layers[i](x * x_mask) 67 | x = self.norm_layers[i](x) 68 | x = self.relu_drop(x) 69 | x = x_org + self.proj(x) 70 | return x * x_mask 71 | 72 | 73 | class DDSConv(nn.Module): 74 | """ 75 | Dialted and Depth-Separable Convolution 76 | """ 77 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 78 | super().__init__() 79 | self.channels = channels 80 | self.kernel_size = kernel_size 81 | self.n_layers = n_layers 82 | self.p_dropout = p_dropout 83 | 84 | self.drop = nn.Dropout(p_dropout) 85 | self.convs_sep = nn.ModuleList() 86 | self.convs_1x1 = nn.ModuleList() 87 | self.norms_1 = nn.ModuleList() 88 | self.norms_2 = nn.ModuleList() 89 | for i in range(n_layers): 90 | dilation = kernel_size ** i 91 | padding = (kernel_size * dilation - dilation) // 2 92 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 93 | groups=channels, dilation=dilation, padding=padding 94 | )) 95 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 96 | self.norms_1.append(LayerNorm(channels)) 97 | self.norms_2.append(LayerNorm(channels)) 98 | 99 
| def forward(self, x, x_mask, g=None): 100 | if g is not None: 101 | x = x + g 102 | for i in range(self.n_layers): 103 | y = self.convs_sep[i](x * x_mask) 104 | y = self.norms_1[i](y) 105 | y = F.gelu(y) 106 | y = self.convs_1x1[i](y) 107 | y = self.norms_2[i](y) 108 | y = F.gelu(y) 109 | y = self.drop(y) 110 | x = x + y 111 | return x * x_mask 112 | 113 | 114 | class WN(torch.nn.Module): 115 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=0, spk_channels=0, p_dropout=0): 116 | super(WN, self).__init__() 117 | assert(kernel_size % 2 == 1) 118 | self.hidden_channels =hidden_channels 119 | self.kernel_size = kernel_size, 120 | self.dilation_rate = dilation_rate 121 | self.n_layers = n_layers 122 | self.n_speakers = n_speakers 123 | self.spk_channels = spk_channels 124 | self.p_dropout = p_dropout 125 | 126 | self.in_layers = torch.nn.ModuleList() 127 | self.res_skip_layers = torch.nn.ModuleList() 128 | self.drop = nn.Dropout(p_dropout) 129 | 130 | if n_speakers > 0: 131 | cond_layer = torch.nn.Conv1d(spk_channels, 2*hidden_channels*n_layers, 1) 132 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 133 | 134 | for i in range(n_layers): 135 | dilation = dilation_rate ** i 136 | padding = int((kernel_size * dilation - dilation) / 2) 137 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 138 | dilation=dilation, padding=padding) 139 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 140 | self.in_layers.append(in_layer) 141 | 142 | # last one is not necessary 143 | if i < n_layers - 1: 144 | res_skip_channels = 2 * hidden_channels 145 | else: 146 | res_skip_channels = hidden_channels 147 | 148 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 149 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 150 | self.res_skip_layers.append(res_skip_layer) 151 | 152 | def forward(self, x, x_mask, g=None, **kwargs): 153 | output = torch.zeros_like(x) 154 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 155 | 156 | if g is not None: 157 | g = self.cond_layer(g) 158 | 159 | for i in range(self.n_layers): 160 | x_in = self.in_layers[i](x) 161 | if g is not None: 162 | cond_offset = i * 2 * self.hidden_channels 163 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 164 | else: 165 | g_l = torch.zeros_like(x_in) 166 | 167 | acts = commons.fused_add_tanh_sigmoid_multiply( 168 | x_in, 169 | g_l, 170 | n_channels_tensor) 171 | acts = self.drop(acts) 172 | 173 | res_skip_acts = self.res_skip_layers[i](acts) 174 | if i < self.n_layers - 1: 175 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 176 | x = (x + res_acts) * x_mask 177 | output = output + res_skip_acts[:,self.hidden_channels:,:] 178 | else: 179 | output = output + res_skip_acts 180 | return output * x_mask 181 | 182 | def remove_weight_norm(self): 183 | if self.n_speakers > 0: 184 | torch.nn.utils.remove_weight_norm(self.cond_layer) 185 | for l in self.in_layers: 186 | torch.nn.utils.remove_weight_norm(l) 187 | for l in self.res_skip_layers: 188 | torch.nn.utils.remove_weight_norm(l) 189 | 190 | 191 | class ResBlock1(torch.nn.Module): 192 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 193 | super(ResBlock1, self).__init__() 194 | self.convs1 = nn.ModuleList([ 195 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 196 | padding=get_padding(kernel_size, dilation[0]))), 197 | weight_norm(Conv1d(channels, channels, kernel_size, 1, 
dilation=dilation[1], 198 | padding=get_padding(kernel_size, dilation[1]))), 199 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 200 | padding=get_padding(kernel_size, dilation[2]))) 201 | ]) 202 | self.convs1.apply(init_weights) 203 | 204 | self.convs2 = nn.ModuleList([ 205 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 206 | padding=get_padding(kernel_size, 1))), 207 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 208 | padding=get_padding(kernel_size, 1))), 209 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 210 | padding=get_padding(kernel_size, 1))) 211 | ]) 212 | self.convs2.apply(init_weights) 213 | 214 | def forward(self, x, x_mask=None): 215 | for c1, c2 in zip(self.convs1, self.convs2): 216 | xt = F.leaky_relu(x, LRELU_SLOPE) 217 | if x_mask is not None: 218 | xt = xt * x_mask 219 | xt = c1(xt) 220 | xt = F.leaky_relu(xt, LRELU_SLOPE) 221 | if x_mask is not None: 222 | xt = xt * x_mask 223 | xt = c2(xt) 224 | x = xt + x 225 | if x_mask is not None: 226 | x = x * x_mask 227 | return x 228 | 229 | def remove_weight_norm(self): 230 | for l in self.convs1: 231 | remove_weight_norm(l) 232 | for l in self.convs2: 233 | remove_weight_norm(l) 234 | 235 | 236 | class ResBlock2(torch.nn.Module): 237 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 238 | super(ResBlock2, self).__init__() 239 | self.convs = nn.ModuleList([ 240 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 241 | padding=get_padding(kernel_size, dilation[0]))), 242 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 243 | padding=get_padding(kernel_size, dilation[1]))) 244 | ]) 245 | self.convs.apply(init_weights) 246 | 247 | def forward(self, x, x_mask=None): 248 | for c in self.convs: 249 | xt = F.leaky_relu(x, LRELU_SLOPE) 250 | if x_mask is not None: 251 | xt = xt * x_mask 252 | xt = c(xt) 253 | x = xt + x 254 | if x_mask is not None: 255 | x = x * x_mask 256 | return x 257 | 258 | def remove_weight_norm(self): 259 | for l in self.convs: 260 | remove_weight_norm(l) 261 | 262 | 263 | class Log(nn.Module): 264 | def forward(self, x, x_mask, reverse=False, **kwargs): 265 | if not reverse: 266 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 267 | logdet = torch.sum(-y, [1, 2]) 268 | return y, logdet 269 | else: 270 | x = torch.exp(x) * x_mask 271 | return x 272 | 273 | 274 | class Flip(nn.Module): 275 | def forward(self, x, *args, reverse=False, **kwargs): 276 | x = torch.flip(x, [1]) 277 | if not reverse: 278 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 279 | return x, logdet 280 | else: 281 | return x 282 | 283 | 284 | class ElementwiseAffine(nn.Module): 285 | def __init__(self, channels): 286 | super().__init__() 287 | self.channels = channels 288 | self.m = nn.Parameter(torch.zeros(channels,1)) 289 | self.logs = nn.Parameter(torch.zeros(channels,1)) 290 | 291 | def forward(self, x, x_mask, reverse=False, **kwargs): 292 | if not reverse: 293 | y = self.m + torch.exp(self.logs) * x 294 | y = y * x_mask 295 | logdet = torch.sum(self.logs * x_mask, [1,2]) 296 | return y, logdet 297 | else: 298 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 299 | return x 300 | 301 | 302 | class ResidualCouplingLayer(nn.Module): 303 | def __init__(self, 304 | channels, 305 | hidden_channels, 306 | kernel_size, 307 | dilation_rate, 308 | n_layers, 309 | p_dropout=0, 310 | n_speakers=0, 311 | spk_channels=0, 312 | mean_only=False): 313 | assert 
channels % 2 == 0, "channels should be divisible by 2" 314 | super().__init__() 315 | self.channels = channels 316 | self.hidden_channels = hidden_channels 317 | self.kernel_size = kernel_size 318 | self.dilation_rate = dilation_rate 319 | self.n_layers = n_layers 320 | self.half_channels = channels // 2 321 | self.mean_only = mean_only 322 | 323 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 324 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, n_speakers=n_speakers, spk_channels=spk_channels) 325 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 326 | self.post.weight.data.zero_() 327 | self.post.bias.data.zero_() 328 | 329 | def forward(self, x, x_mask, g=None, reverse=False): 330 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 331 | h = self.pre(x0) * x_mask 332 | h = self.enc(h, x_mask, g=g) 333 | stats = self.post(h) * x_mask 334 | if not self.mean_only: 335 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 336 | else: 337 | m = stats 338 | logs = torch.zeros_like(m) 339 | 340 | if not reverse: 341 | x1 = m + x1 * torch.exp(logs) * x_mask 342 | x = torch.cat([x0, x1], 1) 343 | logdet = torch.sum(logs, [1,2]) 344 | return x, logdet 345 | else: 346 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 347 | x = torch.cat([x0, x1], 1) 348 | return x 349 | 350 | class ResidualCouplingBlock(nn.Module): 351 | def __init__(self, 352 | channels, 353 | hidden_channels, 354 | kernel_size, 355 | dilation_rate, 356 | n_layers, 357 | n_flows=4, 358 | n_speakers=0, 359 | gin_channels=0): 360 | super().__init__() 361 | self.channels = channels 362 | self.hidden_channels = hidden_channels 363 | self.kernel_size = kernel_size 364 | self.dilation_rate = dilation_rate 365 | self.n_layers = n_layers 366 | self.n_flows = n_flows 367 | self.gin_channels = gin_channels 368 | 369 | self.flows = nn.ModuleList() 370 | for i in range(n_flows): 371 | self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=n_speakers, spk_channels=gin_channels, mean_only=True)) 372 | self.flows.append(Flip()) 373 | 374 | def forward(self, x, x_mask, g=None, reverse=False): 375 | if not reverse: 376 | for flow in self.flows: 377 | x, _ = flow(x, x_mask, g=g, reverse=reverse) 378 | else: 379 | for flow in reversed(self.flows): 380 | x = flow(x, x_mask, g=g, reverse=reverse) 381 | return x 382 | 383 | 384 | class ConvFlow(nn.Module): 385 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 386 | super().__init__() 387 | self.in_channels = in_channels 388 | self.filter_channels = filter_channels 389 | self.kernel_size = kernel_size 390 | self.n_layers = n_layers 391 | self.num_bins = num_bins 392 | self.tail_bound = tail_bound 393 | self.half_channels = in_channels // 2 394 | 395 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 396 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 397 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 398 | self.proj.weight.data.zero_() 399 | self.proj.bias.data.zero_() 400 | 401 | def forward(self, x, x_mask, g=None, reverse=False): 402 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 403 | h = self.pre(x0) 404 | h = self.convs(h, x_mask, g=g) 405 | h = self.proj(h) * x_mask 406 | 407 | b, c, t = x0.shape 408 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
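A minimal usage sketch for the coupling layer defined above, checking that the reverse pass inverts the forward pass; it assumes the repository root is on `PYTHONPATH` and uses toy shapes:

```python
import torch
from modules.modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=4, hidden_channels=8, kernel_size=5,
                              dilation_rate=1, n_layers=2, mean_only=True)
x = torch.randn(1, 4, 16)            # [batch, channels, frames]
x_mask = torch.ones(1, 1, 16)        # all frames valid

z, logdet = layer(x, x_mask)         # forward direction returns (z, log|det J|)
x_rec = layer(z, x_mask, reverse=True)
print(torch.allclose(x, x_rec, atol=1e-5))   # True: the flow step is invertible
```
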
409 | 410 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 411 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 412 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 413 | 414 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 415 | unnormalized_widths, 416 | unnormalized_heights, 417 | unnormalized_derivatives, 418 | inverse=reverse, 419 | tails='linear', 420 | tail_bound=self.tail_bound 421 | ) 422 | 423 | x = torch.cat([x0, x1], 1) * x_mask 424 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 425 | if not reverse: 426 | return x, logdet 427 | else: 428 | return x 429 | 430 | 431 | class ResStack(nn.Module): 432 | def __init__(self, channel, kernel_size=3, base=3, nums=4): 433 | super(ResStack, self).__init__() 434 | 435 | self.layers = nn.ModuleList([ 436 | nn.Sequential( 437 | nn.LeakyReLU(), 438 | nn.utils.weight_norm(nn.Conv1d(channel, channel, 439 | kernel_size=kernel_size, dilation=base**i, padding=base**i)), 440 | nn.LeakyReLU(), 441 | nn.utils.weight_norm(nn.Conv1d(channel, channel, 442 | kernel_size=kernel_size, dilation=1, padding=1)), 443 | ) 444 | for i in range(nums) 445 | ]) 446 | 447 | def forward(self, x): 448 | for layer in self.layers: 449 | x = x + layer(x) 450 | return x 451 | -------------------------------------------------------------------------------- /modules/attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import modules.commons as commons 9 | 10 | 11 | class LayerNorm(nn.Module): 12 | def __init__(self, channels, eps=1e-5): 13 | super().__init__() 14 | self.channels = channels 15 | self.eps = eps 16 | 17 | self.gamma = nn.Parameter(torch.ones(channels)) 18 | self.beta = nn.Parameter(torch.zeros(channels)) 19 | 20 | def forward(self, x): 21 | x = x.transpose(1, -1) 22 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 23 | return x.transpose(1, -1) 24 | 25 | 26 | class Encoder(nn.Module): 27 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 28 | super().__init__() 29 | self.hidden_channels = hidden_channels 30 | self.filter_channels = filter_channels 31 | self.n_heads = n_heads 32 | self.n_layers = n_layers 33 | self.kernel_size = kernel_size 34 | self.p_dropout = p_dropout 35 | self.window_size = window_size 36 | 37 | self.drop = nn.Dropout(p_dropout) 38 | self.attn_layers = nn.ModuleList() 39 | self.norm_layers_1 = nn.ModuleList() 40 | self.ffn_layers = nn.ModuleList() 41 | self.norm_layers_2 = nn.ModuleList() 42 | for i in range(self.n_layers): 43 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 44 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 45 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 46 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 47 | 48 | def forward(self, x, x_mask): 49 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 50 | x = x * x_mask 51 | for i in range(self.n_layers): 52 | y = self.attn_layers[i](x, x, attn_mask) 53 | y = self.drop(y) 54 | x = self.norm_layers_1[i](x + y) 55 | 56 | y = self.ffn_layers[i](x, x_mask) 57 | y = self.drop(y) 58 | x = self.norm_layers_2[i](x + y) 59 | x 
= x * x_mask 60 | return x 61 | 62 | class Decoder(nn.Module): 63 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 64 | super().__init__() 65 | self.hidden_channels = hidden_channels 66 | self.filter_channels = filter_channels 67 | self.n_heads = n_heads 68 | self.n_layers = n_layers 69 | self.kernel_size = kernel_size 70 | self.p_dropout = p_dropout 71 | self.proximal_bias = proximal_bias 72 | self.proximal_init = proximal_init 73 | 74 | self.drop = nn.Dropout(p_dropout) 75 | self.self_attn_layers = nn.ModuleList() 76 | self.norm_layers_0 = nn.ModuleList() 77 | self.encdec_attn_layers = nn.ModuleList() 78 | self.norm_layers_1 = nn.ModuleList() 79 | self.ffn_layers = nn.ModuleList() 80 | self.norm_layers_2 = nn.ModuleList() 81 | for i in range(self.n_layers): 82 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 83 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 84 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 85 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 86 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 87 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 88 | 89 | def forward(self, x, x_mask, h, h_mask): 90 | """ 91 | x: decoder input 92 | h: encoder output 93 | """ 94 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 95 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 96 | x = x * x_mask 97 | for i in range(self.n_layers): 98 | y = self.self_attn_layers[i](x, x, self_attn_mask) 99 | y = self.drop(y) 100 | x = self.norm_layers_0[i](x + y) 101 | 102 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 103 | y = self.drop(y) 104 | x = self.norm_layers_1[i](x + y) 105 | 106 | y = self.ffn_layers[i](x, x_mask) 107 | y = self.drop(y) 108 | x = self.norm_layers_2[i](x + y) 109 | x = x * x_mask 110 | return x 111 | 112 | class FFT(nn.Module): 113 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 114 | super().__init__() 115 | self.hidden_channels = hidden_channels 116 | self.filter_channels = filter_channels 117 | self.n_heads = n_heads 118 | self.n_layers = n_layers 119 | self.kernel_size = kernel_size 120 | self.p_dropout = p_dropout 121 | self.proximal_bias = proximal_bias 122 | self.proximal_init = proximal_init 123 | 124 | self.drop = nn.Dropout(p_dropout) 125 | self.self_attn_layers = nn.ModuleList() 126 | self.norm_layers_0 = nn.ModuleList() 127 | self.ffn_layers = nn.ModuleList() 128 | self.norm_layers_1 = nn.ModuleList() 129 | for i in range(self.n_layers): 130 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 131 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 132 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 133 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 134 | 135 | def forward(self, x, x_mask): 136 | """ 137 | x: decoder input 138 | h: encoder output 139 | """ 140 | self_attn_mask = 
commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 141 | x = x * x_mask 142 | for i in range(self.n_layers): 143 | y = self.self_attn_layers[i](x, x, self_attn_mask) 144 | y = self.drop(y) 145 | x = self.norm_layers_0[i](x + y) 146 | 147 | y = self.ffn_layers[i](x, x_mask) 148 | y = self.drop(y) 149 | x = self.norm_layers_1[i](x + y) 150 | x = x * x_mask 151 | return x 152 | 153 | 154 | class FFNs(nn.Module): 155 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 156 | super().__init__() 157 | self.hidden_channels = hidden_channels 158 | self.filter_channels = filter_channels 159 | self.n_heads = n_heads 160 | self.n_layers = n_layers 161 | self.kernel_size = kernel_size 162 | self.p_dropout = p_dropout 163 | self.proximal_bias = proximal_bias 164 | self.proximal_init = proximal_init 165 | 166 | self.drop = nn.Dropout(p_dropout) 167 | #self.self_attn_layers = nn.ModuleList() 168 | #self.norm_layers_0 = nn.ModuleList() 169 | self.ffn_layers = nn.ModuleList() 170 | self.norm_layers_1 = nn.ModuleList() 171 | for i in range(self.n_layers): 172 | #self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 173 | #self.norm_layers_0.append(LayerNorm(hidden_channels)) 174 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 175 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 176 | 177 | def forward(self, x, x_mask): 178 | """ 179 | x: decoder input 180 | h: encoder output 181 | """ 182 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 183 | x = x * x_mask 184 | for i in range(self.n_layers): 185 | #y = self.self_attn_layers[i](x, x, self_attn_mask) 186 | #y = self.drop(y) 187 | #x = self.norm_layers_0[i](x + y) 188 | 189 | y = self.ffn_layers[i](x, x_mask) 190 | y = self.drop(y) 191 | x = self.norm_layers_1[i](x + y) 192 | x = x * x_mask 193 | return x 194 | 195 | class MultiHeadAttention(nn.Module): 196 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 197 | super().__init__() 198 | assert channels % n_heads == 0 199 | 200 | self.channels = channels 201 | self.out_channels = out_channels 202 | self.n_heads = n_heads 203 | self.p_dropout = p_dropout 204 | self.window_size = window_size 205 | self.heads_share = heads_share 206 | self.block_length = block_length 207 | self.proximal_bias = proximal_bias 208 | self.proximal_init = proximal_init 209 | self.attn = None 210 | 211 | self.k_channels = channels // n_heads 212 | self.conv_q = nn.Conv1d(channels, channels, 1) 213 | self.conv_k = nn.Conv1d(channels, channels, 1) 214 | self.conv_v = nn.Conv1d(channels, channels, 1) 215 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 216 | self.drop = nn.Dropout(p_dropout) 217 | 218 | if window_size is not None: 219 | n_heads_rel = 1 if heads_share else n_heads 220 | rel_stddev = self.k_channels**-0.5 221 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 222 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 223 | 224 | nn.init.xavier_uniform_(self.conv_q.weight) 225 | nn.init.xavier_uniform_(self.conv_k.weight) 226 
| nn.init.xavier_uniform_(self.conv_v.weight) 227 | if proximal_init: 228 | with torch.no_grad(): 229 | self.conv_k.weight.copy_(self.conv_q.weight) 230 | self.conv_k.bias.copy_(self.conv_q.bias) 231 | 232 | def forward(self, x, c, attn_mask=None): 233 | q = self.conv_q(x) 234 | k = self.conv_k(c) 235 | v = self.conv_v(c) 236 | 237 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 238 | 239 | x = self.conv_o(x) 240 | return x 241 | 242 | def attention(self, query, key, value, mask=None): 243 | # reshape [b, d, t] -> [b, n_h, t, d_k] 244 | b, d, t_s, t_t = (*key.size(), query.size(2)) 245 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 246 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 247 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 248 | 249 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 250 | if self.window_size is not None: 251 | assert t_s == t_t, "Relative attention is only available for self-attention." 252 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 253 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 254 | scores_local = self._relative_position_to_absolute_position(rel_logits) 255 | scores = scores + scores_local 256 | if self.proximal_bias: 257 | assert t_s == t_t, "Proximal bias is only available for self-attention." 258 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 259 | if mask is not None: 260 | scores = scores.masked_fill(mask == 0, -1e4) 261 | if self.block_length is not None: 262 | assert t_s == t_t, "Local attention is only available for self-attention." 263 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 264 | scores = scores.masked_fill(block_mask == 0, -1e4) 265 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 266 | p_attn = self.drop(p_attn) 267 | output = torch.matmul(p_attn, value) 268 | if self.window_size is not None: 269 | relative_weights = self._absolute_position_to_relative_position(p_attn) 270 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 271 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 272 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 273 | return output, p_attn 274 | 275 | def _matmul_with_relative_values(self, x, y): 276 | """ 277 | x: [b, h, l, m] 278 | y: [h or 1, m, d] 279 | ret: [b, h, l, d] 280 | """ 281 | ret = torch.matmul(x, y.unsqueeze(0)) 282 | return ret 283 | 284 | def _matmul_with_relative_keys(self, x, y): 285 | """ 286 | x: [b, h, l, d] 287 | y: [h or 1, m, d] 288 | ret: [b, h, l, m] 289 | """ 290 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 291 | return ret 292 | 293 | def _get_relative_embeddings(self, relative_embeddings, length): 294 | max_relative_position = 2 * self.window_size + 1 295 | # Pad first before slice to avoid using cond ops. 
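
A minimal usage sketch for the windowed relative-position attention in this file (the `Encoder` defined earlier wraps `MultiHeadAttention` and `FFN`); the shapes are illustrative and the repository root is assumed to be on `PYTHONPATH`. With `window_size=4` and 50 frames, the relative-embedding padding branch below is exercised:

```python
import torch
from modules.attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=4, kernel_size=3, p_dropout=0.1, window_size=4)
x = torch.randn(2, 192, 50)          # [batch, hidden_channels, frames]
x_mask = torch.ones(2, 1, 50)        # 1 = valid frame, 0 = padding
y = enc(x, x_mask)                   # output keeps the input shape: [2, 192, 50]
print(y.shape)
```
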
296 | pad_length = max(length - (self.window_size + 1), 0) 297 | slice_start_position = max((self.window_size + 1) - length, 0) 298 | slice_end_position = slice_start_position + 2 * length - 1 299 | if pad_length > 0: 300 | padded_relative_embeddings = F.pad( 301 | relative_embeddings, 302 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 303 | else: 304 | padded_relative_embeddings = relative_embeddings 305 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 306 | return used_relative_embeddings 307 | 308 | def _relative_position_to_absolute_position(self, x): 309 | """ 310 | x: [b, h, l, 2*l-1] 311 | ret: [b, h, l, l] 312 | """ 313 | batch, heads, length, _ = x.size() 314 | # Concat columns of pad to shift from relative to absolute indexing. 315 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 316 | 317 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 318 | x_flat = x.view([batch, heads, length * 2 * length]) 319 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 320 | 321 | # Reshape and slice out the padded elements. 322 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 323 | return x_final 324 | 325 | def _absolute_position_to_relative_position(self, x): 326 | """ 327 | x: [b, h, l, l] 328 | ret: [b, h, l, 2*l-1] 329 | """ 330 | batch, heads, length, _ = x.size() 331 | # padd along column 332 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 333 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 334 | # add 0's in the beginning that will skew the elements after reshape 335 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 336 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 337 | return x_final 338 | 339 | def _attention_bias_proximal(self, length): 340 | """Bias for self-attention to encourage attention to close positions. 341 | Args: 342 | length: an integer scalar. 
343 | Returns: 344 | a Tensor with shape [1, 1, length, length] 345 | """ 346 | r = torch.arange(length, dtype=torch.float32) 347 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 348 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 349 | 350 | 351 | class FFN(nn.Module): 352 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 353 | super().__init__() 354 | self.in_channels = in_channels 355 | self.out_channels = out_channels 356 | self.filter_channels = filter_channels 357 | self.kernel_size = kernel_size 358 | self.p_dropout = p_dropout 359 | self.activation = activation 360 | self.causal = causal 361 | 362 | if causal: 363 | self.padding = self._causal_padding 364 | else: 365 | self.padding = self._same_padding 366 | 367 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 368 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 369 | self.drop = nn.Dropout(p_dropout) 370 | 371 | def forward(self, x, x_mask): 372 | x = self.conv_1(self.padding(x * x_mask)) 373 | if self.activation == "gelu": 374 | x = x * torch.sigmoid(1.702 * x) 375 | else: 376 | x = torch.relu(x) 377 | x = self.drop(x) 378 | x = self.conv_2(self.padding(x * x_mask)) 379 | return x * x_mask 380 | 381 | def _causal_padding(self, x): 382 | if self.kernel_size == 1: 383 | return x 384 | pad_l = self.kernel_size - 1 385 | pad_r = 0 386 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 387 | x = F.pad(x, commons.convert_pad_shape(padding)) 388 | return x 389 | 390 | def _same_padding(self, x): 391 | if self.kernel_size == 1: 392 | return x 393 | pad_l = (self.kernel_size - 1) // 2 394 | pad_r = self.kernel_size // 2 395 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 396 | x = F.pad(x, commons.convert_pad_shape(padding)) 397 | return x 398 | -------------------------------------------------------------------------------- /egs/visinger2/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import itertools 6 | import math 7 | import time 8 | import logging 9 | 10 | import torch 11 | from torch import nn, optim 12 | from torch.nn import functional as F 13 | from torch.utils.data import DataLoader 14 | from torch.utils.tensorboard import SummaryWriter 15 | import torch.multiprocessing as mp 16 | import torch.distributed as dist 17 | from torch.nn.parallel import DistributedDataParallel as DDP 18 | from torch.cuda.amp import autocast, GradScaler 19 | 20 | sys.path.append('../..') 21 | import modules.commons as commons 22 | import utils.utils as utils 23 | 24 | from dataset import DatasetConstructor 25 | 26 | from models import ( 27 | SynthesizerTrn, 28 | Discriminator 29 | ) 30 | 31 | from modules.losses import ( 32 | generator_loss, 33 | discriminator_loss, 34 | feature_loss, 35 | kl_loss, 36 | ) 37 | from preprocess.mel_processing import mel_spectrogram_torch, spec_to_mel_torch, spectrogram_torch 38 | 39 | torch.backends.cudnn.benchmark = True 40 | global_step = 0 41 | use_cuda = torch.cuda.is_available() 42 | print("use_cuda, ", use_cuda) 43 | 44 | numba_logger = logging.getLogger('numba') 45 | numba_logger.setLevel(logging.WARNING) 46 | 47 | 48 | def main(): 49 | """Assume Single Node Multi GPUs Training Only""" 50 | 51 | hps = utils.get_hparams() 52 | os.environ['MASTER_ADDR'] = 'localhost' 53 | os.environ['MASTER_PORT'] = str(hps.train.port) 54 | 55 | if (torch.cuda.is_available()): 56 | 
n_gpus = torch.cuda.device_count() 57 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) 58 | else: 59 | cpurun(0, 1, hps) 60 | 61 | 62 | def run(rank, n_gpus, hps): 63 | global global_step 64 | if rank == 0: 65 | logger = utils.get_logger(hps.train.save_dir) 66 | logger.info(hps.train) 67 | logger.info(hps.data) 68 | logger.info(hps.model) 69 | utils.check_git_hash(hps.train.save_dir) 70 | writer = SummaryWriter(log_dir=hps.train.save_dir) 71 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.train.save_dir, "eval")) 72 | 73 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) 74 | torch.manual_seed(hps.train.seed) 75 | torch.cuda.set_device(rank) 76 | dataset_constructor = DatasetConstructor(hps, num_replicas=n_gpus, rank=rank) 77 | 78 | train_loader = dataset_constructor.get_train_loader() 79 | if rank == 0: 80 | valid_loader = dataset_constructor.get_valid_loader() 81 | 82 | net_g = SynthesizerTrn(hps).cuda(rank) 83 | net_d = Discriminator(hps, hps.model.use_spectral_norm).cuda(rank) 84 | 85 | optim_g = torch.optim.AdamW( 86 | net_g.parameters(), 87 | hps.train.learning_rate, 88 | betas=hps.train.betas, 89 | eps=hps.train.eps) 90 | optim_d = torch.optim.AdamW( 91 | net_d.parameters(), 92 | hps.train.learning_rate, 93 | betas=hps.train.betas, 94 | eps=hps.train.eps) 95 | net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) 96 | net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) 97 | try: 98 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "G_*.pth"), net_g, 99 | optim_g) 100 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "D_*.pth"), net_d, 101 | optim_d) 102 | global_step = (epoch_str - 1) * len(train_loader) 103 | except: 104 | epoch_str = 1 105 | global_step = 0 106 | 107 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 108 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 109 | 110 | for epoch in range(epoch_str, hps.train.epochs + 1): 111 | if rank == 0: 112 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], 113 | [train_loader, valid_loader], logger, [writer, writer_eval]) 114 | else: 115 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], 116 | [train_loader, None], None, None) 117 | scheduler_g.step() 118 | scheduler_d.step() 119 | 120 | 121 | def cpurun(rank, n_gpus, hps): 122 | global global_step 123 | if rank == 0: 124 | logger = utils.get_logger(hps.train.save_dir) 125 | logger.info(hps.train) 126 | logger.info(hps.data) 127 | logger.info(hps.model) 128 | utils.check_git_hash(hps.train.save_dir) 129 | writer = SummaryWriter(log_dir=hps.train.save_dir) 130 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.train.save_dir, "eval")) 131 | torch.manual_seed(hps.train.seed) 132 | dataset_constructor = DatasetConstructor(hps, num_replicas=n_gpus, rank=rank) 133 | 134 | train_loader = dataset_constructor.get_train_loader() 135 | if rank == 0: 136 | valid_loader = dataset_constructor.get_valid_loader() 137 | 138 | net_g = SynthesizerTrn(hps) 139 | net_d = Discriminator(hps, hps.model.use_spectral_norm) 140 | 141 | optim_g = torch.optim.AdamW( 142 | net_g.parameters(), 143 | hps.train.learning_rate, 144 | betas=hps.train.betas, 145 | eps=hps.train.eps) 146 | optim_d = 
torch.optim.AdamW( 147 | net_d.parameters(), 148 | hps.train.learning_rate, 149 | betas=hps.train.betas, 150 | eps=hps.train.eps) 151 | try: 152 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "G_*.pth"), net_g, 153 | optim_g) 154 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "D_*.pth"), net_g, 155 | optim_g) 156 | global_step = (epoch_str - 1) * len(train_loader) 157 | except: 158 | epoch_str = 1 159 | global_step = 0 160 | 161 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 162 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) 163 | 164 | for epoch in range(epoch_str, hps.train.epochs + 1): 165 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], 166 | [train_loader, valid_loader], logger, [writer, writer_eval]) 167 | 168 | scheduler_g.step() 169 | scheduler_d.step() 170 | 171 | 172 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, loaders, logger, writers): 173 | net_g, net_d = nets 174 | optim_g, optim_d = optims 175 | scheduler_g, scheduler_d = schedulers 176 | train_loader, eval_loader = loaders 177 | if writers is not None: 178 | writer, writer_eval = writers 179 | 180 | train_loader.sampler.set_epoch(epoch) 181 | global global_step 182 | 183 | net_g.train() 184 | net_d.train() 185 | for batch_idx, data_dict in enumerate(train_loader): 186 | 187 | phone = data_dict["phone"] 188 | pitchid = data_dict["pitchid"] 189 | dur = data_dict["dur"] 190 | slur = data_dict["slur"] 191 | gtdur = data_dict["gtdur"] 192 | mel = data_dict["mel"] 193 | f0 = data_dict["f0"] 194 | wav = data_dict["wav"] 195 | spkid = data_dict["spkid"] 196 | 197 | phone_lengths = data_dict["phone_lengths"] 198 | mel_lengths = data_dict["mel_lengths"] 199 | wav_lengths = data_dict["wav_lengths"] 200 | f0_lengths = data_dict["f0_lengths"] 201 | 202 | # data 203 | if (use_cuda): 204 | phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True) 205 | pitchid = pitchid.cuda(rank, non_blocking=True) 206 | dur = dur.cuda(rank, non_blocking=True) 207 | slur = slur.cuda(rank, non_blocking=True) 208 | gtdur = gtdur.cuda(rank, non_blocking=True) 209 | mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True) 210 | wav, wav_lengths = wav.cuda(rank, non_blocking=True), wav_lengths.cuda(rank, non_blocking=True) 211 | f0, f0_lengths = f0.cuda(rank, non_blocking=True), f0_lengths.cuda(rank, non_blocking=True) 212 | spkid = spkid.cuda(rank, non_blocking=True) 213 | 214 | # forward 215 | y_hat, ids_slice, LF0, y_ddsp, kl_div, predict_mel, mask = net_g(phone, phone_lengths, pitchid, dur, slur, 216 | gtdur, f0, mel, mel_lengths, spk_id=spkid) 217 | y_ddsp = y_ddsp.unsqueeze(1) 218 | 219 | # Discriminator 220 | y = commons.slice_segments(wav, ids_slice * hps.data.hop_size, hps.train.segment_size) # slice 221 | y_ddsp_mel = mel_spectrogram_torch( 222 | y_ddsp.squeeze(1), 223 | hps.data.n_fft, 224 | hps.data.acoustic_dim, 225 | hps.data.sample_rate, 226 | hps.data.hop_size, 227 | hps.data.win_size, 228 | hps.data.fmin, 229 | hps.data.fmax 230 | ) 231 | 232 | y_logspec = torch.log(spectrogram_torch( 233 | y.squeeze(1), 234 | hps.data.n_fft, 235 | hps.data.sample_rate, 236 | hps.data.hop_size, 237 | hps.data.win_size 238 | ) + 1e-7) 239 | 240 | y_ddsp_logspec = 
torch.log(spectrogram_torch( 241 | y_ddsp.squeeze(1), 242 | hps.data.n_fft, 243 | hps.data.sample_rate, 244 | hps.data.hop_size, 245 | hps.data.win_size 246 | ) + 1e-7) 247 | 248 | y_mel = mel_spectrogram_torch( 249 | y.squeeze(1), 250 | hps.data.n_fft, 251 | hps.data.acoustic_dim, 252 | hps.data.sample_rate, 253 | hps.data.hop_size, 254 | hps.data.win_size, 255 | hps.data.fmin, 256 | hps.data.fmax 257 | ) 258 | y_hat_mel = mel_spectrogram_torch( 259 | y_hat.squeeze(1), 260 | hps.data.n_fft, 261 | hps.data.acoustic_dim, 262 | hps.data.sample_rate, 263 | hps.data.hop_size, 264 | hps.data.win_size, 265 | hps.data.fmin, 266 | hps.data.fmax 267 | ) 268 | 269 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) 270 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) 271 | loss_disc_all = loss_disc 272 | 273 | optim_d.zero_grad() 274 | loss_disc_all.backward() 275 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) 276 | optim_d.step() 277 | 278 | # loss 279 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) 280 | 281 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * 45 282 | loss_mel_dsp = F.l1_loss(y_mel, y_ddsp_mel) * 45 283 | loss_spec_dsp = F.l1_loss(y_logspec, y_ddsp_logspec) * 45 284 | 285 | loss_mel_am = F.mse_loss(mel * mask, predict_mel * mask) # * 10 286 | 287 | loss_fm = feature_loss(fmap_r, fmap_g) 288 | loss_gen, losses_gen = generator_loss(y_d_hat_g) 289 | 290 | loss_fm = loss_fm / 2 291 | loss_gen = loss_gen / 2 292 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_mel_dsp + kl_div + loss_mel_am + loss_spec_dsp 293 | 294 | loss_gen_all = loss_gen_all / hps.train.accumulation_steps 295 | 296 | loss_gen_all.backward() 297 | if ((global_step + 1) % hps.train.accumulation_steps == 0): 298 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) 299 | optim_g.step() 300 | optim_g.zero_grad() 301 | 302 | if rank == 0: 303 | if (global_step + 1) % (hps.train.accumulation_steps * 10) == 0: 304 | logger.info(["step&time", global_step, time.asctime(time.localtime(time.time()))]) 305 | logger.info(["mel&mel_dsp&spec_dsp: ", loss_mel, loss_mel_dsp, loss_spec_dsp]) 306 | logger.info(["adv&fm: ", loss_gen, loss_fm]) 307 | logger.info(["kl: ", kl_div]) 308 | logger.info(["am&dur: ", loss_mel_am]) 309 | 310 | if global_step % hps.train.log_interval == 0: 311 | lr = optim_g.param_groups[0]['lr'] 312 | losses = [loss_gen_all, loss_mel] 313 | logger.info('Train Epoch: {} [{:.0f}%]'.format( 314 | epoch, 315 | 100. 
* batch_idx / len(train_loader))) 316 | logger.info([x.item() for x in losses] + [global_step, lr]) 317 | 318 | scalar_dict = {"loss/total": loss_gen_all, 319 | "loss/mel": loss_mel, 320 | "loss/adv": loss_gen, 321 | "loss/fm": loss_fm, 322 | "loss/mel_ddsp": loss_mel_dsp, 323 | "loss/spec_ddsp": loss_spec_dsp, 324 | "loss/mel_am": loss_mel_am, 325 | "loss/kl_div": kl_div, 326 | "learning_rate": lr} 327 | 328 | utils.summarize( 329 | writer=writer, 330 | global_step=global_step, 331 | scalars=scalar_dict) 332 | 333 | if global_step % hps.train.eval_interval == 0: 334 | logger.info(['All training params(G): ', utils.count_parameters(net_g), ' M']) 335 | # print('Sub training params(G): ', \ 336 | # 'text_encoder: ', utils.count_parameters(net_g.module.text_encoder), ' M, ', \ 337 | # 'decoder: ', utils.count_parameters(net_g.module.decoder), ' M, ', \ 338 | # 'mel_decoder: ', utils.count_parameters(net_g.module.mel_decoder), ' M, ', \ 339 | # 'dec: ', utils.count_parameters(net_g.module.dec), ' M, ', \ 340 | # 'dec_harm: ', utils.count_parameters(net_g.module.dec_harm), ' M, ', \ 341 | # 'dec_noise: ', utils.count_parameters(net_g.module.dec_noise), ' M, ', \ 342 | # 'posterior: ', utils.count_parameters(net_g.module.posterior_encoder), ' M, ', \ 343 | # ) 344 | 345 | evaluate(hps, net_g, eval_loader, writer_eval) 346 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, 347 | os.path.join(hps.train.save_dir, "G_{}.pth".format(global_step)), hps.train.eval_interval) 348 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, 349 | os.path.join(hps.train.save_dir, "D_{}.pth".format(global_step)), hps.train.eval_interval) 350 | net_g.train() 351 | global_step += 1 352 | 353 | if rank == 0: 354 | logger.info('====> Epoch: {}'.format(epoch)) 355 | 356 | 357 | def evaluate(hps, generator, eval_loader, writer_eval): 358 | generator.eval() 359 | image_dict = {} 360 | audio_dict = {} 361 | with torch.no_grad(): 362 | for batch_idx, data_dict in enumerate(eval_loader): 363 | if batch_idx == 4: 364 | break 365 | phone = data_dict["phone"] 366 | pitchid = data_dict["pitchid"] 367 | dur = data_dict["dur"] 368 | slur = data_dict["slur"] 369 | gtdur = data_dict["gtdur"] 370 | mel = data_dict["mel"] 371 | f0 = data_dict["f0"] 372 | wav = data_dict["wav"] 373 | spkid = data_dict["spkid"] 374 | 375 | phone_lengths = data_dict["phone_lengths"] 376 | mel_lengths = data_dict["mel_lengths"] 377 | wav_lengths = data_dict["wav_lengths"] 378 | f0_lengths = data_dict["f0_lengths"] 379 | 380 | # data 381 | if (use_cuda): 382 | phone, phone_lengths = phone.cuda(0), phone_lengths.cuda(0) 383 | pitchid = pitchid.cuda(0) 384 | dur = dur.cuda(0) 385 | slur = slur.cuda(0) 386 | wav = wav.cuda(0) 387 | mel = mel.cuda(0) 388 | f0 = f0.cuda(0) 389 | gtdur = gtdur.cuda(0) 390 | spkid = spkid.cuda(0) 391 | # remove else 392 | phone = phone[:1] 393 | phone_lengths = phone_lengths[:1] 394 | pitchid = pitchid[:1] 395 | dur = dur[:1] 396 | slur = slur[:1] 397 | wav = wav[:1] 398 | mel = mel[:1] 399 | f0 = f0[:1] 400 | gtdur = gtdur[:1] 401 | spkid = spkid[:1] 402 | 403 | y_hat, y_harm, y_noise = generator.module.infer(phone, phone_lengths, pitchid, dur, slur, gtdur=gtdur, F0=f0, 404 | spk_id=spkid) 405 | spec = spectrogram_torch( 406 | wav.squeeze(1), 407 | hps.data.n_fft, 408 | hps.data.sample_rate, 409 | hps.data.hop_size, 410 | hps.data.win_size 411 | ) 412 | 413 | y_mel = mel_spectrogram_torch( 414 | wav.squeeze(1), 415 | hps.data.n_fft, 416 | hps.data.acoustic_dim, 417 | hps.data.sample_rate, 
418 | hps.data.hop_size, 419 | hps.data.win_size, 420 | hps.data.fmin, 421 | hps.data.fmax 422 | ) 423 | y_hat_mel = mel_spectrogram_torch( 424 | y_hat.squeeze(1), 425 | hps.data.n_fft, 426 | hps.data.acoustic_dim, 427 | hps.data.sample_rate, 428 | hps.data.hop_size, 429 | hps.data.win_size, 430 | hps.data.fmin, 431 | hps.data.fmax 432 | ) 433 | image_dict.update({ 434 | f"gen/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()), 435 | }) 436 | audio_dict.update( { 437 | f"gen/audio_{batch_idx}": y_hat[0, :, :], 438 | f"gen/harm_{batch_idx}": y_harm[0, :, :], 439 | "gen/noise": y_noise[0, :, :] 440 | }) 441 | # if global_step == 0: 442 | image_dict.update({f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) 443 | audio_dict.update({f"gt/audio_{batch_idx}": wav[0, :, :wav_lengths[0]]}) 444 | 445 | utils.summarize( 446 | writer=writer_eval, 447 | global_step=global_step, 448 | images=image_dict, 449 | audios=audio_dict, 450 | audio_sampling_rate=hps.data.sample_rate 451 | ) 452 | generator.train() 453 | 454 | 455 | if __name__ == "__main__": 456 | main() 457 | -------------------------------------------------------------------------------- /modules/stft.py: -------------------------------------------------------------------------------- 1 | from librosa.util import pad_center, tiny 2 | from scipy.signal import get_window 3 | from torch import Tensor 4 | from torch.autograd import Variable 5 | from typing import Optional, Tuple 6 | 7 | import librosa 8 | import librosa.util as librosa_util 9 | import math 10 | import numpy as np 11 | import scipy 12 | import torch 13 | import torch.nn.functional as F 14 | import warnings 15 | 16 | 17 | def create_fb_matrix( 18 | n_freqs: int, 19 | f_min: float, 20 | f_max: float, 21 | n_mels: int, 22 | sample_rate: int, 23 | norm: Optional[str] = None 24 | ) -> Tensor: 25 | r"""Create a frequency bin conversion matrix. 26 | 27 | Args: 28 | n_freqs (int): Number of frequencies to highlight/apply 29 | f_min (float): Minimum frequency (Hz) 30 | f_max (float): Maximum frequency (Hz) 31 | n_mels (int): Number of mel filterbanks 32 | sample_rate (int): Sample rate of the audio waveform 33 | norm (Optional[str]): If 'slaney', divide the triangular mel weights by the width of the mel band 34 | (area normalization). (Default: ``None``) 35 | 36 | Returns: 37 | Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) 38 | meaning number of frequencies to highlight/apply to x the number of filterbanks. 39 | Each column is a filterbank so that assuming there is a matrix A of 40 | size (..., ``n_freqs``), the applied result would be 41 | ``A * create_fb_matrix(A.size(-1), ...)``. 42 | """ 43 | 44 | if norm is not None and norm != "slaney": 45 | raise ValueError("norm must be one of None or 'slaney'") 46 | 47 | # freq bins 48 | # Equivalent filterbank construction by Librosa 49 | all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) 50 | 51 | # calculate mel freq bins 52 | # hertz to mel(f) is 2595. * math.log10(1. + (f / 700.)) 53 | m_min = 2595.0 * math.log10(1.0 + (f_min / 700.0)) 54 | m_max = 2595.0 * math.log10(1.0 + (f_max / 700.0)) 55 | m_pts = torch.linspace(m_min, m_max, n_mels + 2) 56 | # mel to hertz(mel) is 700. * (10**(mel / 2595.) - 1.) 
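# (editorial sketch, not part of the original file) sanity check of the HTK-style
# formulas in the comments above: 2595 * log10(1 + 1000/700) ≈ 1000, and
# 700 * (10 ** (1000 / 2595.) - 1.) ≈ 1000, i.e. 1000 Hz maps to ≈ 1000 mel and back.
# Illustrative use of the returned filterbank, assuming a magnitude spectrogram
# `spec` of shape (n_freqs, time) at the project's 44100 Hz sample rate (n_mels=80 is
# only an example value here):
#   fb = create_fb_matrix(n_freqs=1025, f_min=0.0, f_max=22050.0, n_mels=80, sample_rate=44100)
#   mel = torch.matmul(spec.transpose(0, 1), fb).transpose(0, 1)   # -> (n_mels, time)
# (this is exactly how MelScale.forward applies `fb` further down in this file)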
57 | f_pts = 700.0 * (10 ** (m_pts / 2595.0) - 1.0) 58 | # calculate the difference between each mel point and each stft freq point in hertz 59 | f_diff = f_pts[1:] - f_pts[:-1] # (n_mels + 1) 60 | slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_mels + 2) 61 | # create overlapping triangles 62 | down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_mels) 63 | up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_mels) 64 | fb = torch.min(down_slopes, up_slopes) 65 | fb = torch.clamp(fb, 1e-6, 1) 66 | 67 | if norm is not None and norm == "slaney": 68 | # Slaney-style mel is scaled to be approx constant energy per channel 69 | enorm = 2.0 / (f_pts[2:n_mels + 2] - f_pts[:n_mels]) 70 | fb *= enorm.unsqueeze(0) 71 | return fb 72 | 73 | 74 | def lfilter( 75 | waveform: Tensor, 76 | a_coeffs: Tensor, 77 | b_coeffs: Tensor, 78 | clamp: bool = True, 79 | ) -> Tensor: 80 | r"""Perform an IIR filter by evaluating difference equation. 81 | 82 | Args: 83 | waveform (Tensor): audio waveform of dimension of ``(..., time)``. Must be normalized to -1 to 1. 84 | a_coeffs (Tensor): denominator coefficients of difference equation of dimension of ``(n_order + 1)``. 85 | Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. 86 | Must be same size as b_coeffs (pad with 0's as necessary). 87 | b_coeffs (Tensor): numerator coefficients of difference equation of dimension of ``(n_order + 1)``. 88 | Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. 89 | Must be same size as a_coeffs (pad with 0's as necessary). 90 | clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) 91 | 92 | Returns: 93 | Tensor: Waveform with dimension of ``(..., time)``. 94 | """ 95 | # pack batch 96 | shape = waveform.size() 97 | waveform = waveform.reshape(-1, shape[-1]) 98 | 99 | assert (a_coeffs.size(0) == b_coeffs.size(0)) 100 | assert (len(waveform.size()) == 2) 101 | assert (waveform.device == a_coeffs.device) 102 | assert (b_coeffs.device == a_coeffs.device) 103 | 104 | device = waveform.device 105 | dtype = waveform.dtype 106 | n_channel, n_sample = waveform.size() 107 | n_order = a_coeffs.size(0) 108 | n_sample_padded = n_sample + n_order - 1 109 | assert (n_order > 0) 110 | 111 | # Pad the input and create output 112 | padded_waveform = torch.zeros(n_channel, n_sample_padded, dtype=dtype, device=device) 113 | padded_waveform[:, (n_order - 1):] = waveform 114 | padded_output_waveform = torch.zeros(n_channel, n_sample_padded, dtype=dtype, device=device) 115 | 116 | # Set up the coefficients matrix 117 | # Flip coefficients' order 118 | a_coeffs_flipped = a_coeffs.flip(0) 119 | b_coeffs_flipped = b_coeffs.flip(0) 120 | 121 | # calculate windowed_input_signal in parallel 122 | # create indices of original with shape (n_channel, n_order, n_sample) 123 | window_idxs = torch.arange(n_sample, device=device).unsqueeze(0) + torch.arange(n_order, device=device).unsqueeze(1) 124 | window_idxs = window_idxs.repeat(n_channel, 1, 1) 125 | window_idxs += (torch.arange(n_channel, device=device).unsqueeze(-1).unsqueeze(-1) * n_sample_padded) 126 | window_idxs = window_idxs.long() 127 | # (n_order, ) matmul (n_channel, n_order, n_sample) -> (n_channel, n_sample) 128 | input_signal_windows = torch.matmul(b_coeffs_flipped, torch.take(padded_waveform, window_idxs)) 129 | 130 | input_signal_windows.div_(a_coeffs[0]) 131 | a_coeffs_flipped.div_(a_coeffs[0]) 132 | for i_sample, o0 in enumerate(input_signal_windows.t()): 133 | windowed_output_signal 
= padded_output_waveform[:, i_sample:(i_sample + n_order)] 134 | o0.addmv_(windowed_output_signal, a_coeffs_flipped, alpha=-1) 135 | padded_output_waveform[:, i_sample + n_order - 1] = o0 136 | 137 | output = padded_output_waveform[:, (n_order - 1):] 138 | 139 | if clamp: 140 | output = torch.clamp(output, min=-1., max=1.) 141 | 142 | # unpack batch 143 | output = output.reshape(shape[:-1] + output.shape[-1:]) 144 | 145 | return output 146 | 147 | 148 | 149 | def biquad( 150 | waveform: Tensor, 151 | b0: float, 152 | b1: float, 153 | b2: float, 154 | a0: float, 155 | a1: float, 156 | a2: float 157 | ) -> Tensor: 158 | r"""Perform a biquad filter of input tensor. Initial conditions set to 0. 159 | https://en.wikipedia.org/wiki/Digital_biquad_filter 160 | 161 | Args: 162 | waveform (Tensor): audio waveform of dimension of `(..., time)` 163 | b0 (float): numerator coefficient of current input, x[n] 164 | b1 (float): numerator coefficient of input one time step ago x[n-1] 165 | b2 (float): numerator coefficient of input two time steps ago x[n-2] 166 | a0 (float): denominator coefficient of current output y[n], typically 1 167 | a1 (float): denominator coefficient of current output y[n-1] 168 | a2 (float): denominator coefficient of current output y[n-2] 169 | 170 | Returns: 171 | Tensor: Waveform with dimension of `(..., time)` 172 | """ 173 | 174 | device = waveform.device 175 | dtype = waveform.dtype 176 | 177 | output_waveform = lfilter( 178 | waveform, 179 | torch.tensor([a0, a1, a2], dtype=dtype, device=device), 180 | torch.tensor([b0, b1, b2], dtype=dtype, device=device) 181 | ) 182 | return output_waveform 183 | 184 | 185 | 186 | def _dB2Linear(x: float) -> float: 187 | return math.exp(x * math.log(10) / 20.0) 188 | 189 | 190 | def highpass_biquad( 191 | waveform: Tensor, 192 | sample_rate: int, 193 | cutoff_freq: float, 194 | Q: float = 0.707 195 | ) -> Tensor: 196 | r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation. 197 | 198 | Args: 199 | waveform (Tensor): audio waveform of dimension of `(..., time)` 200 | sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) 201 | cutoff_freq (float): filter cutoff frequency 202 | Q (float, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) 203 | 204 | Returns: 205 | Tensor: Waveform dimension of `(..., time)` 206 | """ 207 | w0 = 2 * math.pi * cutoff_freq / sample_rate 208 | alpha = math.sin(w0) / 2. / Q 209 | 210 | b0 = (1 + math.cos(w0)) / 2 211 | b1 = -1 - math.cos(w0) 212 | b2 = b0 213 | a0 = 1 + alpha 214 | a1 = -2 * math.cos(w0) 215 | a2 = 1 - alpha 216 | return biquad(waveform, b0, b1, b2, a0, a1, a2) 217 | 218 | 219 | 220 | def lowpass_biquad( 221 | waveform: Tensor, 222 | sample_rate: int, 223 | cutoff_freq: float, 224 | Q: float = 0.707 225 | ) -> Tensor: 226 | r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation. 227 | 228 | Args: 229 | waveform (torch.Tensor): audio waveform of dimension of `(..., time)` 230 | sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) 231 | cutoff_freq (float): filter cutoff frequency 232 | Q (float, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) 233 | 234 | Returns: 235 | Tensor: Waveform of dimension of `(..., time)` 236 | """ 237 | w0 = 2 * math.pi * cutoff_freq / sample_rate 238 | alpha = math.sin(w0) / 2 / Q 239 | 240 | b0 = (1 - math.cos(w0)) / 2 241 | b1 = 1 - math.cos(w0) 242 | b2 = b0 243 | a0 = 1 + alpha 244 | a1 = -2 * math.cos(w0) 245 | a2 = 1 - alpha 246 | return biquad(waveform, b0, b1, b2, a0, a1, a2) 247 | 248 | 249 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 250 | n_fft=800, dtype=np.float32, norm=None): 251 | """ 252 | # from librosa 0.6 253 | Compute the sum-square envelope of a window function at a given hop length. 254 | 255 | This is used to estimate modulation effects induced by windowing 256 | observations in short-time fourier transforms. 257 | 258 | Parameters 259 | ---------- 260 | window : string, tuple, number, callable, or list-like 261 | Window specification, as in `get_window` 262 | 263 | n_frames : int > 0 264 | The number of analysis frames 265 | 266 | hop_length : int > 0 267 | The number of samples to advance between frames 268 | 269 | win_length : [optional] 270 | The length of the window function. By default, this matches `n_fft`. 271 | 272 | n_fft : int > 0 273 | The length of each analysis frame. 274 | 275 | dtype : np.dtype 276 | The data type of the output 277 | 278 | Returns 279 | ------- 280 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 281 | The sum-squared envelope of the window function 282 | """ 283 | if win_length is None: 284 | win_length = n_fft 285 | 286 | n = n_fft + hop_length * (n_frames - 1) 287 | x = np.zeros(n, dtype=dtype) 288 | 289 | # Compute the squared window at the desired length 290 | win_sq = get_window(window, win_length, fftbins=True) 291 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 292 | win_sq = librosa_util.pad_center(win_sq, n_fft) 293 | 294 | # Fill the envelope 295 | for i in range(n_frames): 296 | sample = i * hop_length 297 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 298 | return x 299 | 300 | 301 | class MelScale(torch.nn.Module): 302 | r"""Turn a normal STFT into a mel frequency STFT, using a conversion 303 | matrix. This uses triangular filter banks. 304 | 305 | User can control which device the filter bank (`fb`) is (e.g. fb.to(spec_f.device)). 306 | 307 | Args: 308 | n_mels (int, optional): Number of mel filterbanks. (Default: ``128``) 309 | sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``) 310 | f_min (float, optional): Minimum frequency. (Default: ``0.``) 311 | f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``) 312 | n_stft (int, optional): Number of bins in STFT. Calculated from first input 313 | if None is given. See ``n_fft`` in :class:`Spectrogram`. 
(Default: ``None``) 314 | """ 315 | __constants__ = ['n_mels', 'sample_rate', 'f_min', 'f_max'] 316 | 317 | def __init__(self, 318 | n_mels: int = 128, 319 | sample_rate: int = 24000, 320 | f_min: float = 0., 321 | f_max: Optional[float] = None, 322 | n_stft: Optional[int] = None) -> None: 323 | super(MelScale, self).__init__() 324 | self.n_mels = n_mels 325 | self.sample_rate = sample_rate 326 | self.f_max = f_max if f_max is not None else float(sample_rate // 2) 327 | self.f_min = f_min 328 | 329 | assert f_min <= self.f_max, 'Require f_min: %f < f_max: %f' % (f_min, self.f_max) 330 | 331 | fb = torch.empty(0) if n_stft is None else create_fb_matrix( 332 | n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate) 333 | self.register_buffer('fb', fb) 334 | 335 | def forward(self, specgram: Tensor) -> Tensor: 336 | r""" 337 | Args: 338 | specgram (Tensor): A spectrogram STFT of dimension (..., freq, time). 339 | 340 | Returns: 341 | Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time). 342 | """ 343 | 344 | # pack batch 345 | shape = specgram.size() 346 | specgram = specgram.reshape(-1, shape[-2], shape[-1]) 347 | 348 | if self.fb.numel() == 0: 349 | tmp_fb = create_fb_matrix(specgram.size(1), self.f_min, self.f_max, self.n_mels, self.sample_rate) 350 | # Attributes cannot be reassigned outside __init__ so workaround 351 | self.fb.resize_(tmp_fb.size()) 352 | self.fb.copy_(tmp_fb) 353 | 354 | # (channel, frequency, time).transpose(...) dot (frequency, n_mels) 355 | # -> (channel, time, n_mels).transpose(...) 356 | mel_specgram = torch.matmul(specgram.transpose(1, 2), self.fb).transpose(1, 2) 357 | 358 | # unpack batch 359 | mel_specgram = mel_specgram.reshape(shape[:-2] + mel_specgram.shape[-2:]) 360 | 361 | return mel_specgram 362 | 363 | 364 | class TorchSTFT(torch.nn.Module): 365 | def __init__(self, fft_size, hop_size, win_size, 366 | normalized=False, domain='linear', 367 | mel_scale=False, ref_level_db=20, min_level_db=-100): 368 | super().__init__() 369 | self.fft_size = fft_size 370 | self.hop_size = hop_size 371 | self.win_size = win_size 372 | self.ref_level_db = ref_level_db 373 | self.min_level_db = min_level_db 374 | self.window = torch.hann_window(win_size) 375 | self.normalized = normalized 376 | self.domain = domain 377 | self.mel_scale = MelScale(n_mels=(fft_size // 2 + 1), 378 | n_stft=(fft_size // 2 + 1)) if mel_scale else None 379 | 380 | def transform(self, x): 381 | x_stft = torch.stft(x, self.fft_size, self.hop_size, self.win_size, 382 | self.window.type_as(x), normalized=self.normalized) 383 | real = x_stft[..., 0] 384 | imag = x_stft[..., 1] 385 | mag = torch.clamp(real ** 2 + imag ** 2, min=1e-7) 386 | mag = torch.sqrt(mag) 387 | phase = torch.atan2(imag, real) 388 | 389 | if self.mel_scale is not None: 390 | mag = self.mel_scale(mag) 391 | 392 | if self.domain == 'log': 393 | mag = 20 * torch.log10(mag) - self.ref_level_db 394 | mag = torch.clamp((mag - self.min_level_db) / -self.min_level_db, 0, 1) 395 | return mag, phase 396 | elif self.domain == 'linear': 397 | return mag, phase 398 | elif self.domain == 'double': 399 | log_mag = 20 * torch.log10(mag) - self.ref_level_db 400 | log_mag = torch.clamp((log_mag - self.min_level_db) / -self.min_level_db, 0, 1) 401 | return torch.cat((mag, log_mag), dim=1), phase 402 | 403 | def complex(self, x): 404 | x_stft = torch.stft(x, self.fft_size, self.hop_size, self.win_size, 405 | self.window.type_as(x), normalized=self.normalized) 406 | real = x_stft[..., 0] 407 | imag = x_stft[..., 1] 408 | return 
real, imag 409 | 410 | 411 | 412 | class STFT(torch.nn.Module): 413 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 414 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 415 | window='hann'): 416 | super(STFT, self).__init__() 417 | self.filter_length = filter_length 418 | self.hop_length = hop_length 419 | self.win_length = win_length 420 | self.window = window 421 | self.forward_transform = None 422 | scale = self.filter_length / self.hop_length 423 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 424 | 425 | cutoff = int((self.filter_length / 2 + 1)) 426 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 427 | np.imag(fourier_basis[:cutoff, :])]) 428 | 429 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 430 | inverse_basis = torch.FloatTensor( 431 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 432 | 433 | if window is not None: 434 | assert(filter_length >= win_length) 435 | # get window and zero center pad it to filter_length 436 | fft_window = get_window(window, win_length, fftbins=True) 437 | fft_window = pad_center(fft_window, filter_length) 438 | fft_window = torch.from_numpy(fft_window).float() 439 | 440 | # window the bases 441 | forward_basis *= fft_window 442 | inverse_basis *= fft_window 443 | 444 | self.register_buffer('forward_basis', forward_basis.float()) 445 | self.register_buffer('inverse_basis', inverse_basis.float()) 446 | 447 | def transform(self, input_data): 448 | num_batches = input_data.size(0) 449 | num_samples = input_data.size(1) 450 | 451 | self.num_samples = num_samples 452 | 453 | # similar to librosa, reflect-pad the input 454 | input_data = input_data.view(num_batches, 1, num_samples) 455 | input_data = F.pad( 456 | input_data.unsqueeze(1), 457 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 458 | mode='reflect') 459 | input_data = input_data.squeeze(1) 460 | 461 | forward_transform = F.conv1d( 462 | input_data, 463 | Variable(self.forward_basis, requires_grad=False), 464 | stride=self.hop_length, 465 | padding=0) 466 | 467 | cutoff = int((self.filter_length / 2) + 1) 468 | real_part = forward_transform[:, :cutoff, :] 469 | imag_part = forward_transform[:, cutoff:, :] 470 | 471 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 472 | phase = torch.autograd.Variable( 473 | torch.atan2(imag_part.data, real_part.data)) 474 | 475 | return magnitude, phase 476 | 477 | def inverse(self, magnitude, phase): 478 | recombine_magnitude_phase = torch.cat( 479 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 480 | 481 | inverse_transform = F.conv_transpose1d( 482 | recombine_magnitude_phase, 483 | Variable(self.inverse_basis, requires_grad=False), 484 | stride=self.hop_length, 485 | padding=0) 486 | 487 | if self.window is not None: 488 | window_sum = window_sumsquare( 489 | self.window, magnitude.size(-1), hop_length=self.hop_length, 490 | win_length=self.win_length, n_fft=self.filter_length, 491 | dtype=np.float32) 492 | # remove modulation effects 493 | approx_nonzero_indices = torch.from_numpy( 494 | np.where(window_sum > tiny(window_sum))[0]) 495 | window_sum = torch.autograd.Variable( 496 | torch.from_numpy(window_sum), requires_grad=False) 497 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 498 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 499 | 500 | # scale by hop ratio 501 | inverse_transform *= float(self.filter_length) / self.hop_length 502 | 503 | 
inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 504 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 505 | 506 | return inverse_transform 507 | 508 | def forward(self, input_data): 509 | self.magnitude, self.phase = self.transform(input_data) 510 | reconstruction = self.inverse(self.magnitude, self.phase) 511 | return reconstruction 512 | 513 | -------------------------------------------------------------------------------- /infer/__init__.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import librosa 4 | import numpy as np 5 | import torch 6 | import tqdm 7 | from text import npu 8 | 9 | def resize2d_f0(x, target_len): 10 | source = np.array(x) 11 | source[source < 0.001] = np.nan 12 | target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)), 13 | source) 14 | res = np.nan_to_num(target) 15 | return res 16 | 17 | 18 | def preprocess(ds): 19 | note_list = ds["note_seq"] 20 | midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0 21 | for x in note_list.split(" ")] 22 | f0_seq = None 23 | if ds["f0_seq"] is not None: 24 | f0_seq = [float(i.strip()) for i in ds["f0_seq"].split(" ")] 25 | f0_seq = np.array(f0_seq) 26 | phseq = ds["ph_seq"].split(" ") 27 | newphseq = [] 28 | for ph in phseq: 29 | newphseq.append(npu.ttsing_phone_to_int[ph]) 30 | phseq = newphseq 31 | phseq = np.array(phseq) 32 | pitch = 440 * (2 ** ((np.array(midis) - 69) / 12)) 33 | durations = [float(i) for i in ds["ph_dur"].split(" ")] 34 | accu_dur = 0 35 | accu_durs = [] 36 | for dur in durations: 37 | accu_dur += dur 38 | accu_durs.append(accu_dur) 39 | accu_durs = np.array(accu_durs) 40 | accu_durs = (accu_durs * 44100 // 512).astype(int) 41 | sub_durs = np.zeros_like(accu_durs) 42 | sub_durs[1:accu_durs.shape[0]] = accu_durs[:accu_durs.shape[0]-1] 43 | durations = accu_durs-sub_durs 44 | f0_seq = resize2d_f0(f0_seq, sum(durations)) 45 | pos = 0 46 | for i, d in enumerate(durations): 47 | if phseq[i] == 0: 48 | f0_seq[pos:pos + d] = 0 49 | pos += d 50 | 51 | return f0_seq,pitch, phseq, durations 52 | 53 | if __name__ == '__main__': 54 | inp = { 55 | "text": "SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP", 56 | "ph_seq": "SP x ing z ou z ai w ei x ian b ian y van s i0 y i d e g uai d ao SP z i0 y ou d e t iao zh e zh ir j ian sh ang d e w u d ao SP q ing y ing d e x iang an y ing zh ong c ang f u d e b o s i0 m ao d eng d ai x ia y i g e m u u b iao SP", 57 | "note_seq": "rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 A#4 A#4 A4 A4 G4 G4 D4 D4 G4 G4 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 C5 C5 C5 C5 G5 G5 C5 C5 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 C5 C5 B4 B4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 G4 G4 D4 D4 G4 G4 F4 F4 G4 G4 A#4 A#4 C5 C5 C#5 D5 D5 rest", 58 | "note_dur_seq": "0.6 0.136 0.136 0.137 0.137 0.545 0.545 0.546 0.546 0.2720001 0.2720001 0.273 0.273 0.273 0.273 0.2719998 0.2719998 0.546 0.546 0.5450001 0.5450001 0.2730002 0.2730002 0.4089999 0.4089999 0.1370001 0.1359997 0.1359997 0.1360002 0.1360002 0.546 0.546 0.5450001 0.5450001 0.2729998 0.2729998 0.2730002 0.2730002 0.2719998 0.2719998 0.546 0.546 0.2730002 0.2730002 0.5449996 0.5449996 0.6820002 0.6820002 0.1359997 0.1370001 0.1370001 0.1360006 0.1360006 0.5450001 0.5450001 0.5459995 0.5459995 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.3640003 0.3640003 0.1809998 0.1809998 0.3640003 0.3640003 0.1820002 0.1820002 
0.3639994 0.3639994 0.1810007 0.1810007 0.3639994 0.3639994 0.1820002 0.1820002 0.4090004 0.4090004 0.4089994 0.4089994 0.2729998 0.2729998 0.2720003 0.2720003 0.5460005 0.8179989 0.8179989 0.5", 59 | "is_slur_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0", 60 | "ph_dur": "0.3875 0.2125 0.070091 0.065909 0.082455 0.054545 0.474545 0.070455 0.339182 0.206818 0.244727 0.027273 0.207091 0.065909 0.163909 0.109091 0.272 0 0.442591 0.103409 0.447273 0.097727 0.224137 0.048864 0.409 0.088136 0.048864 0.070091 0.065909 0.081455 0.054545 0.452818 0.093182 0.37 0.175 0.103682 0.169318 0.115046 0.157955 0.1845 0.0875 0.475545 0.070455 0.273 0 0.506363 0.038636 0.682 0.054182 0.081818 0.076773 0.060227 0.097364 0.038636 0.354091 0.190909 0.546 0.202545 0.070455 0.168591 0.103409 0.218454 0.054545 0.2765 0.0875 0.148045 0.032955 0.325364 0.038636 0.067227 0.114773 0.270818 0.093182 0.148046 0.032955 0.286727 0.077273 0.057 0.125 0.409 0 0.381727 0.027273 0.152545 0.120455 0.272 0.441653 0.104348 0.817999 0.5", 61 | "f0_timestep": "0.005", 62 | "f0_seq": "587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.2 587.0 586.9 586.7 586.1 585.4 584.8 584.1 583.4 582.9 582.5 582.3 582.5 582.9 583.4 584.1 584.9 585.5 586.1 586.7 587.0 587.3 587.6 587.9 588.0 588.1 588.4 588.7 588.7 588.7 588.0 586.4 584.1 580.8 575.8 568.7 560.8 552.0 540.9 531.0 522.2 513.8 506.6 501.7 497.9 495.0 493.8 493.0 492.6 492.6 492.7 492.7 492.7 492.7 492.7 492.5 492.6 493.2 494.1 495.6 498.7 502.5 507.6 515.5 523.9 532.9 543.2 553.7 562.4 570.3 577.2 581.7 584.6 586.9 588.2 588.7 588.7 588.6 588.3 588.1 588.0 587.8 587.5 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.2 586.9 586.7 587.0 587.0 587.0 587.0 587.5 588.7 590.8 594.1 599.0 607.7 617.7 630.6 647.9 667.1 686.3 706.4 727.1 743.0 755.2 765.1 773.3 778.6 781.6 783.4 784.4 784.4 784.4 784.4 784.7 784.7 784.3 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.1 784.5 784.9 784.4 784.4 784.4 784.4 783.8 782.3 779.9 775.1 768.7 759.5 747.9 731.5 712.9 
694.2 674.0 652.5 636.1 622.4 610.1 601.9 596.0 591.8 589.1 587.8 587.0 587.0 587.0 587.0 586.8 586.8 587.1 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.6 587.9 588.0 588.1 588.5 589.1 589.4 589.4 589.1 588.4 586.8 584.5 581.2 575.9 570.6 564.1 556.0 548.8 542.3 536.2 531.1 527.3 524.8 522.6 521.9 521.5 521.4 521.6 521.9 522.4 522.6 522.6 522.9 523.2 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.6 523.9 524.1 524.4 524.8 525.4 525.8 526.0 526.2 525.7 524.9 523.3 521.1 518.6 515.3 511.3 507.6 504.0 499.9 497.3 495.0 493.1 492.0 491.4 491.1 491.4 491.6 492.1 492.6 492.9 493.2 493.4 493.7 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 494.1 494.3 494.5 494.8 495.1 495.6 496.1 496.4 496.6 496.5 495.8 494.7 493.2 491.0 487.9 484.7 481.2 477.3 473.8 470.9 468.4 466.2 464.8 464.1 463.6 463.7 463.9 464.2 464.7 465.1 465.4 465.6 465.8 466.1 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.4 466.7 466.9 467.2 467.5 468.0 468.4 468.6 468.9 468.3 467.6 466.4 464.4 462.0 459.3 456.0 452.2 449.0 446.0 443.1 441.0 439.5 438.5 437.9 437.5 437.7 437.9 438.4 438.8 439.1 439.3 439.6 439.8 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.3 440.5 440.5 440.7 441.0 441.4 441.5 441.5 441.3 440.6 439.1 437.0 434.2 430.6 426.3 420.5 415.3 410.1 404.6 400.5 397.2 394.5 392.6 391.4 390.9 390.6 390.6 390.8 391.1 391.4 391.5 391.6 391.8 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.2 392.4 392.3 392.2 392.2 392.2 392.1 391.5 390.6 388.6 385.6 381.6 375.9 368.3 360.1 351.0 339.3 329.8 321.3 313.1 306.8 302.4 298.9 296.3 294.9 294.1 293.7 293.5 293.5 293.5 293.5 293.4 293.5 293.6 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.6 293.5 293.4 293.5 293.5 293.5 293.5 293.7 294.3 295.4 297.0 299.5 303.8 308.9 315.3 323.9 333.6 343.2 353.2 363.5 371.5 377.6 382.5 386.6 389.3 390.8 391.7 392.2 392.2 392.2 392.2 392.4 392.3 392.1 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 391.8 391.8 391.6 391.3 390.9 390.5 390.1 389.5 389.1 388.9 389.1 389.6 390.3 391.2 392.8 394.5 397.0 400.6 405.3 411.1 419.5 431.0 443.7 458.9 479.8 497.9 515.2 532.6 546.7 
557.1 565.4 571.7 575.6 577.8 579.1 580.0 580.4 580.8 581.5 582.7 582.9 583.5 584.4 585.1 585.6 586.2 586.8 587.0 587.3 587.7 588.0 588.0 588.2 588.5 588.7 588.7 588.5 587.7 586.3 583.3 579.0 573.7 567.1 558.7 548.3 538.6 529.1 519.2 511.5 505.6 500.7 496.9 494.8 493.6 492.7 492.5 492.6 492.7 492.7 492.7 492.7 492.7 492.5 492.7 493.3 494.5 496.5 499.4 503.7 510.1 517.2 525.5 536.3 546.3 555.5 564.6 572.6 578.1 582.6 585.6 587.3 588.3 588.7 588.7 588.6 588.3 588.0 588.0 587.7 587.4 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.1 586.8 586.8 587.0 587.0 587.0 587.0 587.8 589.1 591.4 595.5 601.9 609.7 619.7 636.1 652.5 670.9 692.6 712.9 730.2 745.9 759.5 768.7 775.1 779.9 782.3 783.8 784.4 784.4 784.4 784.4 784.8 784.5 784.1 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.2 784.6 784.7 784.4 784.4 784.4 784.4 783.5 781.7 778.6 773.8 766.5 755.2 743.0 727.1 706.4 686.3 667.1 649.2 632.8 617.7 607.7 600.4 594.3 590.8 588.9 587.6 587.0 587.0 587.0 587.0 586.7 586.9 587.2 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.7 588.0 588.0 588.2 588.7 589.1 589.4 589.3 589.0 588.2 586.1 583.4 579.6 574.8 569.0 561.3 554.4 547.5 540.1 534.7 530.2 526.6 524.1 522.5 521.7 521.4 521.4 521.6 522.1 522.5 522.6 522.7 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.4 523.7 523.9 524.2 524.5 525.0 525.6 525.9 526.1 526.1 525.5 524.5 522.9 520.7 517.6 514.2 510.6 506.6 502.6 499.4 496.7 494.2 492.7 491.9 491.3 491.2 491.4 491.7 492.3 492.7 493.0 493.2 493.5 493.7 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.6 493.4 493.1 492.9 492.6 492.1 491.6 491.3 491.1 491.5 492.2 493.3 495.2 497.8 500.6 504.0 508.4 512.0 515.6 518.9 521.6 523.6 524.9 525.8 526.3 526.0 525.8 525.3 524.8 524.4 524.1 523.8 523.6 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.5 523.8 524.0 524.4 525.0 525.5 526.0 526.9 527.4 527.7 527.8 527.5 527.0 526.4 525.5 524.5 523.5 522.4 521.3 520.4 519.7 519.2 518.7 518.7 519.0 519.5 520.2 520.8 521.4 521.9 522.4 522.6 522.9 523.2 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 
523.2 522.9 522.6 523.3 523.3 523.5 523.9 524.8 526.3 529.0 533.6 539.5 548.4 560.5 577.8 598.5 620.8 646.5 675.9 700.1 720.9 741.4 755.4 765.4 773.0 778.1 781.1 782.6 783.5 783.9 784.0 784.5 784.7 784.3 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.1 784.5 784.9 784.1 784.0 783.7 783.1 782.0 780.0 775.7 770.5 762.0 748.5 731.9 712.5 688.4 660.8 635.5 611.2 586.0 569.0 555.3 543.8 535.9 531.0 527.6 525.2 524.2 523.7 523.3 523.3 522.9 522.7 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.2 522.9 522.9 522.7 522.2 521.7 521.1 520.5 519.8 519.3 519.0 519.0 519.2 519.7 520.5 521.5 522.7 524.5 526.0 528.0 530.9 534.3 538.4 543.6 549.7 555.5 561.5 568.0 572.4 575.9 578.8 580.8 581.9 582.6 582.6 582.6 582.3 582.0 581.9 582.3 582.7 583.1 583.8 584.6 585.2 585.8 586.4 586.8 587.1 587.4 587.7 588.0 588.0 588.3 588.6 588.7 588.7 588.3 587.3 585.1 582.6 578.1 572.6 564.6 555.5 546.3 536.3 525.5 517.2 510.1 503.7 499.4 496.5 494.5 493.3 492.7 492.5 492.7 492.7 492.7 492.7 492.7 492.6 492.5 492.9 493.6 494.8 497.3 501.0 505.6 511.5 519.2 529.1 538.6 548.3 558.7 567.1 573.7 579.4 583.6 586.0 587.7 588.7 588.7 588.7 588.5 588.2 588.0 587.9 587.6 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.0 586.7 586.9 587.0 587.0 587.0 587.2 588.0 589.4 592.4 597.0 603.3 612.5 625.1 639.3 655.8 678.6 698.1 716.6 735.3 750.3 761.4 770.3 777.1 780.8 782.7 784.0 784.4 784.4 784.4 784.5 784.8 784.4 784.1 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.4 784.7 784.7 784.4 784.4 784.4 784.4 783.1 781.2 777.8 771.8 763.2 752.8 739.1 720.2 702.1 682.4 663.3 643.9 627.9 615.7 605.9 598.0 593.1 590.3 588.3 587.3 587.0 587.0 587.0 586.9 586.7 587.0 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 
587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.4 587.7 588.0 588.0 588.3 588.8 589.2 589.4 589.3 588.8 587.6 585.6 582.8 578.7 573.1 566.7 559.9 552.7 544.8 538.6 533.7 528.8 525.8 523.7 522.3 521.6 521.4 521.4 521.7 522.2 522.5 522.6 522.8 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.5 523.7 524.0 524.3 524.6 525.2 525.7 525.9 526.2 525.9 525.2 524.2 522.4 519.7 516.9 513.5 509.2 505.4 501.9 498.6 495.9 493.9 492.5 491.6 491.1 491.2 491.5 491.9 492.4 492.8 493.1 493.3 493.6 493.8 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 494.2 494.5 494.7 494.9 495.4 495.9 496.3 496.5 496.6 496.2 495.5 493.9 491.9 489.5 486.4 482.6 479.1 475.7 471.9 469.4 467.2 465.5 464.4 463.8 463.5 463.8 464.0 464.5 465.0 465.3 465.5 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.8 467.1 467.5 468.0 468.5 469.1 469.7 470.0 470.2 470.1 469.7 469.2 468.5 467.6 466.7 465.7 464.7 463.9 463.2 462.7 462.2 462.1 462.3 462.7 463.2 463.9 464.4 464.8 465.3 465.5 465.8 466.0 466.2 466.2 466.2 466.4 466.7 466.9 467.3 467.8 468.3 468.9 469.5 469.9 470.2 470.2 469.9 469.4 468.7 468.0 467.0 466.0 465.1 464.2 463.4 462.9 462.4 462.1 462.2 462.5 462.9 463.6 464.2 464.6 465.1 465.5 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.7 467.0 467.4 467.9 468.4 469.0 469.6 470.0 470.2 470.2 469.7 469.2 468.6 467.7 466.7 465.8 464.9 463.9 463.2 462.8 462.3 462.1 462.3 462.6 463.1 463.8 464.3 464.7 465.2 465.5 465.8 466.0 466.2 466.2 466.2 466.4 466.7 466.9 467.3 467.8 468.2 468.8 469.5 469.9 470.2 470.2 469.9 469.4 468.9 468.1 467.1 466.2 465.3 464.2 463.5 462.9 462.5 462.1 462.2 462.4 462.9 463.6 464.1 464.6 465.1 465.4 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.7 466.7 466.9 467.2 467.2 467.2 467.0 466.1 464.4 462.4 458.9 454.5 448.1 440.9 433.6 425.7 417.1 410.5 404.8 399.8 396.4 394.0 392.4 391.5 391.1 391.1 391.1 391.3 391.5 391.5 391.7 391.8 392.0 392.0 392.0 392.2 392.4 392.3 392.2 392.2 392.2 392.1 391.5 390.6 388.6 385.6 381.6 375.9 368.3 360.1 351.0 339.3 329.8 321.3 313.1 306.8 302.4 298.9 296.3 294.9 294.1 293.7 293.5 293.5 293.5 293.5 293.4 293.5 293.6 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.6 293.5 293.3 293.5 293.5 293.5 293.5 293.7 294.2 295.1 296.6 299.0 303.0 307.8 313.9 322.0 331.7 341.2 351.0 362.0 369.9 376.4 382.2 386.3 388.9 390.6 391.7 392.2 392.2 392.2 392.2 392.3 392.4 392.2 392.2 392.0 392.2 392.2 392.4 392.4 392.5 392.8 393.2 393.4 393.4 393.2 392.7 391.5 389.7 387.4 384.2 380.0 375.6 370.9 366.3 361.5 357.5 354.4 351.9 350.0 348.8 348.3 348.1 348.0 348.1 348.4 348.7 348.8 348.8 349.0 349.2 349.2 349.2 349.2 349.2 
349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.0 348.9 348.8 348.7 348.4 348.2 348.0 348.1 348.2 348.7 350.0 351.6 353.9 356.8 360.5 365.4 370.0 374.7 379.8 383.6 386.8 389.5 391.4 392.6 393.1 393.4 393.4 393.2 392.9 392.6 392.4 392.4 392.2 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 391.7 391.5 391.5 391.4 391.2 391.1 391.1 391.2 392.0 393.1 394.8 398.1 402.1 407.1 413.7 421.4 429.0 436.7 445.1 451.4 456.3 460.7 463.6 465.4 466.6 467.2 467.2 467.2 467.0 466.8 466.7 466.6 466.4 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.0 465.8 465.6 465.6 465.3 464.8 464.6 464.6 464.7 465.2 466.4 468.4 470.9 474.6 479.8 485.4 491.4 498.4 504.8 510.2 514.9 519.2 521.7 523.5 524.6 525.0 525.1 525.0 524.6 524.1 523.9 523.9 523.6 523.4 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.2 522.9 522.7 522.4 522.1 521.7 521.2 520.7 520.5 520.2 520.8 521.6 522.9 525.2 528.0 531.1 534.9 539.4 543.3 546.9 550.5 553.1 555.0 556.3 557.1 557.5 557.3 557.0 556.4 555.9 555.5 555.2 554.9 554.6 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.0 553.8 553.5 553.2 552.8 552.2 551.7 551.4 551.2 551.7 552.6 554.0 556.1 558.8 562.0 566.5 570.6 574.7 579.3 582.7 585.5 587.8 589.4 590.2 590.7 590.4 590.1 589.6 589.0 588.5 588.2 587.9 587.6 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.1 587.0 586.9 586.4 585.9 585.3 584.6 583.8 583.2 582.7 582.4 582.4 582.7 583.0 583.6 584.5 585.1 585.7 586.3 586.8 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 
587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0", 63 | "input_type": "phoneme", 64 | "offset": 72.491 65 | } 66 | res = preprocess(inp) 67 | print(res) 68 | print([float(i) for i in res[0]]) 69 | 70 | def cross_fade(a: np.ndarray, b: np.ndarray, idx: int): 71 | result = np.zeros(idx + b.shape[0]) 72 | fade_len = a.shape[0] - idx 73 | np.copyto(dst=result[:idx], src=a[:idx]) 74 | k = np.linspace(0, 1.0, num=fade_len, endpoint=True) 75 | result[idx: a.shape[0]] = (1 - k) * a[idx:] + k * b[: fade_len] 76 | np.copyto(dst=result[a.shape[0]:], src=b[fade_len:]) 77 | return result 78 | 79 | 80 | def infer_ds(model, hps, ds, speaker, trans): 81 | 82 | sample_rate = 44100 83 | 84 | result = np.zeros(0) 85 | current_length = 0 86 | for inp in tqdm.tqdm(ds): 87 | spkid = hps.data.spk2id[speaker] 88 | f0_seq, pitch, phseq, durations = preprocess(inp) 89 | 90 | f0 = torch.FloatTensor(f0_seq).unsqueeze(0) 91 | 92 | text_norm = torch.LongTensor(phseq) 93 | x_tst = text_norm.unsqueeze(0) 94 | x_tst_lengths = torch.LongTensor([text_norm.size(0)]) 95 | spk = torch.LongTensor([spkid]) 96 | manual_f0 = torch.FloatTensor(f0).unsqueeze(0) 97 | manual_dur = torch.LongTensor(durations).unsqueeze(0) 98 | t1 = time.time() 99 | with torch.no_grad(): 100 | infer_res = model.infer(x_tst, x_tst_lengths, None, None, 101 | None, gtdur=manual_dur, spk_id=spk, 102 | F0=manual_f0 * 2 ** (trans / 12)) 103 | seg_audio = infer_res[0][0, 0].data.float().numpy() 104 | try: 105 | offset_ = inp['offset'] 106 | except: 107 | offset_ = 0 108 | silent_length = round(offset_ * sample_rate) - current_length 109 | if silent_length >= 0: 110 | result = np.append(result, np.zeros(silent_length)) 111 | result = np.append(result, seg_audio) 112 | else: 113 | result = cross_fade(result, seg_audio, current_length + silent_length) 114 | current_length = current_length + silent_length + seg_audio.shape[0] 115 | print("infer time:", time.time() - t1) 116 | return result 117 | 118 | 119 | 120 | 121 | # 122 | # midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0 123 | # for x in note_lst] -------------------------------------------------------------------------------- /egs/visinger2/models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | import math 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 8 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 9 | 10 | sys.path.append('../..') 11 | import modules.commons as commons 12 | import modules.modules as modules 13 | import modules.attentions as attentions 14 | 15 | from modules.commons import init_weights, get_padding 16 | from text.npu.symbols import ttsing_phone_set, ttsing_opencpop_pitch_set, ttsing_slur_set 17 | 18 | from modules.ddsp import mlp, gru, scale_function, remove_above_nyquist, upsample 19 | from modules.ddsp import harmonic_synth, amp_to_impulse_response, fft_convolve 20 | from modules.ddsp import resample 21 | 22 | from modules.stft import TorchSTFT 23 | 24 | import torch.distributions as D 25 | 26 | from modules.losses import ( 27 | generator_loss, 28 | discriminator_loss, 29 | feature_loss, 30 | kl_loss 31 | ) 32 | 33 | LRELU_SLOPE = 0.1 34 | 35 | 36 | class DurationPredictor(nn.Module): 37 | def __init__(self, in_channels, 
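# (editorial note) a plain 3-layer Conv1d + LayerNorm + dropout stack; the final
# projection (self.proj below) emits 2 values per input position, and when
# n_speakers != 0 a speaker embedding is mixed in through self.cond in forward().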
filter_channels, kernel_size, p_dropout, n_speakers=0, spk_channels=0): 38 | super().__init__() 39 | 40 | self.in_channels = in_channels 41 | self.filter_channels = filter_channels 42 | self.kernel_size = kernel_size 43 | self.p_dropout = p_dropout 44 | self.spk_channels = spk_channels 45 | 46 | self.drop = nn.Dropout(p_dropout) 47 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) 48 | self.norm_1 = modules.LayerNorm(filter_channels) 49 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) 50 | self.norm_2 = modules.LayerNorm(filter_channels) 51 | self.conv_3 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) 52 | self.norm_3 = modules.LayerNorm(filter_channels) 53 | self.proj = nn.Conv1d(filter_channels, 2, 1) 54 | 55 | if n_speakers != 0: 56 | self.cond = nn.Conv1d(spk_channels, in_channels, 1) 57 | 58 | def forward(self, x, x_mask, spk_emb=None): 59 | # x = torch.detach(x) 60 | if spk_emb is not None: 61 | spk_emb = torch.detach(spk_emb) 62 | x = x + self.cond(spk_emb) 63 | 64 | x = self.conv_1(x * x_mask) 65 | x = torch.relu(x) 66 | x = self.norm_1(x) 67 | x = self.drop(x) 68 | 69 | x = self.conv_2(x * x_mask) 70 | x = torch.relu(x) 71 | x = self.norm_2(x) 72 | x = self.drop(x) 73 | 74 | x = self.conv_3(x * x_mask) 75 | x = torch.relu(x) 76 | x = self.norm_3(x) 77 | x = self.drop(x) 78 | 79 | x = self.proj(x * x_mask) 80 | return x * x_mask 81 | 82 | 83 | class TextEncoder(nn.Module): 84 | def __init__(self, 85 | n_vocab, 86 | out_channels, 87 | hidden_channels, 88 | filter_channels, 89 | n_heads, 90 | n_layers, 91 | kernel_size, 92 | p_dropout): 93 | super().__init__() 94 | self.n_vocab = n_vocab 95 | self.out_channels = out_channels 96 | self.hidden_channels = hidden_channels 97 | self.filter_channels = filter_channels 98 | self.n_heads = n_heads 99 | self.n_layers = n_layers 100 | self.kernel_size = kernel_size 101 | self.p_dropout = p_dropout 102 | 103 | self.emb_phone = nn.Embedding(len(ttsing_phone_set), 256) 104 | nn.init.normal_(self.emb_phone.weight, 0.0, 256 ** -0.5) 105 | 106 | self.pre_net = torch.nn.Linear(256, hidden_channels) 107 | 108 | self.encoder = attentions.Encoder( 109 | hidden_channels, 110 | filter_channels, 111 | n_heads, 112 | n_layers, 113 | kernel_size, 114 | p_dropout) 115 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 116 | 117 | def forward(self, phone, phone_lengths, pitchid, dur, slur): 118 | phone_end = self.emb_phone(phone) * math.sqrt(256) 119 | x = phone_end 120 | 121 | x = self.pre_net(x) 122 | x = torch.transpose(x, 1, -1) # [b, h, t] 123 | 124 | x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to(x.dtype) 125 | 126 | x = self.encoder(x * x_mask, x_mask) 127 | x = self.proj(x) * x_mask 128 | 129 | return x, x_mask 130 | 131 | 132 | def pad_v2(input_ele, mel_max_length=None): 133 | if mel_max_length: 134 | max_len = mel_max_length 135 | else: 136 | max_len = max([input_ele[i].size(0) for i in range(len(input_ele))]) 137 | 138 | out_list = list() 139 | for i, batch in enumerate(input_ele): 140 | if len(batch.shape) == 1: 141 | one_batch_padded = F.pad( 142 | batch, (0, max_len - batch.size(0)), "constant", 0.0 143 | ) 144 | elif len(batch.shape) == 2: 145 | one_batch_padded = F.pad( 146 | batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0 147 | ) 148 | out_list.append(one_batch_padded) 149 | out_padded = torch.stack(out_list) 150 | return out_padded 151 | 152 | 153 | class 
LengthRegulator(nn.Module): 154 | """ Length Regulator """ 155 | 156 | def __init__(self): 157 | super(LengthRegulator, self).__init__() 158 | 159 | def LR(self, x, duration, max_len): 160 | x = torch.transpose(x, 1, 2) 161 | output = list() 162 | mel_len = list() 163 | for batch, expand_target in zip(x, duration): 164 | expanded = self.expand(batch, expand_target) 165 | output.append(expanded) 166 | mel_len.append(expanded.shape[0]) 167 | 168 | if max_len is not None: 169 | output = pad_v2(output, max_len) 170 | else: 171 | output = pad_v2(output) 172 | output = torch.transpose(output, 1, 2) 173 | return output, torch.LongTensor(mel_len) 174 | 175 | def expand(self, batch, predicted): 176 | predicted = torch.squeeze(predicted) 177 | out = list() 178 | 179 | for i, vec in enumerate(batch): 180 | expand_size = predicted[i].item() 181 | state_info_index = torch.unsqueeze(torch.arange(0, expand_size), 1).float() 182 | state_info_length = torch.unsqueeze(torch.Tensor([expand_size] * expand_size), 1).float() 183 | state_info = torch.cat([state_info_index, state_info_length], 1).to(vec.device) 184 | new_vec = vec.expand(max(int(expand_size), 0), -1) 185 | new_vec = torch.cat([new_vec, state_info], 1) 186 | out.append(new_vec) 187 | out = torch.cat(out, 0) 188 | return out 189 | 190 | def forward(self, x, duration, max_len): 191 | output, mel_len = self.LR(x, duration, max_len) 192 | return output, mel_len 193 | 194 | 195 | class PriorDecoder(nn.Module): 196 | def __init__(self, 197 | out_bn_channels, 198 | hidden_channels, 199 | filter_channels, 200 | n_heads, 201 | n_layers, 202 | kernel_size, 203 | p_dropout, 204 | n_speakers=0, 205 | spk_channels=0): 206 | super().__init__() 207 | self.out_bn_channels = out_bn_channels 208 | self.hidden_channels = hidden_channels 209 | self.filter_channels = filter_channels 210 | self.n_heads = n_heads 211 | self.n_layers = n_layers 212 | self.kernel_size = kernel_size 213 | self.p_dropout = p_dropout 214 | self.spk_channels = spk_channels 215 | 216 | self.prenet = nn.Conv1d(hidden_channels + 2, hidden_channels, 3, padding=1) 217 | self.decoder = attentions.FFT( 218 | hidden_channels, 219 | filter_channels, 220 | n_heads, 221 | n_layers, 222 | kernel_size, 223 | p_dropout) 224 | self.proj = nn.Conv1d(hidden_channels, out_bn_channels, 1) 225 | 226 | if n_speakers != 0: 227 | self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) 228 | 229 | def forward(self, x, x_lengths, spk_emb=None): 230 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 231 | 232 | x = self.prenet(x) * x_mask 233 | 234 | if (spk_emb is not None): 235 | x = x + self.cond(spk_emb) 236 | 237 | x = self.decoder(x * x_mask, x_mask) 238 | 239 | bn = self.proj(x) * x_mask 240 | 241 | return bn, x_mask 242 | 243 | 244 | class Decoder(nn.Module): 245 | def __init__(self, 246 | out_channels, 247 | hidden_channels, 248 | filter_channels, 249 | n_heads, 250 | n_layers, 251 | kernel_size, 252 | p_dropout, 253 | n_speakers=0, 254 | spk_channels=0): 255 | super().__init__() 256 | self.out_channels = out_channels 257 | self.hidden_channels = hidden_channels 258 | self.filter_channels = filter_channels 259 | self.n_heads = n_heads 260 | self.n_layers = n_layers 261 | self.kernel_size = kernel_size 262 | self.p_dropout = p_dropout 263 | self.spk_channels = spk_channels 264 | 265 | self.prenet = nn.Conv1d(hidden_channels + 2, hidden_channels, 3, padding=1) 266 | self.decoder = attentions.FFT( 267 | hidden_channels, 268 | filter_channels, 269 | n_heads, 270 | n_layers, 271 
| kernel_size, 272 | p_dropout) 273 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 274 | 275 | if n_speakers != 0: 276 | self.cond = nn.Conv1d(spk_channels, hidden_channels, 1) 277 | 278 | def forward(self, x, x_lengths, spk_emb=None): 279 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 280 | 281 | x = self.prenet(x) * x_mask 282 | 283 | if (spk_emb is not None): 284 | x = x + self.cond(spk_emb) 285 | 286 | x = self.decoder(x * x_mask, x_mask) 287 | 288 | x = self.proj(x) * x_mask 289 | 290 | return x, x_mask 291 | 292 | 293 | class ConvReluNorm(nn.Module): 294 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 295 | super().__init__() 296 | self.in_channels = in_channels 297 | self.hidden_channels = hidden_channels 298 | self.out_channels = out_channels 299 | self.kernel_size = kernel_size 300 | self.n_layers = n_layers 301 | self.p_dropout = p_dropout 302 | assert n_layers > 1, "Number of layers should be larger than 0." 303 | 304 | self.conv_layers = nn.ModuleList() 305 | self.norm_layers = nn.ModuleList() 306 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) 307 | self.norm_layers.append(LayerNorm(hidden_channels)) 308 | self.relu_drop = nn.Sequential( 309 | nn.ReLU(), 310 | nn.Dropout(p_dropout)) 311 | for _ in range(n_layers - 1): 312 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)) 313 | self.norm_layers.append(LayerNorm(hidden_channels)) 314 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 315 | self.proj.weight.data.zero_() 316 | self.proj.bias.data.zero_() 317 | 318 | def forward(self, x): 319 | x = self.conv_layers[0](x) 320 | x = self.norm_layers[0](x) 321 | x = self.relu_drop(x) 322 | 323 | for i in range(1, self.n_layers): 324 | x_ = self.conv_layers[i](x) 325 | x_ = self.norm_layers[i](x_) 326 | x_ = self.relu_drop(x_) 327 | x = (x + x_) / 2 328 | x = self.proj(x) 329 | return x 330 | 331 | 332 | class PosteriorEncoder(nn.Module): 333 | def __init__(self, 334 | hps, 335 | in_channels, 336 | out_channels, 337 | hidden_channels, 338 | kernel_size, 339 | dilation_rate, 340 | n_layers): 341 | super().__init__() 342 | self.in_channels = in_channels 343 | self.out_channels = out_channels 344 | self.hidden_channels = hidden_channels 345 | self.kernel_size = kernel_size 346 | self.dilation_rate = dilation_rate 347 | self.n_layers = n_layers 348 | 349 | self.pre = nn.Conv1d(in_channels, hidden_channels, 1) 350 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=hps.data.n_speakers, spk_channels=hps.model.spk_channels) 351 | # self.enc = ConvReluNorm(hidden_channels, 352 | # hidden_channels, 353 | # hidden_channels, 354 | # kernel_size, 355 | # n_layers, 356 | # 0.1) 357 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) 358 | 359 | def forward(self, x, x_lengths, g=None): 360 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) 361 | x = self.pre(x) * x_mask 362 | x = self.enc(x, x_mask, g=g) 363 | stats = self.proj(x) * x_mask 364 | return stats, x_mask 365 | 366 | 367 | class ResBlock3(torch.nn.Module): 368 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 369 | super(ResBlock3, self).__init__() 370 | self.convs = nn.ModuleList([ 371 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 372 | padding=get_padding(kernel_size, dilation[0]))) 
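# (editorial note) a single dilated conv per block; the residual add (x = xt + x)
# is applied in forward() below, making this a lighter variant of the other
# ResBlock modules used elsewhere in this repo.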
373 | ]) 374 | self.convs.apply(init_weights) 375 | 376 | def forward(self, x, x_mask=None): 377 | for c in self.convs: 378 | xt = F.leaky_relu(x, LRELU_SLOPE) 379 | if x_mask is not None: 380 | xt = xt * x_mask 381 | xt = c(xt) 382 | x = xt + x 383 | if x_mask is not None: 384 | x = x * x_mask 385 | return x 386 | 387 | def remove_weight_norm(self): 388 | for l in self.convs: 389 | remove_weight_norm(l) 390 | 391 | 392 | class Generator_Harm(torch.nn.Module): 393 | def __init__(self, hps): 394 | super(Generator_Harm, self).__init__() 395 | self.hps = hps 396 | 397 | self.prenet = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1) 398 | 399 | self.net = ConvReluNorm(hps.model.hidden_channels, 400 | hps.model.hidden_channels, 401 | hps.model.hidden_channels, 402 | hps.model.kernel_size, 403 | 8, 404 | hps.model.p_dropout) 405 | 406 | # self.rnn = nn.LSTM(input_size=hps.model.hidden_channels, 407 | # hidden_size=hps.model.hidden_channels, 408 | # num_layers=1, 409 | # bias=True, 410 | # batch_first=True, 411 | # dropout=0.5, 412 | # bidirectional=True) 413 | self.postnet = Conv1d(hps.model.hidden_channels, hps.model.n_harmonic + 1, 3, padding=1) 414 | 415 | def forward(self, f0, harm, mask): 416 | pitch = f0.transpose(1, 2) 417 | harm = self.prenet(harm) 418 | 419 | harm = self.net(harm) * mask 420 | # harm = harm.transpose(1, 2) 421 | # harm, (hs, hc) = self.rnn(harm) 422 | # harm = harm.transpose(1, 2) 423 | 424 | harm = self.postnet(harm) 425 | harm = harm.transpose(1, 2) 426 | param = harm 427 | 428 | param = scale_function(param) 429 | total_amp = param[..., :1] 430 | amplitudes = param[..., 1:] 431 | amplitudes = remove_above_nyquist( 432 | amplitudes, 433 | pitch, 434 | self.hps.data.sample_rate, 435 | ) 436 | amplitudes /= amplitudes.sum(-1, keepdim=True) 437 | amplitudes *= total_amp 438 | 439 | amplitudes = upsample(amplitudes, self.hps.data.hop_size) 440 | pitch = upsample(pitch, self.hps.data.hop_size) 441 | 442 | n_harmonic = amplitudes.shape[-1] 443 | omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1) 444 | omegas = omega * torch.arange(1, n_harmonic + 1).to(omega) 445 | signal_harmonics = (torch.sin(omegas) * amplitudes) 446 | signal_harmonics = signal_harmonics.transpose(1, 2) 447 | return signal_harmonics 448 | 449 | 450 | class Generator(torch.nn.Module): 451 | def __init__(self, hps, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, 452 | upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, spk_channels=0): 453 | super(Generator, self).__init__() 454 | self.num_kernels = len(resblock_kernel_sizes) 455 | self.num_upsamples = len(upsample_rates) 456 | self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) 457 | self.upsample_rates = upsample_rates 458 | self.n_speakers = n_speakers 459 | 460 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 461 | 462 | self.downs = nn.ModuleList() 463 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 464 | i = len(upsample_rates) - 1 - i 465 | u = upsample_rates[i] 466 | k = upsample_kernel_sizes[i] 467 | # print("down: ",upsample_initial_channel//(2**(i+1))," -> ", upsample_initial_channel//(2**i)) 468 | self.downs.append(weight_norm( 469 | Conv1d(hps.model.n_harmonic + 2, hps.model.n_harmonic + 2, 470 | k, u, padding=k // 2))) 471 | 472 | self.resblocks_downs = nn.ModuleList() 473 | for i in range(len(self.downs)): 474 | j = len(upsample_rates) - 1 - i 475 |
self.resblocks_downs.append(ResBlock3(hps.model.n_harmonic + 2, 3, (1, 3))) 476 | 477 | self.concat_pre = Conv1d(upsample_initial_channel + hps.model.n_harmonic + 2, upsample_initial_channel, 3, 1, 478 | padding=1) 479 | self.concat_conv = nn.ModuleList() 480 | for i in range(len(upsample_rates)): 481 | ch = upsample_initial_channel // (2 ** (i + 1)) 482 | self.concat_conv.append(Conv1d(ch + hps.model.n_harmonic + 2, ch, 3, 1, padding=1, bias=False)) 483 | 484 | self.ups = nn.ModuleList() 485 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 486 | self.ups.append(weight_norm( 487 | ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)), 488 | k, u, padding=(k - u) // 2))) 489 | 490 | self.resblocks = nn.ModuleList() 491 | for i in range(len(self.ups)): 492 | ch = upsample_initial_channel // (2 ** (i + 1)) 493 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 494 | self.resblocks.append(resblock(ch, k, d)) 495 | 496 | self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) 497 | self.ups.apply(init_weights) 498 | 499 | if self.n_speakers != 0: 500 | self.cond = nn.Conv1d(spk_channels, upsample_initial_channel, 1) 501 | 502 | def forward(self, x, ddsp, g=None): 503 | 504 | x = self.conv_pre(x) 505 | 506 | if g is not None: 507 | x = x + self.cond(g) 508 | 509 | se = ddsp 510 | res_features = [se] 511 | for i in range(self.num_upsamples): 512 | in_size = se.size(2) 513 | se = self.downs[i](se) 514 | se = self.resblocks_downs[i](se) 515 | up_rate = self.upsample_rates[self.num_upsamples - 1 - i] 516 | se = se[:, :, : in_size // up_rate] 517 | res_features.append(se) 518 | 519 | x = torch.cat([x, se], 1) 520 | x = self.concat_pre(x) 521 | 522 | for i in range(self.num_upsamples): 523 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 524 | in_size = x.size(2) 525 | x = self.ups[i](x) 526 | # keep the output length consistent: trim the extra samples produced by the transposed conv 527 | x = x[:, :, : in_size * self.upsample_rates[i]] 528 | 529 | x = torch.cat([x, res_features[self.num_upsamples - 1 - i]], 1) 530 | x = self.concat_conv[i](x) 531 | 532 | xs = None 533 | for j in range(self.num_kernels): 534 | if xs is None: 535 | xs = self.resblocks[i * self.num_kernels + j](x) 536 | else: 537 | xs += self.resblocks[i * self.num_kernels + j](x) 538 | x = xs / self.num_kernels 539 | 540 | x = F.leaky_relu(x) 541 | x = self.conv_post(x) 542 | x = torch.tanh(x) 543 | 544 | return x 545 | 546 | def remove_weight_norm(self): 547 | print('Removing weight norm...') 548 | for l in self.ups: 549 | remove_weight_norm(l) 550 | for l in self.resblocks: 551 | l.remove_weight_norm() 552 | 553 | 554 | class Generator_Noise(torch.nn.Module): 555 | def __init__(self, hps): 556 | super(Generator_Noise, self).__init__() 557 | self.hps = hps 558 | self.win_size = hps.data.win_size 559 | self.hop_size = hps.data.hop_size 560 | self.fft_size = hps.data.n_fft 561 | self.istft_pre = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1) 562 | 563 | self.net = ConvReluNorm(hps.model.hidden_channels, 564 | hps.model.hidden_channels, 565 | hps.model.hidden_channels, 566 | hps.model.kernel_size, 567 | 8, 568 | hps.model.p_dropout) 569 | 570 | self.istft_amplitude = torch.nn.Conv1d(hps.model.hidden_channels, self.fft_size // 2 + 1, 1, 1) 571 | self.window = torch.hann_window(self.win_size) 572 | 573 | def forward(self, x, mask): 574 | istft_x = x 575 | istft_x = self.istft_pre(istft_x) 576 | 577 | istft_x = self.net(istft_x) * mask 578 | 579 | amp =
self.istft_amplitude(istft_x).unsqueeze(-1) 580 | phase = (torch.rand(amp.shape) * 2 * 3.14 - 3.14).to(amp) 581 | 582 | real = amp * torch.cos(phase) 583 | imag = amp * torch.sin(phase) 584 | spec = torch.cat([real, imag], 3) 585 | istft_x = torch.istft(spec, self.fft_size, self.hop_size, self.win_size, self.window.to(amp), True, 586 | length=x.shape[2] * self.hop_size, return_complex=False) 587 | 588 | return istft_x.unsqueeze(1) 589 | 590 | 591 | class LayerNorm(nn.Module): 592 | def __init__(self, channels, eps=1e-5): 593 | super().__init__() 594 | self.channels = channels 595 | self.eps = eps 596 | 597 | self.gamma = nn.Parameter(torch.ones(channels)) 598 | self.beta = nn.Parameter(torch.zeros(channels)) 599 | 600 | def forward(self, x): 601 | x = x.transpose(1, -1) 602 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 603 | return x.transpose(1, -1) 604 | 605 | 606 | class DiscriminatorP(torch.nn.Module): 607 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 608 | super(DiscriminatorP, self).__init__() 609 | self.period = period 610 | self.use_spectral_norm = use_spectral_norm 611 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 612 | self.convs = nn.ModuleList([ 613 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 614 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 615 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 616 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 617 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 618 | ]) 619 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 620 | 621 | def forward(self, x): 622 | fmap = [] 623 | 624 | # 1d to 2d 625 | b, c, t = x.shape 626 | if t % self.period != 0: # pad first 627 | n_pad = self.period - (t % self.period) 628 | x = F.pad(x, (0, n_pad), "reflect") 629 | t = t + n_pad 630 | x = x.view(b, c, t // self.period, self.period) 631 | 632 | for l in self.convs: 633 | x = l(x) 634 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 635 | fmap.append(x) 636 | x = self.conv_post(x) 637 | fmap.append(x) 638 | x = torch.flatten(x, 1, -1) 639 | 640 | return x, fmap 641 | 642 | 643 | class DiscriminatorS(torch.nn.Module): 644 | def __init__(self, use_spectral_norm=False): 645 | super(DiscriminatorS, self).__init__() 646 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 647 | self.convs = nn.ModuleList([ 648 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 649 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 650 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 651 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), 652 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), 653 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 654 | ]) 655 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 656 | 657 | def forward(self, x): 658 | fmap = [] 659 | 660 | for l in self.convs: 661 | x = l(x) 662 | x = F.leaky_relu(x, modules.LRELU_SLOPE) 663 | fmap.append(x) 664 | x = self.conv_post(x) 665 | fmap.append(x) 666 | x = torch.flatten(x, 1, -1) 667 | 668 | return x, fmap 669 | 670 | 671 | class MultiFrequencyDiscriminator(nn.Module): 672 | def __init__(self, 673 | hop_lengths=[128, 256, 512], 674 | hidden_channels=[256, 512, 512], 675 | domain='double', mel_scale=True): 676 
| super(MultiFrequencyDiscriminator, self).__init__() 677 | 678 | self.stfts = nn.ModuleList([ 679 | TorchSTFT(fft_size=x * 4, hop_size=x, win_size=x * 4, 680 | normalized=True, domain=domain, mel_scale=mel_scale) 681 | for x in hop_lengths]) 682 | 683 | self.domain = domain 684 | if domain == 'double': 685 | self.discriminators = nn.ModuleList([ 686 | BaseFrequenceDiscriminator(2, c) 687 | for x, c in zip(hop_lengths, hidden_channels)]) 688 | else: 689 | self.discriminators = nn.ModuleList([ 690 | BaseFrequenceDiscriminator(1, c) 691 | for x, c in zip(hop_lengths, hidden_channels)]) 692 | 693 | def forward(self, x): 694 | scores, feats = list(), list() 695 | for stft, layer in zip(self.stfts, self.discriminators): 696 | # print(stft) 697 | mag, phase = stft.transform(x.squeeze()) 698 | if self.domain == 'double': 699 | mag = torch.stack(torch.chunk(mag, 2, dim=1), dim=1) 700 | else: 701 | mag = mag.unsqueeze(1) 702 | 703 | score, feat = layer(mag) 704 | scores.append(score) 705 | feats.append(feat) 706 | return scores, feats 707 | 708 | 709 | class BaseFrequenceDiscriminator(nn.Module): 710 | def __init__(self, in_channels, hidden_channels=512): 711 | super(BaseFrequenceDiscriminator, self).__init__() 712 | 713 | self.discriminator = nn.ModuleList() 714 | self.discriminator += [ 715 | nn.Sequential( 716 | nn.ReflectionPad2d((1, 1, 1, 1)), 717 | nn.utils.weight_norm(nn.Conv2d( 718 | in_channels, hidden_channels // 32, 719 | kernel_size=(3, 3), stride=(1, 1))) 720 | ), 721 | nn.Sequential( 722 | nn.LeakyReLU(0.2, True), 723 | nn.ReflectionPad2d((1, 1, 1, 1)), 724 | nn.utils.weight_norm(nn.Conv2d( 725 | hidden_channels // 32, hidden_channels // 16, 726 | kernel_size=(3, 3), stride=(2, 2))) 727 | ), 728 | nn.Sequential( 729 | nn.LeakyReLU(0.2, True), 730 | nn.ReflectionPad2d((1, 1, 1, 1)), 731 | nn.utils.weight_norm(nn.Conv2d( 732 | hidden_channels // 16, hidden_channels // 8, 733 | kernel_size=(3, 3), stride=(1, 1))) 734 | ), 735 | nn.Sequential( 736 | nn.LeakyReLU(0.2, True), 737 | nn.ReflectionPad2d((1, 1, 1, 1)), 738 | nn.utils.weight_norm(nn.Conv2d( 739 | hidden_channels // 8, hidden_channels // 4, 740 | kernel_size=(3, 3), stride=(2, 2))) 741 | ), 742 | nn.Sequential( 743 | nn.LeakyReLU(0.2, True), 744 | nn.ReflectionPad2d((1, 1, 1, 1)), 745 | nn.utils.weight_norm(nn.Conv2d( 746 | hidden_channels // 4, hidden_channels // 2, 747 | kernel_size=(3, 3), stride=(1, 1))) 748 | ), 749 | nn.Sequential( 750 | nn.LeakyReLU(0.2, True), 751 | nn.ReflectionPad2d((1, 1, 1, 1)), 752 | nn.utils.weight_norm(nn.Conv2d( 753 | hidden_channels // 2, hidden_channels, 754 | kernel_size=(3, 3), stride=(2, 2))) 755 | ), 756 | nn.Sequential( 757 | nn.LeakyReLU(0.2, True), 758 | nn.ReflectionPad2d((1, 1, 1, 1)), 759 | nn.utils.weight_norm(nn.Conv2d( 760 | hidden_channels, 1, 761 | kernel_size=(3, 3), stride=(1, 1))) 762 | ) 763 | ] 764 | 765 | def forward(self, x): 766 | hiddens = [] 767 | for layer in self.discriminator: 768 | x = layer(x) 769 | hiddens.append(x) 770 | return x, hiddens[-1] 771 | 772 | 773 | class Discriminator(torch.nn.Module): 774 | def __init__(self, hps, use_spectral_norm=False): 775 | super(Discriminator, self).__init__() 776 | periods = [2, 3, 5, 7, 11] 777 | 778 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 779 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 780 | self.discriminators = nn.ModuleList(discs) 781 | self.disc_multfrequency = MultiFrequencyDiscriminator(hop_lengths=[int(hps.data.sample_rate * 2.5 / 1000), 782 | 
int(hps.data.sample_rate * 5 / 1000), 783 | int(hps.data.sample_rate * 7.5 / 1000), 784 | int(hps.data.sample_rate * 10 / 1000), 785 | int(hps.data.sample_rate * 12.5 / 1000), 786 | int(hps.data.sample_rate * 15 / 1000)], 787 | hidden_channels=[256, 256, 256, 256, 256]) 788 | 789 | def forward(self, y, y_hat): 790 | y_d_rs = [] 791 | y_d_gs = [] 792 | fmap_rs = [] 793 | fmap_gs = [] 794 | for i, d in enumerate(self.discriminators): 795 | y_d_r, fmap_r = d(y) 796 | y_d_g, fmap_g = d(y_hat) 797 | y_d_rs.append(y_d_r) 798 | y_d_gs.append(y_d_g) 799 | fmap_rs.append(fmap_r) 800 | fmap_gs.append(fmap_g) 801 | scores_r, fmaps_r = self.disc_multfrequency(y) 802 | scores_g, fmaps_g = self.disc_multfrequency(y_hat) 803 | for i in range(len(scores_r)): 804 | y_d_rs.append(scores_r[i]) 805 | y_d_gs.append(scores_g[i]) 806 | fmap_rs.append(fmaps_r[i]) 807 | fmap_gs.append(fmaps_g[i]) 808 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 809 | 810 | 811 | class SynthesizerTrn(nn.Module): 812 | """ 813 | Model 814 | """ 815 | 816 | def __init__(self, hps): 817 | super().__init__() 818 | self.hps = hps 819 | 820 | self.text_encoder = TextEncoder( 821 | len(ttsing_phone_set), 822 | hps.model.prior_hidden_channels, 823 | hps.model.prior_hidden_channels, 824 | hps.model.prior_filter_channels, 825 | hps.model.prior_n_heads, 826 | hps.model.prior_n_layers, 827 | hps.model.prior_kernel_size, 828 | hps.model.prior_p_dropout) 829 | 830 | self.decoder = PriorDecoder( 831 | hps.model.hidden_channels * 2, 832 | hps.model.prior_hidden_channels, 833 | hps.model.prior_filter_channels, 834 | hps.model.prior_n_heads, 835 | hps.model.prior_n_layers, 836 | hps.model.prior_kernel_size, 837 | hps.model.prior_p_dropout, 838 | n_speakers=hps.data.n_speakers, 839 | spk_channels=hps.model.spk_channels 840 | ) 841 | 842 | self.f0_decoder = Decoder( 843 | 1, 844 | hps.model.prior_hidden_channels, 845 | hps.model.prior_filter_channels, 846 | hps.model.prior_n_heads, 847 | hps.model.prior_n_layers, 848 | hps.model.prior_kernel_size, 849 | hps.model.prior_p_dropout, 850 | n_speakers=hps.data.n_speakers, 851 | spk_channels=hps.model.spk_channels 852 | ) 853 | 854 | self.mel_decoder = Decoder( 855 | hps.data.acoustic_dim, 856 | hps.model.prior_hidden_channels, 857 | hps.model.prior_filter_channels, 858 | hps.model.prior_n_heads, 859 | hps.model.prior_n_layers, 860 | hps.model.prior_kernel_size, 861 | hps.model.prior_p_dropout, 862 | n_speakers=hps.data.n_speakers, 863 | spk_channels=hps.model.spk_channels 864 | ) 865 | 866 | self.posterior_encoder = PosteriorEncoder( 867 | hps, 868 | hps.data.acoustic_dim, 869 | hps.model.hidden_channels, 870 | hps.model.hidden_channels, 3, 1, 8) 871 | 872 | self.dropout = nn.Dropout(0.2) 873 | 874 | self.duration_predictor = DurationPredictor( 875 | hps.model.prior_hidden_channels, 876 | hps.model.prior_hidden_channels, 877 | 3, 878 | 0.5, 879 | n_speakers=hps.data.n_speakers, 880 | spk_channels=hps.model.spk_channels) 881 | self.LR = LengthRegulator() 882 | 883 | self.dec = Generator(hps, 884 | hps.model.hidden_channels, 885 | hps.model.resblock, 886 | hps.model.resblock_kernel_sizes, 887 | hps.model.resblock_dilation_sizes, 888 | hps.model.upsample_rates, 889 | hps.model.upsample_initial_channel, 890 | hps.model.upsample_kernel_sizes, 891 | n_speakers=hps.data.n_speakers, 892 | spk_channels=hps.model.spk_channels) 893 | 894 | self.dec_harm = Generator_Harm(hps) 895 | 896 | self.dec_noise = Generator_Noise(hps) 897 | 898 | self.f0_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels + 2, 3, padding=1) 899 
| self.energy_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels + 2, 3, padding=1) 900 | self.mel_prenet = nn.Conv1d(hps.data.acoustic_dim, hps.model.prior_hidden_channels + 2, 3, padding=1) 901 | 902 | if hps.data.n_speakers > 1: 903 | self.emb_spk = nn.Embedding(hps.data.n_speakers, hps.model.spk_channels) 904 | self.flow = modules.ResidualCouplingBlock(hps.model.prior_hidden_channels, hps.model.hidden_channels, 5, 1, 4,n_speakers=hps.data.n_speakers, gin_channels=hps.model.spk_channels) 905 | 906 | def forward(self, phone, phone_lengths, pitchid, dur, slur, gtdur, F0, mel, bn_lengths, spk_id=None): 907 | if self.hps.data.n_speakers > 0: 908 | g = self.emb_spk(spk_id).unsqueeze(-1) # [b, h, 1] 909 | else: 910 | g = None 911 | 912 | # Encoder 913 | x, x_mask = self.text_encoder(phone, phone_lengths, pitchid, dur, slur) 914 | 915 | # LR 916 | decoder_input, mel_len = self.LR(x, gtdur, None) 917 | 918 | LF0 = 2595. * torch.log10(1. + F0 / 700.) 919 | LF0 = LF0 / 500 920 | 921 | # aam 922 | predict_mel, predict_bn_mask = self.mel_decoder(decoder_input + self.f0_prenet(LF0), bn_lengths, spk_emb=g) 923 | 924 | predict_energy = predict_mel.detach().sum(1).unsqueeze(1) / self.hps.data.acoustic_dim 925 | 926 | decoder_input = decoder_input + \ 927 | self.f0_prenet(LF0) + \ 928 | self.energy_prenet(predict_energy) + \ 929 | self.mel_prenet(predict_mel.detach()) 930 | decoder_output, predict_bn_mask = self.decoder(decoder_input, bn_lengths, spk_emb=g) 931 | 932 | prior_info = decoder_output 933 | m_p = prior_info[:, :self.hps.model.hidden_channels, :] 934 | logs_p = prior_info[:, self.hps.model.hidden_channels:, :] 935 | 936 | # posterior 937 | posterior, y_mask = self.posterior_encoder(mel, bn_lengths,g=g) 938 | m_q = posterior[:, :self.hps.model.hidden_channels, :] 939 | logs_q = posterior[:, self.hps.model.hidden_channels:, :] 940 | z = (m_q + torch.randn_like(m_q) * torch.exp(logs_q)) * y_mask 941 | z_p = self.flow(z, y_mask, g=g) 942 | 943 | # kl loss 944 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, y_mask) 945 | 946 | p_z = z 947 | p_z = self.dropout(p_z) 948 | 949 | pitch = upsample(F0.transpose(1, 2), self.hps.data.hop_size) 950 | omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1) 951 | sin = torch.sin(omega).transpose(1, 2) 952 | 953 | # dsp synthesize 954 | noise_x = self.dec_noise(p_z, y_mask) 955 | harm_x = self.dec_harm(F0, p_z, y_mask) 956 | 957 | # dsp waveform 958 | dsp_o = torch.cat([harm_x, noise_x], axis=1) 959 | 960 | decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1) 961 | 962 | # dsp based HiFiGAN vocoder 963 | x_slice, ids_slice = commons.rand_slice_segments(p_z, bn_lengths, 964 | self.hps.train.segment_size // self.hps.data.hop_size) 965 | F0_slice = commons.slice_segments(F0, ids_slice, self.hps.train.segment_size // self.hps.data.hop_size) 966 | dsp_slice = commons.slice_segments(dsp_o, ids_slice * self.hps.data.hop_size, self.hps.train.segment_size) 967 | condition_slice = commons.slice_segments(decoder_condition, ids_slice * self.hps.data.hop_size, 968 | self.hps.train.segment_size) 969 | o = self.dec(x_slice, condition_slice.detach(), g=g) 970 | 971 | return o, ids_slice, LF0 * predict_bn_mask, dsp_slice.sum(1), loss_kl, predict_mel, predict_bn_mask 972 | 973 | def infer(self, phone, phone_lengths, pitchid, dur, slur, gtdur=None, spk_id=None, length_scale=1., F0=None, noise_scale=0.8): 974 | 975 | if self.hps.data.n_speakers > 0: 976 | g = self.emb_spk(spk_id).unsqueeze(-1) # [b, h, 1] 977 | else: 978 | g = None 979 | 980 | # 
Encoder 981 | x, x_mask = self.text_encoder(phone, phone_lengths, pitchid, dur, slur) 982 | 983 | # dur 984 | y_lengths = torch.clamp_min(torch.sum(gtdur.squeeze(1), [1]), 1).long() 985 | LF0 = 2595. * torch.log10(1. + F0 / 700.) 986 | LF0 = LF0 / 500 987 | # LR 988 | decoder_input, mel_len = self.LR(x, gtdur, None) 989 | 990 | # aam 991 | predict_mel, predict_bn_mask = self.mel_decoder(decoder_input + self.f0_prenet(LF0), y_lengths, spk_emb=g) 992 | 993 | predict_energy = predict_mel.sum(1).unsqueeze(1) / self.hps.data.acoustic_dim 994 | 995 | decoder_input = decoder_input + \ 996 | self.f0_prenet(LF0) + \ 997 | self.energy_prenet(predict_energy) + \ 998 | self.mel_prenet(predict_mel) 999 | decoder_output, y_mask = self.decoder(decoder_input, y_lengths, spk_emb=g) 1000 | 1001 | prior_info = decoder_output 1002 | 1003 | m_p = prior_info[:, :self.hps.model.hidden_channels, :] 1004 | logs_p = prior_info[:, self.hps.model.hidden_channels:, :] 1005 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale 1006 | z = self.flow(z_p, y_mask, g=g, reverse=True) 1007 | 1008 | prior_z = z 1009 | 1010 | noise_x = self.dec_noise(prior_z, y_mask) 1011 | 1012 | harm_x = self.dec_harm(F0, prior_z, y_mask) 1013 | 1014 | pitch = upsample(F0.transpose(1, 2), self.hps.data.hop_size) 1015 | omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1) 1016 | sin = torch.sin(omega).transpose(1, 2) 1017 | 1018 | decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1) 1019 | 1020 | # dsp based HiFiGAN vocoder 1021 | o = self.dec(prior_z, decoder_condition, g=g) 1022 | 1023 | return o, harm_x.sum(1).unsqueeze(1), noise_x 1024 | --------------------------------------------------------------------------------
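A minimal sketch of driving `SynthesizerTrn.infer` directly, based only on the constructor and `infer` signature above. The tensor shapes, dummy input values, and the checkpoint path are assumptions for illustration, not part of the repository; in practice these tensors are built from a ds project file rather than by hand.

```python
# Sketch only: shapes follow infer() above; values and the checkpoint path are placeholders.
import torch

from utils import utils
from egs.visinger2.models import SynthesizerTrn

hps = utils.get_hparams_from_file("egs/visinger2/config.json")
net_g = SynthesizerTrn(hps).eval()
utils.load_checkpoint("path/to/G_xxx.pth", net_g, None)  # placeholder checkpoint

# One item with three phonemes; gtdur holds frames per phoneme, f0 is a frame-level curve in Hz.
phone = torch.LongTensor([[1, 2, 3]])             # [B, T_phone] phoneme ids (dummy)
phone_lengths = torch.LongTensor([3])             # [B]
pitchid = torch.LongTensor([[40, 40, 42]])        # [B, T_phone] note-pitch ids (dummy)
dur = torch.FloatTensor([[0.2, 0.3, 0.5]])        # [B, T_phone] note durations in seconds (dummy)
slur = torch.LongTensor([[0, 0, 1]])              # [B, T_phone] slur flags
gtdur = torch.LongTensor([[[20, 30, 50]]])        # [B, 1, T_phone] frames per phoneme
f0 = torch.full((1, 1, int(gtdur.sum())), 220.)   # [B, 1, T_frame] F0 in Hz
spk_id = torch.LongTensor([0])                    # speaker id from spk2id in config.json

with torch.no_grad():
    wav, harm, noise = net_g.infer(phone, phone_lengths, pitchid, dur, slur,
                                   gtdur=gtdur, spk_id=spk_id, F0=f0)
# wav: [B, 1, T_frame * hop_size] waveform; harm / noise are the DDSP harmonic and noise branches.
```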