├── .gitignore
├── LICENSE
├── README.md
├── configs
│   └── 48k.json
├── filelist.py
├── filelists
│   └── .gitignore
├── nsf_hifigan
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── collate.py
│   │   └── dataset.py
│   ├── hparams.py
│   ├── mel_processing.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── commons.py
│   │   ├── discriminators
│   │   │   ├── __init__.py
│   │   │   ├── discriminator.py
│   │   │   ├── multi_period_discriminator.py
│   │   │   └── multi_scale_discriminator.py
│   │   ├── generators
│   │   │   ├── __init__.py
│   │   │   ├── cond_module.py
│   │   │   ├── filter_module.py
│   │   │   ├── generator.py
│   │   │   └── source_module.py
│   │   ├── loss.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── conv1_keep_length.py
│   │   │   └── moving_average.py
│   │   ├── nsf_hifigan.py
│   │   └── pipeline.py
│   ├── pyin.py
│   └── utils.py
├── requirements.txt
├── split.py
└── train.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | /logs_*/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Vtuber Plan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NSF-HiFiGAN 2 | This project is a modified implementation of NSF-HiFiGAN models. 3 | For more information on NSF models, please visit https://nii-yamagishilab.github.io/samples-nsf/ 4 | 5 | The original repository is [NSF-HIFIGAN](https://github.com/PlayVoice/NSF-HIFIGAN).
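As a quick orientation before the source files below, here is a minimal inference sketch built around the `NSFHiFiGANGenerator` class from `nsf_hifigan/model/generators/generator.py`. The dimensions are taken from `configs/48k.json`; checkpoint loading and the real feature-extraction pipeline live in `train.py`/`pipeline.py`, which are not shown in this snapshot, so treat this as a sketch under those assumptions rather than a documented entry point.

```python
import torch
from nsf_hifigan.model.generators.generator import NSFHiFiGANGenerator

# Assumptions: 128-dim frame-level conditioning features (n_mel_channels in
# configs/48k.json), one output waveform channel, and an upsampling_rate equal
# to the 512-sample hop_length of the 48 kHz config.
gen = NSFHiFiGANGenerator(in_dim=128, out_dim=1, upsampling_rate=512, sampling_rate=48000)
gen.eval()

feat = torch.randn(1, 100, 128)    # (batch, frames, in_dim) acoustic features
f0 = torch.full((1, 100), 220.0)   # (batch, frames) frame-level F0 in Hz, 0 = unvoiced
with torch.no_grad():
    wav = gen(feat, f0)            # (batch, frames * upsampling_rate, 1)
```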
-------------------------------------------------------------------------------- /configs/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "max_epochs": 20000, 4 | "accumulate_grad_batches": 2, 5 | "default_root_dir": "./logs_48k", 6 | "val_check_interval": 1.0, 7 | "gradient_clip_val": 0.5 8 | }, 9 | "train": { 10 | "log_interval": 200, 11 | "eval_interval": 1000, 12 | "seed": 1234, 13 | "max_epochs": 20000, 14 | "generator_learning_rate": 0.0002, 15 | "discriminator_learning_rate": 0.0002, 16 | "betas": [ 17 | 0.8, 18 | 0.99 19 | ], 20 | "eps": 1e-9, 21 | "batch_size": 32, 22 | "fp16_run": true, 23 | "lr_decay": 0.999875, 24 | "segment_size": 16384, 25 | "init_lr_ratio": 1, 26 | "warmup_epochs": 0, 27 | "c_mel": 0, 28 | "c_spec": 45 29 | }, 30 | "data": { 31 | "training_files": "filelists/48k_audio_filelist_train.txt", 32 | "validation_files": "filelists/48k_audio_filelist_valid.txt", 33 | "sampling_rate": 48000, 34 | "filter_length": 2048, 35 | "hop_length": 512, 36 | "win_length": 2048, 37 | "n_mel_channels": 128, 38 | "mel_fmin": 0.0, 39 | "mel_fmax": null, 40 | "num_pitch": 512 41 | }, 42 | "model": { 43 | "use_spectral_norm": false, 44 | "multi_period_discriminator_periods": [2,3,5,7,11,13,17,19,23,37] 45 | } 46 | } -------------------------------------------------------------------------------- /filelist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import tqdm 5 | import soundfile as sf 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i', '--input', type=str, default="./dataset", help='Dataset path') 10 | parser.add_argument('-o', '--output', type=str, default="./filelists/48k_audio_filelist.txt", help='File list output path') 11 | parser.add_argument('-s', '--sr', type=int, default=48000, help='File target sample rate') 12 | args = parser.parse_args() 13 | 14 | if not os.path.exists(os.path.dirname(args.output)): 15 | os.makedirs(os.path.dirname(args.output), exist_ok=True) 16 | 17 | audio_files = list(glob.glob(os.path.join(args.input, "**/*.wav"), recursive=True)) 18 | 19 | target_sr = args.sr 20 | total_time = 0 21 | with open(args.output, "w", encoding="utf-8") as f: 22 | for i, audio_path in enumerate(tqdm.tqdm(audio_files)): 23 | audio = sf.SoundFile(audio_path) 24 | sec = audio.frames / audio.samplerate 25 | if audio.frames / audio.samplerate * target_sr < 16384 * 1.2: 26 | continue 27 | audio_path = audio_path.replace("\\", "/") 28 | f.write(f"{audio_path}\n") 29 | total_time += sec 30 | 31 | print(f"Total time: {total_time//3600}h") -------------------------------------------------------------------------------- /filelists/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | -------------------------------------------------------------------------------- /nsf_hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/data/__init__.py 
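The config above maps one-to-one onto the `HParams` container defined in `nsf_hifigan/hparams.py` (shown later in this listing). A minimal sketch of loading it follows; the actual entry point in `train.py` is not part of this excerpt.

```python
import json
from nsf_hifigan.hparams import HParams

# HParams recursively wraps nested dicts, so each JSON section becomes an
# attribute-accessible object.
with open("configs/48k.json", "r", encoding="utf-8") as f:
    hparams = HParams(**json.load(f))

print(hparams.data.sampling_rate)                        # 48000
print(hparams.train.batch_size)                          # 32
print(hparams.model.multi_period_discriminator_periods)  # [2, 3, 5, ...]
```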
-------------------------------------------------------------------------------- /nsf_hifigan/data/collate.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | class MelCollate(): 5 | def __init__(self, return_ids: bool = False): 6 | self.return_ids = return_ids 7 | 8 | def __call__(self, batch): 9 | # Sort the batch by waveform length (descending) and right-zero-pad every field to the batch maximum 10 | _, ids_sorted_decreasing = torch.sort( 11 | torch.LongTensor([x["wav"].size(1) for x in batch]), 12 | dim=0, descending=True) 13 | 14 | max_x_wav_len = max([x["wav"].size(1) for x in batch]) 15 | max_x_pitch_len = max([x["pitch"].size(1) for x in batch]) 16 | max_y_wav_len = max([x["wav"].size(1) for x in batch]) 17 | 18 | x_wav_lengths = torch.LongTensor(len(batch)) 19 | x_pitch_lengths = torch.LongTensor(len(batch)) 20 | y_wav_lengths = torch.LongTensor(len(batch)) 21 | 22 | x_wav_padded = torch.zeros(len(batch), 1, max_x_wav_len, dtype=torch.float32) 23 | x_pitch_padded = torch.zeros(len(batch), max_x_pitch_len, dtype=torch.float32) 24 | y_wav_padded = torch.zeros(len(batch), 1, max_y_wav_len, dtype=torch.float32) 25 | 26 | for i in range(len(ids_sorted_decreasing)): 27 | row = batch[ids_sorted_decreasing[i]] 28 | 29 | wav = row["wav"] 30 | x_wav_padded[i, :, :wav.size(1)] = wav 31 | x_wav_lengths[i] = wav.size(1) 32 | 33 | pitch = row["pitch"] 34 | x_pitch_padded[i, :pitch.size(1)] = pitch 35 | x_pitch_lengths[i] = pitch.size(1) 36 | 37 | wav = row["wav"] 38 | y_wav_padded[i, :, :wav.size(1)] = wav 39 | y_wav_lengths[i] = wav.size(1) 40 | 41 | ret = { 42 | "x_wav_values": x_wav_padded, 43 | "x_wav_lengths": x_wav_lengths, 44 | "x_pitch_values": x_pitch_padded, 45 | "x_pitch_lengths": x_pitch_lengths, 46 | "y_wav_values": y_wav_padded, 47 | "y_wav_lengths": y_wav_lengths, 48 | } 49 | 50 | if self.return_ids: 51 | ret["ids_sorted_decreasing"] = ids_sorted_decreasing 52 | return ret 53 | 54 | -------------------------------------------------------------------------------- /nsf_hifigan/data/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import Optional 4 | 5 | import torch 6 | import torchaudio 7 | 8 | import numpy as np 9 | import librosa 10 | from librosa import pyin 11 | 12 | from ..utils import load_filepaths, load_wav_to_torch 13 | 14 | resamplers = {} 15 | 16 | def load_audio(filename: str, sr: Optional[int] = None): 17 | global resamplers 18 | audio, sampling_rate = load_wav_to_torch(filename) 19 | 20 | if sr is not None and sampling_rate != sr: 21 | # rates don't match, so resample 22 | if (sampling_rate, sr) in resamplers: 23 | resampler = resamplers[(sampling_rate, sr)] 24 | else: 25 | resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=sr) 26 | resamplers[(sampling_rate, sr)] = resampler 27 | audio = resampler(audio) 28 | sampling_rate = sr 29 | # raise ValueError("{} {} SR doesn't match target {} SR".format(sampling_rate, self.sampling_rate)) 30 | return audio 31 | 32 | def normalize_pitch(pitch, mean, std): 33 | zeros = (pitch == 0.0) 34 | pitch -= mean[:, None] 35 | pitch /= std[:, None] 36 | pitch[zeros] = 0.0 37 | return pitch 38 | 39 | def estimate_pitch(audio: np.ndarray, sr: int, n_fft: int, win_length: int, hop_length: int, 40 | method='pyin', normalize_mean=None, normalize_std=None, n_formants=1): 41 | if type(normalize_mean) is float or type(normalize_mean) is list: 42 | normalize_mean = torch.tensor(normalize_mean) 43 | 44 | if type(normalize_std) is float or
type(normalize_std) is list: 45 | normalize_std = torch.tensor(normalize_std) 46 | 47 | if method == 'pyin': 48 | snd, sr = audio, sr 49 | pad_size = int((n_fft-hop_length)/2) 50 | snd = np.pad(snd, (pad_size, pad_size), mode='reflect') 51 | 52 | pitch_mel, voiced_flag, voiced_probs = pyin( 53 | snd, 54 | fmin=librosa.note_to_hz('C2'), 55 | fmax=librosa.note_to_hz('C7'), 56 | sr=sr, 57 | frame_length=win_length, 58 | hop_length=hop_length, 59 | center=False, 60 | pad_mode='reflect') 61 | # assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0 62 | 63 | pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel) 64 | pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0) 65 | # pitch_mel = F.pad(pitch_mel, (0, mel_len - pitch_mel.size(1))) 66 | 67 | if n_formants > 1: 68 | raise NotImplementedError 69 | else: 70 | raise ValueError 71 | 72 | pitch_mel = pitch_mel.float() 73 | 74 | if normalize_mean is not None: 75 | assert normalize_std is not None 76 | pitch_mel = normalize_pitch(pitch_mel, normalize_mean, normalize_std) 77 | 78 | return pitch_mel 79 | 80 | def get_pitch(audio: str, sr: int, filter_length: int, win_length: int, hop_length: int): 81 | pitch_mel = estimate_pitch( 82 | audio=audio, sr=sr, n_fft=filter_length, 83 | win_length=win_length, hop_length=hop_length, method='pyin', 84 | normalize_mean=None, normalize_std=None, n_formants=1) 85 | 86 | return pitch_mel 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, audiopaths: str, hparams): 90 | self.audiopaths = load_filepaths(audiopaths) 91 | self.hparams = hparams 92 | self.sampling_rate = hparams.sampling_rate 93 | self.filter_length = hparams.filter_length 94 | self.hop_length = hparams.hop_length 95 | self.win_length = hparams.win_length 96 | self.mel_fmin = hparams.mel_fmin 97 | self.mel_fmax = hparams.mel_fmax 98 | self.n_mel_channels = hparams.n_mel_channels 99 | 100 | self.resamplers = {} 101 | 102 | random.seed(1234) 103 | random.shuffle(self.audiopaths) 104 | 105 | def get_item(self, index: int): 106 | audio_path = self.audiopaths[index] 107 | 108 | audio_wav = load_audio(audio_path, sr=self.sampling_rate) 109 | 110 | audio_pitch = get_pitch( 111 | audio_wav.numpy(), 112 | self.sampling_rate, 113 | self.hparams.filter_length, 114 | self.hparams.win_length, 115 | self.hparams.hop_length 116 | ) 117 | 118 | return { 119 | "wav": audio_wav.unsqueeze(0), 120 | "pitch": audio_pitch, 121 | } 122 | 123 | def __getitem__(self, index): 124 | ret = self.get_item(index) 125 | return ret 126 | 127 | def __len__(self): 128 | return len(self.audiopaths) 129 | -------------------------------------------------------------------------------- /nsf_hifigan/hparams.py: -------------------------------------------------------------------------------- 1 | 2 | class HParams(): 3 | def __init__(self, **kwargs): 4 | for k, v in kwargs.items(): 5 | if type(v) == dict: 6 | v = HParams(**v) 7 | self[k] = v 8 | 9 | def keys(self): 10 | return self.__dict__.keys() 11 | 12 | def items(self): 13 | return self.__dict__.items() 14 | 15 | def values(self): 16 | return self.__dict__.values() 17 | 18 | def __len__(self): 19 | return len(self.__dict__) 20 | 21 | def __getitem__(self, key): 22 | return getattr(self, key) 23 | 24 | def __setitem__(self, key, value): 25 | return setattr(self, key, value) 26 | 27 | def __contains__(self, key): 28 | return key in self.__dict__ 29 | 30 | def __repr__(self): 31 | return self.__dict__.__repr__() 32 | -------------------------------------------------------------------------------- 
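With `MelDataset`, `MelCollate`, and `HParams` defined, the data pipeline wires together as below; this is a sketch assuming `train.py` (not part of this excerpt) does something equivalent. Note that `MelDataset` runs pyin pitch estimation inside `__getitem__`, so `num_workers` has a large effect on throughput.

```python
import json
from torch.utils.data import DataLoader
from nsf_hifigan.hparams import HParams
from nsf_hifigan.data.dataset import MelDataset
from nsf_hifigan.data.collate import MelCollate

with open("configs/48k.json", "r", encoding="utf-8") as f:
    hparams = HParams(**json.load(f))

# Filelist produced by filelist.py (assumed to be split into the train/valid
# lists named in the config, presumably by split.py).
dataset = MelDataset(hparams.data.training_files, hparams.data)
loader = DataLoader(dataset,
                    batch_size=hparams.train.batch_size,
                    shuffle=True,
                    num_workers=4,
                    collate_fn=MelCollate())

batch = next(iter(loader))
# batch["x_wav_values"]:   (B, 1, T_max) zero-padded waveforms
# batch["x_pitch_values"]: (B, F_max) frame-level pyin F0, 0.0 where unvoiced or padded
```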
/nsf_hifigan/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | import torchaudio 16 | 17 | import logging 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool=False): 52 | if torch.min(y) < -1.: 53 | logging.warning(f'min value is {torch.min(y).detach().cpu().item()}') 54 | if torch.max(y) > 1.: 55 | logging.warning(f'max value is {torch.max(y).detach().cpu().item()}') 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + '_' + str(y.device) 59 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 68 | 69 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 70 | 71 | return spec 72 | 73 | def spectrogram_torch_audio(y, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool=False): 74 | if torch.min(y) < -1.: 75 | logging.warning(f'min value is {torch.min(y).detach().cpu().item()}') 76 | if torch.max(y) > 1.: 77 | logging.warning(f'max value is {torch.max(y).detach().cpu().item()}') 78 | 79 | global hann_window 80 | dtype_device = str(y.dtype) + '_' + str(y.device) 81 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 82 | if wnsize_dtype_device not in hann_window: 83 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 84 | 85 | pad = int((n_fft-hop_size)/2) 86 | 87 | spec = torchaudio.functional.spectrogram(y, pad, hann_window[wnsize_dtype_device], 88 | n_fft, hop_size, win_size, None, 89 | center=center, pad_mode='reflect', normalized=False, onesided=True) 90 | 91 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 92 | 93 | return spec 94 | 95 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 96 | global mel_basis 97 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 98 | fmax_dtype_device = str(fmax) + '_' + dtype_device 99 
| if fmax_dtype_device not in mel_basis: 100 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 101 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 102 | 103 | if len(spec.shape) == 3: 104 | mel_matrix = mel_basis[fmax_dtype_device].unsqueeze(0) 105 | else: 106 | mel_matrix = mel_basis[fmax_dtype_device] 107 | spec = torch.matmul(mel_matrix, spec) 108 | spec = spectral_normalize_torch(spec) 109 | return spec 110 | 111 | 112 | def mel_spectrogram_torch(y, n_fft: int, num_mels: int, sampling_rate: int, hop_size: int, win_size: int, fmin: int, fmax: int, center: bool=False): 113 | if torch.min(y) < -1.: 114 | logging.warning(f'min value is {torch.min(y).detach().cpu().item()}') 115 | if torch.max(y) > 1.: 116 | logging.warning(f'max value is {torch.max(y).detach().cpu().item()}') 117 | 118 | global mel_basis, hann_window 119 | dtype_device = str(y.dtype) + '_' + str(y.device) 120 | fmax_dtype_device = str(fmax) + '_' + dtype_device 121 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 122 | if fmax_dtype_device not in mel_basis: 123 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 124 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 125 | if wnsize_dtype_device not in hann_window: 126 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 127 | 128 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 129 | y = y.squeeze(1) 130 | 131 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 132 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 133 | 134 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 135 | 136 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 137 | spec = spectral_normalize_torch(spec) 138 | 139 | return spec 140 | -------------------------------------------------------------------------------- /nsf_hifigan/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/model/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def intersperse(lst, item): 25 | result = [item] * (len(lst) * 2 + 1) 26 | result[1::2] = lst 27 | return result 28 | 29 | 30 | def kl_divergence(m_p, logs_p, m_q, logs_q): 31 | """KL(P||Q)""" 32 | kl = (logs_q - logs_p) - 0.5 33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. 
* logs_q) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b], device=x_lengths.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d( 68 | length, channels, min_timescale=1.0, max_timescale=1.0e4): 69 | position = torch.arange(length, dtype=torch.float) 70 | num_timescales = channels // 2 71 | log_timescale_increment = ( 72 | math.log(float(max_timescale) / float(min_timescale)) / 73 | (num_timescales - 1)) 74 | inv_timescales = min_timescale * torch.exp( 75 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = 
cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2,3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1. / norm_type) 161 | return total_norm 162 | -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/discriminators/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/discriminator.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | LRELU_SLOPE = 0.1 7 | 8 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 9 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 10 | from ..commons import get_padding 11 | 12 | class DiscriminatorP(torch.nn.Module): 13 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 14 | super(DiscriminatorP, self).__init__() 15 | self.period = period 16 | self.use_spectral_norm = use_spectral_norm 17 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 18 | self.convs = nn.ModuleList([ 19 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 20 | norm_f(Conv2d(32, 64, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 21 | norm_f(Conv2d(64, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 22 | norm_f(Conv2d(128, 256, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 23 | norm_f(Conv2d(256, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 24 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 25 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 26 | ]) 27 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 28 | 29 | def forward(self, x): 30 | fmap = [] 31 | 32 | # 1d to 2d 33 | b, c, t = x.shape 34 | if t % self.period != 0: # pad first 35 | n_pad = self.period - (t % self.period) 36 | x = F.pad(x, (0, n_pad), "reflect") 37 | t = t + n_pad 38 | x = x.view(b, c, t // self.period, self.period) 39 | 40 | for l in self.convs: 41 | x = l(x) 42 | x = F.leaky_relu(x, LRELU_SLOPE) 43 | fmap.append(x) 44 | x = self.conv_post(x) 45 | fmap.append(x) 46 | x = torch.flatten(x, 1, -1) 47 | 48 | return x, fmap 49 | 50 | 51 | class DiscriminatorS(torch.nn.Module): 52 | def __init__(self, 
use_spectral_norm=False): 53 | super(DiscriminatorS, self).__init__() 54 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 55 | self.convs = nn.ModuleList([ 56 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 57 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 58 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 59 | norm_f(Conv1d(256, 512, 41, 4, groups=64, padding=20)), 60 | norm_f(Conv1d(512, 1024, 41, 4, groups=256, padding=20)), 61 | norm_f(Conv1d(1024, 1024, 41, 4, groups=512, padding=20)), 62 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 63 | ]) 64 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 65 | 66 | def forward(self, x): 67 | fmap = [] 68 | 69 | for l in self.convs: 70 | x = l(x) 71 | x = F.leaky_relu(x, LRELU_SLOPE) 72 | fmap.append(x) 73 | x = self.conv_post(x) 74 | fmap.append(x) 75 | x = torch.flatten(x, 1, -1) 76 | 77 | return x, fmap 78 | 79 | class DiscriminatorSpec(torch.nn.Module): 80 | def __init__(self, n_fft: int=1024, kernel_size:int=5, stride=3, use_spectral_norm=False): 81 | super(DiscriminatorSpec, self).__init__() 82 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 83 | in_channel = n_fft // 2 + 1 84 | self.convs = nn.ModuleList([ 85 | norm_f(Conv1d(in_channel, 2048, 5, 1, padding=2)), 86 | norm_f(Conv1d(2048, 4096, 41, 4, groups=4, padding=20)), 87 | norm_f(Conv1d(4096, 4096, 5, 1, padding=2)), 88 | ]) 89 | self.conv_post = norm_f(Conv1d(4096, 1, 3, 1, padding=1)) 90 | 91 | def forward(self, x): 92 | fmap = [] 93 | 94 | for l in self.convs: 95 | x = l(x) 96 | x = F.leaky_relu(x, LRELU_SLOPE) 97 | fmap.append(x) 98 | x = self.conv_post(x) 99 | fmap.append(x) 100 | x = torch.flatten(x, 1, -1) 101 | 102 | return x, fmap 103 | 104 | -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/multi_period_discriminator.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import List 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from .discriminator import DiscriminatorP, DiscriminatorS 8 | 9 | class MultiPeriodDiscriminator(torch.nn.Module): 10 | def __init__(self, periods: List[int]=[2, 3, 5, 7, 11, 17, 23, 37], use_spectral_norm: bool=False): 11 | super(MultiPeriodDiscriminator, self).__init__() 12 | self.periods = periods 13 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 14 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 15 | self.discriminators = nn.ModuleList(discs) 16 | 17 | def forward(self, y, y_hat, g=None): 18 | y_d_rs = [] 19 | y_d_gs = [] 20 | fmap_rs = [] 21 | fmap_gs = [] 22 | for i, d in enumerate(self.discriminators): 23 | y_d_r, fmap_r = d(y) 24 | y_d_g, fmap_g = d(y_hat) 25 | y_d_rs.append(y_d_r) 26 | y_d_gs.append(y_d_g) 27 | fmap_rs.append(fmap_r) 28 | fmap_gs.append(fmap_g) 29 | 30 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 31 | 32 | -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/multi_scale_discriminator.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch.nn import AvgPool1d 6 | 7 | from .discriminator import DiscriminatorP, DiscriminatorS 8 | 9 | 10 | class MultiScaleDiscriminator(torch.nn.Module): 11 | def __init__(self, use_spectral_norm=False): 12 | 
super(MultiScaleDiscriminator, self).__init__() 13 | self.discriminators = nn.ModuleList([ 14 | DiscriminatorS(use_spectral_norm=use_spectral_norm), 15 | DiscriminatorS(), 16 | DiscriminatorS(), 17 | DiscriminatorS(), 18 | DiscriminatorS(), 19 | ]) 20 | self.meanpools = nn.ModuleList([ 21 | AvgPool1d(kernel_size=4, stride=2, padding=2), 22 | AvgPool1d(kernel_size=4, stride=2, padding=2), 23 | AvgPool1d(kernel_size=4, stride=2, padding=2), 24 | AvgPool1d(kernel_size=4, stride=2, padding=2) 25 | ]) 26 | 27 | def forward(self, y, y_hat): 28 | y_d_rs = [] 29 | y_d_gs = [] 30 | fmap_rs = [] 31 | fmap_gs = [] 32 | for i, d in enumerate(self.discriminators): 33 | if i != 0: 34 | y = self.meanpools[i-1](y) 35 | y_hat = self.meanpools[i-1](y_hat) 36 | y_d_r, fmap_r = d(y) 37 | y_d_g, fmap_g = d(y_hat) 38 | y_d_rs.append(y_d_r) 39 | fmap_rs.append(fmap_r) 40 | y_d_gs.append(y_d_g) 41 | fmap_gs.append(fmap_g) 42 | 43 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/generators/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/cond_module.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from nsf_hifigan.model.modules.conv1_keep_length import Conv1dKeepLength 7 | 8 | from ..modules.moving_average import MovingAverage 9 | 10 | class UpSampleLayer(nn.Module): 11 | """ Wrapper over up-sampling 12 | Input tensor: (batchsize=1, length, dim) 13 | Ouput tensor: (batchsize=1, length * up-sampling_factor, dim) 14 | """ 15 | def __init__(self, feature_dim, up_sampling_factor, smoothing=False): 16 | super(UpSampleLayer, self).__init__() 17 | # wrap a up_sampling layer 18 | self.scale_factor = up_sampling_factor 19 | self.l_upsamp = nn.Upsample(scale_factor=self.scale_factor) 20 | if smoothing: 21 | self.l_ave1 = MovingAverage(feature_dim, self.scale_factor) 22 | self.l_ave2 = MovingAverage(feature_dim, self.scale_factor) 23 | else: 24 | self.l_ave1 = nn.Identity() 25 | self.l_ave2 = nn.Identity() 26 | return 27 | 28 | def forward(self, x): 29 | # permute to (batchsize=1, dim, length) 30 | up_sampled_data = self.l_upsamp(x.permute(0, 2, 1)) 31 | 32 | # permute it backt to (batchsize=1, length, dim) 33 | # and do two moving average 34 | return self.l_ave1(self.l_ave2(up_sampled_data.permute(0, 2, 1))) 35 | 36 | 37 | class CondModuleHnSincNSF(nn.Module): 38 | """ Condition module for hn-sinc-NSF 39 | 40 | Upsample and transform input features 41 | CondModuleHnSincNSF(input_dimension, output_dimension, up_sample_rate, 42 | blstm_dimension = 64, cnn_kernel_size = 3) 43 | 44 | Spec, F0, cut_off_freq = CondModuleHnSincNSF(features, F0) 45 | 46 | Both input features should be frame-level features 47 | If x doesn't contain F0, just ignore the returned F0 48 | 49 | CondModuleHnSincNSF(input_dim, output_dim, up_sample, 50 | blstm_s = 64, cnn_kernel_s = 3, 51 | voiced_threshold = 0): 52 | 53 | input_dim: sum of dimensions of input features 54 | output_dim: dim of the feature Spec to be used by neural filter-block 55 | up_sample: up sampling rate of input features 56 | blstm_s: dimension of the features from blstm 
(default 64) 57 | cnn_kernel_s: kernel size of CNN in condition module (default 3) 58 | voiced_threshold: f0 > voiced_threshold is voiced, otherwise unvoiced 59 | """ 60 | def __init__(self, input_dim, output_dim, up_sample, \ 61 | blstm_s = 64, cnn_kernel_s = 3, voiced_threshold = 0): 62 | super(CondModuleHnSincNSF, self).__init__() 63 | 64 | # input feature dimension 65 | self.input_dim = input_dim 66 | self.output_dim = output_dim 67 | self.up_sample = up_sample 68 | self.blstm_s = blstm_s 69 | self.cnn_kernel_s = cnn_kernel_s 70 | self.cut_f_smooth = up_sample * 4 71 | self.voiced_threshold = voiced_threshold 72 | 73 | # the blstm layer 74 | self.l_blstm = nn.LSTM(input_dim, self.blstm_s // 2, bidirectional=True, batch_first=True) 75 | 76 | # the CNN layer (+1 dim for cut_off_frequence of sinc filter) 77 | self.l_conv1d = Conv1dKeepLength(self.blstm_s, self.output_dim, dilation_s = 1, kernel_s = self.cnn_kernel_s) 78 | # Upsampling layer for hidden features 79 | self.l_upsamp = UpSampleLayer(self.output_dim, self.up_sample, True) 80 | # separate layer for up-sampling normalized F0 values 81 | self.l_upsamp_f0_hi = UpSampleLayer(1, self.up_sample, True) 82 | 83 | # Upsampling for F0: don't smooth up-sampled F0 84 | self.l_upsamp_F0 = UpSampleLayer(1, self.up_sample, False) 85 | 86 | # Another smoothing layer to smooth the cut-off frequency 87 | # for sinc filters. Use a larger window to smooth 88 | self.l_cut_f_smooth = MovingAverage(1, self.cut_f_smooth) 89 | 90 | def get_cut_f(self, hidden_feat, f0): 91 | """ cut_f = get_cut_f(self, feature, f0) 92 | feature: (batchsize, length, dim=1) 93 | f0: (batchsize, length, dim=1) 94 | """ 95 | # generate uv signal 96 | uv = torch.ones_like(f0) * (f0 > self.voiced_threshold) 97 | # hidden_feat is between (-1, 1) after conv1d with tanh 98 | # (-0.2, 0.2) + 0.3 = (0.1, 0.5) 99 | # voiced: (0.1, 0.5) + 0.4 = (0.5, 0.9) 100 | # unvoiced: (0.1, 0.5) = (0.1, 0.5) 101 | return hidden_feat * 0.2 + uv * 0.4 + 0.3 102 | 103 | 104 | def forward(self, feature, f0): 105 | """ spec, f0 = forward(self, feature, f0) 106 | feature: (batchsize, length, dim) 107 | f0: (batchsize, length, dim=1), which should be F0 at frame-level 108 | 109 | spec: (batchsize, length, self.output_dim), at wave-level 110 | f0: (batchsize, length, 1), at wave-level 111 | """ 112 | feature_h, feature_c = self.l_blstm(feature) 113 | tmp = self.l_upsamp(self.l_conv1d(feature_h)) 114 | tmp_f0 = self.l_upsamp_f0_hi(f0) 115 | # concatenat normed F0 with hidden spectral features 116 | context = torch.cat((tmp[:, :, :-1], tmp_f0), dim=2) 117 | 118 | # hidden feature for cut-off frequency 119 | hidden_cut_f = tmp[:, :, self.output_dim-1:] 120 | 121 | # directly up-sample F0 without smoothing 122 | f0_upsamp = self.l_upsamp_F0(f0) 123 | 124 | # get the cut-off-frequency from output of CNN 125 | cut_f = self.get_cut_f(hidden_cut_f, f0_upsamp) 126 | # smooth the cut-off-frequency using fixed average smoothing 127 | cut_f_smoothed = self.l_cut_f_smooth(cut_f) 128 | 129 | # return 130 | return context, f0_upsamp, cut_f_smoothed, hidden_cut_f 131 | -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/filter_module.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import numpy as np 7 | 8 | from nsf_hifigan.model.modules.conv1_keep_length import Conv1dKeepLength 9 | 10 | class TimeVarFIRFilter(nn.Module): 11 | """ 
TimeVarFIRFilter 12 | Given sequences of filter coefficients and a signal, do filtering 13 | 14 | Filter coefs: (batchsize=1, signal_length, filter_order = K) 15 | Signal: (batchsize=1, signal_length, 1) 16 | 17 | For batch 0: 18 | For n in [1, sequence_length): 19 | output(0, n, 1) = \sum_{k=1}^{K} signal(0, n-k, 1)*coef(0, n, k) 20 | 21 | Note: filter coef (0, n, :) is only used to compute the output 22 | at (0, n, 1) 23 | """ 24 | def __init__(self): 25 | super(TimeVarFIRFilter, self).__init__() 26 | 27 | def forward(self, signal, f_coef): 28 | """ 29 | Filter coefs: (batchsize=1, signal_length, filter_order = K) 30 | Signal: (batchsize=1, signal_length, 1) 31 | 32 | Output: (batchsize=1, signal_length, 1) 33 | 34 | For n in [1, sequence_length): 35 | output(0, n, 1)= \sum_{k=1}^{K} signal(0, n-k, 1)*coef(0, n, k) 36 | 37 | This method may be not efficient: 38 | 39 | Suppose signal [x_1, ..., x_N], filter [a_1, ..., a_K] 40 | output [y_1, y_2, y_3, ..., y_N, *, * ... *] 41 | = a_1 * [x_1, x_2, x_3, ..., x_N, 0, ..., 0] 42 | + a_2 * [ 0, x_1, x_2, x_3, ..., x_N, 0, ..., 0] 43 | + a_3 * [ 0, 0, x_1, x_2, x_3, ..., x_N, 0, ..., 0] 44 | """ 45 | signal_l = signal.shape[1] 46 | order_k = f_coef.shape[-1] 47 | 48 | # pad to (batchsize=1, signal_length + filter_order-1, dim) 49 | padded_signal = F.pad(signal, (0, 0, 0, order_k - 1)) 50 | 51 | y = torch.zeros_like(signal) 52 | # roll and weighted sum, only take [0:signal_length] 53 | for k in range(order_k): 54 | y += torch.roll(padded_signal, k, dims=1)[:, 0:signal_l, :] \ 55 | * f_coef[:, :, k:k+1] 56 | # done 57 | return y 58 | 59 | 60 | class SincFilter(nn.Module): 61 | """ SincFilter 62 | Given the cut-off-frequency, produce the low-pass and high-pass 63 | windowed-sinc-filters. 64 | If input cut-off-frequency is (batchsize=1, signal_length, 1), 65 | output filter coef is (batchsize=1, signal_length, filter_order). 66 | For each time step in [1, signal_length), we calculate one 67 | filter for low-pass sinc filter and another for high-pass filter. 68 | 69 | Example: 70 | import scipy 71 | import scipy.signal 72 | import numpy as np 73 | 74 | filter_order = 31 75 | cut_f = 0.2 76 | sinc_layer = SincFilter(filter_order) 77 | lp_coef, hp_coef = sinc_layer(torch.ones(1, 10, 1) * cut_f) 78 | 79 | w, h1 = scipy.signal.freqz(lp_coef[0, 0, :].numpy(), [1]) 80 | w, h2 = scipy.signal.freqz(hp_coef[0, 0, :].numpy(), [1]) 81 | plt.plot(w, 20*np.log10(np.abs(h1))) 82 | plt.plot(w, 20*np.log10(np.abs(h2))) 83 | plt.plot([cut_f * np.pi, cut_f * np.pi], [-100, 0]) 84 | """ 85 | def __init__(self, filter_order): 86 | super(SincFilter, self).__init__() 87 | # Make the filter oder an odd number 88 | # [-(M-1)/2, ... 0, (M-1)/2] 89 | # 90 | self.half_k = (filter_order - 1) // 2 91 | self.order = self.half_k * 2 +1 92 | 93 | def hamming_w(self, n_index): 94 | """ prepare hamming window for each time step 95 | n_index (batchsize=1, signal_length, filter_order) 96 | For each time step, n_index will be [-(M-1)/2, ... 0, (M-1)/2] 97 | n_index[0, 0, :] = [-(M-1)/2, ... 0, (M-1)/2] 98 | n_index[0, 1, :] = [-(M-1)/2, ... 0, (M-1)/2] 99 | ... 100 | output (batchsize=1, signal_length, filter_order) 101 | output[0, 0, :] = hamming_window 102 | output[0, 1, :] = hamming_window 103 | ... 
104 | """ 105 | # Hamming window 106 | return 0.54 + 0.46 * torch.cos(2 * np.pi * n_index / self.order) 107 | 108 | def sinc(self, x): 109 | """ Normalized sinc-filter sin( pi * x) / pi * x 110 | https://en.wikipedia.org/wiki/Sinc_function 111 | 112 | Assume x (batchsize, signal_length, filter_order) and 113 | x[0, 0, :] = [-half_order, - half_order+1, ... 0, ..., half_order] 114 | x[:, :, self.half_order] -> time index = 0, sinc(0)=1 115 | """ 116 | y = torch.zeros_like(x) 117 | y[:,:,0:self.half_k]=torch.sin(np.pi * x[:, :, 0:self.half_k]) / (np.pi * x[:, :, 0:self.half_k]) 118 | y[:,:,self.half_k+1:]=torch.sin(np.pi * x[:, :, self.half_k+1:]) / (np.pi * x[:, :, self.half_k+1:]) 119 | y[:,:,self.half_k] = 1 120 | return y 121 | 122 | def forward(self, cut_f): 123 | """ lp_coef, hp_coef = forward(self, cut_f) 124 | cut-off frequency cut_f (batchsize=1, length, dim = 1) 125 | 126 | lp_coef: low-pass filter coefs (batchsize, length, filter_order) 127 | hp_coef: high-pass filter coefs (batchsize, length, filter_order) 128 | """ 129 | # create the filter order index 130 | with torch.no_grad(): 131 | # [- (M-1) / 2, ..., 0, ..., (M-1)/2] 132 | lp_coef = torch.arange(-self.half_k, self.half_k + 1, 133 | device=cut_f.device) 134 | # [[[- (M-1) / 2, ..., 0, ..., (M-1)/2], 135 | # [- (M-1) / 2, ..., 0, ..., (M-1)/2], 136 | # ... 137 | # ], 138 | # [[- (M-1) / 2, ..., 0, ..., (M-1)/2], 139 | # [- (M-1) / 2, ..., 0, ..., (M-1)/2], 140 | # ... 141 | # ]] 142 | lp_coef = lp_coef.repeat(cut_f.shape[0], cut_f.shape[1], 1) 143 | 144 | hp_coef = torch.arange(-self.half_k, self.half_k + 1, 145 | device=cut_f.device) 146 | hp_coef = hp_coef.repeat(cut_f.shape[0], cut_f.shape[1], 1) 147 | 148 | # temporary buffer of [-1^n] for gain norm in hp_coef 149 | tmp_one = torch.pow(-1, hp_coef) 150 | 151 | # unnormalized filter coefs with hamming window 152 | lp_coef = cut_f * self.sinc(cut_f * lp_coef) * self.hamming_w(lp_coef) 153 | 154 | hp_coef = (self.sinc(hp_coef) \ 155 | - cut_f * self.sinc(cut_f * hp_coef)) \ 156 | * self.hamming_w(hp_coef) 157 | 158 | # normalize the coef to make gain at 0/pi is 0 dB 159 | # sum_n lp_coef[n] 160 | lp_coef_norm = torch.sum(lp_coef, axis=2).unsqueeze(-1) 161 | # sum_n hp_coef[n] * -1^n 162 | hp_coef_norm = torch.sum(hp_coef * tmp_one, axis=2).unsqueeze(-1) 163 | 164 | lp_coef = lp_coef / lp_coef_norm 165 | hp_coef = hp_coef / hp_coef_norm 166 | 167 | # return normed coef 168 | return lp_coef, hp_coef 169 | 170 | class NeuralFilterBlock(nn.Module): 171 | """ Wrapper over a single filter block 172 | """ 173 | def __init__(self, signal_size, hidden_size, kernel_size=3, conv_num=10): 174 | super(NeuralFilterBlock, self).__init__() 175 | self.signal_size = signal_size 176 | self.hidden_size = hidden_size 177 | self.kernel_size = kernel_size 178 | self.conv_num = conv_num 179 | self.dilation_s = [np.power(2, x) for x in np.arange(conv_num)] 180 | 181 | # ff layer to expand dimension 182 | self.l_ff_1 = nn.Linear(signal_size, hidden_size, bias=False) 183 | self.l_ff_1_tanh = nn.Tanh() 184 | 185 | # dilated conv layers 186 | tmp = [Conv1dKeepLength(hidden_size, hidden_size, x, kernel_size, causal=True, bias=False) \ 187 | for x in self.dilation_s] 188 | self.l_convs = nn.ModuleList(tmp) 189 | 190 | # ff layer to de-expand dimension 191 | self.l_ff_2 = nn.Linear(hidden_size, hidden_size//4, bias=False) 192 | self.l_ff_2_tanh = nn.Tanh() 193 | self.l_ff_3 = nn.Linear(hidden_size//4, signal_size, bias=False) 194 | self.l_ff_3_tanh = nn.Tanh() 195 | 196 | # a simple scale 197 | self.scale 
= nn.Parameter(torch.tensor([1/len(self.l_convs)]), requires_grad=False) 198 | return 199 | 200 | def forward(self, signal, context): 201 | """ 202 | Assume: signal (batchsize=1, length, signal_size) 203 | context (batchsize=1, length, hidden_size) 204 | Output: (batchsize=1, length, signal_size) 205 | """ 206 | # expand dimension 207 | tmp_hidden = self.l_ff_1_tanh(self.l_ff_1(signal)) 208 | 209 | # loop over dilated convs 210 | # output of a d-conv is input + context + d-conv(input) 211 | for l_conv in self.l_convs: 212 | tmp_hidden = tmp_hidden + l_conv(tmp_hidden) + context 213 | 214 | # to be consistent with legacy configuration in CURRENNT 215 | tmp_hidden = tmp_hidden * self.scale 216 | 217 | # compress the dimesion and skip-add 218 | tmp_hidden = self.l_ff_2_tanh(self.l_ff_2(tmp_hidden)) 219 | tmp_hidden = self.l_ff_3_tanh(self.l_ff_3(tmp_hidden)) 220 | output_signal = tmp_hidden + signal 221 | 222 | return output_signal 223 | 224 | 225 | class FilterModuleHnSincNSF(nn.Module): 226 | """ Filter for Hn-sinc-NSF 227 | FilterModuleHnSincNSF(signal_size, hidden_size, sinc_order = 31, 228 | block_num = 5, kernel_size = 3, 229 | conv_num_in_block = 10) 230 | signal_size: signal dimension (should be 1) 231 | hidden_size: dimension of hidden features inside neural filter block 232 | sinc_order: order of the sinc filter 233 | block_num: number of neural filter blocks in harmonic branch 234 | kernel_size: kernel size in dilated CNN 235 | conv_num_in_block: number of d-conv1d in one neural filter block 236 | 237 | Usage: 238 | output = FilterModuleHnSincNSF(har_source, noi_source, cut_f, context) 239 | har_source: source for harmonic branch (batchsize, length, dim=1) 240 | noi_source: source for noise branch (batchsize, length, dim=1) 241 | cut_f: cut-off-frequency of sinc filters (batchsize, length, dim=1) 242 | context: hidden features to be added (batchsize, length, dim) 243 | output: (batchsize, length, dim=1) 244 | """ 245 | def __init__(self, signal_size, hidden_size, sinc_order = 31, \ 246 | block_num = 5, kernel_size = 3, conv_num_in_block = 10): 247 | super(FilterModuleHnSincNSF, self).__init__() 248 | self.signal_size = signal_size 249 | self.hidden_size = hidden_size 250 | self.kernel_size = kernel_size 251 | self.block_num = block_num 252 | self.conv_num_in_block = conv_num_in_block 253 | self.sinc_order = sinc_order 254 | 255 | # filter blocks for harmonic branch 256 | tmp = [NeuralFilterBlock(signal_size, hidden_size, \ 257 | kernel_size, conv_num_in_block) \ 258 | for x in range(self.block_num)] 259 | self.l_har_blocks = nn.ModuleList(tmp) 260 | 261 | # filter blocks for noise branch (only one block, 5 sub-blocks) 262 | tmp = [NeuralFilterBlock(signal_size, hidden_size, \ 263 | kernel_size, conv_num_in_block // 2) \ 264 | for x in range(1)] 265 | self.l_noi_blocks = nn.ModuleList(tmp) 266 | 267 | # sinc filter generators and time-variant filtering layer 268 | self.l_sinc_coef = SincFilter(self.sinc_order) 269 | self.l_tv_filtering = TimeVarFIRFilter() 270 | 271 | def forward(self, har_component, noi_component, cond_feat, cut_f): 272 | # harmonic component 273 | for l_har_block in self.l_har_blocks: 274 | har_component = l_har_block(har_component, cond_feat) 275 | # noise componebt 276 | for l_noi_block in self.l_noi_blocks: 277 | noi_component = l_noi_block(noi_component, cond_feat) 278 | 279 | # get sinc filter coefficients 280 | lp_coef, hp_coef = self.l_sinc_coef(cut_f) 281 | 282 | # time-variant filtering 283 | har_signal = self.l_tv_filtering(har_component, lp_coef) 284 | 
noi_signal = self.l_tv_filtering(noi_component, hp_coef) 285 | 286 | # get output 287 | return har_signal + noi_signal 288 | -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/generator.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from functools import reduce 4 | import operator 5 | from typing import List, Union 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | 12 | from nsf_hifigan.model.generators.cond_module import CondModuleHnSincNSF 13 | from nsf_hifigan.model.generators.filter_module import FilterModuleHnSincNSF 14 | from nsf_hifigan.model.generators.source_module import SourceModuleHnNSF 15 | 16 | class NSFHiFiGANGenerator(torch.nn.Module): 17 | """ Model definition 18 | """ 19 | def __init__(self, in_dim: int, out_dim: int, 20 | upsampling_rate: int, 21 | sampling_rate: int, 22 | sine_amp: float=0.1, 23 | noise_std: float=0.003, 24 | hidden_dim: int=64, 25 | cnn_kernel_s: int=3, 26 | filter_block_num: int=5, 27 | cnn_num_in_block: int=10, 28 | harmonic_num: int=7, 29 | sinc_order: int=31 30 | ): 31 | super(NSFHiFiGANGenerator, self).__init__() 32 | 33 | self.input_dim = in_dim 34 | self.output_dim = out_dim 35 | 36 | # configurations 37 | # amplitude of sine waveform (for each harmonic) 38 | self.sine_amp = sine_amp 39 | # standard deviation of Gaussian noise for additive noise 40 | self.noise_std = noise_std 41 | # dimension of hidden features in filter blocks 42 | self.hidden_dim = hidden_dim 43 | # upsampling rate on input acoustic features (16kHz * 5ms = 80) 44 | # assume input_reso has the same value 45 | self.upsampling_rate = upsampling_rate 46 | # sampling rate (Hz) 47 | self.sampling_rate = sampling_rate 48 | # CNN kernel size in filter blocks 49 | self.cnn_kernel_s = cnn_kernel_s 50 | # number of filter blocks (for harmonic branch) 51 | # noise branch only uses 1 block 52 | self.filter_block_num = filter_block_num 53 | # number of dilated CNN in each filter block 54 | self.cnn_num_in_block = cnn_num_in_block 55 | # number of harmonic overtones in source 56 | self.harmonic_num = harmonic_num 57 | # order of sinc-windowed-FIR-filter 58 | self.sinc_order = sinc_order 59 | 60 | # the three modules 61 | self.m_cond = CondModuleHnSincNSF(self.input_dim, self.hidden_dim, self.upsampling_rate, cnn_kernel_s=self.cnn_kernel_s) 62 | 63 | self.m_source = SourceModuleHnNSF(self.sampling_rate, self.harmonic_num, self.sine_amp, self.noise_std) 64 | 65 | self.m_filter = FilterModuleHnSincNSF(self.output_dim, self.hidden_dim, self.sinc_order, self.filter_block_num, \ 66 | self.cnn_kernel_s, self.cnn_num_in_block) 67 | # loss function on spectra 68 | # self.m_aux_loss = LossAuxGen() 69 | 70 | def forward(self, feat, f0): 71 | """ definition of forward method 72 | Assume x (batchsize=1, length, dim) 73 | Return output(batchsize=1, length) 74 | """ 75 | f0 = f0.unsqueeze(2) 76 | # condition module 77 | # feature-to-filter-block, f0-up-sampled, cut-off-f-for-sinc, 78 | # hidden-feature-for-cut-off-f 79 | cond_feat, f0_upsamped, cut_f, hid_cut_f = self.m_cond(feat, f0) 80 | 81 | # source module 82 | # harmonic-source, noise-source (for noise branch), uv 83 | har_source, noi_source, uv = self.m_source(f0_upsamped) 84 | 85 | # neural filter module (including sinc-based FIR filtering) 86 | # output 87 | output = self.m_filter(har_source, noi_source, cond_feat, cut_f) 88 | 89 | 
return output 90 | 91 | def loss_aux(self, nat_wav, gen_tuple, data_in): 92 | return self.m_aux_loss.compute(gen_tuple, nat_wav) 93 | 94 | -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/source_module.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import numpy as np 7 | 8 | class SineGen(nn.Module): 9 | """ Definition of sine generator 10 | SineGen(samp_rate, harmonic_num = 0, 11 | sine_amp = 0.1, noise_std = 0.003, 12 | voiced_threshold = 0, 13 | flag_for_pulse=False) 14 | 15 | samp_rate: sampling rate in Hz 16 | harmonic_num: number of harmonic overtones (default 0) 17 | sine_amp: amplitude of sine waveform (default 0.1) 18 | noise_std: std of Gaussian noise (default 0.003) 19 | voiced_threshold: F0 threshold for U/V classification (default 0) 20 | flag_for_pulse: this SineGen is used inside PulseGen (default False) 21 | 22 | Note: when flag_for_pulse is True, the first time step of a voiced 23 | segment is always sin(np.pi) or cos(0) 24 | """ 25 | def __init__(self, samp_rate, harmonic_num = 0, 26 | sine_amp = 0.1, noise_std = 0.003, 27 | voiced_threshold = 0, 28 | flag_for_pulse=False): 29 | super(SineGen, self).__init__() 30 | self.sine_amp = sine_amp 31 | self.noise_std = noise_std 32 | self.harmonic_num = harmonic_num 33 | self.dim = self.harmonic_num + 1 34 | self.sampling_rate = samp_rate 35 | self.voiced_threshold = voiced_threshold 36 | self.flag_for_pulse = flag_for_pulse 37 | 38 | def _f02uv(self, f0): 39 | # generate uv signal 40 | uv = torch.ones_like(f0) 41 | uv = uv * (f0 > self.voiced_threshold) 42 | return uv 43 | 44 | def _f02sine(self, f0_values): 45 | """ f0_values: (batchsize, length, dim) 46 | where dim indicates fundamental tone and overtones 47 | """ 48 | # convert to F0 in rad. The integer part n can be ignored 49 | # because 2 * np.pi * n doesn't affect phase 50 | rad_values = (f0_values / self.sampling_rate) % 1 51 | 52 | # initial phase noise (no noise for fundamental component) 53 | rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],\ 54 | device = f0_values.device) 55 | rand_ini[:, 0] = 0 56 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 57 | 58 | # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) 59 | if not self.flag_for_pulse: 60 | # for normal case 61 | 62 | # To prevent torch.cumsum numerical overflow, 63 | # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. 64 | # Buffer tmp_over_one_idx indicates the time step to add -1.
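# (Without this shift, the cumulative sum grows linearly with time; float32
# keeps fewer fractional bits as the sum grows, so the phase would gradually
# lose precision on long signals.)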
66 |             tmp_over_one = torch.cumsum(rad_values, 1) % 1
67 |             tmp_over_one_idx = (tmp_over_one[:, 1:, :] - 
68 |                                 tmp_over_one[:, :-1, :]) < 0
69 |             cumsum_shift = torch.zeros_like(rad_values)
70 |             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
71 | 
72 |             sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
73 |                               * 2 * np.pi)
74 |         else:
75 |             # If necessary, make sure that the first time step of every
76 |             # voiced segment is sin(pi) or cos(0)
77 |             # This is used for pulse-train generation
78 | 
79 |             # identify the last time step in unvoiced segments
80 |             uv = self._f02uv(f0_values)
81 |             uv_1 = torch.roll(uv, shifts=-1, dims=1)
82 |             uv_1[:, -1, :] = 1
83 |             u_loc = (uv < 1) * (uv_1 > 0)
84 | 
85 |             # get the instantaneous phase
86 |             tmp_cumsum = torch.cumsum(rad_values, dim=1)
87 |             # each batch element needs to be processed separately
88 |             for idx in range(f0_values.shape[0]):
89 |                 temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
90 |                 temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
91 |                 # stores the accumulation of i.phase within
92 |                 # each voiced segment
93 |                 tmp_cumsum[idx, :, :] = 0
94 |                 tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
95 | 
96 |             # rad_values - tmp_cumsum: remove the accumulation of i.phase
97 |             # within the previous voiced segment.
98 |             i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
99 | 
100 |             # get the sines
101 |             sines = torch.cos(i_phase * 2 * np.pi)
102 |         return sines
103 | 
104 | 
105 |     def forward(self, f0):
106 |         """ sine_tensor, uv = forward(f0)
107 |         input F0: tensor(batchsize=1, length, dim=1)
108 |                   f0 for unvoiced steps should be 0
109 |         output sine_tensor: tensor(batchsize=1, length, dim)
110 |         output uv: tensor(batchsize=1, length, 1)
111 |         """
112 |         with torch.no_grad():
113 |             f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, \
114 |                                  device=f0.device)
115 |             # fundamental component
116 |             f0_buf[:, :, 0] = f0[:, :, 0]
117 |             for idx in np.arange(self.harmonic_num):
118 |                 # idx + 2: the (idx+1)-th overtone, i.e. the (idx+2)-th harmonic
119 |                 f0_buf[:, :, idx+1] = f0_buf[:, :, 0] * (idx+2)
120 | 
121 |             # generate sine waveforms
122 |             sine_waves = self._f02sine(f0_buf) * self.sine_amp
123 | 
124 |             # generate uv signal
125 |             #uv = torch.ones(f0.shape)
126 |             #uv = uv * (f0 > self.voiced_threshold)
127 |             uv = self._f02uv(f0)
128 | 
129 |             # noise: for unvoiced frames the noise amplitude should be close to sine_amp
130 |             #        (std = self.sine_amp/3 -> max value ~ self.sine_amp);
131 |             #        the noise std for voiced frames is self.noise_std
132 |             noise_amp = uv * self.noise_std + (1-uv) * self.sine_amp / 3
133 |             noise = noise_amp * torch.randn_like(sine_waves)
134 | 
135 |             # first: set the unvoiced part to 0 by uv
136 |             # then: add the noise
137 |             sine_waves = sine_waves * uv + noise
138 |         return sine_waves, uv, noise
139 | 
140 | 
141 | class SourceModuleHnNSF(nn.Module):
142 |     """ SourceModule for hn-nsf
143 |     SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
144 |                  add_noise_std=0.003, voiced_threshold=0)
145 |     sampling_rate: sampling rate in Hz
146 |     harmonic_num: number of harmonics above F0 (default: 0)
147 |     sine_amp: amplitude of sine source signal (default: 0.1)
148 |     add_noise_std: std of additive Gaussian noise (default: 0.003)
149 |         note that the amplitude of noise in unvoiced regions is decided
150 |         by sine_amp
151 |     voiced_threshold: threshold to set U/V given F0 (default: 0)
152 | 
153 |     Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
154 |     F0_sampled (batchsize, length, 1)
155 |     Sine_source (batchsize, length, 1)
156 |     noise_source (batchsize, length, 1)
157 |     uv (batchsize, length, 1)
158 |     """
159 |     def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
160 |                  add_noise_std=0.003, voiced_threshold=0):
161 |         super(SourceModuleHnNSF, self).__init__()
162 | 
163 |         self.sine_amp = sine_amp
164 |         self.noise_std = add_noise_std
165 | 
166 |         # to produce sine waveforms
167 |         self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
168 |                                  sine_amp, add_noise_std, voiced_threshold)
169 | 
170 |         # to merge source harmonics into a single excitation
171 |         self.l_linear = nn.Linear(harmonic_num+1, 1)
172 |         self.l_tanh = nn.Tanh()
173 | 
174 |     def forward(self, x):
175 |         """
176 |         Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
177 |         F0_sampled (batchsize, length, 1)
178 |         Sine_source (batchsize, length, 1)
179 |         noise_source (batchsize, length, 1); uv (batchsize, length, 1)
180 |         """
181 |         # source for harmonic branch
182 |         sine_wavs, uv, _ = self.l_sin_gen(x)
183 |         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
184 | 
185 |         # source for noise branch, in the same shape as uv
186 |         noise = torch.randn_like(uv) * self.sine_amp / 3
187 |         return sine_merge, noise, uv
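# Usage sketch for the source module (illustrative values; F0 must already be
# upsampled to sample rate, as done by the condition module in generator.py):
#
#     src = SourceModuleHnNSF(sampling_rate=48000, harmonic_num=7)
#     f0 = torch.full((1, 48000, 1), 220.0)   # one second of a 220 Hz tone
#     sine, noise, uv = src(f0)
#     # sine:  (1, 48000, 1) - 8 harmonics merged by the linear layer + tanh
#     # noise: (1, 48000, 1) - Gaussian source for the noise branch
#     # uv:    (1, 48000, 1) - 1.0 where f0 > voiced_threshold, else 0.0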
--------------------------------------------------------------------------------
/nsf_hifigan/model/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 | 
4 | def feature_loss(fmap_r, fmap_g):
5 |     loss = 0
6 |     for dr, dg in zip(fmap_r, fmap_g):
7 |         for rl, gl in zip(dr, dg):
8 |             rl = rl.float().detach()
9 |             gl = gl.float()
10 |             loss += torch.mean(torch.abs(rl - gl))
11 | 
12 |     return loss * 2
13 | 
14 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
15 |     loss = 0
16 |     r_losses = []
17 |     g_losses = []
18 |     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
19 |         dr = dr.float()
20 |         dg = dg.float()
21 |         r_loss = torch.mean((1 - dr) ** 2)
22 |         g_loss = torch.mean(dg ** 2)
23 |         loss += (r_loss + g_loss)
24 |         r_losses.append(r_loss.item())
25 |         g_losses.append(g_loss.item())
26 | 
27 |     return loss, r_losses, g_losses
28 | 
29 | def generator_loss(disc_outputs):
30 |     loss = 0
31 |     gen_losses = []
32 |     for dg in disc_outputs:
33 |         dg = dg.float()
34 |         l = torch.mean((1-dg)**2)
35 |         gen_losses.append(l)
36 |         loss += l
37 | 
38 |     return loss, gen_losses
39 | 
40 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
41 |     """
42 |     z_p, logs_q: [b, h, t_t]
43 |     m_p, logs_p: [b, h, t_t]
44 |     """
45 |     z_p = z_p.float()
46 |     logs_q = logs_q.float()
47 |     m_p = m_p.float()
48 |     logs_p = logs_p.float()
49 |     z_mask = z_mask.float()
50 | 
51 |     kl = logs_p - logs_q - 0.5
52 |     kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
53 |     kl = torch.sum(kl * z_mask)
54 |     l = kl / torch.sum(z_mask)
55 |     return l
56 | 
--------------------------------------------------------------------------------
/nsf_hifigan/model/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/modules/__init__.py
--------------------------------------------------------------------------------
/nsf_hifigan/model/modules/conv1_keep_length.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 | 
5 | class Conv1dKeepLength(nn.Conv1d):
6 |     """ Wrapper around nn.Conv1d that keeps the input length (optionally causal)
7 |     Input tensor:  (batchsize=1, length, dim_in)
8 |     Output tensor: (batchsize=1, length, dim_out)
9 |     https://github.com/pytorch/pytorch/issues/1333
10 |     Note: Tanh is optional
11 |     """
12 |     def __init__(self, input_dim, output_dim, dilation_s, kernel_s,
13 |                  causal = False, stride = 1, groups=1, bias=True, \
14 |                  tanh = True, pad_mode='constant'):
15 |         super(Conv1dKeepLength, self).__init__(
16 |             input_dim, output_dim, kernel_s, stride=stride,
17 |             padding = 0, dilation = dilation_s, groups=groups, bias=bias)
18 | 
19 |         self.pad_mode = pad_mode
20 | 
21 |         self.causal = causal
22 |         # input & output length will be the same
23 |         if self.causal:
24 |             # left pad to make the convolution causal
25 |             self.pad_le = dilation_s * (kernel_s - 1)
26 |             self.pad_ri = 0
27 |         else:
28 |             # pad on both sides
29 |             self.pad_le = dilation_s * (kernel_s - 1) // 2
30 |             self.pad_ri = dilation_s * (kernel_s - 1) - self.pad_le
31 | 
32 |         if tanh:
33 |             self.l_ac = nn.Tanh()
34 |         else:
35 |             self.l_ac = nn.Identity()
36 | 
37 |     def forward(self, data):
38 |         # permute to (batchsize=1, dim, length)
39 |         # add one dimension (batchsize=1, dim, ADDED_DIM, length)
40 |         # pad along the length axis (the extra dim lets F.pad's 2-D mode be used)
41 |         # squeeze back to (batchsize=1, dim, length)
42 |         # https://github.com/pytorch/pytorch/issues/1333
43 |         x = F.pad(data.permute(0, 2, 1).unsqueeze(2), \
44 |                   (self.pad_le, self.pad_ri, 0, 0),
45 |                   mode = self.pad_mode).squeeze(2)
46 |         # tanh(conv1d())
47 |         # permute back to (batchsize=1, length, dim)
48 |         output = self.l_ac(super(Conv1dKeepLength, self).forward(x))
49 |         return output.permute(0, 2, 1)
--------------------------------------------------------------------------------
/nsf_hifigan/model/modules/moving_average.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 | 
5 | from .conv1_keep_length import Conv1dKeepLength
6 | 
7 | class MovingAverage(Conv1dKeepLength):
8 |     """ Wrapper to define a moving average smoothing layer
9 |     Note: MovingAverage can be implemented using TimeInvFIRFilter too.
10 |     Here we define another Module directly on Conv1dKeepLength
11 |     """
12 |     def __init__(self, feature_dim, window_len, causal=False, \
13 |                  pad_mode='replicate'):
14 |         super(MovingAverage, self).__init__(
15 |             feature_dim, feature_dim, 1, window_len, causal,
16 |             groups=feature_dim, bias=False, tanh=False, \
17 |             pad_mode=pad_mode)
18 |         # set the weighting coefficients
19 |         nn.init.constant_(self.weight, 1/window_len)
20 |         # turn off grad for this layer
21 |         for p in self.parameters():
22 |             p.requires_grad = False
23 | 
24 |     def forward(self, data):
25 |         return super(MovingAverage, self).forward(data)
--------------------------------------------------------------------------------
/nsf_hifigan/model/nsf_hifigan.py:
--------------------------------------------------------------------------------
1 | 
2 | import itertools
3 | from typing import Any, Dict
4 | import torch
5 | from torch import nn
6 | from torch.nn import functional as F
7 | from torch import optim
8 | import torchaudio
9 | import torchaudio.transforms as T
10 | 
11 | import pytorch_lightning as pl
12 | import torchmetrics
13 | 
14 | from nsf_hifigan.mel_processing import spectrogram_torch_audio
15 | 
16 | from .discriminators.multi_scale_discriminator import MultiScaleDiscriminator
17 | from .discriminators.multi_period_discriminator import MultiPeriodDiscriminator
18 | from .generators.generator import NSFHiFiGANGenerator
19 | 
20 | from .loss import discriminator_loss, kl_loss, feature_loss, generator_loss
21 | from .. import utils
22 | from .commons import slice_segments, rand_slice_segments, sequence_mask, clip_grad_value_
23 | 
24 | from .pipeline import AudioPipeline
25 | 
26 | class NSF_HifiGAN(pl.LightningModule):
27 |     def __init__(self, **kwargs):
28 |         super().__init__()
29 |         self.save_hyperparameters(*[k for k in kwargs])
30 | 
31 |         self.net_g = NSFHiFiGANGenerator(
32 |             in_dim = self.hparams.data.n_mel_channels,
33 |             out_dim = 1,
34 |             upsampling_rate = self.hparams.data.hop_length,
35 |             sampling_rate = self.hparams.data.sampling_rate,
36 |         )
37 |         self.net_period_d = MultiPeriodDiscriminator(
38 |             periods=self.hparams.model.multi_period_discriminator_periods,
39 |             use_spectral_norm=self.hparams.model.use_spectral_norm
40 |         )
41 |         self.net_scale_d = MultiScaleDiscriminator(use_spectral_norm=self.hparams.model.use_spectral_norm)
42 | 
43 |         self.audio_pipeline = AudioPipeline(freq=self.hparams.data.sampling_rate,
44 |                                             n_fft=self.hparams.data.filter_length,
45 |                                             n_mel=self.hparams.data.n_mel_channels,
46 |                                             win_length=self.hparams.data.win_length,
47 |                                             hop_length=self.hparams.data.hop_length)
48 |         for param in self.audio_pipeline.parameters():
49 |             param.requires_grad = False
50 | 
51 |         # metrics
52 |         self.valid_spec_loss = torchmetrics.MeanMetric()
53 | 
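    # A note on the slicing arithmetic in training_step below (illustrative numbers;
    # the real values come from configs/48k.json): with, say, segment_size=16384 and
    # hop_length=512, mel/pitch are sliced at frame granularity,
    #     segment_size // hop_length = 16384 // 512 = 32 frames,
    # while the target waveform is sliced at sample granularity starting at
    #     ids_slice * hop_length,
    # so both slices cover exactly the same stretch of audio.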
54 |     def training_step(self, batch: Dict[str, torch.Tensor], batch_idx: int, optimizer_idx: int):
55 |         x_wav, x_wav_lengths = batch["x_wav_values"], batch["x_wav_lengths"]
56 |         x_pitch, x_pitch_lengths = batch["x_pitch_values"], batch["x_pitch_lengths"]
57 |         y_wav, y_wav_lengths = batch["y_wav_values"], batch["y_wav_lengths"]
58 | 
59 |         with torch.inference_mode():
60 |             x_mel = self.audio_pipeline(x_wav.squeeze(1), aug=True)
61 |             x_mel_lengths = (x_wav_lengths / self.hparams.data.hop_length).long()
62 | 
63 |         x_mel, ids_slice = rand_slice_segments(x_mel, x_mel_lengths, self.hparams.train.segment_size // self.hparams.data.hop_length)
64 |         x_pitch = slice_segments(x_pitch.unsqueeze(1), ids_slice, self.hparams.train.segment_size // self.hparams.data.hop_length).squeeze(1) # slice pitch at frame granularity
65 |         y_wav = slice_segments(y_wav, ids_slice * self.hparams.data.hop_length, self.hparams.train.segment_size) # slice target waveform at sample granularity
66 | 
67 |         y_spec = spectrogram_torch_audio(
68 |             y_wav.squeeze(1).float(),
69 |             self.hparams.data.filter_length,
70 |             self.hparams.data.sampling_rate,
71 |             self.hparams.data.hop_length,
72 |             self.hparams.data.win_length,
73 |             False
74 |         )
75 | 
76 |         # generator forward
77 |         y_hat = self.net_g(x_mel.transpose(1,2), x_pitch).transpose(1,2)
78 | 
79 |         y_spec_hat = spectrogram_torch_audio(
80 |             y_hat.squeeze(1).float(),
81 |             self.hparams.data.filter_length,
82 |             self.hparams.data.sampling_rate,
83 |             self.hparams.data.hop_length,
84 |             self.hparams.data.win_length,
85 |             False
86 |         )
87 | 
88 |         # Discriminator step (optimizer_idx 0 is optim_d, the first optimizer returned by configure_optimizers)
89 |         if optimizer_idx == 0:
90 |             # MPD
91 |             y_dp_hat_r, y_dp_hat_g, _, _ = self.net_period_d(y_wav, y_hat.detach())
92 |             loss_disc_p, losses_disc_p_r, losses_disc_p_g = discriminator_loss(y_dp_hat_r, y_dp_hat_g)
93 | 
94 |             # MSD
95 |             y_ds_hat_r, y_ds_hat_g, _, _ = self.net_scale_d(y_wav, y_hat.detach())
96 |             loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)
97 | 
98 |             loss_disc_all = loss_disc_p + loss_disc_s
99 | 
100 |             # log
101 |             lr = self.optim_g.param_groups[0]['lr']
102 |             scalar_dict = {"train/d/loss_total": loss_disc_all, "learning_rate": lr}
103 |             scalar_dict.update({"train/d_p_r/{}".format(i): v for i, v in enumerate(losses_disc_p_r)})
104 |             scalar_dict.update({"train/d_p_g/{}".format(i): v for i, v in enumerate(losses_disc_p_g)})
105 |             scalar_dict.update({"train/d_s_r/{}".format(i): v for i, v in enumerate(losses_disc_s_r)})
106 |             scalar_dict.update({"train/d_s_g/{}".format(i): v for i, v in enumerate(losses_disc_s_g)})
107 | 
108 |             image_dict = {}
109 | 
110 |             tensorboard = self.logger.experiment
111 | 
112 |             utils.summarize(
113 |                 writer=tensorboard,
114 |                 global_step=self.global_step,
115 |                 images=image_dict,
116 |                 scalars=scalar_dict)
117 | 
118 |             return loss_disc_all
119 | 
120 |         # Generator step (optimizer_idx 1 is optim_g)
121 |         if optimizer_idx == 1:
122 |             y_dp_hat_r, y_dp_hat_g, fmap_p_r, fmap_p_g = self.net_period_d(y_wav, y_hat)
123 |             loss_p_fm = feature_loss(fmap_p_r, fmap_p_g)
124 |             loss_p_gen, losses_p_gen = generator_loss(y_dp_hat_g)
125 | 
126 |             y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = self.net_scale_d(y_wav, y_hat)
127 |             loss_s_fm = feature_loss(fmap_s_r, fmap_s_g)
128 |             loss_s_gen, losses_s_gen = generator_loss(y_ds_hat_g)
129 | 
130 |             # spectrogram reconstruction loss (L1 between linear spectrograms, weighted by c_spec)
131 |             loss_spec = F.l1_loss(y_spec_hat, y_spec) * self.hparams.train.c_spec
132 | 
133 |             loss_gen_all = (loss_s_gen + loss_s_fm) + (loss_p_gen + loss_p_fm) + loss_spec
134 | 
135 |             # Logging to TensorBoard by default
136 |             lr = self.optim_g.param_groups[0]['lr']
137 |             scalar_dict = {"train/g/loss_total": loss_gen_all, "learning_rate": lr}
138 |             scalar_dict.update({
139 |                 "train/g/p_fm": loss_p_fm,
140 |                 "train/g/s_fm": loss_s_fm,
141 |                 "train/g/p_gen": loss_p_gen,
142 |                 "train/g/s_gen": loss_s_gen,
143 |                 "train/g/loss_spec": loss_spec,
144 |             })
145 | 
146 |             scalar_dict.update({"train/g/p_gen_{}".format(i): v for i, v in enumerate(losses_p_gen)})
147 |             scalar_dict.update({"train/g/s_gen_{}".format(i): v for i, v in enumerate(losses_s_gen)})
148 | 
149 |             image_dict = {}
150 | 
151 |             tensorboard = self.logger.experiment
152 |             utils.summarize(
153 |                 writer=tensorboard,
154 |                 global_step=self.global_step,
155 |                 images=image_dict,
156 |                 scalars=scalar_dict)
157 |             return loss_gen_all
158 | 
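    # Sanity-check sketch for the LSGAN objectives used above (illustrative only):
    # discriminator_loss drives D(real) -> 1 and D(fake) -> 0, generator_loss
    # drives D(fake) -> 1. With perfect scores both losses vanish:
    #
    #     real, fake = torch.ones(4), torch.zeros(4)
    #     d_loss, _, _ = discriminator_loss([real], [fake])   # tensor(0.)
    #     g_loss, _ = generator_loss([torch.ones(4)])         # tensor(0.) once D(fake) == 1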
batch["x_wav_lengths"] 163 | x_pitch, x_pitch_lengths = batch["x_pitch_values"], batch["x_pitch_lengths"] 164 | y_wav, y_wav_lengths = batch["y_wav_values"], batch["y_wav_lengths"] 165 | 166 | with torch.inference_mode(): 167 | x_mel = self.audio_pipeline(x_wav.squeeze(1), aug=False) 168 | x_mel_lengths = (x_wav_lengths / self.hparams.data.hop_length).long() 169 | 170 | y_spec = spectrogram_torch_audio(y_wav.squeeze(1), 171 | self.hparams.data.filter_length, 172 | self.hparams.data.sampling_rate, 173 | self.hparams.data.hop_length, 174 | self.hparams.data.win_length, center=False) 175 | y_spec_lengths = (y_wav_lengths / self.hparams.data.hop_length).long() 176 | 177 | # remove else 178 | y_wav_hat = self.net_g(x_mel.transpose(1,2), x_pitch).transpose(1,2) 179 | y_hat_lengths = torch.tensor([y_wav_hat.shape[2]], dtype=torch.long) 180 | 181 | y_spec_hat = spectrogram_torch_audio(y_wav_hat.squeeze(1), 182 | self.hparams.data.filter_length, 183 | self.hparams.data.sampling_rate, 184 | self.hparams.data.hop_length, 185 | self.hparams.data.win_length, center=False) 186 | 187 | image_dict = { 188 | "gen/spec": utils.plot_spectrogram_to_numpy(y_spec_hat[0].cpu().numpy()), 189 | "gt/spec": utils.plot_spectrogram_to_numpy(y_spec[0].cpu().numpy()) 190 | } 191 | audio_dict = { 192 | "gen/audio": y_wav_hat[0,:,:y_hat_lengths[0]].squeeze(0).float(), 193 | "gt/audio": y_wav[0,:,:y_wav_lengths[0]].squeeze(0).float() 194 | } 195 | 196 | spec_mask = torch.unsqueeze(sequence_mask(x_mel_lengths.long(), y_spec.size(2)), 1).to(y_spec.dtype) 197 | 198 | # metrics compute 199 | y_spec_masked = y_spec * spec_mask 200 | y_spec_masked_hat = y_spec_hat * spec_mask 201 | valid_spec_loss_step = F.l1_loss(y_spec_masked_hat, y_spec_masked) 202 | self.valid_spec_loss.update(valid_spec_loss_step.item()) 203 | self.log("valid/loss_spec_step", valid_spec_loss_step.item(), sync_dist=True) 204 | 205 | # logging 206 | tensorboard = self.logger.experiment 207 | utils.summarize( 208 | writer=tensorboard, 209 | global_step=self.global_step, 210 | images=image_dict, 211 | audios=audio_dict, 212 | audio_sampling_rate=self.hparams.data.sampling_rate, 213 | ) 214 | 215 | def validation_epoch_end(self, outputs) -> None: 216 | self.net_g.eval() 217 | valid_spec_loss_epoch = self.valid_spec_loss.compute() 218 | self.log("valid/loss_spec_epoch", valid_spec_loss_epoch.item(), sync_dist=True) 219 | self.valid_spec_loss.reset() 220 | 221 | def configure_optimizers(self): 222 | self.optim_g = torch.optim.AdamW( 223 | self.net_g.parameters(), 224 | self.hparams.train.generator_learning_rate, 225 | betas=self.hparams.train.betas, 226 | eps=self.hparams.train.eps) 227 | self.optim_d = torch.optim.AdamW( 228 | itertools.chain(self.net_period_d.parameters(), self.net_scale_d.parameters()), 229 | self.hparams.train.discriminator_learning_rate, 230 | betas=self.hparams.train.betas, 231 | eps=self.hparams.train.eps) 232 | self.scheduler_g = torch.optim.lr_scheduler.ExponentialLR(self.optim_g, gamma=self.hparams.train.lr_decay) 233 | self.scheduler_g.last_epoch = self.current_epoch - 1 234 | self.scheduler_d = torch.optim.lr_scheduler.ExponentialLR(self.optim_d, gamma=self.hparams.train.lr_decay) 235 | self.scheduler_d.last_epoch = self.current_epoch - 1 236 | 237 | return [self.optim_d, self.optim_g], [self.scheduler_d, self.scheduler_g] -------------------------------------------------------------------------------- /nsf_hifigan/model/pipeline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 
--------------------------------------------------------------------------------
/nsf_hifigan/model/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio
3 | import torchaudio.transforms as T
4 | 
5 | 
6 | from torch import nn
7 | from torch.nn import functional as F
8 | from torch import optim
9 | import random
10 | 
11 | import numpy as np
12 | 
13 | class GaussianNoise(torch.nn.Module):
14 |     def __init__(self, min_snr=0.0001, max_snr=0.01):
15 |         """
16 |         :param min_snr: minimum noise-to-signal std ratio (noise std = ratio * signal std)
17 |         :param max_snr: maximum noise-to-signal std ratio
18 |         """
19 |         super().__init__()
20 |         self.min_snr = min_snr
21 |         self.max_snr = max_snr
22 | 
23 |     def forward(self, audio):
24 |         std = torch.std(audio)
25 |         noise_std = random.uniform(self.min_snr * std, self.max_snr * std)
26 | 
27 |         norm_dist = torch.distributions.normal.Normal(0.0, noise_std)
28 |         noise = norm_dist.rsample(audio.shape).type(audio.dtype).to(audio.device)
29 | 
30 |         return audio + noise
31 | 
32 | class AudioPipeline(torch.nn.Module):
33 |     def __init__(
34 |         self,
35 |         freq=16000,
36 |         n_fft=1024,
37 |         n_mel=128,
38 |         win_length=1024,
39 |         hop_length=256
40 |     ):
41 |         super().__init__()
42 | 
43 |         self.freq = freq
44 | 
45 |         pad = int((n_fft-hop_length)/2)
46 |         self.spec = T.Spectrogram(n_fft=n_fft, win_length=win_length, hop_length=hop_length,
47 |                                   pad=pad, power=None, center=False, pad_mode='reflect', normalized=False, onesided=True)
48 | 
49 |         # self.strech = T.TimeStretch(hop_length=hop_length, n_freq=freq)
50 |         self.spec_aug = torch.nn.Sequential(
51 |             GaussianNoise(min_snr=0.0001, max_snr=0.02),
52 |             T.FrequencyMasking(freq_mask_param=80),
53 |             # T.TimeMasking(time_mask_param=80),
54 |         )
55 | 
56 |         self.mel_scale = T.MelScale(n_mels=n_mel, sample_rate=freq, n_stft=n_fft // 2 + 1)
57 | 
58 |     def forward(self, waveform: torch.Tensor, aug: bool=False) -> torch.Tensor:
59 |         shift_waveform = waveform
60 |         # complex STFT (power=None), then take the magnitude
61 |         spec = self.spec(shift_waveform)
62 |         spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
63 |         # Apply SpecAugment-style augmentation
64 |         if aug:
65 |             spec = self.spec_aug(spec)
66 |         # Convert to mel-scale
67 |         mel = self.mel_scale(spec)
68 |         return mel
--------------------------------------------------------------------------------
/nsf_hifigan/pyin.py:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | from numpy.lib.stride_tricks import as_strided
  | import scipy.stats                     # needed by the beta/boltzmann priors below
  | from librosa import sequence, util     # this module is adapted from librosa's pyin; these helpers come from there
  | seq = sequence                         # the code below refers to this module under both names
4 | 
5 | class ParameterError(Exception):
6 |     """Exception class for mal-formed inputs"""
7 |     pass
8 | 
9 | def valid_audio(y, *, mono=True):
10 |     if not isinstance(y, np.ndarray):
11 |         raise TypeError("Audio data must be of type numpy.ndarray")
12 | 
13 |     if not np.issubdtype(y.dtype, np.floating):
14 |         raise TypeError("Audio data must be floating-point")
15 | 
16 |     if y.ndim == 0:
17 |         raise TypeError(
18 |             "Audio data must be at least one-dimensional, given y.shape={}".format(
19 |                 y.shape
20 |             )
21 |         )
22 | 
23 |     if mono and y.ndim != 1:
24 |         raise TypeError(
25 |             "Invalid shape for monophonic audio: "
26 |             "ndim={:d}, shape={}".format(y.ndim, y.shape)
27 |         )
28 | 
29 |     if not np.isfinite(y).all():
30 |         raise TypeError("Audio buffer is not finite everywhere")
31 | 
32 |     return True
33 | 
34 | def frame(x, *, frame_length, hop_length, axis=-1, writeable=False, subok=False):
35 |     # This implementation is derived from numpy.lib.stride_tricks.sliding_window_view (1.20.0)
36 |     # https://numpy.org/doc/stable/reference/generated/numpy.lib.stride_tricks.sliding_window_view.html
37 | 
38 |     x = np.array(x, copy=False, subok=subok)
39 | 
40 |     if x.shape[axis] < frame_length:
41 |         raise ParameterError(
42 |             "Input is too short (n={:d})"
" for frame_length={:d}".format(x.shape[axis], frame_length) 44 | ) 45 | 46 | if hop_length < 1: 47 | raise ParameterError("Invalid hop_length: {:d}".format(hop_length)) 48 | 49 | # put our new within-frame axis at the end for now 50 | out_strides = x.strides + tuple([x.strides[axis]]) 51 | 52 | # Reduce the shape on the framing axis 53 | x_shape_trimmed = list(x.shape) 54 | x_shape_trimmed[axis] -= frame_length - 1 55 | 56 | out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) 57 | xw = as_strided( 58 | x, strides=out_strides, shape=out_shape, subok=subok, writeable=writeable 59 | ) 60 | 61 | if axis < 0: 62 | target_axis = axis - 1 63 | else: 64 | target_axis = axis + 1 65 | 66 | xw = np.moveaxis(xw, -1, target_axis) 67 | 68 | # Downsample along the target axis 69 | slices = [slice(None)] * xw.ndim 70 | slices[axis] = slice(0, None, hop_length) 71 | return xw[tuple(slices)] 72 | 73 | def pyin(y, *, fmin, fmax, sr=22050, frame_length=2048, win_length=None, hop_length=None, 74 | n_thresholds=100, beta_parameters=(2, 18), boltzmann_parameter=2, resolution=0.1, 75 | max_transition_rate=35.92, switch_prob=0.01, no_trough_prob=0.01, fill_na=np.nan, 76 | center=True, pad_mode="constant" 77 | ): 78 | 79 | if fmin is None or fmax is None: 80 | raise ParameterError('both "fmin" and "fmax" must be provided') 81 | 82 | # Set the default window length if it is not already specified. 83 | if win_length is None: 84 | win_length = frame_length // 2 85 | 86 | if win_length >= frame_length: 87 | raise ParameterError( 88 | "win_length={} cannot exceed given frame_length={}".format( 89 | win_length, frame_length 90 | ) 91 | ) 92 | 93 | # Set the default hop if it is not already specified. 94 | if hop_length is None: 95 | hop_length = frame_length // 4 96 | 97 | # Check that audio is valid. 98 | valid_audio(y, mono=False) 99 | 100 | # Pad the time series so that frames are centered 101 | if center: 102 | padding = [(0, 0) for _ in y.shape] 103 | padding[-1] = (frame_length // 2, frame_length // 2) 104 | y = np.pad(y, padding, mode=pad_mode) 105 | 106 | # Frame audio. 107 | y_frames = frame(y, frame_length=frame_length, hop_length=hop_length) 108 | 109 | # Calculate minimum and maximum periods 110 | min_period = max(int(np.floor(sr / fmax)), 1) 111 | max_period = min(int(np.ceil(sr / fmin)), frame_length - win_length - 1) 112 | 113 | # Calculate cumulative mean normalized difference function. 114 | yin_frames = _cumulative_mean_normalized_difference( 115 | y_frames, frame_length, win_length, min_period, max_period 116 | ) 117 | 118 | # Parabolic interpolation. 119 | parabolic_shifts = _parabolic_interpolation(yin_frames) 120 | 121 | # Find Yin candidates and probabilities. 122 | # The implementation here follows the official pYIN software which 123 | # differs from the method described in the paper. 124 | # 1. Define the prior over the thresholds. 
125 | thresholds = np.linspace(0, 1, n_thresholds + 1) 126 | beta_cdf = scipy.stats.beta.cdf(thresholds, beta_parameters[0], beta_parameters[1]) 127 | beta_probs = np.diff(beta_cdf) 128 | 129 | n_bins_per_semitone = int(np.ceil(1.0 / resolution)) 130 | n_pitch_bins = int(np.floor(12 * n_bins_per_semitone * np.log2(fmax / fmin))) + 1 131 | 132 | def _helper(a, b): 133 | return __pyin_helper( 134 | a, 135 | b, 136 | sr, 137 | thresholds, 138 | boltzmann_parameter, 139 | beta_probs, 140 | no_trough_prob, 141 | min_period, 142 | fmin, 143 | n_pitch_bins, 144 | n_bins_per_semitone, 145 | ) 146 | 147 | helper = np.vectorize(_helper, signature="(f,t),(k,t)->(1,d,t),(j,t)") 148 | observation_probs, voiced_prob = helper(yin_frames, parabolic_shifts) 149 | 150 | # Construct transition matrix. 151 | max_semitones_per_frame = round(max_transition_rate * 12 * hop_length / sr) 152 | transition_width = max_semitones_per_frame * n_bins_per_semitone + 1 153 | # Construct the within voicing transition probabilities 154 | transition = sequence.transition_local( 155 | n_pitch_bins, transition_width, window="triangle", wrap=False 156 | ) 157 | 158 | # Include across voicing transition probabilities 159 | t_switch = sequence.transition_loop(2, 1 - switch_prob) 160 | transition = np.kron(t_switch, transition) 161 | 162 | p_init = np.zeros(2 * n_pitch_bins) 163 | p_init[n_pitch_bins:] = 1 / n_pitch_bins 164 | 165 | states = seq.viterbi(observation_probs, transition, p_init=p_init) 166 | 167 | # Find f0 corresponding to each decoded pitch bin. 168 | freqs = fmin * 2 ** (np.arange(n_pitch_bins) / (12 * n_bins_per_semitone)) 169 | f0 = freqs[states % n_pitch_bins] 170 | voiced_flag = states < n_pitch_bins 171 | 172 | if fill_na is not None: 173 | f0[~voiced_flag] = fill_na 174 | 175 | return f0[..., 0, :], voiced_flag[..., 0, :], voiced_prob[..., 0, :] 176 | 177 | 178 | def __pyin_helper( 179 | yin_frames, 180 | parabolic_shifts, 181 | sr, 182 | thresholds, 183 | boltzmann_parameter, 184 | beta_probs, 185 | no_trough_prob, 186 | min_period, 187 | fmin, 188 | n_pitch_bins, 189 | n_bins_per_semitone, 190 | ): 191 | 192 | yin_probs = np.zeros_like(yin_frames) 193 | 194 | for i, yin_frame in enumerate(yin_frames.T): 195 | # 2. For each frame find the troughs. 196 | is_trough = util.localmin(yin_frame) 197 | 198 | is_trough[0] = yin_frame[0] < yin_frame[1] 199 | (trough_index,) = np.nonzero(is_trough) 200 | 201 | if len(trough_index) == 0: 202 | continue 203 | 204 | # 3. Find the troughs below each threshold. 205 | # these are the local minima of the frame, could get them directly without the trough index 206 | trough_heights = yin_frame[trough_index] 207 | trough_thresholds = np.less.outer(trough_heights, thresholds[1:]) 208 | 209 | # 4. Define the prior over the troughs. 210 | # Smaller periods are weighted more. 211 | trough_positions = np.cumsum(trough_thresholds, axis=0) - 1 212 | n_troughs = np.count_nonzero(trough_thresholds, axis=0) 213 | 214 | trough_prior = scipy.stats.boltzmann.pmf( 215 | trough_positions, boltzmann_parameter, n_troughs 216 | ) 217 | 218 | trough_prior[~trough_thresholds] = 0 219 | 220 | # 5. For each threshold add probability to global minimum if no trough is below threshold, 221 | # else add probability to each trough below threshold biased by prior. 
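        # (Shape note: trough_prior is (n_troughs, n_thresholds) and beta_probs is
        # (n_thresholds,), so the dot product below marginalizes over thresholds,
        # leaving one probability per trough.)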
222 | 
223 |         probs = trough_prior.dot(beta_probs)
224 | 
225 |         global_min = np.argmin(trough_heights)
226 |         n_thresholds_below_min = np.count_nonzero(~trough_thresholds[global_min, :])
227 |         probs[global_min] += no_trough_prob * np.sum(
228 |             beta_probs[:n_thresholds_below_min]
229 |         )
230 | 
231 |         yin_probs[trough_index, i] = probs
232 | 
233 |     yin_period, frame_index = np.nonzero(yin_probs)
234 | 
235 |     # Refine peak by parabolic interpolation.
236 |     period_candidates = min_period + yin_period
237 |     period_candidates = period_candidates + parabolic_shifts[yin_period, frame_index]
238 |     f0_candidates = sr / period_candidates
239 | 
240 |     # Find pitch bin corresponding to each f0 candidate.
241 |     bin_index = 12 * n_bins_per_semitone * np.log2(f0_candidates / fmin)
242 |     bin_index = np.clip(np.round(bin_index), 0, n_pitch_bins).astype(int)
243 | 
244 |     # Observation probabilities.
245 |     observation_probs = np.zeros((2 * n_pitch_bins, yin_frames.shape[1]))
246 |     observation_probs[bin_index, frame_index] = yin_probs[yin_period, frame_index]
247 | 
248 |     voiced_prob = np.clip(
249 |         np.sum(observation_probs[:n_pitch_bins, :], axis=0, keepdims=True), 0, 1
250 |     )
251 |     observation_probs[n_pitch_bins:, :] = (1 - voiced_prob) / n_pitch_bins
252 | 
253 |     return observation_probs[np.newaxis], voiced_prob
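# Usage sketch for this pyin port (illustrative values; the interface mirrors
# librosa.pyin, from which this module is adapted):
#
#     import numpy as np
#     from nsf_hifigan.pyin import pyin
#
#     sr = 48000
#     y = np.random.randn(sr).astype(np.float32)   # stand-in for 1 s of audio
#     f0, voiced_flag, voiced_prob = pyin(y, fmin=65.0, fmax=1000.0, sr=sr, fill_na=0.0)
#     # f0: per-frame F0 in Hz, 0.0 on unvoiced frames (via fill_na)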
--------------------------------------------------------------------------------
/nsf_hifigan/utils.py:
--------------------------------------------------------------------------------
1 | 
2 | import logging
3 | import sys
4 | import torch
5 | import torchaudio
6 | from typing import Any, Dict, List, Tuple
7 | 
8 | 
9 | logging.basicConfig(stream=sys.stdout, level=logging.INFO)
10 | logger = logging
11 | 
12 | def load_filepaths(filename: str) -> List[str]:
13 |     with open(filename, encoding='utf-8') as f:
14 |         filepaths = [line.rstrip() for line in f]
15 |     return filepaths
16 | 
17 | def load_wav_to_torch(full_path: str) -> Tuple[torch.FloatTensor, int]:
18 |     data, sampling_rate = torchaudio.load(full_path)
19 |     if len(data.shape) >= 2:
20 |         data = torch.mean(data, dim=0)
21 |     return data, sampling_rate
22 | 
23 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
24 |     for k, v in scalars.items():
25 |         writer.add_scalar(k, v, global_step)
26 |     for k, v in histograms.items():
27 |         writer.add_histogram(k, v, global_step)
28 |     for k, v in images.items():
29 |         writer.add_image(k, v, global_step, dataformats='HWC')
30 |     for k, v in audios.items():
31 |         writer.add_audio(k, v, global_step, audio_sampling_rate)
32 | 
33 | MATPLOTLIB_FLAG = False
34 | def plot_spectrogram_to_numpy(spectrogram):
35 |     global MATPLOTLIB_FLAG
36 |     if not MATPLOTLIB_FLAG:
37 |         import matplotlib
38 |         matplotlib.use("Agg")
39 |         MATPLOTLIB_FLAG = True
40 |         mpl_logger = logging.getLogger('matplotlib')
41 |         mpl_logger.setLevel(logging.WARNING)
42 |     import matplotlib.pylab as plt
43 |     import numpy as np
44 | 
45 |     fig, ax = plt.subplots(figsize=(10, 2))
46 |     im = ax.imshow(spectrogram, aspect="auto", origin="lower",
47 |                    interpolation='none')
48 |     plt.colorbar(im, ax=ax)
49 |     plt.xlabel("Frames")
50 |     plt.ylabel("Channels")
51 |     plt.tight_layout()
52 | 
53 |     fig.canvas.draw()
54 |     data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated for binary buffers
55 |     data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
56 |     plt.close()
57 |     return data
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/requirements.txt
--------------------------------------------------------------------------------
/split.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import random
3 | import os
4 | 
5 | if __name__ == "__main__":
6 |     parser = argparse.ArgumentParser()
7 |     parser.add_argument('-i', '--input', type=str, default="./filelists/48k_audio_filelist.txt", help='filelist path')
8 |     parser.add_argument('-o', '--output', type=str, default="./filelists", help='File list output path')
9 |     args = parser.parse_args()
10 | 
11 |     random.seed(1234)
12 | 
13 |     with open(args.input, "r", encoding="utf-8") as f:
14 |         lines = f.readlines()
15 | 
16 |     lines = sorted(lines)  # sort first so the seeded shuffle is deterministic across runs
17 |     random.shuffle(lines)
18 | 
19 |     origin_filename = os.path.basename(args.input)
20 |     data_len = len(lines)
21 | 
22 |     valid_num = max(1, int(data_len * 0.001))  # guard against 0, which would break the negative slices below
23 |     test_num = max(1, int(data_len * 0.001))
24 | 
25 |     with open(os.path.join(args.output, origin_filename.replace(".txt", "_train.txt")), "w", encoding="utf-8") as f:
26 |         f.writelines(lines[:-valid_num-test_num])
27 | 
28 |     with open(os.path.join(args.output, origin_filename.replace(".txt", "_valid.txt")), "w", encoding="utf-8") as f:
29 |         f.writelines(lines[-valid_num-test_num:-test_num])
30 | 
31 |     with open(os.path.join(args.output, origin_filename.replace(".txt", "_test.txt")), "w", encoding="utf-8") as f:
32 |         f.writelines(lines[-test_num:])
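A quick usage sketch for split.py (paths are the script's own defaults):

    python split.py -i ./filelists/48k_audio_filelist.txt -o ./filelists

For example, a 10,000-line filelist yields valid_num = test_num = max(1, int(10000 * 0.001)) = 10,
so 9,980 lines go to 48k_audio_filelist_train.txt and 10 each to _valid.txt and _test.txt.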
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | 
2 | import os
3 | import json
4 | import glob
5 | import argparse
6 | from typing import Optional
7 | import torch
8 | import torchaudio
9 | import tqdm
10 | from torch import nn, optim
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader
13 | from nsf_hifigan.model.nsf_hifigan import NSF_HifiGAN
14 | 
15 | from nsf_hifigan.data.collate import MelCollate
16 | 
17 | import pytorch_lightning as pl
18 | from pytorch_lightning.callbacks import ModelCheckpoint
19 | from pytorch_lightning.callbacks.early_stopping import EarlyStopping
20 | from pytorch_lightning.profiler import SimpleProfiler, AdvancedProfiler
21 | 
22 | from nsf_hifigan.hparams import HParams
23 | from nsf_hifigan.data.dataset import MelDataset
24 | 
25 | def get_hparams(config_path: str) -> HParams:
26 |     with open(config_path, "r") as f:
27 |         data = f.read()
28 |     config = json.loads(data)
29 | 
30 |     hparams = HParams(**config)
31 |     return hparams
32 | 
33 | def last_checkpoint(path: str) -> Optional[str]:
34 |     ckpt_path = None
35 |     if os.path.exists(os.path.join(path, "lightning_logs")):
36 |         versions = glob.glob(os.path.join(path, "lightning_logs", "version_*"))
37 |         if len(list(versions)) > 0:
38 |             last_ver = sorted(list(versions), key=lambda p: int(p.split("_")[-1]))[-1]
39 |             last_ckpt = os.path.join(last_ver, "checkpoints/last.ckpt")
40 |             if os.path.exists(last_ckpt):
41 |                 ckpt_path = last_ckpt
42 |     return ckpt_path
43 | 
44 | def get_train_params(args, hparams):
45 |     devices = [int(n.strip()) for n in args.device.split(",")]
46 | 
47 |     checkpoint_callback = ModelCheckpoint(
48 |         dirpath=None, save_last=True, every_n_train_steps=2000, save_weights_only=False,
49 |         monitor="valid/loss_spec_epoch", mode="min", save_top_k=5  # must match the metric name logged in validation_epoch_end
50 |     )
51 |     earlystop_callback = EarlyStopping(monitor="valid/loss_spec_epoch", mode="min", patience=13)
52 | 
53 |     trainer_params = {
54 |         "accelerator": args.accelerator,
55 |         "callbacks": [checkpoint_callback, earlystop_callback],
56 |     }
57 | 
58 |     if args.accelerator != "cpu":
59 |         trainer_params["devices"] = devices
60 | 
61 |     if len(devices) > 1:
62 |         trainer_params["strategy"] = "ddp"
63 | 
64 |     trainer_params.update(hparams.trainer)
65 | 
66 |     if hparams.train.fp16_run:
67 |         trainer_params["amp_backend"] = "native"
68 |         trainer_params["precision"] = 16
69 | 
70 |     trainer_params["num_nodes"] = args.num_nodes
71 | 
72 |     return trainer_params
73 | 
74 | def main():
75 |     parser = argparse.ArgumentParser()
76 |     parser.add_argument('-c', '--config', type=str, default="./configs/48k.json", help='JSON file for configuration')
77 |     parser.add_argument('-a', '--accelerator', type=str, default="gpu", help='training device')
78 |     parser.add_argument('-d', '--device', type=str, default="0", help='training device ids')
79 |     parser.add_argument('-n', '--num-nodes', type=int, default=1, help='training node number')
80 |     args = parser.parse_args()
81 | 
82 |     hparams = get_hparams(args.config)
83 |     pl.utilities.seed.seed_everything(hparams.train.seed)
84 | 
85 |     devices = [int(n.strip()) for n in args.device.split(",")]
86 | 
87 |     # data
88 |     train_dataset = MelDataset(hparams.data.training_files, hparams.data)
89 |     valid_dataset = MelDataset(hparams.data.validation_files, hparams.data)
90 | 
91 |     collate_fn = MelCollate()
92 | 
93 |     trainer_params = get_train_params(args, hparams)
94 |     if "strategy" in trainer_params and trainer_params["strategy"] == "ddp":
95 |         batch_per_gpu = hparams.train.batch_size // len(devices)
96 |     else:
97 |         batch_per_gpu = hparams.train.batch_size
98 | 
99 |     train_loader = DataLoader(train_dataset, batch_size=batch_per_gpu, num_workers=8, shuffle=True, pin_memory=True, collate_fn=collate_fn)
100 |     valid_loader = DataLoader(valid_dataset, batch_size=4, num_workers=4, shuffle=False, pin_memory=True, collate_fn=collate_fn)
101 | 
102 |     # model
103 |     model = NSF_HifiGAN(**hparams)
104 | 
105 |     # profiler = AdvancedProfiler(filename="profile.txt")
106 |     trainer = pl.Trainer(**trainer_params)  # , profiler=profiler, max_steps=200
107 |     # resume training
108 |     ckpt_path = last_checkpoint(hparams.trainer.default_root_dir)
109 |     trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=valid_loader, ckpt_path=ckpt_path)
110 | 
111 | if __name__ == "__main__":
112 |     main()
113 | 
--------------------------------------------------------------------------------
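A typical invocation with the shipped config (flags as defined in main() above):

    python train.py -c ./configs/48k.json -a gpu -d 0        # single GPU
    python train.py -c ./configs/48k.json -a gpu -d 0,1      # two GPUs; len(devices) > 1 enables DDP

Training resumes automatically from the newest lightning_logs/version_*/checkpoints/last.ckpt
under hparams.trainer.default_root_dir, if one exists.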