├── .gitignore
├── LICENSE
├── README.md
├── configs
│   └── 48k.json
├── filelist.py
├── filelists
│   └── .gitignore
├── nsf_hifigan
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── collate.py
│   │   └── dataset.py
│   ├── hparams.py
│   ├── mel_processing.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── commons.py
│   │   ├── discriminators
│   │   │   ├── __init__.py
│   │   │   ├── discriminator.py
│   │   │   ├── multi_period_discriminator.py
│   │   │   └── multi_scale_discriminator.py
│   │   ├── generators
│   │   │   ├── __init__.py
│   │   │   ├── cond_module.py
│   │   │   ├── filter_module.py
│   │   │   ├── generator.py
│   │   │   └── source_module.py
│   │   ├── loss.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── conv1_keep_length.py
│   │   │   └── moving_average.py
│   │   ├── nsf_hifigan.py
│   │   └── pipeline.py
│   ├── pyin.py
│   └── utils.py
├── requirements.txt
├── split.py
└── train.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | /logs_*/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Vtuber Plan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NSF-HiFiGAN 2 | This project is a modified implementation of NSF-HiFiGAN models. 3 | For more information on NSF models, please visit https://nii-yamagishilab.github.io/samples-nsf/ 4 | 5 | The original repository is [NSF-HIFIGAN](https://github.com/PlayVoice/NSF-HIFIGAN).
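As a quick orientation before the source files below, here is a minimal inference sketch built around the `NSFHiFiGANGenerator` class from `nsf_hifigan/model/generators/generator.py`. The dimensions are taken from `configs/48k.json`; checkpoint loading and the real feature-extraction pipeline live in `train.py`/`pipeline.py`, which are not shown in this snapshot, so treat this as a sketch under those assumptions rather than a documented entry point.

```python
import torch
from nsf_hifigan.model.generators.generator import NSFHiFiGANGenerator

# Assumptions: 128-dim frame-level conditioning features (n_mel_channels in
# configs/48k.json), one output waveform channel, and an upsampling_rate equal
# to the 512-sample hop_length of the 48 kHz config.
gen = NSFHiFiGANGenerator(in_dim=128, out_dim=1, upsampling_rate=512, sampling_rate=48000)
gen.eval()

feat = torch.randn(1, 100, 128)    # (batch, frames, in_dim) acoustic features
f0 = torch.full((1, 100), 220.0)   # (batch, frames) frame-level F0 in Hz, 0 = unvoiced
with torch.no_grad():
    wav = gen(feat, f0)            # (batch, frames * upsampling_rate, 1)
```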
-------------------------------------------------------------------------------- /configs/48k.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "max_epochs": 20000, 4 | "accumulate_grad_batches": 2, 5 | "default_root_dir": "./logs_48k", 6 | "val_check_interval": 1.0, 7 | "gradient_clip_val": 0.5 8 | }, 9 | "train": { 10 | "log_interval": 200, 11 | "eval_interval": 1000, 12 | "seed": 1234, 13 | "max_epochs": 20000, 14 | "generator_learning_rate": 0.0002, 15 | "discriminator_learning_rate": 0.0002, 16 | "betas": [ 17 | 0.8, 18 | 0.99 19 | ], 20 | "eps": 1e-9, 21 | "batch_size": 32, 22 | "fp16_run": true, 23 | "lr_decay": 0.999875, 24 | "segment_size": 16384, 25 | "init_lr_ratio": 1, 26 | "warmup_epochs": 0, 27 | "c_mel": 0, 28 | "c_spec": 45 29 | }, 30 | "data": { 31 | "training_files": "filelists/48k_audio_filelist_train.txt", 32 | "validation_files": "filelists/48k_audio_filelist_valid.txt", 33 | "sampling_rate": 48000, 34 | "filter_length": 2048, 35 | "hop_length": 512, 36 | "win_length": 2048, 37 | "n_mel_channels": 128, 38 | "mel_fmin": 0.0, 39 | "mel_fmax": null, 40 | "num_pitch": 512 41 | }, 42 | "model": { 43 | "use_spectral_norm": false, 44 | "multi_period_discriminator_periods": [2,3,5,7,11,13,17,19,23,37] 45 | } 46 | } -------------------------------------------------------------------------------- /filelist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import tqdm 5 | import soundfile as sf 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i', '--input', type=str, default="./dataset", help='Dataset path') 10 | parser.add_argument('-o', '--output', type=str, default="./filelists/48k_audio_filelist.txt", help='File list output path') 11 | parser.add_argument('-s', '--sr', type=int, default=48000, help='File target sample rate') 12 | args = parser.parse_args() 13 | 14 | if not os.path.exists(os.path.dirname(args.output)): 15 | os.makedirs(os.path.dirname(args.output), exist_ok=True) 16 | 17 | audio_files = list(glob.glob(os.path.join(args.input, "**/*.wav"), recursive=True)) 18 | 19 | target_sr = args.sr 20 | total_time = 0 21 | with open(args.output, "w", encoding="utf-8") as f: 22 | for i, audio_path in enumerate(tqdm.tqdm(audio_files)): 23 | audio = sf.SoundFile(audio_path) 24 | sec = audio.frames / audio.samplerate 25 | if audio.frames / audio.samplerate * target_sr < 16384 * 1.2: 26 | continue 27 | audio_path = audio_path.replace("\\", "/") 28 | f.write(f"{audio_path}\n") 29 | total_time += sec 30 | 31 | print(f"Total time: {total_time//3600}h") -------------------------------------------------------------------------------- /filelists/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | -------------------------------------------------------------------------------- /nsf_hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/data/__init__.py 
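The config above maps one-to-one onto the `HParams` container defined in `nsf_hifigan/hparams.py` (shown later in this listing). A minimal sketch of loading it follows; the actual entry point in `train.py` is not part of this excerpt.

```python
import json
from nsf_hifigan.hparams import HParams

# HParams recursively wraps nested dicts, so each JSON section becomes an
# attribute-accessible object.
with open("configs/48k.json", "r", encoding="utf-8") as f:
    hparams = HParams(**json.load(f))

print(hparams.data.sampling_rate)                        # 48000
print(hparams.train.batch_size)                          # 32
print(hparams.model.multi_period_discriminator_periods)  # [2, 3, 5, ...]
```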
-------------------------------------------------------------------------------- /nsf_hifigan/data/collate.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | class MelCollate(): 5 | def __init__(self, return_ids: bool = False): 6 | self.return_ids = return_ids 7 | 8 | def __call__(self, batch): 9 | # Sort the batch by waveform length (descending) and right-zero-pad every field to the batch maximum 10 | _, ids_sorted_decreasing = torch.sort( 11 | torch.LongTensor([x["wav"].size(1) for x in batch]), 12 | dim=0, descending=True) 13 | 14 | max_x_wav_len = max([x["wav"].size(1) for x in batch]) 15 | max_x_pitch_len = max([x["pitch"].size(1) for x in batch]) 16 | max_y_wav_len = max([x["wav"].size(1) for x in batch]) 17 | 18 | x_wav_lengths = torch.LongTensor(len(batch)) 19 | x_pitch_lengths = torch.LongTensor(len(batch)) 20 | y_wav_lengths = torch.LongTensor(len(batch)) 21 | 22 | x_wav_padded = torch.zeros(len(batch), 1, max_x_wav_len, dtype=torch.float32) 23 | x_pitch_padded = torch.zeros(len(batch), max_x_pitch_len, dtype=torch.float32) 24 | y_wav_padded = torch.zeros(len(batch), 1, max_y_wav_len, dtype=torch.float32) 25 | 26 | for i in range(len(ids_sorted_decreasing)): 27 | row = batch[ids_sorted_decreasing[i]] 28 | 29 | wav = row["wav"] 30 | x_wav_padded[i, :, :wav.size(1)] = wav 31 | x_wav_lengths[i] = wav.size(1) 32 | 33 | pitch = row["pitch"] 34 | x_pitch_padded[i, :pitch.size(1)] = pitch 35 | x_pitch_lengths[i] = pitch.size(1) 36 | 37 | wav = row["wav"] 38 | y_wav_padded[i, :, :wav.size(1)] = wav 39 | y_wav_lengths[i] = wav.size(1) 40 | 41 | ret = { 42 | "x_wav_values": x_wav_padded, 43 | "x_wav_lengths": x_wav_lengths, 44 | "x_pitch_values": x_pitch_padded, 45 | "x_pitch_lengths": x_pitch_lengths, 46 | "y_wav_values": y_wav_padded, 47 | "y_wav_lengths": y_wav_lengths, 48 | } 49 | 50 | if self.return_ids: 51 | ret["ids_sorted_decreasing"] = ids_sorted_decreasing 52 | return ret 53 | 54 | -------------------------------------------------------------------------------- /nsf_hifigan/data/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import Optional 4 | 5 | import torch 6 | import torchaudio 7 | 8 | import numpy as np 9 | import librosa 10 | from librosa import pyin 11 | 12 | from ..utils import load_filepaths, load_wav_to_torch 13 | 14 | resamplers = {} 15 | 16 | def load_audio(filename: str, sr: Optional[int] = None): 17 | global resamplers 18 | audio, sampling_rate = load_wav_to_torch(filename) 19 | 20 | if sr is not None and sampling_rate != sr: 21 | # rates don't match, so resample 22 | if (sampling_rate, sr) in resamplers: 23 | resampler = resamplers[(sampling_rate, sr)] 24 | else: 25 | resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=sr) 26 | resamplers[(sampling_rate, sr)] = resampler 27 | audio = resampler(audio) 28 | sampling_rate = sr 29 | # raise ValueError("{} {} SR doesn't match target {} SR".format(sampling_rate, self.sampling_rate)) 30 | return audio 31 | 32 | def normalize_pitch(pitch, mean, std): 33 | zeros = (pitch == 0.0) 34 | pitch -= mean[:, None] 35 | pitch /= std[:, None] 36 | pitch[zeros] = 0.0 37 | return pitch 38 | 39 | def estimate_pitch(audio: np.ndarray, sr: int, n_fft: int, win_length: int, hop_length: int, 40 | method='pyin', normalize_mean=None, normalize_std=None, n_formants=1): 41 | if type(normalize_mean) is float or type(normalize_mean) is list: 42 | normalize_mean = torch.tensor(normalize_mean) 43 | 44 | if type(normalize_std) is float or
type(normalize_std) is list: 45 | normalize_std = torch.tensor(normalize_std) 46 | 47 | if method == 'pyin': 48 | snd, sr = audio, sr 49 | pad_size = int((n_fft-hop_length)/2) 50 | snd = np.pad(snd, (pad_size, pad_size), mode='reflect') 51 | 52 | pitch_mel, voiced_flag, voiced_probs = pyin( 53 | snd, 54 | fmin=librosa.note_to_hz('C2'), 55 | fmax=librosa.note_to_hz('C7'), 56 | sr=sr, 57 | frame_length=win_length, 58 | hop_length=hop_length, 59 | center=False, 60 | pad_mode='reflect') 61 | # assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0 62 | 63 | pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel) 64 | pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0) 65 | # pitch_mel = F.pad(pitch_mel, (0, mel_len - pitch_mel.size(1))) 66 | 67 | if n_formants > 1: 68 | raise NotImplementedError 69 | else: 70 | raise ValueError 71 | 72 | pitch_mel = pitch_mel.float() 73 | 74 | if normalize_mean is not None: 75 | assert normalize_std is not None 76 | pitch_mel = normalize_pitch(pitch_mel, normalize_mean, normalize_std) 77 | 78 | return pitch_mel 79 | 80 | def get_pitch(audio: str, sr: int, filter_length: int, win_length: int, hop_length: int): 81 | pitch_mel = estimate_pitch( 82 | audio=audio, sr=sr, n_fft=filter_length, 83 | win_length=win_length, hop_length=hop_length, method='pyin', 84 | normalize_mean=None, normalize_std=None, n_formants=1) 85 | 86 | return pitch_mel 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, audiopaths: str, hparams): 90 | self.audiopaths = load_filepaths(audiopaths) 91 | self.hparams = hparams 92 | self.sampling_rate = hparams.sampling_rate 93 | self.filter_length = hparams.filter_length 94 | self.hop_length = hparams.hop_length 95 | self.win_length = hparams.win_length 96 | self.mel_fmin = hparams.mel_fmin 97 | self.mel_fmax = hparams.mel_fmax 98 | self.n_mel_channels = hparams.n_mel_channels 99 | 100 | self.resamplers = {} 101 | 102 | random.seed(1234) 103 | random.shuffle(self.audiopaths) 104 | 105 | def get_item(self, index: int): 106 | audio_path = self.audiopaths[index] 107 | 108 | audio_wav = load_audio(audio_path, sr=self.sampling_rate) 109 | 110 | audio_pitch = get_pitch( 111 | audio_wav.numpy(), 112 | self.sampling_rate, 113 | self.hparams.filter_length, 114 | self.hparams.win_length, 115 | self.hparams.hop_length 116 | ) 117 | 118 | return { 119 | "wav": audio_wav.unsqueeze(0), 120 | "pitch": audio_pitch, 121 | } 122 | 123 | def __getitem__(self, index): 124 | ret = self.get_item(index) 125 | return ret 126 | 127 | def __len__(self): 128 | return len(self.audiopaths) 129 | -------------------------------------------------------------------------------- /nsf_hifigan/hparams.py: -------------------------------------------------------------------------------- 1 | 2 | class HParams(): 3 | def __init__(self, **kwargs): 4 | for k, v in kwargs.items(): 5 | if type(v) == dict: 6 | v = HParams(**v) 7 | self[k] = v 8 | 9 | def keys(self): 10 | return self.__dict__.keys() 11 | 12 | def items(self): 13 | return self.__dict__.items() 14 | 15 | def values(self): 16 | return self.__dict__.values() 17 | 18 | def __len__(self): 19 | return len(self.__dict__) 20 | 21 | def __getitem__(self, key): 22 | return getattr(self, key) 23 | 24 | def __setitem__(self, key, value): 25 | return setattr(self, key, value) 26 | 27 | def __contains__(self, key): 28 | return key in self.__dict__ 29 | 30 | def __repr__(self): 31 | return self.__dict__.__repr__() 32 | -------------------------------------------------------------------------------- 
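With `MelDataset`, `MelCollate`, and `HParams` defined, the data pipeline wires together as below; this is a sketch assuming `train.py` (not part of this excerpt) does something equivalent. Note that `MelDataset` runs pyin pitch estimation inside `__getitem__`, so `num_workers` has a large effect on throughput.

```python
import json
from torch.utils.data import DataLoader
from nsf_hifigan.hparams import HParams
from nsf_hifigan.data.dataset import MelDataset
from nsf_hifigan.data.collate import MelCollate

with open("configs/48k.json", "r", encoding="utf-8") as f:
    hparams = HParams(**json.load(f))

# Filelist produced by filelist.py (assumed to be split into the train/valid
# lists named in the config, presumably by split.py).
dataset = MelDataset(hparams.data.training_files, hparams.data)
loader = DataLoader(dataset,
                    batch_size=hparams.train.batch_size,
                    shuffle=True,
                    num_workers=4,
                    collate_fn=MelCollate())

batch = next(iter(loader))
# batch["x_wav_values"]:   (B, 1, T_max) zero-padded waveforms
# batch["x_pitch_values"]: (B, F_max) frame-level pyin F0, 0.0 where unvoiced or padded
```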
/nsf_hifigan/mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | import torchaudio 16 | 17 | import logging 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool=False): 52 | if torch.min(y) < -1.: 53 | logging.warning(f'min value is {torch.min(y).detach().cpu().item()}') 54 | if torch.max(y) > 1.: 55 | logging.warning(f'max value is {torch.max(y).detach().cpu().item()}') 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + '_' + str(y.device) 59 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 68 | 69 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 70 | 71 | return spec 72 | 73 | def spectrogram_torch_audio(y, n_fft: int, sampling_rate: int, hop_size: int, win_size: int, center: bool=False): 74 | if torch.min(y) < -1.: 75 | logging.warning(f'min value is {torch.min(y).detach().cpu().item()}') 76 | if torch.max(y) > 1.: 77 | logging.warning(f'max value is {torch.max(y).detach().cpu().item()}') 78 | 79 | global hann_window 80 | dtype_device = str(y.dtype) + '_' + str(y.device) 81 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 82 | if wnsize_dtype_device not in hann_window: 83 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 84 | 85 | pad = int((n_fft-hop_size)/2) 86 | 87 | spec = torchaudio.functional.spectrogram(y, pad, hann_window[wnsize_dtype_device], 88 | n_fft, hop_size, win_size, None, 89 | center=center, pad_mode='reflect', normalized=False, onesided=True) 90 | 91 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 92 | 93 | return spec 94 | 95 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 96 | global mel_basis 97 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 98 | fmax_dtype_device = str(fmax) + '_' + dtype_device 99 
| if fmax_dtype_device not in mel_basis: 100 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 101 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 102 | 103 | if len(spec.shape) == 3: 104 | mel_matrix = mel_basis[fmax_dtype_device].unsqueeze(0) 105 | else: 106 | mel_matrix = mel_basis[fmax_dtype_device] 107 | spec = torch.matmul(mel_matrix, spec) 108 | spec = spectral_normalize_torch(spec) 109 | return spec 110 | 111 | 112 | def mel_spectrogram_torch(y, n_fft: int, num_mels: int, sampling_rate: int, hop_size: int, win_size: int, fmin: int, fmax: int, center: bool=False): 113 | if torch.min(y) < -1.: 114 | logging.warning(f'min value is {torch.min(y).detach().cpu().item()}') 115 | if torch.max(y) > 1.: 116 | logging.warning(f'max value is {torch.max(y).detach().cpu().item()}') 117 | 118 | global mel_basis, hann_window 119 | dtype_device = str(y.dtype) + '_' + str(y.device) 120 | fmax_dtype_device = str(fmax) + '_' + dtype_device 121 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 122 | if fmax_dtype_device not in mel_basis: 123 | mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) 124 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 125 | if wnsize_dtype_device not in hann_window: 126 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 127 | 128 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 129 | y = y.squeeze(1) 130 | 131 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 132 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) 133 | 134 | spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) 135 | 136 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 137 | spec = spectral_normalize_torch(spec) 138 | 139 | return spec 140 | -------------------------------------------------------------------------------- /nsf_hifigan/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/model/commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size * dilation - dilation) / 2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def intersperse(lst, item): 25 | result = [item] * (len(lst) * 2 + 1) 26 | result[1::2] = lst 27 | return result 28 | 29 | 30 | def kl_divergence(m_p, logs_p, m_q, logs_q): 31 | """KL(P||Q)""" 32 | kl = (logs_q - logs_p) - 0.5 33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. 
* logs_q) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b], device=x_lengths.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d( 68 | length, channels, min_timescale=1.0, max_timescale=1.0e4): 69 | position = torch.arange(length, dtype=torch.float) 70 | num_timescales = channels // 2 71 | log_timescale_increment = ( 72 | math.log(float(max_timescale) / float(min_timescale)) / 73 | (num_timescales - 1)) 74 | inv_timescales = min_timescale * torch.exp( 75 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = 
cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2,3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1. / norm_type) 161 | return total_norm 162 | -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/discriminators/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/discriminator.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | LRELU_SLOPE = 0.1 7 | 8 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 9 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 10 | from ..commons import get_padding 11 | 12 | class DiscriminatorP(torch.nn.Module): 13 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 14 | super(DiscriminatorP, self).__init__() 15 | self.period = period 16 | self.use_spectral_norm = use_spectral_norm 17 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 18 | self.convs = nn.ModuleList([ 19 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 20 | norm_f(Conv2d(32, 64, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 21 | norm_f(Conv2d(64, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 22 | norm_f(Conv2d(128, 256, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 23 | norm_f(Conv2d(256, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 24 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), 25 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), 26 | ]) 27 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 28 | 29 | def forward(self, x): 30 | fmap = [] 31 | 32 | # 1d to 2d 33 | b, c, t = x.shape 34 | if t % self.period != 0: # pad first 35 | n_pad = self.period - (t % self.period) 36 | x = F.pad(x, (0, n_pad), "reflect") 37 | t = t + n_pad 38 | x = x.view(b, c, t // self.period, self.period) 39 | 40 | for l in self.convs: 41 | x = l(x) 42 | x = F.leaky_relu(x, LRELU_SLOPE) 43 | fmap.append(x) 44 | x = self.conv_post(x) 45 | fmap.append(x) 46 | x = torch.flatten(x, 1, -1) 47 | 48 | return x, fmap 49 | 50 | 51 | class DiscriminatorS(torch.nn.Module): 52 | def __init__(self, 
use_spectral_norm=False): 53 | super(DiscriminatorS, self).__init__() 54 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 55 | self.convs = nn.ModuleList([ 56 | norm_f(Conv1d(1, 16, 15, 1, padding=7)), 57 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), 58 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), 59 | norm_f(Conv1d(256, 512, 41, 4, groups=64, padding=20)), 60 | norm_f(Conv1d(512, 1024, 41, 4, groups=256, padding=20)), 61 | norm_f(Conv1d(1024, 1024, 41, 4, groups=512, padding=20)), 62 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 63 | ]) 64 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 65 | 66 | def forward(self, x): 67 | fmap = [] 68 | 69 | for l in self.convs: 70 | x = l(x) 71 | x = F.leaky_relu(x, LRELU_SLOPE) 72 | fmap.append(x) 73 | x = self.conv_post(x) 74 | fmap.append(x) 75 | x = torch.flatten(x, 1, -1) 76 | 77 | return x, fmap 78 | 79 | class DiscriminatorSpec(torch.nn.Module): 80 | def __init__(self, n_fft: int=1024, kernel_size:int=5, stride=3, use_spectral_norm=False): 81 | super(DiscriminatorSpec, self).__init__() 82 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 83 | in_channel = n_fft // 2 + 1 84 | self.convs = nn.ModuleList([ 85 | norm_f(Conv1d(in_channel, 2048, 5, 1, padding=2)), 86 | norm_f(Conv1d(2048, 4096, 41, 4, groups=4, padding=20)), 87 | norm_f(Conv1d(4096, 4096, 5, 1, padding=2)), 88 | ]) 89 | self.conv_post = norm_f(Conv1d(4096, 1, 3, 1, padding=1)) 90 | 91 | def forward(self, x): 92 | fmap = [] 93 | 94 | for l in self.convs: 95 | x = l(x) 96 | x = F.leaky_relu(x, LRELU_SLOPE) 97 | fmap.append(x) 98 | x = self.conv_post(x) 99 | fmap.append(x) 100 | x = torch.flatten(x, 1, -1) 101 | 102 | return x, fmap 103 | 104 | -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/multi_period_discriminator.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import List 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | from .discriminator import DiscriminatorP, DiscriminatorS 8 | 9 | class MultiPeriodDiscriminator(torch.nn.Module): 10 | def __init__(self, periods: List[int]=[2, 3, 5, 7, 11, 17, 23, 37], use_spectral_norm: bool=False): 11 | super(MultiPeriodDiscriminator, self).__init__() 12 | self.periods = periods 13 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] 14 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] 15 | self.discriminators = nn.ModuleList(discs) 16 | 17 | def forward(self, y, y_hat, g=None): 18 | y_d_rs = [] 19 | y_d_gs = [] 20 | fmap_rs = [] 21 | fmap_gs = [] 22 | for i, d in enumerate(self.discriminators): 23 | y_d_r, fmap_r = d(y) 24 | y_d_g, fmap_g = d(y_hat) 25 | y_d_rs.append(y_d_r) 26 | y_d_gs.append(y_d_g) 27 | fmap_rs.append(fmap_r) 28 | fmap_gs.append(fmap_g) 29 | 30 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 31 | 32 | -------------------------------------------------------------------------------- /nsf_hifigan/model/discriminators/multi_scale_discriminator.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch.nn import AvgPool1d 6 | 7 | from .discriminator import DiscriminatorP, DiscriminatorS 8 | 9 | 10 | class MultiScaleDiscriminator(torch.nn.Module): 11 | def __init__(self, use_spectral_norm=False): 12 | 
super(MultiScaleDiscriminator, self).__init__() 13 | self.discriminators = nn.ModuleList([ 14 | DiscriminatorS(use_spectral_norm=use_spectral_norm), 15 | DiscriminatorS(), 16 | DiscriminatorS(), 17 | DiscriminatorS(), 18 | DiscriminatorS(), 19 | ]) 20 | self.meanpools = nn.ModuleList([ 21 | AvgPool1d(kernel_size=4, stride=2, padding=2), 22 | AvgPool1d(kernel_size=4, stride=2, padding=2), 23 | AvgPool1d(kernel_size=4, stride=2, padding=2), 24 | AvgPool1d(kernel_size=4, stride=2, padding=2) 25 | ]) 26 | 27 | def forward(self, y, y_hat): 28 | y_d_rs = [] 29 | y_d_gs = [] 30 | fmap_rs = [] 31 | fmap_gs = [] 32 | for i, d in enumerate(self.discriminators): 33 | if i != 0: 34 | y = self.meanpools[i-1](y) 35 | y_hat = self.meanpools[i-1](y_hat) 36 | y_d_r, fmap_r = d(y) 37 | y_d_g, fmap_g = d(y_hat) 38 | y_d_rs.append(y_d_r) 39 | fmap_rs.append(fmap_r) 40 | y_d_gs.append(y_d_g) 41 | fmap_gs.append(fmap_g) 42 | 43 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/generators/__init__.py -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/cond_module.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from nsf_hifigan.model.modules.conv1_keep_length import Conv1dKeepLength 7 | 8 | from ..modules.moving_average import MovingAverage 9 | 10 | class UpSampleLayer(nn.Module): 11 | """ Wrapper over up-sampling 12 | Input tensor: (batchsize=1, length, dim) 13 | Ouput tensor: (batchsize=1, length * up-sampling_factor, dim) 14 | """ 15 | def __init__(self, feature_dim, up_sampling_factor, smoothing=False): 16 | super(UpSampleLayer, self).__init__() 17 | # wrap a up_sampling layer 18 | self.scale_factor = up_sampling_factor 19 | self.l_upsamp = nn.Upsample(scale_factor=self.scale_factor) 20 | if smoothing: 21 | self.l_ave1 = MovingAverage(feature_dim, self.scale_factor) 22 | self.l_ave2 = MovingAverage(feature_dim, self.scale_factor) 23 | else: 24 | self.l_ave1 = nn.Identity() 25 | self.l_ave2 = nn.Identity() 26 | return 27 | 28 | def forward(self, x): 29 | # permute to (batchsize=1, dim, length) 30 | up_sampled_data = self.l_upsamp(x.permute(0, 2, 1)) 31 | 32 | # permute it backt to (batchsize=1, length, dim) 33 | # and do two moving average 34 | return self.l_ave1(self.l_ave2(up_sampled_data.permute(0, 2, 1))) 35 | 36 | 37 | class CondModuleHnSincNSF(nn.Module): 38 | """ Condition module for hn-sinc-NSF 39 | 40 | Upsample and transform input features 41 | CondModuleHnSincNSF(input_dimension, output_dimension, up_sample_rate, 42 | blstm_dimension = 64, cnn_kernel_size = 3) 43 | 44 | Spec, F0, cut_off_freq = CondModuleHnSincNSF(features, F0) 45 | 46 | Both input features should be frame-level features 47 | If x doesn't contain F0, just ignore the returned F0 48 | 49 | CondModuleHnSincNSF(input_dim, output_dim, up_sample, 50 | blstm_s = 64, cnn_kernel_s = 3, 51 | voiced_threshold = 0): 52 | 53 | input_dim: sum of dimensions of input features 54 | output_dim: dim of the feature Spec to be used by neural filter-block 55 | up_sample: up sampling rate of input features 56 | blstm_s: dimension of the features from blstm 
(default 64) 57 | cnn_kernel_s: kernel size of CNN in condition module (default 3) 58 | voiced_threshold: f0 > voiced_threshold is voiced, otherwise unvoiced 59 | """ 60 | def __init__(self, input_dim, output_dim, up_sample, \ 61 | blstm_s = 64, cnn_kernel_s = 3, voiced_threshold = 0): 62 | super(CondModuleHnSincNSF, self).__init__() 63 | 64 | # input feature dimension 65 | self.input_dim = input_dim 66 | self.output_dim = output_dim 67 | self.up_sample = up_sample 68 | self.blstm_s = blstm_s 69 | self.cnn_kernel_s = cnn_kernel_s 70 | self.cut_f_smooth = up_sample * 4 71 | self.voiced_threshold = voiced_threshold 72 | 73 | # the blstm layer 74 | self.l_blstm = nn.LSTM(input_dim, self.blstm_s // 2, bidirectional=True, batch_first=True) 75 | 76 | # the CNN layer (+1 dim for cut_off_frequence of sinc filter) 77 | self.l_conv1d = Conv1dKeepLength(self.blstm_s, self.output_dim, dilation_s = 1, kernel_s = self.cnn_kernel_s) 78 | # Upsampling layer for hidden features 79 | self.l_upsamp = UpSampleLayer(self.output_dim, self.up_sample, True) 80 | # separate layer for up-sampling normalized F0 values 81 | self.l_upsamp_f0_hi = UpSampleLayer(1, self.up_sample, True) 82 | 83 | # Upsampling for F0: don't smooth up-sampled F0 84 | self.l_upsamp_F0 = UpSampleLayer(1, self.up_sample, False) 85 | 86 | # Another smoothing layer to smooth the cut-off frequency 87 | # for sinc filters. Use a larger window to smooth 88 | self.l_cut_f_smooth = MovingAverage(1, self.cut_f_smooth) 89 | 90 | def get_cut_f(self, hidden_feat, f0): 91 | """ cut_f = get_cut_f(self, feature, f0) 92 | feature: (batchsize, length, dim=1) 93 | f0: (batchsize, length, dim=1) 94 | """ 95 | # generate uv signal 96 | uv = torch.ones_like(f0) * (f0 > self.voiced_threshold) 97 | # hidden_feat is between (-1, 1) after conv1d with tanh 98 | # (-0.2, 0.2) + 0.3 = (0.1, 0.5) 99 | # voiced: (0.1, 0.5) + 0.4 = (0.5, 0.9) 100 | # unvoiced: (0.1, 0.5) = (0.1, 0.5) 101 | return hidden_feat * 0.2 + uv * 0.4 + 0.3 102 | 103 | 104 | def forward(self, feature, f0): 105 | """ spec, f0 = forward(self, feature, f0) 106 | feature: (batchsize, length, dim) 107 | f0: (batchsize, length, dim=1), which should be F0 at frame-level 108 | 109 | spec: (batchsize, length, self.output_dim), at wave-level 110 | f0: (batchsize, length, 1), at wave-level 111 | """ 112 | feature_h, feature_c = self.l_blstm(feature) 113 | tmp = self.l_upsamp(self.l_conv1d(feature_h)) 114 | tmp_f0 = self.l_upsamp_f0_hi(f0) 115 | # concatenat normed F0 with hidden spectral features 116 | context = torch.cat((tmp[:, :, :-1], tmp_f0), dim=2) 117 | 118 | # hidden feature for cut-off frequency 119 | hidden_cut_f = tmp[:, :, self.output_dim-1:] 120 | 121 | # directly up-sample F0 without smoothing 122 | f0_upsamp = self.l_upsamp_F0(f0) 123 | 124 | # get the cut-off-frequency from output of CNN 125 | cut_f = self.get_cut_f(hidden_cut_f, f0_upsamp) 126 | # smooth the cut-off-frequency using fixed average smoothing 127 | cut_f_smoothed = self.l_cut_f_smooth(cut_f) 128 | 129 | # return 130 | return context, f0_upsamp, cut_f_smoothed, hidden_cut_f 131 | -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/filter_module.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import numpy as np 7 | 8 | from nsf_hifigan.model.modules.conv1_keep_length import Conv1dKeepLength 9 | 10 | class TimeVarFIRFilter(nn.Module): 11 | """ 
TimeVarFIRFilter 12 | Given sequences of filter coefficients and a signal, do filtering 13 | 14 | Filter coefs: (batchsize=1, signal_length, filter_order = K) 15 | Signal: (batchsize=1, signal_length, 1) 16 | 17 | For batch 0: 18 | For n in [1, sequence_length): 19 | output(0, n, 1) = \sum_{k=1}^{K} signal(0, n-k, 1)*coef(0, n, k) 20 | 21 | Note: filter coef (0, n, :) is only used to compute the output 22 | at (0, n, 1) 23 | """ 24 | def __init__(self): 25 | super(TimeVarFIRFilter, self).__init__() 26 | 27 | def forward(self, signal, f_coef): 28 | """ 29 | Filter coefs: (batchsize=1, signal_length, filter_order = K) 30 | Signal: (batchsize=1, signal_length, 1) 31 | 32 | Output: (batchsize=1, signal_length, 1) 33 | 34 | For n in [1, sequence_length): 35 | output(0, n, 1)= \sum_{k=1}^{K} signal(0, n-k, 1)*coef(0, n, k) 36 | 37 | This method may be not efficient: 38 | 39 | Suppose signal [x_1, ..., x_N], filter [a_1, ..., a_K] 40 | output [y_1, y_2, y_3, ..., y_N, *, * ... *] 41 | = a_1 * [x_1, x_2, x_3, ..., x_N, 0, ..., 0] 42 | + a_2 * [ 0, x_1, x_2, x_3, ..., x_N, 0, ..., 0] 43 | + a_3 * [ 0, 0, x_1, x_2, x_3, ..., x_N, 0, ..., 0] 44 | """ 45 | signal_l = signal.shape[1] 46 | order_k = f_coef.shape[-1] 47 | 48 | # pad to (batchsize=1, signal_length + filter_order-1, dim) 49 | padded_signal = F.pad(signal, (0, 0, 0, order_k - 1)) 50 | 51 | y = torch.zeros_like(signal) 52 | # roll and weighted sum, only take [0:signal_length] 53 | for k in range(order_k): 54 | y += torch.roll(padded_signal, k, dims=1)[:, 0:signal_l, :] \ 55 | * f_coef[:, :, k:k+1] 56 | # done 57 | return y 58 | 59 | 60 | class SincFilter(nn.Module): 61 | """ SincFilter 62 | Given the cut-off-frequency, produce the low-pass and high-pass 63 | windowed-sinc-filters. 64 | If input cut-off-frequency is (batchsize=1, signal_length, 1), 65 | output filter coef is (batchsize=1, signal_length, filter_order). 66 | For each time step in [1, signal_length), we calculate one 67 | filter for low-pass sinc filter and another for high-pass filter. 68 | 69 | Example: 70 | import scipy 71 | import scipy.signal 72 | import numpy as np 73 | 74 | filter_order = 31 75 | cut_f = 0.2 76 | sinc_layer = SincFilter(filter_order) 77 | lp_coef, hp_coef = sinc_layer(torch.ones(1, 10, 1) * cut_f) 78 | 79 | w, h1 = scipy.signal.freqz(lp_coef[0, 0, :].numpy(), [1]) 80 | w, h2 = scipy.signal.freqz(hp_coef[0, 0, :].numpy(), [1]) 81 | plt.plot(w, 20*np.log10(np.abs(h1))) 82 | plt.plot(w, 20*np.log10(np.abs(h2))) 83 | plt.plot([cut_f * np.pi, cut_f * np.pi], [-100, 0]) 84 | """ 85 | def __init__(self, filter_order): 86 | super(SincFilter, self).__init__() 87 | # Make the filter oder an odd number 88 | # [-(M-1)/2, ... 0, (M-1)/2] 89 | # 90 | self.half_k = (filter_order - 1) // 2 91 | self.order = self.half_k * 2 +1 92 | 93 | def hamming_w(self, n_index): 94 | """ prepare hamming window for each time step 95 | n_index (batchsize=1, signal_length, filter_order) 96 | For each time step, n_index will be [-(M-1)/2, ... 0, (M-1)/2] 97 | n_index[0, 0, :] = [-(M-1)/2, ... 0, (M-1)/2] 98 | n_index[0, 1, :] = [-(M-1)/2, ... 0, (M-1)/2] 99 | ... 100 | output (batchsize=1, signal_length, filter_order) 101 | output[0, 0, :] = hamming_window 102 | output[0, 1, :] = hamming_window 103 | ... 
104 | """ 105 | # Hamming window 106 | return 0.54 + 0.46 * torch.cos(2 * np.pi * n_index / self.order) 107 | 108 | def sinc(self, x): 109 | """ Normalized sinc-filter sin( pi * x) / pi * x 110 | https://en.wikipedia.org/wiki/Sinc_function 111 | 112 | Assume x (batchsize, signal_length, filter_order) and 113 | x[0, 0, :] = [-half_order, - half_order+1, ... 0, ..., half_order] 114 | x[:, :, self.half_order] -> time index = 0, sinc(0)=1 115 | """ 116 | y = torch.zeros_like(x) 117 | y[:,:,0:self.half_k]=torch.sin(np.pi * x[:, :, 0:self.half_k]) / (np.pi * x[:, :, 0:self.half_k]) 118 | y[:,:,self.half_k+1:]=torch.sin(np.pi * x[:, :, self.half_k+1:]) / (np.pi * x[:, :, self.half_k+1:]) 119 | y[:,:,self.half_k] = 1 120 | return y 121 | 122 | def forward(self, cut_f): 123 | """ lp_coef, hp_coef = forward(self, cut_f) 124 | cut-off frequency cut_f (batchsize=1, length, dim = 1) 125 | 126 | lp_coef: low-pass filter coefs (batchsize, length, filter_order) 127 | hp_coef: high-pass filter coefs (batchsize, length, filter_order) 128 | """ 129 | # create the filter order index 130 | with torch.no_grad(): 131 | # [- (M-1) / 2, ..., 0, ..., (M-1)/2] 132 | lp_coef = torch.arange(-self.half_k, self.half_k + 1, 133 | device=cut_f.device) 134 | # [[[- (M-1) / 2, ..., 0, ..., (M-1)/2], 135 | # [- (M-1) / 2, ..., 0, ..., (M-1)/2], 136 | # ... 137 | # ], 138 | # [[- (M-1) / 2, ..., 0, ..., (M-1)/2], 139 | # [- (M-1) / 2, ..., 0, ..., (M-1)/2], 140 | # ... 141 | # ]] 142 | lp_coef = lp_coef.repeat(cut_f.shape[0], cut_f.shape[1], 1) 143 | 144 | hp_coef = torch.arange(-self.half_k, self.half_k + 1, 145 | device=cut_f.device) 146 | hp_coef = hp_coef.repeat(cut_f.shape[0], cut_f.shape[1], 1) 147 | 148 | # temporary buffer of [-1^n] for gain norm in hp_coef 149 | tmp_one = torch.pow(-1, hp_coef) 150 | 151 | # unnormalized filter coefs with hamming window 152 | lp_coef = cut_f * self.sinc(cut_f * lp_coef) * self.hamming_w(lp_coef) 153 | 154 | hp_coef = (self.sinc(hp_coef) \ 155 | - cut_f * self.sinc(cut_f * hp_coef)) \ 156 | * self.hamming_w(hp_coef) 157 | 158 | # normalize the coef to make gain at 0/pi is 0 dB 159 | # sum_n lp_coef[n] 160 | lp_coef_norm = torch.sum(lp_coef, axis=2).unsqueeze(-1) 161 | # sum_n hp_coef[n] * -1^n 162 | hp_coef_norm = torch.sum(hp_coef * tmp_one, axis=2).unsqueeze(-1) 163 | 164 | lp_coef = lp_coef / lp_coef_norm 165 | hp_coef = hp_coef / hp_coef_norm 166 | 167 | # return normed coef 168 | return lp_coef, hp_coef 169 | 170 | class NeuralFilterBlock(nn.Module): 171 | """ Wrapper over a single filter block 172 | """ 173 | def __init__(self, signal_size, hidden_size, kernel_size=3, conv_num=10): 174 | super(NeuralFilterBlock, self).__init__() 175 | self.signal_size = signal_size 176 | self.hidden_size = hidden_size 177 | self.kernel_size = kernel_size 178 | self.conv_num = conv_num 179 | self.dilation_s = [np.power(2, x) for x in np.arange(conv_num)] 180 | 181 | # ff layer to expand dimension 182 | self.l_ff_1 = nn.Linear(signal_size, hidden_size, bias=False) 183 | self.l_ff_1_tanh = nn.Tanh() 184 | 185 | # dilated conv layers 186 | tmp = [Conv1dKeepLength(hidden_size, hidden_size, x, kernel_size, causal=True, bias=False) \ 187 | for x in self.dilation_s] 188 | self.l_convs = nn.ModuleList(tmp) 189 | 190 | # ff layer to de-expand dimension 191 | self.l_ff_2 = nn.Linear(hidden_size, hidden_size//4, bias=False) 192 | self.l_ff_2_tanh = nn.Tanh() 193 | self.l_ff_3 = nn.Linear(hidden_size//4, signal_size, bias=False) 194 | self.l_ff_3_tanh = nn.Tanh() 195 | 196 | # a simple scale 197 | self.scale 
= nn.Parameter(torch.tensor([1/len(self.l_convs)]), requires_grad=False) 198 | return 199 | 200 | def forward(self, signal, context): 201 | """ 202 | Assume: signal (batchsize=1, length, signal_size) 203 | context (batchsize=1, length, hidden_size) 204 | Output: (batchsize=1, length, signal_size) 205 | """ 206 | # expand dimension 207 | tmp_hidden = self.l_ff_1_tanh(self.l_ff_1(signal)) 208 | 209 | # loop over dilated convs 210 | # output of a d-conv is input + context + d-conv(input) 211 | for l_conv in self.l_convs: 212 | tmp_hidden = tmp_hidden + l_conv(tmp_hidden) + context 213 | 214 | # to be consistent with legacy configuration in CURRENNT 215 | tmp_hidden = tmp_hidden * self.scale 216 | 217 | # compress the dimesion and skip-add 218 | tmp_hidden = self.l_ff_2_tanh(self.l_ff_2(tmp_hidden)) 219 | tmp_hidden = self.l_ff_3_tanh(self.l_ff_3(tmp_hidden)) 220 | output_signal = tmp_hidden + signal 221 | 222 | return output_signal 223 | 224 | 225 | class FilterModuleHnSincNSF(nn.Module): 226 | """ Filter for Hn-sinc-NSF 227 | FilterModuleHnSincNSF(signal_size, hidden_size, sinc_order = 31, 228 | block_num = 5, kernel_size = 3, 229 | conv_num_in_block = 10) 230 | signal_size: signal dimension (should be 1) 231 | hidden_size: dimension of hidden features inside neural filter block 232 | sinc_order: order of the sinc filter 233 | block_num: number of neural filter blocks in harmonic branch 234 | kernel_size: kernel size in dilated CNN 235 | conv_num_in_block: number of d-conv1d in one neural filter block 236 | 237 | Usage: 238 | output = FilterModuleHnSincNSF(har_source, noi_source, cut_f, context) 239 | har_source: source for harmonic branch (batchsize, length, dim=1) 240 | noi_source: source for noise branch (batchsize, length, dim=1) 241 | cut_f: cut-off-frequency of sinc filters (batchsize, length, dim=1) 242 | context: hidden features to be added (batchsize, length, dim) 243 | output: (batchsize, length, dim=1) 244 | """ 245 | def __init__(self, signal_size, hidden_size, sinc_order = 31, \ 246 | block_num = 5, kernel_size = 3, conv_num_in_block = 10): 247 | super(FilterModuleHnSincNSF, self).__init__() 248 | self.signal_size = signal_size 249 | self.hidden_size = hidden_size 250 | self.kernel_size = kernel_size 251 | self.block_num = block_num 252 | self.conv_num_in_block = conv_num_in_block 253 | self.sinc_order = sinc_order 254 | 255 | # filter blocks for harmonic branch 256 | tmp = [NeuralFilterBlock(signal_size, hidden_size, \ 257 | kernel_size, conv_num_in_block) \ 258 | for x in range(self.block_num)] 259 | self.l_har_blocks = nn.ModuleList(tmp) 260 | 261 | # filter blocks for noise branch (only one block, 5 sub-blocks) 262 | tmp = [NeuralFilterBlock(signal_size, hidden_size, \ 263 | kernel_size, conv_num_in_block // 2) \ 264 | for x in range(1)] 265 | self.l_noi_blocks = nn.ModuleList(tmp) 266 | 267 | # sinc filter generators and time-variant filtering layer 268 | self.l_sinc_coef = SincFilter(self.sinc_order) 269 | self.l_tv_filtering = TimeVarFIRFilter() 270 | 271 | def forward(self, har_component, noi_component, cond_feat, cut_f): 272 | # harmonic component 273 | for l_har_block in self.l_har_blocks: 274 | har_component = l_har_block(har_component, cond_feat) 275 | # noise componebt 276 | for l_noi_block in self.l_noi_blocks: 277 | noi_component = l_noi_block(noi_component, cond_feat) 278 | 279 | # get sinc filter coefficients 280 | lp_coef, hp_coef = self.l_sinc_coef(cut_f) 281 | 282 | # time-variant filtering 283 | har_signal = self.l_tv_filtering(har_component, lp_coef) 284 | 
noi_signal = self.l_tv_filtering(noi_component, hp_coef) 285 | 286 | # get output 287 | return har_signal + noi_signal 288 | -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/generator.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from functools import reduce 4 | import operator 5 | from typing import List, Union 6 | import torch 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 11 | 12 | from nsf_hifigan.model.generators.cond_module import CondModuleHnSincNSF 13 | from nsf_hifigan.model.generators.filter_module import FilterModuleHnSincNSF 14 | from nsf_hifigan.model.generators.source_module import SourceModuleHnNSF 15 | 16 | class NSFHiFiGANGenerator(torch.nn.Module): 17 | """ Model definition 18 | """ 19 | def __init__(self, in_dim: int, out_dim: int, 20 | upsampling_rate: int, 21 | sampling_rate: int, 22 | sine_amp: float=0.1, 23 | noise_std: float=0.003, 24 | hidden_dim: int=64, 25 | cnn_kernel_s: int=3, 26 | filter_block_num: int=5, 27 | cnn_num_in_block: int=10, 28 | harmonic_num: int=7, 29 | sinc_order: int=31 30 | ): 31 | super(NSFHiFiGANGenerator, self).__init__() 32 | 33 | self.input_dim = in_dim 34 | self.output_dim = out_dim 35 | 36 | # configurations 37 | # amplitude of sine waveform (for each harmonic) 38 | self.sine_amp = sine_amp 39 | # standard deviation of Gaussian noise for additive noise 40 | self.noise_std = noise_std 41 | # dimension of hidden features in filter blocks 42 | self.hidden_dim = hidden_dim 43 | # upsampling rate on input acoustic features (16kHz * 5ms = 80) 44 | # assume input_reso has the same value 45 | self.upsampling_rate = upsampling_rate 46 | # sampling rate (Hz) 47 | self.sampling_rate = sampling_rate 48 | # CNN kernel size in filter blocks 49 | self.cnn_kernel_s = cnn_kernel_s 50 | # number of filter blocks (for harmonic branch) 51 | # noise branch only uses 1 block 52 | self.filter_block_num = filter_block_num 53 | # number of dilated CNN in each filter block 54 | self.cnn_num_in_block = cnn_num_in_block 55 | # number of harmonic overtones in source 56 | self.harmonic_num = harmonic_num 57 | # order of sinc-windowed-FIR-filter 58 | self.sinc_order = sinc_order 59 | 60 | # the three modules 61 | self.m_cond = CondModuleHnSincNSF(self.input_dim, self.hidden_dim, self.upsampling_rate, cnn_kernel_s=self.cnn_kernel_s) 62 | 63 | self.m_source = SourceModuleHnNSF(self.sampling_rate, self.harmonic_num, self.sine_amp, self.noise_std) 64 | 65 | self.m_filter = FilterModuleHnSincNSF(self.output_dim, self.hidden_dim, self.sinc_order, self.filter_block_num, \ 66 | self.cnn_kernel_s, self.cnn_num_in_block) 67 | # loss function on spectra 68 | # self.m_aux_loss = LossAuxGen() 69 | 70 | def forward(self, feat, f0): 71 | """ definition of forward method 72 | Assume x (batchsize=1, length, dim) 73 | Return output(batchsize=1, length) 74 | """ 75 | f0 = f0.unsqueeze(2) 76 | # condition module 77 | # feature-to-filter-block, f0-up-sampled, cut-off-f-for-sinc, 78 | # hidden-feature-for-cut-off-f 79 | cond_feat, f0_upsamped, cut_f, hid_cut_f = self.m_cond(feat, f0) 80 | 81 | # source module 82 | # harmonic-source, noise-source (for noise branch), uv 83 | har_source, noi_source, uv = self.m_source(f0_upsamped) 84 | 85 | # neural filter module (including sinc-based FIR filtering) 86 | # output 87 | output = self.m_filter(har_source, noi_source, cond_feat, cut_f) 88 | 89 | 
return output 90 | 91 | def loss_aux(self, nat_wav, gen_tuple, data_in): 92 | return self.m_aux_loss.compute(gen_tuple, nat_wav) 93 | 94 | -------------------------------------------------------------------------------- /nsf_hifigan/model/generators/source_module.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | import numpy as np 7 | 8 | class SineGen(nn.Module): 9 | """ Definition of sine generator 10 | SineGen(samp_rate, harmonic_num = 0, 11 | sine_amp = 0.1, noise_std = 0.003, 12 | voiced_threshold = 0, 13 | flag_for_pulse=False) 14 | 15 | samp_rate: sampling rate in Hz 16 | harmonic_num: number of harmonic overtones (default 0) 17 | sine_amp: amplitude of sine waveform (default 0.1) 18 | noise_std: std of Gaussian noise (default 0.003) 19 | voiced_threshold: F0 threshold for U/V classification (default 0) 20 | flag_for_pulse: this SineGen is used inside PulseGen (default False) 21 | 22 | Note: when flag_for_pulse is True, the first time step of a voiced 23 | segment is always sin(np.pi) or cos(0) 24 | """ 25 | def __init__(self, samp_rate, harmonic_num = 0, 26 | sine_amp = 0.1, noise_std = 0.003, 27 | voiced_threshold = 0, 28 | flag_for_pulse=False): 29 | super(SineGen, self).__init__() 30 | self.sine_amp = sine_amp 31 | self.noise_std = noise_std 32 | self.harmonic_num = harmonic_num 33 | self.dim = self.harmonic_num + 1 34 | self.sampling_rate = samp_rate 35 | self.voiced_threshold = voiced_threshold 36 | self.flag_for_pulse = flag_for_pulse 37 | 38 | def _f02uv(self, f0): 39 | # generate uv signal 40 | uv = torch.ones_like(f0) 41 | uv = uv * (f0 > self.voiced_threshold) 42 | return uv 43 | 44 | def _f02sine(self, f0_values): 45 | """ f0_values: (batchsize, length, dim) 46 | where dim indicates fundamental tone and overtones 47 | """ 48 | # convert to F0 in rad. The integer part n can be ignored 49 | # because 2 * np.pi * n doesn't affect phase 50 | rad_values = (f0_values / self.sampling_rate) % 1 51 | 52 | # initial phase noise (no noise for fundamental component) 53 | rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],\ 54 | device = f0_values.device) 55 | rand_ini[:, 0] = 0 56 | rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini 57 | 58 | # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) 59 | if not self.flag_for_pulse: 60 | # for normal case 61 | 62 | # To prevent torch.cumsum numerical overflow, 63 | # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1. 64 | # Buffer tmp_over_one_idx indicates the time step to add -1.
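# (Without this shift, the cumulative sum grows linearly with time; float32
# keeps fewer fractional bits as the sum grows, so the phase would gradually
# lose precision on long signals.)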
66 |             tmp_over_one = torch.cumsum(rad_values, 1) % 1
67 |             tmp_over_one_idx = (tmp_over_one[:, 1:, :] - 
68 |                                 tmp_over_one[:, :-1, :]) < 0
69 |             cumsum_shift = torch.zeros_like(rad_values)
70 |             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
71 | 
72 |             sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
73 |                               * 2 * np.pi)
74 |         else:
75 |             # If necessary, make sure that the first time step of every
76 |             # voiced segment is sin(pi) or cos(0)
77 |             # This is used for pulse-train generation
78 | 
79 |             # identify the last time step in unvoiced segments
80 |             uv = self._f02uv(f0_values)
81 |             uv_1 = torch.roll(uv, shifts=-1, dims=1)
82 |             uv_1[:, -1, :] = 1
83 |             u_loc = (uv < 1) * (uv_1 > 0)
84 | 
85 |             # get the instantaneous phase
86 |             tmp_cumsum = torch.cumsum(rad_values, dim=1)
87 |             # each batch element needs to be processed separately
88 |             for idx in range(f0_values.shape[0]):
89 |                 temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
90 |                 temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
91 |                 # stores the accumulation of i.phase within
92 |                 # each voiced segment
93 |                 tmp_cumsum[idx, :, :] = 0
94 |                 tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
95 | 
96 |             # rad_values - tmp_cumsum: remove the accumulation of i.phase
97 |             # within the previous voiced segment.
98 |             i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
99 | 
100 |             # get the sines
101 |             sines = torch.cos(i_phase * 2 * np.pi)
102 |         return sines
103 | 
104 | 
105 |     def forward(self, f0):
106 |         """ sine_tensor, uv = forward(f0)
107 |         input F0: tensor(batchsize=1, length, dim=1)
108 |                   f0 for unvoiced steps should be 0
109 |         output sine_tensor: tensor(batchsize=1, length, dim)
110 |         output uv: tensor(batchsize=1, length, 1)
111 |         """
112 |         with torch.no_grad():
113 |             f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, \
114 |                                  device=f0.device)
115 |             # fundamental component
116 |             f0_buf[:, :, 0] = f0[:, :, 0]
117 |             for idx in np.arange(self.harmonic_num):
118 |                 # idx + 2: the (idx+1)-th overtone, i.e. the (idx+2)-th harmonic
119 |                 f0_buf[:, :, idx+1] = f0_buf[:, :, 0] * (idx+2)
120 | 
121 |             # generate sine waveforms
122 |             sine_waves = self._f02sine(f0_buf) * self.sine_amp
123 | 
124 |             # generate uv signal
125 |             #uv = torch.ones(f0.shape)
126 |             #uv = uv * (f0 > self.voiced_threshold)
127 |             uv = self._f02uv(f0)
128 | 
129 |             # noise: for unvoiced frames the noise amplitude should be close to sine_amp
130 |             #        (std = self.sine_amp/3 -> max value ~ self.sine_amp);
131 |             #        the noise std for voiced frames is self.noise_std
132 |             noise_amp = uv * self.noise_std + (1-uv) * self.sine_amp / 3
133 |             noise = noise_amp * torch.randn_like(sine_waves)
134 | 
135 |             # first: set the unvoiced part to 0 by uv
136 |             # then: add the noise
137 |             sine_waves = sine_waves * uv + noise
138 |         return sine_waves, uv, noise
139 | 
140 | 
141 | class SourceModuleHnNSF(nn.Module):
142 |     """ SourceModule for hn-nsf
143 |     SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
144 |                  add_noise_std=0.003, voiced_threshold=0)
145 |     sampling_rate: sampling rate in Hz
146 |     harmonic_num: number of harmonics above F0 (default: 0)
147 |     sine_amp: amplitude of sine source signal (default: 0.1)
148 |     add_noise_std: std of additive Gaussian noise (default: 0.003)
149 |         note that the amplitude of noise in unvoiced regions is decided
150 |         by sine_amp
151 |     voiced_threshold: threshold to set U/V given F0 (default: 0)
152 | 
153 |     Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
154 |     F0_sampled (batchsize, length, 1)
155 |     Sine_source (batchsize, length, 1)
156 |     noise_source (batchsize, length, 1)
157 |     uv (batchsize, length, 1)
158 |     """
159 |     def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
160 |                  add_noise_std=0.003, voiced_threshold=0):
161 |         super(SourceModuleHnNSF, self).__init__()
162 | 
163 |         self.sine_amp = sine_amp
164 |         self.noise_std = add_noise_std
165 | 
166 |         # to produce sine waveforms
167 |         self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
168 |                                  sine_amp, add_noise_std, voiced_threshold)
169 | 
170 |         # to merge source harmonics into a single excitation
171 |         self.l_linear = nn.Linear(harmonic_num+1, 1)
172 |         self.l_tanh = nn.Tanh()
173 | 
174 |     def forward(self, x):
175 |         """
176 |         Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
177 |         F0_sampled (batchsize, length, 1)
178 |         Sine_source (batchsize, length, 1)
179 |         noise_source (batchsize, length, 1); uv (batchsize, length, 1)
180 |         """
181 |         # source for harmonic branch
182 |         sine_wavs, uv, _ = self.l_sin_gen(x)
183 |         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
184 | 
185 |         # source for noise branch, in the same shape as uv
186 |         noise = torch.randn_like(uv) * self.sine_amp / 3
187 |         return sine_merge, noise, uv
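# Usage sketch for the source module (illustrative values; F0 must already be
# upsampled to sample rate, as done by the condition module in generator.py):
#
#     src = SourceModuleHnNSF(sampling_rate=48000, harmonic_num=7)
#     f0 = torch.full((1, 48000, 1), 220.0)   # one second of a 220 Hz tone
#     sine, noise, uv = src(f0)
#     # sine:  (1, 48000, 1) - 8 harmonics merged by the linear layer + tanh
#     # noise: (1, 48000, 1) - Gaussian source for the noise branch
#     # uv:    (1, 48000, 1) - 1.0 where f0 > voiced_threshold, else 0.0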
--------------------------------------------------------------------------------
/nsf_hifigan/model/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 | 
4 | def feature_loss(fmap_r, fmap_g):
5 |     loss = 0
6 |     for dr, dg in zip(fmap_r, fmap_g):
7 |         for rl, gl in zip(dr, dg):
8 |             rl = rl.float().detach()
9 |             gl = gl.float()
10 |             loss += torch.mean(torch.abs(rl - gl))
11 | 
12 |     return loss * 2
13 | 
14 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
15 |     loss = 0
16 |     r_losses = []
17 |     g_losses = []
18 |     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
19 |         dr = dr.float()
20 |         dg = dg.float()
21 |         r_loss = torch.mean((1 - dr) ** 2)
22 |         g_loss = torch.mean(dg ** 2)
23 |         loss += (r_loss + g_loss)
24 |         r_losses.append(r_loss.item())
25 |         g_losses.append(g_loss.item())
26 | 
27 |     return loss, r_losses, g_losses
28 | 
29 | def generator_loss(disc_outputs):
30 |     loss = 0
31 |     gen_losses = []
32 |     for dg in disc_outputs:
33 |         dg = dg.float()
34 |         l = torch.mean((1-dg)**2)
35 |         gen_losses.append(l)
36 |         loss += l
37 | 
38 |     return loss, gen_losses
39 | 
40 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
41 |     """
42 |     z_p, logs_q: [b, h, t_t]
43 |     m_p, logs_p: [b, h, t_t]
44 |     """
45 |     z_p = z_p.float()
46 |     logs_q = logs_q.float()
47 |     m_p = m_p.float()
48 |     logs_p = logs_p.float()
49 |     z_mask = z_mask.float()
50 | 
51 |     kl = logs_p - logs_q - 0.5
52 |     kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
53 |     kl = torch.sum(kl * z_mask)
54 |     l = kl / torch.sum(z_mask)
55 |     return l
56 | 
--------------------------------------------------------------------------------
/nsf_hifigan/model/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/nsf_hifigan/model/modules/__init__.py
--------------------------------------------------------------------------------
/nsf_hifigan/model/modules/conv1_keep_length.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 | 
5 | class Conv1dKeepLength(nn.Conv1d):
6 |     """ Wrapper around nn.Conv1d that keeps the input length (optionally causal)
7 |     Input tensor:  (batchsize=1, length, dim_in)
8 |     Output tensor: (batchsize=1, length, dim_out)
9 |     https://github.com/pytorch/pytorch/issues/1333
10 |     Note: Tanh is optional
11 |     """
12 |     def __init__(self, input_dim, output_dim, dilation_s, kernel_s,
13 |                  causal = False, stride = 1, groups=1, bias=True, \
14 |                  tanh = True, pad_mode='constant'):
15 |         super(Conv1dKeepLength, self).__init__(
16 |             input_dim, output_dim, kernel_s, stride=stride,
17 |             padding = 0, dilation = dilation_s, groups=groups, bias=bias)
18 | 
19 |         self.pad_mode = pad_mode
20 | 
21 |         self.causal = causal
22 |         # input & output length will be the same
23 |         if self.causal:
24 |             # left pad to make the convolution causal
25 |             self.pad_le = dilation_s * (kernel_s - 1)
26 |             self.pad_ri = 0
27 |         else:
28 |             # pad on both sides
29 |             self.pad_le = dilation_s * (kernel_s - 1) // 2
30 |             self.pad_ri = dilation_s * (kernel_s - 1) - self.pad_le
31 | 
32 |         if tanh:
33 |             self.l_ac = nn.Tanh()
34 |         else:
35 |             self.l_ac = nn.Identity()
36 | 
37 |     def forward(self, data):
38 |         # permute to (batchsize=1, dim, length)
39 |         # add one dimension (batchsize=1, dim, ADDED_DIM, length)
40 |         # pad along the length axis (the extra dim lets F.pad's 2-D mode be used)
41 |         # squeeze back to (batchsize=1, dim, length)
42 |         # https://github.com/pytorch/pytorch/issues/1333
43 |         x = F.pad(data.permute(0, 2, 1).unsqueeze(2), \
44 |                   (self.pad_le, self.pad_ri, 0, 0),
45 |                   mode = self.pad_mode).squeeze(2)
46 |         # tanh(conv1d())
47 |         # permute back to (batchsize=1, length, dim)
48 |         output = self.l_ac(super(Conv1dKeepLength, self).forward(x))
49 |         return output.permute(0, 2, 1)
--------------------------------------------------------------------------------
/nsf_hifigan/model/modules/moving_average.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 | 
5 | from .conv1_keep_length import Conv1dKeepLength
6 | 
7 | class MovingAverage(Conv1dKeepLength):
8 |     """ Wrapper to define a moving average smoothing layer
9 |     Note: MovingAverage can be implemented using TimeInvFIRFilter too.
10 |     Here we define another Module directly on Conv1dKeepLength
11 |     """
12 |     def __init__(self, feature_dim, window_len, causal=False, \
13 |                  pad_mode='replicate'):
14 |         super(MovingAverage, self).__init__(
15 |             feature_dim, feature_dim, 1, window_len, causal,
16 |             groups=feature_dim, bias=False, tanh=False, \
17 |             pad_mode=pad_mode)
18 |         # set the weighting coefficients
19 |         nn.init.constant_(self.weight, 1/window_len)
20 |         # turn off grad for this layer
21 |         for p in self.parameters():
22 |             p.requires_grad = False
23 | 
24 |     def forward(self, data):
25 |         return super(MovingAverage, self).forward(data)
--------------------------------------------------------------------------------
/nsf_hifigan/model/nsf_hifigan.py:
--------------------------------------------------------------------------------
1 | 
2 | import itertools
3 | from typing import Any, Dict
4 | import torch
5 | from torch import nn
6 | from torch.nn import functional as F
7 | from torch import optim
8 | import torchaudio
9 | import torchaudio.transforms as T
10 | 
11 | import pytorch_lightning as pl
12 | import torchmetrics
13 | 
14 | from nsf_hifigan.mel_processing import spectrogram_torch_audio
15 | 
16 | from .discriminators.multi_scale_discriminator import MultiScaleDiscriminator
17 | from .discriminators.multi_period_discriminator import MultiPeriodDiscriminator
18 | from .generators.generator import NSFHiFiGANGenerator
19 | 
20 | from .loss import discriminator_loss, kl_loss, feature_loss, generator_loss
21 | from .. import utils
22 | from .commons import slice_segments, rand_slice_segments, sequence_mask, clip_grad_value_
23 | 
24 | from .pipeline import AudioPipeline
25 | 
26 | class NSF_HifiGAN(pl.LightningModule):
27 |     def __init__(self, **kwargs):
28 |         super().__init__()
29 |         self.save_hyperparameters(*[k for k in kwargs])
30 | 
31 |         self.net_g = NSFHiFiGANGenerator(
32 |             in_dim = self.hparams.data.n_mel_channels,
33 |             out_dim = 1,
34 |             upsampling_rate = self.hparams.data.hop_length,
35 |             sampling_rate = self.hparams.data.sampling_rate,
36 |         )
37 |         self.net_period_d = MultiPeriodDiscriminator(
38 |             periods=self.hparams.model.multi_period_discriminator_periods,
39 |             use_spectral_norm=self.hparams.model.use_spectral_norm
40 |         )
41 |         self.net_scale_d = MultiScaleDiscriminator(use_spectral_norm=self.hparams.model.use_spectral_norm)
42 | 
43 |         self.audio_pipeline = AudioPipeline(freq=self.hparams.data.sampling_rate,
44 |                                             n_fft=self.hparams.data.filter_length,
45 |                                             n_mel=self.hparams.data.n_mel_channels,
46 |                                             win_length=self.hparams.data.win_length,
47 |                                             hop_length=self.hparams.data.hop_length)
48 |         for param in self.audio_pipeline.parameters():
49 |             param.requires_grad = False
50 | 
51 |         # metrics
52 |         self.valid_spec_loss = torchmetrics.MeanMetric()
53 | 
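    # A note on the slicing arithmetic in training_step below (illustrative numbers;
    # the real values come from configs/48k.json): with, say, segment_size=16384 and
    # hop_length=512, mel/pitch are sliced at frame granularity,
    #     segment_size // hop_length = 16384 // 512 = 32 frames,
    # while the target waveform is sliced at sample granularity starting at
    #     ids_slice * hop_length,
    # so both slices cover exactly the same stretch of audio.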
54 |     def training_step(self, batch: Dict[str, torch.Tensor], batch_idx: int, optimizer_idx: int):
55 |         x_wav, x_wav_lengths = batch["x_wav_values"], batch["x_wav_lengths"]
56 |         x_pitch, x_pitch_lengths = batch["x_pitch_values"], batch["x_pitch_lengths"]
57 |         y_wav, y_wav_lengths = batch["y_wav_values"], batch["y_wav_lengths"]
58 | 
59 |         with torch.inference_mode():
60 |             x_mel = self.audio_pipeline(x_wav.squeeze(1), aug=True)
61 |             x_mel_lengths = (x_wav_lengths / self.hparams.data.hop_length).long()
62 | 
63 |         x_mel, ids_slice = rand_slice_segments(x_mel, x_mel_lengths, self.hparams.train.segment_size // self.hparams.data.hop_length)
64 |         x_pitch = slice_segments(x_pitch.unsqueeze(1), ids_slice, self.hparams.train.segment_size // self.hparams.data.hop_length).squeeze(1) # slice pitch at frame granularity
65 |         y_wav = slice_segments(y_wav, ids_slice * self.hparams.data.hop_length, self.hparams.train.segment_size) # slice target waveform at sample granularity
66 | 
67 |         y_spec = spectrogram_torch_audio(
68 |             y_wav.squeeze(1).float(),
69 |             self.hparams.data.filter_length,
70 |             self.hparams.data.sampling_rate,
71 |             self.hparams.data.hop_length,
72 |             self.hparams.data.win_length,
73 |             False
74 |         )
75 | 
76 |         # generator forward
77 |         y_hat = self.net_g(x_mel.transpose(1,2), x_pitch).transpose(1,2)
78 | 
79 |         y_spec_hat = spectrogram_torch_audio(
80 |             y_hat.squeeze(1).float(),
81 |             self.hparams.data.filter_length,
82 |             self.hparams.data.sampling_rate,
83 |             self.hparams.data.hop_length,
84 |             self.hparams.data.win_length,
85 |             False
86 |         )
87 | 
88 |         # Discriminator step (optimizer_idx 0 is optim_d, the first optimizer returned by configure_optimizers)
89 |         if optimizer_idx == 0:
90 |             # MPD
91 |             y_dp_hat_r, y_dp_hat_g, _, _ = self.net_period_d(y_wav, y_hat.detach())
92 |             loss_disc_p, losses_disc_p_r, losses_disc_p_g = discriminator_loss(y_dp_hat_r, y_dp_hat_g)
93 | 
94 |             # MSD
95 |             y_ds_hat_r, y_ds_hat_g, _, _ = self.net_scale_d(y_wav, y_hat.detach())
96 |             loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)
97 | 
98 |             loss_disc_all = loss_disc_p + loss_disc_s
99 | 
100 |             # log
101 |             lr = self.optim_g.param_groups[0]['lr']
102 |             scalar_dict = {"train/d/loss_total": loss_disc_all, "learning_rate": lr}
103 |             scalar_dict.update({"train/d_p_r/{}".format(i): v for i, v in enumerate(losses_disc_p_r)})
104 |             scalar_dict.update({"train/d_p_g/{}".format(i): v for i, v in enumerate(losses_disc_p_g)})
105 |             scalar_dict.update({"train/d_s_r/{}".format(i): v for i, v in enumerate(losses_disc_s_r)})
106 |             scalar_dict.update({"train/d_s_g/{}".format(i): v for i, v in enumerate(losses_disc_s_g)})
107 | 
108 |             image_dict = {}
109 | 
110 |             tensorboard = self.logger.experiment
111 | 
112 |             utils.summarize(
113 |                 writer=tensorboard,
114 |                 global_step=self.global_step,
115 |                 images=image_dict,
116 |                 scalars=scalar_dict)
117 | 
118 |             return loss_disc_all
119 | 
120 |         # Generator step (optimizer_idx 1 is optim_g)
121 |         if optimizer_idx == 1:
122 |             y_dp_hat_r, y_dp_hat_g, fmap_p_r, fmap_p_g = self.net_period_d(y_wav, y_hat)
123 |             loss_p_fm = feature_loss(fmap_p_r, fmap_p_g)
124 |             loss_p_gen, losses_p_gen = generator_loss(y_dp_hat_g)
125 | 
126 |             y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = self.net_scale_d(y_wav, y_hat)
127 |             loss_s_fm = feature_loss(fmap_s_r, fmap_s_g)
128 |             loss_s_gen, losses_s_gen = generator_loss(y_ds_hat_g)
129 | 
130 |             # spectrogram reconstruction loss (L1 between linear spectrograms, weighted by c_spec)
131 |             loss_spec = F.l1_loss(y_spec_hat, y_spec) * self.hparams.train.c_spec
132 | 
133 |             loss_gen_all = (loss_s_gen + loss_s_fm) + (loss_p_gen + loss_p_fm) + loss_spec
134 | 
135 |             # Logging to TensorBoard by default
136 |             lr = self.optim_g.param_groups[0]['lr']
137 |             scalar_dict = {"train/g/loss_total": loss_gen_all, "learning_rate": lr}
138 |             scalar_dict.update({
139 |                 "train/g/p_fm": loss_p_fm,
140 |                 "train/g/s_fm": loss_s_fm,
141 |                 "train/g/p_gen": loss_p_gen,
142 |                 "train/g/s_gen": loss_s_gen,
143 |                 "train/g/loss_spec": loss_spec,
144 |             })
145 | 
146 |             scalar_dict.update({"train/g/p_gen_{}".format(i): v for i, v in enumerate(losses_p_gen)})
147 |             scalar_dict.update({"train/g/s_gen_{}".format(i): v for i, v in enumerate(losses_s_gen)})
148 | 
149 |             image_dict = {}
150 | 
151 |             tensorboard = self.logger.experiment
152 |             utils.summarize(
153 |                 writer=tensorboard,
154 |                 global_step=self.global_step,
155 |                 images=image_dict,
156 |                 scalars=scalar_dict)
157 |             return loss_gen_all
158 | 
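    # Sanity-check sketch for the LSGAN objectives used above (illustrative only):
    # discriminator_loss drives D(real) -> 1 and D(fake) -> 0, generator_loss
    # drives D(fake) -> 1. With perfect scores both losses vanish:
    #
    #     real, fake = torch.ones(4), torch.zeros(4)
    #     d_loss, _, _ = discriminator_loss([real], [fake])   # tensor(0.)
    #     g_loss, _ = generator_loss([torch.ones(4)])         # tensor(0.) once D(fake) == 1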
batch["x_wav_lengths"] 163 | x_pitch, x_pitch_lengths = batch["x_pitch_values"], batch["x_pitch_lengths"] 164 | y_wav, y_wav_lengths = batch["y_wav_values"], batch["y_wav_lengths"] 165 | 166 | with torch.inference_mode(): 167 | x_mel = self.audio_pipeline(x_wav.squeeze(1), aug=False) 168 | x_mel_lengths = (x_wav_lengths / self.hparams.data.hop_length).long() 169 | 170 | y_spec = spectrogram_torch_audio(y_wav.squeeze(1), 171 | self.hparams.data.filter_length, 172 | self.hparams.data.sampling_rate, 173 | self.hparams.data.hop_length, 174 | self.hparams.data.win_length, center=False) 175 | y_spec_lengths = (y_wav_lengths / self.hparams.data.hop_length).long() 176 | 177 | # remove else 178 | y_wav_hat = self.net_g(x_mel.transpose(1,2), x_pitch).transpose(1,2) 179 | y_hat_lengths = torch.tensor([y_wav_hat.shape[2]], dtype=torch.long) 180 | 181 | y_spec_hat = spectrogram_torch_audio(y_wav_hat.squeeze(1), 182 | self.hparams.data.filter_length, 183 | self.hparams.data.sampling_rate, 184 | self.hparams.data.hop_length, 185 | self.hparams.data.win_length, center=False) 186 | 187 | image_dict = { 188 | "gen/spec": utils.plot_spectrogram_to_numpy(y_spec_hat[0].cpu().numpy()), 189 | "gt/spec": utils.plot_spectrogram_to_numpy(y_spec[0].cpu().numpy()) 190 | } 191 | audio_dict = { 192 | "gen/audio": y_wav_hat[0,:,:y_hat_lengths[0]].squeeze(0).float(), 193 | "gt/audio": y_wav[0,:,:y_wav_lengths[0]].squeeze(0).float() 194 | } 195 | 196 | spec_mask = torch.unsqueeze(sequence_mask(x_mel_lengths.long(), y_spec.size(2)), 1).to(y_spec.dtype) 197 | 198 | # metrics compute 199 | y_spec_masked = y_spec * spec_mask 200 | y_spec_masked_hat = y_spec_hat * spec_mask 201 | valid_spec_loss_step = F.l1_loss(y_spec_masked_hat, y_spec_masked) 202 | self.valid_spec_loss.update(valid_spec_loss_step.item()) 203 | self.log("valid/loss_spec_step", valid_spec_loss_step.item(), sync_dist=True) 204 | 205 | # logging 206 | tensorboard = self.logger.experiment 207 | utils.summarize( 208 | writer=tensorboard, 209 | global_step=self.global_step, 210 | images=image_dict, 211 | audios=audio_dict, 212 | audio_sampling_rate=self.hparams.data.sampling_rate, 213 | ) 214 | 215 | def validation_epoch_end(self, outputs) -> None: 216 | self.net_g.eval() 217 | valid_spec_loss_epoch = self.valid_spec_loss.compute() 218 | self.log("valid/loss_spec_epoch", valid_spec_loss_epoch.item(), sync_dist=True) 219 | self.valid_spec_loss.reset() 220 | 221 | def configure_optimizers(self): 222 | self.optim_g = torch.optim.AdamW( 223 | self.net_g.parameters(), 224 | self.hparams.train.generator_learning_rate, 225 | betas=self.hparams.train.betas, 226 | eps=self.hparams.train.eps) 227 | self.optim_d = torch.optim.AdamW( 228 | itertools.chain(self.net_period_d.parameters(), self.net_scale_d.parameters()), 229 | self.hparams.train.discriminator_learning_rate, 230 | betas=self.hparams.train.betas, 231 | eps=self.hparams.train.eps) 232 | self.scheduler_g = torch.optim.lr_scheduler.ExponentialLR(self.optim_g, gamma=self.hparams.train.lr_decay) 233 | self.scheduler_g.last_epoch = self.current_epoch - 1 234 | self.scheduler_d = torch.optim.lr_scheduler.ExponentialLR(self.optim_d, gamma=self.hparams.train.lr_decay) 235 | self.scheduler_d.last_epoch = self.current_epoch - 1 236 | 237 | return [self.optim_d, self.optim_g], [self.scheduler_d, self.scheduler_g] -------------------------------------------------------------------------------- /nsf_hifigan/model/pipeline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 
--------------------------------------------------------------------------------
/nsf_hifigan/model/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio
3 | import torchaudio.transforms as T
4 | 
5 | 
6 | from torch import nn
7 | from torch.nn import functional as F
8 | from torch import optim
9 | import random
10 | 
11 | import numpy as np
12 | 
13 | class GaussianNoise(torch.nn.Module):
14 |     def __init__(self, min_snr=0.0001, max_snr=0.01):
15 |         """
16 |         :param min_snr: minimum noise-to-signal std ratio (noise std = ratio * signal std)
17 |         :param max_snr: maximum noise-to-signal std ratio
18 |         """
19 |         super().__init__()
20 |         self.min_snr = min_snr
21 |         self.max_snr = max_snr
22 | 
23 |     def forward(self, audio):
24 |         std = torch.std(audio)
25 |         noise_std = random.uniform(self.min_snr * std, self.max_snr * std)
26 | 
27 |         norm_dist = torch.distributions.normal.Normal(0.0, noise_std)
28 |         noise = norm_dist.rsample(audio.shape).type(audio.dtype).to(audio.device)
29 | 
30 |         return audio + noise
31 | 
32 | class AudioPipeline(torch.nn.Module):
33 |     def __init__(
34 |         self,
35 |         freq=16000,
36 |         n_fft=1024,
37 |         n_mel=128,
38 |         win_length=1024,
39 |         hop_length=256
40 |     ):
41 |         super().__init__()
42 | 
43 |         self.freq = freq
44 | 
45 |         pad = int((n_fft-hop_length)/2)
46 |         self.spec = T.Spectrogram(n_fft=n_fft, win_length=win_length, hop_length=hop_length,
47 |                                   pad=pad, power=None, center=False, pad_mode='reflect', normalized=False, onesided=True)
48 | 
49 |         # self.strech = T.TimeStretch(hop_length=hop_length, n_freq=freq)
50 |         self.spec_aug = torch.nn.Sequential(
51 |             GaussianNoise(min_snr=0.0001, max_snr=0.02),
52 |             T.FrequencyMasking(freq_mask_param=80),
53 |             # T.TimeMasking(time_mask_param=80),
54 |         )
55 | 
56 |         self.mel_scale = T.MelScale(n_mels=n_mel, sample_rate=freq, n_stft=n_fft // 2 + 1)
57 | 
58 |     def forward(self, waveform: torch.Tensor, aug: bool=False) -> torch.Tensor:
59 |         shift_waveform = waveform
60 |         # complex STFT (power=None), then take the magnitude
61 |         spec = self.spec(shift_waveform)
62 |         spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
63 |         # Apply SpecAugment-style augmentation
64 |         if aug:
65 |             spec = self.spec_aug(spec)
66 |         # Convert to mel-scale
67 |         mel = self.mel_scale(spec)
68 |         return mel
--------------------------------------------------------------------------------
/nsf_hifigan/pyin.py:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | from numpy.lib.stride_tricks import as_strided
  | import scipy.stats                     # needed by the beta/boltzmann priors below
  | from librosa import sequence, util     # this module is adapted from librosa's pyin; these helpers come from there
  | seq = sequence                         # the code below refers to this module under both names
4 | 
5 | class ParameterError(Exception):
6 |     """Exception class for mal-formed inputs"""
7 |     pass
8 | 
9 | def valid_audio(y, *, mono=True):
10 |     if not isinstance(y, np.ndarray):
11 |         raise TypeError("Audio data must be of type numpy.ndarray")
12 | 
13 |     if not np.issubdtype(y.dtype, np.floating):
14 |         raise TypeError("Audio data must be floating-point")
15 | 
16 |     if y.ndim == 0:
17 |         raise TypeError(
18 |             "Audio data must be at least one-dimensional, given y.shape={}".format(
19 |                 y.shape
20 |             )
21 |         )
22 | 
23 |     if mono and y.ndim != 1:
24 |         raise TypeError(
25 |             "Invalid shape for monophonic audio: "
26 |             "ndim={:d}, shape={}".format(y.ndim, y.shape)
27 |         )
28 | 
29 |     if not np.isfinite(y).all():
30 |         raise TypeError("Audio buffer is not finite everywhere")
31 | 
32 |     return True
33 | 
34 | def frame(x, *, frame_length, hop_length, axis=-1, writeable=False, subok=False):
35 |     # This implementation is derived from numpy.lib.stride_tricks.sliding_window_view (1.20.0)
36 |     # https://numpy.org/doc/stable/reference/generated/numpy.lib.stride_tricks.sliding_window_view.html
37 | 
38 |     x = np.array(x, copy=False, subok=subok)
39 | 
40 |     if x.shape[axis] < frame_length:
41 |         raise ParameterError(
42 |             "Input is too short (n={:d})"
" for frame_length={:d}".format(x.shape[axis], frame_length) 44 | ) 45 | 46 | if hop_length < 1: 47 | raise ParameterError("Invalid hop_length: {:d}".format(hop_length)) 48 | 49 | # put our new within-frame axis at the end for now 50 | out_strides = x.strides + tuple([x.strides[axis]]) 51 | 52 | # Reduce the shape on the framing axis 53 | x_shape_trimmed = list(x.shape) 54 | x_shape_trimmed[axis] -= frame_length - 1 55 | 56 | out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) 57 | xw = as_strided( 58 | x, strides=out_strides, shape=out_shape, subok=subok, writeable=writeable 59 | ) 60 | 61 | if axis < 0: 62 | target_axis = axis - 1 63 | else: 64 | target_axis = axis + 1 65 | 66 | xw = np.moveaxis(xw, -1, target_axis) 67 | 68 | # Downsample along the target axis 69 | slices = [slice(None)] * xw.ndim 70 | slices[axis] = slice(0, None, hop_length) 71 | return xw[tuple(slices)] 72 | 73 | def pyin(y, *, fmin, fmax, sr=22050, frame_length=2048, win_length=None, hop_length=None, 74 | n_thresholds=100, beta_parameters=(2, 18), boltzmann_parameter=2, resolution=0.1, 75 | max_transition_rate=35.92, switch_prob=0.01, no_trough_prob=0.01, fill_na=np.nan, 76 | center=True, pad_mode="constant" 77 | ): 78 | 79 | if fmin is None or fmax is None: 80 | raise ParameterError('both "fmin" and "fmax" must be provided') 81 | 82 | # Set the default window length if it is not already specified. 83 | if win_length is None: 84 | win_length = frame_length // 2 85 | 86 | if win_length >= frame_length: 87 | raise ParameterError( 88 | "win_length={} cannot exceed given frame_length={}".format( 89 | win_length, frame_length 90 | ) 91 | ) 92 | 93 | # Set the default hop if it is not already specified. 94 | if hop_length is None: 95 | hop_length = frame_length // 4 96 | 97 | # Check that audio is valid. 98 | valid_audio(y, mono=False) 99 | 100 | # Pad the time series so that frames are centered 101 | if center: 102 | padding = [(0, 0) for _ in y.shape] 103 | padding[-1] = (frame_length // 2, frame_length // 2) 104 | y = np.pad(y, padding, mode=pad_mode) 105 | 106 | # Frame audio. 107 | y_frames = frame(y, frame_length=frame_length, hop_length=hop_length) 108 | 109 | # Calculate minimum and maximum periods 110 | min_period = max(int(np.floor(sr / fmax)), 1) 111 | max_period = min(int(np.ceil(sr / fmin)), frame_length - win_length - 1) 112 | 113 | # Calculate cumulative mean normalized difference function. 114 | yin_frames = _cumulative_mean_normalized_difference( 115 | y_frames, frame_length, win_length, min_period, max_period 116 | ) 117 | 118 | # Parabolic interpolation. 119 | parabolic_shifts = _parabolic_interpolation(yin_frames) 120 | 121 | # Find Yin candidates and probabilities. 122 | # The implementation here follows the official pYIN software which 123 | # differs from the method described in the paper. 124 | # 1. Define the prior over the thresholds. 
125 | thresholds = np.linspace(0, 1, n_thresholds + 1) 126 | beta_cdf = scipy.stats.beta.cdf(thresholds, beta_parameters[0], beta_parameters[1]) 127 | beta_probs = np.diff(beta_cdf) 128 | 129 | n_bins_per_semitone = int(np.ceil(1.0 / resolution)) 130 | n_pitch_bins = int(np.floor(12 * n_bins_per_semitone * np.log2(fmax / fmin))) + 1 131 | 132 | def _helper(a, b): 133 | return __pyin_helper( 134 | a, 135 | b, 136 | sr, 137 | thresholds, 138 | boltzmann_parameter, 139 | beta_probs, 140 | no_trough_prob, 141 | min_period, 142 | fmin, 143 | n_pitch_bins, 144 | n_bins_per_semitone, 145 | ) 146 | 147 | helper = np.vectorize(_helper, signature="(f,t),(k,t)->(1,d,t),(j,t)") 148 | observation_probs, voiced_prob = helper(yin_frames, parabolic_shifts) 149 | 150 | # Construct transition matrix. 151 | max_semitones_per_frame = round(max_transition_rate * 12 * hop_length / sr) 152 | transition_width = max_semitones_per_frame * n_bins_per_semitone + 1 153 | # Construct the within voicing transition probabilities 154 | transition = sequence.transition_local( 155 | n_pitch_bins, transition_width, window="triangle", wrap=False 156 | ) 157 | 158 | # Include across voicing transition probabilities 159 | t_switch = sequence.transition_loop(2, 1 - switch_prob) 160 | transition = np.kron(t_switch, transition) 161 | 162 | p_init = np.zeros(2 * n_pitch_bins) 163 | p_init[n_pitch_bins:] = 1 / n_pitch_bins 164 | 165 | states = seq.viterbi(observation_probs, transition, p_init=p_init) 166 | 167 | # Find f0 corresponding to each decoded pitch bin. 168 | freqs = fmin * 2 ** (np.arange(n_pitch_bins) / (12 * n_bins_per_semitone)) 169 | f0 = freqs[states % n_pitch_bins] 170 | voiced_flag = states < n_pitch_bins 171 | 172 | if fill_na is not None: 173 | f0[~voiced_flag] = fill_na 174 | 175 | return f0[..., 0, :], voiced_flag[..., 0, :], voiced_prob[..., 0, :] 176 | 177 | 178 | def __pyin_helper( 179 | yin_frames, 180 | parabolic_shifts, 181 | sr, 182 | thresholds, 183 | boltzmann_parameter, 184 | beta_probs, 185 | no_trough_prob, 186 | min_period, 187 | fmin, 188 | n_pitch_bins, 189 | n_bins_per_semitone, 190 | ): 191 | 192 | yin_probs = np.zeros_like(yin_frames) 193 | 194 | for i, yin_frame in enumerate(yin_frames.T): 195 | # 2. For each frame find the troughs. 196 | is_trough = util.localmin(yin_frame) 197 | 198 | is_trough[0] = yin_frame[0] < yin_frame[1] 199 | (trough_index,) = np.nonzero(is_trough) 200 | 201 | if len(trough_index) == 0: 202 | continue 203 | 204 | # 3. Find the troughs below each threshold. 205 | # these are the local minima of the frame, could get them directly without the trough index 206 | trough_heights = yin_frame[trough_index] 207 | trough_thresholds = np.less.outer(trough_heights, thresholds[1:]) 208 | 209 | # 4. Define the prior over the troughs. 210 | # Smaller periods are weighted more. 211 | trough_positions = np.cumsum(trough_thresholds, axis=0) - 1 212 | n_troughs = np.count_nonzero(trough_thresholds, axis=0) 213 | 214 | trough_prior = scipy.stats.boltzmann.pmf( 215 | trough_positions, boltzmann_parameter, n_troughs 216 | ) 217 | 218 | trough_prior[~trough_thresholds] = 0 219 | 220 | # 5. For each threshold add probability to global minimum if no trough is below threshold, 221 | # else add probability to each trough below threshold biased by prior. 
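        # (Shape note: trough_prior is (n_troughs, n_thresholds) and beta_probs is
        # (n_thresholds,), so the dot product below marginalizes over thresholds,
        # leaving one probability per trough.)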
222 | 
223 |         probs = trough_prior.dot(beta_probs)
224 | 
225 |         global_min = np.argmin(trough_heights)
226 |         n_thresholds_below_min = np.count_nonzero(~trough_thresholds[global_min, :])
227 |         probs[global_min] += no_trough_prob * np.sum(
228 |             beta_probs[:n_thresholds_below_min]
229 |         )
230 | 
231 |         yin_probs[trough_index, i] = probs
232 | 
233 |     yin_period, frame_index = np.nonzero(yin_probs)
234 | 
235 |     # Refine peak by parabolic interpolation.
236 |     period_candidates = min_period + yin_period
237 |     period_candidates = period_candidates + parabolic_shifts[yin_period, frame_index]
238 |     f0_candidates = sr / period_candidates
239 | 
240 |     # Find pitch bin corresponding to each f0 candidate.
241 |     bin_index = 12 * n_bins_per_semitone * np.log2(f0_candidates / fmin)
242 |     bin_index = np.clip(np.round(bin_index), 0, n_pitch_bins).astype(int)
243 | 
244 |     # Observation probabilities.
245 |     observation_probs = np.zeros((2 * n_pitch_bins, yin_frames.shape[1]))
246 |     observation_probs[bin_index, frame_index] = yin_probs[yin_period, frame_index]
247 | 
248 |     voiced_prob = np.clip(
249 |         np.sum(observation_probs[:n_pitch_bins, :], axis=0, keepdims=True), 0, 1
250 |     )
251 |     observation_probs[n_pitch_bins:, :] = (1 - voiced_prob) / n_pitch_bins
252 | 
253 |     return observation_probs[np.newaxis], voiced_prob
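# Usage sketch for this pyin port (illustrative values; the interface mirrors
# librosa.pyin, from which this module is adapted):
#
#     import numpy as np
#     from nsf_hifigan.pyin import pyin
#
#     sr = 48000
#     y = np.random.randn(sr).astype(np.float32)   # stand-in for 1 s of audio
#     f0, voiced_flag, voiced_prob = pyin(y, fmin=65.0, fmax=1000.0, sr=sr, fill_na=0.0)
#     # f0: per-frame F0 in Hz, 0.0 on unvoiced frames (via fill_na)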
--------------------------------------------------------------------------------
/nsf_hifigan/utils.py:
--------------------------------------------------------------------------------
1 | 
2 | import logging
3 | import sys
4 | import torch
5 | import torchaudio
6 | from typing import Any, Dict, List, Tuple
7 | 
8 | 
9 | logging.basicConfig(stream=sys.stdout, level=logging.INFO)
10 | logger = logging
11 | 
12 | def load_filepaths(filename: str) -> List[str]:
13 |     with open(filename, encoding='utf-8') as f:
14 |         filepaths = [line.rstrip() for line in f]
15 |     return filepaths
16 | 
17 | def load_wav_to_torch(full_path: str) -> Tuple[torch.FloatTensor, int]:
18 |     data, sampling_rate = torchaudio.load(full_path)
19 |     if len(data.shape) >= 2:
20 |         data = torch.mean(data, dim=0)
21 |     return data, sampling_rate
22 | 
23 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
24 |     for k, v in scalars.items():
25 |         writer.add_scalar(k, v, global_step)
26 |     for k, v in histograms.items():
27 |         writer.add_histogram(k, v, global_step)
28 |     for k, v in images.items():
29 |         writer.add_image(k, v, global_step, dataformats='HWC')
30 |     for k, v in audios.items():
31 |         writer.add_audio(k, v, global_step, audio_sampling_rate)
32 | 
33 | MATPLOTLIB_FLAG = False
34 | def plot_spectrogram_to_numpy(spectrogram):
35 |     global MATPLOTLIB_FLAG
36 |     if not MATPLOTLIB_FLAG:
37 |         import matplotlib
38 |         matplotlib.use("Agg")
39 |         MATPLOTLIB_FLAG = True
40 |         mpl_logger = logging.getLogger('matplotlib')
41 |         mpl_logger.setLevel(logging.WARNING)
42 |     import matplotlib.pylab as plt
43 |     import numpy as np
44 | 
45 |     fig, ax = plt.subplots(figsize=(10, 2))
46 |     im = ax.imshow(spectrogram, aspect="auto", origin="lower",
47 |                    interpolation='none')
48 |     plt.colorbar(im, ax=ax)
49 |     plt.xlabel("Frames")
50 |     plt.ylabel("Channels")
51 |     plt.tight_layout()
52 | 
53 |     fig.canvas.draw()
54 |     data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)  # np.fromstring is deprecated for binary buffers
55 |     data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
56 |     plt.close()
57 |     return data
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vtuber-plan/NSF-HiFiGAN/0f47ae9ace00da2b1252e48c0ca38af6c1598fdb/requirements.txt
--------------------------------------------------------------------------------
/split.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import random
3 | import os
4 | 
5 | if __name__ == "__main__":
6 |     parser = argparse.ArgumentParser()
7 |     parser.add_argument('-i', '--input', type=str, default="./filelists/48k_audio_filelist.txt", help='filelist path')
8 |     parser.add_argument('-o', '--output', type=str, default="./filelists", help='File list output path')
9 |     args = parser.parse_args()
10 | 
11 |     random.seed(1234)
12 | 
13 |     with open(args.input, "r", encoding="utf-8") as f:
14 |         lines = f.readlines()
15 | 
16 |     lines = sorted(lines)  # sort first so the seeded shuffle is deterministic across runs
17 |     random.shuffle(lines)
18 | 
19 |     origin_filename = os.path.basename(args.input)
20 |     data_len = len(lines)
21 | 
22 |     valid_num = max(1, int(data_len * 0.001))  # guard against 0, which would break the negative slices below
23 |     test_num = max(1, int(data_len * 0.001))
24 | 
25 |     with open(os.path.join(args.output, origin_filename.replace(".txt", "_train.txt")), "w", encoding="utf-8") as f:
26 |         f.writelines(lines[:-valid_num-test_num])
27 | 
28 |     with open(os.path.join(args.output, origin_filename.replace(".txt", "_valid.txt")), "w", encoding="utf-8") as f:
29 |         f.writelines(lines[-valid_num-test_num:-test_num])
30 | 
31 |     with open(os.path.join(args.output, origin_filename.replace(".txt", "_test.txt")), "w", encoding="utf-8") as f:
32 |         f.writelines(lines[-test_num:])
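A quick usage sketch for split.py (paths are the script's own defaults):

    python split.py -i ./filelists/48k_audio_filelist.txt -o ./filelists

For example, a 10,000-line filelist yields valid_num = test_num = max(1, int(10000 * 0.001)) = 10,
so 9,980 lines go to 48k_audio_filelist_train.txt and 10 each to _valid.txt and _test.txt.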
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | 
2 | import os
3 | import json
4 | import glob
5 | import argparse
6 | from typing import Optional
7 | import torch
8 | import torchaudio
9 | import tqdm
10 | from torch import nn, optim
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader
13 | from nsf_hifigan.model.nsf_hifigan import NSF_HifiGAN
14 | 
15 | from nsf_hifigan.data.collate import MelCollate
16 | 
17 | import pytorch_lightning as pl
18 | from pytorch_lightning.callbacks import ModelCheckpoint
19 | from pytorch_lightning.callbacks.early_stopping import EarlyStopping
20 | from pytorch_lightning.profiler import SimpleProfiler, AdvancedProfiler
21 | 
22 | from nsf_hifigan.hparams import HParams
23 | from nsf_hifigan.data.dataset import MelDataset
24 | 
25 | def get_hparams(config_path: str) -> HParams:
26 |     with open(config_path, "r") as f:
27 |         data = f.read()
28 |     config = json.loads(data)
29 | 
30 |     hparams = HParams(**config)
31 |     return hparams
32 | 
33 | def last_checkpoint(path: str) -> Optional[str]:
34 |     ckpt_path = None
35 |     if os.path.exists(os.path.join(path, "lightning_logs")):
36 |         versions = glob.glob(os.path.join(path, "lightning_logs", "version_*"))
37 |         if len(list(versions)) > 0:
38 |             last_ver = sorted(list(versions), key=lambda p: int(p.split("_")[-1]))[-1]
39 |             last_ckpt = os.path.join(last_ver, "checkpoints/last.ckpt")
40 |             if os.path.exists(last_ckpt):
41 |                 ckpt_path = last_ckpt
42 |     return ckpt_path
43 | 
44 | def get_train_params(args, hparams):
45 |     devices = [int(n.strip()) for n in args.device.split(",")]
46 | 
47 |     checkpoint_callback = ModelCheckpoint(
48 |         dirpath=None, save_last=True, every_n_train_steps=2000, save_weights_only=False,
49 |         monitor="valid/loss_spec_epoch", mode="min", save_top_k=5  # must match the metric name logged in validation_epoch_end
50 |     )
51 |     earlystop_callback = EarlyStopping(monitor="valid/loss_spec_epoch", mode="min", patience=13)
52 | 
53 |     trainer_params = {
54 |         "accelerator": args.accelerator,
55 |         "callbacks": [checkpoint_callback, earlystop_callback],
56 |     }
57 | 
58 |     if args.accelerator != "cpu":
59 |         trainer_params["devices"] = devices
60 | 
61 |     if len(devices) > 1:
62 |         trainer_params["strategy"] = "ddp"
63 | 
64 |     trainer_params.update(hparams.trainer)
65 | 
66 |     if hparams.train.fp16_run:
67 |         trainer_params["amp_backend"] = "native"
68 |         trainer_params["precision"] = 16
69 | 
70 |     trainer_params["num_nodes"] = args.num_nodes
71 | 
72 |     return trainer_params
73 | 
74 | def main():
75 |     parser = argparse.ArgumentParser()
76 |     parser.add_argument('-c', '--config', type=str, default="./configs/48k.json", help='JSON file for configuration')
77 |     parser.add_argument('-a', '--accelerator', type=str, default="gpu", help='training device')
78 |     parser.add_argument('-d', '--device', type=str, default="0", help='training device ids')
79 |     parser.add_argument('-n', '--num-nodes', type=int, default=1, help='training node number')
80 |     args = parser.parse_args()
81 | 
82 |     hparams = get_hparams(args.config)
83 |     pl.utilities.seed.seed_everything(hparams.train.seed)
84 | 
85 |     devices = [int(n.strip()) for n in args.device.split(",")]
86 | 
87 |     # data
88 |     train_dataset = MelDataset(hparams.data.training_files, hparams.data)
89 |     valid_dataset = MelDataset(hparams.data.validation_files, hparams.data)
90 | 
91 |     collate_fn = MelCollate()
92 | 
93 |     trainer_params = get_train_params(args, hparams)
94 |     if "strategy" in trainer_params and trainer_params["strategy"] == "ddp":
95 |         batch_per_gpu = hparams.train.batch_size // len(devices)
96 |     else:
97 |         batch_per_gpu = hparams.train.batch_size
98 | 
99 |     train_loader = DataLoader(train_dataset, batch_size=batch_per_gpu, num_workers=8, shuffle=True, pin_memory=True, collate_fn=collate_fn)
100 |     valid_loader = DataLoader(valid_dataset, batch_size=4, num_workers=4, shuffle=False, pin_memory=True, collate_fn=collate_fn)
101 | 
102 |     # model
103 |     model = NSF_HifiGAN(**hparams)
104 | 
105 |     # profiler = AdvancedProfiler(filename="profile.txt")
106 |     trainer = pl.Trainer(**trainer_params)  # , profiler=profiler, max_steps=200
107 |     # resume training
108 |     ckpt_path = last_checkpoint(hparams.trainer.default_root_dir)
109 |     trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=valid_loader, ckpt_path=ckpt_path)
110 | 
111 | if __name__ == "__main__":
112 |     main()
113 | 
--------------------------------------------------------------------------------
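A typical invocation with the shipped config (flags as defined in main() above):

    python train.py -c ./configs/48k.json -a gpu -d 0        # single GPU
    python train.py -c ./configs/48k.json -a gpu -d 0,1      # two GPUs; len(devices) > 1 enables DDP

Training resumes automatically from the newest lightning_logs/version_*/checkpoints/last.ckpt
under hparams.trainer.default_root_dir, if one exists.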