├── .gitignore ├── LICENSE ├── README.md ├── alignments.zip ├── audio ├── __init__.py ├── audio_processing.py ├── hparams_audio.py ├── stft.py └── tools.py ├── data └── ljspeech.py ├── dataset.py ├── eval.py ├── glow.py ├── hparams.py ├── img └── fastspeech_structure.png ├── loss.py ├── model.py ├── modules.py ├── optimizer.py ├── preprocess.py ├── requirements.txt ├── sample ├── 135000_0.wav ├── 135000_0_waveglow.wav ├── 135000_1.wav ├── 135000_1_waveglow.wav ├── 135000_2.wav ├── 135000_2_waveglow.wav ├── 135000_3.wav ├── 135000_3_waveglow.wav ├── 135000_4.wav ├── 135000_4_waveglow.wav ├── 135000_5.wav └── 135000_5_waveglow.wav ├── text ├── __init__.py ├── cleaners.py ├── cmudict.py ├── numbers.py └── symbols.py ├── train.py ├── transformer ├── Constants.py ├── Layers.py ├── Models.py ├── Modules.py ├── SubLayers.py └── __init__.py ├── utils.py └── waveglow ├── __init__.py ├── convert_model.py ├── inference.py └── mel2samp.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/ 108 | 109 | data/LJSpeech-1.1 110 | data/LJSpeech-1.1.tar.bz2 111 | data/train.txt 112 | 113 | mels 114 | alignments 115 | waveglow/pretrained_model -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Zhengxi Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 
10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FastSpeech-Pytorch 2 | An implementation of FastSpeech based on PyTorch. 3 | 4 | ## Update (2020/07/20) 5 | 1. Optimized the training process. 6 | 2. Optimized the implementation of the length regulator. 7 | 3. Switched to the same hyper-parameters as FastSpeech2. 8 | 4. **Changes 1-3 together make the training process about 3 times faster than before.** 9 | 5. **Better speech quality.** 10 | 11 | ## Model 12 |
13 | ![The structure of FastSpeech](./img/fastspeech_structure.png) 14 |
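The forward pass wired up in `model.py` follows the figure: a feed-forward Transformer encoder, a length regulator that repeats each input hidden state according to its (predicted or ground-truth) duration, and a decoder whose output goes through a mel linear layer and a CBHG postnet. The sketch below shows the inference path; the sentence and device handling are illustrative, and a trained checkpoint is needed for meaningful output (see `get_DNN` in `eval.py` for how checkpoints are loaded).

```python
import torch
import hparams as hp
from model import FastSpeech
from text import text_to_sequence

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FastSpeech().to(device).eval()  # load a trained checkpoint in practice

# Character ids and their 1-based positions (0 is reserved for padding).
seq = torch.LongTensor([text_to_sequence("Hello world.", hp.text_cleaners)]).to(device)
pos = torch.arange(1, seq.size(1) + 1).unsqueeze(0).to(device)

with torch.no_grad():
    mel, mel_postnet = model(seq, pos, alpha=1.0)  # each of shape (1, T, num_mels)
```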
15 | 16 | ## My Blog 17 | - [FastSpeech Reading Notes](https://zhuanlan.zhihu.com/p/67325775) 18 | - [Details and Rethinking of this Implementation](https://zhuanlan.zhihu.com/p/67939482) 19 | 20 | ## Prepare Dataset 21 | 1. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/). 22 | 2. Put the LJSpeech dataset in `data`. 23 | 3. Unzip `alignments.zip`. 24 | 4. Put the [Nvidia pretrained waveglow model](https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing) in `waveglow/pretrained_model` and rename it to `waveglow_256channels.pt`. 25 | 5. Run `python3 preprocess.py`. 26 | 27 | ## Training 28 | Run `python3 train.py`. 29 | 30 | ## Evaluation 31 | Run `python3 eval.py`. 32 | 33 | ## Notes 34 | - In the FastSpeech paper, the authors use a pre-trained Transformer-TTS model to provide the alignment targets. Since I didn't have a well-trained Transformer-TTS model, I use Tacotron2 instead. 35 | - I use the same hyper-parameters as [FastSpeech2](https://arxiv.org/abs/2006.04558). 36 | - Audio examples are in `sample`. 37 | - A [pretrained model](https://drive.google.com/file/d/1vMrKtbjPj9u_o3Y-8prE6hHCc6Yj4Nqk/view?usp=sharing) is available. 38 | 39 | ## Reference 40 | 41 | ### Repository 42 | - [The Implementation of Tacotron Based on Tensorflow](https://github.com/keithito/tacotron) 43 | - [The Implementation of Transformer Based on Pytorch](https://github.com/jadore801120/attention-is-all-you-need-pytorch) 44 | - [The Implementation of Transformer-TTS Based on Pytorch](https://github.com/xcmyz/Transformer-TTS) 45 | - [The Implementation of Tacotron2 Based on Pytorch](https://github.com/NVIDIA/tacotron2) 46 | - [The Implementation of FastSpeech2 Based on Pytorch](https://github.com/ming024/FastSpeech2) 47 | 48 | ### Paper 49 | - [Tacotron2](https://arxiv.org/abs/1712.05884) 50 | - [Transformer](https://arxiv.org/abs/1706.03762) 51 | - [FastSpeech](https://arxiv.org/abs/1905.09263) 52 | - [FastSpeech2](https://arxiv.org/abs/2006.04558) -------------------------------------------------------------------------------- /alignments.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/alignments.zip -------------------------------------------------------------------------------- /audio/__init__.py: -------------------------------------------------------------------------------- 1 | import audio.hparams_audio 2 | import audio.tools 3 | import audio.stft 4 | import audio.audio_processing 5 | -------------------------------------------------------------------------------- /audio/audio_processing.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/NVIDIA/tacotron2 """ 2 | 3 | import torch 4 | import numpy as np 5 | from scipy.signal import get_window 6 | import librosa.util as librosa_util 7 | 8 | 9 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 10 | n_fft=800, dtype=np.float32, norm=None): 11 | """ 12 | # from librosa 0.6 13 | Compute the sum-square envelope of a window function at a given hop length. 14 | 15 | This is used to estimate modulation effects induced by windowing 16 | observations in short-time Fourier transforms.
17 | 18 | Parameters 19 | ---------- 20 | window : string, tuple, number, callable, or list-like 21 | Window specification, as in `get_window` 22 | 23 | n_frames : int > 0 24 | The number of analysis frames 25 | 26 | hop_length : int > 0 27 | The number of samples to advance between frames 28 | 29 | win_length : [optional] 30 | The length of the window function. By default, this matches `n_fft`. 31 | 32 | n_fft : int > 0 33 | The length of each analysis frame. 34 | 35 | dtype : np.dtype 36 | The data type of the output 37 | 38 | Returns 39 | ------- 40 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 41 | The sum-squared envelope of the window function 42 | """ 43 | if win_length is None: 44 | win_length = n_fft 45 | 46 | n = n_fft + hop_length * (n_frames - 1) 47 | x = np.zeros(n, dtype=dtype) 48 | 49 | # Compute the squared window at the desired length 50 | win_sq = get_window(window, win_length, fftbins=True) 51 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 52 | win_sq = librosa_util.pad_center(win_sq, n_fft) 53 | 54 | # Fill the envelope 55 | for i in range(n_frames): 56 | sample = i * hop_length 57 | x[sample:min(n, sample + n_fft) 58 | ] += win_sq[:max(0, min(n_fft, n - sample))] 59 | return x 60 | 61 | 62 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 63 | """ 64 | PARAMS 65 | ------ 66 | magnitudes: spectrogram magnitudes 67 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 68 | """ 69 | 70 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 71 | angles = angles.astype(np.float32) 72 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 73 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 74 | 75 | for i in range(n_iters): 76 | _, angles = stft_fn.transform(signal) 77 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 78 | return signal 79 | 80 | 81 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 82 | """ 83 | PARAMS 84 | ------ 85 | C: compression factor 86 | """ 87 | return torch.log(torch.clamp(x, min=clip_val) * C) 88 | 89 | 90 | def dynamic_range_decompression(x, C=1): 91 | """ 92 | PARAMS 93 | ------ 94 | C: compression factor used to compress 95 | """ 96 | return torch.exp(x) / C 97 | -------------------------------------------------------------------------------- /audio/hparams_audio.py: -------------------------------------------------------------------------------- 1 | max_wav_value = 32768.0 2 | sampling_rate = 22050 3 | filter_length = 1024 4 | hop_length = 256 5 | win_length = 1024 6 | n_mel_channels = 80 7 | mel_fmin = 0.0 8 | mel_fmax = 8000.0 9 | -------------------------------------------------------------------------------- /audio/stft.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/NVIDIA/tacotron2 """ 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import numpy as np 7 | 8 | from scipy.signal import get_window 9 | from librosa.util import pad_center, tiny 10 | from librosa.filters import mel as librosa_mel_fn 11 | 12 | from audio.audio_processing import dynamic_range_compression 13 | from audio.audio_processing import dynamic_range_decompression 14 | from audio.audio_processing import window_sumsquare 15 | 16 | 17 | class STFT(torch.nn.Module): 18 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 19 | 20 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 21 | window='hann'): 22 | 
super(STFT, self).__init__() 23 | self.filter_length = filter_length 24 | self.hop_length = hop_length 25 | self.win_length = win_length 26 | self.window = window 27 | self.forward_transform = None 28 | scale = self.filter_length / self.hop_length 29 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 30 | 31 | cutoff = int((self.filter_length / 2 + 1)) 32 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 33 | np.imag(fourier_basis[:cutoff, :])]) 34 | 35 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 36 | inverse_basis = torch.FloatTensor( 37 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 38 | 39 | if window is not None: 40 | assert(filter_length >= win_length) 41 | # get window and zero center pad it to filter_length 42 | fft_window = get_window(window, win_length, fftbins=True) 43 | fft_window = pad_center(fft_window, filter_length) 44 | fft_window = torch.from_numpy(fft_window).float() 45 | 46 | # window the bases 47 | forward_basis *= fft_window 48 | inverse_basis *= fft_window 49 | 50 | self.register_buffer('forward_basis', forward_basis.float()) 51 | self.register_buffer('inverse_basis', inverse_basis.float()) 52 | 53 | def transform(self, input_data): 54 | num_batches = input_data.size(0) 55 | num_samples = input_data.size(1) 56 | 57 | self.num_samples = num_samples 58 | 59 | # similar to librosa, reflect-pad the input 60 | input_data = input_data.view(num_batches, 1, num_samples) 61 | input_data = F.pad( 62 | input_data.unsqueeze(1), 63 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 64 | mode='reflect') 65 | input_data = input_data.squeeze(1) 66 | 67 | forward_transform = F.conv1d( 68 | input_data.cpu(), 69 | Variable(self.forward_basis, requires_grad=False).cpu(), 70 | stride=self.hop_length, 71 | padding=0).cpu() 72 | 73 | cutoff = int((self.filter_length / 2) + 1) 74 | real_part = forward_transform[:, :cutoff, :] 75 | imag_part = forward_transform[:, cutoff:, :] 76 | 77 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 78 | phase = torch.autograd.Variable( 79 | torch.atan2(imag_part.data, real_part.data)) 80 | 81 | return magnitude, phase 82 | 83 | def inverse(self, magnitude, phase): 84 | recombine_magnitude_phase = torch.cat( 85 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 86 | 87 | inverse_transform = F.conv_transpose1d( 88 | recombine_magnitude_phase, 89 | Variable(self.inverse_basis, requires_grad=False), 90 | stride=self.hop_length, 91 | padding=0) 92 | 93 | if self.window is not None: 94 | window_sum = window_sumsquare( 95 | self.window, magnitude.size(-1), hop_length=self.hop_length, 96 | win_length=self.win_length, n_fft=self.filter_length, 97 | dtype=np.float32) 98 | # remove modulation effects 99 | approx_nonzero_indices = torch.from_numpy( 100 | np.where(window_sum > tiny(window_sum))[0]) 101 | window_sum = torch.autograd.Variable( 102 | torch.from_numpy(window_sum), requires_grad=False) 103 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 104 | inverse_transform[:, :, 105 | approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 106 | 107 | # scale by hop ratio 108 | inverse_transform *= float(self.filter_length) / self.hop_length 109 | 110 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 111 | inverse_transform = inverse_transform[:, 112 | :, :-int(self.filter_length/2):] 113 | 114 | return inverse_transform 115 | 116 | def forward(self, input_data): 117 | self.magnitude, self.phase = self.transform(input_data) 118 | 
reconstruction = self.inverse(self.magnitude, self.phase) 119 | return reconstruction 120 | 121 | 122 | class TacotronSTFT(torch.nn.Module): 123 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 124 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 125 | mel_fmax=8000.0): 126 | super(TacotronSTFT, self).__init__() 127 | self.n_mel_channels = n_mel_channels 128 | self.sampling_rate = sampling_rate 129 | self.stft_fn = STFT(filter_length, hop_length, win_length) 130 | mel_basis = librosa_mel_fn( 131 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 132 | mel_basis = torch.from_numpy(mel_basis).float() 133 | self.register_buffer('mel_basis', mel_basis) 134 | 135 | def spectral_normalize(self, magnitudes): 136 | output = dynamic_range_compression(magnitudes) 137 | return output 138 | 139 | def spectral_de_normalize(self, magnitudes): 140 | output = dynamic_range_decompression(magnitudes) 141 | return output 142 | 143 | def mel_spectrogram(self, y): 144 | """Computes mel-spectrograms from a batch of waves 145 | PARAMS 146 | ------ 147 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 148 | 149 | RETURNS 150 | ------- 151 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 152 | """ 153 | assert(torch.min(y.data) >= -1) 154 | assert(torch.max(y.data) <= 1) 155 | 156 | magnitudes, phases = self.stft_fn.transform(y) 157 | magnitudes = magnitudes.data 158 | mel_output = torch.matmul(self.mel_basis, magnitudes) 159 | mel_output = self.spectral_normalize(mel_output) 160 | return mel_output 161 | -------------------------------------------------------------------------------- /audio/tools.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/NVIDIA/tacotron2 """ 2 | 3 | import torch 4 | import numpy as np 5 | from scipy.io.wavfile import read 6 | from scipy.io.wavfile import write 7 | 8 | import audio.stft as stft 9 | import audio.hparams_audio as hparams 10 | from audio.audio_processing import griffin_lim 11 | 12 | _stft = stft.TacotronSTFT( 13 | hparams.filter_length, hparams.hop_length, hparams.win_length, 14 | hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, 15 | hparams.mel_fmax) 16 | 17 | 18 | def load_wav_to_torch(full_path): 19 | sampling_rate, data = read(full_path) 20 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 21 | 22 | 23 | def get_mel(filename): 24 | audio, sampling_rate = load_wav_to_torch(filename) 25 | if sampling_rate != _stft.sampling_rate: 26 | raise ValueError("{} {} SR doesn't match target {} SR".format( 27 | sampling_rate, _stft.sampling_rate)) 28 | audio_norm = audio / hparams.max_wav_value 29 | audio_norm = audio_norm.unsqueeze(0) 30 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 31 | melspec = _stft.mel_spectrogram(audio_norm) 32 | melspec = torch.squeeze(melspec, 0) 33 | # melspec = torch.from_numpy(_normalize(melspec.numpy())) 34 | 35 | return melspec 36 | 37 | 38 | def get_mel_from_wav(audio): 39 | sampling_rate = hparams.sampling_rate 40 | if sampling_rate != _stft.sampling_rate: 41 | raise ValueError("{} {} SR doesn't match target {} SR".format( 42 | sampling_rate, _stft.sampling_rate)) 43 | audio_norm = audio / hparams.max_wav_value 44 | audio_norm = audio_norm.unsqueeze(0) 45 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 46 | melspec = _stft.mel_spectrogram(audio_norm) 47 | melspec = torch.squeeze(melspec, 0) 48 | 49 | return melspec 50 | 51 | 
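# Usage sketch (illustrative; the wav path below is hypothetical): both helpers above
# return a FloatTensor of shape (n_mel_channels, T), i.e. (80, T) with the values in
# audio/hparams_audio.py; data/ljspeech.py saves the transpose of this as the training targets.
#
#     mel = get_mel("data/LJSpeech-1.1/wavs/LJ001-0001.wav")            # (80, T)
#     np.save("mels/example-mel.npy", mel.numpy().T, allow_pickle=False)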
52 | def inv_mel_spec(mel, out_filename, griffin_iters=60): 53 | mel = torch.stack([mel]) 54 | # mel = torch.stack([torch.from_numpy(_denormalize(mel.numpy()))]) 55 | mel_decompress = _stft.spectral_de_normalize(mel) 56 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 57 | spec_from_mel_scaling = 1000 58 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 59 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 60 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 61 | 62 | audio = griffin_lim(torch.autograd.Variable( 63 | spec_from_mel[:, :, :-1]), _stft.stft_fn, griffin_iters) 64 | 65 | audio = audio.squeeze() 66 | audio = audio.cpu().numpy() 67 | audio_path = out_filename 68 | write(audio_path, hparams.sampling_rate, audio) 69 | -------------------------------------------------------------------------------- /data/ljspeech.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import audio 4 | 5 | from tqdm import tqdm 6 | from functools import partial 7 | from concurrent.futures import ProcessPoolExecutor 8 | 9 | 10 | def build_from_path(in_dir, out_dir): 11 | index = 1 12 | # executor = ProcessPoolExecutor(max_workers=4) 13 | # futures = [] 14 | texts = [] 15 | 16 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 17 | for line in f.readlines(): 18 | if index % 100 == 0: 19 | print("{:d} Done".format(index)) 20 | parts = line.strip().split('|') 21 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 22 | text = parts[2] 23 | # futures.append(executor.submit( 24 | # partial(_process_utterance, out_dir, index, wav_path, text))) 25 | texts.append(_process_utterance(out_dir, index, wav_path, text)) 26 | 27 | index = index + 1 28 | 29 | # return [future.result() for future in tqdm(futures)] 30 | return texts 31 | 32 | 33 | def _process_utterance(out_dir, index, wav_path, text): 34 | # Compute a mel-scale spectrogram from the wav: 35 | mel_spectrogram = audio.tools.get_mel(wav_path).numpy().astype(np.float32) 36 | 37 | # Write the spectrograms to disk: 38 | mel_filename = 'ljspeech-mel-%05d.npy' % index 39 | np.save(os.path.join(out_dir, mel_filename), 40 | mel_spectrogram.T, allow_pickle=False) 41 | 42 | return text 43 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from torch.utils.data import Dataset, DataLoader 4 | 5 | import numpy as np 6 | import math 7 | import time 8 | import os 9 | 10 | import hparams 11 | import audio 12 | 13 | from utils import process_text, pad_1D, pad_2D 14 | from utils import pad_1D_tensor, pad_2D_tensor 15 | from text import text_to_sequence 16 | from tqdm import tqdm 17 | 18 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 19 | 20 | 21 | def get_data_to_buffer(): 22 | buffer = list() 23 | text = process_text(os.path.join("data", "train.txt")) 24 | 25 | start = time.perf_counter() 26 | for i in tqdm(range(len(text))): 27 | 28 | mel_gt_name = os.path.join( 29 | hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (i+1)) 30 | mel_gt_target = np.load(mel_gt_name) 31 | duration = np.load(os.path.join( 32 | hparams.alignment_path, str(i)+".npy")) 33 | character = text[i][0:len(text[i])-1] 34 | character = np.array( 35 | text_to_sequence(character, hparams.text_cleaners)) 36 | 37 | character = 
torch.from_numpy(character) 38 | duration = torch.from_numpy(duration) 39 | mel_gt_target = torch.from_numpy(mel_gt_target) 40 | 41 | buffer.append({"text": character, "duration": duration, 42 | "mel_target": mel_gt_target}) 43 | 44 | end = time.perf_counter() 45 | print("cost {:.2f}s to load all data into buffer.".format(end-start)) 46 | 47 | return buffer 48 | 49 | 50 | class BufferDataset(Dataset): 51 | def __init__(self, buffer): 52 | self.buffer = buffer 53 | self.length_dataset = len(self.buffer) 54 | 55 | def __len__(self): 56 | return self.length_dataset 57 | 58 | def __getitem__(self, idx): 59 | return self.buffer[idx] 60 | 61 | 62 | def reprocess_tensor(batch, cut_list): 63 | texts = [batch[ind]["text"] for ind in cut_list] 64 | mel_targets = [batch[ind]["mel_target"] for ind in cut_list] 65 | durations = [batch[ind]["duration"] for ind in cut_list] 66 | 67 | length_text = np.array([]) 68 | for text in texts: 69 | length_text = np.append(length_text, text.size(0)) 70 | 71 | src_pos = list() 72 | max_len = int(max(length_text)) 73 | for length_src_row in length_text: 74 | src_pos.append(np.pad([i+1 for i in range(int(length_src_row))], 75 | (0, max_len-int(length_src_row)), 'constant')) 76 | src_pos = torch.from_numpy(np.array(src_pos)) 77 | 78 | length_mel = np.array(list()) 79 | for mel in mel_targets: 80 | length_mel = np.append(length_mel, mel.size(0)) 81 | 82 | mel_pos = list() 83 | max_mel_len = int(max(length_mel)) 84 | for length_mel_row in length_mel: 85 | mel_pos.append(np.pad([i+1 for i in range(int(length_mel_row))], 86 | (0, max_mel_len-int(length_mel_row)), 'constant')) 87 | mel_pos = torch.from_numpy(np.array(mel_pos)) 88 | 89 | texts = pad_1D_tensor(texts) 90 | durations = pad_1D_tensor(durations) 91 | mel_targets = pad_2D_tensor(mel_targets) 92 | 93 | out = {"text": texts, 94 | "mel_target": mel_targets, 95 | "duration": durations, 96 | "mel_pos": mel_pos, 97 | "src_pos": src_pos, 98 | "mel_max_len": max_mel_len} 99 | 100 | return out 101 | 102 | 103 | def collate_fn_tensor(batch): 104 | len_arr = np.array([d["text"].size(0) for d in batch]) 105 | index_arr = np.argsort(-len_arr) 106 | batchsize = len(batch) 107 | real_batchsize = batchsize // hparams.batch_expand_size 108 | 109 | cut_list = list() 110 | for i in range(hparams.batch_expand_size): 111 | cut_list.append(index_arr[i*real_batchsize:(i+1)*real_batchsize]) 112 | 113 | output = list() 114 | for i in range(hparams.batch_expand_size): 115 | output.append(reprocess_tensor(batch, cut_list[i])) 116 | 117 | return output 118 | 119 | 120 | if __name__ == "__main__": 121 | # TEST 122 | get_data_to_buffer() 123 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import argparse 4 | import numpy as np 5 | import random 6 | import time 7 | import shutil 8 | import os 9 | 10 | import hparams as hp 11 | import audio 12 | import utils 13 | import dataset 14 | import text 15 | import model as M 16 | import waveglow 17 | 18 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 19 | 20 | 21 | def get_DNN(num): 22 | checkpoint_path = "checkpoint_" + str(num) + ".pth.tar" 23 | model = nn.DataParallel(M.FastSpeech()).to(device) 24 | model.load_state_dict(torch.load(os.path.join(hp.checkpoint_path, 25 | checkpoint_path))['model']) 26 | model.eval() 27 | return model 28 | 29 | 30 | def synthesis(model, text, alpha=1.0): 31 | text = 
np.array(phn) 32 | text = np.stack([text]) 33 | src_pos = np.array([i+1 for i in range(text.shape[1])]) 34 | src_pos = np.stack([src_pos]) 35 | sequence = torch.from_numpy(text).cuda().long() 36 | src_pos = torch.from_numpy(src_pos).cuda().long() 37 | 38 | with torch.no_grad(): 39 | _, mel = model.module.forward(sequence, src_pos, alpha=alpha) 40 | return mel[0].cpu().transpose(0, 1), mel.contiguous().transpose(1, 2) 41 | 42 | 43 | def get_data(): 44 | test1 = "I am very happy to see you again!" 45 | test2 = "Durian model is a very good speech synthesis!" 46 | test3 = "When I was twenty, I fell in love with a girl." 47 | test4 = "I remove attention module in decoder and use average pooling to implement predicting r frames at once" 48 | test5 = "You can not improve your past, but you can improve your future. Once time is wasted, life is wasted." 49 | test6 = "Death comes to all, but great achievements raise a monument which shall endure until the sun grows old." 50 | data_list = list() 51 | data_list.append(text.text_to_sequence(test1, hp.text_cleaners)) 52 | data_list.append(text.text_to_sequence(test2, hp.text_cleaners)) 53 | data_list.append(text.text_to_sequence(test3, hp.text_cleaners)) 54 | data_list.append(text.text_to_sequence(test4, hp.text_cleaners)) 55 | data_list.append(text.text_to_sequence(test5, hp.text_cleaners)) 56 | data_list.append(text.text_to_sequence(test6, hp.text_cleaners)) 57 | return data_list 58 | 59 | 60 | if __name__ == "__main__": 61 | # Test 62 | WaveGlow = utils.get_WaveGlow() 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--step', type=int, default=0) 65 | parser.add_argument("--alpha", type=float, default=1.0) 66 | args = parser.parse_args() 67 | 68 | print("use griffin-lim and waveglow") 69 | model = get_DNN(args.step) 70 | data_list = get_data() 71 | for i, phn in enumerate(data_list): 72 | mel, mel_cuda = synthesis(model, phn, args.alpha) 73 | if not os.path.exists("results"): 74 | os.mkdir("results") 75 | audio.tools.inv_mel_spec( 76 | mel, "results/"+str(args.step)+"_"+str(i)+".wav") 77 | waveglow.inference.inference( 78 | mel_cuda, WaveGlow, 79 | "results/"+str(args.step)+"_"+str(i)+"_waveglow.wav") 80 | print("Done", i + 1) 81 | 82 | s_t = time.perf_counter() 83 | for i in range(100): 84 | for _, phn in enumerate(data_list): 85 | _, _, = synthesis(model, phn, args.alpha) 86 | print(i) 87 | e_t = time.perf_counter() 88 | print((e_t - s_t) / 100.) 89 | -------------------------------------------------------------------------------- /glow.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import copy 28 | import torch 29 | from torch.autograd import Variable 30 | import torch.nn.functional as F 31 | torch.nn.Module.dump_patches = True 32 | 33 | 34 | @torch.jit.script 35 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 36 | n_channels_int = n_channels[0] 37 | in_act = input_a+input_b 38 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 39 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 40 | acts = t_act * s_act 41 | return acts 42 | 43 | 44 | class WaveGlowLoss(torch.nn.Module): 45 | def __init__(self, sigma=1.0): 46 | super(WaveGlowLoss, self).__init__() 47 | self.sigma = sigma 48 | 49 | def forward(self, model_output): 50 | z, log_s_list, log_det_W_list = model_output 51 | for i, log_s in enumerate(log_s_list): 52 | if i == 0: 53 | log_s_total = torch.sum(log_s) 54 | log_det_W_total = log_det_W_list[i] 55 | else: 56 | log_s_total = log_s_total + torch.sum(log_s) 57 | log_det_W_total += log_det_W_list[i] 58 | 59 | loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - \ 60 | log_s_total - log_det_W_total 61 | return loss/(z.size(0)*z.size(1)*z.size(2)) 62 | 63 | 64 | class Invertible1x1Conv(torch.nn.Module): 65 | """ 66 | The layer outputs both the convolution, and the log determinant 67 | of its weight matrix. If reverse=True it does convolution with 68 | inverse 69 | """ 70 | 71 | def __init__(self, c): 72 | super(Invertible1x1Conv, self).__init__() 73 | self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, 74 | bias=False) 75 | 76 | # Sample a random orthonormal matrix to initialize weights 77 | W = torch.qr(torch.FloatTensor(c, c).normal_())[0] 78 | 79 | # Ensure determinant is 1.0 not -1.0 80 | if torch.det(W) < 0: 81 | W[:, 0] = -1*W[:, 0] 82 | W = W.view(c, c, 1) 83 | self.conv.weight.data = W 84 | 85 | def forward(self, z, reverse=False): 86 | # shape 87 | batch_size, group_size, n_of_groups = z.size() 88 | 89 | W = self.conv.weight.squeeze() 90 | 91 | if reverse: 92 | if not hasattr(self, 'W_inverse'): 93 | # Reverse computation 94 | W_inverse = W.inverse() 95 | W_inverse = Variable(W_inverse[..., None]) 96 | if z.type() == 'torch.cuda.HalfTensor': 97 | W_inverse = W_inverse.half() 98 | self.W_inverse = W_inverse 99 | z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) 100 | return z 101 | else: 102 | # Forward computation 103 | log_det_W = batch_size * n_of_groups * torch.logdet(W) 104 | z = self.conv(z) 105 | return z, log_det_W 106 | 107 | 108 | class WN(torch.nn.Module): 109 | """ 110 | This is the WaveNet like layer for the affine coupling. 
The primary difference 111 | from WaveNet is the convolutions need not be causal. There is also no dilation 112 | size reset. The dilation only doubles on each layer 113 | """ 114 | 115 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 116 | kernel_size): 117 | super(WN, self).__init__() 118 | assert(kernel_size % 2 == 1) 119 | assert(n_channels % 2 == 0) 120 | self.n_layers = n_layers 121 | self.n_channels = n_channels 122 | self.in_layers = torch.nn.ModuleList() 123 | self.res_skip_layers = torch.nn.ModuleList() 124 | self.cond_layers = torch.nn.ModuleList() 125 | 126 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 127 | start = torch.nn.utils.weight_norm(start, name='weight') 128 | self.start = start 129 | 130 | # Initializing last layer to 0 makes the affine coupling layers 131 | # do nothing at first. This helps with training stability 132 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 133 | end.weight.data.zero_() 134 | end.bias.data.zero_() 135 | self.end = end 136 | 137 | for i in range(n_layers): 138 | dilation = 2 ** i 139 | padding = int((kernel_size*dilation - dilation)/2) 140 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 141 | dilation=dilation, padding=padding) 142 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 143 | self.in_layers.append(in_layer) 144 | 145 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) 146 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 147 | self.cond_layers.append(cond_layer) 148 | 149 | # last one is not necessary 150 | if i < n_layers - 1: 151 | res_skip_channels = 2*n_channels 152 | else: 153 | res_skip_channels = n_channels 154 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 155 | res_skip_layer = torch.nn.utils.weight_norm( 156 | res_skip_layer, name='weight') 157 | self.res_skip_layers.append(res_skip_layer) 158 | 159 | def forward(self, forward_input): 160 | audio, spect = forward_input 161 | audio = self.start(audio) 162 | 163 | for i in range(self.n_layers): 164 | acts = fused_add_tanh_sigmoid_multiply( 165 | self.in_layers[i](audio), 166 | self.cond_layers[i](spect), 167 | torch.IntTensor([self.n_channels])) 168 | 169 | res_skip_acts = self.res_skip_layers[i](acts) 170 | if i < self.n_layers - 1: 171 | audio = res_skip_acts[:, :self.n_channels, :] + audio 172 | skip_acts = res_skip_acts[:, self.n_channels:, :] 173 | else: 174 | skip_acts = res_skip_acts 175 | 176 | if i == 0: 177 | output = skip_acts 178 | else: 179 | output = skip_acts + output 180 | return self.end(output) 181 | 182 | 183 | class WaveGlow(torch.nn.Module): 184 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 185 | n_early_size, WN_config): 186 | super(WaveGlow, self).__init__() 187 | 188 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 189 | n_mel_channels, 190 | 1024, stride=256) 191 | assert(n_group % 2 == 0) 192 | self.n_flows = n_flows 193 | self.n_group = n_group 194 | self.n_early_every = n_early_every 195 | self.n_early_size = n_early_size 196 | self.WN = torch.nn.ModuleList() 197 | self.convinv = torch.nn.ModuleList() 198 | 199 | n_half = int(n_group/2) 200 | 201 | # Set up layers with the right sizes based on how many dimensions 202 | # have been output already 203 | n_remaining_channels = n_group 204 | for k in range(n_flows): 205 | if k % self.n_early_every == 0 and k > 0: 206 | n_half = n_half - int(self.n_early_size/2) 207 | n_remaining_channels = n_remaining_channels - self.n_early_size 208 | 
self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 209 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 210 | self.n_remaining_channels = n_remaining_channels # Useful during inference 211 | 212 | def forward(self, forward_input): 213 | """ 214 | forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames 215 | forward_input[1] = audio: batch x time 216 | """ 217 | spect, audio = forward_input 218 | 219 | # Upsample spectrogram to size of audio 220 | spect = self.upsample(spect) 221 | assert(spect.size(2) >= audio.size(1)) 222 | if spect.size(2) > audio.size(1): 223 | spect = spect[:, :, :audio.size(1)] 224 | 225 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 226 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 227 | 228 | audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) 229 | output_audio = [] 230 | log_s_list = [] 231 | log_det_W_list = [] 232 | 233 | for k in range(self.n_flows): 234 | if k % self.n_early_every == 0 and k > 0: 235 | output_audio.append(audio[:, :self.n_early_size, :]) 236 | audio = audio[:, self.n_early_size:, :] 237 | 238 | audio, log_det_W = self.convinv[k](audio) 239 | log_det_W_list.append(log_det_W) 240 | 241 | n_half = int(audio.size(1)/2) 242 | audio_0 = audio[:, :n_half, :] 243 | audio_1 = audio[:, n_half:, :] 244 | 245 | output = self.WN[k]((audio_0, spect)) 246 | log_s = output[:, n_half:, :] 247 | b = output[:, :n_half, :] 248 | audio_1 = torch.exp(log_s)*audio_1 + b 249 | log_s_list.append(log_s) 250 | 251 | audio = torch.cat([audio_0, audio_1], 1) 252 | 253 | output_audio.append(audio) 254 | return torch.cat(output_audio, 1), log_s_list, log_det_W_list 255 | 256 | def infer(self, spect, sigma=1.0): 257 | spect = self.upsample(spect) 258 | # trim conv artifacts. 
maybe pad spec to kernel multiple 259 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 260 | spect = spect[:, :, :-time_cutoff] 261 | 262 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 263 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 264 | 265 | if spect.type() == 'torch.cuda.HalfTensor': 266 | audio = torch.cuda.HalfTensor(spect.size(0), 267 | self.n_remaining_channels, 268 | spect.size(2)).normal_() 269 | else: 270 | audio = torch.cuda.FloatTensor(spect.size(0), 271 | self.n_remaining_channels, 272 | spect.size(2)).normal_() 273 | 274 | audio = torch.autograd.Variable(sigma*audio) 275 | 276 | for k in reversed(range(self.n_flows)): 277 | n_half = int(audio.size(1)/2) 278 | audio_0 = audio[:, :n_half, :] 279 | audio_1 = audio[:, n_half:, :] 280 | 281 | output = self.WN[k]((audio_0, spect)) 282 | s = output[:, n_half:, :] 283 | b = output[:, :n_half, :] 284 | audio_1 = (audio_1 - b)/torch.exp(s) 285 | audio = torch.cat([audio_0, audio_1], 1) 286 | 287 | audio = self.convinv[k](audio, reverse=True) 288 | 289 | if k % self.n_early_every == 0 and k > 0: 290 | if spect.type() == 'torch.cuda.HalfTensor': 291 | z = torch.cuda.HalfTensor(spect.size( 292 | 0), self.n_early_size, spect.size(2)).normal_() 293 | else: 294 | z = torch.cuda.FloatTensor(spect.size( 295 | 0), self.n_early_size, spect.size(2)).normal_() 296 | audio = torch.cat((sigma*z, audio), 1) 297 | 298 | audio = audio.permute(0, 2, 1).contiguous().view( 299 | audio.size(0), -1).data 300 | return audio 301 | 302 | @staticmethod 303 | def remove_weightnorm(model): 304 | waveglow = model 305 | for WN in waveglow.WN: 306 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 307 | WN.in_layers = remove(WN.in_layers) 308 | WN.cond_layers = remove(WN.cond_layers) 309 | WN.res_skip_layers = remove(WN.res_skip_layers) 310 | return waveglow 311 | 312 | 313 | def remove(conv_list): 314 | new_conv_list = torch.nn.ModuleList() 315 | for old_conv in conv_list: 316 | old_conv = torch.nn.utils.remove_weight_norm(old_conv) 317 | new_conv_list.append(old_conv) 318 | return new_conv_list 319 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | # Mel 2 | num_mels = 80 3 | text_cleaners = ['english_cleaners'] 4 | 5 | # FastSpeech 6 | vocab_size = 300 7 | max_seq_len = 3000 8 | 9 | encoder_dim = 256 10 | encoder_n_layer = 4 11 | encoder_head = 2 12 | encoder_conv1d_filter_size = 1024 13 | 14 | decoder_dim = 256 15 | decoder_n_layer = 4 16 | decoder_head = 2 17 | decoder_conv1d_filter_size = 1024 18 | 19 | fft_conv1d_kernel = (9, 1) 20 | fft_conv1d_padding = (4, 0) 21 | 22 | duration_predictor_filter_size = 256 23 | duration_predictor_kernel_size = 3 24 | dropout = 0.1 25 | 26 | # Train 27 | checkpoint_path = "./model_new" 28 | logger_path = "./logger" 29 | mel_ground_truth = "./mels" 30 | alignment_path = "./alignments" 31 | 32 | batch_size = 32 33 | epochs = 2000 34 | n_warm_up_step = 4000 35 | 36 | learning_rate = 1e-3 37 | weight_decay = 1e-6 38 | grad_clip_thresh = 1.0 39 | decay_step = [500000, 1000000, 2000000] 40 | 41 | save_step = 3000 42 | log_step = 5 43 | clear_Time = 20 44 | 45 | batch_expand_size = 32 46 | -------------------------------------------------------------------------------- /img/fastspeech_structure.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/img/fastspeech_structure.png -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class DNNLoss(nn.Module): 6 | def __init__(self): 7 | super(DNNLoss, self).__init__() 8 | self.mse_loss = nn.MSELoss() 9 | self.l1_loss = nn.L1Loss() 10 | 11 | def forward(self, mel, mel_postnet, duration_predicted, mel_target, duration_predictor_target): 12 | mel_target.requires_grad = False 13 | mel_loss = self.mse_loss(mel, mel_target) 14 | mel_postnet_loss = self.mse_loss(mel_postnet, mel_target) 15 | 16 | duration_predictor_target.requires_grad = False 17 | duration_predictor_loss = self.l1_loss(duration_predicted, 18 | duration_predictor_target.float()) 19 | 20 | return mel_loss, mel_postnet_loss, duration_predictor_loss 21 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import hparams as hp 4 | import utils 5 | 6 | from transformer.Models import Encoder, Decoder 7 | from transformer.Layers import Linear, PostNet 8 | from modules import LengthRegulator, CBHG 9 | 10 | 11 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 12 | 13 | 14 | class FastSpeech(nn.Module): 15 | """ FastSpeech """ 16 | 17 | def __init__(self): 18 | super(FastSpeech, self).__init__() 19 | 20 | self.encoder = Encoder() 21 | self.length_regulator = LengthRegulator() 22 | self.decoder = Decoder() 23 | 24 | self.mel_linear = Linear(hp.decoder_dim, hp.num_mels) 25 | self.postnet = CBHG(hp.num_mels, K=8, 26 | projections=[256, hp.num_mels]) 27 | self.last_linear = Linear(hp.num_mels * 2, hp.num_mels) 28 | 29 | def mask_tensor(self, mel_output, position, mel_max_length): 30 | lengths = torch.max(position, -1)[0] 31 | mask = ~utils.get_mask_from_lengths(lengths, max_len=mel_max_length) 32 | mask = mask.unsqueeze(-1).expand(-1, -1, mel_output.size(-1)) 33 | return mel_output.masked_fill(mask, 0.) 
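    # During training, forward() expands the encoder output with the ground-truth
    # durations (length_target) and also returns the duration predictor's output for
    # the duration loss; at inference it expands with the predicted durations, scaled
    # by `alpha` to control the speaking rate.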
34 | 35 | def forward(self, src_seq, src_pos, mel_pos=None, mel_max_length=None, length_target=None, alpha=1.0): 36 | encoder_output, _ = self.encoder(src_seq, src_pos) 37 | 38 | if self.training: 39 | length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, 40 | target=length_target, 41 | alpha=alpha, 42 | mel_max_length=mel_max_length) 43 | decoder_output = self.decoder(length_regulator_output, mel_pos) 44 | 45 | mel_output = self.mel_linear(decoder_output) 46 | mel_output = self.mask_tensor(mel_output, mel_pos, mel_max_length) 47 | residual = self.postnet(mel_output) 48 | residual = self.last_linear(residual) 49 | mel_postnet_output = mel_output + residual 50 | mel_postnet_output = self.mask_tensor(mel_postnet_output, 51 | mel_pos, 52 | mel_max_length) 53 | 54 | return mel_output, mel_postnet_output, duration_predictor_output 55 | else: 56 | length_regulator_output, decoder_pos = self.length_regulator(encoder_output, 57 | alpha=alpha) 58 | 59 | decoder_output = self.decoder(length_regulator_output, decoder_pos) 60 | 61 | mel_output = self.mel_linear(decoder_output) 62 | residual = self.postnet(mel_output) 63 | residual = self.last_linear(residual) 64 | mel_postnet_output = mel_output + residual 65 | 66 | return mel_output, mel_postnet_output 67 | 68 | 69 | if __name__ == "__main__": 70 | # Test 71 | model = FastSpeech() 72 | print(sum(param.numel() for param in model.parameters())) 73 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from collections import OrderedDict 6 | from numba import jit 7 | import numpy as np 8 | import copy 9 | import math 10 | 11 | import hparams as hp 12 | import utils 13 | 14 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 15 | 16 | 17 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 18 | ''' Sinusoid position encoding table ''' 19 | 20 | def cal_angle(position, hid_idx): 21 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 22 | 23 | def get_posi_angle_vec(position): 24 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 25 | 26 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) 27 | for pos_i in range(n_position)]) 28 | 29 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 30 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 31 | 32 | if padding_idx is not None: 33 | # zero vector for padding dimension 34 | sinusoid_table[padding_idx] = 0. 
35 | 36 | return torch.FloatTensor(sinusoid_table) 37 | 38 | 39 | def clones(module, N): 40 | return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) 41 | 42 | 43 | # @jit(nopython=True) 44 | def create_alignment(base_mat, duration_predictor_output): 45 | N, L = duration_predictor_output.shape 46 | for i in range(N): 47 | count = 0 48 | for j in range(L): 49 | for k in range(duration_predictor_output[i][j]): 50 | base_mat[i][count+k][j] = 1 51 | count = count + duration_predictor_output[i][j] 52 | return base_mat 53 | 54 | 55 | class LengthRegulator(nn.Module): 56 | """ Length Regulator """ 57 | 58 | def __init__(self): 59 | super(LengthRegulator, self).__init__() 60 | self.duration_predictor = DurationPredictor() 61 | 62 | def LR(self, x, duration_predictor_output, mel_max_length=None): 63 | expand_max_len = torch.max( 64 | torch.sum(duration_predictor_output, -1), -1)[0] 65 | alignment = torch.zeros(duration_predictor_output.size(0), 66 | expand_max_len, 67 | duration_predictor_output.size(1)).numpy() 68 | alignment = create_alignment(alignment, 69 | duration_predictor_output.cpu().numpy()) 70 | alignment = torch.from_numpy(alignment).to(device) 71 | 72 | output = alignment @ x 73 | if mel_max_length: 74 | output = F.pad( 75 | output, (0, 0, 0, mel_max_length-output.size(1), 0, 0)) 76 | return output 77 | 78 | def forward(self, x, alpha=1.0, target=None, mel_max_length=None): 79 | duration_predictor_output = self.duration_predictor(x) 80 | 81 | if target is not None: 82 | output = self.LR(x, target, mel_max_length=mel_max_length) 83 | return output, duration_predictor_output 84 | else: 85 | duration_predictor_output = ( 86 | (duration_predictor_output + 0.5) * alpha).int() 87 | output = self.LR(x, duration_predictor_output) 88 | mel_pos = torch.stack( 89 | [torch.Tensor([i+1 for i in range(output.size(1))])]).long().to(device) 90 | 91 | return output, mel_pos 92 | 93 | 94 | class DurationPredictor(nn.Module): 95 | """ Duration Predictor """ 96 | 97 | def __init__(self): 98 | super(DurationPredictor, self).__init__() 99 | 100 | self.input_size = hp.encoder_dim 101 | self.filter_size = hp.duration_predictor_filter_size 102 | self.kernel = hp.duration_predictor_kernel_size 103 | self.conv_output_size = hp.duration_predictor_filter_size 104 | self.dropout = hp.dropout 105 | 106 | self.conv_layer = nn.Sequential(OrderedDict([ 107 | ("conv1d_1", Conv(self.input_size, 108 | self.filter_size, 109 | kernel_size=self.kernel, 110 | padding=1)), 111 | ("layer_norm_1", nn.LayerNorm(self.filter_size)), 112 | ("relu_1", nn.ReLU()), 113 | ("dropout_1", nn.Dropout(self.dropout)), 114 | ("conv1d_2", Conv(self.filter_size, 115 | self.filter_size, 116 | kernel_size=self.kernel, 117 | padding=1)), 118 | ("layer_norm_2", nn.LayerNorm(self.filter_size)), 119 | ("relu_2", nn.ReLU()), 120 | ("dropout_2", nn.Dropout(self.dropout)) 121 | ])) 122 | 123 | self.linear_layer = Linear(self.conv_output_size, 1) 124 | self.relu = nn.ReLU() 125 | 126 | def forward(self, encoder_output): 127 | out = self.conv_layer(encoder_output) 128 | out = self.linear_layer(out) 129 | out = self.relu(out) 130 | out = out.squeeze() 131 | if not self.training: 132 | out = out.unsqueeze(0) 133 | return out 134 | 135 | 136 | class BatchNormConv1d(nn.Module): 137 | def __init__(self, in_dim, out_dim, kernel_size, stride, padding, 138 | activation=None, w_init_gain='linear'): 139 | super(BatchNormConv1d, self).__init__() 140 | self.conv1d = nn.Conv1d(in_dim, out_dim, 141 | kernel_size=kernel_size, 142 | stride=stride, padding=padding, 
bias=False) 143 | self.bn = nn.BatchNorm1d(out_dim) 144 | self.activation = activation 145 | 146 | torch.nn.init.xavier_uniform_( 147 | self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 148 | 149 | def forward(self, x): 150 | x = self.conv1d(x) 151 | if self.activation is not None: 152 | x = self.activation(x) 153 | return self.bn(x) 154 | 155 | 156 | class Conv(nn.Module): 157 | """ 158 | Convolution Module 159 | """ 160 | 161 | def __init__(self, 162 | in_channels, 163 | out_channels, 164 | kernel_size=1, 165 | stride=1, 166 | padding=0, 167 | dilation=1, 168 | bias=True, 169 | w_init='linear'): 170 | """ 171 | :param in_channels: dimension of input 172 | :param out_channels: dimension of output 173 | :param kernel_size: size of kernel 174 | :param stride: size of stride 175 | :param padding: size of padding 176 | :param dilation: dilation rate 177 | :param bias: boolean. if True, bias is included. 178 | :param w_init: str. weight inits with xavier initialization. 179 | """ 180 | super(Conv, self).__init__() 181 | 182 | self.conv = nn.Conv1d(in_channels, 183 | out_channels, 184 | kernel_size=kernel_size, 185 | stride=stride, 186 | padding=padding, 187 | dilation=dilation, 188 | bias=bias) 189 | 190 | nn.init.xavier_uniform_( 191 | self.conv.weight, gain=nn.init.calculate_gain(w_init)) 192 | 193 | def forward(self, x): 194 | x = x.contiguous().transpose(1, 2) 195 | x = self.conv(x) 196 | x = x.contiguous().transpose(1, 2) 197 | 198 | return x 199 | 200 | 201 | class Linear(nn.Module): 202 | """ 203 | Linear Module 204 | """ 205 | 206 | def __init__(self, in_dim, out_dim, bias=True, w_init='linear'): 207 | """ 208 | :param in_dim: dimension of input 209 | :param out_dim: dimension of output 210 | :param bias: boolean. if True, bias is included. 211 | :param w_init: str. weight inits with xavier initialization. 
212 | """ 213 | super(Linear, self).__init__() 214 | self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias) 215 | 216 | nn.init.xavier_uniform_( 217 | self.linear_layer.weight, 218 | gain=nn.init.calculate_gain(w_init)) 219 | 220 | def forward(self, x): 221 | return self.linear_layer(x) 222 | 223 | 224 | class Highway(nn.Module): 225 | def __init__(self, in_size, out_size): 226 | super(Highway, self).__init__() 227 | self.H = nn.Linear(in_size, out_size) 228 | self.H.bias.data.zero_() 229 | self.T = nn.Linear(in_size, out_size) 230 | self.T.bias.data.fill_(-1) 231 | self.relu = nn.ReLU() 232 | self.sigmoid = nn.Sigmoid() 233 | 234 | def forward(self, inputs): 235 | H = self.relu(self.H(inputs)) 236 | T = self.sigmoid(self.T(inputs)) 237 | return H * T + inputs * (1.0 - T) 238 | 239 | 240 | class Prenet(nn.Module): 241 | """ 242 | Prenet before passing through the network 243 | """ 244 | 245 | def __init__(self, input_size, hidden_size, output_size): 246 | super(Prenet, self).__init__() 247 | self.input_size = input_size 248 | self.output_size = output_size 249 | self.hidden_size = hidden_size 250 | self.layer = nn.Sequential(OrderedDict([ 251 | ('fc1', Linear(self.input_size, self.hidden_size)), 252 | ('relu1', nn.ReLU()), 253 | ('dropout1', nn.Dropout(0.5)), 254 | ('fc2', Linear(self.hidden_size, self.output_size)), 255 | ('relu2', nn.ReLU()), 256 | ('dropout2', nn.Dropout(0.5)), 257 | ])) 258 | 259 | def forward(self, x): 260 | out = self.layer(x) 261 | return out 262 | 263 | 264 | class CBHG(nn.Module): 265 | """CBHG module: a recurrent neural network composed of: 266 | - 1-d convolution banks 267 | - Highway networks + residual connections 268 | - Bidirectional gated recurrent units 269 | """ 270 | 271 | def __init__(self, in_dim, K=16, projections=[128, 128]): 272 | super(CBHG, self).__init__() 273 | self.in_dim = in_dim 274 | self.relu = nn.ReLU() 275 | self.conv1d_banks = nn.ModuleList( 276 | [BatchNormConv1d(in_dim, in_dim, kernel_size=k, stride=1, 277 | padding=k // 2, activation=self.relu) 278 | for k in range(1, K + 1)]) 279 | self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1) 280 | 281 | in_sizes = [K * in_dim] + projections[:-1] 282 | activations = [self.relu] * (len(projections) - 1) + [None] 283 | self.conv1d_projections = nn.ModuleList( 284 | [BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1, 285 | padding=1, activation=ac) 286 | for (in_size, out_size, ac) in zip( 287 | in_sizes, projections, activations)]) 288 | 289 | self.pre_highway = nn.Linear(projections[-1], in_dim, bias=False) 290 | self.highways = nn.ModuleList( 291 | [Highway(in_dim, in_dim) for _ in range(4)]) 292 | 293 | self.gru = nn.GRU( 294 | in_dim, in_dim, 1, batch_first=True, bidirectional=True) 295 | 296 | def forward(self, inputs, input_lengths=None): 297 | # (B, T_in, in_dim) 298 | x = inputs 299 | 300 | # Needed to perform conv1d on time-axis 301 | # (B, in_dim, T_in) 302 | if x.size(-1) == self.in_dim: 303 | x = x.transpose(1, 2) 304 | 305 | T = x.size(-1) 306 | 307 | # (B, in_dim*K, T_in) 308 | # Concat conv1d bank outputs 309 | x = torch.cat([conv1d(x)[:, :, :T] 310 | for conv1d in self.conv1d_banks], dim=1) 311 | assert x.size(1) == self.in_dim * len(self.conv1d_banks) 312 | x = self.max_pool1d(x)[:, :, :T] 313 | 314 | for conv1d in self.conv1d_projections: 315 | x = conv1d(x) 316 | 317 | # (B, T_in, in_dim) 318 | # Back to the original shape 319 | x = x.transpose(1, 2) 320 | 321 | if x.size(-1) != self.in_dim: 322 | x = self.pre_highway(x) 323 | 324 | # Residual 
connection 325 | x += inputs 326 | for highway in self.highways: 327 | x = highway(x) 328 | 329 | if input_lengths is not None: 330 | x = nn.utils.rnn.pack_padded_sequence( 331 | x, input_lengths, batch_first=True) 332 | 333 | # (B, T_in, in_dim*2) 334 | self.gru.flatten_parameters() 335 | outputs, _ = self.gru(x) 336 | 337 | if input_lengths is not None: 338 | outputs, _ = nn.utils.rnn.pad_packed_sequence( 339 | outputs, batch_first=True) 340 | 341 | return outputs 342 | 343 | 344 | if __name__ == "__main__": 345 | # TEST 346 | a = torch.Tensor([[2, 3, 4], [1, 2, 3]]) 347 | b = torch.Tensor([[5, 6, 7], [7, 8, 9]]) 348 | c = torch.stack([a, b]) 349 | 350 | d = torch.Tensor([[1, 4], [6, 3]]).int() 351 | expand_max_len = torch.max(torch.sum(d, -1), -1)[0] 352 | base = torch.zeros(c.size(0), expand_max_len, c.size(1)) 353 | 354 | alignment = create_alignment(base.numpy(), d.numpy()) 355 | print(alignment) 356 | print(torch.from_numpy(alignment) @ c) 357 | -------------------------------------------------------------------------------- /optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ScheduledOptim(): 5 | ''' A simple wrapper class for learning rate scheduling ''' 6 | 7 | def __init__(self, optimizer, d_model, n_warmup_steps, current_steps): 8 | self._optimizer = optimizer 9 | self.n_warmup_steps = n_warmup_steps 10 | self.n_current_steps = current_steps 11 | self.init_lr = np.power(d_model, -0.5) 12 | 13 | def step_and_update_lr_frozen(self, learning_rate_frozen): 14 | for param_group in self._optimizer.param_groups: 15 | param_group['lr'] = learning_rate_frozen 16 | self._optimizer.step() 17 | 18 | def step_and_update_lr(self): 19 | self._update_learning_rate() 20 | self._optimizer.step() 21 | 22 | def get_learning_rate(self): 23 | learning_rate = 0.0 24 | for param_group in self._optimizer.param_groups: 25 | learning_rate = param_group['lr'] 26 | 27 | return learning_rate 28 | 29 | def zero_grad(self): 30 | # print(self.init_lr) 31 | self._optimizer.zero_grad() 32 | 33 | def _get_lr_scale(self): 34 | return np.min([ 35 | np.power(self.n_current_steps, -0.5), 36 | np.power(self.n_warmup_steps, -1.5) * self.n_current_steps]) 37 | 38 | def _update_learning_rate(self): 39 | ''' Learning rate scheduling per step ''' 40 | self.n_current_steps += 1 41 | lr = self.init_lr * self._get_lr_scale() 42 | 43 | for param_group in self._optimizer.param_groups: 44 | param_group['lr'] = lr 45 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import shutil 4 | import os 5 | 6 | from data import ljspeech 7 | import hparams as hp 8 | 9 | 10 | def preprocess_ljspeech(filename): 11 | in_dir = filename 12 | out_dir = hp.mel_ground_truth 13 | if not os.path.exists(out_dir): 14 | os.makedirs(out_dir, exist_ok=True) 15 | metadata = ljspeech.build_from_path(in_dir, out_dir) 16 | write_metadata(metadata, out_dir) 17 | 18 | shutil.move(os.path.join(hp.mel_ground_truth, "train.txt"), 19 | os.path.join("data", "train.txt")) 20 | 21 | 22 | def write_metadata(metadata, out_dir): 23 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 24 | for m in metadata: 25 | f.write(m + '\n') 26 | 27 | 28 | def main(): 29 | path = os.path.join("data", "LJSpeech-1.1") 30 | preprocess_ljspeech(path) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | 
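# Usage note: `python3 preprocess.py` expects the extracted corpus at data/LJSpeech-1.1
# (metadata.csv plus wavs/), writes one mel-spectrogram .npy per utterance into ./mels
# (hp.mel_ground_truth), and moves the generated transcript list to data/train.txt,
# which dataset.py reads at training time.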
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.2.0 2 | numpy==1.22.0 3 | scipy==1.4.1 4 | librosa==0.7.2 5 | inflect==0.2.5 6 | numba==0.48.0 -------------------------------------------------------------------------------- /sample/135000_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_0.wav -------------------------------------------------------------------------------- /sample/135000_0_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_0_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_1.wav -------------------------------------------------------------------------------- /sample/135000_1_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_1_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_2.wav -------------------------------------------------------------------------------- /sample/135000_2_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_2_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_3.wav -------------------------------------------------------------------------------- /sample/135000_3_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_3_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_4.wav -------------------------------------------------------------------------------- /sample/135000_4_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_4_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_5.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_5.wav -------------------------------------------------------------------------------- /sample/135000_5_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_5_waveglow.wav -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | from text import cleaners 4 | from text.symbols import symbols 5 | 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | # Regular expression matching text enclosed in curly braces: 12 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 13 | 14 | 15 | def text_to_sequence(text, cleaner_names): 16 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 17 | 18 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 19 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 20 | 21 | Args: 22 | text: string to convert to a sequence 23 | cleaner_names: names of the cleaner functions to run the text through 24 | 25 | Returns: 26 | List of integers corresponding to the symbols in the text 27 | ''' 28 | sequence = [] 29 | 30 | # Check for curly braces and treat their contents as ARPAbet: 31 | while len(text): 32 | m = _curly_re.match(text) 33 | if not m: 34 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 35 | break 36 | sequence += _symbols_to_sequence( 37 | _clean_text(m.group(1), cleaner_names)) 38 | sequence += _arpabet_to_sequence(m.group(2)) 39 | text = m.group(3) 40 | 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. 
"english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | import re 18 | from unidecode import unidecode 19 | from .numbers import normalize_numbers 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | def convert_to_ascii(text): 64 | return unidecode(text) 65 | 66 | 67 | def basic_cleaners(text): 68 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 69 | text = lowercase(text) 70 | text = collapse_whitespace(text) 71 | return text 72 | 73 | 74 | def transliteration_cleaners(text): 75 | '''Pipeline for non-English text that transliterates to ASCII.''' 76 | text = convert_to_ascii(text) 77 | text = lowercase(text) 78 | text = collapse_whitespace(text) 79 | return text 80 | 81 | 82 | def english_cleaners(text): 83 | '''Pipeline for English text, including number and abbreviation expansion.''' 84 | text = convert_to_ascii(text) 85 | text = lowercase(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | 22 | def __init__(self, file_or_path, keep_ambiguous=True): 23 | if isinstance(file_or_path, str): 24 | with open(file_or_path, encoding='latin-1') as f: 25 | entries = _parse_cmudict(f) 26 | else: 27 | entries = _parse_cmudict(file_or_path) 28 | if not keep_ambiguous: 29 | entries = {word: pron for word, 30 | pron in entries.items() if len(pron) == 1} 31 | self._entries = entries 32 | 33 | def __len__(self): 34 | return len(self._entries) 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | _alt_re = re.compile(r'\([0-9]+\)') 42 | 43 | 44 | def _parse_cmudict(file): 45 | cmudict = {} 46 | for line in file: 47 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 48 | parts = line.split(' ') 49 | word = re.sub(_alt_re, '', parts[0]) 50 | pronunciation = _get_pronunciation(parts[1]) 51 | if pronunciation: 52 | if word in cmudict: 53 | cmudict[word].append(pronunciation) 54 | else: 55 | cmudict[word] = [pronunciation] 56 | return cmudict 57 | 58 | 59 | def _get_pronunciation(s): 60 | parts = s.strip().split(' ') 61 | for part in parts: 62 | if part not in _valid_symbol_set: 63 | return None 64 | return ' '.join(parts) 65 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, 
_remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | 8 | from text import cmudict 9 | _pad = '_' 10 | _punctuation = '!\'(),.:;? ' 11 | _special = '-' 12 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | symbols = [_pad] + list(_special) + list(_punctuation) + \ 19 | list(_letters) + _arpabet 20 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from multiprocessing import cpu_count 6 | import numpy as np 7 | import argparse 8 | import os 9 | import time 10 | import math 11 | 12 | from model import FastSpeech 13 | from loss import DNNLoss 14 | from dataset import BufferDataset, DataLoader 15 | from dataset import get_data_to_buffer, collate_fn_tensor 16 | from optimizer import ScheduledOptim 17 | import hparams as hp 18 | import utils 19 | 20 | 21 | def main(args): 22 | # Get device 23 | device = torch.device('cuda'if torch.cuda.is_available()else 'cpu') 24 | 25 | # Define model 26 | print("Use FastSpeech") 27 | model = nn.DataParallel(FastSpeech()).to(device) 28 | print("Model Has Been Defined") 29 | num_param = utils.get_param_num(model) 30 | print('Number of TTS Parameters:', num_param) 31 | # Get buffer 32 | print("Load data to buffer") 33 | buffer = get_data_to_buffer() 34 | 35 | # Optimizer and loss 36 | optimizer = torch.optim.Adam(model.parameters(), 37 | betas=(0.9, 0.98), 38 | eps=1e-9) 39 | scheduled_optim = ScheduledOptim(optimizer, 40 | hp.decoder_dim, 41 | hp.n_warm_up_step, 42 | args.restore_step) 43 | fastspeech_loss = DNNLoss().to(device) 44 | print("Defined Optimizer and Loss Function.") 45 | 46 | # Load checkpoint if exists 47 | try: 48 | checkpoint = torch.load(os.path.join( 49 | hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step)) 50 | model.load_state_dict(checkpoint['model']) 51 | optimizer.load_state_dict(checkpoint['optimizer']) 52 | print("\n---Model Restored at Step %d---\n" % args.restore_step) 53 | except: 54 | print("\n---Start New Training---\n") 55 | if not os.path.exists(hp.checkpoint_path): 56 | os.mkdir(hp.checkpoint_path) 57 | 58 | # Init logger 59 | if not os.path.exists(hp.logger_path): 60 | os.mkdir(hp.logger_path) 61 | 62 | # Get dataset 63 | dataset = BufferDataset(buffer) 64 | 65 | # Get Training Loader 66 | training_loader = DataLoader(dataset, 67 | batch_size=hp.batch_expand_size * hp.batch_size, 68 | shuffle=True, 69 | 
collate_fn=collate_fn_tensor, 70 | drop_last=True, 71 | num_workers=0) 72 | total_step = hp.epochs * len(training_loader) * hp.batch_expand_size 73 | 74 | # Define Some Information 75 | Time = np.array([]) 76 | Start = time.perf_counter() 77 | 78 | # Training 79 | model = model.train() 80 | 81 | for epoch in range(hp.epochs): 82 | for i, batchs in enumerate(training_loader): 83 | # real batch start here 84 | for j, db in enumerate(batchs): 85 | start_time = time.perf_counter() 86 | 87 | current_step = i * hp.batch_expand_size + j + args.restore_step + \ 88 | epoch * len(training_loader) * hp.batch_expand_size + 1 89 | 90 | # Init 91 | scheduled_optim.zero_grad() 92 | 93 | # Get Data 94 | character = db["text"].long().to(device) 95 | mel_target = db["mel_target"].float().to(device) 96 | duration = db["duration"].int().to(device) 97 | mel_pos = db["mel_pos"].long().to(device) 98 | src_pos = db["src_pos"].long().to(device) 99 | max_mel_len = db["mel_max_len"] 100 | 101 | # Forward 102 | mel_output, mel_postnet_output, duration_predictor_output = model(character, 103 | src_pos, 104 | mel_pos=mel_pos, 105 | mel_max_length=max_mel_len, 106 | length_target=duration) 107 | 108 | # Cal Loss 109 | mel_loss, mel_postnet_loss, duration_loss = fastspeech_loss(mel_output, 110 | mel_postnet_output, 111 | duration_predictor_output, 112 | mel_target, 113 | duration) 114 | total_loss = mel_loss + mel_postnet_loss + duration_loss 115 | 116 | # Logger 117 | t_l = total_loss.item() 118 | m_l = mel_loss.item() 119 | m_p_l = mel_postnet_loss.item() 120 | d_l = duration_loss.item() 121 | 122 | with open(os.path.join("logger", "total_loss.txt"), "a") as f_total_loss: 123 | f_total_loss.write(str(t_l)+"\n") 124 | 125 | with open(os.path.join("logger", "mel_loss.txt"), "a") as f_mel_loss: 126 | f_mel_loss.write(str(m_l)+"\n") 127 | 128 | with open(os.path.join("logger", "mel_postnet_loss.txt"), "a") as f_mel_postnet_loss: 129 | f_mel_postnet_loss.write(str(m_p_l)+"\n") 130 | 131 | with open(os.path.join("logger", "duration_loss.txt"), "a") as f_d_loss: 132 | f_d_loss.write(str(d_l)+"\n") 133 | 134 | # Backward 135 | total_loss.backward() 136 | 137 | # Clipping gradients to avoid gradient explosion 138 | nn.utils.clip_grad_norm_( 139 | model.parameters(), hp.grad_clip_thresh) 140 | 141 | # Update weights 142 | if args.frozen_learning_rate: 143 | scheduled_optim.step_and_update_lr_frozen( 144 | args.learning_rate_frozen) 145 | else: 146 | scheduled_optim.step_and_update_lr() 147 | 148 | # Print 149 | if current_step % hp.log_step == 0: 150 | Now = time.perf_counter() 151 | 152 | str1 = "Epoch [{}/{}], Step [{}/{}]:".format( 153 | epoch+1, hp.epochs, current_step, total_step) 154 | str2 = "Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f}, Duration Loss: {:.4f};".format( 155 | m_l, m_p_l, d_l) 156 | str3 = "Current Learning Rate is {:.6f}.".format( 157 | scheduled_optim.get_learning_rate()) 158 | str4 = "Time Used: {:.3f}s, Estimated Time Remaining: {:.3f}s.".format( 159 | (Now-Start), (total_step-current_step)*np.mean(Time)) 160 | 161 | print("\n" + str1) 162 | print(str2) 163 | print(str3) 164 | print(str4) 165 | 166 | with open(os.path.join("logger", "logger.txt"), "a") as f_logger: 167 | f_logger.write(str1 + "\n") 168 | f_logger.write(str2 + "\n") 169 | f_logger.write(str3 + "\n") 170 | f_logger.write(str4 + "\n") 171 | f_logger.write("\n") 172 | 173 | if current_step % hp.save_step == 0: 174 | torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict( 175 | )}, os.path.join(hp.checkpoint_path, 
'checkpoint_%d.pth.tar' % current_step)) 176 | print("save model at step %d ..." % current_step) 177 | 178 | end_time = time.perf_counter() 179 | Time = np.append(Time, end_time - start_time) 180 | if len(Time) == hp.clear_Time: 181 | temp_value = np.mean(Time) 182 | Time = np.delete( 183 | Time, [i for i in range(len(Time))], axis=None) 184 | Time = np.append(Time, temp_value) 185 | 186 | 187 | if __name__ == "__main__": 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument('--restore_step', type=int, default=0) 190 | parser.add_argument('--frozen_learning_rate', type=bool, default=False) 191 | parser.add_argument("--learning_rate_frozen", type=float, default=1e-3) 192 | args = parser.parse_args() 193 | main(args) 194 | -------------------------------------------------------------------------------- /transformer/Constants.py: -------------------------------------------------------------------------------- 1 | PAD = 0 2 | UNK = 1 3 | BOS = 2 4 | EOS = 3 5 | 6 | PAD_WORD = '<blank>' 7 | UNK_WORD = '<unk>' 8 | BOS_WORD = '<s>' 9 | EOS_WORD = '</s>' 10 | -------------------------------------------------------------------------------- /transformer/Layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import numpy as np 5 | from collections import OrderedDict 6 | 7 | from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward 8 | from text.symbols import symbols 9 | 10 | 11 | class Linear(nn.Module): 12 | """ 13 | Linear Module 14 | """ 15 | 16 | def __init__(self, in_dim, out_dim, bias=True, w_init='linear'): 17 | """ 18 | :param in_dim: dimension of input 19 | :param out_dim: dimension of output 20 | :param bias: boolean. if True, bias is included. 21 | :param w_init: str. weight inits with xavier initialization. 22 | """ 23 | super(Linear, self).__init__() 24 | self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias) 25 | 26 | nn.init.xavier_uniform_( 27 | self.linear_layer.weight, 28 | gain=nn.init.calculate_gain(w_init)) 29 | 30 | def forward(self, x): 31 | return self.linear_layer(x) 32 | 33 | 34 | class PreNet(nn.Module): 35 | """ 36 | Pre Net before passing through the network 37 | """ 38 | 39 | def __init__(self, input_size, hidden_size, output_size, p=0.5): 40 | """ 41 | :param input_size: dimension of input 42 | :param hidden_size: dimension of hidden unit 43 | :param output_size: dimension of output 44 | """ 45 | super(PreNet, self).__init__() 46 | self.input_size = input_size 47 | self.output_size = output_size 48 | self.hidden_size = hidden_size 49 | self.layer = nn.Sequential(OrderedDict([ 50 | ('fc1', Linear(self.input_size, self.hidden_size)), 51 | ('relu1', nn.ReLU()), 52 | ('dropout1', nn.Dropout(p)), 53 | ('fc2', Linear(self.hidden_size, self.output_size)), 54 | ('relu2', nn.ReLU()), 55 | ('dropout2', nn.Dropout(p)), 56 | ])) 57 | 58 | def forward(self, input_): 59 | 60 | out = self.layer(input_) 61 | 62 | return out 63 | 64 | 65 | class Conv(nn.Module): 66 | """ 67 | Convolution Module 68 | """ 69 | 70 | def __init__(self, 71 | in_channels, 72 | out_channels, 73 | kernel_size=1, 74 | stride=1, 75 | padding=0, 76 | dilation=1, 77 | bias=True, 78 | w_init='linear'): 79 | """ 80 | :param in_channels: dimension of input 81 | :param out_channels: dimension of output 82 | :param kernel_size: size of kernel 83 | :param stride: size of stride 84 | :param padding: size of padding 85 | :param dilation: dilation rate 86 | :param bias: boolean. 
if True, bias is included. 87 | :param w_init: str. weight inits with xavier initialization. 88 | """ 89 | super(Conv, self).__init__() 90 | 91 | self.conv = nn.Conv1d(in_channels, 92 | out_channels, 93 | kernel_size=kernel_size, 94 | stride=stride, 95 | padding=padding, 96 | dilation=dilation, 97 | bias=bias) 98 | 99 | nn.init.xavier_uniform_( 100 | self.conv.weight, gain=nn.init.calculate_gain(w_init)) 101 | 102 | def forward(self, x): 103 | x = self.conv(x) 104 | return x 105 | 106 | 107 | class FFTBlock(torch.nn.Module): 108 | """FFT Block""" 109 | 110 | def __init__(self, 111 | d_model, 112 | d_inner, 113 | n_head, 114 | d_k, 115 | d_v, 116 | dropout=0.1): 117 | super(FFTBlock, self).__init__() 118 | self.slf_attn = MultiHeadAttention( 119 | n_head, d_model, d_k, d_v, dropout=dropout) 120 | self.pos_ffn = PositionwiseFeedForward( 121 | d_model, d_inner, dropout=dropout) 122 | 123 | def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): 124 | enc_output, enc_slf_attn = self.slf_attn( 125 | enc_input, enc_input, enc_input, mask=slf_attn_mask) 126 | enc_output *= non_pad_mask 127 | 128 | enc_output = self.pos_ffn(enc_output) 129 | enc_output *= non_pad_mask 130 | 131 | return enc_output, enc_slf_attn 132 | 133 | 134 | class ConvNorm(torch.nn.Module): 135 | def __init__(self, 136 | in_channels, 137 | out_channels, 138 | kernel_size=1, 139 | stride=1, 140 | padding=None, 141 | dilation=1, 142 | bias=True, 143 | w_init_gain='linear'): 144 | super(ConvNorm, self).__init__() 145 | 146 | if padding is None: 147 | assert(kernel_size % 2 == 1) 148 | padding = int(dilation * (kernel_size - 1) / 2) 149 | 150 | self.conv = torch.nn.Conv1d(in_channels, 151 | out_channels, 152 | kernel_size=kernel_size, 153 | stride=stride, 154 | padding=padding, 155 | dilation=dilation, 156 | bias=bias) 157 | 158 | torch.nn.init.xavier_uniform_( 159 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 160 | 161 | def forward(self, signal): 162 | conv_signal = self.conv(signal) 163 | 164 | return conv_signal 165 | 166 | 167 | class PostNet(nn.Module): 168 | """ 169 | PostNet: Five 1-d convolution with 512 channels and kernel size 5 170 | """ 171 | 172 | def __init__(self, 173 | n_mel_channels=80, 174 | postnet_embedding_dim=512, 175 | postnet_kernel_size=5, 176 | postnet_n_convolutions=5): 177 | 178 | super(PostNet, self).__init__() 179 | self.convolutions = nn.ModuleList() 180 | 181 | self.convolutions.append( 182 | nn.Sequential( 183 | ConvNorm(n_mel_channels, 184 | postnet_embedding_dim, 185 | kernel_size=postnet_kernel_size, 186 | stride=1, 187 | padding=int((postnet_kernel_size - 1) / 2), 188 | dilation=1, 189 | w_init_gain='tanh'), 190 | 191 | nn.BatchNorm1d(postnet_embedding_dim)) 192 | ) 193 | 194 | for i in range(1, postnet_n_convolutions - 1): 195 | self.convolutions.append( 196 | nn.Sequential( 197 | ConvNorm(postnet_embedding_dim, 198 | postnet_embedding_dim, 199 | kernel_size=postnet_kernel_size, 200 | stride=1, 201 | padding=int((postnet_kernel_size - 1) / 2), 202 | dilation=1, 203 | w_init_gain='tanh'), 204 | 205 | nn.BatchNorm1d(postnet_embedding_dim)) 206 | ) 207 | 208 | self.convolutions.append( 209 | nn.Sequential( 210 | ConvNorm(postnet_embedding_dim, 211 | n_mel_channels, 212 | kernel_size=postnet_kernel_size, 213 | stride=1, 214 | padding=int((postnet_kernel_size - 1) / 2), 215 | dilation=1, 216 | w_init_gain='linear'), 217 | 218 | nn.BatchNorm1d(n_mel_channels)) 219 | ) 220 | 221 | def forward(self, x): 222 | x = x.contiguous().transpose(1, 2) 223 | 224 | for i in 
range(len(self.convolutions) - 1): 225 | x = F.dropout(torch.tanh( 226 | self.convolutions[i](x)), 0.5, self.training) 227 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 228 | 229 | x = x.contiguous().transpose(1, 2) 230 | return x 231 | -------------------------------------------------------------------------------- /transformer/Models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import hparams as hp 5 | 6 | import transformer.Constants as Constants 7 | from transformer.Layers import FFTBlock, PreNet, PostNet, Linear 8 | 9 | 10 | def get_non_pad_mask(seq): 11 | assert seq.dim() == 2 12 | return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1) 13 | 14 | 15 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 16 | ''' Sinusoid position encoding table ''' 17 | 18 | def cal_angle(position, hid_idx): 19 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 20 | 21 | def get_posi_angle_vec(position): 22 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 23 | 24 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) 25 | for pos_i in range(n_position)]) 26 | 27 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 28 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 29 | 30 | if padding_idx is not None: 31 | # zero vector for padding dimension 32 | sinusoid_table[padding_idx] = 0. 33 | 34 | return torch.FloatTensor(sinusoid_table) 35 | 36 | 37 | def get_attn_key_pad_mask(seq_k, seq_q): 38 | ''' For masking out the padding part of key sequence. ''' 39 | 40 | # Expand to fit the shape of key query attention matrix. 41 | len_q = seq_q.size(1) 42 | padding_mask = seq_k.eq(Constants.PAD) 43 | padding_mask = padding_mask.unsqueeze( 44 | 1).expand(-1, len_q, -1) # b x lq x lk 45 | 46 | return padding_mask 47 | 48 | 49 | class Encoder(nn.Module): 50 | ''' Encoder ''' 51 | 52 | def __init__(self, 53 | n_src_vocab=hp.vocab_size, 54 | len_max_seq=hp.vocab_size, 55 | d_word_vec=hp.encoder_dim, 56 | n_layers=hp.encoder_n_layer, 57 | n_head=hp.encoder_head, 58 | d_k=hp.encoder_dim // hp.encoder_head, 59 | d_v=hp.encoder_dim // hp.encoder_head, 60 | d_model=hp.encoder_dim, 61 | d_inner=hp.encoder_conv1d_filter_size, 62 | dropout=hp.dropout): 63 | 64 | super(Encoder, self).__init__() 65 | 66 | n_position = len_max_seq + 1 67 | 68 | self.src_word_emb = nn.Embedding(n_src_vocab, 69 | d_word_vec, 70 | padding_idx=Constants.PAD) 71 | 72 | self.position_enc = nn.Embedding.from_pretrained( 73 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), 74 | freeze=True) 75 | 76 | self.layer_stack = nn.ModuleList([FFTBlock( 77 | d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)]) 78 | 79 | def forward(self, src_seq, src_pos, return_attns=False): 80 | 81 | enc_slf_attn_list = [] 82 | 83 | # -- Prepare masks 84 | slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq) 85 | non_pad_mask = get_non_pad_mask(src_seq) 86 | 87 | # -- Forward 88 | enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos) 89 | 90 | for enc_layer in self.layer_stack: 91 | enc_output, enc_slf_attn = enc_layer( 92 | enc_output, 93 | non_pad_mask=non_pad_mask, 94 | slf_attn_mask=slf_attn_mask) 95 | if return_attns: 96 | enc_slf_attn_list += [enc_slf_attn] 97 | 98 | return enc_output, non_pad_mask 99 | 100 | 101 | class Decoder(nn.Module): 102 | """ Decoder """ 103 | 104 | def 
__init__(self, 105 | len_max_seq=hp.max_seq_len, 106 | n_layers=hp.decoder_n_layer, 107 | n_head=hp.decoder_head, 108 | d_k=hp.decoder_dim // hp.decoder_head, 109 | d_v=hp.decoder_dim // hp.decoder_head, 110 | d_model=hp.decoder_dim, 111 | d_inner=hp.decoder_conv1d_filter_size, 112 | dropout=hp.dropout): 113 | 114 | super(Decoder, self).__init__() 115 | 116 | n_position = len_max_seq + 1 117 | 118 | self.position_enc = nn.Embedding.from_pretrained( 119 | get_sinusoid_encoding_table(n_position, d_model, padding_idx=0), 120 | freeze=True) 121 | 122 | self.layer_stack = nn.ModuleList([FFTBlock( 123 | d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)]) 124 | 125 | def forward(self, enc_seq, enc_pos, return_attns=False): 126 | 127 | dec_slf_attn_list = [] 128 | 129 | # -- Prepare masks 130 | slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) 131 | non_pad_mask = get_non_pad_mask(enc_pos) 132 | 133 | # -- Forward 134 | dec_output = enc_seq + self.position_enc(enc_pos) 135 | 136 | for dec_layer in self.layer_stack: 137 | dec_output, dec_slf_attn = dec_layer( 138 | dec_output, 139 | non_pad_mask=non_pad_mask, 140 | slf_attn_mask=slf_attn_mask) 141 | if return_attns: 142 | dec_slf_attn_list += [dec_slf_attn] 143 | 144 | return dec_output 145 | -------------------------------------------------------------------------------- /transformer/Modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class ScaledDotProductAttention(nn.Module): 7 | ''' Scaled Dot-Product Attention ''' 8 | 9 | def __init__(self, temperature, attn_dropout=0.1): 10 | super().__init__() 11 | self.temperature = temperature 12 | self.dropout = nn.Dropout(attn_dropout) 13 | self.softmax = nn.Softmax(dim=2) 14 | 15 | def forward(self, q, k, v, mask=None): 16 | 17 | attn = torch.bmm(q, k.transpose(1, 2)) 18 | attn = attn / self.temperature 19 | 20 | if mask is not None: 21 | attn = attn.masked_fill(mask, -np.inf) 22 | 23 | attn = self.softmax(attn) 24 | attn = self.dropout(attn) 25 | output = torch.bmm(attn, v) 26 | 27 | return output, attn 28 | -------------------------------------------------------------------------------- /transformer/SubLayers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from transformer.Modules import ScaledDotProductAttention 6 | import hparams as hp 7 | 8 | 9 | class MultiHeadAttention(nn.Module): 10 | ''' Multi-Head Attention module ''' 11 | 12 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 13 | super().__init__() 14 | 15 | self.n_head = n_head 16 | self.d_k = d_k 17 | self.d_v = d_v 18 | 19 | self.w_qs = nn.Linear(d_model, n_head * d_k) 20 | self.w_ks = nn.Linear(d_model, n_head * d_k) 21 | self.w_vs = nn.Linear(d_model, n_head * d_v) 22 | nn.init.normal_(self.w_qs.weight, mean=0, 23 | std=np.sqrt(2.0 / (d_model + d_k))) 24 | nn.init.normal_(self.w_ks.weight, mean=0, 25 | std=np.sqrt(2.0 / (d_model + d_k))) 26 | nn.init.normal_(self.w_vs.weight, mean=0, 27 | std=np.sqrt(2.0 / (d_model + d_v))) 28 | 29 | self.attention = ScaledDotProductAttention( 30 | temperature=np.power(d_k, 0.5)) 31 | self.layer_norm = nn.LayerNorm(d_model) 32 | 33 | self.fc = nn.Linear(n_head * d_v, d_model) 34 | nn.init.xavier_normal_(self.fc.weight) 35 | 36 | self.dropout = nn.Dropout(dropout) 37 | 38 | def forward(self, q, k, v, 
mask=None): 39 | 40 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 41 | 42 | sz_b, len_q, _ = q.size() 43 | sz_b, len_k, _ = k.size() 44 | sz_b, len_v, _ = v.size() 45 | 46 | residual = q 47 | 48 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 49 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 50 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 51 | 52 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, 53 | len_q, d_k) # (n*b) x lq x dk 54 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, 55 | len_k, d_k) # (n*b) x lk x dk 56 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, 57 | len_v, d_v) # (n*b) x lv x dv 58 | 59 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 60 | output, attn = self.attention(q, k, v, mask=mask) 61 | 62 | output = output.view(n_head, sz_b, len_q, d_v) 63 | output = output.permute(1, 2, 0, 3).contiguous().view( 64 | sz_b, len_q, -1) # b x lq x (n*dv) 65 | 66 | output = self.dropout(self.fc(output)) 67 | output = self.layer_norm(output + residual) 68 | 69 | return output, attn 70 | 71 | 72 | class PositionwiseFeedForward(nn.Module): 73 | ''' A two-feed-forward-layer module ''' 74 | 75 | def __init__(self, d_in, d_hid, dropout=0.1): 76 | super().__init__() 77 | 78 | # Use Conv1D 79 | # position-wise 80 | self.w_1 = nn.Conv1d( 81 | d_in, d_hid, kernel_size=hp.fft_conv1d_kernel[0], padding=hp.fft_conv1d_padding[0]) 82 | # position-wise 83 | self.w_2 = nn.Conv1d( 84 | d_hid, d_in, kernel_size=hp.fft_conv1d_kernel[1], padding=hp.fft_conv1d_padding[1]) 85 | 86 | self.layer_norm = nn.LayerNorm(d_in) 87 | self.dropout = nn.Dropout(dropout) 88 | 89 | def forward(self, x): 90 | residual = x 91 | output = x.transpose(1, 2) 92 | output = self.w_2(F.relu(self.w_1(output))) 93 | output = output.transpose(1, 2) 94 | output = self.dropout(output) 95 | output = self.layer_norm(output + residual) 96 | 97 | return output 98 | -------------------------------------------------------------------------------- /transformer/__init__.py: -------------------------------------------------------------------------------- 1 | import transformer.Constants 2 | import transformer.Modules 3 | import transformer.Layers 4 | import transformer.SubLayers 5 | import transformer.Models 6 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | 7 | import hparams 8 | 9 | 10 | def process_text(train_text_path): 11 | with open(train_text_path, "r", encoding="utf-8") as f: 12 | txt = [] 13 | for line in f.readlines(): 14 | txt.append(line) 15 | 16 | return txt 17 | 18 | 19 | def get_param_num(model): 20 | num_param = sum(param.numel() for param in model.parameters()) 21 | return num_param 22 | 23 | 24 | def get_mask_from_lengths(lengths, max_len=None): 25 | if max_len == None: 26 | max_len = torch.max(lengths).item() 27 | 28 | ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)) 29 | mask = (ids < lengths.unsqueeze(1)).bool() 30 | 31 | return mask 32 | 33 | 34 | def get_WaveGlow(): 35 | waveglow_path = os.path.join("waveglow", "pretrained_model") 36 | waveglow_path = os.path.join(waveglow_path, "waveglow_256channels.pt") 37 | wave_glow = torch.load(waveglow_path)['model'] 38 | wave_glow = wave_glow.remove_weightnorm(wave_glow) 39 | wave_glow.cuda().eval() 40 | for m in wave_glow.modules(): 41 | if 'Conv' in str(type(m)): 42 | setattr(m, 
'padding_mode', 'zeros') 43 | 44 | return wave_glow 45 | 46 | 47 | def pad_1D(inputs, PAD=0): 48 | 49 | def pad_data(x, length, PAD): 50 | x_padded = np.pad(x, (0, length - x.shape[0]), 51 | mode='constant', 52 | constant_values=PAD) 53 | return x_padded 54 | 55 | max_len = max((len(x) for x in inputs)) 56 | padded = np.stack([pad_data(x, max_len, PAD) for x in inputs]) 57 | 58 | return padded 59 | 60 | 61 | def pad_1D_tensor(inputs, PAD=0): 62 | 63 | def pad_data(x, length, PAD): 64 | x_padded = F.pad(x, (0, length - x.shape[0])) 65 | return x_padded 66 | 67 | max_len = max((len(x) for x in inputs)) 68 | padded = torch.stack([pad_data(x, max_len, PAD) for x in inputs]) 69 | 70 | return padded 71 | 72 | 73 | def pad_2D(inputs, maxlen=None): 74 | 75 | def pad(x, max_len): 76 | PAD = 0 77 | if np.shape(x)[0] > max_len: 78 | raise ValueError("not max_len") 79 | 80 | s = np.shape(x)[1] 81 | x_padded = np.pad(x, (0, max_len - np.shape(x)[0]), 82 | mode='constant', 83 | constant_values=PAD) 84 | return x_padded[:, :s] 85 | 86 | if maxlen: 87 | output = np.stack([pad(x, maxlen) for x in inputs]) 88 | else: 89 | max_len = max(np.shape(x)[0] for x in inputs) 90 | output = np.stack([pad(x, max_len) for x in inputs]) 91 | 92 | return output 93 | 94 | 95 | def pad_2D_tensor(inputs, maxlen=None): 96 | 97 | def pad(x, max_len): 98 | if x.size(0) > max_len: 99 | raise ValueError("not max_len") 100 | 101 | s = x.size(1) 102 | x_padded = F.pad(x, (0, 0, 0, max_len-x.size(0))) 103 | return x_padded[:, :s] 104 | 105 | if maxlen: 106 | output = torch.stack([pad(x, maxlen) for x in inputs]) 107 | else: 108 | max_len = max(x.size(0) for x in inputs) 109 | output = torch.stack([pad(x, max_len) for x in inputs]) 110 | 111 | return output 112 | 113 | 114 | def pad(input_ele, mel_max_length=None): 115 | if mel_max_length: 116 | out_list = list() 117 | max_len = mel_max_length 118 | for i, batch in enumerate(input_ele): 119 | one_batch_padded = F.pad( 120 | batch, (0, 0, 0, max_len-batch.size(0)), "constant", 0.0) 121 | out_list.append(one_batch_padded) 122 | out_padded = torch.stack(out_list) 123 | return out_padded 124 | else: 125 | out_list = list() 126 | max_len = max([input_ele[i].size(0)for i in range(len(input_ele))]) 127 | 128 | for i, batch in enumerate(input_ele): 129 | one_batch_padded = F.pad( 130 | batch, (0, 0, 0, max_len-batch.size(0)), "constant", 0.0) 131 | out_list.append(one_batch_padded) 132 | out_padded = torch.stack(out_list) 133 | return out_padded 134 | -------------------------------------------------------------------------------- /waveglow/__init__.py: -------------------------------------------------------------------------------- 1 | import waveglow.inference 2 | import waveglow.mel2samp 3 | -------------------------------------------------------------------------------- /waveglow/convert_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | import torch 4 | 5 | def _check_model_old_version(model): 6 | if hasattr(model.WN[0], 'res_layers'): 7 | return True 8 | else: 9 | return False 10 | 11 | def update_model(old_model): 12 | if not _check_model_old_version(old_model): 13 | return old_model 14 | new_model = copy.deepcopy(old_model) 15 | for idx in range(0, len(new_model.WN)): 16 | wavenet = new_model.WN[idx] 17 | wavenet.res_skip_layers = torch.nn.ModuleList() 18 | n_channels = wavenet.n_channels 19 | n_layers = wavenet.n_layers 20 | for i in range(0, n_layers): 21 | if i < n_layers - 1: 22 | res_skip_channels = 
2*n_channels 23 | else: 24 | res_skip_channels = n_channels 25 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 26 | skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i]) 27 | if i < n_layers - 1: 28 | res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i]) 29 | res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight])) 30 | res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias])) 31 | else: 32 | res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight) 33 | res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias) 34 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 35 | wavenet.res_skip_layers.append(res_skip_layer) 36 | del wavenet.res_layers 37 | del wavenet.skip_layers 38 | return new_model 39 | 40 | if __name__ == '__main__': 41 | old_model_path = sys.argv[1] 42 | new_model_path = sys.argv[2] 43 | model = torch.load(old_model_path) 44 | model['model'] = update_model(model['model']) 45 | torch.save(model, new_model_path) 46 | 47 | -------------------------------------------------------------------------------- /waveglow/inference.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | import os 28 | from scipy.io.wavfile import write 29 | import torch 30 | from waveglow.mel2samp import files_to_list, MAX_WAV_VALUE 31 | # from denoiser import Denoiser 32 | 33 | 34 | def inference(mel, waveglow, audio_path, sigma=1.0, sampling_rate=22050): 35 | with torch.no_grad(): 36 | audio = waveglow.infer(mel, sigma=sigma) 37 | audio = audio * MAX_WAV_VALUE 38 | audio = audio.squeeze() 39 | audio = audio.cpu().numpy() 40 | audio = audio.astype('int16') 41 | write(audio_path, sampling_rate, audio) 42 | 43 | 44 | def test_speed(mel, waveglow, sigma=1.0, sampling_rate=22050): 45 | with torch.no_grad(): 46 | audio = waveglow.infer(mel, sigma=sigma) 47 | audio = audio * MAX_WAV_VALUE 48 | 49 | 50 | def get_wav(mel, waveglow, sigma=1.0, sampling_rate=22050): 51 | with torch.no_grad(): 52 | audio = waveglow.infer(mel, sigma=sigma) 53 | audio = audio * MAX_WAV_VALUE 54 | audio = audio.squeeze() 55 | audio = audio.cpu() 56 | 57 | return audio 58 | -------------------------------------------------------------------------------- /waveglow/mel2samp.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # *****************************************************************************\ 27 | # from tacotron2.layers import TacotronSTFT 28 | import os 29 | import random 30 | import argparse 31 | import json 32 | import torch 33 | import torch.utils.data 34 | import sys 35 | from scipy.io.wavfile import read 36 | 37 | # We're using the audio processing from TacoTron2 to make sure it matches 38 | sys.path.insert(0, 'tacotron2') 39 | 40 | MAX_WAV_VALUE = 32768.0 41 | 42 | 43 | def files_to_list(filename): 44 | """ 45 | Takes a text file of filenames and makes a list of filenames 46 | """ 47 | with open(filename, encoding='utf-8') as f: 48 | files = f.readlines() 49 | 50 | files = [f.rstrip() for f in files] 51 | return files 52 | 53 | 54 | # def load_wav_to_torch(full_path): 55 | # """ 56 | # Loads wavdata into torch array 57 | # """ 58 | # sampling_rate, data = read(full_path) 59 | # return torch.from_numpy(data).float(), sampling_rate 60 | 61 | 62 | # class Mel2Samp(torch.utils.data.Dataset): 63 | # """ 64 | # This is the main class that calculates the spectrogram and returns the 65 | # spectrogram, audio pair. 66 | # """ 67 | 68 | # def __init__(self, training_files, segment_length, filter_length, 69 | # hop_length, win_length, sampling_rate, mel_fmin, mel_fmax): 70 | # self.audio_files = files_to_list(training_files) 71 | # random.seed(1234) 72 | # random.shuffle(self.audio_files) 73 | # self.stft = TacotronSTFT(filter_length=filter_length, 74 | # hop_length=hop_length, 75 | # win_length=win_length, 76 | # sampling_rate=sampling_rate, 77 | # mel_fmin=mel_fmin, mel_fmax=mel_fmax) 78 | # self.segment_length = segment_length 79 | # self.sampling_rate = sampling_rate 80 | 81 | # def get_mel(self, audio): 82 | # audio_norm = audio / MAX_WAV_VALUE 83 | # audio_norm = audio_norm.unsqueeze(0) 84 | # audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 85 | # melspec = self.stft.mel_spectrogram(audio_norm) 86 | # melspec = torch.squeeze(melspec, 0) 87 | # return melspec 88 | 89 | # def __getitem__(self, index): 90 | # # Read audio 91 | # filename = self.audio_files[index] 92 | # audio, sampling_rate = load_wav_to_torch(filename) 93 | # if sampling_rate != self.sampling_rate: 94 | # raise ValueError("{} SR doesn't match target {} SR".format( 95 | # sampling_rate, self.sampling_rate)) 96 | 97 | # # Take segment 98 | # if audio.size(0) >= self.segment_length: 99 | # max_audio_start = audio.size(0) - self.segment_length 100 | # audio_start = random.randint(0, max_audio_start) 101 | # audio = audio[audio_start:audio_start+self.segment_length] 102 | # else: 103 | # audio = torch.nn.functional.pad( 104 | # audio, (0, self.segment_length - audio.size(0)), 'constant').data 105 | 106 | # mel = self.get_mel(audio) 107 | # audio = audio / MAX_WAV_VALUE 108 | 109 | # return (mel, audio) 110 | 111 | # def __len__(self): 112 | # return len(self.audio_files) 113 | 114 | 115 | # # =================================================================== 116 | # # Takes directory of clean audio and makes directory of spectrograms 117 | # # Useful for making test sets 118 | # # =================================================================== 119 | # if __name__ == "__main__": 120 | # # Get defaults so it can work with no Sacred 121 | # parser = argparse.ArgumentParser() 122 | # parser.add_argument('-f', "--filelist_path", required=True) 123 | # parser.add_argument('-c', '--config', type=str, 124 | # help='JSON file for configuration') 125 | # parser.add_argument('-o', '--output_dir', type=str, 
126 | # help='Output directory') 127 | # args = parser.parse_args() 128 | 129 | # with open(args.config) as f: 130 | # data = f.read() 131 | # data_config = json.loads(data)["data_config"] 132 | # mel2samp = Mel2Samp(**data_config) 133 | 134 | # filepaths = files_to_list(args.filelist_path) 135 | 136 | # # Make directory if it doesn't exist 137 | # if not os.path.isdir(args.output_dir): 138 | # os.makedirs(args.output_dir) 139 | # os.chmod(args.output_dir, 0o775) 140 | 141 | # for filepath in filepaths: 142 | # audio, sr = load_wav_to_torch(filepath) 143 | # melspectrogram = mel2samp.get_mel(audio) 144 | # filename = os.path.basename(filepath) 145 | # new_filepath = args.output_dir + '/' + filename + '.pt' 146 | # print(new_filepath) 147 | # torch.save(melspectrogram, new_filepath) 148 | --------------------------------------------------------------------------------
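Note: the text front end in text/__init__.py can be exercised on its own to see how raw text and curly-brace ARPAbet are turned into symbol IDs. A small usage sketch under stated assumptions (the repository root is on PYTHONPATH, the unidecode and inflect packages from requirements.txt are installed, and "english_cleaners" is the cleaner the module's own docstring recommends for English text):

from text import text_to_sequence, sequence_to_text

# Plain text is cleaned (lowercased, numbers and abbreviations expanded), while the
# braced span is treated as ARPAbet phonemes and mapped to "@"-prefixed symbols.
seq = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.", ["english_cleaners"])
print(seq)                    # list of integer symbol IDs
print(sequence_to_text(seq))  # "turn left on {HH AW1 S S T AH0 N} street."

The round trip keeps the phoneme span in braces because sequence_to_text re-wraps any "@"-prefixed symbol, which is how eval.py-style inputs with mixed graphemes and phonemes are expected to look.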