├── .gitignore ├── LICENSE ├── README.md ├── alignments.zip ├── audio ├── __init__.py ├── audio_processing.py ├── hparams_audio.py ├── stft.py └── tools.py ├── data └── ljspeech.py ├── dataset.py ├── eval.py ├── glow.py ├── hparams.py ├── img └── fastspeech_structure.png ├── loss.py ├── model.py ├── modules.py ├── optimizer.py ├── preprocess.py ├── requirements.txt ├── sample ├── 135000_0.wav ├── 135000_0_waveglow.wav ├── 135000_1.wav ├── 135000_1_waveglow.wav ├── 135000_2.wav ├── 135000_2_waveglow.wav ├── 135000_3.wav ├── 135000_3_waveglow.wav ├── 135000_4.wav ├── 135000_4_waveglow.wav ├── 135000_5.wav └── 135000_5_waveglow.wav ├── text ├── __init__.py ├── cleaners.py ├── cmudict.py ├── numbers.py └── symbols.py ├── train.py ├── transformer ├── Constants.py ├── Layers.py ├── Models.py ├── Modules.py ├── SubLayers.py └── __init__.py ├── utils.py └── waveglow ├── __init__.py ├── convert_model.py ├── inference.py └── mel2samp.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/ 108 | 109 | data/LJSpeech-1.1 110 | data/LJSpeech-1.1.tar.bz2 111 | data/train.txt 112 | 113 | mels 114 | alignments 115 | waveglow/pretrained_model -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Zhengxi Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 
10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FastSpeech-Pytorch 2 | An implementation of FastSpeech based on PyTorch. 3 | 4 | ## Update (2020/07/20) 5 | 1. Optimized the training process. 6 | 2. Optimized the implementation of the length regulator. 7 | 3. Switched to the same hyper-parameters as FastSpeech2. 8 | 4. **Changes 1-3 together make the training process about 3 times faster than before.** 9 | 5. **Better speech quality.** 10 | 11 | ## Model 12 |
13 | ![The structure of FastSpeech](./img/fastspeech_structure.png) 14 |
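The forward pass wired up in `model.py` follows the figure: a feed-forward Transformer encoder, a length regulator that repeats each input hidden state according to its (predicted or ground-truth) duration, and a decoder whose output goes through a mel linear layer and a CBHG postnet. The sketch below shows the inference path; the sentence and device handling are illustrative, and a trained checkpoint is needed for meaningful output (see `get_DNN` in `eval.py` for how checkpoints are loaded).

```python
import torch
import hparams as hp
from model import FastSpeech
from text import text_to_sequence

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FastSpeech().to(device).eval()  # load a trained checkpoint in practice

# Character ids and their 1-based positions (0 is reserved for padding).
seq = torch.LongTensor([text_to_sequence("Hello world.", hp.text_cleaners)]).to(device)
pos = torch.arange(1, seq.size(1) + 1).unsqueeze(0).to(device)

with torch.no_grad():
    mel, mel_postnet = model(seq, pos, alpha=1.0)  # each of shape (1, T, num_mels)
```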
15 | 16 | ## My Blog 17 | - [FastSpeech Reading Notes](https://zhuanlan.zhihu.com/p/67325775) 18 | - [Details and Rethinking of this Implementation](https://zhuanlan.zhihu.com/p/67939482) 19 | 20 | ## Prepare Dataset 21 | 1. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/). 22 | 2. Put the LJSpeech dataset in `data`. 23 | 3. Unzip `alignments.zip`. 24 | 4. Put the [Nvidia pretrained waveglow model](https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing) in `waveglow/pretrained_model` and rename it to `waveglow_256channels.pt`. 25 | 5. Run `python3 preprocess.py`. 26 | 27 | ## Training 28 | Run `python3 train.py`. 29 | 30 | ## Evaluation 31 | Run `python3 eval.py`. 32 | 33 | ## Notes 34 | - In the FastSpeech paper, the authors use a pre-trained Transformer-TTS model to provide the alignment targets. Since I didn't have a well-trained Transformer-TTS model, I use Tacotron2 instead. 35 | - I use the same hyper-parameters as [FastSpeech2](https://arxiv.org/abs/2006.04558). 36 | - Audio examples are in `sample`. 37 | - A [pretrained model](https://drive.google.com/file/d/1vMrKtbjPj9u_o3Y-8prE6hHCc6Yj4Nqk/view?usp=sharing) is available. 38 | 39 | ## Reference 40 | 41 | ### Repository 42 | - [The Implementation of Tacotron Based on Tensorflow](https://github.com/keithito/tacotron) 43 | - [The Implementation of Transformer Based on Pytorch](https://github.com/jadore801120/attention-is-all-you-need-pytorch) 44 | - [The Implementation of Transformer-TTS Based on Pytorch](https://github.com/xcmyz/Transformer-TTS) 45 | - [The Implementation of Tacotron2 Based on Pytorch](https://github.com/NVIDIA/tacotron2) 46 | - [The Implementation of FastSpeech2 Based on Pytorch](https://github.com/ming024/FastSpeech2) 47 | 48 | ### Paper 49 | - [Tacotron2](https://arxiv.org/abs/1712.05884) 50 | - [Transformer](https://arxiv.org/abs/1706.03762) 51 | - [FastSpeech](https://arxiv.org/abs/1905.09263) 52 | - [FastSpeech2](https://arxiv.org/abs/2006.04558) -------------------------------------------------------------------------------- /alignments.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/alignments.zip -------------------------------------------------------------------------------- /audio/__init__.py: -------------------------------------------------------------------------------- 1 | import audio.hparams_audio 2 | import audio.tools 3 | import audio.stft 4 | import audio.audio_processing 5 | -------------------------------------------------------------------------------- /audio/audio_processing.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/NVIDIA/tacotron2 """ 2 | 3 | import torch 4 | import numpy as np 5 | from scipy.signal import get_window 6 | import librosa.util as librosa_util 7 | 8 | 9 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 10 | n_fft=800, dtype=np.float32, norm=None): 11 | """ 12 | # from librosa 0.6 13 | Compute the sum-square envelope of a window function at a given hop length. 14 | 15 | This is used to estimate modulation effects induced by windowing 16 | observations in short-time Fourier transforms.
17 | 18 | Parameters 19 | ---------- 20 | window : string, tuple, number, callable, or list-like 21 | Window specification, as in `get_window` 22 | 23 | n_frames : int > 0 24 | The number of analysis frames 25 | 26 | hop_length : int > 0 27 | The number of samples to advance between frames 28 | 29 | win_length : [optional] 30 | The length of the window function. By default, this matches `n_fft`. 31 | 32 | n_fft : int > 0 33 | The length of each analysis frame. 34 | 35 | dtype : np.dtype 36 | The data type of the output 37 | 38 | Returns 39 | ------- 40 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 41 | The sum-squared envelope of the window function 42 | """ 43 | if win_length is None: 44 | win_length = n_fft 45 | 46 | n = n_fft + hop_length * (n_frames - 1) 47 | x = np.zeros(n, dtype=dtype) 48 | 49 | # Compute the squared window at the desired length 50 | win_sq = get_window(window, win_length, fftbins=True) 51 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 52 | win_sq = librosa_util.pad_center(win_sq, n_fft) 53 | 54 | # Fill the envelope 55 | for i in range(n_frames): 56 | sample = i * hop_length 57 | x[sample:min(n, sample + n_fft) 58 | ] += win_sq[:max(0, min(n_fft, n - sample))] 59 | return x 60 | 61 | 62 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 63 | """ 64 | PARAMS 65 | ------ 66 | magnitudes: spectrogram magnitudes 67 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 68 | """ 69 | 70 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 71 | angles = angles.astype(np.float32) 72 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 73 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 74 | 75 | for i in range(n_iters): 76 | _, angles = stft_fn.transform(signal) 77 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 78 | return signal 79 | 80 | 81 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 82 | """ 83 | PARAMS 84 | ------ 85 | C: compression factor 86 | """ 87 | return torch.log(torch.clamp(x, min=clip_val) * C) 88 | 89 | 90 | def dynamic_range_decompression(x, C=1): 91 | """ 92 | PARAMS 93 | ------ 94 | C: compression factor used to compress 95 | """ 96 | return torch.exp(x) / C 97 | -------------------------------------------------------------------------------- /audio/hparams_audio.py: -------------------------------------------------------------------------------- 1 | max_wav_value = 32768.0 2 | sampling_rate = 22050 3 | filter_length = 1024 4 | hop_length = 256 5 | win_length = 1024 6 | n_mel_channels = 80 7 | mel_fmin = 0.0 8 | mel_fmax = 8000.0 9 | -------------------------------------------------------------------------------- /audio/stft.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/NVIDIA/tacotron2 """ 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import numpy as np 7 | 8 | from scipy.signal import get_window 9 | from librosa.util import pad_center, tiny 10 | from librosa.filters import mel as librosa_mel_fn 11 | 12 | from audio.audio_processing import dynamic_range_compression 13 | from audio.audio_processing import dynamic_range_decompression 14 | from audio.audio_processing import window_sumsquare 15 | 16 | 17 | class STFT(torch.nn.Module): 18 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 19 | 20 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 21 | window='hann'): 22 | 
super(STFT, self).__init__() 23 | self.filter_length = filter_length 24 | self.hop_length = hop_length 25 | self.win_length = win_length 26 | self.window = window 27 | self.forward_transform = None 28 | scale = self.filter_length / self.hop_length 29 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 30 | 31 | cutoff = int((self.filter_length / 2 + 1)) 32 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 33 | np.imag(fourier_basis[:cutoff, :])]) 34 | 35 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 36 | inverse_basis = torch.FloatTensor( 37 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 38 | 39 | if window is not None: 40 | assert(filter_length >= win_length) 41 | # get window and zero center pad it to filter_length 42 | fft_window = get_window(window, win_length, fftbins=True) 43 | fft_window = pad_center(fft_window, filter_length) 44 | fft_window = torch.from_numpy(fft_window).float() 45 | 46 | # window the bases 47 | forward_basis *= fft_window 48 | inverse_basis *= fft_window 49 | 50 | self.register_buffer('forward_basis', forward_basis.float()) 51 | self.register_buffer('inverse_basis', inverse_basis.float()) 52 | 53 | def transform(self, input_data): 54 | num_batches = input_data.size(0) 55 | num_samples = input_data.size(1) 56 | 57 | self.num_samples = num_samples 58 | 59 | # similar to librosa, reflect-pad the input 60 | input_data = input_data.view(num_batches, 1, num_samples) 61 | input_data = F.pad( 62 | input_data.unsqueeze(1), 63 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 64 | mode='reflect') 65 | input_data = input_data.squeeze(1) 66 | 67 | forward_transform = F.conv1d( 68 | input_data.cpu(), 69 | Variable(self.forward_basis, requires_grad=False).cpu(), 70 | stride=self.hop_length, 71 | padding=0).cpu() 72 | 73 | cutoff = int((self.filter_length / 2) + 1) 74 | real_part = forward_transform[:, :cutoff, :] 75 | imag_part = forward_transform[:, cutoff:, :] 76 | 77 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 78 | phase = torch.autograd.Variable( 79 | torch.atan2(imag_part.data, real_part.data)) 80 | 81 | return magnitude, phase 82 | 83 | def inverse(self, magnitude, phase): 84 | recombine_magnitude_phase = torch.cat( 85 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 86 | 87 | inverse_transform = F.conv_transpose1d( 88 | recombine_magnitude_phase, 89 | Variable(self.inverse_basis, requires_grad=False), 90 | stride=self.hop_length, 91 | padding=0) 92 | 93 | if self.window is not None: 94 | window_sum = window_sumsquare( 95 | self.window, magnitude.size(-1), hop_length=self.hop_length, 96 | win_length=self.win_length, n_fft=self.filter_length, 97 | dtype=np.float32) 98 | # remove modulation effects 99 | approx_nonzero_indices = torch.from_numpy( 100 | np.where(window_sum > tiny(window_sum))[0]) 101 | window_sum = torch.autograd.Variable( 102 | torch.from_numpy(window_sum), requires_grad=False) 103 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 104 | inverse_transform[:, :, 105 | approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 106 | 107 | # scale by hop ratio 108 | inverse_transform *= float(self.filter_length) / self.hop_length 109 | 110 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 111 | inverse_transform = inverse_transform[:, 112 | :, :-int(self.filter_length/2):] 113 | 114 | return inverse_transform 115 | 116 | def forward(self, input_data): 117 | self.magnitude, self.phase = self.transform(input_data) 118 | 
reconstruction = self.inverse(self.magnitude, self.phase) 119 | return reconstruction 120 | 121 | 122 | class TacotronSTFT(torch.nn.Module): 123 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 124 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 125 | mel_fmax=8000.0): 126 | super(TacotronSTFT, self).__init__() 127 | self.n_mel_channels = n_mel_channels 128 | self.sampling_rate = sampling_rate 129 | self.stft_fn = STFT(filter_length, hop_length, win_length) 130 | mel_basis = librosa_mel_fn( 131 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 132 | mel_basis = torch.from_numpy(mel_basis).float() 133 | self.register_buffer('mel_basis', mel_basis) 134 | 135 | def spectral_normalize(self, magnitudes): 136 | output = dynamic_range_compression(magnitudes) 137 | return output 138 | 139 | def spectral_de_normalize(self, magnitudes): 140 | output = dynamic_range_decompression(magnitudes) 141 | return output 142 | 143 | def mel_spectrogram(self, y): 144 | """Computes mel-spectrograms from a batch of waves 145 | PARAMS 146 | ------ 147 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 148 | 149 | RETURNS 150 | ------- 151 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 152 | """ 153 | assert(torch.min(y.data) >= -1) 154 | assert(torch.max(y.data) <= 1) 155 | 156 | magnitudes, phases = self.stft_fn.transform(y) 157 | magnitudes = magnitudes.data 158 | mel_output = torch.matmul(self.mel_basis, magnitudes) 159 | mel_output = self.spectral_normalize(mel_output) 160 | return mel_output 161 | -------------------------------------------------------------------------------- /audio/tools.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/NVIDIA/tacotron2 """ 2 | 3 | import torch 4 | import numpy as np 5 | from scipy.io.wavfile import read 6 | from scipy.io.wavfile import write 7 | 8 | import audio.stft as stft 9 | import audio.hparams_audio as hparams 10 | from audio.audio_processing import griffin_lim 11 | 12 | _stft = stft.TacotronSTFT( 13 | hparams.filter_length, hparams.hop_length, hparams.win_length, 14 | hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, 15 | hparams.mel_fmax) 16 | 17 | 18 | def load_wav_to_torch(full_path): 19 | sampling_rate, data = read(full_path) 20 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 21 | 22 | 23 | def get_mel(filename): 24 | audio, sampling_rate = load_wav_to_torch(filename) 25 | if sampling_rate != _stft.sampling_rate: 26 | raise ValueError("{} {} SR doesn't match target {} SR".format( 27 | sampling_rate, _stft.sampling_rate)) 28 | audio_norm = audio / hparams.max_wav_value 29 | audio_norm = audio_norm.unsqueeze(0) 30 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 31 | melspec = _stft.mel_spectrogram(audio_norm) 32 | melspec = torch.squeeze(melspec, 0) 33 | # melspec = torch.from_numpy(_normalize(melspec.numpy())) 34 | 35 | return melspec 36 | 37 | 38 | def get_mel_from_wav(audio): 39 | sampling_rate = hparams.sampling_rate 40 | if sampling_rate != _stft.sampling_rate: 41 | raise ValueError("{} {} SR doesn't match target {} SR".format( 42 | sampling_rate, _stft.sampling_rate)) 43 | audio_norm = audio / hparams.max_wav_value 44 | audio_norm = audio_norm.unsqueeze(0) 45 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 46 | melspec = _stft.mel_spectrogram(audio_norm) 47 | melspec = torch.squeeze(melspec, 0) 48 | 49 | return melspec 50 | 51 | 
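# Usage sketch (illustrative; the wav path below is hypothetical): both helpers above
# return a FloatTensor of shape (n_mel_channels, T), i.e. (80, T) with the values in
# audio/hparams_audio.py; data/ljspeech.py saves the transpose of this as the training targets.
#
#     mel = get_mel("data/LJSpeech-1.1/wavs/LJ001-0001.wav")            # (80, T)
#     np.save("mels/example-mel.npy", mel.numpy().T, allow_pickle=False)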
52 | def inv_mel_spec(mel, out_filename, griffin_iters=60): 53 | mel = torch.stack([mel]) 54 | # mel = torch.stack([torch.from_numpy(_denormalize(mel.numpy()))]) 55 | mel_decompress = _stft.spectral_de_normalize(mel) 56 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 57 | spec_from_mel_scaling = 1000 58 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 59 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 60 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 61 | 62 | audio = griffin_lim(torch.autograd.Variable( 63 | spec_from_mel[:, :, :-1]), _stft.stft_fn, griffin_iters) 64 | 65 | audio = audio.squeeze() 66 | audio = audio.cpu().numpy() 67 | audio_path = out_filename 68 | write(audio_path, hparams.sampling_rate, audio) 69 | -------------------------------------------------------------------------------- /data/ljspeech.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import audio 4 | 5 | from tqdm import tqdm 6 | from functools import partial 7 | from concurrent.futures import ProcessPoolExecutor 8 | 9 | 10 | def build_from_path(in_dir, out_dir): 11 | index = 1 12 | # executor = ProcessPoolExecutor(max_workers=4) 13 | # futures = [] 14 | texts = [] 15 | 16 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 17 | for line in f.readlines(): 18 | if index % 100 == 0: 19 | print("{:d} Done".format(index)) 20 | parts = line.strip().split('|') 21 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 22 | text = parts[2] 23 | # futures.append(executor.submit( 24 | # partial(_process_utterance, out_dir, index, wav_path, text))) 25 | texts.append(_process_utterance(out_dir, index, wav_path, text)) 26 | 27 | index = index + 1 28 | 29 | # return [future.result() for future in tqdm(futures)] 30 | return texts 31 | 32 | 33 | def _process_utterance(out_dir, index, wav_path, text): 34 | # Compute a mel-scale spectrogram from the wav: 35 | mel_spectrogram = audio.tools.get_mel(wav_path).numpy().astype(np.float32) 36 | 37 | # Write the spectrograms to disk: 38 | mel_filename = 'ljspeech-mel-%05d.npy' % index 39 | np.save(os.path.join(out_dir, mel_filename), 40 | mel_spectrogram.T, allow_pickle=False) 41 | 42 | return text 43 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from torch.utils.data import Dataset, DataLoader 4 | 5 | import numpy as np 6 | import math 7 | import time 8 | import os 9 | 10 | import hparams 11 | import audio 12 | 13 | from utils import process_text, pad_1D, pad_2D 14 | from utils import pad_1D_tensor, pad_2D_tensor 15 | from text import text_to_sequence 16 | from tqdm import tqdm 17 | 18 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 19 | 20 | 21 | def get_data_to_buffer(): 22 | buffer = list() 23 | text = process_text(os.path.join("data", "train.txt")) 24 | 25 | start = time.perf_counter() 26 | for i in tqdm(range(len(text))): 27 | 28 | mel_gt_name = os.path.join( 29 | hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (i+1)) 30 | mel_gt_target = np.load(mel_gt_name) 31 | duration = np.load(os.path.join( 32 | hparams.alignment_path, str(i)+".npy")) 33 | character = text[i][0:len(text[i])-1] 34 | character = np.array( 35 | text_to_sequence(character, hparams.text_cleaners)) 36 | 37 | character = 
torch.from_numpy(character) 38 | duration = torch.from_numpy(duration) 39 | mel_gt_target = torch.from_numpy(mel_gt_target) 40 | 41 | buffer.append({"text": character, "duration": duration, 42 | "mel_target": mel_gt_target}) 43 | 44 | end = time.perf_counter() 45 | print("cost {:.2f}s to load all data into buffer.".format(end-start)) 46 | 47 | return buffer 48 | 49 | 50 | class BufferDataset(Dataset): 51 | def __init__(self, buffer): 52 | self.buffer = buffer 53 | self.length_dataset = len(self.buffer) 54 | 55 | def __len__(self): 56 | return self.length_dataset 57 | 58 | def __getitem__(self, idx): 59 | return self.buffer[idx] 60 | 61 | 62 | def reprocess_tensor(batch, cut_list): 63 | texts = [batch[ind]["text"] for ind in cut_list] 64 | mel_targets = [batch[ind]["mel_target"] for ind in cut_list] 65 | durations = [batch[ind]["duration"] for ind in cut_list] 66 | 67 | length_text = np.array([]) 68 | for text in texts: 69 | length_text = np.append(length_text, text.size(0)) 70 | 71 | src_pos = list() 72 | max_len = int(max(length_text)) 73 | for length_src_row in length_text: 74 | src_pos.append(np.pad([i+1 for i in range(int(length_src_row))], 75 | (0, max_len-int(length_src_row)), 'constant')) 76 | src_pos = torch.from_numpy(np.array(src_pos)) 77 | 78 | length_mel = np.array(list()) 79 | for mel in mel_targets: 80 | length_mel = np.append(length_mel, mel.size(0)) 81 | 82 | mel_pos = list() 83 | max_mel_len = int(max(length_mel)) 84 | for length_mel_row in length_mel: 85 | mel_pos.append(np.pad([i+1 for i in range(int(length_mel_row))], 86 | (0, max_mel_len-int(length_mel_row)), 'constant')) 87 | mel_pos = torch.from_numpy(np.array(mel_pos)) 88 | 89 | texts = pad_1D_tensor(texts) 90 | durations = pad_1D_tensor(durations) 91 | mel_targets = pad_2D_tensor(mel_targets) 92 | 93 | out = {"text": texts, 94 | "mel_target": mel_targets, 95 | "duration": durations, 96 | "mel_pos": mel_pos, 97 | "src_pos": src_pos, 98 | "mel_max_len": max_mel_len} 99 | 100 | return out 101 | 102 | 103 | def collate_fn_tensor(batch): 104 | len_arr = np.array([d["text"].size(0) for d in batch]) 105 | index_arr = np.argsort(-len_arr) 106 | batchsize = len(batch) 107 | real_batchsize = batchsize // hparams.batch_expand_size 108 | 109 | cut_list = list() 110 | for i in range(hparams.batch_expand_size): 111 | cut_list.append(index_arr[i*real_batchsize:(i+1)*real_batchsize]) 112 | 113 | output = list() 114 | for i in range(hparams.batch_expand_size): 115 | output.append(reprocess_tensor(batch, cut_list[i])) 116 | 117 | return output 118 | 119 | 120 | if __name__ == "__main__": 121 | # TEST 122 | get_data_to_buffer() 123 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import argparse 4 | import numpy as np 5 | import random 6 | import time 7 | import shutil 8 | import os 9 | 10 | import hparams as hp 11 | import audio 12 | import utils 13 | import dataset 14 | import text 15 | import model as M 16 | import waveglow 17 | 18 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 19 | 20 | 21 | def get_DNN(num): 22 | checkpoint_path = "checkpoint_" + str(num) + ".pth.tar" 23 | model = nn.DataParallel(M.FastSpeech()).to(device) 24 | model.load_state_dict(torch.load(os.path.join(hp.checkpoint_path, 25 | checkpoint_path))['model']) 26 | model.eval() 27 | return model 28 | 29 | 30 | def synthesis(model, text, alpha=1.0): 31 | text = 
np.array(phn) 32 | text = np.stack([text]) 33 | src_pos = np.array([i+1 for i in range(text.shape[1])]) 34 | src_pos = np.stack([src_pos]) 35 | sequence = torch.from_numpy(text).cuda().long() 36 | src_pos = torch.from_numpy(src_pos).cuda().long() 37 | 38 | with torch.no_grad(): 39 | _, mel = model.module.forward(sequence, src_pos, alpha=alpha) 40 | return mel[0].cpu().transpose(0, 1), mel.contiguous().transpose(1, 2) 41 | 42 | 43 | def get_data(): 44 | test1 = "I am very happy to see you again!" 45 | test2 = "Durian model is a very good speech synthesis!" 46 | test3 = "When I was twenty, I fell in love with a girl." 47 | test4 = "I remove attention module in decoder and use average pooling to implement predicting r frames at once" 48 | test5 = "You can not improve your past, but you can improve your future. Once time is wasted, life is wasted." 49 | test6 = "Death comes to all, but great achievements raise a monument which shall endure until the sun grows old." 50 | data_list = list() 51 | data_list.append(text.text_to_sequence(test1, hp.text_cleaners)) 52 | data_list.append(text.text_to_sequence(test2, hp.text_cleaners)) 53 | data_list.append(text.text_to_sequence(test3, hp.text_cleaners)) 54 | data_list.append(text.text_to_sequence(test4, hp.text_cleaners)) 55 | data_list.append(text.text_to_sequence(test5, hp.text_cleaners)) 56 | data_list.append(text.text_to_sequence(test6, hp.text_cleaners)) 57 | return data_list 58 | 59 | 60 | if __name__ == "__main__": 61 | # Test 62 | WaveGlow = utils.get_WaveGlow() 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--step', type=int, default=0) 65 | parser.add_argument("--alpha", type=float, default=1.0) 66 | args = parser.parse_args() 67 | 68 | print("use griffin-lim and waveglow") 69 | model = get_DNN(args.step) 70 | data_list = get_data() 71 | for i, phn in enumerate(data_list): 72 | mel, mel_cuda = synthesis(model, phn, args.alpha) 73 | if not os.path.exists("results"): 74 | os.mkdir("results") 75 | audio.tools.inv_mel_spec( 76 | mel, "results/"+str(args.step)+"_"+str(i)+".wav") 77 | waveglow.inference.inference( 78 | mel_cuda, WaveGlow, 79 | "results/"+str(args.step)+"_"+str(i)+"_waveglow.wav") 80 | print("Done", i + 1) 81 | 82 | s_t = time.perf_counter() 83 | for i in range(100): 84 | for _, phn in enumerate(data_list): 85 | _, _, = synthesis(model, phn, args.alpha) 86 | print(i) 87 | e_t = time.perf_counter() 88 | print((e_t - s_t) / 100.) 89 | -------------------------------------------------------------------------------- /glow.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import copy 28 | import torch 29 | from torch.autograd import Variable 30 | import torch.nn.functional as F 31 | torch.nn.Module.dump_patches = True 32 | 33 | 34 | @torch.jit.script 35 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 36 | n_channels_int = n_channels[0] 37 | in_act = input_a+input_b 38 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 39 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 40 | acts = t_act * s_act 41 | return acts 42 | 43 | 44 | class WaveGlowLoss(torch.nn.Module): 45 | def __init__(self, sigma=1.0): 46 | super(WaveGlowLoss, self).__init__() 47 | self.sigma = sigma 48 | 49 | def forward(self, model_output): 50 | z, log_s_list, log_det_W_list = model_output 51 | for i, log_s in enumerate(log_s_list): 52 | if i == 0: 53 | log_s_total = torch.sum(log_s) 54 | log_det_W_total = log_det_W_list[i] 55 | else: 56 | log_s_total = log_s_total + torch.sum(log_s) 57 | log_det_W_total += log_det_W_list[i] 58 | 59 | loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - \ 60 | log_s_total - log_det_W_total 61 | return loss/(z.size(0)*z.size(1)*z.size(2)) 62 | 63 | 64 | class Invertible1x1Conv(torch.nn.Module): 65 | """ 66 | The layer outputs both the convolution, and the log determinant 67 | of its weight matrix. If reverse=True it does convolution with 68 | inverse 69 | """ 70 | 71 | def __init__(self, c): 72 | super(Invertible1x1Conv, self).__init__() 73 | self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, 74 | bias=False) 75 | 76 | # Sample a random orthonormal matrix to initialize weights 77 | W = torch.qr(torch.FloatTensor(c, c).normal_())[0] 78 | 79 | # Ensure determinant is 1.0 not -1.0 80 | if torch.det(W) < 0: 81 | W[:, 0] = -1*W[:, 0] 82 | W = W.view(c, c, 1) 83 | self.conv.weight.data = W 84 | 85 | def forward(self, z, reverse=False): 86 | # shape 87 | batch_size, group_size, n_of_groups = z.size() 88 | 89 | W = self.conv.weight.squeeze() 90 | 91 | if reverse: 92 | if not hasattr(self, 'W_inverse'): 93 | # Reverse computation 94 | W_inverse = W.inverse() 95 | W_inverse = Variable(W_inverse[..., None]) 96 | if z.type() == 'torch.cuda.HalfTensor': 97 | W_inverse = W_inverse.half() 98 | self.W_inverse = W_inverse 99 | z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) 100 | return z 101 | else: 102 | # Forward computation 103 | log_det_W = batch_size * n_of_groups * torch.logdet(W) 104 | z = self.conv(z) 105 | return z, log_det_W 106 | 107 | 108 | class WN(torch.nn.Module): 109 | """ 110 | This is the WaveNet like layer for the affine coupling. 
The primary difference 111 | from WaveNet is the convolutions need not be causal. There is also no dilation 112 | size reset. The dilation only doubles on each layer 113 | """ 114 | 115 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 116 | kernel_size): 117 | super(WN, self).__init__() 118 | assert(kernel_size % 2 == 1) 119 | assert(n_channels % 2 == 0) 120 | self.n_layers = n_layers 121 | self.n_channels = n_channels 122 | self.in_layers = torch.nn.ModuleList() 123 | self.res_skip_layers = torch.nn.ModuleList() 124 | self.cond_layers = torch.nn.ModuleList() 125 | 126 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 127 | start = torch.nn.utils.weight_norm(start, name='weight') 128 | self.start = start 129 | 130 | # Initializing last layer to 0 makes the affine coupling layers 131 | # do nothing at first. This helps with training stability 132 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 133 | end.weight.data.zero_() 134 | end.bias.data.zero_() 135 | self.end = end 136 | 137 | for i in range(n_layers): 138 | dilation = 2 ** i 139 | padding = int((kernel_size*dilation - dilation)/2) 140 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 141 | dilation=dilation, padding=padding) 142 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 143 | self.in_layers.append(in_layer) 144 | 145 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) 146 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 147 | self.cond_layers.append(cond_layer) 148 | 149 | # last one is not necessary 150 | if i < n_layers - 1: 151 | res_skip_channels = 2*n_channels 152 | else: 153 | res_skip_channels = n_channels 154 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 155 | res_skip_layer = torch.nn.utils.weight_norm( 156 | res_skip_layer, name='weight') 157 | self.res_skip_layers.append(res_skip_layer) 158 | 159 | def forward(self, forward_input): 160 | audio, spect = forward_input 161 | audio = self.start(audio) 162 | 163 | for i in range(self.n_layers): 164 | acts = fused_add_tanh_sigmoid_multiply( 165 | self.in_layers[i](audio), 166 | self.cond_layers[i](spect), 167 | torch.IntTensor([self.n_channels])) 168 | 169 | res_skip_acts = self.res_skip_layers[i](acts) 170 | if i < self.n_layers - 1: 171 | audio = res_skip_acts[:, :self.n_channels, :] + audio 172 | skip_acts = res_skip_acts[:, self.n_channels:, :] 173 | else: 174 | skip_acts = res_skip_acts 175 | 176 | if i == 0: 177 | output = skip_acts 178 | else: 179 | output = skip_acts + output 180 | return self.end(output) 181 | 182 | 183 | class WaveGlow(torch.nn.Module): 184 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 185 | n_early_size, WN_config): 186 | super(WaveGlow, self).__init__() 187 | 188 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 189 | n_mel_channels, 190 | 1024, stride=256) 191 | assert(n_group % 2 == 0) 192 | self.n_flows = n_flows 193 | self.n_group = n_group 194 | self.n_early_every = n_early_every 195 | self.n_early_size = n_early_size 196 | self.WN = torch.nn.ModuleList() 197 | self.convinv = torch.nn.ModuleList() 198 | 199 | n_half = int(n_group/2) 200 | 201 | # Set up layers with the right sizes based on how many dimensions 202 | # have been output already 203 | n_remaining_channels = n_group 204 | for k in range(n_flows): 205 | if k % self.n_early_every == 0 and k > 0: 206 | n_half = n_half - int(self.n_early_size/2) 207 | n_remaining_channels = n_remaining_channels - self.n_early_size 208 | 
self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 209 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 210 | self.n_remaining_channels = n_remaining_channels # Useful during inference 211 | 212 | def forward(self, forward_input): 213 | """ 214 | forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames 215 | forward_input[1] = audio: batch x time 216 | """ 217 | spect, audio = forward_input 218 | 219 | # Upsample spectrogram to size of audio 220 | spect = self.upsample(spect) 221 | assert(spect.size(2) >= audio.size(1)) 222 | if spect.size(2) > audio.size(1): 223 | spect = spect[:, :, :audio.size(1)] 224 | 225 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 226 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 227 | 228 | audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) 229 | output_audio = [] 230 | log_s_list = [] 231 | log_det_W_list = [] 232 | 233 | for k in range(self.n_flows): 234 | if k % self.n_early_every == 0 and k > 0: 235 | output_audio.append(audio[:, :self.n_early_size, :]) 236 | audio = audio[:, self.n_early_size:, :] 237 | 238 | audio, log_det_W = self.convinv[k](audio) 239 | log_det_W_list.append(log_det_W) 240 | 241 | n_half = int(audio.size(1)/2) 242 | audio_0 = audio[:, :n_half, :] 243 | audio_1 = audio[:, n_half:, :] 244 | 245 | output = self.WN[k]((audio_0, spect)) 246 | log_s = output[:, n_half:, :] 247 | b = output[:, :n_half, :] 248 | audio_1 = torch.exp(log_s)*audio_1 + b 249 | log_s_list.append(log_s) 250 | 251 | audio = torch.cat([audio_0, audio_1], 1) 252 | 253 | output_audio.append(audio) 254 | return torch.cat(output_audio, 1), log_s_list, log_det_W_list 255 | 256 | def infer(self, spect, sigma=1.0): 257 | spect = self.upsample(spect) 258 | # trim conv artifacts. 
maybe pad spec to kernel multiple 259 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 260 | spect = spect[:, :, :-time_cutoff] 261 | 262 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 263 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 264 | 265 | if spect.type() == 'torch.cuda.HalfTensor': 266 | audio = torch.cuda.HalfTensor(spect.size(0), 267 | self.n_remaining_channels, 268 | spect.size(2)).normal_() 269 | else: 270 | audio = torch.cuda.FloatTensor(spect.size(0), 271 | self.n_remaining_channels, 272 | spect.size(2)).normal_() 273 | 274 | audio = torch.autograd.Variable(sigma*audio) 275 | 276 | for k in reversed(range(self.n_flows)): 277 | n_half = int(audio.size(1)/2) 278 | audio_0 = audio[:, :n_half, :] 279 | audio_1 = audio[:, n_half:, :] 280 | 281 | output = self.WN[k]((audio_0, spect)) 282 | s = output[:, n_half:, :] 283 | b = output[:, :n_half, :] 284 | audio_1 = (audio_1 - b)/torch.exp(s) 285 | audio = torch.cat([audio_0, audio_1], 1) 286 | 287 | audio = self.convinv[k](audio, reverse=True) 288 | 289 | if k % self.n_early_every == 0 and k > 0: 290 | if spect.type() == 'torch.cuda.HalfTensor': 291 | z = torch.cuda.HalfTensor(spect.size( 292 | 0), self.n_early_size, spect.size(2)).normal_() 293 | else: 294 | z = torch.cuda.FloatTensor(spect.size( 295 | 0), self.n_early_size, spect.size(2)).normal_() 296 | audio = torch.cat((sigma*z, audio), 1) 297 | 298 | audio = audio.permute(0, 2, 1).contiguous().view( 299 | audio.size(0), -1).data 300 | return audio 301 | 302 | @staticmethod 303 | def remove_weightnorm(model): 304 | waveglow = model 305 | for WN in waveglow.WN: 306 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 307 | WN.in_layers = remove(WN.in_layers) 308 | WN.cond_layers = remove(WN.cond_layers) 309 | WN.res_skip_layers = remove(WN.res_skip_layers) 310 | return waveglow 311 | 312 | 313 | def remove(conv_list): 314 | new_conv_list = torch.nn.ModuleList() 315 | for old_conv in conv_list: 316 | old_conv = torch.nn.utils.remove_weight_norm(old_conv) 317 | new_conv_list.append(old_conv) 318 | return new_conv_list 319 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | # Mel 2 | num_mels = 80 3 | text_cleaners = ['english_cleaners'] 4 | 5 | # FastSpeech 6 | vocab_size = 300 7 | max_seq_len = 3000 8 | 9 | encoder_dim = 256 10 | encoder_n_layer = 4 11 | encoder_head = 2 12 | encoder_conv1d_filter_size = 1024 13 | 14 | decoder_dim = 256 15 | decoder_n_layer = 4 16 | decoder_head = 2 17 | decoder_conv1d_filter_size = 1024 18 | 19 | fft_conv1d_kernel = (9, 1) 20 | fft_conv1d_padding = (4, 0) 21 | 22 | duration_predictor_filter_size = 256 23 | duration_predictor_kernel_size = 3 24 | dropout = 0.1 25 | 26 | # Train 27 | checkpoint_path = "./model_new" 28 | logger_path = "./logger" 29 | mel_ground_truth = "./mels" 30 | alignment_path = "./alignments" 31 | 32 | batch_size = 32 33 | epochs = 2000 34 | n_warm_up_step = 4000 35 | 36 | learning_rate = 1e-3 37 | weight_decay = 1e-6 38 | grad_clip_thresh = 1.0 39 | decay_step = [500000, 1000000, 2000000] 40 | 41 | save_step = 3000 42 | log_step = 5 43 | clear_Time = 20 44 | 45 | batch_expand_size = 32 46 | -------------------------------------------------------------------------------- /img/fastspeech_structure.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/img/fastspeech_structure.png -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class DNNLoss(nn.Module): 6 | def __init__(self): 7 | super(DNNLoss, self).__init__() 8 | self.mse_loss = nn.MSELoss() 9 | self.l1_loss = nn.L1Loss() 10 | 11 | def forward(self, mel, mel_postnet, duration_predicted, mel_target, duration_predictor_target): 12 | mel_target.requires_grad = False 13 | mel_loss = self.mse_loss(mel, mel_target) 14 | mel_postnet_loss = self.mse_loss(mel_postnet, mel_target) 15 | 16 | duration_predictor_target.requires_grad = False 17 | duration_predictor_loss = self.l1_loss(duration_predicted, 18 | duration_predictor_target.float()) 19 | 20 | return mel_loss, mel_postnet_loss, duration_predictor_loss 21 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import hparams as hp 4 | import utils 5 | 6 | from transformer.Models import Encoder, Decoder 7 | from transformer.Layers import Linear, PostNet 8 | from modules import LengthRegulator, CBHG 9 | 10 | 11 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 12 | 13 | 14 | class FastSpeech(nn.Module): 15 | """ FastSpeech """ 16 | 17 | def __init__(self): 18 | super(FastSpeech, self).__init__() 19 | 20 | self.encoder = Encoder() 21 | self.length_regulator = LengthRegulator() 22 | self.decoder = Decoder() 23 | 24 | self.mel_linear = Linear(hp.decoder_dim, hp.num_mels) 25 | self.postnet = CBHG(hp.num_mels, K=8, 26 | projections=[256, hp.num_mels]) 27 | self.last_linear = Linear(hp.num_mels * 2, hp.num_mels) 28 | 29 | def mask_tensor(self, mel_output, position, mel_max_length): 30 | lengths = torch.max(position, -1)[0] 31 | mask = ~utils.get_mask_from_lengths(lengths, max_len=mel_max_length) 32 | mask = mask.unsqueeze(-1).expand(-1, -1, mel_output.size(-1)) 33 | return mel_output.masked_fill(mask, 0.) 
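    # During training, forward() expands the encoder output with the ground-truth
    # durations (length_target) and also returns the duration predictor's output for
    # the duration loss; at inference it expands with the predicted durations, scaled
    # by `alpha` to control the speaking rate.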
34 | 35 | def forward(self, src_seq, src_pos, mel_pos=None, mel_max_length=None, length_target=None, alpha=1.0): 36 | encoder_output, _ = self.encoder(src_seq, src_pos) 37 | 38 | if self.training: 39 | length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, 40 | target=length_target, 41 | alpha=alpha, 42 | mel_max_length=mel_max_length) 43 | decoder_output = self.decoder(length_regulator_output, mel_pos) 44 | 45 | mel_output = self.mel_linear(decoder_output) 46 | mel_output = self.mask_tensor(mel_output, mel_pos, mel_max_length) 47 | residual = self.postnet(mel_output) 48 | residual = self.last_linear(residual) 49 | mel_postnet_output = mel_output + residual 50 | mel_postnet_output = self.mask_tensor(mel_postnet_output, 51 | mel_pos, 52 | mel_max_length) 53 | 54 | return mel_output, mel_postnet_output, duration_predictor_output 55 | else: 56 | length_regulator_output, decoder_pos = self.length_regulator(encoder_output, 57 | alpha=alpha) 58 | 59 | decoder_output = self.decoder(length_regulator_output, decoder_pos) 60 | 61 | mel_output = self.mel_linear(decoder_output) 62 | residual = self.postnet(mel_output) 63 | residual = self.last_linear(residual) 64 | mel_postnet_output = mel_output + residual 65 | 66 | return mel_output, mel_postnet_output 67 | 68 | 69 | if __name__ == "__main__": 70 | # Test 71 | model = FastSpeech() 72 | print(sum(param.numel() for param in model.parameters())) 73 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from collections import OrderedDict 6 | from numba import jit 7 | import numpy as np 8 | import copy 9 | import math 10 | 11 | import hparams as hp 12 | import utils 13 | 14 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 15 | 16 | 17 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 18 | ''' Sinusoid position encoding table ''' 19 | 20 | def cal_angle(position, hid_idx): 21 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 22 | 23 | def get_posi_angle_vec(position): 24 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 25 | 26 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) 27 | for pos_i in range(n_position)]) 28 | 29 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 30 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 31 | 32 | if padding_idx is not None: 33 | # zero vector for padding dimension 34 | sinusoid_table[padding_idx] = 0. 
35 | 36 | return torch.FloatTensor(sinusoid_table) 37 | 38 | 39 | def clones(module, N): 40 | return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) 41 | 42 | 43 | # @jit(nopython=True) 44 | def create_alignment(base_mat, duration_predictor_output): 45 | N, L = duration_predictor_output.shape 46 | for i in range(N): 47 | count = 0 48 | for j in range(L): 49 | for k in range(duration_predictor_output[i][j]): 50 | base_mat[i][count+k][j] = 1 51 | count = count + duration_predictor_output[i][j] 52 | return base_mat 53 | 54 | 55 | class LengthRegulator(nn.Module): 56 | """ Length Regulator """ 57 | 58 | def __init__(self): 59 | super(LengthRegulator, self).__init__() 60 | self.duration_predictor = DurationPredictor() 61 | 62 | def LR(self, x, duration_predictor_output, mel_max_length=None): 63 | expand_max_len = torch.max( 64 | torch.sum(duration_predictor_output, -1), -1)[0] 65 | alignment = torch.zeros(duration_predictor_output.size(0), 66 | expand_max_len, 67 | duration_predictor_output.size(1)).numpy() 68 | alignment = create_alignment(alignment, 69 | duration_predictor_output.cpu().numpy()) 70 | alignment = torch.from_numpy(alignment).to(device) 71 | 72 | output = alignment @ x 73 | if mel_max_length: 74 | output = F.pad( 75 | output, (0, 0, 0, mel_max_length-output.size(1), 0, 0)) 76 | return output 77 | 78 | def forward(self, x, alpha=1.0, target=None, mel_max_length=None): 79 | duration_predictor_output = self.duration_predictor(x) 80 | 81 | if target is not None: 82 | output = self.LR(x, target, mel_max_length=mel_max_length) 83 | return output, duration_predictor_output 84 | else: 85 | duration_predictor_output = ( 86 | (duration_predictor_output + 0.5) * alpha).int() 87 | output = self.LR(x, duration_predictor_output) 88 | mel_pos = torch.stack( 89 | [torch.Tensor([i+1 for i in range(output.size(1))])]).long().to(device) 90 | 91 | return output, mel_pos 92 | 93 | 94 | class DurationPredictor(nn.Module): 95 | """ Duration Predictor """ 96 | 97 | def __init__(self): 98 | super(DurationPredictor, self).__init__() 99 | 100 | self.input_size = hp.encoder_dim 101 | self.filter_size = hp.duration_predictor_filter_size 102 | self.kernel = hp.duration_predictor_kernel_size 103 | self.conv_output_size = hp.duration_predictor_filter_size 104 | self.dropout = hp.dropout 105 | 106 | self.conv_layer = nn.Sequential(OrderedDict([ 107 | ("conv1d_1", Conv(self.input_size, 108 | self.filter_size, 109 | kernel_size=self.kernel, 110 | padding=1)), 111 | ("layer_norm_1", nn.LayerNorm(self.filter_size)), 112 | ("relu_1", nn.ReLU()), 113 | ("dropout_1", nn.Dropout(self.dropout)), 114 | ("conv1d_2", Conv(self.filter_size, 115 | self.filter_size, 116 | kernel_size=self.kernel, 117 | padding=1)), 118 | ("layer_norm_2", nn.LayerNorm(self.filter_size)), 119 | ("relu_2", nn.ReLU()), 120 | ("dropout_2", nn.Dropout(self.dropout)) 121 | ])) 122 | 123 | self.linear_layer = Linear(self.conv_output_size, 1) 124 | self.relu = nn.ReLU() 125 | 126 | def forward(self, encoder_output): 127 | out = self.conv_layer(encoder_output) 128 | out = self.linear_layer(out) 129 | out = self.relu(out) 130 | out = out.squeeze() 131 | if not self.training: 132 | out = out.unsqueeze(0) 133 | return out 134 | 135 | 136 | class BatchNormConv1d(nn.Module): 137 | def __init__(self, in_dim, out_dim, kernel_size, stride, padding, 138 | activation=None, w_init_gain='linear'): 139 | super(BatchNormConv1d, self).__init__() 140 | self.conv1d = nn.Conv1d(in_dim, out_dim, 141 | kernel_size=kernel_size, 142 | stride=stride, padding=padding, 
bias=False) 143 | self.bn = nn.BatchNorm1d(out_dim) 144 | self.activation = activation 145 | 146 | torch.nn.init.xavier_uniform_( 147 | self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 148 | 149 | def forward(self, x): 150 | x = self.conv1d(x) 151 | if self.activation is not None: 152 | x = self.activation(x) 153 | return self.bn(x) 154 | 155 | 156 | class Conv(nn.Module): 157 | """ 158 | Convolution Module 159 | """ 160 | 161 | def __init__(self, 162 | in_channels, 163 | out_channels, 164 | kernel_size=1, 165 | stride=1, 166 | padding=0, 167 | dilation=1, 168 | bias=True, 169 | w_init='linear'): 170 | """ 171 | :param in_channels: dimension of input 172 | :param out_channels: dimension of output 173 | :param kernel_size: size of kernel 174 | :param stride: size of stride 175 | :param padding: size of padding 176 | :param dilation: dilation rate 177 | :param bias: boolean. if True, bias is included. 178 | :param w_init: str. weight inits with xavier initialization. 179 | """ 180 | super(Conv, self).__init__() 181 | 182 | self.conv = nn.Conv1d(in_channels, 183 | out_channels, 184 | kernel_size=kernel_size, 185 | stride=stride, 186 | padding=padding, 187 | dilation=dilation, 188 | bias=bias) 189 | 190 | nn.init.xavier_uniform_( 191 | self.conv.weight, gain=nn.init.calculate_gain(w_init)) 192 | 193 | def forward(self, x): 194 | x = x.contiguous().transpose(1, 2) 195 | x = self.conv(x) 196 | x = x.contiguous().transpose(1, 2) 197 | 198 | return x 199 | 200 | 201 | class Linear(nn.Module): 202 | """ 203 | Linear Module 204 | """ 205 | 206 | def __init__(self, in_dim, out_dim, bias=True, w_init='linear'): 207 | """ 208 | :param in_dim: dimension of input 209 | :param out_dim: dimension of output 210 | :param bias: boolean. if True, bias is included. 211 | :param w_init: str. weight inits with xavier initialization. 
212 | """ 213 | super(Linear, self).__init__() 214 | self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias) 215 | 216 | nn.init.xavier_uniform_( 217 | self.linear_layer.weight, 218 | gain=nn.init.calculate_gain(w_init)) 219 | 220 | def forward(self, x): 221 | return self.linear_layer(x) 222 | 223 | 224 | class Highway(nn.Module): 225 | def __init__(self, in_size, out_size): 226 | super(Highway, self).__init__() 227 | self.H = nn.Linear(in_size, out_size) 228 | self.H.bias.data.zero_() 229 | self.T = nn.Linear(in_size, out_size) 230 | self.T.bias.data.fill_(-1) 231 | self.relu = nn.ReLU() 232 | self.sigmoid = nn.Sigmoid() 233 | 234 | def forward(self, inputs): 235 | H = self.relu(self.H(inputs)) 236 | T = self.sigmoid(self.T(inputs)) 237 | return H * T + inputs * (1.0 - T) 238 | 239 | 240 | class Prenet(nn.Module): 241 | """ 242 | Prenet before passing through the network 243 | """ 244 | 245 | def __init__(self, input_size, hidden_size, output_size): 246 | super(Prenet, self).__init__() 247 | self.input_size = input_size 248 | self.output_size = output_size 249 | self.hidden_size = hidden_size 250 | self.layer = nn.Sequential(OrderedDict([ 251 | ('fc1', Linear(self.input_size, self.hidden_size)), 252 | ('relu1', nn.ReLU()), 253 | ('dropout1', nn.Dropout(0.5)), 254 | ('fc2', Linear(self.hidden_size, self.output_size)), 255 | ('relu2', nn.ReLU()), 256 | ('dropout2', nn.Dropout(0.5)), 257 | ])) 258 | 259 | def forward(self, x): 260 | out = self.layer(x) 261 | return out 262 | 263 | 264 | class CBHG(nn.Module): 265 | """CBHG module: a recurrent neural network composed of: 266 | - 1-d convolution banks 267 | - Highway networks + residual connections 268 | - Bidirectional gated recurrent units 269 | """ 270 | 271 | def __init__(self, in_dim, K=16, projections=[128, 128]): 272 | super(CBHG, self).__init__() 273 | self.in_dim = in_dim 274 | self.relu = nn.ReLU() 275 | self.conv1d_banks = nn.ModuleList( 276 | [BatchNormConv1d(in_dim, in_dim, kernel_size=k, stride=1, 277 | padding=k // 2, activation=self.relu) 278 | for k in range(1, K + 1)]) 279 | self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1) 280 | 281 | in_sizes = [K * in_dim] + projections[:-1] 282 | activations = [self.relu] * (len(projections) - 1) + [None] 283 | self.conv1d_projections = nn.ModuleList( 284 | [BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1, 285 | padding=1, activation=ac) 286 | for (in_size, out_size, ac) in zip( 287 | in_sizes, projections, activations)]) 288 | 289 | self.pre_highway = nn.Linear(projections[-1], in_dim, bias=False) 290 | self.highways = nn.ModuleList( 291 | [Highway(in_dim, in_dim) for _ in range(4)]) 292 | 293 | self.gru = nn.GRU( 294 | in_dim, in_dim, 1, batch_first=True, bidirectional=True) 295 | 296 | def forward(self, inputs, input_lengths=None): 297 | # (B, T_in, in_dim) 298 | x = inputs 299 | 300 | # Needed to perform conv1d on time-axis 301 | # (B, in_dim, T_in) 302 | if x.size(-1) == self.in_dim: 303 | x = x.transpose(1, 2) 304 | 305 | T = x.size(-1) 306 | 307 | # (B, in_dim*K, T_in) 308 | # Concat conv1d bank outputs 309 | x = torch.cat([conv1d(x)[:, :, :T] 310 | for conv1d in self.conv1d_banks], dim=1) 311 | assert x.size(1) == self.in_dim * len(self.conv1d_banks) 312 | x = self.max_pool1d(x)[:, :, :T] 313 | 314 | for conv1d in self.conv1d_projections: 315 | x = conv1d(x) 316 | 317 | # (B, T_in, in_dim) 318 | # Back to the original shape 319 | x = x.transpose(1, 2) 320 | 321 | if x.size(-1) != self.in_dim: 322 | x = self.pre_highway(x) 323 | 324 | # Residual 
connection 325 | x += inputs 326 | for highway in self.highways: 327 | x = highway(x) 328 | 329 | if input_lengths is not None: 330 | x = nn.utils.rnn.pack_padded_sequence( 331 | x, input_lengths, batch_first=True) 332 | 333 | # (B, T_in, in_dim*2) 334 | self.gru.flatten_parameters() 335 | outputs, _ = self.gru(x) 336 | 337 | if input_lengths is not None: 338 | outputs, _ = nn.utils.rnn.pad_packed_sequence( 339 | outputs, batch_first=True) 340 | 341 | return outputs 342 | 343 | 344 | if __name__ == "__main__": 345 | # TEST 346 | a = torch.Tensor([[2, 3, 4], [1, 2, 3]]) 347 | b = torch.Tensor([[5, 6, 7], [7, 8, 9]]) 348 | c = torch.stack([a, b]) 349 | 350 | d = torch.Tensor([[1, 4], [6, 3]]).int() 351 | expand_max_len = torch.max(torch.sum(d, -1), -1)[0] 352 | base = torch.zeros(c.size(0), expand_max_len, c.size(1)) 353 | 354 | alignment = create_alignment(base.numpy(), d.numpy()) 355 | print(alignment) 356 | print(torch.from_numpy(alignment) @ c) 357 | -------------------------------------------------------------------------------- /optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ScheduledOptim(): 5 | ''' A simple wrapper class for learning rate scheduling ''' 6 | 7 | def __init__(self, optimizer, d_model, n_warmup_steps, current_steps): 8 | self._optimizer = optimizer 9 | self.n_warmup_steps = n_warmup_steps 10 | self.n_current_steps = current_steps 11 | self.init_lr = np.power(d_model, -0.5) 12 | 13 | def step_and_update_lr_frozen(self, learning_rate_frozen): 14 | for param_group in self._optimizer.param_groups: 15 | param_group['lr'] = learning_rate_frozen 16 | self._optimizer.step() 17 | 18 | def step_and_update_lr(self): 19 | self._update_learning_rate() 20 | self._optimizer.step() 21 | 22 | def get_learning_rate(self): 23 | learning_rate = 0.0 24 | for param_group in self._optimizer.param_groups: 25 | learning_rate = param_group['lr'] 26 | 27 | return learning_rate 28 | 29 | def zero_grad(self): 30 | # print(self.init_lr) 31 | self._optimizer.zero_grad() 32 | 33 | def _get_lr_scale(self): 34 | return np.min([ 35 | np.power(self.n_current_steps, -0.5), 36 | np.power(self.n_warmup_steps, -1.5) * self.n_current_steps]) 37 | 38 | def _update_learning_rate(self): 39 | ''' Learning rate scheduling per step ''' 40 | self.n_current_steps += 1 41 | lr = self.init_lr * self._get_lr_scale() 42 | 43 | for param_group in self._optimizer.param_groups: 44 | param_group['lr'] = lr 45 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import shutil 4 | import os 5 | 6 | from data import ljspeech 7 | import hparams as hp 8 | 9 | 10 | def preprocess_ljspeech(filename): 11 | in_dir = filename 12 | out_dir = hp.mel_ground_truth 13 | if not os.path.exists(out_dir): 14 | os.makedirs(out_dir, exist_ok=True) 15 | metadata = ljspeech.build_from_path(in_dir, out_dir) 16 | write_metadata(metadata, out_dir) 17 | 18 | shutil.move(os.path.join(hp.mel_ground_truth, "train.txt"), 19 | os.path.join("data", "train.txt")) 20 | 21 | 22 | def write_metadata(metadata, out_dir): 23 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 24 | for m in metadata: 25 | f.write(m + '\n') 26 | 27 | 28 | def main(): 29 | path = os.path.join("data", "LJSpeech-1.1") 30 | preprocess_ljspeech(path) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | 
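# Usage note: `python3 preprocess.py` expects the extracted corpus at data/LJSpeech-1.1
# (metadata.csv plus wavs/), writes one mel-spectrogram .npy per utterance into ./mels
# (hp.mel_ground_truth), and moves the generated transcript list to data/train.txt,
# which dataset.py reads at training time.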
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.2.0 2 | numpy==1.22.0 3 | scipy==1.4.1 4 | librosa==0.7.2 5 | inflect==0.2.5 6 | numba==0.48.0 -------------------------------------------------------------------------------- /sample/135000_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_0.wav -------------------------------------------------------------------------------- /sample/135000_0_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_0_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_1.wav -------------------------------------------------------------------------------- /sample/135000_1_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_1_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_2.wav -------------------------------------------------------------------------------- /sample/135000_2_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_2_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_3.wav -------------------------------------------------------------------------------- /sample/135000_3_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_3_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_4.wav -------------------------------------------------------------------------------- /sample/135000_4_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_4_waveglow.wav -------------------------------------------------------------------------------- /sample/135000_5.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_5.wav -------------------------------------------------------------------------------- /sample/135000_5_waveglow.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xcmyz/FastSpeech/1e1a61b5015c951caa551b7fab4080339d697c7c/sample/135000_5_waveglow.wav -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | from text import cleaners 4 | from text.symbols import symbols 5 | 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | # Regular expression matching text enclosed in curly braces: 12 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 13 | 14 | 15 | def text_to_sequence(text, cleaner_names): 16 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 17 | 18 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 19 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 20 | 21 | Args: 22 | text: string to convert to a sequence 23 | cleaner_names: names of the cleaner functions to run the text through 24 | 25 | Returns: 26 | List of integers corresponding to the symbols in the text 27 | ''' 28 | sequence = [] 29 | 30 | # Check for curly braces and treat their contents as ARPAbet: 31 | while len(text): 32 | m = _curly_re.match(text) 33 | if not m: 34 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 35 | break 36 | sequence += _symbols_to_sequence( 37 | _clean_text(m.group(1), cleaner_names)) 38 | sequence += _arpabet_to_sequence(m.group(2)) 39 | text = m.group(3) 40 | 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. 
"english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | import re 18 | from unidecode import unidecode 19 | from .numbers import normalize_numbers 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | def convert_to_ascii(text): 64 | return unidecode(text) 65 | 66 | 67 | def basic_cleaners(text): 68 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 69 | text = lowercase(text) 70 | text = collapse_whitespace(text) 71 | return text 72 | 73 | 74 | def transliteration_cleaners(text): 75 | '''Pipeline for non-English text that transliterates to ASCII.''' 76 | text = convert_to_ascii(text) 77 | text = lowercase(text) 78 | text = collapse_whitespace(text) 79 | return text 80 | 81 | 82 | def english_cleaners(text): 83 | '''Pipeline for English text, including number and abbreviation expansion.''' 84 | text = convert_to_ascii(text) 85 | text = lowercase(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | 22 | def __init__(self, file_or_path, keep_ambiguous=True): 23 | if isinstance(file_or_path, str): 24 | with open(file_or_path, encoding='latin-1') as f: 25 | entries = _parse_cmudict(f) 26 | else: 27 | entries = _parse_cmudict(file_or_path) 28 | if not keep_ambiguous: 29 | entries = {word: pron for word, 30 | pron in entries.items() if len(pron) == 1} 31 | self._entries = entries 32 | 33 | def __len__(self): 34 | return len(self._entries) 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | _alt_re = re.compile(r'\([0-9]+\)') 42 | 43 | 44 | def _parse_cmudict(file): 45 | cmudict = {} 46 | for line in file: 47 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 48 | parts = line.split(' ') 49 | word = re.sub(_alt_re, '', parts[0]) 50 | pronunciation = _get_pronunciation(parts[1]) 51 | if pronunciation: 52 | if word in cmudict: 53 | cmudict[word].append(pronunciation) 54 | else: 55 | cmudict[word] = [pronunciation] 56 | return cmudict 57 | 58 | 59 | def _get_pronunciation(s): 60 | parts = s.strip().split(' ') 61 | for part in parts: 62 | if part not in _valid_symbol_set: 63 | return None 64 | return ' '.join(parts) 65 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, 
_remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | 8 | from text import cmudict 9 | _pad = '_' 10 | _punctuation = '!\'(),.:;? ' 11 | _special = '-' 12 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | symbols = [_pad] + list(_special) + list(_punctuation) + \ 19 | list(_letters) + _arpabet 20 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from multiprocessing import cpu_count 6 | import numpy as np 7 | import argparse 8 | import os 9 | import time 10 | import math 11 | 12 | from model import FastSpeech 13 | from loss import DNNLoss 14 | from dataset import BufferDataset, DataLoader 15 | from dataset import get_data_to_buffer, collate_fn_tensor 16 | from optimizer import ScheduledOptim 17 | import hparams as hp 18 | import utils 19 | 20 | 21 | def main(args): 22 | # Get device 23 | device = torch.device('cuda'if torch.cuda.is_available()else 'cpu') 24 | 25 | # Define model 26 | print("Use FastSpeech") 27 | model = nn.DataParallel(FastSpeech()).to(device) 28 | print("Model Has Been Defined") 29 | num_param = utils.get_param_num(model) 30 | print('Number of TTS Parameters:', num_param) 31 | # Get buffer 32 | print("Load data to buffer") 33 | buffer = get_data_to_buffer() 34 | 35 | # Optimizer and loss 36 | optimizer = torch.optim.Adam(model.parameters(), 37 | betas=(0.9, 0.98), 38 | eps=1e-9) 39 | scheduled_optim = ScheduledOptim(optimizer, 40 | hp.decoder_dim, 41 | hp.n_warm_up_step, 42 | args.restore_step) 43 | fastspeech_loss = DNNLoss().to(device) 44 | print("Defined Optimizer and Loss Function.") 45 | 46 | # Load checkpoint if exists 47 | try: 48 | checkpoint = torch.load(os.path.join( 49 | hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step)) 50 | model.load_state_dict(checkpoint['model']) 51 | optimizer.load_state_dict(checkpoint['optimizer']) 52 | print("\n---Model Restored at Step %d---\n" % args.restore_step) 53 | except: 54 | print("\n---Start New Training---\n") 55 | if not os.path.exists(hp.checkpoint_path): 56 | os.mkdir(hp.checkpoint_path) 57 | 58 | # Init logger 59 | if not os.path.exists(hp.logger_path): 60 | os.mkdir(hp.logger_path) 61 | 62 | # Get dataset 63 | dataset = BufferDataset(buffer) 64 | 65 | # Get Training Loader 66 | training_loader = DataLoader(dataset, 67 | batch_size=hp.batch_expand_size * hp.batch_size, 68 | shuffle=True, 69 | 
collate_fn=collate_fn_tensor, 70 | drop_last=True, 71 | num_workers=0) 72 | total_step = hp.epochs * len(training_loader) * hp.batch_expand_size 73 | 74 | # Define Some Information 75 | Time = np.array([]) 76 | Start = time.perf_counter() 77 | 78 | # Training 79 | model = model.train() 80 | 81 | for epoch in range(hp.epochs): 82 | for i, batchs in enumerate(training_loader): 83 | # real batch start here 84 | for j, db in enumerate(batchs): 85 | start_time = time.perf_counter() 86 | 87 | current_step = i * hp.batch_expand_size + j + args.restore_step + \ 88 | epoch * len(training_loader) * hp.batch_expand_size + 1 89 | 90 | # Init 91 | scheduled_optim.zero_grad() 92 | 93 | # Get Data 94 | character = db["text"].long().to(device) 95 | mel_target = db["mel_target"].float().to(device) 96 | duration = db["duration"].int().to(device) 97 | mel_pos = db["mel_pos"].long().to(device) 98 | src_pos = db["src_pos"].long().to(device) 99 | max_mel_len = db["mel_max_len"] 100 | 101 | # Forward 102 | mel_output, mel_postnet_output, duration_predictor_output = model(character, 103 | src_pos, 104 | mel_pos=mel_pos, 105 | mel_max_length=max_mel_len, 106 | length_target=duration) 107 | 108 | # Cal Loss 109 | mel_loss, mel_postnet_loss, duration_loss = fastspeech_loss(mel_output, 110 | mel_postnet_output, 111 | duration_predictor_output, 112 | mel_target, 113 | duration) 114 | total_loss = mel_loss + mel_postnet_loss + duration_loss 115 | 116 | # Logger 117 | t_l = total_loss.item() 118 | m_l = mel_loss.item() 119 | m_p_l = mel_postnet_loss.item() 120 | d_l = duration_loss.item() 121 | 122 | with open(os.path.join("logger", "total_loss.txt"), "a") as f_total_loss: 123 | f_total_loss.write(str(t_l)+"\n") 124 | 125 | with open(os.path.join("logger", "mel_loss.txt"), "a") as f_mel_loss: 126 | f_mel_loss.write(str(m_l)+"\n") 127 | 128 | with open(os.path.join("logger", "mel_postnet_loss.txt"), "a") as f_mel_postnet_loss: 129 | f_mel_postnet_loss.write(str(m_p_l)+"\n") 130 | 131 | with open(os.path.join("logger", "duration_loss.txt"), "a") as f_d_loss: 132 | f_d_loss.write(str(d_l)+"\n") 133 | 134 | # Backward 135 | total_loss.backward() 136 | 137 | # Clipping gradients to avoid gradient explosion 138 | nn.utils.clip_grad_norm_( 139 | model.parameters(), hp.grad_clip_thresh) 140 | 141 | # Update weights 142 | if args.frozen_learning_rate: 143 | scheduled_optim.step_and_update_lr_frozen( 144 | args.learning_rate_frozen) 145 | else: 146 | scheduled_optim.step_and_update_lr() 147 | 148 | # Print 149 | if current_step % hp.log_step == 0: 150 | Now = time.perf_counter() 151 | 152 | str1 = "Epoch [{}/{}], Step [{}/{}]:".format( 153 | epoch+1, hp.epochs, current_step, total_step) 154 | str2 = "Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f}, Duration Loss: {:.4f};".format( 155 | m_l, m_p_l, d_l) 156 | str3 = "Current Learning Rate is {:.6f}.".format( 157 | scheduled_optim.get_learning_rate()) 158 | str4 = "Time Used: {:.3f}s, Estimated Time Remaining: {:.3f}s.".format( 159 | (Now-Start), (total_step-current_step)*np.mean(Time)) 160 | 161 | print("\n" + str1) 162 | print(str2) 163 | print(str3) 164 | print(str4) 165 | 166 | with open(os.path.join("logger", "logger.txt"), "a") as f_logger: 167 | f_logger.write(str1 + "\n") 168 | f_logger.write(str2 + "\n") 169 | f_logger.write(str3 + "\n") 170 | f_logger.write(str4 + "\n") 171 | f_logger.write("\n") 172 | 173 | if current_step % hp.save_step == 0: 174 | torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict( 175 | )}, os.path.join(hp.checkpoint_path, 
'checkpoint_%d.pth.tar' % current_step)) 176 | print("save model at step %d ..." % current_step) 177 | 178 | end_time = time.perf_counter() 179 | Time = np.append(Time, end_time - start_time) 180 | if len(Time) == hp.clear_Time: 181 | temp_value = np.mean(Time) 182 | Time = np.delete( 183 | Time, [i for i in range(len(Time))], axis=None) 184 | Time = np.append(Time, temp_value) 185 | 186 | 187 | if __name__ == "__main__": 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument('--restore_step', type=int, default=0) 190 | parser.add_argument('--frozen_learning_rate', type=bool, default=False) 191 | parser.add_argument("--learning_rate_frozen", type=float, default=1e-3) 192 | args = parser.parse_args() 193 | main(args) 194 | -------------------------------------------------------------------------------- /transformer/Constants.py: -------------------------------------------------------------------------------- 1 | PAD = 0 2 | UNK = 1 3 | BOS = 2 4 | EOS = 3 5 | 6 | PAD_WORD = '<blank>' 7 | UNK_WORD = '<unk>' 8 | BOS_WORD = '<s>' 9 | EOS_WORD = '</s>' 10 | -------------------------------------------------------------------------------- /transformer/Layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | import numpy as np 5 | from collections import OrderedDict 6 | 7 | from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward 8 | from text.symbols import symbols 9 | 10 | 11 | class Linear(nn.Module): 12 | """ 13 | Linear Module 14 | """ 15 | 16 | def __init__(self, in_dim, out_dim, bias=True, w_init='linear'): 17 | """ 18 | :param in_dim: dimension of input 19 | :param out_dim: dimension of output 20 | :param bias: boolean. if True, bias is included. 21 | :param w_init: str. weight inits with xavier initialization. 22 | """ 23 | super(Linear, self).__init__() 24 | self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias) 25 | 26 | nn.init.xavier_uniform_( 27 | self.linear_layer.weight, 28 | gain=nn.init.calculate_gain(w_init)) 29 | 30 | def forward(self, x): 31 | return self.linear_layer(x) 32 | 33 | 34 | class PreNet(nn.Module): 35 | """ 36 | Pre Net before passing through the network 37 | """ 38 | 39 | def __init__(self, input_size, hidden_size, output_size, p=0.5): 40 | """ 41 | :param input_size: dimension of input 42 | :param hidden_size: dimension of hidden unit 43 | :param output_size: dimension of output 44 | """ 45 | super(PreNet, self).__init__() 46 | self.input_size = input_size 47 | self.output_size = output_size 48 | self.hidden_size = hidden_size 49 | self.layer = nn.Sequential(OrderedDict([ 50 | ('fc1', Linear(self.input_size, self.hidden_size)), 51 | ('relu1', nn.ReLU()), 52 | ('dropout1', nn.Dropout(p)), 53 | ('fc2', Linear(self.hidden_size, self.output_size)), 54 | ('relu2', nn.ReLU()), 55 | ('dropout2', nn.Dropout(p)), 56 | ])) 57 | 58 | def forward(self, input_): 59 | 60 | out = self.layer(input_) 61 | 62 | return out 63 | 64 | 65 | class Conv(nn.Module): 66 | """ 67 | Convolution Module 68 | """ 69 | 70 | def __init__(self, 71 | in_channels, 72 | out_channels, 73 | kernel_size=1, 74 | stride=1, 75 | padding=0, 76 | dilation=1, 77 | bias=True, 78 | w_init='linear'): 79 | """ 80 | :param in_channels: dimension of input 81 | :param out_channels: dimension of output 82 | :param kernel_size: size of kernel 83 | :param stride: size of stride 84 | :param padding: size of padding 85 | :param dilation: dilation rate 86 | :param bias: boolean. 
if True, bias is included. 87 | :param w_init: str. weight inits with xavier initialization. 88 | """ 89 | super(Conv, self).__init__() 90 | 91 | self.conv = nn.Conv1d(in_channels, 92 | out_channels, 93 | kernel_size=kernel_size, 94 | stride=stride, 95 | padding=padding, 96 | dilation=dilation, 97 | bias=bias) 98 | 99 | nn.init.xavier_uniform_( 100 | self.conv.weight, gain=nn.init.calculate_gain(w_init)) 101 | 102 | def forward(self, x): 103 | x = self.conv(x) 104 | return x 105 | 106 | 107 | class FFTBlock(torch.nn.Module): 108 | """FFT Block""" 109 | 110 | def __init__(self, 111 | d_model, 112 | d_inner, 113 | n_head, 114 | d_k, 115 | d_v, 116 | dropout=0.1): 117 | super(FFTBlock, self).__init__() 118 | self.slf_attn = MultiHeadAttention( 119 | n_head, d_model, d_k, d_v, dropout=dropout) 120 | self.pos_ffn = PositionwiseFeedForward( 121 | d_model, d_inner, dropout=dropout) 122 | 123 | def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): 124 | enc_output, enc_slf_attn = self.slf_attn( 125 | enc_input, enc_input, enc_input, mask=slf_attn_mask) 126 | enc_output *= non_pad_mask 127 | 128 | enc_output = self.pos_ffn(enc_output) 129 | enc_output *= non_pad_mask 130 | 131 | return enc_output, enc_slf_attn 132 | 133 | 134 | class ConvNorm(torch.nn.Module): 135 | def __init__(self, 136 | in_channels, 137 | out_channels, 138 | kernel_size=1, 139 | stride=1, 140 | padding=None, 141 | dilation=1, 142 | bias=True, 143 | w_init_gain='linear'): 144 | super(ConvNorm, self).__init__() 145 | 146 | if padding is None: 147 | assert(kernel_size % 2 == 1) 148 | padding = int(dilation * (kernel_size - 1) / 2) 149 | 150 | self.conv = torch.nn.Conv1d(in_channels, 151 | out_channels, 152 | kernel_size=kernel_size, 153 | stride=stride, 154 | padding=padding, 155 | dilation=dilation, 156 | bias=bias) 157 | 158 | torch.nn.init.xavier_uniform_( 159 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 160 | 161 | def forward(self, signal): 162 | conv_signal = self.conv(signal) 163 | 164 | return conv_signal 165 | 166 | 167 | class PostNet(nn.Module): 168 | """ 169 | PostNet: Five 1-d convolution with 512 channels and kernel size 5 170 | """ 171 | 172 | def __init__(self, 173 | n_mel_channels=80, 174 | postnet_embedding_dim=512, 175 | postnet_kernel_size=5, 176 | postnet_n_convolutions=5): 177 | 178 | super(PostNet, self).__init__() 179 | self.convolutions = nn.ModuleList() 180 | 181 | self.convolutions.append( 182 | nn.Sequential( 183 | ConvNorm(n_mel_channels, 184 | postnet_embedding_dim, 185 | kernel_size=postnet_kernel_size, 186 | stride=1, 187 | padding=int((postnet_kernel_size - 1) / 2), 188 | dilation=1, 189 | w_init_gain='tanh'), 190 | 191 | nn.BatchNorm1d(postnet_embedding_dim)) 192 | ) 193 | 194 | for i in range(1, postnet_n_convolutions - 1): 195 | self.convolutions.append( 196 | nn.Sequential( 197 | ConvNorm(postnet_embedding_dim, 198 | postnet_embedding_dim, 199 | kernel_size=postnet_kernel_size, 200 | stride=1, 201 | padding=int((postnet_kernel_size - 1) / 2), 202 | dilation=1, 203 | w_init_gain='tanh'), 204 | 205 | nn.BatchNorm1d(postnet_embedding_dim)) 206 | ) 207 | 208 | self.convolutions.append( 209 | nn.Sequential( 210 | ConvNorm(postnet_embedding_dim, 211 | n_mel_channels, 212 | kernel_size=postnet_kernel_size, 213 | stride=1, 214 | padding=int((postnet_kernel_size - 1) / 2), 215 | dilation=1, 216 | w_init_gain='linear'), 217 | 218 | nn.BatchNorm1d(n_mel_channels)) 219 | ) 220 | 221 | def forward(self, x): 222 | x = x.contiguous().transpose(1, 2) 223 | 224 | for i in 
range(len(self.convolutions) - 1): 225 | x = F.dropout(torch.tanh( 226 | self.convolutions[i](x)), 0.5, self.training) 227 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 228 | 229 | x = x.contiguous().transpose(1, 2) 230 | return x 231 | -------------------------------------------------------------------------------- /transformer/Models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import hparams as hp 5 | 6 | import transformer.Constants as Constants 7 | from transformer.Layers import FFTBlock, PreNet, PostNet, Linear 8 | 9 | 10 | def get_non_pad_mask(seq): 11 | assert seq.dim() == 2 12 | return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1) 13 | 14 | 15 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 16 | ''' Sinusoid position encoding table ''' 17 | 18 | def cal_angle(position, hid_idx): 19 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 20 | 21 | def get_posi_angle_vec(position): 22 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 23 | 24 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) 25 | for pos_i in range(n_position)]) 26 | 27 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 28 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 29 | 30 | if padding_idx is not None: 31 | # zero vector for padding dimension 32 | sinusoid_table[padding_idx] = 0. 33 | 34 | return torch.FloatTensor(sinusoid_table) 35 | 36 | 37 | def get_attn_key_pad_mask(seq_k, seq_q): 38 | ''' For masking out the padding part of key sequence. ''' 39 | 40 | # Expand to fit the shape of key query attention matrix. 41 | len_q = seq_q.size(1) 42 | padding_mask = seq_k.eq(Constants.PAD) 43 | padding_mask = padding_mask.unsqueeze( 44 | 1).expand(-1, len_q, -1) # b x lq x lk 45 | 46 | return padding_mask 47 | 48 | 49 | class Encoder(nn.Module): 50 | ''' Encoder ''' 51 | 52 | def __init__(self, 53 | n_src_vocab=hp.vocab_size, 54 | len_max_seq=hp.vocab_size, 55 | d_word_vec=hp.encoder_dim, 56 | n_layers=hp.encoder_n_layer, 57 | n_head=hp.encoder_head, 58 | d_k=hp.encoder_dim // hp.encoder_head, 59 | d_v=hp.encoder_dim // hp.encoder_head, 60 | d_model=hp.encoder_dim, 61 | d_inner=hp.encoder_conv1d_filter_size, 62 | dropout=hp.dropout): 63 | 64 | super(Encoder, self).__init__() 65 | 66 | n_position = len_max_seq + 1 67 | 68 | self.src_word_emb = nn.Embedding(n_src_vocab, 69 | d_word_vec, 70 | padding_idx=Constants.PAD) 71 | 72 | self.position_enc = nn.Embedding.from_pretrained( 73 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), 74 | freeze=True) 75 | 76 | self.layer_stack = nn.ModuleList([FFTBlock( 77 | d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)]) 78 | 79 | def forward(self, src_seq, src_pos, return_attns=False): 80 | 81 | enc_slf_attn_list = [] 82 | 83 | # -- Prepare masks 84 | slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq) 85 | non_pad_mask = get_non_pad_mask(src_seq) 86 | 87 | # -- Forward 88 | enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos) 89 | 90 | for enc_layer in self.layer_stack: 91 | enc_output, enc_slf_attn = enc_layer( 92 | enc_output, 93 | non_pad_mask=non_pad_mask, 94 | slf_attn_mask=slf_attn_mask) 95 | if return_attns: 96 | enc_slf_attn_list += [enc_slf_attn] 97 | 98 | return enc_output, non_pad_mask 99 | 100 | 101 | class Decoder(nn.Module): 102 | """ Decoder """ 103 | 104 | def 
__init__(self, 105 | len_max_seq=hp.max_seq_len, 106 | n_layers=hp.decoder_n_layer, 107 | n_head=hp.decoder_head, 108 | d_k=hp.decoder_dim // hp.decoder_head, 109 | d_v=hp.decoder_dim // hp.decoder_head, 110 | d_model=hp.decoder_dim, 111 | d_inner=hp.decoder_conv1d_filter_size, 112 | dropout=hp.dropout): 113 | 114 | super(Decoder, self).__init__() 115 | 116 | n_position = len_max_seq + 1 117 | 118 | self.position_enc = nn.Embedding.from_pretrained( 119 | get_sinusoid_encoding_table(n_position, d_model, padding_idx=0), 120 | freeze=True) 121 | 122 | self.layer_stack = nn.ModuleList([FFTBlock( 123 | d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)]) 124 | 125 | def forward(self, enc_seq, enc_pos, return_attns=False): 126 | 127 | dec_slf_attn_list = [] 128 | 129 | # -- Prepare masks 130 | slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) 131 | non_pad_mask = get_non_pad_mask(enc_pos) 132 | 133 | # -- Forward 134 | dec_output = enc_seq + self.position_enc(enc_pos) 135 | 136 | for dec_layer in self.layer_stack: 137 | dec_output, dec_slf_attn = dec_layer( 138 | dec_output, 139 | non_pad_mask=non_pad_mask, 140 | slf_attn_mask=slf_attn_mask) 141 | if return_attns: 142 | dec_slf_attn_list += [dec_slf_attn] 143 | 144 | return dec_output 145 | -------------------------------------------------------------------------------- /transformer/Modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class ScaledDotProductAttention(nn.Module): 7 | ''' Scaled Dot-Product Attention ''' 8 | 9 | def __init__(self, temperature, attn_dropout=0.1): 10 | super().__init__() 11 | self.temperature = temperature 12 | self.dropout = nn.Dropout(attn_dropout) 13 | self.softmax = nn.Softmax(dim=2) 14 | 15 | def forward(self, q, k, v, mask=None): 16 | 17 | attn = torch.bmm(q, k.transpose(1, 2)) 18 | attn = attn / self.temperature 19 | 20 | if mask is not None: 21 | attn = attn.masked_fill(mask, -np.inf) 22 | 23 | attn = self.softmax(attn) 24 | attn = self.dropout(attn) 25 | output = torch.bmm(attn, v) 26 | 27 | return output, attn 28 | -------------------------------------------------------------------------------- /transformer/SubLayers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from transformer.Modules import ScaledDotProductAttention 6 | import hparams as hp 7 | 8 | 9 | class MultiHeadAttention(nn.Module): 10 | ''' Multi-Head Attention module ''' 11 | 12 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 13 | super().__init__() 14 | 15 | self.n_head = n_head 16 | self.d_k = d_k 17 | self.d_v = d_v 18 | 19 | self.w_qs = nn.Linear(d_model, n_head * d_k) 20 | self.w_ks = nn.Linear(d_model, n_head * d_k) 21 | self.w_vs = nn.Linear(d_model, n_head * d_v) 22 | nn.init.normal_(self.w_qs.weight, mean=0, 23 | std=np.sqrt(2.0 / (d_model + d_k))) 24 | nn.init.normal_(self.w_ks.weight, mean=0, 25 | std=np.sqrt(2.0 / (d_model + d_k))) 26 | nn.init.normal_(self.w_vs.weight, mean=0, 27 | std=np.sqrt(2.0 / (d_model + d_v))) 28 | 29 | self.attention = ScaledDotProductAttention( 30 | temperature=np.power(d_k, 0.5)) 31 | self.layer_norm = nn.LayerNorm(d_model) 32 | 33 | self.fc = nn.Linear(n_head * d_v, d_model) 34 | nn.init.xavier_normal_(self.fc.weight) 35 | 36 | self.dropout = nn.Dropout(dropout) 37 | 38 | def forward(self, q, k, v, 
mask=None): 39 | 40 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 41 | 42 | sz_b, len_q, _ = q.size() 43 | sz_b, len_k, _ = k.size() 44 | sz_b, len_v, _ = v.size() 45 | 46 | residual = q 47 | 48 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 49 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 50 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 51 | 52 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, 53 | len_q, d_k) # (n*b) x lq x dk 54 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, 55 | len_k, d_k) # (n*b) x lk x dk 56 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, 57 | len_v, d_v) # (n*b) x lv x dv 58 | 59 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 60 | output, attn = self.attention(q, k, v, mask=mask) 61 | 62 | output = output.view(n_head, sz_b, len_q, d_v) 63 | output = output.permute(1, 2, 0, 3).contiguous().view( 64 | sz_b, len_q, -1) # b x lq x (n*dv) 65 | 66 | output = self.dropout(self.fc(output)) 67 | output = self.layer_norm(output + residual) 68 | 69 | return output, attn 70 | 71 | 72 | class PositionwiseFeedForward(nn.Module): 73 | ''' A two-feed-forward-layer module ''' 74 | 75 | def __init__(self, d_in, d_hid, dropout=0.1): 76 | super().__init__() 77 | 78 | # Use Conv1D 79 | # position-wise 80 | self.w_1 = nn.Conv1d( 81 | d_in, d_hid, kernel_size=hp.fft_conv1d_kernel[0], padding=hp.fft_conv1d_padding[0]) 82 | # position-wise 83 | self.w_2 = nn.Conv1d( 84 | d_hid, d_in, kernel_size=hp.fft_conv1d_kernel[1], padding=hp.fft_conv1d_padding[1]) 85 | 86 | self.layer_norm = nn.LayerNorm(d_in) 87 | self.dropout = nn.Dropout(dropout) 88 | 89 | def forward(self, x): 90 | residual = x 91 | output = x.transpose(1, 2) 92 | output = self.w_2(F.relu(self.w_1(output))) 93 | output = output.transpose(1, 2) 94 | output = self.dropout(output) 95 | output = self.layer_norm(output + residual) 96 | 97 | return output 98 | -------------------------------------------------------------------------------- /transformer/__init__.py: -------------------------------------------------------------------------------- 1 | import transformer.Constants 2 | import transformer.Modules 3 | import transformer.Layers 4 | import transformer.SubLayers 5 | import transformer.Models 6 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | 7 | import hparams 8 | 9 | 10 | def process_text(train_text_path): 11 | with open(train_text_path, "r", encoding="utf-8") as f: 12 | txt = [] 13 | for line in f.readlines(): 14 | txt.append(line) 15 | 16 | return txt 17 | 18 | 19 | def get_param_num(model): 20 | num_param = sum(param.numel() for param in model.parameters()) 21 | return num_param 22 | 23 | 24 | def get_mask_from_lengths(lengths, max_len=None): 25 | if max_len == None: 26 | max_len = torch.max(lengths).item() 27 | 28 | ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)) 29 | mask = (ids < lengths.unsqueeze(1)).bool() 30 | 31 | return mask 32 | 33 | 34 | def get_WaveGlow(): 35 | waveglow_path = os.path.join("waveglow", "pretrained_model") 36 | waveglow_path = os.path.join(waveglow_path, "waveglow_256channels.pt") 37 | wave_glow = torch.load(waveglow_path)['model'] 38 | wave_glow = wave_glow.remove_weightnorm(wave_glow) 39 | wave_glow.cuda().eval() 40 | for m in wave_glow.modules(): 41 | if 'Conv' in str(type(m)): 42 | setattr(m, 
'padding_mode', 'zeros') 43 | 44 | return wave_glow 45 | 46 | 47 | def pad_1D(inputs, PAD=0): 48 | 49 | def pad_data(x, length, PAD): 50 | x_padded = np.pad(x, (0, length - x.shape[0]), 51 | mode='constant', 52 | constant_values=PAD) 53 | return x_padded 54 | 55 | max_len = max((len(x) for x in inputs)) 56 | padded = np.stack([pad_data(x, max_len, PAD) for x in inputs]) 57 | 58 | return padded 59 | 60 | 61 | def pad_1D_tensor(inputs, PAD=0): 62 | 63 | def pad_data(x, length, PAD): 64 | x_padded = F.pad(x, (0, length - x.shape[0])) 65 | return x_padded 66 | 67 | max_len = max((len(x) for x in inputs)) 68 | padded = torch.stack([pad_data(x, max_len, PAD) for x in inputs]) 69 | 70 | return padded 71 | 72 | 73 | def pad_2D(inputs, maxlen=None): 74 | 75 | def pad(x, max_len): 76 | PAD = 0 77 | if np.shape(x)[0] > max_len: 78 | raise ValueError("not max_len") 79 | 80 | s = np.shape(x)[1] 81 | x_padded = np.pad(x, (0, max_len - np.shape(x)[0]), 82 | mode='constant', 83 | constant_values=PAD) 84 | return x_padded[:, :s] 85 | 86 | if maxlen: 87 | output = np.stack([pad(x, maxlen) for x in inputs]) 88 | else: 89 | max_len = max(np.shape(x)[0] for x in inputs) 90 | output = np.stack([pad(x, max_len) for x in inputs]) 91 | 92 | return output 93 | 94 | 95 | def pad_2D_tensor(inputs, maxlen=None): 96 | 97 | def pad(x, max_len): 98 | if x.size(0) > max_len: 99 | raise ValueError("not max_len") 100 | 101 | s = x.size(1) 102 | x_padded = F.pad(x, (0, 0, 0, max_len-x.size(0))) 103 | return x_padded[:, :s] 104 | 105 | if maxlen: 106 | output = torch.stack([pad(x, maxlen) for x in inputs]) 107 | else: 108 | max_len = max(x.size(0) for x in inputs) 109 | output = torch.stack([pad(x, max_len) for x in inputs]) 110 | 111 | return output 112 | 113 | 114 | def pad(input_ele, mel_max_length=None): 115 | if mel_max_length: 116 | out_list = list() 117 | max_len = mel_max_length 118 | for i, batch in enumerate(input_ele): 119 | one_batch_padded = F.pad( 120 | batch, (0, 0, 0, max_len-batch.size(0)), "constant", 0.0) 121 | out_list.append(one_batch_padded) 122 | out_padded = torch.stack(out_list) 123 | return out_padded 124 | else: 125 | out_list = list() 126 | max_len = max([input_ele[i].size(0)for i in range(len(input_ele))]) 127 | 128 | for i, batch in enumerate(input_ele): 129 | one_batch_padded = F.pad( 130 | batch, (0, 0, 0, max_len-batch.size(0)), "constant", 0.0) 131 | out_list.append(one_batch_padded) 132 | out_padded = torch.stack(out_list) 133 | return out_padded 134 | -------------------------------------------------------------------------------- /waveglow/__init__.py: -------------------------------------------------------------------------------- 1 | import waveglow.inference 2 | import waveglow.mel2samp 3 | -------------------------------------------------------------------------------- /waveglow/convert_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | import torch 4 | 5 | def _check_model_old_version(model): 6 | if hasattr(model.WN[0], 'res_layers'): 7 | return True 8 | else: 9 | return False 10 | 11 | def update_model(old_model): 12 | if not _check_model_old_version(old_model): 13 | return old_model 14 | new_model = copy.deepcopy(old_model) 15 | for idx in range(0, len(new_model.WN)): 16 | wavenet = new_model.WN[idx] 17 | wavenet.res_skip_layers = torch.nn.ModuleList() 18 | n_channels = wavenet.n_channels 19 | n_layers = wavenet.n_layers 20 | for i in range(0, n_layers): 21 | if i < n_layers - 1: 22 | res_skip_channels = 
2*n_channels 23 | else: 24 | res_skip_channels = n_channels 25 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 26 | skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i]) 27 | if i < n_layers - 1: 28 | res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i]) 29 | res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight])) 30 | res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias])) 31 | else: 32 | res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight) 33 | res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias) 34 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 35 | wavenet.res_skip_layers.append(res_skip_layer) 36 | del wavenet.res_layers 37 | del wavenet.skip_layers 38 | return new_model 39 | 40 | if __name__ == '__main__': 41 | old_model_path = sys.argv[1] 42 | new_model_path = sys.argv[2] 43 | model = torch.load(old_model_path) 44 | model['model'] = update_model(model['model']) 45 | torch.save(model, new_model_path) 46 | 47 | -------------------------------------------------------------------------------- /waveglow/inference.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | import os 28 | from scipy.io.wavfile import write 29 | import torch 30 | from waveglow.mel2samp import files_to_list, MAX_WAV_VALUE 31 | # from denoiser import Denoiser 32 | 33 | 34 | def inference(mel, waveglow, audio_path, sigma=1.0, sampling_rate=22050): 35 | with torch.no_grad(): 36 | audio = waveglow.infer(mel, sigma=sigma) 37 | audio = audio * MAX_WAV_VALUE 38 | audio = audio.squeeze() 39 | audio = audio.cpu().numpy() 40 | audio = audio.astype('int16') 41 | write(audio_path, sampling_rate, audio) 42 | 43 | 44 | def test_speed(mel, waveglow, sigma=1.0, sampling_rate=22050): 45 | with torch.no_grad(): 46 | audio = waveglow.infer(mel, sigma=sigma) 47 | audio = audio * MAX_WAV_VALUE 48 | 49 | 50 | def get_wav(mel, waveglow, sigma=1.0, sampling_rate=22050): 51 | with torch.no_grad(): 52 | audio = waveglow.infer(mel, sigma=sigma) 53 | audio = audio * MAX_WAV_VALUE 54 | audio = audio.squeeze() 55 | audio = audio.cpu() 56 | 57 | return audio 58 | -------------------------------------------------------------------------------- /waveglow/mel2samp.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # *****************************************************************************\ 27 | # from tacotron2.layers import TacotronSTFT 28 | import os 29 | import random 30 | import argparse 31 | import json 32 | import torch 33 | import torch.utils.data 34 | import sys 35 | from scipy.io.wavfile import read 36 | 37 | # We're using the audio processing from TacoTron2 to make sure it matches 38 | sys.path.insert(0, 'tacotron2') 39 | 40 | MAX_WAV_VALUE = 32768.0 41 | 42 | 43 | def files_to_list(filename): 44 | """ 45 | Takes a text file of filenames and makes a list of filenames 46 | """ 47 | with open(filename, encoding='utf-8') as f: 48 | files = f.readlines() 49 | 50 | files = [f.rstrip() for f in files] 51 | return files 52 | 53 | 54 | # def load_wav_to_torch(full_path): 55 | # """ 56 | # Loads wavdata into torch array 57 | # """ 58 | # sampling_rate, data = read(full_path) 59 | # return torch.from_numpy(data).float(), sampling_rate 60 | 61 | 62 | # class Mel2Samp(torch.utils.data.Dataset): 63 | # """ 64 | # This is the main class that calculates the spectrogram and returns the 65 | # spectrogram, audio pair. 66 | # """ 67 | 68 | # def __init__(self, training_files, segment_length, filter_length, 69 | # hop_length, win_length, sampling_rate, mel_fmin, mel_fmax): 70 | # self.audio_files = files_to_list(training_files) 71 | # random.seed(1234) 72 | # random.shuffle(self.audio_files) 73 | # self.stft = TacotronSTFT(filter_length=filter_length, 74 | # hop_length=hop_length, 75 | # win_length=win_length, 76 | # sampling_rate=sampling_rate, 77 | # mel_fmin=mel_fmin, mel_fmax=mel_fmax) 78 | # self.segment_length = segment_length 79 | # self.sampling_rate = sampling_rate 80 | 81 | # def get_mel(self, audio): 82 | # audio_norm = audio / MAX_WAV_VALUE 83 | # audio_norm = audio_norm.unsqueeze(0) 84 | # audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 85 | # melspec = self.stft.mel_spectrogram(audio_norm) 86 | # melspec = torch.squeeze(melspec, 0) 87 | # return melspec 88 | 89 | # def __getitem__(self, index): 90 | # # Read audio 91 | # filename = self.audio_files[index] 92 | # audio, sampling_rate = load_wav_to_torch(filename) 93 | # if sampling_rate != self.sampling_rate: 94 | # raise ValueError("{} SR doesn't match target {} SR".format( 95 | # sampling_rate, self.sampling_rate)) 96 | 97 | # # Take segment 98 | # if audio.size(0) >= self.segment_length: 99 | # max_audio_start = audio.size(0) - self.segment_length 100 | # audio_start = random.randint(0, max_audio_start) 101 | # audio = audio[audio_start:audio_start+self.segment_length] 102 | # else: 103 | # audio = torch.nn.functional.pad( 104 | # audio, (0, self.segment_length - audio.size(0)), 'constant').data 105 | 106 | # mel = self.get_mel(audio) 107 | # audio = audio / MAX_WAV_VALUE 108 | 109 | # return (mel, audio) 110 | 111 | # def __len__(self): 112 | # return len(self.audio_files) 113 | 114 | 115 | # # =================================================================== 116 | # # Takes directory of clean audio and makes directory of spectrograms 117 | # # Useful for making test sets 118 | # # =================================================================== 119 | # if __name__ == "__main__": 120 | # # Get defaults so it can work with no Sacred 121 | # parser = argparse.ArgumentParser() 122 | # parser.add_argument('-f', "--filelist_path", required=True) 123 | # parser.add_argument('-c', '--config', type=str, 124 | # help='JSON file for configuration') 125 | # parser.add_argument('-o', '--output_dir', type=str, 
126 | # help='Output directory') 127 | # args = parser.parse_args() 128 | 129 | # with open(args.config) as f: 130 | # data = f.read() 131 | # data_config = json.loads(data)["data_config"] 132 | # mel2samp = Mel2Samp(**data_config) 133 | 134 | # filepaths = files_to_list(args.filelist_path) 135 | 136 | # # Make directory if it doesn't exist 137 | # if not os.path.isdir(args.output_dir): 138 | # os.makedirs(args.output_dir) 139 | # os.chmod(args.output_dir, 0o775) 140 | 141 | # for filepath in filepaths: 142 | # audio, sr = load_wav_to_torch(filepath) 143 | # melspectrogram = mel2samp.get_mel(audio) 144 | # filename = os.path.basename(filepath) 145 | # new_filepath = args.output_dir + '/' + filename + '.pt' 146 | # print(new_filepath) 147 | # torch.save(melspectrogram, new_filepath) 148 | --------------------------------------------------------------------------------
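Note: the text front end in text/__init__.py can be exercised on its own to see how raw text and curly-brace ARPAbet are turned into symbol IDs. A small usage sketch under stated assumptions (the repository root is on PYTHONPATH, the unidecode and inflect packages from requirements.txt are installed, and "english_cleaners" is the cleaner the module's own docstring recommends for English text):

from text import text_to_sequence, sequence_to_text

# Plain text is cleaned (lowercased, numbers and abbreviations expanded), while the
# braced span is treated as ARPAbet phonemes and mapped to "@"-prefixed symbols.
seq = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.", ["english_cleaners"])
print(seq)                    # list of integer symbol IDs
print(sequence_to_text(seq))  # "turn left on {HH AW1 S S T AH0 N} street."

The round trip keeps the phoneme span in braces because sequence_to_text re-wraps any "@"-prefixed symbol, which is how eval.py-style inputs with mixed graphemes and phonemes are expected to look.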