├── demo.wav
├── tensorboard.png
├── waveglow
│   ├── .gitmodules
│   ├── waveglow_logo.png
│   ├── requirements.txt
│   ├── config.json
│   ├── LICENSE
│   ├── denoiser.py
│   ├── convert_model.py
│   ├── README.md
│   ├── inference.py
│   ├── mel2samp.py
│   ├── distributed.py
│   ├── train.py
│   ├── glow_old.py
│   └── glow.py
├── .gitmodules
├── requirements.txt
├── Dockerfile
├── multiproc.py
├── loss_function.py
├── text
│   ├── symbols.py
│   ├── LICENSE
│   ├── cmudict.py
│   ├── numbers.py
│   ├── __init__.py
│   └── cleaners.py
├── utils.py
├── LICENSE
├── README.md
├── plotting_utils.py
├── logger.py
├── train.ipynb
├── .gitattributes
├── hparams.py
├── audio_processing.py
├── layers.py
├── loss_scaler.py
├── data_utils.py
├── stft.py
├── distributed.py
├── .gitignore
├── train.py
├── filelists
│   └── transcript_val.txt
└── model.py

/demo.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CjangCjengh/tacotron2-japanese/HEAD/demo.wav
--------------------------------------------------------------------------------
/tensorboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CjangCjengh/tacotron2-japanese/HEAD/tensorboard.png
--------------------------------------------------------------------------------
/waveglow/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tacotron2"]
2 | path = tacotron2
3 | url = http://github.com/NVIDIA/tacotron2
4 | 
--------------------------------------------------------------------------------
/waveglow/waveglow_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CjangCjengh/tacotron2-japanese/HEAD/waveglow/waveglow_logo.png
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "waveglow"]
2 | path = waveglow
3 | url = https://github.com/NVIDIA/waveglow
4 | branch = master
5 | 
--------------------------------------------------------------------------------
/waveglow/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.0
2 | matplotlib==2.1.0
3 | tensorflow
4 | numpy==1.13.3
5 | inflect==0.2.5
6 | librosa==0.6.0
7 | scipy==1.0.0
8 | tensorboardX==1.1
9 | Unidecode==1.0.22
10 | pillow
11 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pillow
2 | matplotlib
3 | numpy==1.22.4
4 | inflect
5 | librosa
6 | denoiser
7 | pysoundfile
8 | scipy
9 | Unidecode
10 | openjtalk>=0.3.0.dev2
11 | janome
12 | torch
13 | tensorboardX
14 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
2 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
3 | 
4 | RUN apt-get update -y
5 | 
6 | RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 pillow jupyter
7 | 
8 | ADD apex /apex/
9 | WORKDIR /apex/
10 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
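# The two RUN steps above install the Python dependencies and then build NVIDIA Apex from
# the copied /apex sources with its C++/CUDA extensions (--cpp_ext / --cuda_ext); Apex
# provides optional mixed-precision and distributed-training utilities (cf. the fp16_run
# and distributed_run options in hparams.py).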
11 | -------------------------------------------------------------------------------- /multiproc.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import sys 4 | import subprocess 5 | 6 | argslist = list(sys.argv)[1:] 7 | num_gpus = torch.cuda.device_count() 8 | argslist.append('--n_gpus={}'.format(num_gpus)) 9 | workers = [] 10 | job_id = time.strftime("%Y_%m_%d-%H%M%S") 11 | argslist.append("--group_name=group_{}".format(job_id)) 12 | 13 | for i in range(num_gpus): 14 | argslist.append('--rank={}'.format(i)) 15 | stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i), 16 | "w") 17 | print(argslist) 18 | p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) 19 | workers.append(p) 20 | argslist = argslist[:-1] 21 | 22 | for p in workers: 23 | p.wait() 24 | -------------------------------------------------------------------------------- /loss_function.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class Tacotron2Loss(nn.Module): 5 | def __init__(self): 6 | super(Tacotron2Loss, self).__init__() 7 | 8 | def forward(self, model_output, targets): 9 | mel_target, gate_target = targets[0], targets[1] 10 | mel_target.requires_grad = False 11 | gate_target.requires_grad = False 12 | gate_target = gate_target.view(-1, 1) 13 | 14 | mel_out, mel_out_postnet, gate_out, _ = model_output 15 | gate_out = gate_out.view(-1, 1) 16 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \ 17 | nn.MSELoss()(mel_out_postnet, mel_target) 18 | gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target) 19 | return mel_loss + gate_loss 20 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | from text import cmudict 8 | 9 | _pad = '_' 10 | _punctuation = '!\'(),.:;? ' 11 | _special = '-' 12 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet 19 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /waveglow/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_config": { 3 | "fp16_run": true, 4 | "output_directory": "checkpoints", 5 | "epochs": 100000, 6 | "learning_rate": 1e-4, 7 | "sigma": 1.0, 8 | "iters_per_checkpoint": 2000, 9 | "batch_size": 12, 10 | "seed": 1234, 11 | "checkpoint_path": "", 12 | "with_tensorboard": false 13 | }, 14 | "data_config": { 15 | "training_files": "train_files.txt", 16 | "segment_length": 16000, 17 | "sampling_rate": 22050, 18 | "filter_length": 1024, 19 | "hop_length": 256, 20 | "win_length": 1024, 21 | "mel_fmin": 0.0, 22 | "mel_fmax": 8000.0 23 | }, 24 | "dist_config": { 25 | "dist_backend": "nccl", 26 | "dist_url": "tcp://localhost:54321" 27 | }, 28 | 29 | "waveglow_config": { 30 | "n_mel_channels": 80, 31 | "n_flows": 12, 32 | "n_group": 8, 33 | "n_early_every": 4, 34 | "n_early_size": 2, 35 | "WN_config": { 36 | "n_layers": 8, 37 | "n_channels": 256, 38 | "kernel_size": 3 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.io.wavfile import read 3 | import torch 4 | 5 | 6 | from hparams import create_hparams 7 | #hparam = create_hparams() 8 | #hparam.cuda_enabled = False 9 | 10 | def get_mask_from_lengths(lengths): 11 | max_len = torch.max(lengths).item() 12 | 13 | #if hparam.cuda_enabled : 14 | if create_hparams.cuda_enabled : 15 | ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)) 16 | mask = (ids < lengths.unsqueeze(1)).bool() 17 | else : 18 | ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)) 19 | mask = (ids < lengths.unsqueeze(1)).bool() 20 | 21 | return mask 22 | 23 | 24 | 25 | def load_wav_to_torch(full_path): 26 | sampling_rate, data = read(full_path) 27 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 28 | 29 | 30 | def load_filepaths_and_text(filename, split="|"): 31 | with open(filename, encoding='utf-8') as f: 32 | filepaths_and_text = [line.strip().split(split) for line in f] 33 | return filepaths_and_text 34 | 35 | 36 | def to_gpu(x): 37 | x = x.contiguous() 38 | 39 | if torch.cuda.is_available(): 40 | x = x.cuda(non_blocking=True) 41 | return torch.autograd.Variable(x) 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /waveglow/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tacotron2-Japanese
2 | - Tacotron2 implementation for Japanese text-to-speech
3 | ## Links
4 | * Reference: [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2)
5 | * [Pre-trained Tacotron2 models](https://github.com/CjangCjengh/TTSModels)
6 | * [Latest changes can be viewed in this repository](https://github.com/StarxSky/tacotron2-JP)
7 | 
8 | ## How to use
9 | 1. Put raw Japanese transcripts in ./filelists (the defaults are filelists/transcript_train.txt and filelists/transcript_val.txt, see ./hparams.py)
10 | 2. Put WAV files in ./wav
11 | 3. (Optional) Download NVIDIA's [pretrained model](https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing)
12 | 4. Open ./train.ipynb to install requirements and start training
13 | 5. Download NVIDIA's [WaveGlow model](https://drive.google.com/open?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF)
14 | 6. Open ./inference.ipynb to synthesize speech
15 | 
16 | ## Cleaners
17 | Set `text_cleaners` in ./hparams.py (Data Parameters section) to one of the following:
18 | ### 1. 'japanese_cleaners'
19 | #### Before
20 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
21 | #### After
22 | nanikaacltaraitsudemohanashItekudasai.gakuiNnokotojanaku,shijinikaNsurukotodemonanidemo.
23 | ### 2. 'japanese_tokenization_cleaners'
24 | #### Before
25 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
26 | #### After
27 | nani ka acl tara itsu demo hanashi te kudasai. gakuiN no koto ja naku, shiji nikaNsuru koto de mo naNdemo.
28 | ### 3. 'japanese_accent_cleaners'
29 | #### Before
30 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
31 | #### After
32 | :na)nika a)cltara i)tsudemo ha(na)shIte ku(dasa)i.:ga(kuiNno ko(to)janaku,:shi)jini ka(Nsu)ru ko(to)demo na)nidemo.
33 | ### 4. 'japanese_phrase_cleaners'
34 | #### Before
35 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
36 | #### After
37 | nanika acltara itsudemo hanashIte kudasai. gakuiNno kotojanaku, shijini kaNsuru kotodemo nanidemo.
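### Applying a cleaner in code
For reference, a minimal sketch of how a cleaner is applied through `text_to_sequence` (assuming the repository root is on `PYTHONPATH`, the selected cleaner is implemented in ./text/cleaners.py, and the packages from requirements.txt are installed):

```python
# Minimal sketch: turn a Japanese sentence into the integer symbol IDs the model consumes.
from text import text_to_sequence, sequence_to_text

text = '何かあったらいつでも話して下さい。'
ids = text_to_sequence(text, ['japanese_cleaners'])   # cleaner name as set in hparams.py
print(ids)                    # list of IDs indexing the symbol table in text/symbols.py
print(sequence_to_text(ids))  # romanized string produced by the cleaner (cf. the examples above)
```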
38 | -------------------------------------------------------------------------------- /waveglow/denoiser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('tacotron2') 3 | import torch 4 | from layers import STFT 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """ Removes model bias from audio produced with waveglow """ 9 | 10 | def __init__(self, waveglow, filter_length=1024, n_overlap=4, 11 | win_length=1024, mode='zeros'): 12 | super(Denoiser, self).__init__() 13 | self.stft = STFT(filter_length=filter_length, 14 | hop_length=int(filter_length/n_overlap), 15 | win_length=win_length).cuda() 16 | if mode == 'zeros': 17 | mel_input = torch.zeros( 18 | (1, 80, 88), 19 | dtype=waveglow.upsample.weight.dtype, 20 | device=waveglow.upsample.weight.device) 21 | elif mode == 'normal': 22 | mel_input = torch.randn( 23 | (1, 80, 88), 24 | dtype=waveglow.upsample.weight.dtype, 25 | device=waveglow.upsample.weight.device) 26 | else: 27 | raise Exception("Mode {} if not supported".format(mode)) 28 | 29 | with torch.no_grad(): 30 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float() 31 | bias_spec, _ = self.stft.transform(bias_audio) 32 | 33 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 34 | 35 | def forward(self, audio, strength=0.1): 36 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) 37 | audio_spec_denoised = audio_spec - self.bias_spec * strength 38 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 39 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) 40 | return audio_denoised 41 | -------------------------------------------------------------------------------- /plotting_utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use("Agg") 3 | import matplotlib.pylab as plt 4 | import numpy as np 5 | 6 | 7 | def save_figure_to_numpy(fig): 8 | # save it to a numpy array. 
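    # Note: the plot_* helpers below call fig.canvas.draw() before invoking this function,
    # so the canvas already holds the rendered figure; its RGB bytes are copied into an
    # array and reshaped to (height, width, 3), matching the
    # add_image(..., dataformats='HWC') calls in logger.py.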
9 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 10 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 11 | return data 12 | 13 | 14 | def plot_alignment_to_numpy(alignment, info=None): 15 | fig, ax = plt.subplots(figsize=(6, 4)) 16 | im = ax.imshow(alignment, aspect='auto', origin='lower', 17 | interpolation='none') 18 | fig.colorbar(im, ax=ax) 19 | xlabel = 'Decoder timestep' 20 | if info is not None: 21 | xlabel += '\n\n' + info 22 | plt.xlabel(xlabel) 23 | plt.ylabel('Encoder timestep') 24 | plt.tight_layout() 25 | 26 | fig.canvas.draw() 27 | data = save_figure_to_numpy(fig) 28 | plt.close() 29 | return data 30 | 31 | 32 | def plot_spectrogram_to_numpy(spectrogram): 33 | fig, ax = plt.subplots(figsize=(12, 3)) 34 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 35 | interpolation='none') 36 | plt.colorbar(im, ax=ax) 37 | plt.xlabel("Frames") 38 | plt.ylabel("Channels") 39 | plt.tight_layout() 40 | 41 | fig.canvas.draw() 42 | data = save_figure_to_numpy(fig) 43 | plt.close() 44 | return data 45 | 46 | 47 | def plot_gate_outputs_to_numpy(gate_targets, gate_outputs): 48 | fig, ax = plt.subplots(figsize=(12, 3)) 49 | ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5, 50 | color='green', marker='+', s=1, label='target') 51 | ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5, 52 | color='red', marker='.', s=1, label='predicted') 53 | 54 | plt.xlabel("Frames (Green target, Red predicted)") 55 | plt.ylabel("Gate State") 56 | plt.tight_layout() 57 | 58 | fig.canvas.draw() 59 | data = save_figure_to_numpy(fig) 60 | plt.close() 61 | return data 62 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy 5 | from plotting_utils import plot_gate_outputs_to_numpy 6 | 7 | 8 | class Tacotron2Logger(SummaryWriter): 9 | def __init__(self, logdir): 10 | super(Tacotron2Logger, self).__init__(logdir) 11 | 12 | def log_training(self, reduced_loss, grad_norm, learning_rate, duration, 13 | iteration): 14 | self.add_scalar("training.loss", reduced_loss, iteration) 15 | self.add_scalar("grad.norm", grad_norm, iteration) 16 | self.add_scalar("learning.rate", learning_rate, iteration) 17 | self.add_scalar("duration", duration, iteration) 18 | 19 | def log_validation(self, reduced_loss, model, y, y_pred, iteration): 20 | self.add_scalar("validation.loss", reduced_loss, iteration) 21 | _, mel_outputs, gate_outputs, alignments = y_pred 22 | mel_targets, gate_targets = y 23 | 24 | # plot distribution of parameters 25 | for tag, value in model.named_parameters(): 26 | tag = tag.replace('.', '/') 27 | self.add_histogram(tag, value.data.cpu().numpy(), iteration) 28 | 29 | # plot alignment, mel target and predicted, gate target and predicted 30 | idx = random.randint(0, alignments.size(0) - 1) 31 | self.add_image( 32 | "alignment", 33 | plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T), 34 | iteration, dataformats='HWC') 35 | self.add_image( 36 | "mel_target", 37 | plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()), 38 | iteration, dataformats='HWC') 39 | self.add_image( 40 | "mel_predicted", 41 | plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()), 42 | iteration, dataformats='HWC') 43 | self.add_image( 44 | "gate", 45 | 
plot_gate_outputs_to_numpy( 46 | gate_targets[idx].data.cpu().numpy(), 47 | torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()), 48 | iteration, dataformats='HWC') 49 | -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | def __init__(self, file_or_path, keep_ambiguous=True): 22 | if isinstance(file_or_path, str): 23 | with open(file_or_path, encoding='latin-1') as f: 24 | entries = _parse_cmudict(f) 25 | else: 26 | entries = _parse_cmudict(file_or_path) 27 | if not keep_ambiguous: 28 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 29 | self._entries = entries 30 | 31 | 32 | def __len__(self): 33 | return len(self._entries) 34 | 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | 42 | _alt_re = re.compile(r'\([0-9]+\)') 43 | 44 | 45 | def _parse_cmudict(file): 46 | cmudict = {} 47 | for line in file: 48 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 49 | parts = line.split(' ') 50 | word = re.sub(_alt_re, '', parts[0]) 51 | pronunciation = _get_pronunciation(parts[1]) 52 | if pronunciation: 53 | if word in cmudict: 54 | cmudict[word].append(pronunciation) 55 | else: 56 | cmudict[word] = [pronunciation] 57 | return cmudict 58 | 59 | 60 | def _get_pronunciation(s): 61 | parts = s.strip().split(' ') 62 | for part in parts: 63 | if part not in _valid_symbol_set: 64 | return None 65 | return ' '.join(parts) 66 | -------------------------------------------------------------------------------- /train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "28d9a0e5", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Install requirements" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "f5ba8906-5257-4293-960c-853b8b3c6dff", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install librosa==0.8.0\n", 19 | "!pip install pysoundfile==0.9.0.post1\n", 20 | "!pip install unidecode==1.3.4\n", 21 | "!pip install pyopenjtalk==0.2.0\n", 22 | "!pip install inflect==5.6.2\n", 23 | "!pip install janome==0.4.2" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "75498eeb", 29 | "metadata": {}, 30 | "source": [ 31 | "#### Train without pretrained model" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "d5f5590e-1817-4665-9bdf-1ff06c0f7f96", 38 | "metadata": {}, 39 | "outputs": [], 40 | 
"source": [ 41 | "!python train.py --output_directory=outdir --log_directory=logdir" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "e673209c", 47 | "metadata": {}, 48 | "source": [ 49 | "#### Train with a pretrained model" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "d5f5590e-1817-4665-9bdf-1ff06c0f7f96", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "!python train.py --output_directory=outdir --log_directory=logdir -c tacotron2_statedict.pt --warm_start" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3.10.2 64-bit", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.10.2" 80 | }, 81 | "vscode": { 82 | "interpreter": { 83 | "hash": "d99a3f7b344b3c3107482760db15f42178bfad658d282ab0a919b76809e13cb5" 84 | } 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 5 89 | } 90 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 
68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | from text import cleaners 4 | from text.symbols import symbols 5 | 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | # Regular expression matching text enclosed in curly braces: 12 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 13 | 14 | 15 | def text_to_sequence(text, cleaner_names): 16 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 17 | 18 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 19 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 20 | 21 | Args: 22 | text: string to convert to a sequence 23 | cleaner_names: names of the cleaner functions to run the text through 24 | 25 | Returns: 26 | List of integers corresponding to the symbols in the text 27 | ''' 28 | sequence = [] 29 | 30 | # Check for curly braces and treat their contents as ARPAbet: 31 | while len(text): 32 | m = _curly_re.match(text) 33 | if not m: 34 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 35 | break 36 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 37 | sequence += _arpabet_to_sequence(m.group(2)) 38 | text = m.group(3) 39 | 40 | return sequence 41 | 42 | 43 | def sequence_to_text(sequence): 44 | '''Converts a sequence of IDs back to a string''' 45 | result = '' 46 | for symbol_id in sequence: 47 | if symbol_id in _id_to_symbol: 48 | s = _id_to_symbol[symbol_id] 49 | # Enclose ARPAbet back in curly braces: 50 | if len(s) > 1 and s[0] == '@': 51 | s = '{%s}' % s[1:] 52 | result += s 53 | return result.replace('}{', ' ') 54 | 55 | 56 | def _clean_text(text, cleaner_names): 57 | for name in cleaner_names: 58 | cleaner = getattr(cleaners, name) 59 | if not cleaner: 60 | raise Exception('Unknown cleaner: %s' % name) 61 | text = cleaner(text) 62 | return text 63 | 64 | 65 | def _symbols_to_sequence(symbols): 66 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 67 | 68 | 69 | def _arpabet_to_sequence(text): 70 | return _symbols_to_sequence(['@' + s for s in text.split()]) 71 | 72 | 73 | def _should_keep_symbol(s): 74 | return s in _symbol_to_id and s is not '_' and s is not '~' 75 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from text import symbols 3 | 4 | 5 | class create_hparams(): 6 | """Create model hyperparameters. 
Parse nondefault from given string.""" 7 | ################################ 8 | # CUDA Enable # 9 | ################################ 10 | if torch.cuda.is_available() : 11 | cuda_enabled = True 12 | else : 13 | cuda_enabled = False 14 | 15 | ################################ 16 | # Experiment Parameters # 17 | ################################ 18 | epochs = 100 19 | iters_per_checkpoint = 500 20 | seed= 1234 21 | dynamic_loss_scaling = True 22 | fp16_run = False 23 | distributed_run = False 24 | dist_backend = "nccl" 25 | dist_url = "tcp://localhost:54321" 26 | cudnn_enabled = True 27 | cudnn_benchmark = False 28 | ignore_layers = ['embedding.weight'] 29 | 30 | ################################ 31 | # Data Parameters # 32 | ################################ 33 | load_mel_from_disk = False 34 | training_files = 'filelists/transcript_train.txt' 35 | validation_files = 'filelists/transcript_val.txt' 36 | text_cleaners = ['japanese_cleaners'] 37 | 38 | ################################ 39 | # Audio Parameters # 40 | ################################ 41 | max_wav_value = 32768.0 42 | sampling_rate = 22050 43 | filter_length = 1024 44 | hop_length = 256 45 | win_length = 1024 46 | n_mel_channels = 80 47 | mel_fmin = 0.0 48 | mel_fmax = 8000.0 49 | 50 | ################################ 51 | # Model Parameters # 52 | ################################ 53 | n_symbols = len(symbols) 54 | symbols_embedding_dim = 512 55 | 56 | # Encoder parameters 57 | encoder_kernel_size = 5 58 | encoder_n_convolutions = 3 59 | encoder_embedding_dim = 512 60 | 61 | # Decoder parameters 62 | n_frames_per_step = 1 # currently only 1 is supported 63 | decoder_rnn_dim = 1024 64 | prenet_dim = 256 65 | max_decoder_steps = 1000 66 | gate_threshold = 0.5 67 | p_attention_dropout = 0.1 68 | p_decoder_dropout = 0.1 69 | 70 | # Attention parameters 71 | attention_rnn_dim = 1024 72 | attention_dim = 128 73 | # Location Layer parameters 74 | attention_location_n_filters = 32 75 | attention_location_kernel_size = 31 76 | 77 | # Mel-post processing network parameters 78 | postnet_embedding_dim = 512 79 | postnet_kernel_size = 5 80 | postnet_n_convolutions = 5 81 | 82 | ################################ 83 | # Optimization Hyperparameters # 84 | ################################ 85 | use_saved_learning_rate = False 86 | learning_rate = 1e-3 87 | weight_decay = 1e-6 88 | grad_clip_thresh = 1.0 89 | batch_size = 64 90 | mask_padding = True # set model's padded outputs to padded values 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. 
By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C 94 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from librosa.filters import mel as librosa_mel_fn 3 | from audio_processing import dynamic_range_compression 4 | from audio_processing import dynamic_range_decompression 5 | from stft import STFT 6 | 7 | 8 | class LinearNorm(torch.nn.Module): 9 | def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): 10 | super(LinearNorm, self).__init__() 11 | self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) 12 | 13 | torch.nn.init.xavier_uniform_( 14 | self.linear_layer.weight, 15 | gain=torch.nn.init.calculate_gain(w_init_gain)) 16 | 17 | def forward(self, x): 18 | return self.linear_layer(x) 19 | 20 | 21 | class ConvNorm(torch.nn.Module): 22 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, 23 | padding=None, dilation=1, bias=True, w_init_gain='linear'): 24 | super(ConvNorm, self).__init__() 25 | if padding is None: 26 | assert(kernel_size % 2 == 1) 27 | padding = int(dilation * (kernel_size - 1) / 2) 28 | 29 | self.conv = torch.nn.Conv1d(in_channels, out_channels, 30 | kernel_size=kernel_size, stride=stride, 31 | padding=padding, dilation=dilation, 32 | bias=bias) 33 | 34 | torch.nn.init.xavier_uniform_( 35 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 36 | 37 | def forward(self, signal): 38 | conv_signal = self.conv(signal) 39 | return conv_signal 40 | 41 | 42 | class TacotronSTFT(torch.nn.Module): 43 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 44 | n_mel_channels=80, 
sampling_rate=22050, mel_fmin=0.0, 45 | mel_fmax=8000.0): 46 | super(TacotronSTFT, self).__init__() 47 | self.n_mel_channels = n_mel_channels 48 | self.sampling_rate = sampling_rate 49 | self.stft_fn = STFT(filter_length, hop_length, win_length) 50 | mel_basis = librosa_mel_fn( 51 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 52 | mel_basis = torch.from_numpy(mel_basis).float() 53 | self.register_buffer('mel_basis', mel_basis) 54 | 55 | def spectral_normalize(self, magnitudes): 56 | output = dynamic_range_compression(magnitudes) 57 | return output 58 | 59 | def spectral_de_normalize(self, magnitudes): 60 | output = dynamic_range_decompression(magnitudes) 61 | return output 62 | 63 | def mel_spectrogram(self, y): 64 | """Computes mel-spectrograms from a batch of waves 65 | PARAMS 66 | ------ 67 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 68 | 69 | RETURNS 70 | ------- 71 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 72 | """ 73 | assert(torch.min(y.data) >= -1) 74 | assert(torch.max(y.data) <= 1) 75 | 76 | magnitudes, phases = self.stft_fn.transform(y) 77 | magnitudes = magnitudes.data 78 | mel_output = torch.matmul(self.mel_basis, magnitudes) 79 | mel_output = self.spectral_normalize(mel_output) 80 | return mel_output 81 | -------------------------------------------------------------------------------- /waveglow/convert_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | import torch 4 | 5 | def _check_model_old_version(model): 6 | if hasattr(model.WN[0], 'res_layers') or hasattr(model.WN[0], 'cond_layers'): 7 | return True 8 | else: 9 | return False 10 | 11 | 12 | def _update_model_res_skip(old_model, new_model): 13 | for idx in range(0, len(new_model.WN)): 14 | wavenet = new_model.WN[idx] 15 | n_channels = wavenet.n_channels 16 | n_layers = wavenet.n_layers 17 | wavenet.res_skip_layers = torch.nn.ModuleList() 18 | for i in range(0, n_layers): 19 | if i < n_layers - 1: 20 | res_skip_channels = 2*n_channels 21 | else: 22 | res_skip_channels = n_channels 23 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 24 | skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i]) 25 | if i < n_layers - 1: 26 | res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i]) 27 | res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight])) 28 | res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias])) 29 | else: 30 | res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight) 31 | res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias) 32 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 33 | wavenet.res_skip_layers.append(res_skip_layer) 34 | del wavenet.res_layers 35 | del wavenet.skip_layers 36 | 37 | def _update_model_cond(old_model, new_model): 38 | for idx in range(0, len(new_model.WN)): 39 | wavenet = new_model.WN[idx] 40 | n_channels = wavenet.n_channels 41 | n_layers = wavenet.n_layers 42 | n_mel_channels = wavenet.cond_layers[0].weight.shape[1] 43 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1) 44 | cond_layer_weight = [] 45 | cond_layer_bias = [] 46 | for i in range(0, n_layers): 47 | _cond_layer = torch.nn.utils.remove_weight_norm(wavenet.cond_layers[i]) 48 | cond_layer_weight.append(_cond_layer.weight) 49 | cond_layer_bias.append(_cond_layer.bias) 50 | cond_layer.weight = 
torch.nn.Parameter(torch.cat(cond_layer_weight)) 51 | cond_layer.bias = torch.nn.Parameter(torch.cat(cond_layer_bias)) 52 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 53 | wavenet.cond_layer = cond_layer 54 | del wavenet.cond_layers 55 | 56 | def update_model(old_model): 57 | if not _check_model_old_version(old_model): 58 | return old_model 59 | new_model = copy.deepcopy(old_model) 60 | if hasattr(old_model.WN[0], 'res_layers'): 61 | _update_model_res_skip(old_model, new_model) 62 | if hasattr(old_model.WN[0], 'cond_layers'): 63 | _update_model_cond(old_model, new_model) 64 | return new_model 65 | 66 | if __name__ == '__main__': 67 | old_model_path = sys.argv[1] 68 | new_model_path = sys.argv[2] 69 | model = torch.load(old_model_path, map_location='cpu') 70 | model['model'] = update_model(model['model']) 71 | torch.save(model, new_model_path) 72 | 73 | -------------------------------------------------------------------------------- /waveglow/README.md: -------------------------------------------------------------------------------- 1 | ![WaveGlow](waveglow_logo.png "WaveGLow") 2 | 3 | ## WaveGlow: a Flow-based Generative Network for Speech Synthesis 4 | 5 | ### Ryan Prenger, Rafael Valle, and Bryan Catanzaro 6 | 7 | In our recent [paper], we propose WaveGlow: a flow-based network capable of 8 | generating high quality speech from mel-spectrograms. WaveGlow combines insights 9 | from [Glow] and [WaveNet] in order to provide fast, efficient and high-quality 10 | audio synthesis, without the need for auto-regression. WaveGlow is implemented 11 | using only a single network, trained using only a single cost function: 12 | maximizing the likelihood of the training data, which makes the training 13 | procedure simple and stable. 14 | 15 | Our [PyTorch] implementation produces audio samples at a rate of 1200 16 | kHz on an NVIDIA V100 GPU. Mean Opinion Scores show that it delivers audio 17 | quality as good as the best publicly available WaveNet implementation. 18 | 19 | Visit our [website] for audio samples. 20 | 21 | ## Setup 22 | 23 | 1. Clone our repo and initialize submodule 24 | 25 | ```command 26 | git clone https://github.com/NVIDIA/waveglow.git 27 | cd waveglow 28 | git submodule init 29 | git submodule update 30 | ``` 31 | 32 | 2. Install requirements `pip3 install -r requirements.txt` 33 | 34 | 3. Install [Apex] 35 | 36 | 37 | ## Generate audio with our pre-existing model 38 | 39 | 1. Download our [published model] 40 | 2. Download [mel-spectrograms] 41 | 3. Generate audio `python3 inference.py -f <(ls mel_spectrograms/*.pt) -w waveglow_256channels.pt -o . --is_fp16 -s 0.6` 42 | 43 | N.b. use `convert_model.py` to convert your older models to the current model 44 | with fused residual and skip connections. 45 | 46 | ## Train your own model 47 | 48 | 1. Download [LJ Speech Data]. In this example it's in `data/` 49 | 50 | 2. Make a list of the file names to use for training/testing 51 | 52 | ```command 53 | ls data/*.wav | tail -n+10 > train_files.txt 54 | ls data/*.wav | head -n10 > test_files.txt 55 | ``` 56 | 57 | 3. Train your WaveGlow networks 58 | 59 | ```command 60 | mkdir checkpoints 61 | python train.py -c config.json 62 | ``` 63 | 64 | For multi-GPU training replace `train.py` with `distributed.py`. Only tested with single node and NCCL. 65 | 66 | For mixed precision training set `"fp16_run": true` on `config.json`. 67 | 68 | 4. Make test set mel-spectrograms 69 | 70 | `python mel2samp.py -f test_files.txt -o . -c config.json` 71 | 72 | 5. 
Do inference with your network 73 | 74 | ```command 75 | ls *.pt > mel_files.txt 76 | python3 inference.py -f mel_files.txt -w checkpoints/waveglow_10000 -o . --is_fp16 -s 0.6 77 | ``` 78 | 79 | [//]: # (TODO) 80 | [//]: # (PROVIDE INSTRUCTIONS FOR DOWNLOADING LJS) 81 | [pytorch 1.0]: https://github.com/pytorch/pytorch#installation 82 | [website]: https://nv-adlr.github.io/WaveGlow 83 | [paper]: https://arxiv.org/abs/1811.00002 84 | [WaveNet implementation]: https://github.com/r9y9/wavenet_vocoder 85 | [Glow]: https://blog.openai.com/glow/ 86 | [WaveNet]: https://deepmind.com/blog/wavenet-generative-model-raw-audio/ 87 | [PyTorch]: http://pytorch.org 88 | [published model]: https://drive.google.com/open?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF 89 | [mel-spectrograms]: https://drive.google.com/file/d/1g_VXK2lpP9J25dQFhQwx7doWl_p20fXA/view?usp=sharing 90 | [LJ Speech Data]: https://keithito.com/LJ-Speech-Dataset 91 | [Apex]: https://github.com/nvidia/apex 92 | -------------------------------------------------------------------------------- /waveglow/inference.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | import os 28 | from scipy.io.wavfile import write 29 | import torch 30 | from mel2samp import files_to_list, MAX_WAV_VALUE 31 | from denoiser import Denoiser 32 | 33 | 34 | def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16, 35 | denoiser_strength): 36 | mel_files = files_to_list(mel_files) 37 | waveglow = torch.load(waveglow_path)['model'] 38 | waveglow = waveglow.remove_weightnorm(waveglow) 39 | waveglow.cuda().eval() 40 | if is_fp16: 41 | from apex import amp 42 | waveglow, _ = amp.initialize(waveglow, [], opt_level="O3") 43 | 44 | if denoiser_strength > 0: 45 | denoiser = Denoiser(waveglow).cuda() 46 | 47 | for i, file_path in enumerate(mel_files): 48 | file_name = os.path.splitext(os.path.basename(file_path))[0] 49 | mel = torch.load(file_path) 50 | mel = torch.autograd.Variable(mel.cuda()) 51 | mel = torch.unsqueeze(mel, 0) 52 | mel = mel.half() if is_fp16 else mel 53 | with torch.no_grad(): 54 | audio = waveglow.infer(mel, sigma=sigma) 55 | if denoiser_strength > 0: 56 | audio = denoiser(audio, denoiser_strength) 57 | audio = audio * MAX_WAV_VALUE 58 | audio = audio.squeeze() 59 | audio = audio.cpu().numpy() 60 | audio = audio.astype('int16') 61 | audio_path = os.path.join( 62 | output_dir, "{}_synthesis.wav".format(file_name)) 63 | write(audio_path, sampling_rate, audio) 64 | print(audio_path) 65 | 66 | 67 | if __name__ == "__main__": 68 | import argparse 69 | 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('-f', "--filelist_path", required=True) 72 | parser.add_argument('-w', '--waveglow_path', 73 | help='Path to waveglow decoder checkpoint with model') 74 | parser.add_argument('-o', "--output_dir", required=True) 75 | parser.add_argument("-s", "--sigma", default=1.0, type=float) 76 | parser.add_argument("--sampling_rate", default=22050, type=int) 77 | parser.add_argument("--is_fp16", action="store_true") 78 | parser.add_argument("-d", "--denoiser_strength", default=0.0, type=float, 79 | help='Removes model bias. 
Start with 0.1 and adjust') 80 | 81 | args = parser.parse_args() 82 | 83 | main(args.filelist_path, args.waveglow_path, args.sigma, args.output_dir, 84 | args.sampling_rate, args.is_fp16, args.denoiser_strength) 85 | -------------------------------------------------------------------------------- /loss_scaler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class LossScaler: 4 | 5 | def __init__(self, scale=1): 6 | self.cur_scale = scale 7 | 8 | # `params` is a list / generator of torch.Variable 9 | def has_overflow(self, params): 10 | return False 11 | 12 | # `x` is a torch.Tensor 13 | def _has_inf_or_nan(x): 14 | return False 15 | 16 | # `overflow` is boolean indicating whether we overflowed in gradient 17 | def update_scale(self, overflow): 18 | pass 19 | 20 | @property 21 | def loss_scale(self): 22 | return self.cur_scale 23 | 24 | def scale_gradient(self, module, grad_in, grad_out): 25 | return tuple(self.loss_scale * g for g in grad_in) 26 | 27 | def backward(self, loss): 28 | scaled_loss = loss*self.loss_scale 29 | scaled_loss.backward() 30 | 31 | class DynamicLossScaler: 32 | 33 | def __init__(self, 34 | init_scale=2**32, 35 | scale_factor=2., 36 | scale_window=1000): 37 | self.cur_scale = init_scale 38 | self.cur_iter = 0 39 | self.last_overflow_iter = -1 40 | self.scale_factor = scale_factor 41 | self.scale_window = scale_window 42 | 43 | # `params` is a list / generator of torch.Variable 44 | def has_overflow(self, params): 45 | # return False 46 | for p in params: 47 | if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): 48 | return True 49 | 50 | return False 51 | 52 | # `x` is a torch.Tensor 53 | def _has_inf_or_nan(x): 54 | cpu_sum = float(x.float().sum()) 55 | if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: 56 | return True 57 | return False 58 | 59 | # `overflow` is boolean indicating whether we overflowed in gradient 60 | def update_scale(self, overflow): 61 | if overflow: 62 | #self.cur_scale /= self.scale_factor 63 | self.cur_scale = max(self.cur_scale/self.scale_factor, 1) 64 | self.last_overflow_iter = self.cur_iter 65 | else: 66 | if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: 67 | self.cur_scale *= self.scale_factor 68 | # self.cur_scale = 1 69 | self.cur_iter += 1 70 | 71 | @property 72 | def loss_scale(self): 73 | return self.cur_scale 74 | 75 | def scale_gradient(self, module, grad_in, grad_out): 76 | return tuple(self.loss_scale * g for g in grad_in) 77 | 78 | def backward(self, loss): 79 | scaled_loss = loss*self.loss_scale 80 | scaled_loss.backward() 81 | 82 | ############################################################## 83 | # Example usage below here -- assuming it's in a separate file 84 | ############################################################## 85 | if __name__ == "__main__": 86 | import torch 87 | from torch.autograd import Variable 88 | from dynamic_loss_scaler import DynamicLossScaler 89 | 90 | # N is batch size; D_in is input dimension; 91 | # H is hidden dimension; D_out is output dimension. 92 | N, D_in, H, D_out = 64, 1000, 100, 10 93 | 94 | # Create random Tensors to hold inputs and outputs, and wrap them in Variables. 
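    # (torch.autograd.Variable is a legacy wrapper; on current PyTorch it simply returns a
    # Tensor, so creating the tensors with requires_grad directly would behave the same.)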
95 | x = Variable(torch.randn(N, D_in), requires_grad=False) 96 | y = Variable(torch.randn(N, D_out), requires_grad=False) 97 | 98 | w1 = Variable(torch.randn(D_in, H), requires_grad=True) 99 | w2 = Variable(torch.randn(H, D_out), requires_grad=True) 100 | parameters = [w1, w2] 101 | 102 | learning_rate = 1e-6 103 | optimizer = torch.optim.SGD(parameters, lr=learning_rate) 104 | loss_scaler = DynamicLossScaler() 105 | 106 | for t in range(500): 107 | y_pred = x.mm(w1).clamp(min=0).mm(w2) 108 | loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale 109 | print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) 110 | print('Iter {} scaled loss: {}'.format(t, loss.data[0])) 111 | print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) 112 | 113 | # Run backprop 114 | optimizer.zero_grad() 115 | loss.backward() 116 | 117 | # Check for overflow 118 | has_overflow = DynamicLossScaler.has_overflow(parameters) 119 | 120 | # If no overflow, unscale grad and update as usual 121 | if not has_overflow: 122 | for param in parameters: 123 | param.grad.data.mul_(1. / loss_scaler.loss_scale) 124 | optimizer.step() 125 | # Otherwise, don't do anything -- ie, skip iteration 126 | else: 127 | print('OVERFLOW!') 128 | 129 | # Update loss scale for next iteration 130 | loss_scaler.update_scale(has_overflow) 131 | 132 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | import torch.utils.data 5 | 6 | import layers 7 | from utils import load_wav_to_torch, load_filepaths_and_text 8 | from text import text_to_sequence 9 | 10 | 11 | class TextMelLoader(torch.utils.data.Dataset): 12 | """ 13 | 1) loads audio,text pairs 14 | 2) normalizes text and converts them to sequences of one-hot vectors 15 | 3) computes mel-spectrograms from audio files. 
16 | """ 17 | def __init__(self, audiopaths_and_text, hparams): 18 | self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) 19 | self.text_cleaners = hparams.text_cleaners 20 | self.max_wav_value = hparams.max_wav_value 21 | self.sampling_rate = hparams.sampling_rate 22 | self.load_mel_from_disk = hparams.load_mel_from_disk 23 | self.stft = layers.TacotronSTFT( 24 | hparams.filter_length, hparams.hop_length, hparams.win_length, 25 | hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, 26 | hparams.mel_fmax) 27 | random.seed(hparams.seed) 28 | random.shuffle(self.audiopaths_and_text) 29 | 30 | def get_mel_text_pair(self, audiopath_and_text): 31 | # separate filename and text 32 | audiopath, text = audiopath_and_text[0], audiopath_and_text[1] 33 | text = self.get_text(text) 34 | mel = self.get_mel(audiopath) 35 | return (text, mel) 36 | 37 | def get_mel(self, filename): 38 | if not self.load_mel_from_disk: 39 | audio, sampling_rate = load_wav_to_torch(filename) 40 | if sampling_rate != self.stft.sampling_rate: 41 | raise ValueError("{} {} SR doesn't match target {} SR".format( 42 | sampling_rate, self.stft.sampling_rate)) 43 | audio_norm = audio / self.max_wav_value 44 | audio_norm = audio_norm.unsqueeze(0) 45 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 46 | melspec = self.stft.mel_spectrogram(audio_norm) 47 | melspec = torch.squeeze(melspec, 0) 48 | else: 49 | melspec = torch.from_numpy(np.load(filename)) 50 | assert melspec.size(0) == self.stft.n_mel_channels, ( 51 | 'Mel dimension mismatch: given {}, expected {}'.format( 52 | melspec.size(0), self.stft.n_mel_channels)) 53 | 54 | return melspec 55 | 56 | def get_text(self, text): 57 | text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners)) 58 | return text_norm 59 | 60 | def __getitem__(self, index): 61 | return self.get_mel_text_pair(self.audiopaths_and_text[index]) 62 | 63 | def __len__(self): 64 | return len(self.audiopaths_and_text) 65 | 66 | 67 | class TextMelCollate(): 68 | """ Zero-pads model inputs and targets based on number of frames per setep 69 | """ 70 | def __init__(self, n_frames_per_step): 71 | self.n_frames_per_step = n_frames_per_step 72 | 73 | def __call__(self, batch): 74 | """Collate's training batch from normalized text and mel-spectrogram 75 | PARAMS 76 | ------ 77 | batch: [text_normalized, mel_normalized] 78 | """ 79 | # Right zero-pad all one-hot text sequences to max input length 80 | input_lengths, ids_sorted_decreasing = torch.sort( 81 | torch.LongTensor([len(x[0]) for x in batch]), 82 | dim=0, descending=True) 83 | max_input_len = input_lengths[0] 84 | 85 | text_padded = torch.LongTensor(len(batch), max_input_len) 86 | text_padded.zero_() 87 | for i in range(len(ids_sorted_decreasing)): 88 | text = batch[ids_sorted_decreasing[i]][0] 89 | text_padded[i, :text.size(0)] = text 90 | 91 | # Right zero-pad mel-spec 92 | num_mels = batch[0][1].size(0) 93 | max_target_len = max([x[1].size(1) for x in batch]) 94 | if max_target_len % self.n_frames_per_step != 0: 95 | max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step 96 | assert max_target_len % self.n_frames_per_step == 0 97 | 98 | # include mel padded and gate padded 99 | mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) 100 | mel_padded.zero_() 101 | gate_padded = torch.FloatTensor(len(batch), max_target_len) 102 | gate_padded.zero_() 103 | output_lengths = torch.LongTensor(len(batch)) 104 | for i in range(len(ids_sorted_decreasing)): 105 | mel 
= batch[ids_sorted_decreasing[i]][1] 106 | mel_padded[i, :, :mel.size(1)] = mel 107 | gate_padded[i, mel.size(1)-1:] = 1 108 | output_lengths[i] = mel.size(1) 109 | 110 | return text_padded, input_lengths, mel_padded, gate_padded, \ 111 | output_lengths 112 | -------------------------------------------------------------------------------- /waveglow/mel2samp.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # *****************************************************************************\ 27 | import os 28 | import random 29 | import argparse 30 | import json 31 | import torch 32 | import torch.utils.data 33 | import sys 34 | from scipy.io.wavfile import read 35 | 36 | # We're using the audio processing from TacoTron2 to make sure it matches 37 | sys.path.insert(0, 'tacotron2') 38 | from tacotron2.layers import TacotronSTFT 39 | 40 | MAX_WAV_VALUE = 32768.0 41 | 42 | def files_to_list(filename): 43 | """ 44 | Takes a text file of filenames and makes a list of filenames 45 | """ 46 | with open(filename, encoding='utf-8') as f: 47 | files = f.readlines() 48 | 49 | files = [f.rstrip() for f in files] 50 | return files 51 | 52 | def load_wav_to_torch(full_path): 53 | """ 54 | Loads wavdata into torch array 55 | """ 56 | sampling_rate, data = read(full_path) 57 | return torch.from_numpy(data).float(), sampling_rate 58 | 59 | 60 | class Mel2Samp(torch.utils.data.Dataset): 61 | """ 62 | This is the main class that calculates the spectrogram and returns the 63 | spectrogram, audio pair. 
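Before moving on to the WaveGlow data pipeline, here is an illustrative sketch (not repository code) of how the `TextMelLoader` / `TextMelCollate` pair from `data_utils.py` above is typically wired into a `DataLoader`; it mirrors `prepare_dataloaders()` in `train.py` and assumes the default hparams object.

```python
from torch.utils.data import DataLoader
from hparams import create_hparams          # assumed helper from hparams.py
from data_utils import TextMelLoader, TextMelCollate

hparams = create_hparams()
trainset = TextMelLoader(hparams.training_files, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)

train_loader = DataLoader(trainset, num_workers=1, shuffle=True,
                          batch_size=hparams.batch_size, drop_last=True,
                          collate_fn=collate_fn)

# One padded batch, in the order produced by TextMelCollate.__call__:
text_padded, input_lengths, mel_padded, gate_padded, output_lengths = \
    next(iter(train_loader))
```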
64 | """ 65 | def __init__(self, training_files, segment_length, filter_length, 66 | hop_length, win_length, sampling_rate, mel_fmin, mel_fmax): 67 | self.audio_files = files_to_list(training_files) 68 | random.seed(1234) 69 | random.shuffle(self.audio_files) 70 | self.stft = TacotronSTFT(filter_length=filter_length, 71 | hop_length=hop_length, 72 | win_length=win_length, 73 | sampling_rate=sampling_rate, 74 | mel_fmin=mel_fmin, mel_fmax=mel_fmax) 75 | self.segment_length = segment_length 76 | self.sampling_rate = sampling_rate 77 | 78 | def get_mel(self, audio): 79 | audio_norm = audio / MAX_WAV_VALUE 80 | audio_norm = audio_norm.unsqueeze(0) 81 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 82 | melspec = self.stft.mel_spectrogram(audio_norm) 83 | melspec = torch.squeeze(melspec, 0) 84 | return melspec 85 | 86 | def __getitem__(self, index): 87 | # Read audio 88 | filename = self.audio_files[index] 89 | audio, sampling_rate = load_wav_to_torch(filename) 90 | if sampling_rate != self.sampling_rate: 91 | raise ValueError("{} SR doesn't match target {} SR".format( 92 | sampling_rate, self.sampling_rate)) 93 | 94 | # Take segment 95 | if audio.size(0) >= self.segment_length: 96 | max_audio_start = audio.size(0) - self.segment_length 97 | audio_start = random.randint(0, max_audio_start) 98 | audio = audio[audio_start:audio_start+self.segment_length] 99 | else: 100 | audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data 101 | 102 | mel = self.get_mel(audio) 103 | audio = audio / MAX_WAV_VALUE 104 | 105 | return (mel, audio) 106 | 107 | def __len__(self): 108 | return len(self.audio_files) 109 | 110 | # =================================================================== 111 | # Takes directory of clean audio and makes directory of spectrograms 112 | # Useful for making test sets 113 | # =================================================================== 114 | if __name__ == "__main__": 115 | # Get defaults so it can work with no Sacred 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('-f', "--filelist_path", required=True) 118 | parser.add_argument('-c', '--config', type=str, 119 | help='JSON file for configuration') 120 | parser.add_argument('-o', '--output_dir', type=str, 121 | help='Output directory') 122 | args = parser.parse_args() 123 | 124 | with open(args.config) as f: 125 | data = f.read() 126 | data_config = json.loads(data)["data_config"] 127 | mel2samp = Mel2Samp(**data_config) 128 | 129 | filepaths = files_to_list(args.filelist_path) 130 | 131 | # Make directory if it doesn't exist 132 | if not os.path.isdir(args.output_dir): 133 | os.makedirs(args.output_dir) 134 | os.chmod(args.output_dir, 0o775) 135 | 136 | for filepath in filepaths: 137 | audio, sr = load_wav_to_torch(filepath) 138 | melspectrogram = mel2samp.get_mel(audio) 139 | filename = os.path.basename(filepath) 140 | new_filepath = args.output_dir + '/' + filename + '.pt' 141 | print(new_filepath) 142 | torch.save(melspectrogram, new_filepath) 143 | -------------------------------------------------------------------------------- /stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 
6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from audio_processing import window_sumsquare 40 | 41 | 42 | class STFT(torch.nn.Module): 43 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 44 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 45 | window='hann'): 46 | super(STFT, self).__init__() 47 | self.filter_length = filter_length 48 | self.hop_length = hop_length 49 | self.win_length = win_length 50 | self.window = window 51 | self.forward_transform = None 52 | scale = self.filter_length / self.hop_length 53 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 54 | 55 | cutoff = int((self.filter_length / 2 + 1)) 56 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 57 | np.imag(fourier_basis[:cutoff, :])]) 58 | 59 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 60 | inverse_basis = torch.FloatTensor( 61 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 62 | 63 | if window is not None: 64 | assert(filter_length >= win_length) 65 | # get window and zero center pad it to filter_length 66 | fft_window = get_window(window, win_length, fftbins=True) 67 | fft_window = pad_center(fft_window, filter_length) 68 | fft_window = torch.from_numpy(fft_window).float() 69 | 70 | # window the bases 71 | forward_basis *= fft_window 72 | inverse_basis *= fft_window 73 | 74 | self.register_buffer('forward_basis', forward_basis.float()) 75 | self.register_buffer('inverse_basis', inverse_basis.float()) 76 | 77 | def transform(self, input_data): 78 | num_batches = input_data.size(0) 79 | num_samples = input_data.size(1) 80 | 81 | self.num_samples = num_samples 82 | 83 | # similar to librosa, reflect-pad the input 84 | input_data = input_data.view(num_batches, 1, num_samples) 85 | input_data = F.pad( 86 | input_data.unsqueeze(1), 
87 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 88 | mode='reflect') 89 | input_data = input_data.squeeze(1) 90 | 91 | forward_transform = F.conv1d( 92 | input_data, 93 | Variable(self.forward_basis, requires_grad=False), 94 | stride=self.hop_length, 95 | padding=0) 96 | 97 | cutoff = int((self.filter_length / 2) + 1) 98 | real_part = forward_transform[:, :cutoff, :] 99 | imag_part = forward_transform[:, cutoff:, :] 100 | 101 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 102 | phase = torch.autograd.Variable( 103 | torch.atan2(imag_part.data, real_part.data)) 104 | 105 | return magnitude, phase 106 | 107 | def inverse(self, magnitude, phase): 108 | recombine_magnitude_phase = torch.cat( 109 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 110 | 111 | inverse_transform = F.conv_transpose1d( 112 | recombine_magnitude_phase, 113 | Variable(self.inverse_basis, requires_grad=False), 114 | stride=self.hop_length, 115 | padding=0) 116 | 117 | if self.window is not None: 118 | window_sum = window_sumsquare( 119 | self.window, magnitude.size(-1), hop_length=self.hop_length, 120 | win_length=self.win_length, n_fft=self.filter_length, 121 | dtype=np.float32) 122 | # remove modulation effects 123 | approx_nonzero_indices = torch.from_numpy( 124 | np.where(window_sum > tiny(window_sum))[0]) 125 | window_sum = torch.autograd.Variable( 126 | torch.from_numpy(window_sum), requires_grad=False) 127 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 128 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 129 | 130 | # scale by hop ratio 131 | inverse_transform *= float(self.filter_length) / self.hop_length 132 | 133 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 134 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 135 | 136 | return inverse_transform 137 | 138 | def forward(self, input_data): 139 | self.magnitude, self.phase = self.transform(input_data) 140 | reconstruction = self.inverse(self.magnitude, self.phase) 141 | return reconstruction 142 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from .numbers import normalize_numbers 18 | import pyopenjtalk 19 | from janome.tokenizer import Tokenizer 20 | 21 | 22 | # Regular expression matching whitespace: 23 | _whitespace_re = re.compile(r'\s+') 24 | 25 | # List of (regular expression, replacement) pairs for abbreviations: 26 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 27 | ('mrs', 'misess'), 28 | ('mr', 'mister'), 29 | ('dr', 'doctor'), 30 | ('st', 'saint'), 31 | ('co', 'company'), 32 | ('jr', 'junior'), 33 | ('maj', 'major'), 34 | ('gen', 'general'), 35 | ('drs', 'doctors'), 36 | ('rev', 'reverend'), 37 | ('lt', 'lieutenant'), 38 | ('hon', 'honorable'), 39 | ('sgt', 'sergeant'), 40 | ('capt', 'captain'), 41 | ('esq', 'esquire'), 42 | ('ltd', 'limited'), 43 | ('col', 'colonel'), 44 | ('ft', 'fort'), 45 | ]] 46 | 47 | # Regular expression matching Japanese without punctuation marks: 48 | _japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 49 | 50 | # Regular expression matching non-Japanese characters or punctuation marks: 51 | _japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 52 | 53 | 54 | # Tokenizer for Japanese 55 | tokenizer = Tokenizer() 56 | 57 | 58 | def expand_abbreviations(text): 59 | for regex, replacement in _abbreviations: 60 | text = re.sub(regex, replacement, text) 61 | return text 62 | 63 | 64 | def expand_numbers(text): 65 | return normalize_numbers(text) 66 | 67 | 68 | def lowercase(text): 69 | return text.lower() 70 | 71 | 72 | def collapse_whitespace(text): 73 | return re.sub(_whitespace_re, ' ', text) 74 | 75 | 76 | def convert_to_ascii(text): 77 | return unidecode(text) 78 | 79 | 80 | def basic_cleaners(text): 81 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 82 | text = lowercase(text) 83 | text = collapse_whitespace(text) 84 | return text 85 | 86 | 87 | def transliteration_cleaners(text): 88 | '''Pipeline for non-English text that transliterates to ASCII.''' 89 | text = convert_to_ascii(text) 90 | text = lowercase(text) 91 | text = collapse_whitespace(text) 92 | return text 93 | 94 | 95 | def english_cleaners(text): 96 | '''Pipeline for English text, including number and abbreviation expansion.''' 97 | text = convert_to_ascii(text) 98 | text = lowercase(text) 99 | text = expand_numbers(text) 100 | text = expand_abbreviations(text) 101 | text = collapse_whitespace(text) 102 | return text 103 | 104 | 105 | def japanese_cleaners(text): 106 | '''Pipeline for Japanese text.''' 107 | sentences = re.split(_japanese_marks, text) 108 | marks = re.findall(_japanese_marks, text) 109 | text = '' 110 | for i, mark in enumerate(marks): 111 | if re.match(_japanese_characters, sentences[i]): 112 | text += pyopenjtalk.g2p(sentences[i], kana=False).replace('pau','').replace(' ','') 113 | text += unidecode(mark).replace(' ','') 114 | if re.match(_japanese_characters, sentences[-1]): 115 | text += pyopenjtalk.g2p(sentences[-1], kana=False).replace('pau','').replace(' ','') 116 | if re.match('[A-Za-z]',text[-1]): 117 | text += '.' 118 | return text 119 | 120 | 121 | def japanese_tokenization_cleaners(text): 122 | '''Pipeline for tokenizing Japanese text.''' 123 | words = [] 124 | for token in tokenizer.tokenize(text): 125 | if token.phonetic!='*': 126 | words.append(token.phonetic) 127 | else: 128 | words.append(token.surface) 129 | text = '' 130 | for word in words: 131 | if re.match(_japanese_characters, word): 132 | if word[0] == '\u30fc': 133 | continue 134 | if len(text)>0: 135 | text += ' ' 136 | text += pyopenjtalk.g2p(word, kana=False).replace(' ','') 137 | else: 138 | text += unidecode(word).replace(' ','') 139 | if re.match('[A-Za-z]',text[-1]): 140 | text += '.' 
141 | return text 142 | 143 | 144 | def japanese_accent_cleaners(text): 145 | '''Pipeline for notating accent in Japanese text.''' 146 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 147 | sentences = re.split(_japanese_marks, text) 148 | marks = re.findall(_japanese_marks, text) 149 | text = '' 150 | for i, sentence in enumerate(sentences): 151 | if re.match(_japanese_characters, sentence): 152 | text += ':' 153 | labels = pyopenjtalk.extract_fullcontext(sentence) 154 | for n, label in enumerate(labels): 155 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 156 | if phoneme not in ['sil','pau']: 157 | text += phoneme 158 | else: 159 | continue 160 | n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 161 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 162 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 163 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 164 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil','pau']: 165 | a2_next=-1 166 | else: 167 | a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 168 | # Accent phrase boundary 169 | if a3 == 1 and a2_next == 1: 170 | text += ' ' 171 | # Falling 172 | elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras: 173 | text += ')' 174 | # Rising 175 | elif a2 == 1 and a2_next == 2: 176 | text += '(' 177 | if i 0: 108 | # cross-node buffer sync 109 | flat_buffers = _flatten_dense_tensors(buffers) 110 | dist.broadcast(flat_buffers, 0) 111 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 112 | buf.copy_(synced) 113 | def train(self, mode=True): 114 | # Clear NCCL communicator and CUDA event cache of the default group ID, 115 | # These cache will be recreated at the later call. This is currently a 116 | # work-around for a potential NCCL deadlock. 117 | if dist._backend == dist.dist_backend.NCCL: 118 | dist._clear_group_cache() 119 | super(DistributedDataParallel, self).train(mode) 120 | self.module.train(mode) 121 | ''' 122 | ''' 123 | Modifies existing model to do gradient allreduce, but doesn't change class 124 | so you don't need "module" 125 | ''' 126 | def apply_gradient_allreduce(module): 127 | if not hasattr(dist, '_backend'): 128 | module.warn_on_half = True 129 | else: 130 | module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 131 | 132 | for p in module.state_dict().values(): 133 | if not torch.is_tensor(p): 134 | continue 135 | dist.broadcast(p, 0) 136 | 137 | def allreduce_params(): 138 | if(module.needs_reduction): 139 | module.needs_reduction = False 140 | buckets = {} 141 | for param in module.parameters(): 142 | if param.requires_grad and param.grad is not None: 143 | tp = param.data.dtype 144 | if tp not in buckets: 145 | buckets[tp] = [] 146 | buckets[tp].append(param) 147 | if module.warn_on_half: 148 | if torch.cuda.HalfTensor in buckets: 149 | print("WARNING: gloo dist backend for half parameters may be extremely slow." + 150 | " It is recommended to use the NCCL backend in this case. 
This currently requires" + 151 | "PyTorch built from top of tree master.") 152 | module.warn_on_half = False 153 | 154 | for tp in buckets: 155 | bucket = buckets[tp] 156 | grads = [param.grad.data for param in bucket] 157 | coalesced = _flatten_dense_tensors(grads) 158 | dist.all_reduce(coalesced) 159 | coalesced /= dist.get_world_size() 160 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 161 | buf.copy_(synced) 162 | 163 | for param in list(module.parameters()): 164 | def allreduce_hook(*unused): 165 | Variable._execution_engine.queue_callback(allreduce_params) 166 | if param.requires_grad: 167 | param.register_hook(allreduce_hook) 168 | 169 | def set_needs_reduction(self, input, output): 170 | self.needs_reduction = True 171 | 172 | module.register_forward_hook(set_needs_reduction) 173 | return module 174 | -------------------------------------------------------------------------------- /waveglow/distributed.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import os 28 | import sys 29 | import time 30 | import subprocess 31 | import argparse 32 | 33 | import torch 34 | import torch.distributed as dist 35 | from torch.autograd import Variable 36 | 37 | def reduce_tensor(tensor, num_gpus): 38 | rt = tensor.clone() 39 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 40 | rt /= num_gpus 41 | return rt 42 | 43 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url): 44 | assert torch.cuda.is_available(), "Distributed mode requires CUDA." 45 | print("Initializing Distributed") 46 | 47 | # Set cuda device so everything is done on the right GPU. 
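For context, a sketch of how `waveglow/train.py` combines these distributed helpers when several GPUs are visible; `rank`, `group_name`, `dist_config` and `waveglow_config` are assumed to come from its command line and `config.json`, so this is illustrative only.

```python
import torch
from distributed import init_distributed, apply_gradient_allreduce
from glow import WaveGlow

num_gpus = torch.cuda.device_count()
if num_gpus > 1:
    init_distributed(rank, num_gpus, group_name, **dist_config)

model = WaveGlow(**waveglow_config).cuda()
if num_gpus > 1:
    # broadcasts parameters from rank 0 and registers gradient all-reduce hooks
    model = apply_gradient_allreduce(model)
```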
48 | torch.cuda.set_device(rank % torch.cuda.device_count()) 49 | 50 | # Initialize distributed communication 51 | dist.init_process_group(dist_backend, init_method=dist_url, 52 | world_size=num_gpus, rank=rank, 53 | group_name=group_name) 54 | 55 | def _flatten_dense_tensors(tensors): 56 | """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of 57 | same dense type. 58 | Since inputs are dense, the resulting tensor will be a concatenated 1D 59 | buffer. Element-wise operation on this buffer will be equivalent to 60 | operating individually. 61 | Arguments: 62 | tensors (Iterable[Tensor]): dense tensors to flatten. 63 | Returns: 64 | A contiguous 1D buffer containing input tensors. 65 | """ 66 | if len(tensors) == 1: 67 | return tensors[0].contiguous().view(-1) 68 | flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0) 69 | return flat 70 | 71 | def _unflatten_dense_tensors(flat, tensors): 72 | """View a flat buffer using the sizes of tensors. Assume that tensors are of 73 | same dense type, and that flat is given by _flatten_dense_tensors. 74 | Arguments: 75 | flat (Tensor): flattened dense tensors to unflatten. 76 | tensors (Iterable[Tensor]): dense tensors whose sizes will be used to 77 | unflatten flat. 78 | Returns: 79 | Unflattened dense tensors with sizes same as tensors and values from 80 | flat. 81 | """ 82 | outputs = [] 83 | offset = 0 84 | for tensor in tensors: 85 | numel = tensor.numel() 86 | outputs.append(flat.narrow(0, offset, numel).view_as(tensor)) 87 | offset += numel 88 | return tuple(outputs) 89 | 90 | def apply_gradient_allreduce(module): 91 | """ 92 | Modifies existing model to do gradient allreduce, but doesn't change class 93 | so you don't need "module" 94 | """ 95 | if not hasattr(dist, '_backend'): 96 | module.warn_on_half = True 97 | else: 98 | module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 99 | 100 | for p in module.state_dict().values(): 101 | if not torch.is_tensor(p): 102 | continue 103 | dist.broadcast(p, 0) 104 | 105 | def allreduce_params(): 106 | if(module.needs_reduction): 107 | module.needs_reduction = False 108 | buckets = {} 109 | for param in module.parameters(): 110 | if param.requires_grad and param.grad is not None: 111 | tp = type(param.data) 112 | if tp not in buckets: 113 | buckets[tp] = [] 114 | buckets[tp].append(param) 115 | if module.warn_on_half: 116 | if torch.cuda.HalfTensor in buckets: 117 | print("WARNING: gloo dist backend for half parameters may be extremely slow." + 118 | " It is recommended to use the NCCL backend in this case. 
This currently requires" + 119 | "PyTorch built from top of tree master.") 120 | module.warn_on_half = False 121 | 122 | for tp in buckets: 123 | bucket = buckets[tp] 124 | grads = [param.grad.data for param in bucket] 125 | coalesced = _flatten_dense_tensors(grads) 126 | dist.all_reduce(coalesced) 127 | coalesced /= dist.get_world_size() 128 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 129 | buf.copy_(synced) 130 | 131 | for param in list(module.parameters()): 132 | def allreduce_hook(*unused): 133 | Variable._execution_engine.queue_callback(allreduce_params) 134 | if param.requires_grad: 135 | param.register_hook(allreduce_hook) 136 | dir(param) 137 | 138 | def set_needs_reduction(self, input, output): 139 | self.needs_reduction = True 140 | 141 | module.register_forward_hook(set_needs_reduction) 142 | return module 143 | 144 | 145 | def main(config, stdout_dir, args_str): 146 | args_list = ['train.py'] 147 | args_list += args_str.split(' ') if len(args_str) > 0 else [] 148 | 149 | args_list.append('--config={}'.format(config)) 150 | 151 | num_gpus = torch.cuda.device_count() 152 | args_list.append('--num_gpus={}'.format(num_gpus)) 153 | args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S"))) 154 | 155 | if not os.path.isdir(stdout_dir): 156 | os.makedirs(stdout_dir) 157 | os.chmod(stdout_dir, 0o775) 158 | 159 | workers = [] 160 | 161 | for i in range(num_gpus): 162 | args_list[-2] = '--rank={}'.format(i) 163 | stdout = None if i == 0 else open( 164 | os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w") 165 | print(args_list) 166 | p = subprocess.Popen([str(sys.executable)]+args_list, stdout=stdout) 167 | workers.append(p) 168 | 169 | for p in workers: 170 | p.wait() 171 | 172 | 173 | if __name__ == '__main__': 174 | parser = argparse.ArgumentParser() 175 | parser.add_argument('-c', '--config', type=str, required=True, 176 | help='JSON file for configuration') 177 | parser.add_argument('-s', '--stdout_dir', type=str, default=".", 178 | help='directory to save stoud logs') 179 | parser.add_argument( 180 | '-a', '--args_str', type=str, default='', 181 | help='double quoted string with space separated key value pairs') 182 | 183 | args = parser.parse_args() 184 | main(args.config, args.stdout_dir, args.args_str) 185 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential 
passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # models 366 | /ayachi_* 367 | /inaba_* 368 | /tomotake_* 369 | /murasame_* 370 | /arihara_* 371 | /waveglow_* 372 | 373 | # jupyter cache 374 | /.ipynb_checkpoints 375 | -------------------------------------------------------------------------------- /waveglow/train.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import argparse 28 | import json 29 | import os 30 | import torch 31 | 32 | #=====START: ADDED FOR DISTRIBUTED====== 33 | from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor 34 | from torch.utils.data.distributed import DistributedSampler 35 | #=====END: ADDED FOR DISTRIBUTED====== 36 | 37 | from torch.utils.data import DataLoader 38 | from glow import WaveGlow, WaveGlowLoss 39 | from mel2samp import Mel2Samp 40 | 41 | def load_checkpoint(checkpoint_path, model, optimizer): 42 | assert os.path.isfile(checkpoint_path) 43 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 44 | iteration = checkpoint_dict['iteration'] 45 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 46 | model_for_loading = checkpoint_dict['model'] 47 | model.load_state_dict(model_for_loading.state_dict()) 48 | print("Loaded checkpoint '{}' (iteration {})" .format( 49 | checkpoint_path, iteration)) 50 | return model, optimizer, iteration 51 | 52 | def save_checkpoint(model, optimizer, learning_rate, iteration, filepath): 53 | print("Saving model and optimizer state at iteration {} to {}".format( 54 | iteration, filepath)) 55 | model_for_saving = WaveGlow(**waveglow_config).cuda() 56 | model_for_saving.load_state_dict(model.state_dict()) 57 | torch.save({'model': model_for_saving, 58 | 'iteration': iteration, 59 | 'optimizer': optimizer.state_dict(), 60 | 'learning_rate': learning_rate}, filepath) 61 | 62 | def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, 63 | sigma, iters_per_checkpoint, batch_size, seed, fp16_run, 64 | checkpoint_path, with_tensorboard): 65 | torch.manual_seed(seed) 66 | torch.cuda.manual_seed(seed) 67 | #=====START: ADDED FOR DISTRIBUTED====== 68 | if num_gpus > 1: 69 | init_distributed(rank, num_gpus, group_name, **dist_config) 70 | #=====END: ADDED FOR DISTRIBUTED====== 71 | 72 | criterion = WaveGlowLoss(sigma) 73 | model = WaveGlow(**waveglow_config).cuda() 74 | 75 | #=====START: ADDED FOR DISTRIBUTED====== 76 | if num_gpus > 1: 77 | model = apply_gradient_allreduce(model) 78 | #=====END: ADDED FOR DISTRIBUTED====== 79 | 80 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 81 | 82 | if fp16_run: 83 | from apex import amp 84 | model, optimizer = amp.initialize(model, optimizer, opt_level='O1') 85 | 86 | # Load checkpoint if one exists 87 | iteration = 0 88 | if checkpoint_path != "": 89 | model, optimizer, iteration = load_checkpoint(checkpoint_path, model, 90 | optimizer) 91 | iteration += 1 # next iteration is iteration + 1 92 | 93 | trainset = Mel2Samp(**data_config) 94 | # =====START: ADDED FOR DISTRIBUTED====== 95 | train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None 96 | # =====END: ADDED FOR DISTRIBUTED====== 97 | train_loader = DataLoader(trainset, num_workers=1, shuffle=False, 98 | sampler=train_sampler, 99 | batch_size=batch_size, 100 | 
pin_memory=False, 101 | drop_last=True) 102 | 103 | # Get shared output_directory ready 104 | if rank == 0: 105 | if not os.path.isdir(output_directory): 106 | os.makedirs(output_directory) 107 | os.chmod(output_directory, 0o775) 108 | print("output directory", output_directory) 109 | 110 | if with_tensorboard and rank == 0: 111 | from tensorboardX import SummaryWriter 112 | logger = SummaryWriter(os.path.join(output_directory, 'logs')) 113 | 114 | model.train() 115 | epoch_offset = max(0, int(iteration / len(train_loader))) 116 | # ================ MAIN TRAINNIG LOOP! =================== 117 | for epoch in range(epoch_offset, epochs): 118 | print("Epoch: {}".format(epoch)) 119 | for i, batch in enumerate(train_loader): 120 | model.zero_grad() 121 | 122 | mel, audio = batch 123 | mel = torch.autograd.Variable(mel.cuda()) 124 | audio = torch.autograd.Variable(audio.cuda()) 125 | outputs = model((mel, audio)) 126 | 127 | loss = criterion(outputs) 128 | if num_gpus > 1: 129 | reduced_loss = reduce_tensor(loss.data, num_gpus).item() 130 | else: 131 | reduced_loss = loss.item() 132 | 133 | if fp16_run: 134 | with amp.scale_loss(loss, optimizer) as scaled_loss: 135 | scaled_loss.backward() 136 | else: 137 | loss.backward() 138 | 139 | optimizer.step() 140 | 141 | print("{}:\t{:.9f}".format(iteration, reduced_loss)) 142 | if with_tensorboard and rank == 0: 143 | logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) 144 | 145 | if (iteration % iters_per_checkpoint == 0): 146 | if rank == 0: 147 | checkpoint_path = "{}/waveglow_{}".format( 148 | output_directory, iteration) 149 | save_checkpoint(model, optimizer, learning_rate, iteration, 150 | checkpoint_path) 151 | 152 | iteration += 1 153 | 154 | if __name__ == "__main__": 155 | parser = argparse.ArgumentParser() 156 | parser.add_argument('-c', '--config', type=str, 157 | help='JSON file for configuration') 158 | parser.add_argument('-r', '--rank', type=int, default=0, 159 | help='rank of process for distributed') 160 | parser.add_argument('-g', '--group_name', type=str, default='', 161 | help='name of group for distributed') 162 | args = parser.parse_args() 163 | 164 | # Parse configs. Globals nicer in this case 165 | with open(args.config) as f: 166 | data = f.read() 167 | config = json.loads(data) 168 | train_config = config["train_config"] 169 | global data_config 170 | data_config = config["data_config"] 171 | global dist_config 172 | dist_config = config["dist_config"] 173 | global waveglow_config 174 | waveglow_config = config["waveglow_config"] 175 | 176 | num_gpus = torch.cuda.device_count() 177 | if num_gpus > 1: 178 | if args.group_name == '': 179 | print("WARNING: Multiple GPUs detected but no distributed group set") 180 | print("Only running 1 GPU. 
Use distributed.py for multiple GPUs") 181 | num_gpus = 1 182 | 183 | if num_gpus == 1 and args.rank != 0: 184 | raise Exception("Doing single GPU training on rank > 0") 185 | 186 | torch.backends.cudnn.enabled = True 187 | torch.backends.cudnn.benchmark = False 188 | train(num_gpus, args.rank, args.group_name, **train_config) 189 | -------------------------------------------------------------------------------- /waveglow/glow_old.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from glow import Invertible1x1Conv, remove 4 | 5 | 6 | @torch.jit.script 7 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 8 | n_channels_int = n_channels[0] 9 | in_act = input_a+input_b 10 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 11 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 12 | acts = t_act * s_act 13 | return acts 14 | 15 | 16 | class WN(torch.nn.Module): 17 | """ 18 | This is the WaveNet like layer for the affine coupling. The primary difference 19 | from WaveNet is the convolutions need not be causal. There is also no dilation 20 | size reset. The dilation only doubles on each layer 21 | """ 22 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 23 | kernel_size): 24 | super(WN, self).__init__() 25 | assert(kernel_size % 2 == 1) 26 | assert(n_channels % 2 == 0) 27 | self.n_layers = n_layers 28 | self.n_channels = n_channels 29 | self.in_layers = torch.nn.ModuleList() 30 | self.res_skip_layers = torch.nn.ModuleList() 31 | self.cond_layers = torch.nn.ModuleList() 32 | 33 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 34 | start = torch.nn.utils.weight_norm(start, name='weight') 35 | self.start = start 36 | 37 | # Initializing last layer to 0 makes the affine coupling layers 38 | # do nothing at first. 
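A one-line sanity check of the claim in that comment: with `end.weight` and `end.bias` zeroed, `WN` outputs `s = 0` and `b = 0`, so the coupling update `audio_1 = exp(s) * audio_1 + b` used later in this file starts out as the identity (illustrative snippet, not repository code).

```python
import torch

s, b = torch.zeros(4), torch.zeros(4)   # what a zero-initialized last layer produces
audio_1 = torch.randn(4)
assert torch.allclose(torch.exp(s) * audio_1 + b, audio_1)
```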
This helps with training stability 39 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 40 | end.weight.data.zero_() 41 | end.bias.data.zero_() 42 | self.end = end 43 | 44 | for i in range(n_layers): 45 | dilation = 2 ** i 46 | padding = int((kernel_size*dilation - dilation)/2) 47 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 48 | dilation=dilation, padding=padding) 49 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 50 | self.in_layers.append(in_layer) 51 | 52 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) 53 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 54 | self.cond_layers.append(cond_layer) 55 | 56 | # last one is not necessary 57 | if i < n_layers - 1: 58 | res_skip_channels = 2*n_channels 59 | else: 60 | res_skip_channels = n_channels 61 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 62 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 63 | self.res_skip_layers.append(res_skip_layer) 64 | 65 | def forward(self, forward_input): 66 | audio, spect = forward_input 67 | audio = self.start(audio) 68 | 69 | for i in range(self.n_layers): 70 | acts = fused_add_tanh_sigmoid_multiply( 71 | self.in_layers[i](audio), 72 | self.cond_layers[i](spect), 73 | torch.IntTensor([self.n_channels])) 74 | 75 | res_skip_acts = self.res_skip_layers[i](acts) 76 | if i < self.n_layers - 1: 77 | audio = res_skip_acts[:,:self.n_channels,:] + audio 78 | skip_acts = res_skip_acts[:,self.n_channels:,:] 79 | else: 80 | skip_acts = res_skip_acts 81 | 82 | if i == 0: 83 | output = skip_acts 84 | else: 85 | output = skip_acts + output 86 | return self.end(output) 87 | 88 | 89 | class WaveGlow(torch.nn.Module): 90 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 91 | n_early_size, WN_config): 92 | super(WaveGlow, self).__init__() 93 | 94 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 95 | n_mel_channels, 96 | 1024, stride=256) 97 | assert(n_group % 2 == 0) 98 | self.n_flows = n_flows 99 | self.n_group = n_group 100 | self.n_early_every = n_early_every 101 | self.n_early_size = n_early_size 102 | self.WN = torch.nn.ModuleList() 103 | self.convinv = torch.nn.ModuleList() 104 | 105 | n_half = int(n_group/2) 106 | 107 | # Set up layers with the right sizes based on how many dimensions 108 | # have been output already 109 | n_remaining_channels = n_group 110 | for k in range(n_flows): 111 | if k % self.n_early_every == 0 and k > 0: 112 | n_half = n_half - int(self.n_early_size/2) 113 | n_remaining_channels = n_remaining_channels - self.n_early_size 114 | self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 115 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 116 | self.n_remaining_channels = n_remaining_channels # Useful during inference 117 | 118 | def forward(self, forward_input): 119 | return None 120 | """ 121 | forward_input[0] = audio: batch x time 122 | forward_input[1] = upsamp_spectrogram: batch x n_cond_channels x time 123 | """ 124 | """ 125 | spect, audio = forward_input 126 | 127 | # Upsample spectrogram to size of audio 128 | spect = self.upsample(spect) 129 | assert(spect.size(2) >= audio.size(1)) 130 | if spect.size(2) > audio.size(1): 131 | spect = spect[:, :, :audio.size(1)] 132 | 133 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 134 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 135 | 136 | audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 
2, 1) 137 | output_audio = [] 138 | s_list = [] 139 | s_conv_list = [] 140 | 141 | for k in range(self.n_flows): 142 | if k%4 == 0 and k > 0: 143 | output_audio.append(audio[:,:self.n_multi,:]) 144 | audio = audio[:,self.n_multi:,:] 145 | 146 | # project to new basis 147 | audio, s = self.convinv[k](audio) 148 | s_conv_list.append(s) 149 | 150 | n_half = int(audio.size(1)/2) 151 | if k%2 == 0: 152 | audio_0 = audio[:,:n_half,:] 153 | audio_1 = audio[:,n_half:,:] 154 | else: 155 | audio_1 = audio[:,:n_half,:] 156 | audio_0 = audio[:,n_half:,:] 157 | 158 | output = self.nn[k]((audio_0, spect)) 159 | s = output[:, n_half:, :] 160 | b = output[:, :n_half, :] 161 | audio_1 = torch.exp(s)*audio_1 + b 162 | s_list.append(s) 163 | 164 | if k%2 == 0: 165 | audio = torch.cat([audio[:,:n_half,:], audio_1],1) 166 | else: 167 | audio = torch.cat([audio_1, audio[:,n_half:,:]], 1) 168 | output_audio.append(audio) 169 | return torch.cat(output_audio,1), s_list, s_conv_list 170 | """ 171 | 172 | def infer(self, spect, sigma=1.0): 173 | spect = self.upsample(spect) 174 | # trim conv artifacts. maybe pad spec to kernel multiple 175 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 176 | spect = spect[:, :, :-time_cutoff] 177 | 178 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 179 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 180 | 181 | if spect.type() == 'torch.cuda.HalfTensor': 182 | audio = torch.cuda.HalfTensor(spect.size(0), 183 | self.n_remaining_channels, 184 | spect.size(2)).normal_() 185 | else: 186 | audio = torch.cuda.FloatTensor(spect.size(0), 187 | self.n_remaining_channels, 188 | spect.size(2)).normal_() 189 | 190 | audio = torch.autograd.Variable(sigma*audio) 191 | 192 | for k in reversed(range(self.n_flows)): 193 | n_half = int(audio.size(1)/2) 194 | if k%2 == 0: 195 | audio_0 = audio[:,:n_half,:] 196 | audio_1 = audio[:,n_half:,:] 197 | else: 198 | audio_1 = audio[:,:n_half,:] 199 | audio_0 = audio[:,n_half:,:] 200 | 201 | output = self.WN[k]((audio_0, spect)) 202 | s = output[:, n_half:, :] 203 | b = output[:, :n_half, :] 204 | audio_1 = (audio_1 - b)/torch.exp(s) 205 | if k%2 == 0: 206 | audio = torch.cat([audio[:,:n_half,:], audio_1],1) 207 | else: 208 | audio = torch.cat([audio_1, audio[:,n_half:,:]], 1) 209 | 210 | audio = self.convinv[k](audio, reverse=True) 211 | 212 | if k%4 == 0 and k > 0: 213 | if spect.type() == 'torch.cuda.HalfTensor': 214 | z = torch.cuda.HalfTensor(spect.size(0), 215 | self.n_early_size, 216 | spect.size(2)).normal_() 217 | else: 218 | z = torch.cuda.FloatTensor(spect.size(0), 219 | self.n_early_size, 220 | spect.size(2)).normal_() 221 | audio = torch.cat((sigma*z, audio),1) 222 | 223 | return audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data 224 | 225 | @staticmethod 226 | def remove_weightnorm(model): 227 | waveglow = model 228 | for WN in waveglow.WN: 229 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 230 | WN.in_layers = remove(WN.in_layers) 231 | WN.cond_layers = remove(WN.cond_layers) 232 | WN.res_skip_layers = remove(WN.res_skip_layers) 233 | return waveglow 234 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import math 4 | import torch 5 | import argparse 6 | import torch.distributed as dist 7 | 8 | 9 | from numpy import finfo 10 | from model import Tacotron2 11 | from torch.backends 
import cudnn 12 | from hparams import create_hparams 13 | from logger import Tacotron2Logger 14 | from torch.utils.data import DataLoader 15 | from loss_function import Tacotron2Loss 16 | from distributed import apply_gradient_allreduce 17 | from data_utils import TextMelLoader, TextMelCollate 18 | from torch.utils.data.distributed import DistributedSampler 19 | 20 | 21 | device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 22 | 23 | # 整理tensor 24 | def reduce_tensor(tensor, n_gpus): 25 | rt = tensor.clone() 26 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 27 | rt /= n_gpus 28 | return rt 29 | 30 | 31 | def init_distributed(hparams, n_gpus, rank, group_name): 32 | #assert torch.cuda.is_available(), "Distributed mode requires CUDA." 33 | if torch.cuda.is_available() : 34 | # Set cuda device so everything is done on the right GPU. 35 | torch.cuda.set_device(rank % torch.cuda.device_count()) 36 | # Initialize distributed communication 37 | dist.init_process_group(backend=hparams.dist_backend, 38 | init_method=hparams.dist_url, 39 | world_size=n_gpus, 40 | rank=rank, 41 | group_name=group_name) 42 | print("Distributed mode requires CUDA.") 43 | else : 44 | print("Use the CPU") 45 | print("Initializing Distributed") 46 | 47 | print("Done initializing distributed") 48 | 49 | 50 | 51 | def prepare_dataloaders(hparams): 52 | # Get data, data loaders and collate function ready 53 | trainset = TextMelLoader(hparams.training_files, hparams) 54 | valset = TextMelLoader(hparams.validation_files, hparams) 55 | collate_fn = TextMelCollate(hparams.n_frames_per_step) 56 | 57 | if hparams.distributed_run: 58 | train_sampler = DistributedSampler(trainset) 59 | shuffle = False 60 | else: 61 | train_sampler = None 62 | shuffle = True 63 | 64 | train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle, 65 | sampler=train_sampler, 66 | batch_size=hparams.batch_size, pin_memory=False, 67 | drop_last=True, collate_fn=collate_fn) 68 | return train_loader, valset, collate_fn 69 | 70 | 71 | def prepare_directories_and_logger(output_directory, log_directory, rank): 72 | if rank == 0: 73 | if not os.path.isdir(output_directory): 74 | os.makedirs(output_directory) 75 | os.chmod(output_directory, 0o775) 76 | logger = Tacotron2Logger(os.path.join(output_directory, log_directory)) 77 | else: 78 | logger = None 79 | return logger 80 | 81 | 82 | def load_model(hparams): 83 | model = Tacotron2(hparams) 84 | model.to(device) 85 | if hparams.fp16_run: 86 | model.decoder.attention_layer.score_mask_value = finfo('float16').min 87 | 88 | if hparams.distributed_run: 89 | model = apply_gradient_allreduce(model) 90 | 91 | return model 92 | 93 | 94 | def warm_start_model(checkpoint_path, model, ignore_layers): 95 | assert os.path.isfile(checkpoint_path) 96 | print("Warm starting model from checkpoint '{}'".format(checkpoint_path)) 97 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 98 | model_dict = checkpoint_dict['state_dict'] 99 | if len(ignore_layers) > 0: 100 | model_dict = {k: v for k, v in model_dict.items() 101 | if k not in ignore_layers} 102 | dummy_dict = model.state_dict() 103 | dummy_dict.update(model_dict) 104 | model_dict = dummy_dict 105 | model.load_state_dict(model_dict) 106 | return model 107 | 108 | 109 | def load_checkpoint(checkpoint_path, model, optimizer): 110 | assert os.path.isfile(checkpoint_path) 111 | print("Loading checkpoint '{}'".format(checkpoint_path)) 112 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 113 | 
model.load_state_dict(checkpoint_dict['state_dict']) 114 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 115 | learning_rate = checkpoint_dict['learning_rate'] 116 | iteration = checkpoint_dict['iteration'] 117 | print("Loaded checkpoint '{}' from iteration {}" .format( 118 | checkpoint_path, iteration)) 119 | return model, optimizer, learning_rate, iteration 120 | 121 | 122 | def save_checkpoint(model, optimizer, learning_rate, iteration, filepath): 123 | print("Saving model and optimizer state at iteration {} to {}".format( 124 | iteration, filepath)) 125 | torch.save({'iteration': iteration, 126 | 'state_dict': model.state_dict(), 127 | 'optimizer': optimizer.state_dict(), 128 | 'learning_rate': learning_rate}, filepath) 129 | 130 | 131 | def validate(model, criterion, valset, iteration, batch_size, n_gpus, 132 | collate_fn, logger, distributed_run, rank): 133 | """Handles all the validation scoring and printing""" 134 | model.eval() 135 | with torch.no_grad(): 136 | val_sampler = DistributedSampler(valset) if distributed_run else None 137 | val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1, 138 | shuffle=False, batch_size=batch_size, 139 | pin_memory=False, collate_fn=collate_fn) 140 | 141 | val_loss = 0.0 142 | for i, batch in enumerate(val_loader): 143 | x, y = model.parse_batch(batch) 144 | y_pred = model(x) 145 | loss = criterion(y_pred, y) 146 | if distributed_run: 147 | reduced_val_loss = reduce_tensor(loss.data, n_gpus).item() 148 | else: 149 | reduced_val_loss = loss.item() 150 | val_loss += reduced_val_loss 151 | val_loss = val_loss / (i + 1) 152 | 153 | model.train() 154 | if rank == 0: 155 | print("Validation loss {}: {:9f} ".format(iteration, val_loss)) 156 | logger.log_validation(val_loss, model, y, y_pred, iteration) 157 | 158 | 159 | def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, 160 | rank, group_name, hparams): 161 | """Training and validation logging results to tensorboard and stdout 162 | 163 | Params 164 | ------ 165 | output_directory (string): directory to save checkpoints 166 | log_directory (string) directory to save tensorboard logs 167 | checkpoint_path(string): checkpoint path 168 | n_gpus (int): number of gpus 169 | rank (int): rank of current gpu 170 | hparams (object): comma separated list of "name=value" pairs. 
171 | """ 172 | if hparams.distributed_run: 173 | init_distributed(hparams, n_gpus, rank, group_name) 174 | 175 | torch.manual_seed(hparams.seed) 176 | torch.cuda.manual_seed(hparams.seed) 177 | 178 | model = load_model(hparams) 179 | learning_rate = hparams.learning_rate 180 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, 181 | weight_decay=hparams.weight_decay) 182 | 183 | # 默认的是 False 可以注释掉 184 | #if hparams.fp16_run: 185 | # from apex import amp 186 | # model, optimizer = amp.initialize( 187 | # model, optimizer, opt_level='O2') 188 | 189 | if hparams.distributed_run: 190 | model = apply_gradient_allreduce(model) 191 | 192 | criterion = Tacotron2Loss() 193 | logger = prepare_directories_and_logger(output_directory, log_directory, rank) 194 | train_loader, valset, collate_fn = prepare_dataloaders(hparams) 195 | 196 | # Load checkpoint if one exists 197 | iteration = 0 198 | epoch_offset = 0 199 | if checkpoint_path is not None: 200 | if warm_start: 201 | model = warm_start_model( 202 | checkpoint_path, model, hparams.ignore_layers) 203 | else: 204 | model, optimizer, _learning_rate, iteration = load_checkpoint( 205 | checkpoint_path, model, optimizer) 206 | if hparams.use_saved_learning_rate: 207 | learning_rate = _learning_rate 208 | iteration += 1 # next iteration is iteration + 1 209 | epoch_offset = max(0, int(iteration / len(train_loader))) 210 | 211 | model.train() 212 | is_overflow = False 213 | # ================ MAIN TRAINNIG LOOP! =================== 214 | for epoch in range(epoch_offset, hparams.epochs): 215 | print("Epoch: {}".format(epoch)) 216 | for i, batch in enumerate(train_loader): 217 | start = time.perf_counter() 218 | for param_group in optimizer.param_groups: 219 | param_group['lr'] = learning_rate 220 | 221 | model.zero_grad() 222 | x, y = model.parse_batch(batch) 223 | y_pred = model(x) 224 | 225 | loss = criterion(y_pred, y) 226 | if hparams.distributed_run: 227 | reduced_loss = reduce_tensor(loss.data, n_gpus).item() 228 | else: 229 | reduced_loss = loss.item() 230 | 231 | loss.backward() 232 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.grad_clip_thresh) 233 | optimizer.step() 234 | 235 | if not is_overflow and rank == 0: 236 | duration = time.perf_counter() - start 237 | print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( 238 | iteration, reduced_loss, grad_norm, duration)) 239 | logger.log_training( 240 | reduced_loss, grad_norm, learning_rate, duration, iteration) 241 | 242 | if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): 243 | validate(model, criterion, valset, iteration, 244 | hparams.batch_size, n_gpus, collate_fn, logger, 245 | hparams.distributed_run, rank) 246 | if rank == 0: 247 | checkpoint_path = os.path.join( 248 | output_directory, "checkpoint_{}".format(iteration)) 249 | save_checkpoint(model, optimizer, learning_rate, iteration, 250 | checkpoint_path) 251 | 252 | iteration += 1 253 | 254 | 255 | if __name__ == '__main__': 256 | parser = argparse.ArgumentParser() 257 | parser.add_argument('-o', '--output_directory', type=str, 258 | help='directory to save checkpoints ') 259 | parser.add_argument('-l', '--log_directory', type=str, 260 | help='directory to save tensorboard logs') 261 | parser.add_argument('-c', '--checkpoint_path', type=str, default=None, 262 | required=False, help='checkpoint path') 263 | parser.add_argument('--warm_start', action='store_true', 264 | help='load model weights only, ignore specified layers') 265 | parser.add_argument('--n_gpus', 
type=int, default=1, 266 | required=False, help='number of gpus') 267 | parser.add_argument('--rank', type=int, default=0, 268 | required=False, help='rank of current gpu') 269 | parser.add_argument('--group_name', type=str, default='group_name', 270 | required=False, help='Distributed group name') 271 | parser.add_argument('--hparams', type=str, 272 | required=False, help='comma separated name=value pairs') 273 | 274 | args = parser.parse_args() 275 | hparams = create_hparams() 276 | 277 | cudnn.enabled = hparams.cudnn_enabled#create_hparams.cudnn_enabled 278 | cudnn.benchmark = hparams.cudnn_benchmark#create_hparams.cudnn_benchmark 279 | 280 | print("FP16 Run:", hparams.fp16_run) 281 | print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling) 282 | print("Distributed Run:", hparams.distributed_run) 283 | print("cuDNN Enabled:", hparams.cudnn_enabled) 284 | print("cuDNN Benchmark:", hparams.cudnn_benchmark) 285 | 286 | train(args.output_directory, 287 | args.log_directory, 288 | args.checkpoint_path, 289 | args.warm_start, 290 | args.n_gpus, 291 | args.rank, 292 | args.group_name, 293 | hparams) 294 | -------------------------------------------------------------------------------- /waveglow/glow.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | import copy 28 | import torch 29 | from torch.autograd import Variable 30 | import torch.nn.functional as F 31 | 32 | 33 | @torch.jit.script 34 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 35 | n_channels_int = n_channels[0] 36 | in_act = input_a+input_b 37 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 38 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 39 | acts = t_act * s_act 40 | return acts 41 | 42 | 43 | class WaveGlowLoss(torch.nn.Module): 44 | def __init__(self, sigma=1.0): 45 | super(WaveGlowLoss, self).__init__() 46 | self.sigma = sigma 47 | 48 | def forward(self, model_output): 49 | z, log_s_list, log_det_W_list = model_output 50 | for i, log_s in enumerate(log_s_list): 51 | if i == 0: 52 | log_s_total = torch.sum(log_s) 53 | log_det_W_total = log_det_W_list[i] 54 | else: 55 | log_s_total = log_s_total + torch.sum(log_s) 56 | log_det_W_total += log_det_W_list[i] 57 | 58 | loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total 59 | return loss/(z.size(0)*z.size(1)*z.size(2)) 60 | 61 | 62 | class Invertible1x1Conv(torch.nn.Module): 63 | """ 64 | The layer outputs both the convolution, and the log determinant 65 | of its weight matrix. If reverse=True it does convolution with 66 | inverse 67 | """ 68 | def __init__(self, c): 69 | super(Invertible1x1Conv, self).__init__() 70 | self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, 71 | bias=False) 72 | 73 | # Sample a random orthonormal matrix to initialize weights 74 | W = torch.qr(torch.FloatTensor(c, c).normal_())[0] 75 | 76 | # Ensure determinant is 1.0 not -1.0 77 | if torch.det(W) < 0: 78 | W[:,0] = -1*W[:,0] 79 | W = W.view(c, c, 1) 80 | self.conv.weight.data = W 81 | 82 | def forward(self, z, reverse=False): 83 | # shape 84 | batch_size, group_size, n_of_groups = z.size() 85 | 86 | W = self.conv.weight.squeeze() 87 | 88 | if reverse: 89 | if not hasattr(self, 'W_inverse'): 90 | # Reverse computation 91 | W_inverse = W.float().inverse() 92 | W_inverse = Variable(W_inverse[..., None]) 93 | if z.type() == 'torch.cuda.HalfTensor': 94 | W_inverse = W_inverse.half() 95 | self.W_inverse = W_inverse 96 | z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) 97 | return z 98 | else: 99 | # Forward computation 100 | log_det_W = batch_size * n_of_groups * torch.logdet(W) 101 | z = self.conv(z) 102 | return z, log_det_W 103 | 104 | 105 | class WN(torch.nn.Module): 106 | """ 107 | This is the WaveNet like layer for the affine coupling. The primary difference 108 | from WaveNet is the convolutions need not be causal. There is also no dilation 109 | size reset. The dilation only doubles on each layer 110 | """ 111 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 112 | kernel_size): 113 | super(WN, self).__init__() 114 | assert(kernel_size % 2 == 1) 115 | assert(n_channels % 2 == 0) 116 | self.n_layers = n_layers 117 | self.n_channels = n_channels 118 | self.in_layers = torch.nn.ModuleList() 119 | self.res_skip_layers = torch.nn.ModuleList() 120 | 121 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 122 | start = torch.nn.utils.weight_norm(start, name='weight') 123 | self.start = start 124 | 125 | # Initializing last layer to 0 makes the affine coupling layers 126 | # do nothing at first. 
This helps with training stability 127 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 128 | end.weight.data.zero_() 129 | end.bias.data.zero_() 130 | self.end = end 131 | 132 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1) 133 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 134 | 135 | for i in range(n_layers): 136 | dilation = 2 ** i 137 | padding = int((kernel_size*dilation - dilation)/2) 138 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 139 | dilation=dilation, padding=padding) 140 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 141 | self.in_layers.append(in_layer) 142 | 143 | 144 | # last one is not necessary 145 | if i < n_layers - 1: 146 | res_skip_channels = 2*n_channels 147 | else: 148 | res_skip_channels = n_channels 149 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 150 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 151 | self.res_skip_layers.append(res_skip_layer) 152 | 153 | def forward(self, forward_input): 154 | audio, spect = forward_input 155 | audio = self.start(audio) 156 | output = torch.zeros_like(audio) 157 | n_channels_tensor = torch.IntTensor([self.n_channels]) 158 | 159 | spect = self.cond_layer(spect) 160 | 161 | for i in range(self.n_layers): 162 | spect_offset = i*2*self.n_channels 163 | acts = fused_add_tanh_sigmoid_multiply( 164 | self.in_layers[i](audio), 165 | spect[:,spect_offset:spect_offset+2*self.n_channels,:], 166 | n_channels_tensor) 167 | 168 | res_skip_acts = self.res_skip_layers[i](acts) 169 | if i < self.n_layers - 1: 170 | audio = audio + res_skip_acts[:,:self.n_channels,:] 171 | output = output + res_skip_acts[:,self.n_channels:,:] 172 | else: 173 | output = output + res_skip_acts 174 | 175 | return self.end(output) 176 | 177 | 178 | class WaveGlow(torch.nn.Module): 179 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 180 | n_early_size, WN_config): 181 | super(WaveGlow, self).__init__() 182 | 183 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 184 | n_mel_channels, 185 | 1024, stride=256) 186 | assert(n_group % 2 == 0) 187 | self.n_flows = n_flows 188 | self.n_group = n_group 189 | self.n_early_every = n_early_every 190 | self.n_early_size = n_early_size 191 | self.WN = torch.nn.ModuleList() 192 | self.convinv = torch.nn.ModuleList() 193 | 194 | n_half = int(n_group/2) 195 | 196 | # Set up layers with the right sizes based on how many dimensions 197 | # have been output already 198 | n_remaining_channels = n_group 199 | for k in range(n_flows): 200 | if k % self.n_early_every == 0 and k > 0: 201 | n_half = n_half - int(self.n_early_size/2) 202 | n_remaining_channels = n_remaining_channels - self.n_early_size 203 | self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 204 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 205 | self.n_remaining_channels = n_remaining_channels # Useful during inference 206 | 207 | def forward(self, forward_input): 208 | """ 209 | forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames 210 | forward_input[1] = audio: batch x time 211 | """ 212 | spect, audio = forward_input 213 | 214 | # Upsample spectrogram to size of audio 215 | spect = self.upsample(spect) 216 | assert(spect.size(2) >= audio.size(1)) 217 | if spect.size(2) > audio.size(1): 218 | spect = spect[:, :, :audio.size(1)] 219 | 220 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 221 | spect = 
spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 222 | 223 | audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) 224 | output_audio = [] 225 | log_s_list = [] 226 | log_det_W_list = [] 227 | 228 | for k in range(self.n_flows): 229 | if k % self.n_early_every == 0 and k > 0: 230 | output_audio.append(audio[:,:self.n_early_size,:]) 231 | audio = audio[:,self.n_early_size:,:] 232 | 233 | audio, log_det_W = self.convinv[k](audio) 234 | log_det_W_list.append(log_det_W) 235 | 236 | n_half = int(audio.size(1)/2) 237 | audio_0 = audio[:,:n_half,:] 238 | audio_1 = audio[:,n_half:,:] 239 | 240 | output = self.WN[k]((audio_0, spect)) 241 | log_s = output[:, n_half:, :] 242 | b = output[:, :n_half, :] 243 | audio_1 = torch.exp(log_s)*audio_1 + b 244 | log_s_list.append(log_s) 245 | 246 | audio = torch.cat([audio_0, audio_1],1) 247 | 248 | output_audio.append(audio) 249 | return torch.cat(output_audio,1), log_s_list, log_det_W_list 250 | 251 | def infer(self, spect, sigma=1.0): 252 | spect = self.upsample(spect) 253 | # trim conv artifacts. maybe pad spec to kernel multiple 254 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 255 | spect = spect[:, :, :-time_cutoff] 256 | 257 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 258 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 259 | 260 | if spect.type() == 'torch.cuda.HalfTensor': 261 | audio = torch.cuda.HalfTensor(spect.size(0), 262 | self.n_remaining_channels, 263 | spect.size(2)).normal_() 264 | else: 265 | audio = torch.cuda.FloatTensor(spect.size(0), 266 | self.n_remaining_channels, 267 | spect.size(2)).normal_() 268 | 269 | audio = torch.autograd.Variable(sigma*audio) 270 | 271 | for k in reversed(range(self.n_flows)): 272 | n_half = int(audio.size(1)/2) 273 | audio_0 = audio[:,:n_half,:] 274 | audio_1 = audio[:,n_half:,:] 275 | 276 | output = self.WN[k]((audio_0, spect)) 277 | 278 | s = output[:, n_half:, :] 279 | b = output[:, :n_half, :] 280 | audio_1 = (audio_1 - b)/torch.exp(s) 281 | audio = torch.cat([audio_0, audio_1],1) 282 | 283 | audio = self.convinv[k](audio, reverse=True) 284 | 285 | if k % self.n_early_every == 0 and k > 0: 286 | if spect.type() == 'torch.cuda.HalfTensor': 287 | z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() 288 | else: 289 | z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() 290 | audio = torch.cat((sigma*z, audio),1) 291 | 292 | audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data 293 | return audio 294 | 295 | @staticmethod 296 | def remove_weightnorm(model): 297 | waveglow = model 298 | for WN in waveglow.WN: 299 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 300 | WN.in_layers = remove(WN.in_layers) 301 | WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer) 302 | WN.res_skip_layers = remove(WN.res_skip_layers) 303 | return waveglow 304 | 305 | 306 | def remove(conv_list): 307 | new_conv_list = torch.nn.ModuleList() 308 | for old_conv in conv_list: 309 | old_conv = torch.nn.utils.remove_weight_norm(old_conv) 310 | new_conv_list.append(old_conv) 311 | return new_conv_list 312 | -------------------------------------------------------------------------------- /filelists/transcript_val.txt: -------------------------------------------------------------------------------- 1 | wav/nen001_001.wav|はい?呼びました? 
2 | wav/nen001_012.wav|ほしな君も 3 | wav/nen001_024.wav|さすがに白蛇占いはできませんよ 4 | wav/nen001_035.wav|わかりました。ありがとう……ございます 5 | wav/nen001_049.wav|んっ、んっ、んくっ……ひっ、あっ、ぁっ、ぁっ、んんーーッ…… 6 | wav/nen001_060.wav|あああぁぁ……今は、止められなくて……じゅる……はぁ、はぁぁ……あぁぁぁあぁ…… 7 | wav/nen001_072.wav|ほしな君。珍しいところで会いますね。図書室で何か調べ物ですか?こんな時間まで大変ですね 8 | wav/nen002_004.wav|そうですか 9 | wav/nen002_018.wav|そうですか…… 10 | wav/nen002_036.wav|あっ、あれは、違うんです。別に悩みとか、ストレスじゃなくて……じ……事情が……ありまして 11 | wav/nen002_051.wav|そんな風に光るなんて、私も初めて見ました一体何をしたんですか? 12 | wav/nen002_062.wav|ど、どうして……?一体どこに……欠片が……今まで集めた欠片が……やっぱりさっきの羽根は…… 13 | wav/nen002_074.wav|それで、あの……気分はどうですか? 14 | wav/nen002_089.wav|占ったんです、その高安先輩の交際相手である女の子の恋愛運を 15 | wav/nen002_100.wav|そうですね。そういう人も含まれると思います 16 | wav/nen002_113.wav|はい。私に、“心の欠片”のことを教えてくれた人に 17 | wav/nen003_007.wav|はい、私の知り合いが営んでる喫茶店です 18 | wav/nen003_019.wav|諦め?受け入れる? 19 | wav/nen003_031.wav|あの……一つ、思ったことがあるんですが…… 20 | wav/nen003_047.wav|ななおは人間じゃないんです。私が契約を結んだ、アルプなんです 21 | wav/nen003_059.wav|楽しそうにしてる時でも、どこか楽しみきれていないと言いますか。そういう気持ちは、私にもありますから 22 | wav/nen004_011.wav|は、はい? 23 | wav/nen005_008.wav|いえ、まだです。おそらく何もないとは思うんですが、万が一ということもあります 24 | wav/nen005_022.wav|おこです。激おこです 25 | wav/nen005_035.wav|いえ、困っていることがあって、私に力になれることでしたらお手伝いさせてもらいます 26 | wav/nen005_049.wav|そう言われても事実なので 27 | wav/nen005_060.wav|そうですね……私では解決できないような依頼も、いくつかありましたね 28 | wav/nen005_071.wav|まあ、気は進まないんですけどね……はぁ…… 29 | wav/nen005_089.wav|ごめんなさい。でも、これが欠片の回収方法なんです 30 | wav/nen005_100.wav|ななおに訊いても無駄ですよ。ななおは猫なんですから 31 | wav/nen006_005.wav|私も同じです。一般的な意見なら、多少は集まりましたが…… 32 | wav/nen006_018.wav|わかりました~ 33 | wav/nen006_030.wav|じゃあ、続けますね 34 | wav/nen006_043.wav|ありがとう、ございます……んっ、んんんっ、ひっ、ひっ、ふーーーぅ……ひっ、ひっ、ふーーーーぅ…… 35 | wav/nen006_056.wav|諦めるのはまだ早いと思います。ここにはまだテクニックが記されていますから 36 | wav/nen006_068.wav|え?突然どうしたんですか?ほしな君に謝罪されるようなこと、ありましたか? 37 | wav/nen007_006.wav|一部ということは……そうじゃない人には、受け入れてもらえた、ということですか? 38 | wav/nen008_009.wav|折を見て、自分の分を買いに行こうと思ってます 39 | wav/nen008_021.wav|そ、そうなんですか……? 40 | wav/nen008_032.wav|もう1時間ぐらいしてますから 41 | wav/nen008_044.wav|いなばさん……ありがとうございます。それでは、お言葉に甘えさせてもらってもいいですか? 42 | wav/nen008_056.wav|私たちでリハーサル? 43 | wav/nen009_004.wav|はい。川上君の悩みは、本当にデートのことでいいんでしょうか? 44 | wav/nen009_016.wav|はい、大丈夫ですよ、時間はまだ10分ほど余裕がありますから 45 | wav/nen009_027.wav|そうですね……ほしな君、川上君の予定では映画の後はどうなっていますか? 46 | wav/nen009_039.wav|このことは、川上君にも伝えておいた方がいいですね 47 | wav/nen009_052.wav|本当ですか?丁度いい機会ですから、いっそ買ってしまうのもいいですね 48 | wav/nen009_063.wav|川上君はしっかりプランを組んだりしているんですから、むしろ川上君が嫌がるかもしれませんね 49 | wav/nen009_074.wav|私は何でもいいですよ。嫌いな物も特にありませんから 50 | wav/nen009_085.wav|あの、これってもう取っていいんですか? 51 | wav/nen009_100.wav|あ、甘エビ~♪ 52 | wav/nen010_007.wav|確かにそうですね。お礼の言葉を言ってもらえたりするのも、とても嬉しいものですからね 53 | wav/nen010_021.wav|は、はぁ……はぁ……あり、がとう、ございますぅ……ほしなくんんんっ…… 54 | wav/nen010_032.wav|はい、あと少し……もう少し……んっ、んひっ、あっ、あっ、あっ……はあぁぁぁー…… 55 | wav/nen010_045.wav|ふーーー……ふーーー…… 56 | wav/nen010_057.wav|いえ、そうじゃなくてですね、その………………スースー、しますから 57 | wav/nen010_074.wav|はい、どうぞ 58 | wav/nen010_086.wav|私たちのオカルト研究部も、元々は黒魔術だったみたいですよ 59 | wav/nen010_100.wav|は、はい、大事になる前に誤解をときましょう 60 | wav/nen011_008.wav|私が勧めたんです。更衣室で着替えるのを恥ずかしそうにしていたので 61 | wav/nen011_019.wav|あ、ダ、ダメですよ、変なところ触っちゃくすぐったいですから 62 | wav/nen011_033.wav|どうしたんですか?なにか連絡事項が? 63 | wav/nen011_044.wav|ではとがくし先輩の相談は、越路さんを説得すること、でいいんですか? 64 | wav/nen012_003.wav|それで、どうでしたか? 65 | wav/nen012_014.wav|あの、ほしな君 66 | wav/nen012_026.wav|あれだけ反応が弱い欠片ですと、特に 67 | wav/nen013_001.wav|もし本当に私の他に魔女がいるとしたら……困ったことになりますね 68 | wav/nen013_015.wav|はい、問題ありません 69 | wav/nen013_028.wav|ロ、ローター……です………………ローターですよぅ…… 70 | wav/nen014_001.wav|そうなんですか?どうかしたんですか? 
71 | wav/nen015_071.wav|それじゃあ今後とも、よろしくお願いします 72 | wav/nen015_004.wav|はぁ、それはわかりました。でも、一つだけ答えてくれませんか?気になる事があるんです 73 | wav/nen015_016.wav|え?それって、どういうことですか? 74 | wav/nen015_031.wav|そんな普通に可愛い服だなんて卑怯ですっ。私なんてこんな恥ずかしい恰好なのにぃ理不尽です~! 75 | wav/nen015_043.wav|魔女の契約の代償……と言うことですか 76 | wav/nen015_056.wav|しいばさんはああ言ってくれましたが、私は別にこの学院を自分の領土だなんて言うつもりはありません 77 | wav/nen016_003.wav|はい。また何か困ったことがあれば、いつでもどうぞ 78 | wav/nen016_014.wav|つまり、私たちはこの部室から出ていかなければいけない、ということですか? 79 | wav/nen016_027.wav|とにかく運営のすべきことは、ほしな君が言ったことと、先生方との折衝もでしょうか? 80 | wav/nen016_039.wav|そういうことでしたら……お願いできますか? 81 | wav/nen016_050.wav|なにか問題がありましたか? 82 | wav/nen017_002.wav|全員揃っていますね。それじゃ行きましょうか 83 | wav/nen017_015.wav|それじゃあ……ここからここまでを、まず完璧に覚えましょう。ここの基礎を覚えてしまえば、次も覚えやすいですから 84 | wav/nen017_028.wav|え?なんですか? 85 | wav/nen017_041.wav|だ、大丈夫……大丈夫なはず……ええ、絶対大丈夫です……おそらく、きっと、多分 86 | wav/nen017_052.wav|確かにそれぐらいの余裕はありますが…… 87 | wav/nen017_064.wav|はい、お疲れ様でした 88 | wav/nen018_012.wav|それじゃあ、一体どうしてですか? 89 | wav/nen018_023.wav|大きな欠伸ですね 90 | wav/nen018_036.wav|ちょっと皮がむけちゃって、真っ赤になっちゃってますよ 91 | wav/nen018_050.wav|ほしな君のことを、応援していますし……それが、応援になるというのでしたら……もう一度 92 | wav/nen019_002.wav|はい。よろしくお願いします 93 | wav/nen019_013.wav|そうですか、ありがとう……ございます 94 | wav/nen019_026.wav|ありがとうございます、しいばさん……言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった 95 | wav/nen019_037.wav|予想よりも多くの人に集まってもらえて、準備してきた者としては嬉しい限りです 96 | wav/nen020_102.wav|ぷぁ、はぁぁ………………疲れました 97 | wav/nen020_114.wav|それに私もほしな君と同じで、あくまで部活の一環ですからね 98 | wav/nen020_127.wav|もぅ、どうしてそういうことを言わせるんですか! 99 | wav/nen020_139.wav|こちらのことは気にしないでいいんですよ?……こうして欠片が戻ったということは、ほしな君も嫌に思ってるわけじゃないんですよね? 100 | wav/nen020_151.wav|はい。ほしな君は気付いていないかもしれませんが、笑顔が以前とは比べ物にならないぐらい自然ですから 101 | wav/nen020_162.wav|かもしれません。でも、そういう部活も楽しくていいものですよ 102 | wav/nen020_404.wav|ありがとうございます 103 | wav/nen101_010.wav|はぁ…… 104 | wav/nen101_024.wav|いえ、買い物ではないんです。今日は色々疲れてしまったので…… 105 | wav/nen101_036.wav|いいんですか? 106 | wav/nen101_048.wav|ほしな君は、このお店に入ったことがあるんですか? 107 | wav/nen101_059.wav|ですが……こうして呪文を唱えなきゃいけないんですよね?とりあえず、初心者はこう頼むべし、って書いてありましたけど 108 | wav/nen101_071.wav|えっ……あの、それって…… 109 | wav/nen101_086.wav|あ、美味しいですね。これがラーメン 110 | wav/nen101_099.wav|あのほしな君、早く行きましょう 111 | wav/nen101_111.wav|え?いえそんな、お礼を言われるような、大層なことは出来ていませんから 112 | wav/nen101_126.wav|はい、さようなら 113 | wav/nen102_005.wav|あの……それで、どうしたんですか?突然電話だなんて 114 | wav/nen102_018.wav|それにですね、今朝に比べると大分マシにはなっています。ですから、このまま大人しくしていれば平気ですよ 115 | wav/nen102_033.wav|どうぞ 116 | wav/nen102_046.wav|私が嘘を吐いていないのは、ほしな君ならわかりますよね? 117 | wav/nen102_057.wav|ですから、む……夢精をしちゃうような……いやらしい夢を見たんじゃないかなっと 118 | wav/nen102_072.wav|私は一人暮らしですから。そういう思い出とは縁遠い生活ですね 119 | wav/nen102_087.wav|今度は、ほしな君がおまじないをかける側になって下さい。そしたらきっと、私の恥ずかしさがわかってもらえるはずです 120 | wav/nen102_099.wav|ひっ、んっ、んん……ふぅ、ふぅ……んんっ、んふぅ……んん…… 121 | wav/nen102_111.wav|はい。約束です 122 | wav/nen102_124.wav|でも……気分が少しマシになったかもしれない。あのおまじないは効くのかな? 123 | wav/nen103_010.wav|だから熱く語らないで下さい、思い出しちゃダメー、手をニギニギさせるのもダメですってばっ 124 | wav/nen103_025.wav|私に、む、夢精……とか言わせたくせに、教えてくれないなんてズルいですよぅ! 125 | wav/nen103_042.wav|ところで話は変わりますが、何かあったんですか?みんな、普段と様子が違うみたいですが 126 | wav/nen103_053.wav|はい。先生が男の人と一緒に歩いているところを見かけましたよ 127 | wav/nen103_070.wav|本命の質問だけでなく、無関係なダミーの質問も織り交ぜれば、怪しさも薄くなりませんか? 128 | wav/nen103_082.wav|わかりました 129 | wav/nen103_095.wav|そうなんですが……見られていないとわかっていても、恥ずかしいんですよぅ、この恰好 130 | wav/nen103_106.wav|というよりも……一緒に行っていいですか?実は私もまだ書いていなくて…… 131 | wav/nen103_123.wav|あ、いえ、その…… 132 | wav/nen103_141.wav|は、はい? 133 | wav/nen103_161.wav|私は、怒られたくないです…… 134 | wav/nen103_175.wav|あの……正直に言います。最近の私は変なんです 135 | wav/nen103_189.wav|い、いえ、そんな風には思っていませんから、平気ですっ 136 | wav/nen103_200.wav|それに……こんな私のことを知りたいって言ってくれたこと……嬉しかったです 137 | wav/nen103_212.wav|こ、子供っぽいですよね? 
138 | wav/nen103_227.wav|なぅぅぅぅぅ……ほしな君のことを思うと心が落ち着かない…… 139 | wav/nen103_240.wav|あっ、うあっ、あぁぁぁぁぁぁぁぁぁぁぁっ 140 | wav/nen103_251.wav|はあ、はあ、はぁああぁぁ……なにこれ、こんなにすごいの、しらない……いつもと、全然違う……んっ、ふーっ……ふーっ…… 141 | wav/nen103_262.wav|はぁ、はぁ、はぁ、はぁはぁはぁぁぁぁぁんっ、ぅぅぅぅぅぅうっ! 142 | wav/nen103_273.wav|ひゃんっ……あ、あ、あぁぁぁ……ヤダぁ、止まらない、止まりませんよぉ……あ、あ、はぁぁぁぁ……っ 143 | wav/nen104_007.wav|はい?なにが……ですか? 144 | wav/nen104_020.wav|だってほしな君が言わせたんじゃないですかぁ 145 | wav/nen104_031.wav|そうです。その通りです。い、今でももうおかしくなっているのに、これ以上は…… 146 | wav/nen104_043.wav|本当にごめんなさい 147 | wav/nen104_054.wav|別に大変と言うほどのことは 148 | wav/nen104_066.wav|そうなんですか?どうしてこんなにすぐに……いつも通り過ごしていたはずなのに 149 | wav/nen104_078.wav|それじゃあ、考えておきます 150 | wav/nen104_092.wav|もしよければ、その相手の怪しい行動についても、教えてもらえますか? 151 | wav/nen104_106.wav|あっ……ぅっ…… 152 | wav/nen104_121.wav|な、なんでもないですよぅ。眠れなかったというだけですから 153 | wav/nen104_136.wav|普通は引きますよね。一晩中オナニーしちゃうような女の子なんて…… 154 | wav/nen104_148.wav|そ、それじゃあ皆さん……あっ、んんっ……私は、お先に、失礼させてもらいます、ね……んんっ 155 | wav/nen104_161.wav|でも、ダメでした。ちょっと……無理そうです。答えは出そうにありません 156 | wav/nen104_173.wav|ほしな君は、私のことを好きって言ってくれてますが私には、ほしな君にも言ってないことが……あるんです 157 | wav/nen104_190.wav|濡れて……ます……発情が止まらなくて……ぅぅ……そ、そんな、ヘンタイな私でも好きって言ってくれますか? 158 | wav/nen104_203.wav|もう無理です。我慢できません。自分が抑えられなくて………………だから先に謝っておきますね。ごめんなさいっ 159 | wav/nen104_214.wav|んふーッ……じゅる、ちゅるるる……れる、れろれろれる……ちゅ、ちゅ……んちゅ 160 | wav/nen104_226.wav|んっ、んんんーーーー……ぷぁ、はぁぁぁーー……はぁーっ……はぁーっ…… 161 | wav/nen104_239.wav|はい。わかりました――んぅっ、あ……あっ、あっ……んんぁ 162 | wav/nen104_252.wav|あっ、はぁ、はぁ、はぁ……んんっ、んんんっ……ふーっ、ふーっ……んっ、んんーーっ 163 | wav/nen104_264.wav|それは……はぁ、はぁ……んんっ、好きな人にされる方が、気持ちよくて……好きです…… 164 | wav/nen104_275.wav|んっ、ひぃぃぁぁぁぁぁあああっ 165 | wav/nen104_287.wav|あ、あ、あの……そんなに、じっくり見ないで下さい……恥ずかしいんですから…… 166 | wav/nen104_299.wav|えっ?それは、やっぱり私のそこ、変ってことですか?色々自分で弄っちゃってるから、変なんですか? 167 | wav/nen104_310.wav|ひぃぁあっ!そっ、そこっ、は……んっ、んんんっ、あ、あ、あ、あ、ああああああっ 168 | wav/nen104_322.wav|ほ、ほしな君は、どうですか? 169 | wav/nen104_333.wav|はっ、はっ、ああぁァァああんっ、びりびり、するぅ……はぁ、はぁ、はぁ……奥まで、きてますぅ 170 | wav/nen104_344.wav|あああっ、頭、くらくらします……はぁはぁはぁ、ん、んんぅぅーーーーーーッ、もっと呼んでぇ、もっと名前を呼んで下さいぃ 171 | wav/nen104_355.wav|きゃ、ぅぁっ……はぁ、はぁ、すごい、出てます、ヌルヌルのが、いっぱいっ 172 | wav/nen104_370.wav|あ、あの、それはまた、後日にお願いします 173 | wav/nen105_010.wav|い、いいですいいです、そんな仰々しいことっ 174 | wav/nen105_026.wav|はい。お疲れ様でした 175 | wav/nen105_041.wav|一人暮らし用の冷蔵庫だと小さいですから。野菜室があるタイプに買い換えようかとずっと悩んでいるんですが…… 176 | wav/nen105_058.wav|それになによりも、好きな人と一緒にいられる時間は私も好きですから 177 | wav/nen105_071.wav|はい。頑張って作りますね 178 | wav/nen105_087.wav|ん、れろ……れる、えるれろれろ……れるん………んっ、ちゅぅぱ、はぁ、はぁ、はぁっ、あぁんっ 179 | wav/nen105_100.wav|あの……別に、そういう行為が嫌というわけじゃないんです。さっき、キスの前に言ったのは本当のことですから 180 | wav/nen105_112.wav|ウソツキ……私のしたいこと、ワガママを言ってもいいって……そう言ってくれたじゃないですか 181 | wav/nen105_127.wav|あ……あの、もう一度触っていいですか?今度はちゃんと優しく、丁寧に触りますから 182 | wav/nen105_139.wav|もし痛かったら言って下さいね。ちゅ、ちゅ……ん……ちゅぅ……んっ、んんっ 183 | wav/nen105_150.wav|んちゅ、じゅる……ちゅ、ちゅ……んんんー、舐めても舐めても、全然綺麗になりませんね。むしろ、ますますベトベトになってるような…… 184 | wav/nen105_162.wav|んぶ……ンッ、ちゅばちゅば……ちゅぶっ、ちゅぶぶ……んんーーっ、じゅるっ……じゅるるるるっ 185 | wav/nen105_173.wav|んーー……じゃあ、見えなくしちゃいます……ん、じゅる、じゅるるる……ちゅ、ちゅぅぅぅぅーーー……ッ 186 | wav/nen105_184.wav|ん……ッッ!?んっ、ぅぅぅっ……ん、んんーーー……んふぅー……ふぅー……ん、んむぅ……んんっ 187 | wav/nen105_195.wav|はぁ……はぁ……んっ、はぁぁぁ……気持ち、よかったですか? 188 | wav/nen106_002.wav|ちょっと待って下さいね。私も、最近は確認をしていなかったので 189 | wav/nen106_014.wav|でも、予定は大丈夫なんですか? 
190 | wav/nen106_026.wav|は、はい。そうですね 191 | wav/nen106_038.wav|んふぅ、んっじゅるっ、ぬちゅくちゅ……んぁ、はぁ、はぁ、はい。もう少し……はぁぁ、あむぅ……れろれるん、れちょれちょ 192 | wav/nen106_050.wav|さようなら。また明日 193 | wav/nen106_065.wav|いつも歩いている道ですから。それに、なるべく明るくて人気のあるところを通ります。大丈夫ですよ 194 | wav/nen106_078.wav|すみません、気を遣わせてしまいまして。でも、本当にそれだけなので、心配は必要ありませんよ 195 | wav/nen106_090.wav|あの、ちょっと待って下さい 196 | wav/nen106_101.wav|そう言ってもらえて嬉しいです 197 | wav/nen106_116.wav|は、はい。もちろんです……私も、大好きな人とキスしたい、です…… 198 | wav/nen106_129.wav|え? 199 | wav/nen106_150.wav|それじゃあ、今日は失礼しますね 200 | wav/nen107_003.wav|それじゃあ、お疲れ様でした。さようなら 201 | wav/nen107_019.wav|そう、ですよね……今みたいな状態を続けても……仕方ないですよね 202 | wav/nen107_035.wav|でも、でも…… 203 | wav/nen107_051.wav|それよりも、結局どうなんですか?私の気持ち、ちゃんと感じてもらえてますか? 204 | wav/nen107_063.wav|それは……うっ……ぅぅぅ~~~……恥ずかしい、ですけど……今は、この温もりに包まれていたいです。そっちの方が重要です 205 | wav/nen108_011.wav|でも急に泊まってもらうことになって……親御さんにもご迷惑を…… 206 | wav/nen108_024.wav|そうなんですか?えっと……気付いていませんでした。むしろ、私の方が甘えちゃっていますから…… 207 | wav/nen108_044.wav|んんっ、ふーっ……ふーっ…… 208 | wav/nen108_056.wav|授業に身が入らなくて……ず、ずっと、考えてたら……はぁ……はぁ……だ、だから……はぁ、はぁ、はぁ、はぁ 209 | wav/nen108_069.wav|それは、だから……下のお口、ですとか……他にもありませんか? 210 | wav/nen108_080.wav|ひあぁぁああぁぁああっ、それ、しび、れる……からだ、痺れちゃうっ、あ、あ、あああああ、そこ、吸うの、あっ、あああっ 211 | wav/nen108_091.wav|ひゃあああぁっ、そんな、おま●こ全部を吸われたらぁ……あ、あ、あ、あ、我慢できませんっ、もう熱いですぅ、身体が熱くて仕方ないんです 212 | wav/nen108_103.wav|ぁぁ……はぁー、はぁー……あ、これぇ、奥まで感じます……んぁぁ、はぁー……はぁー…… 213 | wav/nen108_114.wav|ひっ、ひああぁぁぁああぁ、それ、それ凄いですぅ……はぁはぁはぁ、あああぁぁあああぁっ 214 | wav/nen108_125.wav|あっ、ああっ……やだぁ、エッチな音、してます……私の、おま●こから、エッチな音が……あっあっあっ、でも、我慢できなくてっ 215 | wav/nen108_136.wav|はぁ、はぁ……はぁぁぁ……もう、ドロドロですよ…… 216 | wav/nen108_152.wav|それは……はい。確かにそういう気持ちはあります…… 217 | wav/nen108_168.wav|浮かない表情をしていました…… 218 | wav/nen108_181.wav|それは……どういう意味ですか? 219 | wav/nen109_011.wav|そうですね……カラオケに、ボウリング、プリクラも…… 220 | wav/nen109_025.wav|はい……それじゃあ、えっと、えっと…… 221 | wav/nen109_040.wav|ここがいいでしょうか……それともこっち? 222 | wav/nen109_052.wav|まだ色々やりたいことはあります、それは尽きませんけど………………でも本当に、後悔はしてませんよ 223 | wav/nen109_069.wav|はい。私、幸せになります。それで、しゅうじ君のことも幸せにしてみせます 224 | wav/kne110_008.wav|メッセージ…… 225 | wav/kne110_026.wav|こんな……形だけにこだわった物じゃないんです……でも、それはもう……無理なんですよね 226 | wav/kne110_044.wav|はい。優しそうな人ですから 227 | wav/nen110_013.wav|ぅっ……ぁぁ……ダメ……泣いたり、しない 228 | wav/nen111_006.wav|言いたいこと……ですか? 229 | wav/nen111_019.wav|ギターが欲しいんですよね? 230 | wav/nen111_033.wav|やっぱり、未来が変わっちゃってるんですよね…… 231 | wav/nen111_047.wav|それは、えっと……………… 232 | wav/nen111_062.wav|で、ですから……わ、私の……オナニー………………オナニーですっ 233 | wav/nen111_079.wav|は、はい。大丈夫です。すみません、驚かせてしまいまして 234 | wav/nen111_092.wav|ひぁっ、ぅぅぅ~~~ 235 | wav/nen111_105.wav|保健室に行きますか? 236 | wav/nen111_124.wav|好き……好きです、大好きです……私は貴方のことが大好きです。愛しています。もう離れたりしません 237 | wav/nen112_011.wav|ほしな君はちゃんと以前から、力になってくれていましたよ 238 | wav/nen112_029.wav|私だって嬉しいです。ほしな君が一緒にいてくれて……その、単純に近い場所にいてくれるってことじゃなくてですね 239 | wav/nen112_043.wav|それで、いなばさんは……相談でいいんですよね? 240 | wav/nen112_054.wav|少し考える時間をもらえますか? 241 | wav/nen112_068.wav|いえ、そんなことはありません。私も嬉しいですよ 242 | wav/nen112_080.wav|あ、あの、なんだか凄い騒ぎになってるみたいですけど…… 243 | wav/nen112_094.wav|時と場所さえ考えてもらえれば……私も、や……やぶさかではありませんが……え?え?も、もしかして今日って、そういうことなんですか? 
244 | wav/nen112_108.wav|そ、そうですね。見つかったらデートできなくなってしまいますよね 245 | wav/nen112_121.wav|もぅっ!そんなに連続して呼ばれたら、嬉しすぎておかしくなっちゃいますよぅ 246 | wav/nen112_133.wav|だって美味しいじゃないですか。それにほら、見た目も可愛いです 247 | wav/nen112_145.wav|はぁ……美味しかったです 248 | wav/nen112_159.wav|自分の身体なんですから、当たってることぐらい気付いてます……わかってはいますが……抱きついていたいんです 249 | wav/nen112_173.wav|前は私のしたいことするデートでしたが……今回はしゅうじ君が私のために計画してくれたデートで、どこに行くのかドキドキして 250 | wav/nen112_188.wav|私はしゅうじ君のこと、嫌いになったりなんてしないのに 251 | wav/nen112_204.wav|お、お邪魔します 252 | wav/nen112_217.wav|それにしても、しゅうじ君はお父さんとあんな風に喋るんですね。ちょっと、意外でした 253 | wav/nen112_229.wav|いえ、平気です 254 | wav/nen112_243.wav|お、女の子だって興奮とか、期待とか、もにょもにょしちゃうものなんですよぅ…… 255 | wav/nen112_259.wav|わ、わかりました…… 256 | wav/nen112_270.wav|は、はい……ぅぅぅぁッ……はっ、はぁー、はぁー……お願いします、続けて下さい……もっと、触って 257 | wav/nen112_281.wav|あ、あ、あ、また……やっ、そんなに強く捻っちゃ……ひぁっ、んぃぃ……ッッ 258 | wav/nen112_293.wav|あぁぁ、んぁああぁぁ……ッッ、2回、2回です……んっ、んんぅぅぅぁぁぁあッ、あっ、あっ、あああッッ 259 | wav/nen112_304.wav|だ、だって……5回だなんて……恥ずかしいです。凄くエッチですから…… 260 | wav/nen112_315.wav|ぅぅ……また、そうやって全部言わせて……本当にイジワルですよぅ…… 261 | wav/nen112_327.wav|熱くて……硬くて……はぁ、はぁ、ぁぁぁあっ……前より太くて、おっきい気が、しますぅっ 262 | wav/nen112_339.wav|ちゅっ、んん、ふぅぅ……んっ、んっ、んんぅぅぅぅ……ぅぅーーッ 263 | wav/nen112_350.wav|だって、だって……んっ、ぅぅぅっ……こ、こんなに、グリグリされたら、こんな声も出ちゃいますよぅ……あっ、はぁはぁはぁはぁ 264 | wav/nen112_361.wav|あ、はぁぁぁむ、んちゅ……ちゅ、ちゅ、ちゅ、じゅるる……んちゅ、ちゅぅぅーー……ん、んむぅ、んっ、じゅるる 265 | wav/nen112_372.wav|私も……こんなにイってしまったのは、初めてです……やっぱりオナニーとは、全然違いますね……はっ、はぁぁ…… 266 | wav/nen112_386.wav|いえ、もう起きます 267 | wav/nen113_171.wav|わ……わかりました……それなら……私、命令通りに、オナニーします 268 | wav/nen113_182.wav|ひぁぁ!は、はい、はいっ……んんっ、んんぅぅ……ぅぅあっ、あっ、あっ、あっ 269 | wav/nen113_193.wav|ちがっ、違うんです……お漏らしじゃなくて……ああ、もう……どうしてこんなにビショビショなの?まだ、乳首を刺激してるだけなのに 270 | wav/nen113_205.wav|はぁ、はぁ、こ、ここら辺ですか?もう当たりますか? 271 | wav/nen113_216.wav|ぁっ、ぁっ、ぁっ、ぅぅあっ、なにこれ……ダメっ、ダメっ……あっ、あっ、あっ、ぁぁぁああぁぁあ、イく……イっちゃう 272 | wav/nen113_227.wav|はぁーっ……はぁーっ……気持ちいい、です。クリトリス、気持ちいい…… 273 | wav/nen113_238.wav|んんんっ!んぁっ、んぁっ、ダメ……手が、震えて、あっ、あっ!ローター……当てていられない……あっ、あっ 274 | wav/nen113_249.wav|はっ、はひっ、あっ、あっ、あっ!イ、イく……もう、わらひ、我慢できませんよ……ああっ、あっ、あーーーーッ! 275 | wav/nen113_260.wav|え?あ、ちょっと待って下さい……あっ…… 276 | wav/nen113_272.wav|わかりました。それじゃあ遠慮せず、沢山イきますね……はぁ、はぁ…… 277 | wav/nen113_284.wav|それに動きたいんですよね?気持ちよくなりたいんですよね?さっきから、わたしの中でおち●ちんがビクビク、してますよ 278 | wav/nen113_295.wav|はぁ……はぁ……はぁ……はぁ……あっ……あっ……あっ、ああああぁぁぁぁぁぁぁぁぁああああああーーーーー!! 279 | wav/nen113_306.wav|んふぅ……んっんっんんぅぅぅぁああ!はぁ!はぁ!あああっ、んんんーーーーー……んんぁぁああっ! 280 | wav/nen113_317.wav|んひっ、あっ!あっ!はぁぁ……まだ、出てる……あっ、あっ、あっ、はぁぁ……ん、んんっ! 281 | wav/nen113_328.wav|ちょっと? 282 | wav/nen113_006.wav|そんなことありませんよ。さあ、遠慮せずに中に入って下さい 283 | wav/nen113_017.wav|あのー…… 284 | wav/nen113_031.wav|さ、参考……ですか?川上君が考えたデートプランを実際に試してみる、とかじゃなく? 285 | wav/nen113_044.wav|私はゲームセンターも好きですよ。普段は全然入ったこともありませんから、むしろ楽しみなぐらいです 286 | wav/nen113_056.wav|あっ、しゅうじ君。あっちにもほら、クマのぬいぐるみがありますよ 287 | wav/nen113_067.wav|私のことを考えてくれたからこそ、思い出の方を優先してくれたんですよね? 288 | wav/nen113_081.wav|そうですね。特別やレアって言われてしまうと、試しに頼んでみたくなりますね 289 | wav/nen113_096.wav|なにか違うこと考えてます 290 | wav/nen113_107.wav|いえ、もうジュースが無くなっちゃいましたから…… 291 | wav/nen113_118.wav|でも……いつもよりは、疲れましたよね? 292 | wav/nen113_132.wav|た、確かに……そうですね 293 | wav/nen113_145.wav|それならいいんですが…… 294 | wav/nen113_162.wav|どっ、どうやってって 295 | wav/nen114_017.wav|いえ。むしろ、こちらこそすみません。不透明な活動ばかりで……もっと結果が残るような物があればご迷惑もおかけしなかったんですが…… 296 | wav/nen114_028.wav|それに、パーティーで演奏しないとかりやさんはギターを披露できず、モヤモヤしたままになりませんか? 297 | wav/nen114_042.wav|そっ、その言い方は……卑怯ですよぅ 298 | wav/nen114_056.wav|そこも気になる部分ではあるんですが…… 299 | wav/nen114_072.wav|しゅうじ君を待っていたんです。最近、一緒にいられる時間が少ない気がして……なんとかしたいなと思って、終わるのを待ってたんです 300 | wav/nen114_083.wav|女の子同士でもですか? 
301 | wav/nen114_097.wav|はい、できました 302 | wav/nen114_111.wav|物じゃなくてですね、あの……ですから……しゅうじ君の願い事を、なんでも叶えます、私が 303 | wav/nen114_124.wav|ダメです 304 | wav/nen114_135.wav|んっ、んんーーーッ……んふぅ、ふぅー……ふぅー……んっ、んんっ、んむぅ……んぅ……も、もっと……しゅうじ君、もっと…… 305 | wav/nen114_146.wav|んぷぁぁっ、はっ、はぁ……はぁ……はひっ、んぁぁあ……はぁぁぁ…… 306 | wav/nen115_007.wav|でもその前に、私たちの演奏を聞いて下さい。一生懸命練習してきましたから 307 | wav/nen115_021.wav|しゅうじ君は……誰に投票したんですか? 308 | wav/nen115_037.wav|はぁ、ぁぁむ……ん、んんっ、ちゅちゅ……じゅる、ちゅぱちゅる、んっ、んんんんんーーーーーー 309 | wav/nen115_049.wav|ひっ!?あっ、あっ、あああぁぁーーっ! 310 | wav/nen115_061.wav|ずっとオナニー我慢してて……ぁぁぁぁあっ!しゅうじくん、しゅうじくん……っ、はぁ、はぁ、はぁぁあぁっ 311 | wav/nen115_072.wav|ふぇぇ……?はっ、はぁ、はぁ、はぁ……ど、どうかしたんですか……? 312 | wav/nen115_083.wav|好き、あっ、あっ、あっ、ひゅきでひゅ……おち●ちんにグリグリされるの……あっ、あっ、ああぁぁああっ! 313 | wav/nen115_094.wav|あーー……あはーーー……はひ、はひっ……んへぁぁ……私、こんな下品な声を出してイっちゃった……はぁーっ……はぁーっ…… 314 | wav/nen115_106.wav|んっ、んっ、んんーーーーっ!はひっ、はひっ、んっ、んんんーーーーッ! 315 | wav/nen115_118.wav|んひっ、あっ、あっ、んんっ、んんぁぁあっ、はぁーっ……はぁーっ……あっ、あっ、はぁぁぁぁぁ…… 316 | wav/nen115_129.wav|それに……こんなの、まるでおち●ちんが、私から生えたみたいです。しかも硬いままで…… 317 | wav/nen115_144.wav|んー……こんなものでしょうか 318 | wav/nen115_156.wav|お願い? 319 | wav/nen115_168.wav|んっ、ぅうぅ……はぁ、はぁ……んんっ、んんん…… 320 | wav/nen115_180.wav|はっ、んっ、んんぁっ、んぁっ……ぁぁあぁああぁ……引っかかるの、気持ちいい、です……んんー……ッッ 321 | wav/nen115_191.wav|はぁー……はぁー……はぁー……ぁぁぁ、んんんっ…… 322 | wav/nen115_202.wav|あっ、あっ、あーーーっ……中、中が切なくて……はぁ、はぁ、はぁ、あの、もうオナニーじゃなくなってもいいですか? 323 | wav/nen115_214.wav|だって……んぁぁ、ずっと待ってたんです。欲しくて、我慢してたんです……だから、仕方ないんですよ、ぁぁぁ…… 324 | wav/nen115_225.wav|違う、違うのぉ……身体が勝手に……ん、ん、ん、んぁぁあーーーぁぁぁぁ……こひゅれてる、気持ちいいの、こひゅれてるぅ 325 | wav/nen115_236.wav|おま●こですっ、おま●こに欲しい……んっ、んぁ……精液、こっちで飲みたいんです、んぁ、んぁ、んぁーーっ! 326 | wav/nen115_249.wav|え、えぇぇ……ま、まだ足りないんですか?こんなにドロドロにしたのに…… 327 | wav/nen115_262.wav|そう言ってもらえると……ありがとうございます 328 | wav/nen116_001.wav|はい 329 | wav/nen116_012.wav|私は……別に流されてもいいのに…… 330 | wav/nen116_026.wav|私にできることがあるなら、何でもします。だから、1人で苦しまないで下さい 331 | wav/nen116_039.wav|はい、大丈夫です 332 | wav/nen117_007.wav|どうしてそういうことを言うんですか!私の好きな人なのに! 333 | wav/nen117_020.wav|あとですね、せっかくですからお泊まり用具の他にも色々用意してきたんです 334 | wav/nen117_034.wav|あの、お風呂頂きました。お……お待たせ……しました、しゅうじ君 335 | wav/nen117_047.wav|せっかく気合いを入れて身体も綺麗にしたのに……先に寝ちゃうなんてひどいです 336 | wav/nen117_058.wav|よかった、安心しました 337 | wav/nen117_069.wav|んぅぅ……ちゅ、ちゅ、んんんんーーッ……嫌じゃないですよ?むしろ……私は濃い方が好きかもしれません……ん、じゅる、じゅるりっ 338 | wav/nen117_080.wav|じゃあ、続けますね。ん、ちゅ、ちゅぶぶ……んっ、じゅるっ、じゅぽじゅぽ、ちゅ、ちゅるるっ 339 | wav/nen117_091.wav|んふぅ……ほら、こうして正直に教えてくれます、気持ちいいって 340 | wav/nen117_103.wav|はぁ、はぁ、はぁ……すごい、トロトロと匂いが、さっきから止まりません……ああ、全然綺麗にできない 341 | wav/nen117_114.wav|んぐっ……んぶ、んぶ……ッ……んんんんーーーーーッ!ん、んんーーーー……コク……コク……ん、んんんむぅ 342 | wav/nen117_125.wav|ひゃっ、たくさん……あつい精液、びゅーって飛んで……あ、きゃっ、ひゃっ 343 | wav/nen117_136.wav|ん、ちゅば、ちゅば……んんっ、れろれろ……ンンッ……はぁ、はぁ……れりょれりょ 344 | wav/nen117_147.wav|んっ、んんっ、あむあむ……ぢゅぷ、ぢゅるるる……んぽくぽ、じゅるるるっ 345 | wav/nen117_158.wav|あっ!ダメですよ、これは罰なのに、あ、きゃぁぁぁッ 346 | wav/nen117_171.wav|んっ、んんんぁぁぁぁーーーーーーーー……ッッ! 347 | wav/nen117_183.wav|あ、あ、ああーーーーっ……はぁ、はぁ……あ、あ、あ、それ、すごい……すごいぃぃ……んんぁあッッ 348 | wav/nen117_194.wav|あっ、ひっ、んひぃぃッ……あーっ、あーっ……もうらめぇ…あ、あ、あ、イく、いっっ……くぅぅぅぅーーーーーぅぅぅぅううううッッ!! 349 | wav/nen117_206.wav|んっ、あっ、あっ、あっ、あっ……そうなんですか?わたし、もうちゃんと、覚えてるんですか? 
350 | wav/nen117_217.wav|イっちゃうっ、わたひまたイっちゃうぅぅ……ッ 351 | wav/nen117_228.wav|かひっ、かっ、はぁ、はぁ……んんんっ……はぁ、はぁ、んんっ、んぁ……ぁぁぁぁ…… 352 | wav/nen117_239.wav|それは、ちがっ、えっと、あががががががががががが―― 353 | wav/nen203_010.wav|はい、それは残念ながら 354 | wav/nen203_025.wav|ご協力ありがとうございます。それは思い至ってませんでした、助かりました 355 | wav/nen203_040.wav|心を許しあえるような相手が出来れば、おそらくは 356 | wav/nen203_053.wav|すみません……明日もこうでないといいんですが…… 357 | wav/nen203_065.wav|あの、どうかしたんですか?いなばさん 358 | wav/nen203_080.wav|それはたぶん、昨日話をした、胸の痛みに関わることなんですよね 359 | wav/nen203_095.wav|占いなんて、あくまでも切っ掛けみたいなものですから 360 | wav/nen203_111.wav|あ、あの、優しくしてください……それと、電気を消して……お願いです…… 361 | wav/nen203_127.wav|せっかくですし、一緒に入りませんか? 362 | wav/nen204_006.wav|では、今日はこの辺りで解散にしましょうか 363 | wav/nen205_018.wav|それでですね、ほしな君 364 | wav/nen206_007.wav|ええ、ちょっと 365 | wav/nen206_022.wav|そうですね。少なくとも、自分のせいっていうのはいなばさんの誤解かも知れませんし 366 | wav/nen206_033.wav|やりとりをオープンにした方が、互いに痛くもない腹を探り合わないで済むと思います 367 | wav/nen206_048.wav|もし、木月さんの行方が知れなくなったのが、魔法や契約と絡むことなら―― 368 | wav/nen206_063.wav|だから学院にも、なにも…… 369 | wav/nen207_016.wav|座りましたっ 370 | wav/nen207_031.wav|え?そ、それはもちろんですけど 371 | wav/nen209_001.wav|こんにちは 372 | wav/nen210_009.wav|とりあえず……ほしな君にその、想定外に下着まで見せてしまったんですよね? 373 | wav/nen210_023.wav|放課後、ななおのところまで付き合ってもらえませんか? 374 | wav/nen210_039.wav|お待たせしました 375 | wav/nen211_004.wav|はい。ですからほしな君の中には今、魔女2人のものである欠片がそれぞれにあります 376 | wav/nen211_015.wav|そして、こうなってしまったものは仕方がありませんし、回収不可能なわけでもないんですから 377 | wav/nen212_001.wav|う、上手くいったんですか? 378 | wav/nen212_015.wav|はい、おかげさまで 379 | wav/nen213_011.wav|生まれつき備えてしまっていた、あの能力のせいで 380 | wav/nen213_025.wav|はあ……せ、交尾ですか 381 | wav/nen214_010.wav|い、いえっなんでもっ 382 | wav/nen215_012.wav|それもわかりますけど 383 | wav/nen217_006.wav|とがくし先輩、その―― 384 | wav/nen218_009.wav|そこはまた、ご協力いただければ助かります 385 | wav/nen219_005.wav|ハッピーハロウィンですね、いなばさん 386 | wav/nen301_006.wav|ええ。私の方は、あともう少しで溜まりますから 387 | wav/nen301_017.wav|はい、頑張ります 388 | wav/nen302_010.wav|知っている方なんですか、2人とも? 389 | wav/nen303_003.wav|なるほど。だったら、しいばさんはあまり近づき過ぎない方がいいかもしれません 390 | wav/nen303_014.wav|はい、どうやらほしな君の心の穴が広がってしまった可能性がありそうです 391 | wav/nen303_030.wav|いいんです、ほしな君が吸収してしまった分なら、ほとんど回収した後ですし 392 | wav/nen303_045.wav|ほしな君の心の穴を埋めるのも、しいばさんにお任せした方が効率的かもしれません 393 | wav/nen305_004.wav|こ、交尾をされたわけではないですよね? 394 | wav/nen307_005.wav|もっとも、ほしな君が誘ったのはしいばさんです。しいばさん次第だと思いますが 395 | wav/nen308_007.wav|ですがしばらくの間、話し相手になることにしました 396 | wav/nen310_006.wav|いいんじゃないでしょうか? 397 | wav/nen312_003.wav|どうかしましたか?ほしな君もまだ来てないようですし、気になっていたんですが 398 | wav/nen312_014.wav|いえ、私も何も聞いていませんが 399 | wav/nen314_002.wav|ありがとうございます 400 | wav/nen314_016.wav|魔女を常に見張る者が多いそうです、心当たりはありませんか? 401 | wav/nen314_027.wav|すると心を強引に削り取った痕がみつかったんです! 402 | wav/nen315_002.wav|え、ええ 403 | wav/nen315_013.wav|はい、ですがこの場合、欠片は犯人から奪い返せばいいんです 404 | wav/nen315_024.wav|見つけ出すだけでも、なかなか骨が折れそうですが 405 | wav/nen316_003.wav|別のアルプがいるなら、匂いでわかるというのですが 406 | wav/nen317_008.wav|いえ、厚真さんが預かっていた子犬も、行方がわからなくなっているのを思い出したんですが 407 | wav/nen319_005.wav|人間に見えても、ぼんやりしないでしっかり警戒を 408 | wav/nen401_006.wav|ふー……ふー………………はぁ、美味しい 409 | wav/nen402_007.wav|はい 410 | wav/nen402_020.wav|ちょっと思いつきませんね 411 | wav/nen404_003.wav|もし何かあるなら休んでくれてもいいんですよ? 412 | wav/nen404_014.wav|私に対する罪悪感といいますか、義務感と言いますか……それはきっと同情に近い感情ですから…… 413 | wav/nen405_002.wav|ほしな君。ああいうのは、どうかと思います 414 | wav/nen405_013.wav|はい、何ですか? 
415 | wav/nen405_024.wav|いえ、今日は仕方ありませんよ。相談だけじゃなく、占いを希望する人も来ませんでしたからね 416 | wav/nen406_010.wav|欠片が戻ってきたのは、ほしな君がとがくし先輩とお付き合いをするようになったからだと思うんです 417 | wav/nen406_021.wav|それに……これはあくまで、責めるつもりではなく、色んな人の相談を受けて思った個人的な意見なんですが 418 | wav/nen409_003.wav|あ、ほしな君 419 | wav/nen409_014.wav|魔力の塊をぶつけることで、多少のショックを与えるかもしれないそうですが、先輩の心にひどい影響を与えるものじゃないそうです 420 | wav/nen409_025.wav|私は、この弾丸を撃てばいいわけですね 421 | wav/nen409_038.wav|それでは 422 | wav/nen410_010.wav|それは、ほしな君がオカ研で頑張ってくれた分で相殺です。実際、今のこの欠片の量は、私がほしな君と出会う前より、ほんの少し少ないだけですから 423 | wav/nen410_022.wav|学院内ではあれほどダメだって言ってるじゃないですか 424 | wav/nen504_001.wav|ほしな君、調子はどうですか? 425 | wav/nen505_008.wav|えっと……こ、ここは、励まし会とか開いた方がいいんでしょうか? 426 | wav/nen507_009.wav|なのに、部活を続けたりしたら、擦れ違いですとか、そういうことが心配になって 427 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | import torch 3 | from torch.autograd import Variable 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from layers import ConvNorm, LinearNorm 7 | from utils import to_gpu, get_mask_from_lengths 8 | 9 | 10 | class LocationLayer(nn.Module): 11 | def __init__(self, attention_n_filters, attention_kernel_size, 12 | attention_dim): 13 | super(LocationLayer, self).__init__() 14 | padding = int((attention_kernel_size - 1) / 2) 15 | self.location_conv = ConvNorm(2, attention_n_filters, 16 | kernel_size=attention_kernel_size, 17 | padding=padding, bias=False, stride=1, 18 | dilation=1) 19 | self.location_dense = LinearNorm(attention_n_filters, attention_dim, 20 | bias=False, w_init_gain='tanh') 21 | 22 | def forward(self, attention_weights_cat): 23 | processed_attention = self.location_conv(attention_weights_cat) 24 | processed_attention = processed_attention.transpose(1, 2) 25 | processed_attention = self.location_dense(processed_attention) 26 | return processed_attention 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, 31 | attention_location_n_filters, attention_location_kernel_size): 32 | super(Attention, self).__init__() 33 | self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, 34 | bias=False, w_init_gain='tanh') 35 | self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, 36 | w_init_gain='tanh') 37 | self.v = LinearNorm(attention_dim, 1, bias=False) 38 | self.location_layer = LocationLayer(attention_location_n_filters, 39 | attention_location_kernel_size, 40 | attention_dim) 41 | self.score_mask_value = -float("inf") 42 | 43 | def get_alignment_energies(self, query, processed_memory, 44 | attention_weights_cat): 45 | """ 46 | PARAMS 47 | ------ 48 | query: decoder output (batch, n_mel_channels * n_frames_per_step) 49 | processed_memory: processed encoder outputs (B, T_in, attention_dim) 50 | attention_weights_cat: cumulative and prev. 
att weights (B, 2, max_time) 51 | 52 | RETURNS 53 | ------- 54 | alignment (batch, max_time) 55 | """ 56 | 57 | processed_query = self.query_layer(query.unsqueeze(1)) 58 | processed_attention_weights = self.location_layer(attention_weights_cat) 59 | energies = self.v(torch.tanh( 60 | processed_query + processed_attention_weights + processed_memory)) 61 | 62 | energies = energies.squeeze(-1) 63 | return energies 64 | 65 | def forward(self, attention_hidden_state, memory, processed_memory, 66 | attention_weights_cat, mask): 67 | """ 68 | PARAMS 69 | ------ 70 | attention_hidden_state: attention rnn last output 71 | memory: encoder outputs 72 | processed_memory: processed encoder outputs 73 | attention_weights_cat: previous and cummulative attention weights 74 | mask: binary mask for padded data 75 | """ 76 | alignment = self.get_alignment_energies( 77 | attention_hidden_state, processed_memory, attention_weights_cat) 78 | 79 | if mask is not None: 80 | alignment.data.masked_fill_(mask, self.score_mask_value) 81 | 82 | attention_weights = F.softmax(alignment, dim=1) 83 | attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) 84 | attention_context = attention_context.squeeze(1) 85 | 86 | return attention_context, attention_weights 87 | 88 | 89 | class Prenet(nn.Module): 90 | def __init__(self, in_dim, sizes): 91 | super(Prenet, self).__init__() 92 | in_sizes = [in_dim] + sizes[:-1] 93 | self.layers = nn.ModuleList( 94 | [LinearNorm(in_size, out_size, bias=False) 95 | for (in_size, out_size) in zip(in_sizes, sizes)]) 96 | 97 | def forward(self, x): 98 | for linear in self.layers: 99 | x = F.dropout(F.relu(linear(x)), p=0.5, training=True) 100 | return x 101 | 102 | 103 | class Postnet(nn.Module): 104 | """Postnet 105 | - Five 1-d convolution with 512 channels and kernel size 5 106 | """ 107 | 108 | def __init__(self, hparams): 109 | super(Postnet, self).__init__() 110 | self.convolutions = nn.ModuleList() 111 | 112 | self.convolutions.append( 113 | nn.Sequential( 114 | ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim, 115 | kernel_size=hparams.postnet_kernel_size, stride=1, 116 | padding=int((hparams.postnet_kernel_size - 1) / 2), 117 | dilation=1, w_init_gain='tanh'), 118 | nn.BatchNorm1d(hparams.postnet_embedding_dim)) 119 | ) 120 | 121 | for i in range(1, hparams.postnet_n_convolutions - 1): 122 | self.convolutions.append( 123 | nn.Sequential( 124 | ConvNorm(hparams.postnet_embedding_dim, 125 | hparams.postnet_embedding_dim, 126 | kernel_size=hparams.postnet_kernel_size, stride=1, 127 | padding=int((hparams.postnet_kernel_size - 1) / 2), 128 | dilation=1, w_init_gain='tanh'), 129 | nn.BatchNorm1d(hparams.postnet_embedding_dim)) 130 | ) 131 | 132 | self.convolutions.append( 133 | nn.Sequential( 134 | ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels, 135 | kernel_size=hparams.postnet_kernel_size, stride=1, 136 | padding=int((hparams.postnet_kernel_size - 1) / 2), 137 | dilation=1, w_init_gain='linear'), 138 | nn.BatchNorm1d(hparams.n_mel_channels)) 139 | ) 140 | 141 | def forward(self, x): 142 | for i in range(len(self.convolutions) - 1): 143 | x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training) 144 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 145 | 146 | return x 147 | 148 | 149 | class Encoder(nn.Module): 150 | """Encoder module: 151 | - Three 1-d convolution banks 152 | - Bidirectional LSTM 153 | """ 154 | def __init__(self, hparams): 155 | super(Encoder, self).__init__() 156 | 157 | convolutions = [] 158 | for 
_ in range(hparams.encoder_n_convolutions): 159 | conv_layer = nn.Sequential( 160 | ConvNorm(hparams.encoder_embedding_dim, 161 | hparams.encoder_embedding_dim, 162 | kernel_size=hparams.encoder_kernel_size, stride=1, 163 | padding=int((hparams.encoder_kernel_size - 1) / 2), 164 | dilation=1, w_init_gain='relu'), 165 | nn.BatchNorm1d(hparams.encoder_embedding_dim)) 166 | convolutions.append(conv_layer) 167 | self.convolutions = nn.ModuleList(convolutions) 168 | 169 | self.lstm = nn.LSTM(hparams.encoder_embedding_dim, 170 | int(hparams.encoder_embedding_dim / 2), 1, 171 | batch_first=True, bidirectional=True) 172 | 173 | def forward(self, x, input_lengths): 174 | for conv in self.convolutions: 175 | x = F.dropout(F.relu(conv(x)), 0.5, self.training) 176 | 177 | x = x.transpose(1, 2) 178 | 179 | # pytorch tensor are not reversible, hence the conversion 180 | input_lengths = input_lengths.cpu().numpy() 181 | x = nn.utils.rnn.pack_padded_sequence( 182 | x, input_lengths, batch_first=True) 183 | 184 | self.lstm.flatten_parameters() 185 | outputs, _ = self.lstm(x) 186 | 187 | outputs, _ = nn.utils.rnn.pad_packed_sequence( 188 | outputs, batch_first=True) 189 | 190 | return outputs 191 | 192 | def inference(self, x): 193 | for conv in self.convolutions: 194 | x = F.dropout(F.relu(conv(x)), 0.5, self.training) 195 | 196 | x = x.transpose(1, 2) 197 | 198 | self.lstm.flatten_parameters() 199 | outputs, _ = self.lstm(x) 200 | 201 | return outputs 202 | 203 | 204 | class Decoder(nn.Module): 205 | def __init__(self, hparams): 206 | super(Decoder, self).__init__() 207 | self.n_mel_channels = hparams.n_mel_channels 208 | self.n_frames_per_step = hparams.n_frames_per_step 209 | self.encoder_embedding_dim = hparams.encoder_embedding_dim 210 | self.attention_rnn_dim = hparams.attention_rnn_dim 211 | self.decoder_rnn_dim = hparams.decoder_rnn_dim 212 | self.prenet_dim = hparams.prenet_dim 213 | self.max_decoder_steps = hparams.max_decoder_steps 214 | self.gate_threshold = hparams.gate_threshold 215 | self.p_attention_dropout = hparams.p_attention_dropout 216 | self.p_decoder_dropout = hparams.p_decoder_dropout 217 | 218 | self.prenet = Prenet( 219 | hparams.n_mel_channels * hparams.n_frames_per_step, 220 | [hparams.prenet_dim, hparams.prenet_dim]) 221 | 222 | self.attention_rnn = nn.LSTMCell( 223 | hparams.prenet_dim + hparams.encoder_embedding_dim, 224 | hparams.attention_rnn_dim) 225 | 226 | self.attention_layer = Attention( 227 | hparams.attention_rnn_dim, hparams.encoder_embedding_dim, 228 | hparams.attention_dim, hparams.attention_location_n_filters, 229 | hparams.attention_location_kernel_size) 230 | 231 | self.decoder_rnn = nn.LSTMCell( 232 | hparams.attention_rnn_dim + hparams.encoder_embedding_dim, 233 | hparams.decoder_rnn_dim, 1) 234 | 235 | self.linear_projection = LinearNorm( 236 | hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 237 | hparams.n_mel_channels * hparams.n_frames_per_step) 238 | 239 | self.gate_layer = LinearNorm( 240 | hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1, 241 | bias=True, w_init_gain='sigmoid') 242 | 243 | def get_go_frame(self, memory): 244 | """ Gets all zeros frames to use as first decoder input 245 | PARAMS 246 | ------ 247 | memory: decoder outputs 248 | 249 | RETURNS 250 | ------- 251 | decoder_input: all zeros frames 252 | """ 253 | B = memory.size(0) 254 | decoder_input = Variable(memory.data.new( 255 | B, self.n_mel_channels * self.n_frames_per_step).zero_()) 256 | return decoder_input 257 | 258 | def initialize_decoder_states(self, memory, 
mask): 259 | """ Initializes attention rnn states, decoder rnn states, attention 260 | weights, attention cumulative weights, attention context, stores memory 261 | and stores processed memory 262 | PARAMS 263 | ------ 264 | memory: Encoder outputs 265 | mask: Mask for padded data if training, expects None for inference 266 | """ 267 | B = memory.size(0) 268 | MAX_TIME = memory.size(1) 269 | 270 | self.attention_hidden = Variable(memory.data.new( 271 | B, self.attention_rnn_dim).zero_()) 272 | self.attention_cell = Variable(memory.data.new( 273 | B, self.attention_rnn_dim).zero_()) 274 | 275 | self.decoder_hidden = Variable(memory.data.new( 276 | B, self.decoder_rnn_dim).zero_()) 277 | self.decoder_cell = Variable(memory.data.new( 278 | B, self.decoder_rnn_dim).zero_()) 279 | 280 | self.attention_weights = Variable(memory.data.new( 281 | B, MAX_TIME).zero_()) 282 | self.attention_weights_cum = Variable(memory.data.new( 283 | B, MAX_TIME).zero_()) 284 | self.attention_context = Variable(memory.data.new( 285 | B, self.encoder_embedding_dim).zero_()) 286 | 287 | self.memory = memory 288 | self.processed_memory = self.attention_layer.memory_layer(memory) 289 | self.mask = mask 290 | 291 | def parse_decoder_inputs(self, decoder_inputs): 292 | """ Prepares decoder inputs, i.e. mel outputs 293 | PARAMS 294 | ------ 295 | decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs 296 | 297 | RETURNS 298 | ------- 299 | inputs: processed decoder inputs 300 | 301 | """ 302 | # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels) 303 | decoder_inputs = decoder_inputs.transpose(1, 2) 304 | decoder_inputs = decoder_inputs.view( 305 | decoder_inputs.size(0), 306 | int(decoder_inputs.size(1)/self.n_frames_per_step), -1) 307 | # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels) 308 | decoder_inputs = decoder_inputs.transpose(0, 1) 309 | return decoder_inputs 310 | 311 | def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments): 312 | """ Prepares decoder outputs for output 313 | PARAMS 314 | ------ 315 | mel_outputs: 316 | gate_outputs: gate output energies 317 | alignments: 318 | 319 | RETURNS 320 | ------- 321 | mel_outputs: 322 | gate_outpust: gate output energies 323 | alignments: 324 | """ 325 | # (T_out, B) -> (B, T_out) 326 | alignments = torch.stack(alignments).transpose(0, 1) 327 | # (T_out, B) -> (B, T_out) 328 | gate_outputs = torch.stack(gate_outputs).transpose(0, 1) 329 | gate_outputs = gate_outputs.contiguous() 330 | # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels) 331 | mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous() 332 | # decouple frames per step 333 | mel_outputs = mel_outputs.view( 334 | mel_outputs.size(0), -1, self.n_mel_channels) 335 | # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out) 336 | mel_outputs = mel_outputs.transpose(1, 2) 337 | 338 | return mel_outputs, gate_outputs, alignments 339 | 340 | def decode(self, decoder_input): 341 | """ Decoder step using stored states, attention and memory 342 | PARAMS 343 | ------ 344 | decoder_input: previous mel output 345 | 346 | RETURNS 347 | ------- 348 | mel_output: 349 | gate_output: gate output energies 350 | attention_weights: 351 | """ 352 | cell_input = torch.cat((decoder_input, self.attention_context), -1) 353 | self.attention_hidden, self.attention_cell = self.attention_rnn( 354 | cell_input, (self.attention_hidden, self.attention_cell)) 355 | self.attention_hidden = F.dropout( 356 | self.attention_hidden, self.p_attention_dropout, self.training) 357 | 
358 |         attention_weights_cat = torch.cat(
359 |             (self.attention_weights.unsqueeze(1),
360 |              self.attention_weights_cum.unsqueeze(1)), dim=1)
361 |         self.attention_context, self.attention_weights = self.attention_layer(
362 |             self.attention_hidden, self.memory, self.processed_memory,
363 |             attention_weights_cat, self.mask)
364 | 
365 |         self.attention_weights_cum += self.attention_weights
366 |         decoder_input = torch.cat(
367 |             (self.attention_hidden, self.attention_context), -1)
368 |         self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
369 |             decoder_input, (self.decoder_hidden, self.decoder_cell))
370 |         self.decoder_hidden = F.dropout(
371 |             self.decoder_hidden, self.p_decoder_dropout, self.training)
372 | 
373 |         decoder_hidden_attention_context = torch.cat(
374 |             (self.decoder_hidden, self.attention_context), dim=1)
375 |         decoder_output = self.linear_projection(
376 |             decoder_hidden_attention_context)
377 | 
378 |         gate_prediction = self.gate_layer(decoder_hidden_attention_context)
379 |         return decoder_output, gate_prediction, self.attention_weights
380 | 
381 |     def forward(self, memory, decoder_inputs, memory_lengths):
382 |         """ Decoder forward pass for training
383 |         PARAMS
384 |         ------
385 |         memory: Encoder outputs
386 |         decoder_inputs: Decoder inputs for teacher forcing, i.e. mel-specs
387 |         memory_lengths: Encoder output lengths for attention masking.
388 | 
389 |         RETURNS
390 |         -------
391 |         mel_outputs: mel outputs from the decoder
392 |         gate_outputs: gate outputs from the decoder
393 |         alignments: sequence of attention weights from the decoder
394 |         """
395 | 
396 |         decoder_input = self.get_go_frame(memory).unsqueeze(0)
397 |         decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
398 |         decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
399 |         decoder_inputs = self.prenet(decoder_inputs)
400 | 
401 |         self.initialize_decoder_states(
402 |             memory, mask=~get_mask_from_lengths(memory_lengths))
403 | 
404 |         mel_outputs, gate_outputs, alignments = [], [], []
405 |         while len(mel_outputs) < decoder_inputs.size(0) - 1:
406 |             decoder_input = decoder_inputs[len(mel_outputs)]
407 |             mel_output, gate_output, attention_weights = self.decode(
408 |                 decoder_input)
409 |             mel_outputs += [mel_output.squeeze(1)]
410 |             gate_outputs += [gate_output.squeeze(1)]
411 |             alignments += [attention_weights]
412 | 
413 |         mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
414 |             mel_outputs, gate_outputs, alignments)
415 | 
416 |         return mel_outputs, gate_outputs, alignments
417 | 
418 |     def inference(self, memory):
419 |         """ Decoder inference
420 |         PARAMS
421 |         ------
422 |         memory: Encoder outputs
423 | 
424 |         RETURNS
425 |         -------
426 |         mel_outputs: mel outputs from the decoder
427 |         gate_outputs: gate outputs from the decoder
428 |         alignments: sequence of attention weights from the decoder
429 |         """
430 |         decoder_input = self.get_go_frame(memory)
431 | 
432 |         self.initialize_decoder_states(memory, mask=None)
433 | 
434 |         mel_outputs, gate_outputs, alignments = [], [], []
435 |         while True:
436 |             decoder_input = self.prenet(decoder_input)
437 |             mel_output, gate_output, alignment = self.decode(decoder_input)
438 | 
439 |             mel_outputs += [mel_output.squeeze(1)]
440 |             gate_outputs += [gate_output]
441 |             alignments += [alignment]
442 | 
443 |             if torch.sigmoid(gate_output.data) > self.gate_threshold:
444 |                 break
445 |             elif len(mel_outputs) == self.max_decoder_steps:
446 |                 print("Warning! Reached max decoder steps")
447 |                 break
448 | 
449 |             decoder_input = mel_output
450 | 
451 |         mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
452 |             mel_outputs, gate_outputs, alignments)
453 | 
454 |         return mel_outputs, gate_outputs, alignments
455 | 
456 | 
457 | class Tacotron2(nn.Module):
458 |     def __init__(self, hparams):
459 |         super(Tacotron2, self).__init__()
460 |         self.mask_padding = hparams.mask_padding
461 |         self.fp16_run = hparams.fp16_run
462 |         self.n_mel_channels = hparams.n_mel_channels
463 |         self.n_frames_per_step = hparams.n_frames_per_step
464 |         self.embedding = nn.Embedding(
465 |             hparams.n_symbols, hparams.symbols_embedding_dim)
466 |         std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
467 |         val = sqrt(3.0) * std  # uniform bounds for std
468 |         self.embedding.weight.data.uniform_(-val, val)
469 |         self.encoder = Encoder(hparams)
470 |         self.decoder = Decoder(hparams)
471 |         self.postnet = Postnet(hparams)
472 | 
473 |     def parse_batch(self, batch):
474 |         text_padded, input_lengths, mel_padded, gate_padded, \
475 |             output_lengths = batch
476 |         text_padded = to_gpu(text_padded).long()
477 |         input_lengths = to_gpu(input_lengths).long()
478 |         max_len = torch.max(input_lengths.data).item()
479 |         mel_padded = to_gpu(mel_padded).float()
480 |         gate_padded = to_gpu(gate_padded).float()
481 |         output_lengths = to_gpu(output_lengths).long()
482 | 
483 |         return (
484 |             (text_padded, input_lengths, mel_padded, max_len, output_lengths),
485 |             (mel_padded, gate_padded))
486 | 
487 |     def parse_output(self, outputs, output_lengths=None):
488 |         if self.mask_padding and output_lengths is not None:
489 |             mask = ~get_mask_from_lengths(output_lengths)
490 |             mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
491 |             mask = mask.permute(1, 0, 2)
492 | 
493 |             outputs[0].data.masked_fill_(mask, 0.0)
494 |             outputs[1].data.masked_fill_(mask, 0.0)
495 |             outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies
496 | 
497 |         return outputs
498 | 
499 |     def forward(self, inputs):
500 |         text_inputs, text_lengths, mels, max_len, output_lengths = inputs
501 |         text_lengths, output_lengths = text_lengths.data, output_lengths.data
502 | 
503 |         embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
504 | 
505 |         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
506 | 
507 |         mel_outputs, gate_outputs, alignments = self.decoder(
508 |             encoder_outputs, mels, memory_lengths=text_lengths)
509 | 
510 |         mel_outputs_postnet = self.postnet(mel_outputs)
511 |         mel_outputs_postnet = mel_outputs + mel_outputs_postnet
512 | 
513 |         return self.parse_output(
514 |             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
515 |             output_lengths)
516 | 
517 |     def inference(self, inputs):
518 |         embedded_inputs = self.embedding(inputs).transpose(1, 2)
519 |         encoder_outputs = self.encoder.inference(embedded_inputs)
520 |         mel_outputs, gate_outputs, alignments = self.decoder.inference(
521 |             encoder_outputs)
522 | 
523 |         mel_outputs_postnet = self.postnet(mel_outputs)
524 |         mel_outputs_postnet = mel_outputs + mel_outputs_postnet
525 | 
526 |         outputs = self.parse_output(
527 |             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
528 | 
529 |         return outputs
530 | 
--------------------------------------------------------------------------------
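model.py above is the complete Tacotron 2 network: the symbol embedding feeds the convolutional/BiLSTM Encoder, the attention-based Decoder emits mel frames plus a stop gate, and the Postnet refines the spectrogram, which a neural vocoder such as the bundled WaveGlow submodule can then turn into audio. Below is a minimal inference sketch. It assumes `create_hparams` (hparams.py) and `text_to_sequence` (text package) behave as in the upstream NVIDIA Tacotron 2 code; the checkpoint path and cleaner name are illustrative placeholders, not files or settings guaranteed by this repository.

import torch
from hparams import create_hparams      # assumed API, as in upstream NVIDIA Tacotron 2
from model import Tacotron2
from text import text_to_sequence       # assumed helper from the text package

hparams = create_hparams()
model = Tacotron2(hparams).cuda().eval()

# Hypothetical checkpoint path; train.py is assumed to store weights under 'state_dict'.
checkpoint = torch.load("outdir/checkpoint_10000", map_location="cuda")
model.load_state_dict(checkpoint["state_dict"])

# Convert text to a (1, T_in) batch of symbol IDs; the cleaner name is an assumption for this fork.
sequence = text_to_sequence("konnichiwa.", ["japanese_cleaners"])
sequence = torch.LongTensor(sequence).unsqueeze(0).cuda()

with torch.no_grad():
    mel, mel_postnet, gate, alignments = model.inference(sequence)

# mel_postnet has shape (1, n_mel_channels, T_out) and is the spectrogram a vocoder converts to a waveform.

Because Tacotron2.inference returns the same four-element output as parse_output, the unpacking above mirrors what training code receives, with mel_postnet being the refined spectrogram normally passed on to WaveGlow.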