├── demo.wav
├── tensorboard.png
├── waveglow
│   ├── .gitmodules
│   ├── waveglow_logo.png
│   ├── requirements.txt
│   ├── config.json
│   ├── LICENSE
│   ├── denoiser.py
│   ├── convert_model.py
│   ├── README.md
│   ├── inference.py
│   ├── mel2samp.py
│   ├── distributed.py
│   ├── train.py
│   ├── glow_old.py
│   └── glow.py
├── .gitmodules
├── requirements.txt
├── Dockerfile
├── multiproc.py
├── loss_function.py
├── text
│   ├── symbols.py
│   ├── LICENSE
│   ├── cmudict.py
│   ├── numbers.py
│   ├── __init__.py
│   └── cleaners.py
├── utils.py
├── LICENSE
├── README.md
├── plotting_utils.py
├── logger.py
├── train.ipynb
├── .gitattributes
├── hparams.py
├── audio_processing.py
├── layers.py
├── loss_scaler.py
├── data_utils.py
├── stft.py
├── distributed.py
├── .gitignore
├── train.py
├── filelists
│   └── transcript_val.txt
└── model.py

/demo.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CjangCjengh/tacotron2-japanese/HEAD/demo.wav
--------------------------------------------------------------------------------
/tensorboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CjangCjengh/tacotron2-japanese/HEAD/tensorboard.png
--------------------------------------------------------------------------------
/waveglow/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tacotron2"]
2 | path = tacotron2
3 | url = http://github.com/NVIDIA/tacotron2
4 | 
--------------------------------------------------------------------------------
/waveglow/waveglow_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CjangCjengh/tacotron2-japanese/HEAD/waveglow/waveglow_logo.png
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "waveglow"]
2 | path = waveglow
3 | url = https://github.com/NVIDIA/waveglow
4 | branch = master
5 | 
--------------------------------------------------------------------------------
/waveglow/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.0
2 | matplotlib==2.1.0
3 | tensorflow
4 | numpy==1.13.3
5 | inflect==0.2.5
6 | librosa==0.6.0
7 | scipy==1.0.0
8 | tensorboardX==1.1
9 | Unidecode==1.0.22
10 | pillow
11 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pillow
2 | matplotlib
3 | numpy==1.22.4
4 | inflect
5 | librosa
6 | denoiser
7 | pysoundfile
8 | scipy
9 | Unidecode
10 | openjtalk>=0.3.0.dev2
11 | janome
12 | torch
13 | tensorboardX
14 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
2 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
3 | 
4 | RUN apt-get update -y
5 | 
6 | RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 pillow jupyter
7 | 
8 | ADD apex /apex/
9 | WORKDIR /apex/
10 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
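# The two RUN steps above install the Python dependencies and then build NVIDIA Apex from
# the copied /apex sources with its C++/CUDA extensions (--cpp_ext / --cuda_ext); Apex
# provides optional mixed-precision and distributed-training utilities (cf. the fp16_run
# and distributed_run options in hparams.py).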
11 | -------------------------------------------------------------------------------- /multiproc.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import sys 4 | import subprocess 5 | 6 | argslist = list(sys.argv)[1:] 7 | num_gpus = torch.cuda.device_count() 8 | argslist.append('--n_gpus={}'.format(num_gpus)) 9 | workers = [] 10 | job_id = time.strftime("%Y_%m_%d-%H%M%S") 11 | argslist.append("--group_name=group_{}".format(job_id)) 12 | 13 | for i in range(num_gpus): 14 | argslist.append('--rank={}'.format(i)) 15 | stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i), 16 | "w") 17 | print(argslist) 18 | p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) 19 | workers.append(p) 20 | argslist = argslist[:-1] 21 | 22 | for p in workers: 23 | p.wait() 24 | -------------------------------------------------------------------------------- /loss_function.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class Tacotron2Loss(nn.Module): 5 | def __init__(self): 6 | super(Tacotron2Loss, self).__init__() 7 | 8 | def forward(self, model_output, targets): 9 | mel_target, gate_target = targets[0], targets[1] 10 | mel_target.requires_grad = False 11 | gate_target.requires_grad = False 12 | gate_target = gate_target.view(-1, 1) 13 | 14 | mel_out, mel_out_postnet, gate_out, _ = model_output 15 | gate_out = gate_out.view(-1, 1) 16 | mel_loss = nn.MSELoss()(mel_out, mel_target) + \ 17 | nn.MSELoss()(mel_out_postnet, mel_target) 18 | gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target) 19 | return mel_loss + gate_loss 20 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | from text import cmudict 8 | 9 | _pad = '_' 10 | _punctuation = '!\'(),.:;? ' 11 | _special = '-' 12 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet 19 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /waveglow/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_config": { 3 | "fp16_run": true, 4 | "output_directory": "checkpoints", 5 | "epochs": 100000, 6 | "learning_rate": 1e-4, 7 | "sigma": 1.0, 8 | "iters_per_checkpoint": 2000, 9 | "batch_size": 12, 10 | "seed": 1234, 11 | "checkpoint_path": "", 12 | "with_tensorboard": false 13 | }, 14 | "data_config": { 15 | "training_files": "train_files.txt", 16 | "segment_length": 16000, 17 | "sampling_rate": 22050, 18 | "filter_length": 1024, 19 | "hop_length": 256, 20 | "win_length": 1024, 21 | "mel_fmin": 0.0, 22 | "mel_fmax": 8000.0 23 | }, 24 | "dist_config": { 25 | "dist_backend": "nccl", 26 | "dist_url": "tcp://localhost:54321" 27 | }, 28 | 29 | "waveglow_config": { 30 | "n_mel_channels": 80, 31 | "n_flows": 12, 32 | "n_group": 8, 33 | "n_early_every": 4, 34 | "n_early_size": 2, 35 | "WN_config": { 36 | "n_layers": 8, 37 | "n_channels": 256, 38 | "kernel_size": 3 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.io.wavfile import read 3 | import torch 4 | 5 | 6 | from hparams import create_hparams 7 | #hparam = create_hparams() 8 | #hparam.cuda_enabled = False 9 | 10 | def get_mask_from_lengths(lengths): 11 | max_len = torch.max(lengths).item() 12 | 13 | #if hparam.cuda_enabled : 14 | if create_hparams.cuda_enabled : 15 | ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)) 16 | mask = (ids < lengths.unsqueeze(1)).bool() 17 | else : 18 | ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)) 19 | mask = (ids < lengths.unsqueeze(1)).bool() 20 | 21 | return mask 22 | 23 | 24 | 25 | def load_wav_to_torch(full_path): 26 | sampling_rate, data = read(full_path) 27 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate 28 | 29 | 30 | def load_filepaths_and_text(filename, split="|"): 31 | with open(filename, encoding='utf-8') as f: 32 | filepaths_and_text = [line.strip().split(split) for line in f] 33 | return filepaths_and_text 34 | 35 | 36 | def to_gpu(x): 37 | x = x.contiguous() 38 | 39 | if torch.cuda.is_available(): 40 | x = x.cuda(non_blocking=True) 41 | return torch.autograd.Variable(x) 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /waveglow/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, NVIDIA Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tacotron2-Japanese
2 | - Tacotron2 implementation for Japanese text-to-speech
3 | ## Links
4 | * Reference: [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2)
5 | * [Pre-trained Tacotron2 models](https://github.com/CjangCjengh/TTSModels)
6 | * [Latest changes can be viewed in this repository](https://github.com/StarxSky/tacotron2-JP)
7 | 
8 | ## How to use
9 | 1. Put raw Japanese transcripts in ./filelists (the defaults are filelists/transcript_train.txt and filelists/transcript_val.txt, see ./hparams.py)
10 | 2. Put WAV files in ./wav
11 | 3. (Optional) Download NVIDIA's [pretrained model](https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing)
12 | 4. Open ./train.ipynb to install requirements and start training
13 | 5. Download NVIDIA's [WaveGlow model](https://drive.google.com/open?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF)
14 | 6. Open ./inference.ipynb to synthesize speech
15 | 
16 | ## Cleaners
17 | Set `text_cleaners` in ./hparams.py (Data Parameters section) to one of the following:
18 | ### 1. 'japanese_cleaners'
19 | #### Before
20 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
21 | #### After
22 | nanikaacltaraitsudemohanashItekudasai.gakuiNnokotojanaku,shijinikaNsurukotodemonanidemo.
23 | ### 2. 'japanese_tokenization_cleaners'
24 | #### Before
25 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
26 | #### After
27 | nani ka acl tara itsu demo hanashi te kudasai. gakuiN no koto ja naku, shiji nikaNsuru koto de mo naNdemo.
28 | ### 3. 'japanese_accent_cleaners'
29 | #### Before
30 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
31 | #### After
32 | :na)nika a)cltara i)tsudemo ha(na)shIte ku(dasa)i.:ga(kuiNno ko(to)janaku,:shi)jini ka(Nsu)ru ko(to)demo na)nidemo.
33 | ### 4. 'japanese_phrase_cleaners'
34 | #### Before
35 | 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
36 | #### After
37 | nanika acltara itsudemo hanashIte kudasai. gakuiNno kotojanaku, shijini kaNsuru kotodemo nanidemo.
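### Applying a cleaner in code
For reference, a minimal sketch of how a cleaner is applied through `text_to_sequence` (assuming the repository root is on `PYTHONPATH`, the selected cleaner is implemented in ./text/cleaners.py, and the packages from requirements.txt are installed):

```python
# Minimal sketch: turn a Japanese sentence into the integer symbol IDs the model consumes.
from text import text_to_sequence, sequence_to_text

text = '何かあったらいつでも話して下さい。'
ids = text_to_sequence(text, ['japanese_cleaners'])   # cleaner name as set in hparams.py
print(ids)                    # list of IDs indexing the symbol table in text/symbols.py
print(sequence_to_text(ids))  # romanized string produced by the cleaner (cf. the examples above)
```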
38 | -------------------------------------------------------------------------------- /waveglow/denoiser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('tacotron2') 3 | import torch 4 | from layers import STFT 5 | 6 | 7 | class Denoiser(torch.nn.Module): 8 | """ Removes model bias from audio produced with waveglow """ 9 | 10 | def __init__(self, waveglow, filter_length=1024, n_overlap=4, 11 | win_length=1024, mode='zeros'): 12 | super(Denoiser, self).__init__() 13 | self.stft = STFT(filter_length=filter_length, 14 | hop_length=int(filter_length/n_overlap), 15 | win_length=win_length).cuda() 16 | if mode == 'zeros': 17 | mel_input = torch.zeros( 18 | (1, 80, 88), 19 | dtype=waveglow.upsample.weight.dtype, 20 | device=waveglow.upsample.weight.device) 21 | elif mode == 'normal': 22 | mel_input = torch.randn( 23 | (1, 80, 88), 24 | dtype=waveglow.upsample.weight.dtype, 25 | device=waveglow.upsample.weight.device) 26 | else: 27 | raise Exception("Mode {} if not supported".format(mode)) 28 | 29 | with torch.no_grad(): 30 | bias_audio = waveglow.infer(mel_input, sigma=0.0).float() 31 | bias_spec, _ = self.stft.transform(bias_audio) 32 | 33 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 34 | 35 | def forward(self, audio, strength=0.1): 36 | audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) 37 | audio_spec_denoised = audio_spec - self.bias_spec * strength 38 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 39 | audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) 40 | return audio_denoised 41 | -------------------------------------------------------------------------------- /plotting_utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use("Agg") 3 | import matplotlib.pylab as plt 4 | import numpy as np 5 | 6 | 7 | def save_figure_to_numpy(fig): 8 | # save it to a numpy array. 
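    # Note: the plot_* helpers below call fig.canvas.draw() before invoking this function,
    # so the canvas already holds the rendered figure; its RGB bytes are copied into an
    # array and reshaped to (height, width, 3), matching the
    # add_image(..., dataformats='HWC') calls in logger.py.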
9 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 10 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 11 | return data 12 | 13 | 14 | def plot_alignment_to_numpy(alignment, info=None): 15 | fig, ax = plt.subplots(figsize=(6, 4)) 16 | im = ax.imshow(alignment, aspect='auto', origin='lower', 17 | interpolation='none') 18 | fig.colorbar(im, ax=ax) 19 | xlabel = 'Decoder timestep' 20 | if info is not None: 21 | xlabel += '\n\n' + info 22 | plt.xlabel(xlabel) 23 | plt.ylabel('Encoder timestep') 24 | plt.tight_layout() 25 | 26 | fig.canvas.draw() 27 | data = save_figure_to_numpy(fig) 28 | plt.close() 29 | return data 30 | 31 | 32 | def plot_spectrogram_to_numpy(spectrogram): 33 | fig, ax = plt.subplots(figsize=(12, 3)) 34 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 35 | interpolation='none') 36 | plt.colorbar(im, ax=ax) 37 | plt.xlabel("Frames") 38 | plt.ylabel("Channels") 39 | plt.tight_layout() 40 | 41 | fig.canvas.draw() 42 | data = save_figure_to_numpy(fig) 43 | plt.close() 44 | return data 45 | 46 | 47 | def plot_gate_outputs_to_numpy(gate_targets, gate_outputs): 48 | fig, ax = plt.subplots(figsize=(12, 3)) 49 | ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5, 50 | color='green', marker='+', s=1, label='target') 51 | ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5, 52 | color='red', marker='.', s=1, label='predicted') 53 | 54 | plt.xlabel("Frames (Green target, Red predicted)") 55 | plt.ylabel("Gate State") 56 | plt.tight_layout() 57 | 58 | fig.canvas.draw() 59 | data = save_figure_to_numpy(fig) 60 | plt.close() 61 | return data 62 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy 5 | from plotting_utils import plot_gate_outputs_to_numpy 6 | 7 | 8 | class Tacotron2Logger(SummaryWriter): 9 | def __init__(self, logdir): 10 | super(Tacotron2Logger, self).__init__(logdir) 11 | 12 | def log_training(self, reduced_loss, grad_norm, learning_rate, duration, 13 | iteration): 14 | self.add_scalar("training.loss", reduced_loss, iteration) 15 | self.add_scalar("grad.norm", grad_norm, iteration) 16 | self.add_scalar("learning.rate", learning_rate, iteration) 17 | self.add_scalar("duration", duration, iteration) 18 | 19 | def log_validation(self, reduced_loss, model, y, y_pred, iteration): 20 | self.add_scalar("validation.loss", reduced_loss, iteration) 21 | _, mel_outputs, gate_outputs, alignments = y_pred 22 | mel_targets, gate_targets = y 23 | 24 | # plot distribution of parameters 25 | for tag, value in model.named_parameters(): 26 | tag = tag.replace('.', '/') 27 | self.add_histogram(tag, value.data.cpu().numpy(), iteration) 28 | 29 | # plot alignment, mel target and predicted, gate target and predicted 30 | idx = random.randint(0, alignments.size(0) - 1) 31 | self.add_image( 32 | "alignment", 33 | plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T), 34 | iteration, dataformats='HWC') 35 | self.add_image( 36 | "mel_target", 37 | plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()), 38 | iteration, dataformats='HWC') 39 | self.add_image( 40 | "mel_predicted", 41 | plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()), 42 | iteration, dataformats='HWC') 43 | self.add_image( 44 | "gate", 45 | 
plot_gate_outputs_to_numpy( 46 | gate_targets[idx].data.cpu().numpy(), 47 | torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()), 48 | iteration, dataformats='HWC') 49 | -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 8 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 9 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 10 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 11 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 12 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 13 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | def __init__(self, file_or_path, keep_ambiguous=True): 22 | if isinstance(file_or_path, str): 23 | with open(file_or_path, encoding='latin-1') as f: 24 | entries = _parse_cmudict(f) 25 | else: 26 | entries = _parse_cmudict(file_or_path) 27 | if not keep_ambiguous: 28 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 29 | self._entries = entries 30 | 31 | 32 | def __len__(self): 33 | return len(self._entries) 34 | 35 | 36 | def lookup(self, word): 37 | '''Returns list of ARPAbet pronunciations of the given word.''' 38 | return self._entries.get(word.upper()) 39 | 40 | 41 | 42 | _alt_re = re.compile(r'\([0-9]+\)') 43 | 44 | 45 | def _parse_cmudict(file): 46 | cmudict = {} 47 | for line in file: 48 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 49 | parts = line.split(' ') 50 | word = re.sub(_alt_re, '', parts[0]) 51 | pronunciation = _get_pronunciation(parts[1]) 52 | if pronunciation: 53 | if word in cmudict: 54 | cmudict[word].append(pronunciation) 55 | else: 56 | cmudict[word] = [pronunciation] 57 | return cmudict 58 | 59 | 60 | def _get_pronunciation(s): 61 | parts = s.strip().split(' ') 62 | for part in parts: 63 | if part not in _valid_symbol_set: 64 | return None 65 | return ' '.join(parts) 66 | -------------------------------------------------------------------------------- /train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "28d9a0e5", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Install requirements" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "f5ba8906-5257-4293-960c-853b8b3c6dff", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install librosa==0.8.0\n", 19 | "!pip install pysoundfile==0.9.0.post1\n", 20 | "!pip install unidecode==1.3.4\n", 21 | "!pip install pyopenjtalk==0.2.0\n", 22 | "!pip install inflect==5.6.2\n", 23 | "!pip install janome==0.4.2" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "75498eeb", 29 | "metadata": {}, 30 | "source": [ 31 | "#### Train without pretrained model" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "d5f5590e-1817-4665-9bdf-1ff06c0f7f96", 38 | "metadata": {}, 39 | "outputs": [], 40 | 
"source": [ 41 | "!python train.py --output_directory=outdir --log_directory=logdir" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "e673209c", 47 | "metadata": {}, 48 | "source": [ 49 | "#### Train with a pretrained model" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "d5f5590e-1817-4665-9bdf-1ff06c0f7f96", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "!python train.py --output_directory=outdir --log_directory=logdir -c tacotron2_statedict.pt --warm_start" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3.10.2 64-bit", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.10.2" 80 | }, 81 | "vscode": { 82 | "interpreter": { 83 | "hash": "d99a3f7b344b3c3107482760db15f42178bfad658d282ab0a919b76809e13cb5" 84 | } 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 5 89 | } 90 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 
68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | from text import cleaners 4 | from text.symbols import symbols 5 | 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | # Regular expression matching text enclosed in curly braces: 12 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 13 | 14 | 15 | def text_to_sequence(text, cleaner_names): 16 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 17 | 18 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 19 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 20 | 21 | Args: 22 | text: string to convert to a sequence 23 | cleaner_names: names of the cleaner functions to run the text through 24 | 25 | Returns: 26 | List of integers corresponding to the symbols in the text 27 | ''' 28 | sequence = [] 29 | 30 | # Check for curly braces and treat their contents as ARPAbet: 31 | while len(text): 32 | m = _curly_re.match(text) 33 | if not m: 34 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 35 | break 36 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 37 | sequence += _arpabet_to_sequence(m.group(2)) 38 | text = m.group(3) 39 | 40 | return sequence 41 | 42 | 43 | def sequence_to_text(sequence): 44 | '''Converts a sequence of IDs back to a string''' 45 | result = '' 46 | for symbol_id in sequence: 47 | if symbol_id in _id_to_symbol: 48 | s = _id_to_symbol[symbol_id] 49 | # Enclose ARPAbet back in curly braces: 50 | if len(s) > 1 and s[0] == '@': 51 | s = '{%s}' % s[1:] 52 | result += s 53 | return result.replace('}{', ' ') 54 | 55 | 56 | def _clean_text(text, cleaner_names): 57 | for name in cleaner_names: 58 | cleaner = getattr(cleaners, name) 59 | if not cleaner: 60 | raise Exception('Unknown cleaner: %s' % name) 61 | text = cleaner(text) 62 | return text 63 | 64 | 65 | def _symbols_to_sequence(symbols): 66 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 67 | 68 | 69 | def _arpabet_to_sequence(text): 70 | return _symbols_to_sequence(['@' + s for s in text.split()]) 71 | 72 | 73 | def _should_keep_symbol(s): 74 | return s in _symbol_to_id and s is not '_' and s is not '~' 75 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from text import symbols 3 | 4 | 5 | class create_hparams(): 6 | """Create model hyperparameters. 
Parse nondefault from given string.""" 7 | ################################ 8 | # CUDA Enable # 9 | ################################ 10 | if torch.cuda.is_available() : 11 | cuda_enabled = True 12 | else : 13 | cuda_enabled = False 14 | 15 | ################################ 16 | # Experiment Parameters # 17 | ################################ 18 | epochs = 100 19 | iters_per_checkpoint = 500 20 | seed= 1234 21 | dynamic_loss_scaling = True 22 | fp16_run = False 23 | distributed_run = False 24 | dist_backend = "nccl" 25 | dist_url = "tcp://localhost:54321" 26 | cudnn_enabled = True 27 | cudnn_benchmark = False 28 | ignore_layers = ['embedding.weight'] 29 | 30 | ################################ 31 | # Data Parameters # 32 | ################################ 33 | load_mel_from_disk = False 34 | training_files = 'filelists/transcript_train.txt' 35 | validation_files = 'filelists/transcript_val.txt' 36 | text_cleaners = ['japanese_cleaners'] 37 | 38 | ################################ 39 | # Audio Parameters # 40 | ################################ 41 | max_wav_value = 32768.0 42 | sampling_rate = 22050 43 | filter_length = 1024 44 | hop_length = 256 45 | win_length = 1024 46 | n_mel_channels = 80 47 | mel_fmin = 0.0 48 | mel_fmax = 8000.0 49 | 50 | ################################ 51 | # Model Parameters # 52 | ################################ 53 | n_symbols = len(symbols) 54 | symbols_embedding_dim = 512 55 | 56 | # Encoder parameters 57 | encoder_kernel_size = 5 58 | encoder_n_convolutions = 3 59 | encoder_embedding_dim = 512 60 | 61 | # Decoder parameters 62 | n_frames_per_step = 1 # currently only 1 is supported 63 | decoder_rnn_dim = 1024 64 | prenet_dim = 256 65 | max_decoder_steps = 1000 66 | gate_threshold = 0.5 67 | p_attention_dropout = 0.1 68 | p_decoder_dropout = 0.1 69 | 70 | # Attention parameters 71 | attention_rnn_dim = 1024 72 | attention_dim = 128 73 | # Location Layer parameters 74 | attention_location_n_filters = 32 75 | attention_location_kernel_size = 31 76 | 77 | # Mel-post processing network parameters 78 | postnet_embedding_dim = 512 79 | postnet_kernel_size = 5 80 | postnet_n_convolutions = 5 81 | 82 | ################################ 83 | # Optimization Hyperparameters # 84 | ################################ 85 | use_saved_learning_rate = False 86 | learning_rate = 1e-3 87 | weight_decay = 1e-6 88 | grad_clip_thresh = 1.0 89 | batch_size = 64 90 | mask_padding = True # set model's padded outputs to padded values 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. 
By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C 94 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from librosa.filters import mel as librosa_mel_fn 3 | from audio_processing import dynamic_range_compression 4 | from audio_processing import dynamic_range_decompression 5 | from stft import STFT 6 | 7 | 8 | class LinearNorm(torch.nn.Module): 9 | def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): 10 | super(LinearNorm, self).__init__() 11 | self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) 12 | 13 | torch.nn.init.xavier_uniform_( 14 | self.linear_layer.weight, 15 | gain=torch.nn.init.calculate_gain(w_init_gain)) 16 | 17 | def forward(self, x): 18 | return self.linear_layer(x) 19 | 20 | 21 | class ConvNorm(torch.nn.Module): 22 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, 23 | padding=None, dilation=1, bias=True, w_init_gain='linear'): 24 | super(ConvNorm, self).__init__() 25 | if padding is None: 26 | assert(kernel_size % 2 == 1) 27 | padding = int(dilation * (kernel_size - 1) / 2) 28 | 29 | self.conv = torch.nn.Conv1d(in_channels, out_channels, 30 | kernel_size=kernel_size, stride=stride, 31 | padding=padding, dilation=dilation, 32 | bias=bias) 33 | 34 | torch.nn.init.xavier_uniform_( 35 | self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) 36 | 37 | def forward(self, signal): 38 | conv_signal = self.conv(signal) 39 | return conv_signal 40 | 41 | 42 | class TacotronSTFT(torch.nn.Module): 43 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 44 | n_mel_channels=80, 
sampling_rate=22050, mel_fmin=0.0, 45 | mel_fmax=8000.0): 46 | super(TacotronSTFT, self).__init__() 47 | self.n_mel_channels = n_mel_channels 48 | self.sampling_rate = sampling_rate 49 | self.stft_fn = STFT(filter_length, hop_length, win_length) 50 | mel_basis = librosa_mel_fn( 51 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 52 | mel_basis = torch.from_numpy(mel_basis).float() 53 | self.register_buffer('mel_basis', mel_basis) 54 | 55 | def spectral_normalize(self, magnitudes): 56 | output = dynamic_range_compression(magnitudes) 57 | return output 58 | 59 | def spectral_de_normalize(self, magnitudes): 60 | output = dynamic_range_decompression(magnitudes) 61 | return output 62 | 63 | def mel_spectrogram(self, y): 64 | """Computes mel-spectrograms from a batch of waves 65 | PARAMS 66 | ------ 67 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 68 | 69 | RETURNS 70 | ------- 71 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 72 | """ 73 | assert(torch.min(y.data) >= -1) 74 | assert(torch.max(y.data) <= 1) 75 | 76 | magnitudes, phases = self.stft_fn.transform(y) 77 | magnitudes = magnitudes.data 78 | mel_output = torch.matmul(self.mel_basis, magnitudes) 79 | mel_output = self.spectral_normalize(mel_output) 80 | return mel_output 81 | -------------------------------------------------------------------------------- /waveglow/convert_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | import torch 4 | 5 | def _check_model_old_version(model): 6 | if hasattr(model.WN[0], 'res_layers') or hasattr(model.WN[0], 'cond_layers'): 7 | return True 8 | else: 9 | return False 10 | 11 | 12 | def _update_model_res_skip(old_model, new_model): 13 | for idx in range(0, len(new_model.WN)): 14 | wavenet = new_model.WN[idx] 15 | n_channels = wavenet.n_channels 16 | n_layers = wavenet.n_layers 17 | wavenet.res_skip_layers = torch.nn.ModuleList() 18 | for i in range(0, n_layers): 19 | if i < n_layers - 1: 20 | res_skip_channels = 2*n_channels 21 | else: 22 | res_skip_channels = n_channels 23 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 24 | skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i]) 25 | if i < n_layers - 1: 26 | res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i]) 27 | res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight])) 28 | res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias])) 29 | else: 30 | res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight) 31 | res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias) 32 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 33 | wavenet.res_skip_layers.append(res_skip_layer) 34 | del wavenet.res_layers 35 | del wavenet.skip_layers 36 | 37 | def _update_model_cond(old_model, new_model): 38 | for idx in range(0, len(new_model.WN)): 39 | wavenet = new_model.WN[idx] 40 | n_channels = wavenet.n_channels 41 | n_layers = wavenet.n_layers 42 | n_mel_channels = wavenet.cond_layers[0].weight.shape[1] 43 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1) 44 | cond_layer_weight = [] 45 | cond_layer_bias = [] 46 | for i in range(0, n_layers): 47 | _cond_layer = torch.nn.utils.remove_weight_norm(wavenet.cond_layers[i]) 48 | cond_layer_weight.append(_cond_layer.weight) 49 | cond_layer_bias.append(_cond_layer.bias) 50 | cond_layer.weight = 
torch.nn.Parameter(torch.cat(cond_layer_weight)) 51 | cond_layer.bias = torch.nn.Parameter(torch.cat(cond_layer_bias)) 52 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 53 | wavenet.cond_layer = cond_layer 54 | del wavenet.cond_layers 55 | 56 | def update_model(old_model): 57 | if not _check_model_old_version(old_model): 58 | return old_model 59 | new_model = copy.deepcopy(old_model) 60 | if hasattr(old_model.WN[0], 'res_layers'): 61 | _update_model_res_skip(old_model, new_model) 62 | if hasattr(old_model.WN[0], 'cond_layers'): 63 | _update_model_cond(old_model, new_model) 64 | return new_model 65 | 66 | if __name__ == '__main__': 67 | old_model_path = sys.argv[1] 68 | new_model_path = sys.argv[2] 69 | model = torch.load(old_model_path, map_location='cpu') 70 | model['model'] = update_model(model['model']) 71 | torch.save(model, new_model_path) 72 | 73 | -------------------------------------------------------------------------------- /waveglow/README.md: -------------------------------------------------------------------------------- 1 | ![WaveGlow](waveglow_logo.png "WaveGLow") 2 | 3 | ## WaveGlow: a Flow-based Generative Network for Speech Synthesis 4 | 5 | ### Ryan Prenger, Rafael Valle, and Bryan Catanzaro 6 | 7 | In our recent [paper], we propose WaveGlow: a flow-based network capable of 8 | generating high quality speech from mel-spectrograms. WaveGlow combines insights 9 | from [Glow] and [WaveNet] in order to provide fast, efficient and high-quality 10 | audio synthesis, without the need for auto-regression. WaveGlow is implemented 11 | using only a single network, trained using only a single cost function: 12 | maximizing the likelihood of the training data, which makes the training 13 | procedure simple and stable. 14 | 15 | Our [PyTorch] implementation produces audio samples at a rate of 1200 16 | kHz on an NVIDIA V100 GPU. Mean Opinion Scores show that it delivers audio 17 | quality as good as the best publicly available WaveNet implementation. 18 | 19 | Visit our [website] for audio samples. 20 | 21 | ## Setup 22 | 23 | 1. Clone our repo and initialize submodule 24 | 25 | ```command 26 | git clone https://github.com/NVIDIA/waveglow.git 27 | cd waveglow 28 | git submodule init 29 | git submodule update 30 | ``` 31 | 32 | 2. Install requirements `pip3 install -r requirements.txt` 33 | 34 | 3. Install [Apex] 35 | 36 | 37 | ## Generate audio with our pre-existing model 38 | 39 | 1. Download our [published model] 40 | 2. Download [mel-spectrograms] 41 | 3. Generate audio `python3 inference.py -f <(ls mel_spectrograms/*.pt) -w waveglow_256channels.pt -o . --is_fp16 -s 0.6` 42 | 43 | N.b. use `convert_model.py` to convert your older models to the current model 44 | with fused residual and skip connections. 45 | 46 | ## Train your own model 47 | 48 | 1. Download [LJ Speech Data]. In this example it's in `data/` 49 | 50 | 2. Make a list of the file names to use for training/testing 51 | 52 | ```command 53 | ls data/*.wav | tail -n+10 > train_files.txt 54 | ls data/*.wav | head -n10 > test_files.txt 55 | ``` 56 | 57 | 3. Train your WaveGlow networks 58 | 59 | ```command 60 | mkdir checkpoints 61 | python train.py -c config.json 62 | ``` 63 | 64 | For multi-GPU training replace `train.py` with `distributed.py`. Only tested with single node and NCCL. 65 | 66 | For mixed precision training set `"fp16_run": true` on `config.json`. 67 | 68 | 4. Make test set mel-spectrograms 69 | 70 | `python mel2samp.py -f test_files.txt -o . -c config.json` 71 | 72 | 5. 
Do inference with your network 73 | 74 | ```command 75 | ls *.pt > mel_files.txt 76 | python3 inference.py -f mel_files.txt -w checkpoints/waveglow_10000 -o . --is_fp16 -s 0.6 77 | ``` 78 | 79 | [//]: # (TODO) 80 | [//]: # (PROVIDE INSTRUCTIONS FOR DOWNLOADING LJS) 81 | [pytorch 1.0]: https://github.com/pytorch/pytorch#installation 82 | [website]: https://nv-adlr.github.io/WaveGlow 83 | [paper]: https://arxiv.org/abs/1811.00002 84 | [WaveNet implementation]: https://github.com/r9y9/wavenet_vocoder 85 | [Glow]: https://blog.openai.com/glow/ 86 | [WaveNet]: https://deepmind.com/blog/wavenet-generative-model-raw-audio/ 87 | [PyTorch]: http://pytorch.org 88 | [published model]: https://drive.google.com/open?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF 89 | [mel-spectrograms]: https://drive.google.com/file/d/1g_VXK2lpP9J25dQFhQwx7doWl_p20fXA/view?usp=sharing 90 | [LJ Speech Data]: https://keithito.com/LJ-Speech-Dataset 91 | [Apex]: https://github.com/nvidia/apex 92 | -------------------------------------------------------------------------------- /waveglow/inference.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | import os 28 | from scipy.io.wavfile import write 29 | import torch 30 | from mel2samp import files_to_list, MAX_WAV_VALUE 31 | from denoiser import Denoiser 32 | 33 | 34 | def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16, 35 | denoiser_strength): 36 | mel_files = files_to_list(mel_files) 37 | waveglow = torch.load(waveglow_path)['model'] 38 | waveglow = waveglow.remove_weightnorm(waveglow) 39 | waveglow.cuda().eval() 40 | if is_fp16: 41 | from apex import amp 42 | waveglow, _ = amp.initialize(waveglow, [], opt_level="O3") 43 | 44 | if denoiser_strength > 0: 45 | denoiser = Denoiser(waveglow).cuda() 46 | 47 | for i, file_path in enumerate(mel_files): 48 | file_name = os.path.splitext(os.path.basename(file_path))[0] 49 | mel = torch.load(file_path) 50 | mel = torch.autograd.Variable(mel.cuda()) 51 | mel = torch.unsqueeze(mel, 0) 52 | mel = mel.half() if is_fp16 else mel 53 | with torch.no_grad(): 54 | audio = waveglow.infer(mel, sigma=sigma) 55 | if denoiser_strength > 0: 56 | audio = denoiser(audio, denoiser_strength) 57 | audio = audio * MAX_WAV_VALUE 58 | audio = audio.squeeze() 59 | audio = audio.cpu().numpy() 60 | audio = audio.astype('int16') 61 | audio_path = os.path.join( 62 | output_dir, "{}_synthesis.wav".format(file_name)) 63 | write(audio_path, sampling_rate, audio) 64 | print(audio_path) 65 | 66 | 67 | if __name__ == "__main__": 68 | import argparse 69 | 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('-f', "--filelist_path", required=True) 72 | parser.add_argument('-w', '--waveglow_path', 73 | help='Path to waveglow decoder checkpoint with model') 74 | parser.add_argument('-o', "--output_dir", required=True) 75 | parser.add_argument("-s", "--sigma", default=1.0, type=float) 76 | parser.add_argument("--sampling_rate", default=22050, type=int) 77 | parser.add_argument("--is_fp16", action="store_true") 78 | parser.add_argument("-d", "--denoiser_strength", default=0.0, type=float, 79 | help='Removes model bias. 
Start with 0.1 and adjust') 80 | 81 | args = parser.parse_args() 82 | 83 | main(args.filelist_path, args.waveglow_path, args.sigma, args.output_dir, 84 | args.sampling_rate, args.is_fp16, args.denoiser_strength) 85 | -------------------------------------------------------------------------------- /loss_scaler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class LossScaler: 4 | 5 | def __init__(self, scale=1): 6 | self.cur_scale = scale 7 | 8 | # `params` is a list / generator of torch.Variable 9 | def has_overflow(self, params): 10 | return False 11 | 12 | # `x` is a torch.Tensor 13 | def _has_inf_or_nan(x): 14 | return False 15 | 16 | # `overflow` is boolean indicating whether we overflowed in gradient 17 | def update_scale(self, overflow): 18 | pass 19 | 20 | @property 21 | def loss_scale(self): 22 | return self.cur_scale 23 | 24 | def scale_gradient(self, module, grad_in, grad_out): 25 | return tuple(self.loss_scale * g for g in grad_in) 26 | 27 | def backward(self, loss): 28 | scaled_loss = loss*self.loss_scale 29 | scaled_loss.backward() 30 | 31 | class DynamicLossScaler: 32 | 33 | def __init__(self, 34 | init_scale=2**32, 35 | scale_factor=2., 36 | scale_window=1000): 37 | self.cur_scale = init_scale 38 | self.cur_iter = 0 39 | self.last_overflow_iter = -1 40 | self.scale_factor = scale_factor 41 | self.scale_window = scale_window 42 | 43 | # `params` is a list / generator of torch.Variable 44 | def has_overflow(self, params): 45 | # return False 46 | for p in params: 47 | if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): 48 | return True 49 | 50 | return False 51 | 52 | # `x` is a torch.Tensor 53 | def _has_inf_or_nan(x): 54 | cpu_sum = float(x.float().sum()) 55 | if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: 56 | return True 57 | return False 58 | 59 | # `overflow` is boolean indicating whether we overflowed in gradient 60 | def update_scale(self, overflow): 61 | if overflow: 62 | #self.cur_scale /= self.scale_factor 63 | self.cur_scale = max(self.cur_scale/self.scale_factor, 1) 64 | self.last_overflow_iter = self.cur_iter 65 | else: 66 | if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: 67 | self.cur_scale *= self.scale_factor 68 | # self.cur_scale = 1 69 | self.cur_iter += 1 70 | 71 | @property 72 | def loss_scale(self): 73 | return self.cur_scale 74 | 75 | def scale_gradient(self, module, grad_in, grad_out): 76 | return tuple(self.loss_scale * g for g in grad_in) 77 | 78 | def backward(self, loss): 79 | scaled_loss = loss*self.loss_scale 80 | scaled_loss.backward() 81 | 82 | ############################################################## 83 | # Example usage below here -- assuming it's in a separate file 84 | ############################################################## 85 | if __name__ == "__main__": 86 | import torch 87 | from torch.autograd import Variable 88 | from dynamic_loss_scaler import DynamicLossScaler 89 | 90 | # N is batch size; D_in is input dimension; 91 | # H is hidden dimension; D_out is output dimension. 92 | N, D_in, H, D_out = 64, 1000, 100, 10 93 | 94 | # Create random Tensors to hold inputs and outputs, and wrap them in Variables. 
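    # (torch.autograd.Variable is a legacy wrapper; on current PyTorch it simply returns a
    # Tensor, so creating the tensors with requires_grad directly would behave the same.)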
95 | x = Variable(torch.randn(N, D_in), requires_grad=False) 96 | y = Variable(torch.randn(N, D_out), requires_grad=False) 97 | 98 | w1 = Variable(torch.randn(D_in, H), requires_grad=True) 99 | w2 = Variable(torch.randn(H, D_out), requires_grad=True) 100 | parameters = [w1, w2] 101 | 102 | learning_rate = 1e-6 103 | optimizer = torch.optim.SGD(parameters, lr=learning_rate) 104 | loss_scaler = DynamicLossScaler() 105 | 106 | for t in range(500): 107 | y_pred = x.mm(w1).clamp(min=0).mm(w2) 108 | loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale 109 | print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) 110 | print('Iter {} scaled loss: {}'.format(t, loss.data[0])) 111 | print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) 112 | 113 | # Run backprop 114 | optimizer.zero_grad() 115 | loss.backward() 116 | 117 | # Check for overflow 118 | has_overflow = DynamicLossScaler.has_overflow(parameters) 119 | 120 | # If no overflow, unscale grad and update as usual 121 | if not has_overflow: 122 | for param in parameters: 123 | param.grad.data.mul_(1. / loss_scaler.loss_scale) 124 | optimizer.step() 125 | # Otherwise, don't do anything -- ie, skip iteration 126 | else: 127 | print('OVERFLOW!') 128 | 129 | # Update loss scale for next iteration 130 | loss_scaler.update_scale(has_overflow) 131 | 132 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | import torch.utils.data 5 | 6 | import layers 7 | from utils import load_wav_to_torch, load_filepaths_and_text 8 | from text import text_to_sequence 9 | 10 | 11 | class TextMelLoader(torch.utils.data.Dataset): 12 | """ 13 | 1) loads audio,text pairs 14 | 2) normalizes text and converts them to sequences of one-hot vectors 15 | 3) computes mel-spectrograms from audio files. 
16 | """ 17 | def __init__(self, audiopaths_and_text, hparams): 18 | self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) 19 | self.text_cleaners = hparams.text_cleaners 20 | self.max_wav_value = hparams.max_wav_value 21 | self.sampling_rate = hparams.sampling_rate 22 | self.load_mel_from_disk = hparams.load_mel_from_disk 23 | self.stft = layers.TacotronSTFT( 24 | hparams.filter_length, hparams.hop_length, hparams.win_length, 25 | hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, 26 | hparams.mel_fmax) 27 | random.seed(hparams.seed) 28 | random.shuffle(self.audiopaths_and_text) 29 | 30 | def get_mel_text_pair(self, audiopath_and_text): 31 | # separate filename and text 32 | audiopath, text = audiopath_and_text[0], audiopath_and_text[1] 33 | text = self.get_text(text) 34 | mel = self.get_mel(audiopath) 35 | return (text, mel) 36 | 37 | def get_mel(self, filename): 38 | if not self.load_mel_from_disk: 39 | audio, sampling_rate = load_wav_to_torch(filename) 40 | if sampling_rate != self.stft.sampling_rate: 41 | raise ValueError("{} {} SR doesn't match target {} SR".format( 42 | sampling_rate, self.stft.sampling_rate)) 43 | audio_norm = audio / self.max_wav_value 44 | audio_norm = audio_norm.unsqueeze(0) 45 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 46 | melspec = self.stft.mel_spectrogram(audio_norm) 47 | melspec = torch.squeeze(melspec, 0) 48 | else: 49 | melspec = torch.from_numpy(np.load(filename)) 50 | assert melspec.size(0) == self.stft.n_mel_channels, ( 51 | 'Mel dimension mismatch: given {}, expected {}'.format( 52 | melspec.size(0), self.stft.n_mel_channels)) 53 | 54 | return melspec 55 | 56 | def get_text(self, text): 57 | text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners)) 58 | return text_norm 59 | 60 | def __getitem__(self, index): 61 | return self.get_mel_text_pair(self.audiopaths_and_text[index]) 62 | 63 | def __len__(self): 64 | return len(self.audiopaths_and_text) 65 | 66 | 67 | class TextMelCollate(): 68 | """ Zero-pads model inputs and targets based on number of frames per setep 69 | """ 70 | def __init__(self, n_frames_per_step): 71 | self.n_frames_per_step = n_frames_per_step 72 | 73 | def __call__(self, batch): 74 | """Collate's training batch from normalized text and mel-spectrogram 75 | PARAMS 76 | ------ 77 | batch: [text_normalized, mel_normalized] 78 | """ 79 | # Right zero-pad all one-hot text sequences to max input length 80 | input_lengths, ids_sorted_decreasing = torch.sort( 81 | torch.LongTensor([len(x[0]) for x in batch]), 82 | dim=0, descending=True) 83 | max_input_len = input_lengths[0] 84 | 85 | text_padded = torch.LongTensor(len(batch), max_input_len) 86 | text_padded.zero_() 87 | for i in range(len(ids_sorted_decreasing)): 88 | text = batch[ids_sorted_decreasing[i]][0] 89 | text_padded[i, :text.size(0)] = text 90 | 91 | # Right zero-pad mel-spec 92 | num_mels = batch[0][1].size(0) 93 | max_target_len = max([x[1].size(1) for x in batch]) 94 | if max_target_len % self.n_frames_per_step != 0: 95 | max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step 96 | assert max_target_len % self.n_frames_per_step == 0 97 | 98 | # include mel padded and gate padded 99 | mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) 100 | mel_padded.zero_() 101 | gate_padded = torch.FloatTensor(len(batch), max_target_len) 102 | gate_padded.zero_() 103 | output_lengths = torch.LongTensor(len(batch)) 104 | for i in range(len(ids_sorted_decreasing)): 105 | mel 
= batch[ids_sorted_decreasing[i]][1] 106 | mel_padded[i, :, :mel.size(1)] = mel 107 | gate_padded[i, mel.size(1)-1:] = 1 108 | output_lengths[i] = mel.size(1) 109 | 110 | return text_padded, input_lengths, mel_padded, gate_padded, \ 111 | output_lengths 112 | -------------------------------------------------------------------------------- /waveglow/mel2samp.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # *****************************************************************************\ 27 | import os 28 | import random 29 | import argparse 30 | import json 31 | import torch 32 | import torch.utils.data 33 | import sys 34 | from scipy.io.wavfile import read 35 | 36 | # We're using the audio processing from TacoTron2 to make sure it matches 37 | sys.path.insert(0, 'tacotron2') 38 | from tacotron2.layers import TacotronSTFT 39 | 40 | MAX_WAV_VALUE = 32768.0 41 | 42 | def files_to_list(filename): 43 | """ 44 | Takes a text file of filenames and makes a list of filenames 45 | """ 46 | with open(filename, encoding='utf-8') as f: 47 | files = f.readlines() 48 | 49 | files = [f.rstrip() for f in files] 50 | return files 51 | 52 | def load_wav_to_torch(full_path): 53 | """ 54 | Loads wavdata into torch array 55 | """ 56 | sampling_rate, data = read(full_path) 57 | return torch.from_numpy(data).float(), sampling_rate 58 | 59 | 60 | class Mel2Samp(torch.utils.data.Dataset): 61 | """ 62 | This is the main class that calculates the spectrogram and returns the 63 | spectrogram, audio pair. 
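Before moving on to the WaveGlow data pipeline, here is an illustrative sketch (not repository code) of how the `TextMelLoader` / `TextMelCollate` pair from `data_utils.py` above is typically wired into a `DataLoader`; it mirrors `prepare_dataloaders()` in `train.py` and assumes the default hparams object.

```python
from torch.utils.data import DataLoader
from hparams import create_hparams          # assumed helper from hparams.py
from data_utils import TextMelLoader, TextMelCollate

hparams = create_hparams()
trainset = TextMelLoader(hparams.training_files, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)

train_loader = DataLoader(trainset, num_workers=1, shuffle=True,
                          batch_size=hparams.batch_size, drop_last=True,
                          collate_fn=collate_fn)

# One padded batch, in the order produced by TextMelCollate.__call__:
text_padded, input_lengths, mel_padded, gate_padded, output_lengths = \
    next(iter(train_loader))
```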
64 | """ 65 | def __init__(self, training_files, segment_length, filter_length, 66 | hop_length, win_length, sampling_rate, mel_fmin, mel_fmax): 67 | self.audio_files = files_to_list(training_files) 68 | random.seed(1234) 69 | random.shuffle(self.audio_files) 70 | self.stft = TacotronSTFT(filter_length=filter_length, 71 | hop_length=hop_length, 72 | win_length=win_length, 73 | sampling_rate=sampling_rate, 74 | mel_fmin=mel_fmin, mel_fmax=mel_fmax) 75 | self.segment_length = segment_length 76 | self.sampling_rate = sampling_rate 77 | 78 | def get_mel(self, audio): 79 | audio_norm = audio / MAX_WAV_VALUE 80 | audio_norm = audio_norm.unsqueeze(0) 81 | audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) 82 | melspec = self.stft.mel_spectrogram(audio_norm) 83 | melspec = torch.squeeze(melspec, 0) 84 | return melspec 85 | 86 | def __getitem__(self, index): 87 | # Read audio 88 | filename = self.audio_files[index] 89 | audio, sampling_rate = load_wav_to_torch(filename) 90 | if sampling_rate != self.sampling_rate: 91 | raise ValueError("{} SR doesn't match target {} SR".format( 92 | sampling_rate, self.sampling_rate)) 93 | 94 | # Take segment 95 | if audio.size(0) >= self.segment_length: 96 | max_audio_start = audio.size(0) - self.segment_length 97 | audio_start = random.randint(0, max_audio_start) 98 | audio = audio[audio_start:audio_start+self.segment_length] 99 | else: 100 | audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data 101 | 102 | mel = self.get_mel(audio) 103 | audio = audio / MAX_WAV_VALUE 104 | 105 | return (mel, audio) 106 | 107 | def __len__(self): 108 | return len(self.audio_files) 109 | 110 | # =================================================================== 111 | # Takes directory of clean audio and makes directory of spectrograms 112 | # Useful for making test sets 113 | # =================================================================== 114 | if __name__ == "__main__": 115 | # Get defaults so it can work with no Sacred 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('-f', "--filelist_path", required=True) 118 | parser.add_argument('-c', '--config', type=str, 119 | help='JSON file for configuration') 120 | parser.add_argument('-o', '--output_dir', type=str, 121 | help='Output directory') 122 | args = parser.parse_args() 123 | 124 | with open(args.config) as f: 125 | data = f.read() 126 | data_config = json.loads(data)["data_config"] 127 | mel2samp = Mel2Samp(**data_config) 128 | 129 | filepaths = files_to_list(args.filelist_path) 130 | 131 | # Make directory if it doesn't exist 132 | if not os.path.isdir(args.output_dir): 133 | os.makedirs(args.output_dir) 134 | os.chmod(args.output_dir, 0o775) 135 | 136 | for filepath in filepaths: 137 | audio, sr = load_wav_to_torch(filepath) 138 | melspectrogram = mel2samp.get_mel(audio) 139 | filename = os.path.basename(filepath) 140 | new_filepath = args.output_dir + '/' + filename + '.pt' 141 | print(new_filepath) 142 | torch.save(melspectrogram, new_filepath) 143 | -------------------------------------------------------------------------------- /stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 
6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from audio_processing import window_sumsquare 40 | 41 | 42 | class STFT(torch.nn.Module): 43 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 44 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 45 | window='hann'): 46 | super(STFT, self).__init__() 47 | self.filter_length = filter_length 48 | self.hop_length = hop_length 49 | self.win_length = win_length 50 | self.window = window 51 | self.forward_transform = None 52 | scale = self.filter_length / self.hop_length 53 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 54 | 55 | cutoff = int((self.filter_length / 2 + 1)) 56 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 57 | np.imag(fourier_basis[:cutoff, :])]) 58 | 59 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 60 | inverse_basis = torch.FloatTensor( 61 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 62 | 63 | if window is not None: 64 | assert(filter_length >= win_length) 65 | # get window and zero center pad it to filter_length 66 | fft_window = get_window(window, win_length, fftbins=True) 67 | fft_window = pad_center(fft_window, filter_length) 68 | fft_window = torch.from_numpy(fft_window).float() 69 | 70 | # window the bases 71 | forward_basis *= fft_window 72 | inverse_basis *= fft_window 73 | 74 | self.register_buffer('forward_basis', forward_basis.float()) 75 | self.register_buffer('inverse_basis', inverse_basis.float()) 76 | 77 | def transform(self, input_data): 78 | num_batches = input_data.size(0) 79 | num_samples = input_data.size(1) 80 | 81 | self.num_samples = num_samples 82 | 83 | # similar to librosa, reflect-pad the input 84 | input_data = input_data.view(num_batches, 1, num_samples) 85 | input_data = F.pad( 86 | input_data.unsqueeze(1), 
87 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 88 | mode='reflect') 89 | input_data = input_data.squeeze(1) 90 | 91 | forward_transform = F.conv1d( 92 | input_data, 93 | Variable(self.forward_basis, requires_grad=False), 94 | stride=self.hop_length, 95 | padding=0) 96 | 97 | cutoff = int((self.filter_length / 2) + 1) 98 | real_part = forward_transform[:, :cutoff, :] 99 | imag_part = forward_transform[:, cutoff:, :] 100 | 101 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 102 | phase = torch.autograd.Variable( 103 | torch.atan2(imag_part.data, real_part.data)) 104 | 105 | return magnitude, phase 106 | 107 | def inverse(self, magnitude, phase): 108 | recombine_magnitude_phase = torch.cat( 109 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 110 | 111 | inverse_transform = F.conv_transpose1d( 112 | recombine_magnitude_phase, 113 | Variable(self.inverse_basis, requires_grad=False), 114 | stride=self.hop_length, 115 | padding=0) 116 | 117 | if self.window is not None: 118 | window_sum = window_sumsquare( 119 | self.window, magnitude.size(-1), hop_length=self.hop_length, 120 | win_length=self.win_length, n_fft=self.filter_length, 121 | dtype=np.float32) 122 | # remove modulation effects 123 | approx_nonzero_indices = torch.from_numpy( 124 | np.where(window_sum > tiny(window_sum))[0]) 125 | window_sum = torch.autograd.Variable( 126 | torch.from_numpy(window_sum), requires_grad=False) 127 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 128 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 129 | 130 | # scale by hop ratio 131 | inverse_transform *= float(self.filter_length) / self.hop_length 132 | 133 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 134 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 135 | 136 | return inverse_transform 137 | 138 | def forward(self, input_data): 139 | self.magnitude, self.phase = self.transform(input_data) 140 | reconstruction = self.inverse(self.magnitude, self.phase) 141 | return reconstruction 142 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from .numbers import normalize_numbers 18 | import pyopenjtalk 19 | from janome.tokenizer import Tokenizer 20 | 21 | 22 | # Regular expression matching whitespace: 23 | _whitespace_re = re.compile(r'\s+') 24 | 25 | # List of (regular expression, replacement) pairs for abbreviations: 26 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 27 | ('mrs', 'misess'), 28 | ('mr', 'mister'), 29 | ('dr', 'doctor'), 30 | ('st', 'saint'), 31 | ('co', 'company'), 32 | ('jr', 'junior'), 33 | ('maj', 'major'), 34 | ('gen', 'general'), 35 | ('drs', 'doctors'), 36 | ('rev', 'reverend'), 37 | ('lt', 'lieutenant'), 38 | ('hon', 'honorable'), 39 | ('sgt', 'sergeant'), 40 | ('capt', 'captain'), 41 | ('esq', 'esquire'), 42 | ('ltd', 'limited'), 43 | ('col', 'colonel'), 44 | ('ft', 'fort'), 45 | ]] 46 | 47 | # Regular expression matching Japanese without punctuation marks: 48 | _japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 49 | 50 | # Regular expression matching non-Japanese characters or punctuation marks: 51 | _japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 52 | 53 | 54 | # Tokenizer for Japanese 55 | tokenizer = Tokenizer() 56 | 57 | 58 | def expand_abbreviations(text): 59 | for regex, replacement in _abbreviations: 60 | text = re.sub(regex, replacement, text) 61 | return text 62 | 63 | 64 | def expand_numbers(text): 65 | return normalize_numbers(text) 66 | 67 | 68 | def lowercase(text): 69 | return text.lower() 70 | 71 | 72 | def collapse_whitespace(text): 73 | return re.sub(_whitespace_re, ' ', text) 74 | 75 | 76 | def convert_to_ascii(text): 77 | return unidecode(text) 78 | 79 | 80 | def basic_cleaners(text): 81 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 82 | text = lowercase(text) 83 | text = collapse_whitespace(text) 84 | return text 85 | 86 | 87 | def transliteration_cleaners(text): 88 | '''Pipeline for non-English text that transliterates to ASCII.''' 89 | text = convert_to_ascii(text) 90 | text = lowercase(text) 91 | text = collapse_whitespace(text) 92 | return text 93 | 94 | 95 | def english_cleaners(text): 96 | '''Pipeline for English text, including number and abbreviation expansion.''' 97 | text = convert_to_ascii(text) 98 | text = lowercase(text) 99 | text = expand_numbers(text) 100 | text = expand_abbreviations(text) 101 | text = collapse_whitespace(text) 102 | return text 103 | 104 | 105 | def japanese_cleaners(text): 106 | '''Pipeline for Japanese text.''' 107 | sentences = re.split(_japanese_marks, text) 108 | marks = re.findall(_japanese_marks, text) 109 | text = '' 110 | for i, mark in enumerate(marks): 111 | if re.match(_japanese_characters, sentences[i]): 112 | text += pyopenjtalk.g2p(sentences[i], kana=False).replace('pau','').replace(' ','') 113 | text += unidecode(mark).replace(' ','') 114 | if re.match(_japanese_characters, sentences[-1]): 115 | text += pyopenjtalk.g2p(sentences[-1], kana=False).replace('pau','').replace(' ','') 116 | if re.match('[A-Za-z]',text[-1]): 117 | text += '.' 118 | return text 119 | 120 | 121 | def japanese_tokenization_cleaners(text): 122 | '''Pipeline for tokenizing Japanese text.''' 123 | words = [] 124 | for token in tokenizer.tokenize(text): 125 | if token.phonetic!='*': 126 | words.append(token.phonetic) 127 | else: 128 | words.append(token.surface) 129 | text = '' 130 | for word in words: 131 | if re.match(_japanese_characters, word): 132 | if word[0] == '\u30fc': 133 | continue 134 | if len(text)>0: 135 | text += ' ' 136 | text += pyopenjtalk.g2p(word, kana=False).replace(' ','') 137 | else: 138 | text += unidecode(word).replace(' ','') 139 | if re.match('[A-Za-z]',text[-1]): 140 | text += '.' 
141 | return text 142 | 143 | 144 | def japanese_accent_cleaners(text): 145 | '''Pipeline for notating accent in Japanese text.''' 146 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 147 | sentences = re.split(_japanese_marks, text) 148 | marks = re.findall(_japanese_marks, text) 149 | text = '' 150 | for i, sentence in enumerate(sentences): 151 | if re.match(_japanese_characters, sentence): 152 | text += ':' 153 | labels = pyopenjtalk.extract_fullcontext(sentence) 154 | for n, label in enumerate(labels): 155 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 156 | if phoneme not in ['sil','pau']: 157 | text += phoneme 158 | else: 159 | continue 160 | n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 161 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 162 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 163 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 164 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil','pau']: 165 | a2_next=-1 166 | else: 167 | a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 168 | # Accent phrase boundary 169 | if a3 == 1 and a2_next == 1: 170 | text += ' ' 171 | # Falling 172 | elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras: 173 | text += ')' 174 | # Rising 175 | elif a2 == 1 and a2_next == 2: 176 | text += '(' 177 | if i 0: 108 | # cross-node buffer sync 109 | flat_buffers = _flatten_dense_tensors(buffers) 110 | dist.broadcast(flat_buffers, 0) 111 | for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): 112 | buf.copy_(synced) 113 | def train(self, mode=True): 114 | # Clear NCCL communicator and CUDA event cache of the default group ID, 115 | # These cache will be recreated at the later call. This is currently a 116 | # work-around for a potential NCCL deadlock. 117 | if dist._backend == dist.dist_backend.NCCL: 118 | dist._clear_group_cache() 119 | super(DistributedDataParallel, self).train(mode) 120 | self.module.train(mode) 121 | ''' 122 | ''' 123 | Modifies existing model to do gradient allreduce, but doesn't change class 124 | so you don't need "module" 125 | ''' 126 | def apply_gradient_allreduce(module): 127 | if not hasattr(dist, '_backend'): 128 | module.warn_on_half = True 129 | else: 130 | module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 131 | 132 | for p in module.state_dict().values(): 133 | if not torch.is_tensor(p): 134 | continue 135 | dist.broadcast(p, 0) 136 | 137 | def allreduce_params(): 138 | if(module.needs_reduction): 139 | module.needs_reduction = False 140 | buckets = {} 141 | for param in module.parameters(): 142 | if param.requires_grad and param.grad is not None: 143 | tp = param.data.dtype 144 | if tp not in buckets: 145 | buckets[tp] = [] 146 | buckets[tp].append(param) 147 | if module.warn_on_half: 148 | if torch.cuda.HalfTensor in buckets: 149 | print("WARNING: gloo dist backend for half parameters may be extremely slow." + 150 | " It is recommended to use the NCCL backend in this case. 
This currently requires" + 151 | "PyTorch built from top of tree master.") 152 | module.warn_on_half = False 153 | 154 | for tp in buckets: 155 | bucket = buckets[tp] 156 | grads = [param.grad.data for param in bucket] 157 | coalesced = _flatten_dense_tensors(grads) 158 | dist.all_reduce(coalesced) 159 | coalesced /= dist.get_world_size() 160 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 161 | buf.copy_(synced) 162 | 163 | for param in list(module.parameters()): 164 | def allreduce_hook(*unused): 165 | Variable._execution_engine.queue_callback(allreduce_params) 166 | if param.requires_grad: 167 | param.register_hook(allreduce_hook) 168 | 169 | def set_needs_reduction(self, input, output): 170 | self.needs_reduction = True 171 | 172 | module.register_forward_hook(set_needs_reduction) 173 | return module 174 | -------------------------------------------------------------------------------- /waveglow/distributed.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import os 28 | import sys 29 | import time 30 | import subprocess 31 | import argparse 32 | 33 | import torch 34 | import torch.distributed as dist 35 | from torch.autograd import Variable 36 | 37 | def reduce_tensor(tensor, num_gpus): 38 | rt = tensor.clone() 39 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 40 | rt /= num_gpus 41 | return rt 42 | 43 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url): 44 | assert torch.cuda.is_available(), "Distributed mode requires CUDA." 45 | print("Initializing Distributed") 46 | 47 | # Set cuda device so everything is done on the right GPU. 
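For context, a sketch of how `waveglow/train.py` combines these distributed helpers when several GPUs are visible; `rank`, `group_name`, `dist_config` and `waveglow_config` are assumed to come from its command line and `config.json`, so this is illustrative only.

```python
import torch
from distributed import init_distributed, apply_gradient_allreduce
from glow import WaveGlow

num_gpus = torch.cuda.device_count()
if num_gpus > 1:
    init_distributed(rank, num_gpus, group_name, **dist_config)

model = WaveGlow(**waveglow_config).cuda()
if num_gpus > 1:
    # broadcasts parameters from rank 0 and registers gradient all-reduce hooks
    model = apply_gradient_allreduce(model)
```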
48 | torch.cuda.set_device(rank % torch.cuda.device_count()) 49 | 50 | # Initialize distributed communication 51 | dist.init_process_group(dist_backend, init_method=dist_url, 52 | world_size=num_gpus, rank=rank, 53 | group_name=group_name) 54 | 55 | def _flatten_dense_tensors(tensors): 56 | """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of 57 | same dense type. 58 | Since inputs are dense, the resulting tensor will be a concatenated 1D 59 | buffer. Element-wise operation on this buffer will be equivalent to 60 | operating individually. 61 | Arguments: 62 | tensors (Iterable[Tensor]): dense tensors to flatten. 63 | Returns: 64 | A contiguous 1D buffer containing input tensors. 65 | """ 66 | if len(tensors) == 1: 67 | return tensors[0].contiguous().view(-1) 68 | flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0) 69 | return flat 70 | 71 | def _unflatten_dense_tensors(flat, tensors): 72 | """View a flat buffer using the sizes of tensors. Assume that tensors are of 73 | same dense type, and that flat is given by _flatten_dense_tensors. 74 | Arguments: 75 | flat (Tensor): flattened dense tensors to unflatten. 76 | tensors (Iterable[Tensor]): dense tensors whose sizes will be used to 77 | unflatten flat. 78 | Returns: 79 | Unflattened dense tensors with sizes same as tensors and values from 80 | flat. 81 | """ 82 | outputs = [] 83 | offset = 0 84 | for tensor in tensors: 85 | numel = tensor.numel() 86 | outputs.append(flat.narrow(0, offset, numel).view_as(tensor)) 87 | offset += numel 88 | return tuple(outputs) 89 | 90 | def apply_gradient_allreduce(module): 91 | """ 92 | Modifies existing model to do gradient allreduce, but doesn't change class 93 | so you don't need "module" 94 | """ 95 | if not hasattr(dist, '_backend'): 96 | module.warn_on_half = True 97 | else: 98 | module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False 99 | 100 | for p in module.state_dict().values(): 101 | if not torch.is_tensor(p): 102 | continue 103 | dist.broadcast(p, 0) 104 | 105 | def allreduce_params(): 106 | if(module.needs_reduction): 107 | module.needs_reduction = False 108 | buckets = {} 109 | for param in module.parameters(): 110 | if param.requires_grad and param.grad is not None: 111 | tp = type(param.data) 112 | if tp not in buckets: 113 | buckets[tp] = [] 114 | buckets[tp].append(param) 115 | if module.warn_on_half: 116 | if torch.cuda.HalfTensor in buckets: 117 | print("WARNING: gloo dist backend for half parameters may be extremely slow." + 118 | " It is recommended to use the NCCL backend in this case. 
This currently requires" + 119 | "PyTorch built from top of tree master.") 120 | module.warn_on_half = False 121 | 122 | for tp in buckets: 123 | bucket = buckets[tp] 124 | grads = [param.grad.data for param in bucket] 125 | coalesced = _flatten_dense_tensors(grads) 126 | dist.all_reduce(coalesced) 127 | coalesced /= dist.get_world_size() 128 | for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): 129 | buf.copy_(synced) 130 | 131 | for param in list(module.parameters()): 132 | def allreduce_hook(*unused): 133 | Variable._execution_engine.queue_callback(allreduce_params) 134 | if param.requires_grad: 135 | param.register_hook(allreduce_hook) 136 | dir(param) 137 | 138 | def set_needs_reduction(self, input, output): 139 | self.needs_reduction = True 140 | 141 | module.register_forward_hook(set_needs_reduction) 142 | return module 143 | 144 | 145 | def main(config, stdout_dir, args_str): 146 | args_list = ['train.py'] 147 | args_list += args_str.split(' ') if len(args_str) > 0 else [] 148 | 149 | args_list.append('--config={}'.format(config)) 150 | 151 | num_gpus = torch.cuda.device_count() 152 | args_list.append('--num_gpus={}'.format(num_gpus)) 153 | args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S"))) 154 | 155 | if not os.path.isdir(stdout_dir): 156 | os.makedirs(stdout_dir) 157 | os.chmod(stdout_dir, 0o775) 158 | 159 | workers = [] 160 | 161 | for i in range(num_gpus): 162 | args_list[-2] = '--rank={}'.format(i) 163 | stdout = None if i == 0 else open( 164 | os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w") 165 | print(args_list) 166 | p = subprocess.Popen([str(sys.executable)]+args_list, stdout=stdout) 167 | workers.append(p) 168 | 169 | for p in workers: 170 | p.wait() 171 | 172 | 173 | if __name__ == '__main__': 174 | parser = argparse.ArgumentParser() 175 | parser.add_argument('-c', '--config', type=str, required=True, 176 | help='JSON file for configuration') 177 | parser.add_argument('-s', '--stdout_dir', type=str, default=".", 178 | help='directory to save stoud logs') 179 | parser.add_argument( 180 | '-a', '--args_str', type=str, default='', 181 | help='double quoted string with space separated key value pairs') 182 | 183 | args = parser.parse_args() 184 | main(args.config, args.stdout_dir, args.args_str) 185 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential 
passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | 365 | # models 366 | /ayachi_* 367 | /inaba_* 368 | /tomotake_* 369 | /murasame_* 370 | /arihara_* 371 | /waveglow_* 372 | 373 | # jupyter cache 374 | /.ipynb_checkpoints 375 | -------------------------------------------------------------------------------- /waveglow/train.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | import argparse 28 | import json 29 | import os 30 | import torch 31 | 32 | #=====START: ADDED FOR DISTRIBUTED====== 33 | from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor 34 | from torch.utils.data.distributed import DistributedSampler 35 | #=====END: ADDED FOR DISTRIBUTED====== 36 | 37 | from torch.utils.data import DataLoader 38 | from glow import WaveGlow, WaveGlowLoss 39 | from mel2samp import Mel2Samp 40 | 41 | def load_checkpoint(checkpoint_path, model, optimizer): 42 | assert os.path.isfile(checkpoint_path) 43 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 44 | iteration = checkpoint_dict['iteration'] 45 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 46 | model_for_loading = checkpoint_dict['model'] 47 | model.load_state_dict(model_for_loading.state_dict()) 48 | print("Loaded checkpoint '{}' (iteration {})" .format( 49 | checkpoint_path, iteration)) 50 | return model, optimizer, iteration 51 | 52 | def save_checkpoint(model, optimizer, learning_rate, iteration, filepath): 53 | print("Saving model and optimizer state at iteration {} to {}".format( 54 | iteration, filepath)) 55 | model_for_saving = WaveGlow(**waveglow_config).cuda() 56 | model_for_saving.load_state_dict(model.state_dict()) 57 | torch.save({'model': model_for_saving, 58 | 'iteration': iteration, 59 | 'optimizer': optimizer.state_dict(), 60 | 'learning_rate': learning_rate}, filepath) 61 | 62 | def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, 63 | sigma, iters_per_checkpoint, batch_size, seed, fp16_run, 64 | checkpoint_path, with_tensorboard): 65 | torch.manual_seed(seed) 66 | torch.cuda.manual_seed(seed) 67 | #=====START: ADDED FOR DISTRIBUTED====== 68 | if num_gpus > 1: 69 | init_distributed(rank, num_gpus, group_name, **dist_config) 70 | #=====END: ADDED FOR DISTRIBUTED====== 71 | 72 | criterion = WaveGlowLoss(sigma) 73 | model = WaveGlow(**waveglow_config).cuda() 74 | 75 | #=====START: ADDED FOR DISTRIBUTED====== 76 | if num_gpus > 1: 77 | model = apply_gradient_allreduce(model) 78 | #=====END: ADDED FOR DISTRIBUTED====== 79 | 80 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 81 | 82 | if fp16_run: 83 | from apex import amp 84 | model, optimizer = amp.initialize(model, optimizer, opt_level='O1') 85 | 86 | # Load checkpoint if one exists 87 | iteration = 0 88 | if checkpoint_path != "": 89 | model, optimizer, iteration = load_checkpoint(checkpoint_path, model, 90 | optimizer) 91 | iteration += 1 # next iteration is iteration + 1 92 | 93 | trainset = Mel2Samp(**data_config) 94 | # =====START: ADDED FOR DISTRIBUTED====== 95 | train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None 96 | # =====END: ADDED FOR DISTRIBUTED====== 97 | train_loader = DataLoader(trainset, num_workers=1, shuffle=False, 98 | sampler=train_sampler, 99 | batch_size=batch_size, 100 | 
pin_memory=False, 101 | drop_last=True) 102 | 103 | # Get shared output_directory ready 104 | if rank == 0: 105 | if not os.path.isdir(output_directory): 106 | os.makedirs(output_directory) 107 | os.chmod(output_directory, 0o775) 108 | print("output directory", output_directory) 109 | 110 | if with_tensorboard and rank == 0: 111 | from tensorboardX import SummaryWriter 112 | logger = SummaryWriter(os.path.join(output_directory, 'logs')) 113 | 114 | model.train() 115 | epoch_offset = max(0, int(iteration / len(train_loader))) 116 | # ================ MAIN TRAINNIG LOOP! =================== 117 | for epoch in range(epoch_offset, epochs): 118 | print("Epoch: {}".format(epoch)) 119 | for i, batch in enumerate(train_loader): 120 | model.zero_grad() 121 | 122 | mel, audio = batch 123 | mel = torch.autograd.Variable(mel.cuda()) 124 | audio = torch.autograd.Variable(audio.cuda()) 125 | outputs = model((mel, audio)) 126 | 127 | loss = criterion(outputs) 128 | if num_gpus > 1: 129 | reduced_loss = reduce_tensor(loss.data, num_gpus).item() 130 | else: 131 | reduced_loss = loss.item() 132 | 133 | if fp16_run: 134 | with amp.scale_loss(loss, optimizer) as scaled_loss: 135 | scaled_loss.backward() 136 | else: 137 | loss.backward() 138 | 139 | optimizer.step() 140 | 141 | print("{}:\t{:.9f}".format(iteration, reduced_loss)) 142 | if with_tensorboard and rank == 0: 143 | logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) 144 | 145 | if (iteration % iters_per_checkpoint == 0): 146 | if rank == 0: 147 | checkpoint_path = "{}/waveglow_{}".format( 148 | output_directory, iteration) 149 | save_checkpoint(model, optimizer, learning_rate, iteration, 150 | checkpoint_path) 151 | 152 | iteration += 1 153 | 154 | if __name__ == "__main__": 155 | parser = argparse.ArgumentParser() 156 | parser.add_argument('-c', '--config', type=str, 157 | help='JSON file for configuration') 158 | parser.add_argument('-r', '--rank', type=int, default=0, 159 | help='rank of process for distributed') 160 | parser.add_argument('-g', '--group_name', type=str, default='', 161 | help='name of group for distributed') 162 | args = parser.parse_args() 163 | 164 | # Parse configs. Globals nicer in this case 165 | with open(args.config) as f: 166 | data = f.read() 167 | config = json.loads(data) 168 | train_config = config["train_config"] 169 | global data_config 170 | data_config = config["data_config"] 171 | global dist_config 172 | dist_config = config["dist_config"] 173 | global waveglow_config 174 | waveglow_config = config["waveglow_config"] 175 | 176 | num_gpus = torch.cuda.device_count() 177 | if num_gpus > 1: 178 | if args.group_name == '': 179 | print("WARNING: Multiple GPUs detected but no distributed group set") 180 | print("Only running 1 GPU. 
Use distributed.py for multiple GPUs") 181 | num_gpus = 1 182 | 183 | if num_gpus == 1 and args.rank != 0: 184 | raise Exception("Doing single GPU training on rank > 0") 185 | 186 | torch.backends.cudnn.enabled = True 187 | torch.backends.cudnn.benchmark = False 188 | train(num_gpus, args.rank, args.group_name, **train_config) 189 | -------------------------------------------------------------------------------- /waveglow/glow_old.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from glow import Invertible1x1Conv, remove 4 | 5 | 6 | @torch.jit.script 7 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 8 | n_channels_int = n_channels[0] 9 | in_act = input_a+input_b 10 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 11 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 12 | acts = t_act * s_act 13 | return acts 14 | 15 | 16 | class WN(torch.nn.Module): 17 | """ 18 | This is the WaveNet like layer for the affine coupling. The primary difference 19 | from WaveNet is the convolutions need not be causal. There is also no dilation 20 | size reset. The dilation only doubles on each layer 21 | """ 22 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 23 | kernel_size): 24 | super(WN, self).__init__() 25 | assert(kernel_size % 2 == 1) 26 | assert(n_channels % 2 == 0) 27 | self.n_layers = n_layers 28 | self.n_channels = n_channels 29 | self.in_layers = torch.nn.ModuleList() 30 | self.res_skip_layers = torch.nn.ModuleList() 31 | self.cond_layers = torch.nn.ModuleList() 32 | 33 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 34 | start = torch.nn.utils.weight_norm(start, name='weight') 35 | self.start = start 36 | 37 | # Initializing last layer to 0 makes the affine coupling layers 38 | # do nothing at first. 
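A one-line sanity check of the claim in that comment: with `end.weight` and `end.bias` zeroed, `WN` outputs `s = 0` and `b = 0`, so the coupling update `audio_1 = exp(s) * audio_1 + b` used later in this file starts out as the identity (illustrative snippet, not repository code).

```python
import torch

s, b = torch.zeros(4), torch.zeros(4)   # what a zero-initialized last layer produces
audio_1 = torch.randn(4)
assert torch.allclose(torch.exp(s) * audio_1 + b, audio_1)
```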
This helps with training stability 39 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 40 | end.weight.data.zero_() 41 | end.bias.data.zero_() 42 | self.end = end 43 | 44 | for i in range(n_layers): 45 | dilation = 2 ** i 46 | padding = int((kernel_size*dilation - dilation)/2) 47 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 48 | dilation=dilation, padding=padding) 49 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 50 | self.in_layers.append(in_layer) 51 | 52 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) 53 | cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 54 | self.cond_layers.append(cond_layer) 55 | 56 | # last one is not necessary 57 | if i < n_layers - 1: 58 | res_skip_channels = 2*n_channels 59 | else: 60 | res_skip_channels = n_channels 61 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 62 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 63 | self.res_skip_layers.append(res_skip_layer) 64 | 65 | def forward(self, forward_input): 66 | audio, spect = forward_input 67 | audio = self.start(audio) 68 | 69 | for i in range(self.n_layers): 70 | acts = fused_add_tanh_sigmoid_multiply( 71 | self.in_layers[i](audio), 72 | self.cond_layers[i](spect), 73 | torch.IntTensor([self.n_channels])) 74 | 75 | res_skip_acts = self.res_skip_layers[i](acts) 76 | if i < self.n_layers - 1: 77 | audio = res_skip_acts[:,:self.n_channels,:] + audio 78 | skip_acts = res_skip_acts[:,self.n_channels:,:] 79 | else: 80 | skip_acts = res_skip_acts 81 | 82 | if i == 0: 83 | output = skip_acts 84 | else: 85 | output = skip_acts + output 86 | return self.end(output) 87 | 88 | 89 | class WaveGlow(torch.nn.Module): 90 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 91 | n_early_size, WN_config): 92 | super(WaveGlow, self).__init__() 93 | 94 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 95 | n_mel_channels, 96 | 1024, stride=256) 97 | assert(n_group % 2 == 0) 98 | self.n_flows = n_flows 99 | self.n_group = n_group 100 | self.n_early_every = n_early_every 101 | self.n_early_size = n_early_size 102 | self.WN = torch.nn.ModuleList() 103 | self.convinv = torch.nn.ModuleList() 104 | 105 | n_half = int(n_group/2) 106 | 107 | # Set up layers with the right sizes based on how many dimensions 108 | # have been output already 109 | n_remaining_channels = n_group 110 | for k in range(n_flows): 111 | if k % self.n_early_every == 0 and k > 0: 112 | n_half = n_half - int(self.n_early_size/2) 113 | n_remaining_channels = n_remaining_channels - self.n_early_size 114 | self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 115 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 116 | self.n_remaining_channels = n_remaining_channels # Useful during inference 117 | 118 | def forward(self, forward_input): 119 | return None 120 | """ 121 | forward_input[0] = audio: batch x time 122 | forward_input[1] = upsamp_spectrogram: batch x n_cond_channels x time 123 | """ 124 | """ 125 | spect, audio = forward_input 126 | 127 | # Upsample spectrogram to size of audio 128 | spect = self.upsample(spect) 129 | assert(spect.size(2) >= audio.size(1)) 130 | if spect.size(2) > audio.size(1): 131 | spect = spect[:, :, :audio.size(1)] 132 | 133 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 134 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 135 | 136 | audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 
2, 1) 137 | output_audio = [] 138 | s_list = [] 139 | s_conv_list = [] 140 | 141 | for k in range(self.n_flows): 142 | if k%4 == 0 and k > 0: 143 | output_audio.append(audio[:,:self.n_multi,:]) 144 | audio = audio[:,self.n_multi:,:] 145 | 146 | # project to new basis 147 | audio, s = self.convinv[k](audio) 148 | s_conv_list.append(s) 149 | 150 | n_half = int(audio.size(1)/2) 151 | if k%2 == 0: 152 | audio_0 = audio[:,:n_half,:] 153 | audio_1 = audio[:,n_half:,:] 154 | else: 155 | audio_1 = audio[:,:n_half,:] 156 | audio_0 = audio[:,n_half:,:] 157 | 158 | output = self.nn[k]((audio_0, spect)) 159 | s = output[:, n_half:, :] 160 | b = output[:, :n_half, :] 161 | audio_1 = torch.exp(s)*audio_1 + b 162 | s_list.append(s) 163 | 164 | if k%2 == 0: 165 | audio = torch.cat([audio[:,:n_half,:], audio_1],1) 166 | else: 167 | audio = torch.cat([audio_1, audio[:,n_half:,:]], 1) 168 | output_audio.append(audio) 169 | return torch.cat(output_audio,1), s_list, s_conv_list 170 | """ 171 | 172 | def infer(self, spect, sigma=1.0): 173 | spect = self.upsample(spect) 174 | # trim conv artifacts. maybe pad spec to kernel multiple 175 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 176 | spect = spect[:, :, :-time_cutoff] 177 | 178 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 179 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 180 | 181 | if spect.type() == 'torch.cuda.HalfTensor': 182 | audio = torch.cuda.HalfTensor(spect.size(0), 183 | self.n_remaining_channels, 184 | spect.size(2)).normal_() 185 | else: 186 | audio = torch.cuda.FloatTensor(spect.size(0), 187 | self.n_remaining_channels, 188 | spect.size(2)).normal_() 189 | 190 | audio = torch.autograd.Variable(sigma*audio) 191 | 192 | for k in reversed(range(self.n_flows)): 193 | n_half = int(audio.size(1)/2) 194 | if k%2 == 0: 195 | audio_0 = audio[:,:n_half,:] 196 | audio_1 = audio[:,n_half:,:] 197 | else: 198 | audio_1 = audio[:,:n_half,:] 199 | audio_0 = audio[:,n_half:,:] 200 | 201 | output = self.WN[k]((audio_0, spect)) 202 | s = output[:, n_half:, :] 203 | b = output[:, :n_half, :] 204 | audio_1 = (audio_1 - b)/torch.exp(s) 205 | if k%2 == 0: 206 | audio = torch.cat([audio[:,:n_half,:], audio_1],1) 207 | else: 208 | audio = torch.cat([audio_1, audio[:,n_half:,:]], 1) 209 | 210 | audio = self.convinv[k](audio, reverse=True) 211 | 212 | if k%4 == 0 and k > 0: 213 | if spect.type() == 'torch.cuda.HalfTensor': 214 | z = torch.cuda.HalfTensor(spect.size(0), 215 | self.n_early_size, 216 | spect.size(2)).normal_() 217 | else: 218 | z = torch.cuda.FloatTensor(spect.size(0), 219 | self.n_early_size, 220 | spect.size(2)).normal_() 221 | audio = torch.cat((sigma*z, audio),1) 222 | 223 | return audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data 224 | 225 | @staticmethod 226 | def remove_weightnorm(model): 227 | waveglow = model 228 | for WN in waveglow.WN: 229 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 230 | WN.in_layers = remove(WN.in_layers) 231 | WN.cond_layers = remove(WN.cond_layers) 232 | WN.res_skip_layers = remove(WN.res_skip_layers) 233 | return waveglow 234 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import math 4 | import torch 5 | import argparse 6 | import torch.distributed as dist 7 | 8 | 9 | from numpy import finfo 10 | from model import Tacotron2 11 | from torch.backends 
import cudnn 12 | from hparams import create_hparams 13 | from logger import Tacotron2Logger 14 | from torch.utils.data import DataLoader 15 | from loss_function import Tacotron2Loss 16 | from distributed import apply_gradient_allreduce 17 | from data_utils import TextMelLoader, TextMelCollate 18 | from torch.utils.data.distributed import DistributedSampler 19 | 20 | 21 | device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 22 | 23 | # 整理tensor 24 | def reduce_tensor(tensor, n_gpus): 25 | rt = tensor.clone() 26 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 27 | rt /= n_gpus 28 | return rt 29 | 30 | 31 | def init_distributed(hparams, n_gpus, rank, group_name): 32 | #assert torch.cuda.is_available(), "Distributed mode requires CUDA." 33 | if torch.cuda.is_available() : 34 | # Set cuda device so everything is done on the right GPU. 35 | torch.cuda.set_device(rank % torch.cuda.device_count()) 36 | # Initialize distributed communication 37 | dist.init_process_group(backend=hparams.dist_backend, 38 | init_method=hparams.dist_url, 39 | world_size=n_gpus, 40 | rank=rank, 41 | group_name=group_name) 42 | print("Distributed mode requires CUDA.") 43 | else : 44 | print("Use the CPU") 45 | print("Initializing Distributed") 46 | 47 | print("Done initializing distributed") 48 | 49 | 50 | 51 | def prepare_dataloaders(hparams): 52 | # Get data, data loaders and collate function ready 53 | trainset = TextMelLoader(hparams.training_files, hparams) 54 | valset = TextMelLoader(hparams.validation_files, hparams) 55 | collate_fn = TextMelCollate(hparams.n_frames_per_step) 56 | 57 | if hparams.distributed_run: 58 | train_sampler = DistributedSampler(trainset) 59 | shuffle = False 60 | else: 61 | train_sampler = None 62 | shuffle = True 63 | 64 | train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle, 65 | sampler=train_sampler, 66 | batch_size=hparams.batch_size, pin_memory=False, 67 | drop_last=True, collate_fn=collate_fn) 68 | return train_loader, valset, collate_fn 69 | 70 | 71 | def prepare_directories_and_logger(output_directory, log_directory, rank): 72 | if rank == 0: 73 | if not os.path.isdir(output_directory): 74 | os.makedirs(output_directory) 75 | os.chmod(output_directory, 0o775) 76 | logger = Tacotron2Logger(os.path.join(output_directory, log_directory)) 77 | else: 78 | logger = None 79 | return logger 80 | 81 | 82 | def load_model(hparams): 83 | model = Tacotron2(hparams) 84 | model.to(device) 85 | if hparams.fp16_run: 86 | model.decoder.attention_layer.score_mask_value = finfo('float16').min 87 | 88 | if hparams.distributed_run: 89 | model = apply_gradient_allreduce(model) 90 | 91 | return model 92 | 93 | 94 | def warm_start_model(checkpoint_path, model, ignore_layers): 95 | assert os.path.isfile(checkpoint_path) 96 | print("Warm starting model from checkpoint '{}'".format(checkpoint_path)) 97 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 98 | model_dict = checkpoint_dict['state_dict'] 99 | if len(ignore_layers) > 0: 100 | model_dict = {k: v for k, v in model_dict.items() 101 | if k not in ignore_layers} 102 | dummy_dict = model.state_dict() 103 | dummy_dict.update(model_dict) 104 | model_dict = dummy_dict 105 | model.load_state_dict(model_dict) 106 | return model 107 | 108 | 109 | def load_checkpoint(checkpoint_path, model, optimizer): 110 | assert os.path.isfile(checkpoint_path) 111 | print("Loading checkpoint '{}'".format(checkpoint_path)) 112 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 113 | 
model.load_state_dict(checkpoint_dict['state_dict']) 114 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 115 | learning_rate = checkpoint_dict['learning_rate'] 116 | iteration = checkpoint_dict['iteration'] 117 | print("Loaded checkpoint '{}' from iteration {}" .format( 118 | checkpoint_path, iteration)) 119 | return model, optimizer, learning_rate, iteration 120 | 121 | 122 | def save_checkpoint(model, optimizer, learning_rate, iteration, filepath): 123 | print("Saving model and optimizer state at iteration {} to {}".format( 124 | iteration, filepath)) 125 | torch.save({'iteration': iteration, 126 | 'state_dict': model.state_dict(), 127 | 'optimizer': optimizer.state_dict(), 128 | 'learning_rate': learning_rate}, filepath) 129 | 130 | 131 | def validate(model, criterion, valset, iteration, batch_size, n_gpus, 132 | collate_fn, logger, distributed_run, rank): 133 | """Handles all the validation scoring and printing""" 134 | model.eval() 135 | with torch.no_grad(): 136 | val_sampler = DistributedSampler(valset) if distributed_run else None 137 | val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1, 138 | shuffle=False, batch_size=batch_size, 139 | pin_memory=False, collate_fn=collate_fn) 140 | 141 | val_loss = 0.0 142 | for i, batch in enumerate(val_loader): 143 | x, y = model.parse_batch(batch) 144 | y_pred = model(x) 145 | loss = criterion(y_pred, y) 146 | if distributed_run: 147 | reduced_val_loss = reduce_tensor(loss.data, n_gpus).item() 148 | else: 149 | reduced_val_loss = loss.item() 150 | val_loss += reduced_val_loss 151 | val_loss = val_loss / (i + 1) 152 | 153 | model.train() 154 | if rank == 0: 155 | print("Validation loss {}: {:9f} ".format(iteration, val_loss)) 156 | logger.log_validation(val_loss, model, y, y_pred, iteration) 157 | 158 | 159 | def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, 160 | rank, group_name, hparams): 161 | """Training and validation logging results to tensorboard and stdout 162 | 163 | Params 164 | ------ 165 | output_directory (string): directory to save checkpoints 166 | log_directory (string) directory to save tensorboard logs 167 | checkpoint_path(string): checkpoint path 168 | n_gpus (int): number of gpus 169 | rank (int): rank of current gpu 170 | hparams (object): comma separated list of "name=value" pairs. 
171 | """ 172 | if hparams.distributed_run: 173 | init_distributed(hparams, n_gpus, rank, group_name) 174 | 175 | torch.manual_seed(hparams.seed) 176 | torch.cuda.manual_seed(hparams.seed) 177 | 178 | model = load_model(hparams) 179 | learning_rate = hparams.learning_rate 180 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, 181 | weight_decay=hparams.weight_decay) 182 | 183 | # 默认的是 False 可以注释掉 184 | #if hparams.fp16_run: 185 | # from apex import amp 186 | # model, optimizer = amp.initialize( 187 | # model, optimizer, opt_level='O2') 188 | 189 | if hparams.distributed_run: 190 | model = apply_gradient_allreduce(model) 191 | 192 | criterion = Tacotron2Loss() 193 | logger = prepare_directories_and_logger(output_directory, log_directory, rank) 194 | train_loader, valset, collate_fn = prepare_dataloaders(hparams) 195 | 196 | # Load checkpoint if one exists 197 | iteration = 0 198 | epoch_offset = 0 199 | if checkpoint_path is not None: 200 | if warm_start: 201 | model = warm_start_model( 202 | checkpoint_path, model, hparams.ignore_layers) 203 | else: 204 | model, optimizer, _learning_rate, iteration = load_checkpoint( 205 | checkpoint_path, model, optimizer) 206 | if hparams.use_saved_learning_rate: 207 | learning_rate = _learning_rate 208 | iteration += 1 # next iteration is iteration + 1 209 | epoch_offset = max(0, int(iteration / len(train_loader))) 210 | 211 | model.train() 212 | is_overflow = False 213 | # ================ MAIN TRAINNIG LOOP! =================== 214 | for epoch in range(epoch_offset, hparams.epochs): 215 | print("Epoch: {}".format(epoch)) 216 | for i, batch in enumerate(train_loader): 217 | start = time.perf_counter() 218 | for param_group in optimizer.param_groups: 219 | param_group['lr'] = learning_rate 220 | 221 | model.zero_grad() 222 | x, y = model.parse_batch(batch) 223 | y_pred = model(x) 224 | 225 | loss = criterion(y_pred, y) 226 | if hparams.distributed_run: 227 | reduced_loss = reduce_tensor(loss.data, n_gpus).item() 228 | else: 229 | reduced_loss = loss.item() 230 | 231 | loss.backward() 232 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.grad_clip_thresh) 233 | optimizer.step() 234 | 235 | if not is_overflow and rank == 0: 236 | duration = time.perf_counter() - start 237 | print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( 238 | iteration, reduced_loss, grad_norm, duration)) 239 | logger.log_training( 240 | reduced_loss, grad_norm, learning_rate, duration, iteration) 241 | 242 | if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): 243 | validate(model, criterion, valset, iteration, 244 | hparams.batch_size, n_gpus, collate_fn, logger, 245 | hparams.distributed_run, rank) 246 | if rank == 0: 247 | checkpoint_path = os.path.join( 248 | output_directory, "checkpoint_{}".format(iteration)) 249 | save_checkpoint(model, optimizer, learning_rate, iteration, 250 | checkpoint_path) 251 | 252 | iteration += 1 253 | 254 | 255 | if __name__ == '__main__': 256 | parser = argparse.ArgumentParser() 257 | parser.add_argument('-o', '--output_directory', type=str, 258 | help='directory to save checkpoints ') 259 | parser.add_argument('-l', '--log_directory', type=str, 260 | help='directory to save tensorboard logs') 261 | parser.add_argument('-c', '--checkpoint_path', type=str, default=None, 262 | required=False, help='checkpoint path') 263 | parser.add_argument('--warm_start', action='store_true', 264 | help='load model weights only, ignore specified layers') 265 | parser.add_argument('--n_gpus', 
type=int, default=1, 266 | required=False, help='number of gpus') 267 | parser.add_argument('--rank', type=int, default=0, 268 | required=False, help='rank of current gpu') 269 | parser.add_argument('--group_name', type=str, default='group_name', 270 | required=False, help='Distributed group name') 271 | parser.add_argument('--hparams', type=str, 272 | required=False, help='comma separated name=value pairs') 273 | 274 | args = parser.parse_args() 275 | hparams = create_hparams() 276 | 277 | cudnn.enabled = hparams.cudnn_enabled#create_hparams.cudnn_enabled 278 | cudnn.benchmark = hparams.cudnn_benchmark#create_hparams.cudnn_benchmark 279 | 280 | print("FP16 Run:", hparams.fp16_run) 281 | print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling) 282 | print("Distributed Run:", hparams.distributed_run) 283 | print("cuDNN Enabled:", hparams.cudnn_enabled) 284 | print("cuDNN Benchmark:", hparams.cudnn_benchmark) 285 | 286 | train(args.output_directory, 287 | args.log_directory, 288 | args.checkpoint_path, 289 | args.warm_start, 290 | args.n_gpus, 291 | args.rank, 292 | args.group_name, 293 | hparams) 294 | -------------------------------------------------------------------------------- /waveglow/glow.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | import copy 28 | import torch 29 | from torch.autograd import Variable 30 | import torch.nn.functional as F 31 | 32 | 33 | @torch.jit.script 34 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 35 | n_channels_int = n_channels[0] 36 | in_act = input_a+input_b 37 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 38 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 39 | acts = t_act * s_act 40 | return acts 41 | 42 | 43 | class WaveGlowLoss(torch.nn.Module): 44 | def __init__(self, sigma=1.0): 45 | super(WaveGlowLoss, self).__init__() 46 | self.sigma = sigma 47 | 48 | def forward(self, model_output): 49 | z, log_s_list, log_det_W_list = model_output 50 | for i, log_s in enumerate(log_s_list): 51 | if i == 0: 52 | log_s_total = torch.sum(log_s) 53 | log_det_W_total = log_det_W_list[i] 54 | else: 55 | log_s_total = log_s_total + torch.sum(log_s) 56 | log_det_W_total += log_det_W_list[i] 57 | 58 | loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total 59 | return loss/(z.size(0)*z.size(1)*z.size(2)) 60 | 61 | 62 | class Invertible1x1Conv(torch.nn.Module): 63 | """ 64 | The layer outputs both the convolution, and the log determinant 65 | of its weight matrix. If reverse=True it does convolution with 66 | inverse 67 | """ 68 | def __init__(self, c): 69 | super(Invertible1x1Conv, self).__init__() 70 | self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, 71 | bias=False) 72 | 73 | # Sample a random orthonormal matrix to initialize weights 74 | W = torch.qr(torch.FloatTensor(c, c).normal_())[0] 75 | 76 | # Ensure determinant is 1.0 not -1.0 77 | if torch.det(W) < 0: 78 | W[:,0] = -1*W[:,0] 79 | W = W.view(c, c, 1) 80 | self.conv.weight.data = W 81 | 82 | def forward(self, z, reverse=False): 83 | # shape 84 | batch_size, group_size, n_of_groups = z.size() 85 | 86 | W = self.conv.weight.squeeze() 87 | 88 | if reverse: 89 | if not hasattr(self, 'W_inverse'): 90 | # Reverse computation 91 | W_inverse = W.float().inverse() 92 | W_inverse = Variable(W_inverse[..., None]) 93 | if z.type() == 'torch.cuda.HalfTensor': 94 | W_inverse = W_inverse.half() 95 | self.W_inverse = W_inverse 96 | z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) 97 | return z 98 | else: 99 | # Forward computation 100 | log_det_W = batch_size * n_of_groups * torch.logdet(W) 101 | z = self.conv(z) 102 | return z, log_det_W 103 | 104 | 105 | class WN(torch.nn.Module): 106 | """ 107 | This is the WaveNet like layer for the affine coupling. The primary difference 108 | from WaveNet is the convolutions need not be causal. There is also no dilation 109 | size reset. The dilation only doubles on each layer 110 | """ 111 | def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, 112 | kernel_size): 113 | super(WN, self).__init__() 114 | assert(kernel_size % 2 == 1) 115 | assert(n_channels % 2 == 0) 116 | self.n_layers = n_layers 117 | self.n_channels = n_channels 118 | self.in_layers = torch.nn.ModuleList() 119 | self.res_skip_layers = torch.nn.ModuleList() 120 | 121 | start = torch.nn.Conv1d(n_in_channels, n_channels, 1) 122 | start = torch.nn.utils.weight_norm(start, name='weight') 123 | self.start = start 124 | 125 | # Initializing last layer to 0 makes the affine coupling layers 126 | # do nothing at first. 
This helps with training stability 127 | end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) 128 | end.weight.data.zero_() 129 | end.bias.data.zero_() 130 | self.end = end 131 | 132 | cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1) 133 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 134 | 135 | for i in range(n_layers): 136 | dilation = 2 ** i 137 | padding = int((kernel_size*dilation - dilation)/2) 138 | in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, 139 | dilation=dilation, padding=padding) 140 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 141 | self.in_layers.append(in_layer) 142 | 143 | 144 | # last one is not necessary 145 | if i < n_layers - 1: 146 | res_skip_channels = 2*n_channels 147 | else: 148 | res_skip_channels = n_channels 149 | res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) 150 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 151 | self.res_skip_layers.append(res_skip_layer) 152 | 153 | def forward(self, forward_input): 154 | audio, spect = forward_input 155 | audio = self.start(audio) 156 | output = torch.zeros_like(audio) 157 | n_channels_tensor = torch.IntTensor([self.n_channels]) 158 | 159 | spect = self.cond_layer(spect) 160 | 161 | for i in range(self.n_layers): 162 | spect_offset = i*2*self.n_channels 163 | acts = fused_add_tanh_sigmoid_multiply( 164 | self.in_layers[i](audio), 165 | spect[:,spect_offset:spect_offset+2*self.n_channels,:], 166 | n_channels_tensor) 167 | 168 | res_skip_acts = self.res_skip_layers[i](acts) 169 | if i < self.n_layers - 1: 170 | audio = audio + res_skip_acts[:,:self.n_channels,:] 171 | output = output + res_skip_acts[:,self.n_channels:,:] 172 | else: 173 | output = output + res_skip_acts 174 | 175 | return self.end(output) 176 | 177 | 178 | class WaveGlow(torch.nn.Module): 179 | def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, 180 | n_early_size, WN_config): 181 | super(WaveGlow, self).__init__() 182 | 183 | self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, 184 | n_mel_channels, 185 | 1024, stride=256) 186 | assert(n_group % 2 == 0) 187 | self.n_flows = n_flows 188 | self.n_group = n_group 189 | self.n_early_every = n_early_every 190 | self.n_early_size = n_early_size 191 | self.WN = torch.nn.ModuleList() 192 | self.convinv = torch.nn.ModuleList() 193 | 194 | n_half = int(n_group/2) 195 | 196 | # Set up layers with the right sizes based on how many dimensions 197 | # have been output already 198 | n_remaining_channels = n_group 199 | for k in range(n_flows): 200 | if k % self.n_early_every == 0 and k > 0: 201 | n_half = n_half - int(self.n_early_size/2) 202 | n_remaining_channels = n_remaining_channels - self.n_early_size 203 | self.convinv.append(Invertible1x1Conv(n_remaining_channels)) 204 | self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) 205 | self.n_remaining_channels = n_remaining_channels # Useful during inference 206 | 207 | def forward(self, forward_input): 208 | """ 209 | forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames 210 | forward_input[1] = audio: batch x time 211 | """ 212 | spect, audio = forward_input 213 | 214 | # Upsample spectrogram to size of audio 215 | spect = self.upsample(spect) 216 | assert(spect.size(2) >= audio.size(1)) 217 | if spect.size(2) > audio.size(1): 218 | spect = spect[:, :, :audio.size(1)] 219 | 220 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 221 | spect = 
spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 222 | 223 | audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) 224 | output_audio = [] 225 | log_s_list = [] 226 | log_det_W_list = [] 227 | 228 | for k in range(self.n_flows): 229 | if k % self.n_early_every == 0 and k > 0: 230 | output_audio.append(audio[:,:self.n_early_size,:]) 231 | audio = audio[:,self.n_early_size:,:] 232 | 233 | audio, log_det_W = self.convinv[k](audio) 234 | log_det_W_list.append(log_det_W) 235 | 236 | n_half = int(audio.size(1)/2) 237 | audio_0 = audio[:,:n_half,:] 238 | audio_1 = audio[:,n_half:,:] 239 | 240 | output = self.WN[k]((audio_0, spect)) 241 | log_s = output[:, n_half:, :] 242 | b = output[:, :n_half, :] 243 | audio_1 = torch.exp(log_s)*audio_1 + b 244 | log_s_list.append(log_s) 245 | 246 | audio = torch.cat([audio_0, audio_1],1) 247 | 248 | output_audio.append(audio) 249 | return torch.cat(output_audio,1), log_s_list, log_det_W_list 250 | 251 | def infer(self, spect, sigma=1.0): 252 | spect = self.upsample(spect) 253 | # trim conv artifacts. maybe pad spec to kernel multiple 254 | time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] 255 | spect = spect[:, :, :-time_cutoff] 256 | 257 | spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) 258 | spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) 259 | 260 | if spect.type() == 'torch.cuda.HalfTensor': 261 | audio = torch.cuda.HalfTensor(spect.size(0), 262 | self.n_remaining_channels, 263 | spect.size(2)).normal_() 264 | else: 265 | audio = torch.cuda.FloatTensor(spect.size(0), 266 | self.n_remaining_channels, 267 | spect.size(2)).normal_() 268 | 269 | audio = torch.autograd.Variable(sigma*audio) 270 | 271 | for k in reversed(range(self.n_flows)): 272 | n_half = int(audio.size(1)/2) 273 | audio_0 = audio[:,:n_half,:] 274 | audio_1 = audio[:,n_half:,:] 275 | 276 | output = self.WN[k]((audio_0, spect)) 277 | 278 | s = output[:, n_half:, :] 279 | b = output[:, :n_half, :] 280 | audio_1 = (audio_1 - b)/torch.exp(s) 281 | audio = torch.cat([audio_0, audio_1],1) 282 | 283 | audio = self.convinv[k](audio, reverse=True) 284 | 285 | if k % self.n_early_every == 0 and k > 0: 286 | if spect.type() == 'torch.cuda.HalfTensor': 287 | z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() 288 | else: 289 | z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() 290 | audio = torch.cat((sigma*z, audio),1) 291 | 292 | audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data 293 | return audio 294 | 295 | @staticmethod 296 | def remove_weightnorm(model): 297 | waveglow = model 298 | for WN in waveglow.WN: 299 | WN.start = torch.nn.utils.remove_weight_norm(WN.start) 300 | WN.in_layers = remove(WN.in_layers) 301 | WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer) 302 | WN.res_skip_layers = remove(WN.res_skip_layers) 303 | return waveglow 304 | 305 | 306 | def remove(conv_list): 307 | new_conv_list = torch.nn.ModuleList() 308 | for old_conv in conv_list: 309 | old_conv = torch.nn.utils.remove_weight_norm(old_conv) 310 | new_conv_list.append(old_conv) 311 | return new_conv_list 312 | -------------------------------------------------------------------------------- /filelists/transcript_val.txt: -------------------------------------------------------------------------------- 1 | wav/nen001_001.wav|はい?呼びました? 
2 | wav/nen001_012.wav|ほしな君も 3 | wav/nen001_024.wav|さすがに白蛇占いはできませんよ 4 | wav/nen001_035.wav|わかりました。ありがとう……ございます 5 | wav/nen001_049.wav|んっ、んっ、んくっ……ひっ、あっ、ぁっ、ぁっ、んんーーッ…… 6 | wav/nen001_060.wav|あああぁぁ……今は、止められなくて……じゅる……はぁ、はぁぁ……あぁぁぁあぁ…… 7 | wav/nen001_072.wav|ほしな君。珍しいところで会いますね。図書室で何か調べ物ですか?こんな時間まで大変ですね 8 | wav/nen002_004.wav|そうですか 9 | wav/nen002_018.wav|そうですか…… 10 | wav/nen002_036.wav|あっ、あれは、違うんです。別に悩みとか、ストレスじゃなくて……じ……事情が……ありまして 11 | wav/nen002_051.wav|そんな風に光るなんて、私も初めて見ました一体何をしたんですか? 12 | wav/nen002_062.wav|ど、どうして……?一体どこに……欠片が……今まで集めた欠片が……やっぱりさっきの羽根は…… 13 | wav/nen002_074.wav|それで、あの……気分はどうですか? 14 | wav/nen002_089.wav|占ったんです、その高安先輩の交際相手である女の子の恋愛運を 15 | wav/nen002_100.wav|そうですね。そういう人も含まれると思います 16 | wav/nen002_113.wav|はい。私に、“心の欠片”のことを教えてくれた人に 17 | wav/nen003_007.wav|はい、私の知り合いが営んでる喫茶店です 18 | wav/nen003_019.wav|諦め?受け入れる? 19 | wav/nen003_031.wav|あの……一つ、思ったことがあるんですが…… 20 | wav/nen003_047.wav|ななおは人間じゃないんです。私が契約を結んだ、アルプなんです 21 | wav/nen003_059.wav|楽しそうにしてる時でも、どこか楽しみきれていないと言いますか。そういう気持ちは、私にもありますから 22 | wav/nen004_011.wav|は、はい? 23 | wav/nen005_008.wav|いえ、まだです。おそらく何もないとは思うんですが、万が一ということもあります 24 | wav/nen005_022.wav|おこです。激おこです 25 | wav/nen005_035.wav|いえ、困っていることがあって、私に力になれることでしたらお手伝いさせてもらいます 26 | wav/nen005_049.wav|そう言われても事実なので 27 | wav/nen005_060.wav|そうですね……私では解決できないような依頼も、いくつかありましたね 28 | wav/nen005_071.wav|まあ、気は進まないんですけどね……はぁ…… 29 | wav/nen005_089.wav|ごめんなさい。でも、これが欠片の回収方法なんです 30 | wav/nen005_100.wav|ななおに訊いても無駄ですよ。ななおは猫なんですから 31 | wav/nen006_005.wav|私も同じです。一般的な意見なら、多少は集まりましたが…… 32 | wav/nen006_018.wav|わかりました~ 33 | wav/nen006_030.wav|じゃあ、続けますね 34 | wav/nen006_043.wav|ありがとう、ございます……んっ、んんんっ、ひっ、ひっ、ふーーーぅ……ひっ、ひっ、ふーーーーぅ…… 35 | wav/nen006_056.wav|諦めるのはまだ早いと思います。ここにはまだテクニックが記されていますから 36 | wav/nen006_068.wav|え?突然どうしたんですか?ほしな君に謝罪されるようなこと、ありましたか? 37 | wav/nen007_006.wav|一部ということは……そうじゃない人には、受け入れてもらえた、ということですか? 38 | wav/nen008_009.wav|折を見て、自分の分を買いに行こうと思ってます 39 | wav/nen008_021.wav|そ、そうなんですか……? 40 | wav/nen008_032.wav|もう1時間ぐらいしてますから 41 | wav/nen008_044.wav|いなばさん……ありがとうございます。それでは、お言葉に甘えさせてもらってもいいですか? 42 | wav/nen008_056.wav|私たちでリハーサル? 43 | wav/nen009_004.wav|はい。川上君の悩みは、本当にデートのことでいいんでしょうか? 44 | wav/nen009_016.wav|はい、大丈夫ですよ、時間はまだ10分ほど余裕がありますから 45 | wav/nen009_027.wav|そうですね……ほしな君、川上君の予定では映画の後はどうなっていますか? 46 | wav/nen009_039.wav|このことは、川上君にも伝えておいた方がいいですね 47 | wav/nen009_052.wav|本当ですか?丁度いい機会ですから、いっそ買ってしまうのもいいですね 48 | wav/nen009_063.wav|川上君はしっかりプランを組んだりしているんですから、むしろ川上君が嫌がるかもしれませんね 49 | wav/nen009_074.wav|私は何でもいいですよ。嫌いな物も特にありませんから 50 | wav/nen009_085.wav|あの、これってもう取っていいんですか? 51 | wav/nen009_100.wav|あ、甘エビ~♪ 52 | wav/nen010_007.wav|確かにそうですね。お礼の言葉を言ってもらえたりするのも、とても嬉しいものですからね 53 | wav/nen010_021.wav|は、はぁ……はぁ……あり、がとう、ございますぅ……ほしなくんんんっ…… 54 | wav/nen010_032.wav|はい、あと少し……もう少し……んっ、んひっ、あっ、あっ、あっ……はあぁぁぁー…… 55 | wav/nen010_045.wav|ふーーー……ふーーー…… 56 | wav/nen010_057.wav|いえ、そうじゃなくてですね、その………………スースー、しますから 57 | wav/nen010_074.wav|はい、どうぞ 58 | wav/nen010_086.wav|私たちのオカルト研究部も、元々は黒魔術だったみたいですよ 59 | wav/nen010_100.wav|は、はい、大事になる前に誤解をときましょう 60 | wav/nen011_008.wav|私が勧めたんです。更衣室で着替えるのを恥ずかしそうにしていたので 61 | wav/nen011_019.wav|あ、ダ、ダメですよ、変なところ触っちゃくすぐったいですから 62 | wav/nen011_033.wav|どうしたんですか?なにか連絡事項が? 63 | wav/nen011_044.wav|ではとがくし先輩の相談は、越路さんを説得すること、でいいんですか? 64 | wav/nen012_003.wav|それで、どうでしたか? 65 | wav/nen012_014.wav|あの、ほしな君 66 | wav/nen012_026.wav|あれだけ反応が弱い欠片ですと、特に 67 | wav/nen013_001.wav|もし本当に私の他に魔女がいるとしたら……困ったことになりますね 68 | wav/nen013_015.wav|はい、問題ありません 69 | wav/nen013_028.wav|ロ、ローター……です………………ローターですよぅ…… 70 | wav/nen014_001.wav|そうなんですか?どうかしたんですか? 
71 | wav/nen015_071.wav|それじゃあ今後とも、よろしくお願いします 72 | wav/nen015_004.wav|はぁ、それはわかりました。でも、一つだけ答えてくれませんか?気になる事があるんです 73 | wav/nen015_016.wav|え?それって、どういうことですか? 74 | wav/nen015_031.wav|そんな普通に可愛い服だなんて卑怯ですっ。私なんてこんな恥ずかしい恰好なのにぃ理不尽です~! 75 | wav/nen015_043.wav|魔女の契約の代償……と言うことですか 76 | wav/nen015_056.wav|しいばさんはああ言ってくれましたが、私は別にこの学院を自分の領土だなんて言うつもりはありません 77 | wav/nen016_003.wav|はい。また何か困ったことがあれば、いつでもどうぞ 78 | wav/nen016_014.wav|つまり、私たちはこの部室から出ていかなければいけない、ということですか? 79 | wav/nen016_027.wav|とにかく運営のすべきことは、ほしな君が言ったことと、先生方との折衝もでしょうか? 80 | wav/nen016_039.wav|そういうことでしたら……お願いできますか? 81 | wav/nen016_050.wav|なにか問題がありましたか? 82 | wav/nen017_002.wav|全員揃っていますね。それじゃ行きましょうか 83 | wav/nen017_015.wav|それじゃあ……ここからここまでを、まず完璧に覚えましょう。ここの基礎を覚えてしまえば、次も覚えやすいですから 84 | wav/nen017_028.wav|え?なんですか? 85 | wav/nen017_041.wav|だ、大丈夫……大丈夫なはず……ええ、絶対大丈夫です……おそらく、きっと、多分 86 | wav/nen017_052.wav|確かにそれぐらいの余裕はありますが…… 87 | wav/nen017_064.wav|はい、お疲れ様でした 88 | wav/nen018_012.wav|それじゃあ、一体どうしてですか? 89 | wav/nen018_023.wav|大きな欠伸ですね 90 | wav/nen018_036.wav|ちょっと皮がむけちゃって、真っ赤になっちゃってますよ 91 | wav/nen018_050.wav|ほしな君のことを、応援していますし……それが、応援になるというのでしたら……もう一度 92 | wav/nen019_002.wav|はい。よろしくお願いします 93 | wav/nen019_013.wav|そうですか、ありがとう……ございます 94 | wav/nen019_026.wav|ありがとうございます、しいばさん……言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった 95 | wav/nen019_037.wav|予想よりも多くの人に集まってもらえて、準備してきた者としては嬉しい限りです 96 | wav/nen020_102.wav|ぷぁ、はぁぁ………………疲れました 97 | wav/nen020_114.wav|それに私もほしな君と同じで、あくまで部活の一環ですからね 98 | wav/nen020_127.wav|もぅ、どうしてそういうことを言わせるんですか! 99 | wav/nen020_139.wav|こちらのことは気にしないでいいんですよ?……こうして欠片が戻ったということは、ほしな君も嫌に思ってるわけじゃないんですよね? 100 | wav/nen020_151.wav|はい。ほしな君は気付いていないかもしれませんが、笑顔が以前とは比べ物にならないぐらい自然ですから 101 | wav/nen020_162.wav|かもしれません。でも、そういう部活も楽しくていいものですよ 102 | wav/nen020_404.wav|ありがとうございます 103 | wav/nen101_010.wav|はぁ…… 104 | wav/nen101_024.wav|いえ、買い物ではないんです。今日は色々疲れてしまったので…… 105 | wav/nen101_036.wav|いいんですか? 106 | wav/nen101_048.wav|ほしな君は、このお店に入ったことがあるんですか? 107 | wav/nen101_059.wav|ですが……こうして呪文を唱えなきゃいけないんですよね?とりあえず、初心者はこう頼むべし、って書いてありましたけど 108 | wav/nen101_071.wav|えっ……あの、それって…… 109 | wav/nen101_086.wav|あ、美味しいですね。これがラーメン 110 | wav/nen101_099.wav|あのほしな君、早く行きましょう 111 | wav/nen101_111.wav|え?いえそんな、お礼を言われるような、大層なことは出来ていませんから 112 | wav/nen101_126.wav|はい、さようなら 113 | wav/nen102_005.wav|あの……それで、どうしたんですか?突然電話だなんて 114 | wav/nen102_018.wav|それにですね、今朝に比べると大分マシにはなっています。ですから、このまま大人しくしていれば平気ですよ 115 | wav/nen102_033.wav|どうぞ 116 | wav/nen102_046.wav|私が嘘を吐いていないのは、ほしな君ならわかりますよね? 117 | wav/nen102_057.wav|ですから、む……夢精をしちゃうような……いやらしい夢を見たんじゃないかなっと 118 | wav/nen102_072.wav|私は一人暮らしですから。そういう思い出とは縁遠い生活ですね 119 | wav/nen102_087.wav|今度は、ほしな君がおまじないをかける側になって下さい。そしたらきっと、私の恥ずかしさがわかってもらえるはずです 120 | wav/nen102_099.wav|ひっ、んっ、んん……ふぅ、ふぅ……んんっ、んふぅ……んん…… 121 | wav/nen102_111.wav|はい。約束です 122 | wav/nen102_124.wav|でも……気分が少しマシになったかもしれない。あのおまじないは効くのかな? 123 | wav/nen103_010.wav|だから熱く語らないで下さい、思い出しちゃダメー、手をニギニギさせるのもダメですってばっ 124 | wav/nen103_025.wav|私に、む、夢精……とか言わせたくせに、教えてくれないなんてズルいですよぅ! 125 | wav/nen103_042.wav|ところで話は変わりますが、何かあったんですか?みんな、普段と様子が違うみたいですが 126 | wav/nen103_053.wav|はい。先生が男の人と一緒に歩いているところを見かけましたよ 127 | wav/nen103_070.wav|本命の質問だけでなく、無関係なダミーの質問も織り交ぜれば、怪しさも薄くなりませんか? 128 | wav/nen103_082.wav|わかりました 129 | wav/nen103_095.wav|そうなんですが……見られていないとわかっていても、恥ずかしいんですよぅ、この恰好 130 | wav/nen103_106.wav|というよりも……一緒に行っていいですか?実は私もまだ書いていなくて…… 131 | wav/nen103_123.wav|あ、いえ、その…… 132 | wav/nen103_141.wav|は、はい? 133 | wav/nen103_161.wav|私は、怒られたくないです…… 134 | wav/nen103_175.wav|あの……正直に言います。最近の私は変なんです 135 | wav/nen103_189.wav|い、いえ、そんな風には思っていませんから、平気ですっ 136 | wav/nen103_200.wav|それに……こんな私のことを知りたいって言ってくれたこと……嬉しかったです 137 | wav/nen103_212.wav|こ、子供っぽいですよね? 
138 | wav/nen103_227.wav|なぅぅぅぅぅ……ほしな君のことを思うと心が落ち着かない…… 139 | wav/nen103_240.wav|あっ、うあっ、あぁぁぁぁぁぁぁぁぁぁぁっ 140 | wav/nen103_251.wav|はあ、はあ、はぁああぁぁ……なにこれ、こんなにすごいの、しらない……いつもと、全然違う……んっ、ふーっ……ふーっ…… 141 | wav/nen103_262.wav|はぁ、はぁ、はぁ、はぁはぁはぁぁぁぁぁんっ、ぅぅぅぅぅぅうっ! 142 | wav/nen103_273.wav|ひゃんっ……あ、あ、あぁぁぁ……ヤダぁ、止まらない、止まりませんよぉ……あ、あ、はぁぁぁぁ……っ 143 | wav/nen104_007.wav|はい?なにが……ですか? 144 | wav/nen104_020.wav|だってほしな君が言わせたんじゃないですかぁ 145 | wav/nen104_031.wav|そうです。その通りです。い、今でももうおかしくなっているのに、これ以上は…… 146 | wav/nen104_043.wav|本当にごめんなさい 147 | wav/nen104_054.wav|別に大変と言うほどのことは 148 | wav/nen104_066.wav|そうなんですか?どうしてこんなにすぐに……いつも通り過ごしていたはずなのに 149 | wav/nen104_078.wav|それじゃあ、考えておきます 150 | wav/nen104_092.wav|もしよければ、その相手の怪しい行動についても、教えてもらえますか? 151 | wav/nen104_106.wav|あっ……ぅっ…… 152 | wav/nen104_121.wav|な、なんでもないですよぅ。眠れなかったというだけですから 153 | wav/nen104_136.wav|普通は引きますよね。一晩中オナニーしちゃうような女の子なんて…… 154 | wav/nen104_148.wav|そ、それじゃあ皆さん……あっ、んんっ……私は、お先に、失礼させてもらいます、ね……んんっ 155 | wav/nen104_161.wav|でも、ダメでした。ちょっと……無理そうです。答えは出そうにありません 156 | wav/nen104_173.wav|ほしな君は、私のことを好きって言ってくれてますが私には、ほしな君にも言ってないことが……あるんです 157 | wav/nen104_190.wav|濡れて……ます……発情が止まらなくて……ぅぅ……そ、そんな、ヘンタイな私でも好きって言ってくれますか? 158 | wav/nen104_203.wav|もう無理です。我慢できません。自分が抑えられなくて………………だから先に謝っておきますね。ごめんなさいっ 159 | wav/nen104_214.wav|んふーッ……じゅる、ちゅるるる……れる、れろれろれる……ちゅ、ちゅ……んちゅ 160 | wav/nen104_226.wav|んっ、んんんーーーー……ぷぁ、はぁぁぁーー……はぁーっ……はぁーっ…… 161 | wav/nen104_239.wav|はい。わかりました――んぅっ、あ……あっ、あっ……んんぁ 162 | wav/nen104_252.wav|あっ、はぁ、はぁ、はぁ……んんっ、んんんっ……ふーっ、ふーっ……んっ、んんーーっ 163 | wav/nen104_264.wav|それは……はぁ、はぁ……んんっ、好きな人にされる方が、気持ちよくて……好きです…… 164 | wav/nen104_275.wav|んっ、ひぃぃぁぁぁぁぁあああっ 165 | wav/nen104_287.wav|あ、あ、あの……そんなに、じっくり見ないで下さい……恥ずかしいんですから…… 166 | wav/nen104_299.wav|えっ?それは、やっぱり私のそこ、変ってことですか?色々自分で弄っちゃってるから、変なんですか? 167 | wav/nen104_310.wav|ひぃぁあっ!そっ、そこっ、は……んっ、んんんっ、あ、あ、あ、あ、ああああああっ 168 | wav/nen104_322.wav|ほ、ほしな君は、どうですか? 169 | wav/nen104_333.wav|はっ、はっ、ああぁァァああんっ、びりびり、するぅ……はぁ、はぁ、はぁ……奥まで、きてますぅ 170 | wav/nen104_344.wav|あああっ、頭、くらくらします……はぁはぁはぁ、ん、んんぅぅーーーーーーッ、もっと呼んでぇ、もっと名前を呼んで下さいぃ 171 | wav/nen104_355.wav|きゃ、ぅぁっ……はぁ、はぁ、すごい、出てます、ヌルヌルのが、いっぱいっ 172 | wav/nen104_370.wav|あ、あの、それはまた、後日にお願いします 173 | wav/nen105_010.wav|い、いいですいいです、そんな仰々しいことっ 174 | wav/nen105_026.wav|はい。お疲れ様でした 175 | wav/nen105_041.wav|一人暮らし用の冷蔵庫だと小さいですから。野菜室があるタイプに買い換えようかとずっと悩んでいるんですが…… 176 | wav/nen105_058.wav|それになによりも、好きな人と一緒にいられる時間は私も好きですから 177 | wav/nen105_071.wav|はい。頑張って作りますね 178 | wav/nen105_087.wav|ん、れろ……れる、えるれろれろ……れるん………んっ、ちゅぅぱ、はぁ、はぁ、はぁっ、あぁんっ 179 | wav/nen105_100.wav|あの……別に、そういう行為が嫌というわけじゃないんです。さっき、キスの前に言ったのは本当のことですから 180 | wav/nen105_112.wav|ウソツキ……私のしたいこと、ワガママを言ってもいいって……そう言ってくれたじゃないですか 181 | wav/nen105_127.wav|あ……あの、もう一度触っていいですか?今度はちゃんと優しく、丁寧に触りますから 182 | wav/nen105_139.wav|もし痛かったら言って下さいね。ちゅ、ちゅ……ん……ちゅぅ……んっ、んんっ 183 | wav/nen105_150.wav|んちゅ、じゅる……ちゅ、ちゅ……んんんー、舐めても舐めても、全然綺麗になりませんね。むしろ、ますますベトベトになってるような…… 184 | wav/nen105_162.wav|んぶ……ンッ、ちゅばちゅば……ちゅぶっ、ちゅぶぶ……んんーーっ、じゅるっ……じゅるるるるっ 185 | wav/nen105_173.wav|んーー……じゃあ、見えなくしちゃいます……ん、じゅる、じゅるるる……ちゅ、ちゅぅぅぅぅーーー……ッ 186 | wav/nen105_184.wav|ん……ッッ!?んっ、ぅぅぅっ……ん、んんーーー……んふぅー……ふぅー……ん、んむぅ……んんっ 187 | wav/nen105_195.wav|はぁ……はぁ……んっ、はぁぁぁ……気持ち、よかったですか? 188 | wav/nen106_002.wav|ちょっと待って下さいね。私も、最近は確認をしていなかったので 189 | wav/nen106_014.wav|でも、予定は大丈夫なんですか? 
190 | wav/nen106_026.wav|は、はい。そうですね 191 | wav/nen106_038.wav|んふぅ、んっじゅるっ、ぬちゅくちゅ……んぁ、はぁ、はぁ、はい。もう少し……はぁぁ、あむぅ……れろれるん、れちょれちょ 192 | wav/nen106_050.wav|さようなら。また明日 193 | wav/nen106_065.wav|いつも歩いている道ですから。それに、なるべく明るくて人気のあるところを通ります。大丈夫ですよ 194 | wav/nen106_078.wav|すみません、気を遣わせてしまいまして。でも、本当にそれだけなので、心配は必要ありませんよ 195 | wav/nen106_090.wav|あの、ちょっと待って下さい 196 | wav/nen106_101.wav|そう言ってもらえて嬉しいです 197 | wav/nen106_116.wav|は、はい。もちろんです……私も、大好きな人とキスしたい、です…… 198 | wav/nen106_129.wav|え? 199 | wav/nen106_150.wav|それじゃあ、今日は失礼しますね 200 | wav/nen107_003.wav|それじゃあ、お疲れ様でした。さようなら 201 | wav/nen107_019.wav|そう、ですよね……今みたいな状態を続けても……仕方ないですよね 202 | wav/nen107_035.wav|でも、でも…… 203 | wav/nen107_051.wav|それよりも、結局どうなんですか?私の気持ち、ちゃんと感じてもらえてますか? 204 | wav/nen107_063.wav|それは……うっ……ぅぅぅ~~~……恥ずかしい、ですけど……今は、この温もりに包まれていたいです。そっちの方が重要です 205 | wav/nen108_011.wav|でも急に泊まってもらうことになって……親御さんにもご迷惑を…… 206 | wav/nen108_024.wav|そうなんですか?えっと……気付いていませんでした。むしろ、私の方が甘えちゃっていますから…… 207 | wav/nen108_044.wav|んんっ、ふーっ……ふーっ…… 208 | wav/nen108_056.wav|授業に身が入らなくて……ず、ずっと、考えてたら……はぁ……はぁ……だ、だから……はぁ、はぁ、はぁ、はぁ 209 | wav/nen108_069.wav|それは、だから……下のお口、ですとか……他にもありませんか? 210 | wav/nen108_080.wav|ひあぁぁああぁぁああっ、それ、しび、れる……からだ、痺れちゃうっ、あ、あ、あああああ、そこ、吸うの、あっ、あああっ 211 | wav/nen108_091.wav|ひゃあああぁっ、そんな、おま●こ全部を吸われたらぁ……あ、あ、あ、あ、我慢できませんっ、もう熱いですぅ、身体が熱くて仕方ないんです 212 | wav/nen108_103.wav|ぁぁ……はぁー、はぁー……あ、これぇ、奥まで感じます……んぁぁ、はぁー……はぁー…… 213 | wav/nen108_114.wav|ひっ、ひああぁぁぁああぁ、それ、それ凄いですぅ……はぁはぁはぁ、あああぁぁあああぁっ 214 | wav/nen108_125.wav|あっ、ああっ……やだぁ、エッチな音、してます……私の、おま●こから、エッチな音が……あっあっあっ、でも、我慢できなくてっ 215 | wav/nen108_136.wav|はぁ、はぁ……はぁぁぁ……もう、ドロドロですよ…… 216 | wav/nen108_152.wav|それは……はい。確かにそういう気持ちはあります…… 217 | wav/nen108_168.wav|浮かない表情をしていました…… 218 | wav/nen108_181.wav|それは……どういう意味ですか? 219 | wav/nen109_011.wav|そうですね……カラオケに、ボウリング、プリクラも…… 220 | wav/nen109_025.wav|はい……それじゃあ、えっと、えっと…… 221 | wav/nen109_040.wav|ここがいいでしょうか……それともこっち? 222 | wav/nen109_052.wav|まだ色々やりたいことはあります、それは尽きませんけど………………でも本当に、後悔はしてませんよ 223 | wav/nen109_069.wav|はい。私、幸せになります。それで、しゅうじ君のことも幸せにしてみせます 224 | wav/kne110_008.wav|メッセージ…… 225 | wav/kne110_026.wav|こんな……形だけにこだわった物じゃないんです……でも、それはもう……無理なんですよね 226 | wav/kne110_044.wav|はい。優しそうな人ですから 227 | wav/nen110_013.wav|ぅっ……ぁぁ……ダメ……泣いたり、しない 228 | wav/nen111_006.wav|言いたいこと……ですか? 229 | wav/nen111_019.wav|ギターが欲しいんですよね? 230 | wav/nen111_033.wav|やっぱり、未来が変わっちゃってるんですよね…… 231 | wav/nen111_047.wav|それは、えっと……………… 232 | wav/nen111_062.wav|で、ですから……わ、私の……オナニー………………オナニーですっ 233 | wav/nen111_079.wav|は、はい。大丈夫です。すみません、驚かせてしまいまして 234 | wav/nen111_092.wav|ひぁっ、ぅぅぅ~~~ 235 | wav/nen111_105.wav|保健室に行きますか? 236 | wav/nen111_124.wav|好き……好きです、大好きです……私は貴方のことが大好きです。愛しています。もう離れたりしません 237 | wav/nen112_011.wav|ほしな君はちゃんと以前から、力になってくれていましたよ 238 | wav/nen112_029.wav|私だって嬉しいです。ほしな君が一緒にいてくれて……その、単純に近い場所にいてくれるってことじゃなくてですね 239 | wav/nen112_043.wav|それで、いなばさんは……相談でいいんですよね? 240 | wav/nen112_054.wav|少し考える時間をもらえますか? 241 | wav/nen112_068.wav|いえ、そんなことはありません。私も嬉しいですよ 242 | wav/nen112_080.wav|あ、あの、なんだか凄い騒ぎになってるみたいですけど…… 243 | wav/nen112_094.wav|時と場所さえ考えてもらえれば……私も、や……やぶさかではありませんが……え?え?も、もしかして今日って、そういうことなんですか? 
244 | wav/nen112_108.wav|そ、そうですね。見つかったらデートできなくなってしまいますよね 245 | wav/nen112_121.wav|もぅっ!そんなに連続して呼ばれたら、嬉しすぎておかしくなっちゃいますよぅ 246 | wav/nen112_133.wav|だって美味しいじゃないですか。それにほら、見た目も可愛いです 247 | wav/nen112_145.wav|はぁ……美味しかったです 248 | wav/nen112_159.wav|自分の身体なんですから、当たってることぐらい気付いてます……わかってはいますが……抱きついていたいんです 249 | wav/nen112_173.wav|前は私のしたいことするデートでしたが……今回はしゅうじ君が私のために計画してくれたデートで、どこに行くのかドキドキして 250 | wav/nen112_188.wav|私はしゅうじ君のこと、嫌いになったりなんてしないのに 251 | wav/nen112_204.wav|お、お邪魔します 252 | wav/nen112_217.wav|それにしても、しゅうじ君はお父さんとあんな風に喋るんですね。ちょっと、意外でした 253 | wav/nen112_229.wav|いえ、平気です 254 | wav/nen112_243.wav|お、女の子だって興奮とか、期待とか、もにょもにょしちゃうものなんですよぅ…… 255 | wav/nen112_259.wav|わ、わかりました…… 256 | wav/nen112_270.wav|は、はい……ぅぅぅぁッ……はっ、はぁー、はぁー……お願いします、続けて下さい……もっと、触って 257 | wav/nen112_281.wav|あ、あ、あ、また……やっ、そんなに強く捻っちゃ……ひぁっ、んぃぃ……ッッ 258 | wav/nen112_293.wav|あぁぁ、んぁああぁぁ……ッッ、2回、2回です……んっ、んんぅぅぅぁぁぁあッ、あっ、あっ、あああッッ 259 | wav/nen112_304.wav|だ、だって……5回だなんて……恥ずかしいです。凄くエッチですから…… 260 | wav/nen112_315.wav|ぅぅ……また、そうやって全部言わせて……本当にイジワルですよぅ…… 261 | wav/nen112_327.wav|熱くて……硬くて……はぁ、はぁ、ぁぁぁあっ……前より太くて、おっきい気が、しますぅっ 262 | wav/nen112_339.wav|ちゅっ、んん、ふぅぅ……んっ、んっ、んんぅぅぅぅ……ぅぅーーッ 263 | wav/nen112_350.wav|だって、だって……んっ、ぅぅぅっ……こ、こんなに、グリグリされたら、こんな声も出ちゃいますよぅ……あっ、はぁはぁはぁはぁ 264 | wav/nen112_361.wav|あ、はぁぁぁむ、んちゅ……ちゅ、ちゅ、ちゅ、じゅるる……んちゅ、ちゅぅぅーー……ん、んむぅ、んっ、じゅるる 265 | wav/nen112_372.wav|私も……こんなにイってしまったのは、初めてです……やっぱりオナニーとは、全然違いますね……はっ、はぁぁ…… 266 | wav/nen112_386.wav|いえ、もう起きます 267 | wav/nen113_171.wav|わ……わかりました……それなら……私、命令通りに、オナニーします 268 | wav/nen113_182.wav|ひぁぁ!は、はい、はいっ……んんっ、んんぅぅ……ぅぅあっ、あっ、あっ、あっ 269 | wav/nen113_193.wav|ちがっ、違うんです……お漏らしじゃなくて……ああ、もう……どうしてこんなにビショビショなの?まだ、乳首を刺激してるだけなのに 270 | wav/nen113_205.wav|はぁ、はぁ、こ、ここら辺ですか?もう当たりますか? 271 | wav/nen113_216.wav|ぁっ、ぁっ、ぁっ、ぅぅあっ、なにこれ……ダメっ、ダメっ……あっ、あっ、あっ、ぁぁぁああぁぁあ、イく……イっちゃう 272 | wav/nen113_227.wav|はぁーっ……はぁーっ……気持ちいい、です。クリトリス、気持ちいい…… 273 | wav/nen113_238.wav|んんんっ!んぁっ、んぁっ、ダメ……手が、震えて、あっ、あっ!ローター……当てていられない……あっ、あっ 274 | wav/nen113_249.wav|はっ、はひっ、あっ、あっ、あっ!イ、イく……もう、わらひ、我慢できませんよ……ああっ、あっ、あーーーーッ! 275 | wav/nen113_260.wav|え?あ、ちょっと待って下さい……あっ…… 276 | wav/nen113_272.wav|わかりました。それじゃあ遠慮せず、沢山イきますね……はぁ、はぁ…… 277 | wav/nen113_284.wav|それに動きたいんですよね?気持ちよくなりたいんですよね?さっきから、わたしの中でおち●ちんがビクビク、してますよ 278 | wav/nen113_295.wav|はぁ……はぁ……はぁ……はぁ……あっ……あっ……あっ、ああああぁぁぁぁぁぁぁぁぁああああああーーーーー!! 279 | wav/nen113_306.wav|んふぅ……んっんっんんぅぅぅぁああ!はぁ!はぁ!あああっ、んんんーーーーー……んんぁぁああっ! 280 | wav/nen113_317.wav|んひっ、あっ!あっ!はぁぁ……まだ、出てる……あっ、あっ、あっ、はぁぁ……ん、んんっ! 281 | wav/nen113_328.wav|ちょっと? 282 | wav/nen113_006.wav|そんなことありませんよ。さあ、遠慮せずに中に入って下さい 283 | wav/nen113_017.wav|あのー…… 284 | wav/nen113_031.wav|さ、参考……ですか?川上君が考えたデートプランを実際に試してみる、とかじゃなく? 285 | wav/nen113_044.wav|私はゲームセンターも好きですよ。普段は全然入ったこともありませんから、むしろ楽しみなぐらいです 286 | wav/nen113_056.wav|あっ、しゅうじ君。あっちにもほら、クマのぬいぐるみがありますよ 287 | wav/nen113_067.wav|私のことを考えてくれたからこそ、思い出の方を優先してくれたんですよね? 288 | wav/nen113_081.wav|そうですね。特別やレアって言われてしまうと、試しに頼んでみたくなりますね 289 | wav/nen113_096.wav|なにか違うこと考えてます 290 | wav/nen113_107.wav|いえ、もうジュースが無くなっちゃいましたから…… 291 | wav/nen113_118.wav|でも……いつもよりは、疲れましたよね? 292 | wav/nen113_132.wav|た、確かに……そうですね 293 | wav/nen113_145.wav|それならいいんですが…… 294 | wav/nen113_162.wav|どっ、どうやってって 295 | wav/nen114_017.wav|いえ。むしろ、こちらこそすみません。不透明な活動ばかりで……もっと結果が残るような物があればご迷惑もおかけしなかったんですが…… 296 | wav/nen114_028.wav|それに、パーティーで演奏しないとかりやさんはギターを披露できず、モヤモヤしたままになりませんか? 297 | wav/nen114_042.wav|そっ、その言い方は……卑怯ですよぅ 298 | wav/nen114_056.wav|そこも気になる部分ではあるんですが…… 299 | wav/nen114_072.wav|しゅうじ君を待っていたんです。最近、一緒にいられる時間が少ない気がして……なんとかしたいなと思って、終わるのを待ってたんです 300 | wav/nen114_083.wav|女の子同士でもですか? 
301 | wav/nen114_097.wav|はい、できました 302 | wav/nen114_111.wav|物じゃなくてですね、あの……ですから……しゅうじ君の願い事を、なんでも叶えます、私が 303 | wav/nen114_124.wav|ダメです 304 | wav/nen114_135.wav|んっ、んんーーーッ……んふぅ、ふぅー……ふぅー……んっ、んんっ、んむぅ……んぅ……も、もっと……しゅうじ君、もっと…… 305 | wav/nen114_146.wav|んぷぁぁっ、はっ、はぁ……はぁ……はひっ、んぁぁあ……はぁぁぁ…… 306 | wav/nen115_007.wav|でもその前に、私たちの演奏を聞いて下さい。一生懸命練習してきましたから 307 | wav/nen115_021.wav|しゅうじ君は……誰に投票したんですか? 308 | wav/nen115_037.wav|はぁ、ぁぁむ……ん、んんっ、ちゅちゅ……じゅる、ちゅぱちゅる、んっ、んんんんんーーーーーー 309 | wav/nen115_049.wav|ひっ!?あっ、あっ、あああぁぁーーっ! 310 | wav/nen115_061.wav|ずっとオナニー我慢してて……ぁぁぁぁあっ!しゅうじくん、しゅうじくん……っ、はぁ、はぁ、はぁぁあぁっ 311 | wav/nen115_072.wav|ふぇぇ……?はっ、はぁ、はぁ、はぁ……ど、どうかしたんですか……? 312 | wav/nen115_083.wav|好き、あっ、あっ、あっ、ひゅきでひゅ……おち●ちんにグリグリされるの……あっ、あっ、ああぁぁああっ! 313 | wav/nen115_094.wav|あーー……あはーーー……はひ、はひっ……んへぁぁ……私、こんな下品な声を出してイっちゃった……はぁーっ……はぁーっ…… 314 | wav/nen115_106.wav|んっ、んっ、んんーーーーっ!はひっ、はひっ、んっ、んんんーーーーッ! 315 | wav/nen115_118.wav|んひっ、あっ、あっ、んんっ、んんぁぁあっ、はぁーっ……はぁーっ……あっ、あっ、はぁぁぁぁぁ…… 316 | wav/nen115_129.wav|それに……こんなの、まるでおち●ちんが、私から生えたみたいです。しかも硬いままで…… 317 | wav/nen115_144.wav|んー……こんなものでしょうか 318 | wav/nen115_156.wav|お願い? 319 | wav/nen115_168.wav|んっ、ぅうぅ……はぁ、はぁ……んんっ、んんん…… 320 | wav/nen115_180.wav|はっ、んっ、んんぁっ、んぁっ……ぁぁあぁああぁ……引っかかるの、気持ちいい、です……んんー……ッッ 321 | wav/nen115_191.wav|はぁー……はぁー……はぁー……ぁぁぁ、んんんっ…… 322 | wav/nen115_202.wav|あっ、あっ、あーーーっ……中、中が切なくて……はぁ、はぁ、はぁ、あの、もうオナニーじゃなくなってもいいですか? 323 | wav/nen115_214.wav|だって……んぁぁ、ずっと待ってたんです。欲しくて、我慢してたんです……だから、仕方ないんですよ、ぁぁぁ…… 324 | wav/nen115_225.wav|違う、違うのぉ……身体が勝手に……ん、ん、ん、んぁぁあーーーぁぁぁぁ……こひゅれてる、気持ちいいの、こひゅれてるぅ 325 | wav/nen115_236.wav|おま●こですっ、おま●こに欲しい……んっ、んぁ……精液、こっちで飲みたいんです、んぁ、んぁ、んぁーーっ! 326 | wav/nen115_249.wav|え、えぇぇ……ま、まだ足りないんですか?こんなにドロドロにしたのに…… 327 | wav/nen115_262.wav|そう言ってもらえると……ありがとうございます 328 | wav/nen116_001.wav|はい 329 | wav/nen116_012.wav|私は……別に流されてもいいのに…… 330 | wav/nen116_026.wav|私にできることがあるなら、何でもします。だから、1人で苦しまないで下さい 331 | wav/nen116_039.wav|はい、大丈夫です 332 | wav/nen117_007.wav|どうしてそういうことを言うんですか!私の好きな人なのに! 333 | wav/nen117_020.wav|あとですね、せっかくですからお泊まり用具の他にも色々用意してきたんです 334 | wav/nen117_034.wav|あの、お風呂頂きました。お……お待たせ……しました、しゅうじ君 335 | wav/nen117_047.wav|せっかく気合いを入れて身体も綺麗にしたのに……先に寝ちゃうなんてひどいです 336 | wav/nen117_058.wav|よかった、安心しました 337 | wav/nen117_069.wav|んぅぅ……ちゅ、ちゅ、んんんんーーッ……嫌じゃないですよ?むしろ……私は濃い方が好きかもしれません……ん、じゅる、じゅるりっ 338 | wav/nen117_080.wav|じゃあ、続けますね。ん、ちゅ、ちゅぶぶ……んっ、じゅるっ、じゅぽじゅぽ、ちゅ、ちゅるるっ 339 | wav/nen117_091.wav|んふぅ……ほら、こうして正直に教えてくれます、気持ちいいって 340 | wav/nen117_103.wav|はぁ、はぁ、はぁ……すごい、トロトロと匂いが、さっきから止まりません……ああ、全然綺麗にできない 341 | wav/nen117_114.wav|んぐっ……んぶ、んぶ……ッ……んんんんーーーーーッ!ん、んんーーーー……コク……コク……ん、んんんむぅ 342 | wav/nen117_125.wav|ひゃっ、たくさん……あつい精液、びゅーって飛んで……あ、きゃっ、ひゃっ 343 | wav/nen117_136.wav|ん、ちゅば、ちゅば……んんっ、れろれろ……ンンッ……はぁ、はぁ……れりょれりょ 344 | wav/nen117_147.wav|んっ、んんっ、あむあむ……ぢゅぷ、ぢゅるるる……んぽくぽ、じゅるるるっ 345 | wav/nen117_158.wav|あっ!ダメですよ、これは罰なのに、あ、きゃぁぁぁッ 346 | wav/nen117_171.wav|んっ、んんんぁぁぁぁーーーーーーーー……ッッ! 347 | wav/nen117_183.wav|あ、あ、ああーーーーっ……はぁ、はぁ……あ、あ、あ、それ、すごい……すごいぃぃ……んんぁあッッ 348 | wav/nen117_194.wav|あっ、ひっ、んひぃぃッ……あーっ、あーっ……もうらめぇ…あ、あ、あ、イく、いっっ……くぅぅぅぅーーーーーぅぅぅぅううううッッ!! 349 | wav/nen117_206.wav|んっ、あっ、あっ、あっ、あっ……そうなんですか?わたし、もうちゃんと、覚えてるんですか? 
350 | wav/nen117_217.wav|イっちゃうっ、わたひまたイっちゃうぅぅ……ッ 351 | wav/nen117_228.wav|かひっ、かっ、はぁ、はぁ……んんんっ……はぁ、はぁ、んんっ、んぁ……ぁぁぁぁ…… 352 | wav/nen117_239.wav|それは、ちがっ、えっと、あががががががががががが―― 353 | wav/nen203_010.wav|はい、それは残念ながら 354 | wav/nen203_025.wav|ご協力ありがとうございます。それは思い至ってませんでした、助かりました 355 | wav/nen203_040.wav|心を許しあえるような相手が出来れば、おそらくは 356 | wav/nen203_053.wav|すみません……明日もこうでないといいんですが…… 357 | wav/nen203_065.wav|あの、どうかしたんですか?いなばさん 358 | wav/nen203_080.wav|それはたぶん、昨日話をした、胸の痛みに関わることなんですよね 359 | wav/nen203_095.wav|占いなんて、あくまでも切っ掛けみたいなものですから 360 | wav/nen203_111.wav|あ、あの、優しくしてください……それと、電気を消して……お願いです…… 361 | wav/nen203_127.wav|せっかくですし、一緒に入りませんか? 362 | wav/nen204_006.wav|では、今日はこの辺りで解散にしましょうか 363 | wav/nen205_018.wav|それでですね、ほしな君 364 | wav/nen206_007.wav|ええ、ちょっと 365 | wav/nen206_022.wav|そうですね。少なくとも、自分のせいっていうのはいなばさんの誤解かも知れませんし 366 | wav/nen206_033.wav|やりとりをオープンにした方が、互いに痛くもない腹を探り合わないで済むと思います 367 | wav/nen206_048.wav|もし、木月さんの行方が知れなくなったのが、魔法や契約と絡むことなら―― 368 | wav/nen206_063.wav|だから学院にも、なにも…… 369 | wav/nen207_016.wav|座りましたっ 370 | wav/nen207_031.wav|え?そ、それはもちろんですけど 371 | wav/nen209_001.wav|こんにちは 372 | wav/nen210_009.wav|とりあえず……ほしな君にその、想定外に下着まで見せてしまったんですよね? 373 | wav/nen210_023.wav|放課後、ななおのところまで付き合ってもらえませんか? 374 | wav/nen210_039.wav|お待たせしました 375 | wav/nen211_004.wav|はい。ですからほしな君の中には今、魔女2人のものである欠片がそれぞれにあります 376 | wav/nen211_015.wav|そして、こうなってしまったものは仕方がありませんし、回収不可能なわけでもないんですから 377 | wav/nen212_001.wav|う、上手くいったんですか? 378 | wav/nen212_015.wav|はい、おかげさまで 379 | wav/nen213_011.wav|生まれつき備えてしまっていた、あの能力のせいで 380 | wav/nen213_025.wav|はあ……せ、交尾ですか 381 | wav/nen214_010.wav|い、いえっなんでもっ 382 | wav/nen215_012.wav|それもわかりますけど 383 | wav/nen217_006.wav|とがくし先輩、その―― 384 | wav/nen218_009.wav|そこはまた、ご協力いただければ助かります 385 | wav/nen219_005.wav|ハッピーハロウィンですね、いなばさん 386 | wav/nen301_006.wav|ええ。私の方は、あともう少しで溜まりますから 387 | wav/nen301_017.wav|はい、頑張ります 388 | wav/nen302_010.wav|知っている方なんですか、2人とも? 389 | wav/nen303_003.wav|なるほど。だったら、しいばさんはあまり近づき過ぎない方がいいかもしれません 390 | wav/nen303_014.wav|はい、どうやらほしな君の心の穴が広がってしまった可能性がありそうです 391 | wav/nen303_030.wav|いいんです、ほしな君が吸収してしまった分なら、ほとんど回収した後ですし 392 | wav/nen303_045.wav|ほしな君の心の穴を埋めるのも、しいばさんにお任せした方が効率的かもしれません 393 | wav/nen305_004.wav|こ、交尾をされたわけではないですよね? 394 | wav/nen307_005.wav|もっとも、ほしな君が誘ったのはしいばさんです。しいばさん次第だと思いますが 395 | wav/nen308_007.wav|ですがしばらくの間、話し相手になることにしました 396 | wav/nen310_006.wav|いいんじゃないでしょうか? 397 | wav/nen312_003.wav|どうかしましたか?ほしな君もまだ来てないようですし、気になっていたんですが 398 | wav/nen312_014.wav|いえ、私も何も聞いていませんが 399 | wav/nen314_002.wav|ありがとうございます 400 | wav/nen314_016.wav|魔女を常に見張る者が多いそうです、心当たりはありませんか? 401 | wav/nen314_027.wav|すると心を強引に削り取った痕がみつかったんです! 402 | wav/nen315_002.wav|え、ええ 403 | wav/nen315_013.wav|はい、ですがこの場合、欠片は犯人から奪い返せばいいんです 404 | wav/nen315_024.wav|見つけ出すだけでも、なかなか骨が折れそうですが 405 | wav/nen316_003.wav|別のアルプがいるなら、匂いでわかるというのですが 406 | wav/nen317_008.wav|いえ、厚真さんが預かっていた子犬も、行方がわからなくなっているのを思い出したんですが 407 | wav/nen319_005.wav|人間に見えても、ぼんやりしないでしっかり警戒を 408 | wav/nen401_006.wav|ふー……ふー………………はぁ、美味しい 409 | wav/nen402_007.wav|はい 410 | wav/nen402_020.wav|ちょっと思いつきませんね 411 | wav/nen404_003.wav|もし何かあるなら休んでくれてもいいんですよ? 412 | wav/nen404_014.wav|私に対する罪悪感といいますか、義務感と言いますか……それはきっと同情に近い感情ですから…… 413 | wav/nen405_002.wav|ほしな君。ああいうのは、どうかと思います 414 | wav/nen405_013.wav|はい、何ですか? 
415 | wav/nen405_024.wav|いえ、今日は仕方ありませんよ。相談だけじゃなく、占いを希望する人も来ませんでしたからね 416 | wav/nen406_010.wav|欠片が戻ってきたのは、ほしな君がとがくし先輩とお付き合いをするようになったからだと思うんです 417 | wav/nen406_021.wav|それに……これはあくまで、責めるつもりではなく、色んな人の相談を受けて思った個人的な意見なんですが 418 | wav/nen409_003.wav|あ、ほしな君 419 | wav/nen409_014.wav|魔力の塊をぶつけることで、多少のショックを与えるかもしれないそうですが、先輩の心にひどい影響を与えるものじゃないそうです 420 | wav/nen409_025.wav|私は、この弾丸を撃てばいいわけですね 421 | wav/nen409_038.wav|それでは 422 | wav/nen410_010.wav|それは、ほしな君がオカ研で頑張ってくれた分で相殺です。実際、今のこの欠片の量は、私がほしな君と出会う前より、ほんの少し少ないだけですから 423 | wav/nen410_022.wav|学院内ではあれほどダメだって言ってるじゃないですか 424 | wav/nen504_001.wav|ほしな君、調子はどうですか? 425 | wav/nen505_008.wav|えっと……こ、ここは、励まし会とか開いた方がいいんでしょうか? 426 | wav/nen507_009.wav|なのに、部活を続けたりしたら、擦れ違いですとか、そういうことが心配になって 427 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | import torch 3 | from torch.autograd import Variable 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from layers import ConvNorm, LinearNorm 7 | from utils import to_gpu, get_mask_from_lengths 8 | 9 | 10 | class LocationLayer(nn.Module): 11 | def __init__(self, attention_n_filters, attention_kernel_size, 12 | attention_dim): 13 | super(LocationLayer, self).__init__() 14 | padding = int((attention_kernel_size - 1) / 2) 15 | self.location_conv = ConvNorm(2, attention_n_filters, 16 | kernel_size=attention_kernel_size, 17 | padding=padding, bias=False, stride=1, 18 | dilation=1) 19 | self.location_dense = LinearNorm(attention_n_filters, attention_dim, 20 | bias=False, w_init_gain='tanh') 21 | 22 | def forward(self, attention_weights_cat): 23 | processed_attention = self.location_conv(attention_weights_cat) 24 | processed_attention = processed_attention.transpose(1, 2) 25 | processed_attention = self.location_dense(processed_attention) 26 | return processed_attention 27 | 28 | 29 | class Attention(nn.Module): 30 | def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, 31 | attention_location_n_filters, attention_location_kernel_size): 32 | super(Attention, self).__init__() 33 | self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, 34 | bias=False, w_init_gain='tanh') 35 | self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, 36 | w_init_gain='tanh') 37 | self.v = LinearNorm(attention_dim, 1, bias=False) 38 | self.location_layer = LocationLayer(attention_location_n_filters, 39 | attention_location_kernel_size, 40 | attention_dim) 41 | self.score_mask_value = -float("inf") 42 | 43 | def get_alignment_energies(self, query, processed_memory, 44 | attention_weights_cat): 45 | """ 46 | PARAMS 47 | ------ 48 | query: decoder output (batch, n_mel_channels * n_frames_per_step) 49 | processed_memory: processed encoder outputs (B, T_in, attention_dim) 50 | attention_weights_cat: cumulative and prev. 
att weights (B, 2, max_time) 51 | 52 | RETURNS 53 | ------- 54 | alignment (batch, max_time) 55 | """ 56 | 57 | processed_query = self.query_layer(query.unsqueeze(1)) 58 | processed_attention_weights = self.location_layer(attention_weights_cat) 59 | energies = self.v(torch.tanh( 60 | processed_query + processed_attention_weights + processed_memory)) 61 | 62 | energies = energies.squeeze(-1) 63 | return energies 64 | 65 | def forward(self, attention_hidden_state, memory, processed_memory, 66 | attention_weights_cat, mask): 67 | """ 68 | PARAMS 69 | ------ 70 | attention_hidden_state: attention rnn last output 71 | memory: encoder outputs 72 | processed_memory: processed encoder outputs 73 | attention_weights_cat: previous and cummulative attention weights 74 | mask: binary mask for padded data 75 | """ 76 | alignment = self.get_alignment_energies( 77 | attention_hidden_state, processed_memory, attention_weights_cat) 78 | 79 | if mask is not None: 80 | alignment.data.masked_fill_(mask, self.score_mask_value) 81 | 82 | attention_weights = F.softmax(alignment, dim=1) 83 | attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) 84 | attention_context = attention_context.squeeze(1) 85 | 86 | return attention_context, attention_weights 87 | 88 | 89 | class Prenet(nn.Module): 90 | def __init__(self, in_dim, sizes): 91 | super(Prenet, self).__init__() 92 | in_sizes = [in_dim] + sizes[:-1] 93 | self.layers = nn.ModuleList( 94 | [LinearNorm(in_size, out_size, bias=False) 95 | for (in_size, out_size) in zip(in_sizes, sizes)]) 96 | 97 | def forward(self, x): 98 | for linear in self.layers: 99 | x = F.dropout(F.relu(linear(x)), p=0.5, training=True) 100 | return x 101 | 102 | 103 | class Postnet(nn.Module): 104 | """Postnet 105 | - Five 1-d convolution with 512 channels and kernel size 5 106 | """ 107 | 108 | def __init__(self, hparams): 109 | super(Postnet, self).__init__() 110 | self.convolutions = nn.ModuleList() 111 | 112 | self.convolutions.append( 113 | nn.Sequential( 114 | ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim, 115 | kernel_size=hparams.postnet_kernel_size, stride=1, 116 | padding=int((hparams.postnet_kernel_size - 1) / 2), 117 | dilation=1, w_init_gain='tanh'), 118 | nn.BatchNorm1d(hparams.postnet_embedding_dim)) 119 | ) 120 | 121 | for i in range(1, hparams.postnet_n_convolutions - 1): 122 | self.convolutions.append( 123 | nn.Sequential( 124 | ConvNorm(hparams.postnet_embedding_dim, 125 | hparams.postnet_embedding_dim, 126 | kernel_size=hparams.postnet_kernel_size, stride=1, 127 | padding=int((hparams.postnet_kernel_size - 1) / 2), 128 | dilation=1, w_init_gain='tanh'), 129 | nn.BatchNorm1d(hparams.postnet_embedding_dim)) 130 | ) 131 | 132 | self.convolutions.append( 133 | nn.Sequential( 134 | ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels, 135 | kernel_size=hparams.postnet_kernel_size, stride=1, 136 | padding=int((hparams.postnet_kernel_size - 1) / 2), 137 | dilation=1, w_init_gain='linear'), 138 | nn.BatchNorm1d(hparams.n_mel_channels)) 139 | ) 140 | 141 | def forward(self, x): 142 | for i in range(len(self.convolutions) - 1): 143 | x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training) 144 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 145 | 146 | return x 147 | 148 | 149 | class Encoder(nn.Module): 150 | """Encoder module: 151 | - Three 1-d convolution banks 152 | - Bidirectional LSTM 153 | """ 154 | def __init__(self, hparams): 155 | super(Encoder, self).__init__() 156 | 157 | convolutions = [] 158 | for 
_ in range(hparams.encoder_n_convolutions): 159 | conv_layer = nn.Sequential( 160 | ConvNorm(hparams.encoder_embedding_dim, 161 | hparams.encoder_embedding_dim, 162 | kernel_size=hparams.encoder_kernel_size, stride=1, 163 | padding=int((hparams.encoder_kernel_size - 1) / 2), 164 | dilation=1, w_init_gain='relu'), 165 | nn.BatchNorm1d(hparams.encoder_embedding_dim)) 166 | convolutions.append(conv_layer) 167 | self.convolutions = nn.ModuleList(convolutions) 168 | 169 | self.lstm = nn.LSTM(hparams.encoder_embedding_dim, 170 | int(hparams.encoder_embedding_dim / 2), 1, 171 | batch_first=True, bidirectional=True) 172 | 173 | def forward(self, x, input_lengths): 174 | for conv in self.convolutions: 175 | x = F.dropout(F.relu(conv(x)), 0.5, self.training) 176 | 177 | x = x.transpose(1, 2) 178 | 179 | # pytorch tensor are not reversible, hence the conversion 180 | input_lengths = input_lengths.cpu().numpy() 181 | x = nn.utils.rnn.pack_padded_sequence( 182 | x, input_lengths, batch_first=True) 183 | 184 | self.lstm.flatten_parameters() 185 | outputs, _ = self.lstm(x) 186 | 187 | outputs, _ = nn.utils.rnn.pad_packed_sequence( 188 | outputs, batch_first=True) 189 | 190 | return outputs 191 | 192 | def inference(self, x): 193 | for conv in self.convolutions: 194 | x = F.dropout(F.relu(conv(x)), 0.5, self.training) 195 | 196 | x = x.transpose(1, 2) 197 | 198 | self.lstm.flatten_parameters() 199 | outputs, _ = self.lstm(x) 200 | 201 | return outputs 202 | 203 | 204 | class Decoder(nn.Module): 205 | def __init__(self, hparams): 206 | super(Decoder, self).__init__() 207 | self.n_mel_channels = hparams.n_mel_channels 208 | self.n_frames_per_step = hparams.n_frames_per_step 209 | self.encoder_embedding_dim = hparams.encoder_embedding_dim 210 | self.attention_rnn_dim = hparams.attention_rnn_dim 211 | self.decoder_rnn_dim = hparams.decoder_rnn_dim 212 | self.prenet_dim = hparams.prenet_dim 213 | self.max_decoder_steps = hparams.max_decoder_steps 214 | self.gate_threshold = hparams.gate_threshold 215 | self.p_attention_dropout = hparams.p_attention_dropout 216 | self.p_decoder_dropout = hparams.p_decoder_dropout 217 | 218 | self.prenet = Prenet( 219 | hparams.n_mel_channels * hparams.n_frames_per_step, 220 | [hparams.prenet_dim, hparams.prenet_dim]) 221 | 222 | self.attention_rnn = nn.LSTMCell( 223 | hparams.prenet_dim + hparams.encoder_embedding_dim, 224 | hparams.attention_rnn_dim) 225 | 226 | self.attention_layer = Attention( 227 | hparams.attention_rnn_dim, hparams.encoder_embedding_dim, 228 | hparams.attention_dim, hparams.attention_location_n_filters, 229 | hparams.attention_location_kernel_size) 230 | 231 | self.decoder_rnn = nn.LSTMCell( 232 | hparams.attention_rnn_dim + hparams.encoder_embedding_dim, 233 | hparams.decoder_rnn_dim, 1) 234 | 235 | self.linear_projection = LinearNorm( 236 | hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 237 | hparams.n_mel_channels * hparams.n_frames_per_step) 238 | 239 | self.gate_layer = LinearNorm( 240 | hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1, 241 | bias=True, w_init_gain='sigmoid') 242 | 243 | def get_go_frame(self, memory): 244 | """ Gets all zeros frames to use as first decoder input 245 | PARAMS 246 | ------ 247 | memory: decoder outputs 248 | 249 | RETURNS 250 | ------- 251 | decoder_input: all zeros frames 252 | """ 253 | B = memory.size(0) 254 | decoder_input = Variable(memory.data.new( 255 | B, self.n_mel_channels * self.n_frames_per_step).zero_()) 256 | return decoder_input 257 | 258 | def initialize_decoder_states(self, memory, 
mask): 259 | """ Initializes attention rnn states, decoder rnn states, attention 260 | weights, attention cumulative weights, attention context, stores memory 261 | and stores processed memory 262 | PARAMS 263 | ------ 264 | memory: Encoder outputs 265 | mask: Mask for padded data if training, expects None for inference 266 | """ 267 | B = memory.size(0) 268 | MAX_TIME = memory.size(1) 269 | 270 | self.attention_hidden = Variable(memory.data.new( 271 | B, self.attention_rnn_dim).zero_()) 272 | self.attention_cell = Variable(memory.data.new( 273 | B, self.attention_rnn_dim).zero_()) 274 | 275 | self.decoder_hidden = Variable(memory.data.new( 276 | B, self.decoder_rnn_dim).zero_()) 277 | self.decoder_cell = Variable(memory.data.new( 278 | B, self.decoder_rnn_dim).zero_()) 279 | 280 | self.attention_weights = Variable(memory.data.new( 281 | B, MAX_TIME).zero_()) 282 | self.attention_weights_cum = Variable(memory.data.new( 283 | B, MAX_TIME).zero_()) 284 | self.attention_context = Variable(memory.data.new( 285 | B, self.encoder_embedding_dim).zero_()) 286 | 287 | self.memory = memory 288 | self.processed_memory = self.attention_layer.memory_layer(memory) 289 | self.mask = mask 290 | 291 | def parse_decoder_inputs(self, decoder_inputs): 292 | """ Prepares decoder inputs, i.e. mel outputs 293 | PARAMS 294 | ------ 295 | decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs 296 | 297 | RETURNS 298 | ------- 299 | inputs: processed decoder inputs 300 | 301 | """ 302 | # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels) 303 | decoder_inputs = decoder_inputs.transpose(1, 2) 304 | decoder_inputs = decoder_inputs.view( 305 | decoder_inputs.size(0), 306 | int(decoder_inputs.size(1)/self.n_frames_per_step), -1) 307 | # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels) 308 | decoder_inputs = decoder_inputs.transpose(0, 1) 309 | return decoder_inputs 310 | 311 | def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments): 312 | """ Prepares decoder outputs for output 313 | PARAMS 314 | ------ 315 | mel_outputs: 316 | gate_outputs: gate output energies 317 | alignments: 318 | 319 | RETURNS 320 | ------- 321 | mel_outputs: 322 | gate_outpust: gate output energies 323 | alignments: 324 | """ 325 | # (T_out, B) -> (B, T_out) 326 | alignments = torch.stack(alignments).transpose(0, 1) 327 | # (T_out, B) -> (B, T_out) 328 | gate_outputs = torch.stack(gate_outputs).transpose(0, 1) 329 | gate_outputs = gate_outputs.contiguous() 330 | # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels) 331 | mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous() 332 | # decouple frames per step 333 | mel_outputs = mel_outputs.view( 334 | mel_outputs.size(0), -1, self.n_mel_channels) 335 | # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out) 336 | mel_outputs = mel_outputs.transpose(1, 2) 337 | 338 | return mel_outputs, gate_outputs, alignments 339 | 340 | def decode(self, decoder_input): 341 | """ Decoder step using stored states, attention and memory 342 | PARAMS 343 | ------ 344 | decoder_input: previous mel output 345 | 346 | RETURNS 347 | ------- 348 | mel_output: 349 | gate_output: gate output energies 350 | attention_weights: 351 | """ 352 | cell_input = torch.cat((decoder_input, self.attention_context), -1) 353 | self.attention_hidden, self.attention_cell = self.attention_rnn( 354 | cell_input, (self.attention_hidden, self.attention_cell)) 355 | self.attention_hidden = F.dropout( 356 | self.attention_hidden, self.p_attention_dropout, self.training) 357 | 
358 |         attention_weights_cat = torch.cat(
359 |             (self.attention_weights.unsqueeze(1),
360 |              self.attention_weights_cum.unsqueeze(1)), dim=1)
361 |         self.attention_context, self.attention_weights = self.attention_layer(
362 |             self.attention_hidden, self.memory, self.processed_memory,
363 |             attention_weights_cat, self.mask)
364 | 
365 |         self.attention_weights_cum += self.attention_weights
366 |         decoder_input = torch.cat(
367 |             (self.attention_hidden, self.attention_context), -1)
368 |         self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
369 |             decoder_input, (self.decoder_hidden, self.decoder_cell))
370 |         self.decoder_hidden = F.dropout(
371 |             self.decoder_hidden, self.p_decoder_dropout, self.training)
372 | 
373 |         decoder_hidden_attention_context = torch.cat(
374 |             (self.decoder_hidden, self.attention_context), dim=1)
375 |         decoder_output = self.linear_projection(
376 |             decoder_hidden_attention_context)
377 | 
378 |         gate_prediction = self.gate_layer(decoder_hidden_attention_context)
379 |         return decoder_output, gate_prediction, self.attention_weights
380 | 
381 |     def forward(self, memory, decoder_inputs, memory_lengths):
382 |         """ Decoder forward pass for training
383 |         PARAMS
384 |         ------
385 |         memory: Encoder outputs
386 |         decoder_inputs: Decoder inputs for teacher forcing, i.e. mel-specs
387 |         memory_lengths: Encoder output lengths for attention masking.
388 | 
389 |         RETURNS
390 |         -------
391 |         mel_outputs: mel outputs from the decoder
392 |         gate_outputs: gate outputs from the decoder
393 |         alignments: sequence of attention weights from the decoder
394 |         """
395 | 
396 |         decoder_input = self.get_go_frame(memory).unsqueeze(0)
397 |         decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
398 |         decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
399 |         decoder_inputs = self.prenet(decoder_inputs)
400 | 
401 |         self.initialize_decoder_states(
402 |             memory, mask=~get_mask_from_lengths(memory_lengths))
403 | 
404 |         mel_outputs, gate_outputs, alignments = [], [], []
405 |         while len(mel_outputs) < decoder_inputs.size(0) - 1:
406 |             decoder_input = decoder_inputs[len(mel_outputs)]
407 |             mel_output, gate_output, attention_weights = self.decode(
408 |                 decoder_input)
409 |             mel_outputs += [mel_output.squeeze(1)]
410 |             gate_outputs += [gate_output.squeeze(1)]
411 |             alignments += [attention_weights]
412 | 
413 |         mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
414 |             mel_outputs, gate_outputs, alignments)
415 | 
416 |         return mel_outputs, gate_outputs, alignments
417 | 
418 |     def inference(self, memory):
419 |         """ Decoder inference
420 |         PARAMS
421 |         ------
422 |         memory: Encoder outputs
423 | 
424 |         RETURNS
425 |         -------
426 |         mel_outputs: mel outputs from the decoder
427 |         gate_outputs: gate outputs from the decoder
428 |         alignments: sequence of attention weights from the decoder
429 |         """
430 |         decoder_input = self.get_go_frame(memory)
431 | 
432 |         self.initialize_decoder_states(memory, mask=None)
433 | 
434 |         mel_outputs, gate_outputs, alignments = [], [], []
435 |         while True:
436 |             decoder_input = self.prenet(decoder_input)
437 |             mel_output, gate_output, alignment = self.decode(decoder_input)
438 | 
439 |             mel_outputs += [mel_output.squeeze(1)]
440 |             gate_outputs += [gate_output]
441 |             alignments += [alignment]
442 | 
443 |             if torch.sigmoid(gate_output.data) > self.gate_threshold:
444 |                 break
445 |             elif len(mel_outputs) == self.max_decoder_steps:
446 |                 print("Warning! Reached max decoder steps")
447 |                 break
448 | 
449 |             decoder_input = mel_output
450 | 
451 |         mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
452 |             mel_outputs, gate_outputs, alignments)
453 | 
454 |         return mel_outputs, gate_outputs, alignments
455 | 
456 | 
457 | class Tacotron2(nn.Module):
458 |     def __init__(self, hparams):
459 |         super(Tacotron2, self).__init__()
460 |         self.mask_padding = hparams.mask_padding
461 |         self.fp16_run = hparams.fp16_run
462 |         self.n_mel_channels = hparams.n_mel_channels
463 |         self.n_frames_per_step = hparams.n_frames_per_step
464 |         self.embedding = nn.Embedding(
465 |             hparams.n_symbols, hparams.symbols_embedding_dim)
466 |         std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
467 |         val = sqrt(3.0) * std  # uniform bounds for std
468 |         self.embedding.weight.data.uniform_(-val, val)
469 |         self.encoder = Encoder(hparams)
470 |         self.decoder = Decoder(hparams)
471 |         self.postnet = Postnet(hparams)
472 | 
473 |     def parse_batch(self, batch):
474 |         text_padded, input_lengths, mel_padded, gate_padded, \
475 |             output_lengths = batch
476 |         text_padded = to_gpu(text_padded).long()
477 |         input_lengths = to_gpu(input_lengths).long()
478 |         max_len = torch.max(input_lengths.data).item()
479 |         mel_padded = to_gpu(mel_padded).float()
480 |         gate_padded = to_gpu(gate_padded).float()
481 |         output_lengths = to_gpu(output_lengths).long()
482 | 
483 |         return (
484 |             (text_padded, input_lengths, mel_padded, max_len, output_lengths),
485 |             (mel_padded, gate_padded))
486 | 
487 |     def parse_output(self, outputs, output_lengths=None):
488 |         if self.mask_padding and output_lengths is not None:
489 |             mask = ~get_mask_from_lengths(output_lengths)
490 |             mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
491 |             mask = mask.permute(1, 0, 2)
492 | 
493 |             outputs[0].data.masked_fill_(mask, 0.0)
494 |             outputs[1].data.masked_fill_(mask, 0.0)
495 |             outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies
496 | 
497 |         return outputs
498 | 
499 |     def forward(self, inputs):
500 |         text_inputs, text_lengths, mels, max_len, output_lengths = inputs
501 |         text_lengths, output_lengths = text_lengths.data, output_lengths.data
502 | 
503 |         embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
504 | 
505 |         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
506 | 
507 |         mel_outputs, gate_outputs, alignments = self.decoder(
508 |             encoder_outputs, mels, memory_lengths=text_lengths)
509 | 
510 |         mel_outputs_postnet = self.postnet(mel_outputs)
511 |         mel_outputs_postnet = mel_outputs + mel_outputs_postnet
512 | 
513 |         return self.parse_output(
514 |             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
515 |             output_lengths)
516 | 
517 |     def inference(self, inputs):
518 |         embedded_inputs = self.embedding(inputs).transpose(1, 2)
519 |         encoder_outputs = self.encoder.inference(embedded_inputs)
520 |         mel_outputs, gate_outputs, alignments = self.decoder.inference(
521 |             encoder_outputs)
522 | 
523 |         mel_outputs_postnet = self.postnet(mel_outputs)
524 |         mel_outputs_postnet = mel_outputs + mel_outputs_postnet
525 | 
526 |         outputs = self.parse_output(
527 |             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
528 | 
529 |         return outputs
530 | 
--------------------------------------------------------------------------------
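model.py above is the complete Tacotron 2 network: the symbol embedding feeds the convolutional/BiLSTM Encoder, the attention-based Decoder emits mel frames plus a stop gate, and the Postnet refines the spectrogram, which a neural vocoder such as the bundled WaveGlow submodule can then turn into audio. Below is a minimal inference sketch. It assumes `create_hparams` (hparams.py) and `text_to_sequence` (text package) behave as in the upstream NVIDIA Tacotron 2 code; the checkpoint path and cleaner name are illustrative placeholders, not files or settings guaranteed by this repository.

import torch
from hparams import create_hparams      # assumed API, as in upstream NVIDIA Tacotron 2
from model import Tacotron2
from text import text_to_sequence       # assumed helper from the text package

hparams = create_hparams()
model = Tacotron2(hparams).cuda().eval()

# Hypothetical checkpoint path; train.py is assumed to store weights under 'state_dict'.
checkpoint = torch.load("outdir/checkpoint_10000", map_location="cuda")
model.load_state_dict(checkpoint["state_dict"])

# Convert text to a (1, T_in) batch of symbol IDs; the cleaner name is an assumption for this fork.
sequence = text_to_sequence("konnichiwa.", ["japanese_cleaners"])
sequence = torch.LongTensor(sequence).unsqueeze(0).cuda()

with torch.no_grad():
    mel, mel_postnet, gate, alignments = model.inference(sequence)

# mel_postnet has shape (1, n_mel_channels, T_out) and is the spectrogram a vocoder converts to a waveform.

Because Tacotron2.inference returns the same four-element output as parse_output, the unpacking above mirrors what training code receives, with mel_postnet being the refined spectrogram normally passed on to WaveGlow.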