├── AudioSamples
│   ├── step000009300_predicted.wav
│   ├── step000009400_predicted.wav
│   └── step000009500_predicted.wav
├── Cloning_Audio
│   ├── cloning_text.txt
│   └── speakers_cloned_voices_mel.p
├── Encoder.py
├── Img
│   ├── Epoch Loss.png
│   └── Workflow.png
├── Modules
│   ├── Attention.py
│   ├── CloningSamplesAttention.py
│   ├── Conv1dGLU.py
│   ├── MultiHeadAttention.py
│   ├── SpectralProcessing.py
│   └── TemporalProcessing.py
├── README.md
├── checkpoints
│   └── encoder_checkpoint.pth
├── dv3
│   ├── __init__.py
│   ├── audio.py
│   ├── compute_timestamp_ratio.py
│   ├── deepvoice3_pytorch
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── builder.cpython-36.pyc
│   │   │   ├── conv.cpython-36.pyc
│   │   │   ├── deepvoice3.cpython-36.pyc
│   │   │   ├── modules.cpython-36.pyc
│   │   │   └── version.cpython-36.pyc
│   │   ├── builder.py
│   │   ├── conv.py
│   │   ├── deepvoice3.py
│   │   ├── frontend
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   └── __init__.cpython-36.pyc
│   │   │   ├── en
│   │   │   │   ├── __init__.py
│   │   │   │   └── __pycache__
│   │   │   │       └── __init__.cpython-36.pyc
│   │   │   ├── jp
│   │   │   │   ├── __init__.py
│   │   │   │   └── __pycache__
│   │   │   │       └── __init__.cpython-36.pyc
│   │   │   └── text
│   │   │       ├── __init__.py
│   │   │       ├── __pycache__
│   │   │       │   ├── __init__.cpython-36.pyc
│   │   │       │   ├── cleaners.cpython-36.pyc
│   │   │       │   ├── cmudict.cpython-36.pyc
│   │   │       │   ├── numbers.cpython-36.pyc
│   │   │       │   └── symbols.cpython-36.pyc
│   │   │       ├── cleaners.py
│   │   │       ├── cmudict.py
│   │   │       ├── numbers.py
│   │   │       └── symbols.py
│   │   ├── modules.py
│   │   ├── nyanko.py
│   │   └── version.py
│   ├── deepvoice3_vctk.json
│   ├── hparams.py
│   ├── jsut.py
│   ├── ljspeech.py
│   ├── lrschedule.py
│   ├── preprocess.py
│   ├── setup.py
│   ├── synthesis.py
│   ├── tests
│   │   ├── test_conv.py
│   │   ├── test_deepvoice3.py
│   │   ├── test_embedding.py
│   │   ├── test_frontend.py
│   │   └── test_nyanko.py
│   ├── train.py
│   ├── vctk.py
│   └── vctk_preprocess
│       ├── .gitignore
│       ├── README.md
│       ├── extract_feats.py
│       ├── prepare_htk_alignments_vctk.py
│       └── prepare_vctk_labels.py
├── setup.py
├── speaker_adaptation.py
├── train_dv3.py
├── train_encoder.py
├── train_whole.py
└── utils.py
/AudioSamples/step000009300_predicted.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/AudioSamples/step000009300_predicted.wav
--------------------------------------------------------------------------------
/AudioSamples/step000009400_predicted.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/AudioSamples/step000009400_predicted.wav
--------------------------------------------------------------------------------
/AudioSamples/step000009500_predicted.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/AudioSamples/step000009500_predicted.wav
--------------------------------------------------------------------------------
/Cloning_Audio/cloning_text.txt:
--------------------------------------------------------------------------------
1 | Prosecutors have opened a massive investigation into allegations of fixing games and illegal betting.
2 | Different telescope designs perform differently and have different strengths and weaknesses.
3 | We can continue to strengthen the education of good lawyers.
4 | Feedback must be timely and accurate throughout the project.
5 | Humans should also judge the distance by using relative sizes of the objects.
6 | Churches should not encourage it or make it look harmless.
7 | Learn about setting up wireless network configuration.
8 | You can eat them fresh cooked or fermented.
9 | If this is true then those who tend to think creatively really are somehow different.
10 | She will likely jump for joy and want to skip straight for the honeymoon.
11 | The sugar syrup should create very fine strands of sugar that drape off the handles.
12 | But really in the grand scheme of things this information is insignificant.
13 | I let the positive overrule the negative.
14 | He wiped his brow with his forearm.
15 | Instead of fixing it they give it a nickname.
16 | About half the people who are infected also lose weight.
17 | The second half of the book focuses on argument and essay writing.
18 | We have the means to help ourselves.
19 | The large items are put into containers for disposal.
20 | He loves to watch me drink this stuff.
21 | Still it is an odd fashion choice.
22 | Funding is always an issue after the fact.
23 | Let us encourage each other.
24 |
--------------------------------------------------------------------------------
/Cloning_Audio/speakers_cloned_voices_mel.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/Cloning_Audio/speakers_cloned_voices_mel.p
--------------------------------------------------------------------------------
/Encoder.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import torch.nn as nn
4 | from torch.autograd import Variable
5 | import librosa
6 | import torch.nn.functional as F
7 | from Modules.SpectralProcessing import SpectralProcessing
8 | from Modules.TemporalProcessing import TemporalProcessing
9 | from Modules.CloningSamplesAttention import CloningSamplesAttention
10 |
11 |
12 | class Encoder(nn.Module):
13 | global batch_size
14 | global N_samples
15 | def __init__(self):
16 | super(Encoder, self).__init__()
17 | self.spectral_layer = SpectralProcessing(80)
18 | self.temporal_layer = TemporalProcessing()
19 | self.cloning_attention_layer = CloningSamplesAttention()
20 |
21 | def forward(self, x):
22 | #print(x)
23 | x = self.spectral_layer(x)
24 | x = self.temporal_layer(x)
25 | x = self.cloning_attention_layer(x)
26 |
27 | print(x.size())
28 |
29 | return x
30 |
31 |
32 |
33 | #def Temp_Masking(x):
34 |     #Create function for temporal masking. Use librosa.decompose.hpss. Split and concatenate dimensions to make it 2D.
--------------------------------------------------------------------------------
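Note: based on the modules above, the encoder expects a 4-D batch of mel spectrograms shaped (batch, N_samples=23, frames, 80) and pools them into one 512-dimensional speaker embedding per batch element. A minimal smoke-test sketch; the batch size and frame count below are arbitrary placeholders:

```python
import torch
from Encoder import Encoder

encoder = Encoder()
mels = torch.randn(4, 23, 64, 80)   # (batch, cloning samples, mel frames, mel bins)
if torch.cuda.is_available():
    # MultiHeadAttention allocates its scaling constant on the GPU whenever
    # CUDA is available, so the model and input must live there too.
    encoder, mels = encoder.cuda(), mels.cuda()
with torch.no_grad():
    speaker_embedding = encoder(mels)
print(speaker_embedding.shape)      # torch.Size([4, 512])
```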
/Img/Epoch Loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/Img/Epoch Loss.png
--------------------------------------------------------------------------------
/Img/Workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/Img/Workflow.png
--------------------------------------------------------------------------------
/Modules/Attention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import sys
5 |
6 | from Modules.MultiHeadAttention import MultiHeadAttention
7 |
8 | class Attention(nn.Module):
9 | def __init__(self, dim):
10 | super(Attention, self).__init__()
11 |
12 | self.encoders = self._build_model(dim)
13 |
14 | def _build_model(self, dim):
15 | layers = []
16 | dim = dim
17 | layers.append(MultiHeadAttention(dim, dim, dim))
18 |
19 | return nn.ModuleList(layers)
20 |
21 | def forward(self, inputs):
22 | net_inputs = inputs
23 |         net_inputs = net_inputs.contiguous()
24 | for enc in self.encoders:
25 | net_inputs = enc(net_inputs, net_inputs)
26 | return net_inputs
27 |
--------------------------------------------------------------------------------
/Modules/CloningSamplesAttention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import sys
5 |
6 | from Modules.Attention import Attention
7 |
8 | class CloningSamplesAttention(nn.Module):
9 | '''
10 |     Implementation of the final cloning-sample attention block.
11 |     It consists of a residual linear connection, a multi-head attention layer,
12 |     and linear layers.
13 | '''
14 |
15 | def __init__(self):
16 | super(CloningSamplesAttention,self).__init__()
17 | self.residual_linear_layer = nn.Linear(128,512)
18 | self.attention = Attention(128)
19 | self.fc_after_attention = nn.Linear(128,1)
20 |
21 | def forward(self,x):
22 |
23 | residual_linear_x = self.residual_linear_layer(x)
24 |         x = x.contiguous()
25 | # attention layer
26 | x = self.attention(x)
27 | # linear layers
28 | x = self.fc_after_attention(x)
29 | x = torch.squeeze(x)
30 | x = F.softsign(x)
31 | x = F.normalize(x, dim = 1)
32 | x = torch.unsqueeze(x, dim=2)
33 | x = torch.bmm(x.transpose(1,2), residual_linear_x)
34 | x = torch.squeeze(x)
35 |
36 | return x
37 |
--------------------------------------------------------------------------------
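The pooling step above reduces the per-sample features to a single speaker embedding: the softsign-normalized attention scores form a (batch, 1, N_samples) weight row that is batch-matrix-multiplied with the (batch, N_samples, 512) residual projection. A shape sketch with placeholder sizes:

```python
import torch
from Modules.CloningSamplesAttention import CloningSamplesAttention

pool = CloningSamplesAttention()
x = torch.randn(4, 23, 128)   # (batch, cloning samples, per-sample features)
if torch.cuda.is_available():
    pool, x = pool.cuda(), x.cuda()   # the inner attention expects CUDA tensors when available
embedding = pool(x)           # (4, 1, 23) bmm (4, 23, 512), then squeezed to (4, 512)
print(embedding.shape)        # torch.Size([4, 512])
```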
/Modules/Conv1dGLU.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 | import sys
7 |
8 | class Conv1dGLU(nn.Module):
9 | '''
10 |     Implementation of Conv1d + GLU (Gated Linear Unit)
11 |     with a residual connection.
12 |     For GLU, see https://arxiv.org/abs/1612.08083.
13 | '''
14 | def __init__(self, in_channels=128, out_channels=128,padding = None,
15 | dilation = 2,kernel_size=12,*args, **kwargs):
16 | super(Conv1dGLU, self).__init__()
17 |         if padding is None:
18 | padding = int(((kernel_size-1)/2)*dilation)
19 | self.conv1 = nn.Conv1d(in_channels, out_channels=2 * out_channels,
20 | padding=padding, dilation = dilation,
21 | kernel_size=kernel_size)
22 |
23 | def forward(self, x):
24 | residual = x
25 | x = self.conv1(x)
26 | x1, x2 = torch.split(x, split_size_or_sections = 128, dim = 1)
27 | x = x1 * torch.sigmoid(x2)
28 | x += residual
29 | x *= math.sqrt(0.5)
30 | return x
31 |
32 |
--------------------------------------------------------------------------------
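With the defaults above (kernel_size=12, dilation=2) the derived padding is int(((12 - 1) / 2) * 2) = 11, which keeps the sequence length unchanged so the residual addition lines up. A quick check with placeholder shapes:

```python
import torch
from Modules.Conv1dGLU import Conv1dGLU

glu = Conv1dGLU(in_channels=128, out_channels=128, kernel_size=12, dilation=2)
x = torch.randn(8, 128, 64)   # (batch, channels, frames)
# L_out = 64 + 2*11 - 2*(12 - 1) = 64, so the shapes match for the residual add
print(glu(x).shape)           # torch.Size([8, 128, 64])
```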
/Modules/MultiHeadAttention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import torch.nn.parameter as parameter
6 |
7 | class MultiHeadAttention(nn.Module):
8 | def __init__(self, query_dim, key_dim, num_units, dropout_p=0.5, h=2, is_masked=False):
9 | super(MultiHeadAttention, self).__init__()
10 |
11 | if query_dim != key_dim:
12 | raise ValueError("query_dim and key_dim must be the same")
13 | if num_units % h != 0:
14 | raise ValueError("num_units must be dividable by h")
15 | if query_dim != num_units:
16 | raise ValueError("to employ residual connection, the number of query_dim and num_units must be the same")
17 | self.cuda = False
18 | if torch.cuda.is_available():
19 | self.cuda=True
20 |
21 | self._num_units = num_units
22 | self._h = h
23 | if self.cuda:
24 | self._key_dim = Variable(torch.cuda.FloatTensor([key_dim]))
25 | else:
26 | self._key_dim = Variable(torch.FloatTensor([key_dim]))
27 | self._dropout_p = dropout_p
28 | self._is_masked = is_masked
29 |
30 | self.query_layer = nn.Linear(query_dim, num_units, bias=False)
31 | self.key_layer = nn.Linear(key_dim, num_units, bias=False)
32 | self.value_layer = nn.Linear(key_dim, num_units, bias=False)
33 | #self.bn = nn.BatchNorm1d(num_units)
34 |
35 | def forward(self, query, keys):
36 | Q = F.elu(self.query_layer(query))
37 | K = F.elu(self.key_layer(keys))
38 | V = F.elu(self.value_layer(keys))
39 |
40 | chunk_size = int(self._num_units / self._h)
41 | Q = torch.cat(Q.split(split_size=chunk_size, dim=2), dim=0)
42 | K = torch.cat(K.split(split_size=chunk_size, dim=2), dim=0)
43 | V = torch.cat(V.split(split_size=chunk_size, dim=2), dim=0)
44 |
45 | attention = torch.matmul(Q, K.transpose(1, 2))
46 | attention = attention / torch.sqrt(self._key_dim)
47 |
48 | if self._is_masked:
49 | diag_vals = attention[0].sign().abs()
50 | diag_mat = diag_vals.tril()
51 | diag_mat = diag_mat.unsqueeze(0).expand(attention.size())
52 |
53 |             mask = Variable(torch.ones(diag_mat.size()) * (-2**32 + 1),
54 |                             requires_grad=False).type_as(attention)
55 |
56 | attention = (attention * diag_mat) + (mask * (diag_mat-1).abs())
57 | attention = F.softmax(attention, dim=-1)
58 | attention = F.dropout(attention, self._dropout_p)
59 | attention = torch.matmul(attention, V)
60 | restore_chunk_size = int(attention.size(0) / self._h)
61 | attention = torch.cat(
62 | attention.split(split_size=restore_chunk_size, dim=0), dim=2)
63 | attention += query
64 | attention = attention.transpose(1, 2)
65 |         attention = attention.contiguous()
66 | #attention = self.bn(attention).transpose(1, 2)
67 |
68 | attention = F.normalize(attention, dim = 1).transpose(1, 2)
69 | return attention
--------------------------------------------------------------------------------
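With the defaults (h=2 heads, num_units=128), the 128-dimensional query/key/value projections are split into two 64-dimensional chunks, folded into the batch axis, attended over, and concatenated back, so the output shape matches the input. A self-attention sketch with placeholder shapes:

```python
import torch
from Modules.MultiHeadAttention import MultiHeadAttention

attn = MultiHeadAttention(query_dim=128, key_dim=128, num_units=128)
x = torch.randn(4, 23, 128)           # (batch, cloning samples, features)
if torch.cuda.is_available():
    attn, x = attn.cuda(), x.cuda()   # the scale tensor is created on CUDA when available
out = attn(x, x)                      # self-attention over the 23 samples
print(out.shape)                      # torch.Size([4, 23, 128])
```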
/Modules/SpectralProcessing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class PreNet(nn.Module):
7 | '''
8 | 2-layer prenet
9 |     The first layer is linear; the second is an ELU activation.
10 | '''
11 |
12 | def __init__(self , f_mel=80,f_mapped=128):
13 | super(PreNet,self).__init__()
14 | self.linear_1 = nn.Linear(f_mel,f_mapped)
15 |
16 | def forward(self,x):
17 | x = F.elu(self.linear_1(x))
18 | return x
19 |
20 | class SpectralProcessing(nn.Module):
21 | '''
22 |     Spectral transformation layer that maps the mel
23 |     spectrogram to size 128.
24 | '''
25 | def __init__(self,f_mel=80):
26 | super(SpectralProcessing,self).__init__()
27 | self.prenet_1 = PreNet(f_mel,128)
28 |
29 | def forward(self,x):
30 | mapped_x = self.prenet_1(x)
31 |
32 | return mapped_x
33 |
--------------------------------------------------------------------------------
/Modules/TemporalProcessing.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 |
6 | from Modules.Conv1dGLU import Conv1dGLU
7 |
8 | N_samples = 23
9 |
10 | def Temp_Masking(x):
11 | '''
12 | Create function for temporal masking. Use librosa.decompose.hpss.
13 |     Split and concatenate dimensions to make it 2D.
14 |
15 | '''
16 | pass
17 |
18 |
19 | class TemporalProcessing(nn.Module):
20 | '''
21 | Implementation of Temporal Processing Layers
22 | '''
23 |
24 | def __init__(self,in_channels=128, out_channels=128,padding = None,
25 | dilation = 2,kernel_size=12):
26 | super(TemporalProcessing,self).__init__()
27 | self.conv1d_glu = Conv1dGLU(in_channels,out_channels,padding,dilation,
28 | kernel_size)
29 |
30 |
31 |
32 | def forward(self,x):
33 | batch_size = x.size(0)
34 | # transpose to do operation on the temporal dimension
35 | x = x.view(batch_size*N_samples, x.size(2), x.size(3)).transpose(1,2)
36 | x = self.conv1d_glu(x)
37 | x = x.transpose(1,2)
38 |
39 |         x = x.contiguous()
40 | x = x.view(batch_size,N_samples,x.size(1),x.size(2))
41 | #x = librosa.decompose.hpss(x)[0]
42 | # temporal masking on x
43 | x = x.mean(dim=2)
44 |
45 | return x
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Neural_Voice_Cloning
2 | 1. Baidu Research **[Link](https://arxiv.org/pdf/1802.06006.pdf)**
3 | 2. Tested Speaker Audio **[Link](https://visionbrain.github.io/voicecloning.github.io/)**
4 |
5 | ### Abstract :
6 | * **Voice cloning is a highly desired feature for personalized speech interfaces. We introduce a neural voice cloning system that learns to synthesize a person’s voice from only a few audio samples. We study two approaches: speaker adaptation and speaker encoding.**
7 | * **Speaker adaptation is based on fine-tuning a multi-speaker generative model. Speaker encoding is based on training a separate model to directly infer a new speaker embedding, which is then applied to a multi-speaker generative model. While speaker adaptation achieves slightly better naturalness and similarity, the cloning time and required memory of the speaker encoding approach are significantly lower, making it more favorable for low-resource deployment.**
8 |
9 | ### Steps :
10 |
11 |
12 |
13 |
14 | ### Audio :
15 | Tested Speaker Audio **[Link](https://visionbrain.github.io/voicecloning.github.io/)**
16 | * But don't expect anything right.
17 | * I won't make an official complaint.
18 | * They make a selective perception process.
19 |
20 | ### Made By-
21 | * **[VisionBrain](https://visionbrain.org) & Team**
22 | * **Project Lead - [Aryan Karn](https://github.com/Aryan05)**
23 |
--------------------------------------------------------------------------------
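How the pieces in this repository appear to fit together, as a minimal sketch. Assumptions: the pickle in Cloning_Audio holds per-speaker mel batches shaped (23, frames, 80), and checkpoints/encoder_checkpoint.pth stores the encoder state_dict; the actual training and cloning loops live in train_encoder.py, train_whole.py and speaker_adaptation.py, whose contents are not included in this section.

```python
import pickle
import torch
from Encoder import Encoder
from dv3 import build_deepvoice_3

# Multi-speaker DeepVoice3 synthesizer built from the bundled VCTK preset.
synthesizer = build_deepvoice_3(preset="./dv3/deepvoice3_vctk.json")

# Speaker encoder trained by train_encoder.py (assumes the file is a state_dict).
encoder = Encoder()
encoder.load_state_dict(torch.load("checkpoints/encoder_checkpoint.pth",
                                   map_location="cpu"))
encoder.eval()

# Mel spectrograms of the cloning samples for each target speaker.
with open("Cloning_Audio/speakers_cloned_voices_mel.p", "rb") as f:
    cloning_mels = pickle.load(f)

# One 512-dim speaker embedding from the first speaker's cloning samples.
mels = torch.as_tensor(cloning_mels[0]).float().unsqueeze(0)
with torch.no_grad():
    speaker_embedding = encoder(mels)
```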
/checkpoints/encoder_checkpoint.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/checkpoints/encoder_checkpoint.pth
--------------------------------------------------------------------------------
/dv3/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import librosa
4 | import librosa.display
5 | # import IPython
6 | # from IPython.display import Audio
7 | # need this for English text processing frontend
8 | import nltk
9 |
10 | import dv3.train
11 | import dv3.synthesis
12 | # print(os.getcwd())
13 |
14 | import dv3.hparams
15 | from dv3.hparams import hparams, hparams_debug_string
16 | import json
17 |
18 | from dv3.train import build_model
19 | from dv3.train import restore_parts, load_checkpoint
20 | from dv3.synthesis import tts as _tts
21 |
22 |
23 | from dv3.deepvoice3_pytorch import frontend
24 |
25 | # print(os.getcwd())
26 |
27 |
28 | def build_deepvoice_3(preset = None ,checkpoint_path = None):
29 | if preset is None:
30 | preset = "./dv3/deepvoice3_vctk.json"
31 |
32 | # Newly added params. Need to inject dummy values
33 | for dummy, v in [("fmin", 0), ("fmax", 0),
34 | ("rescaling", False),
35 | ("rescaling_max", 0.999),
36 | ("allow_clipping_in_normalization", False)]:
37 |
38 | if hparams.get(dummy) is None:
39 | hparams.add_hparam(dummy, v)
40 | # Load parameters from preset
41 | with open(preset) as f:
42 | hparams.parse_json(f.read())
43 |
44 |     # Tell the builder we are using the multi-speaker DeepVoice3 model
45 | hparams.builder = "deepvoice3_multispeaker"
46 |
47 | # Inject frontend text processor
48 | dv3.synthesis._frontend = getattr(frontend, "en")
49 | dv3.train._frontend = getattr(frontend, "en")
50 |
51 |     # aliases
52 | fs = hparams.sample_rate
53 | hop_length = hparams.hop_size
54 | model = build_model()
55 |
56 | if checkpoint_path is not None:
57 | model = load_checkpoint(checkpoint_path, model, None, True)
58 |
59 |
60 |
61 | return model
62 | # model = build_deepvoice_3()
63 |
--------------------------------------------------------------------------------
/dv3/audio.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import librosa.filters
3 | import math
4 | import numpy as np
5 | from scipy import signal
6 | from dv3.hparams import hparams
7 | from scipy.io import wavfile
8 |
9 | import lws
10 |
11 |
12 | def load_wav(path):
13 | return librosa.core.load(path, sr=hparams.sample_rate)[0]
14 |
15 |
16 | def save_wav(wav, path):
17 | wav *= 32767 / max(0.01, np.max(np.abs(wav)))
18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))
19 |
20 |
21 | def preemphasis(x):
22 | from nnmnkwii.preprocessing import preemphasis
23 | return preemphasis(x, hparams.preemphasis)
24 |
25 |
26 | def inv_preemphasis(x):
27 | from nnmnkwii.preprocessing import inv_preemphasis
28 | return inv_preemphasis(x, hparams.preemphasis)
29 |
30 |
31 | def spectrogram(y):
32 | D = _lws_processor().stft(preemphasis(y)).T
33 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
34 | return _normalize(S)
35 |
36 |
37 | def inv_spectrogram(spectrogram):
38 | '''Converts spectrogram to waveform using librosa'''
39 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear
40 | processor = _lws_processor()
41 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
42 | y = processor.istft(D).astype(np.float32)
43 | return inv_preemphasis(y)
44 |
45 |
46 | def melspectrogram(y):
47 | D = _lws_processor().stft(preemphasis(y)).T
48 | S = _amp_to_db(_linear_to_mel(np.abs(D)))
49 | return _normalize(S)
50 |
51 |
52 | def _lws_processor():
53 | return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech")
54 |
55 |
56 | # Conversions:
57 |
58 |
59 | _mel_basis = None
60 |
61 |
62 | def _linear_to_mel(spectrogram):
63 | global _mel_basis
64 | if _mel_basis is None:
65 | _mel_basis = _build_mel_basis()
66 | return np.dot(_mel_basis, spectrogram)
67 |
68 |
69 | def _build_mel_basis():
70 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels)
71 |
72 |
73 | def _amp_to_db(x):
74 | return 20 * np.log10(np.maximum(1e-5, x))
75 |
76 |
77 | def _db_to_amp(x):
78 | return np.power(10.0, x * 0.05)
79 |
80 |
81 | def _normalize(S):
82 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
83 |
84 |
85 | def _denormalize(S):
86 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
87 |
--------------------------------------------------------------------------------
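A short feature-extraction sketch using the helpers above. It relies on the hyper-parameters defined in dv3/hparams.py and on the lws and nnmnkwii dependencies, and importing the package also runs the heavy imports in dv3/__init__.py; the wav paths are placeholders.

```python
from dv3 import audio

wav = audio.load_wav("utterance.wav")        # resampled to hparams.sample_rate
mel = audio.melspectrogram(wav)              # (num_mels, frames), normalized to [0, 1]
linear = audio.spectrogram(wav)              # linear-frequency spectrogram
audio.save_wav(audio.inv_spectrogram(linear), "reconstructed.wav")
```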
/dv3/compute_timestamp_ratio.py:
--------------------------------------------------------------------------------
1 | """Compute output/input timestamp ratio.
2 |
3 | usage: compute_timestamp_ratio.py [options] <data_root>
4 |
5 | options:
6 |     --hparams=<params>       Hyper parameters [default: ].
7 | -h, --help Show this help message and exit
8 | """
9 | from docopt import docopt
10 | import sys
11 | import numpy as np
12 | from dv3.hparams import hparams, hparams_debug_string
13 | import dv3.train
14 | from dv3.train import TextDataSource, MelSpecDataSource
15 | from nnmnkwii.datasets import FileSourceDataset
16 | from tqdm import trange
17 | from dv3.deepvoice3_pytorch import frontend
18 |
19 | if __name__ == "__main__":
20 | args = docopt(__doc__)
21 |     data_root = args["<data_root>"]
22 |
23 | # Override hyper parameters
24 | hparams.parse(args["--hparams"])
25 | assert hparams.name == "deepvoice3"
26 |
27 |     dv3.train._frontend = getattr(frontend, hparams.frontend)
28 |
29 | # Code below
30 | X = FileSourceDataset(TextDataSource(data_root))
31 | Mel = FileSourceDataset(MelSpecDataSource(data_root))
32 |
33 | in_sizes = []
34 | out_sizes = []
35 | for i in trange(len(X)):
36 | x, m = X[i], Mel[i]
37 | if X.file_data_source.multi_speaker:
38 | x = x[0]
39 | in_sizes.append(x.shape[0])
40 | out_sizes.append(m.shape[0])
41 |
42 | in_sizes = np.array(in_sizes)
43 | out_sizes = np.array(out_sizes)
44 |
45 | input_timestamps = np.sum(in_sizes)
46 | output_timestamps = np.sum(out_sizes) / hparams.outputs_per_step / hparams.downsample_step
47 |
48 | print(input_timestamps, output_timestamps, output_timestamps / input_timestamps)
49 | sys.exit(0)
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from .version import __version__
4 |
5 | import torch
6 | from torch import nn
7 |
8 | from .modules import Embedding
9 |
10 |
11 | class MultiSpeakerTTSModel(nn.Module):
12 | """Attention seq2seq model + post processing network
13 | """
14 |
15 | def __init__(self, seq2seq, postnet,
16 | mel_dim=80, linear_dim=513,
17 | n_speakers=1, speaker_embed_dim=16, padding_idx=None,
18 | trainable_positional_encodings=False,
19 | use_decoder_state_for_postnet_input=False,
20 | speaker_embedding_weight_std=0.01,
21 | freeze_embedding=False):
22 | super(MultiSpeakerTTSModel, self).__init__()
23 | self.seq2seq = seq2seq
24 |         self.postnet = postnet  # referred to as the "Converter" in DeepVoice3
25 | self.mel_dim = mel_dim
26 | self.linear_dim = linear_dim
27 | self.trainable_positional_encodings = trainable_positional_encodings
28 | self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input
29 | self.freeze_embedding = freeze_embedding
30 |
31 | # Speaker embedding
32 | if n_speakers > 1:
33 | self.embed_speakers = Embedding(
34 | n_speakers, speaker_embed_dim, padding_idx=None,
35 | std=speaker_embedding_weight_std)
36 | self.n_speakers = n_speakers
37 | self.speaker_embed_dim = speaker_embed_dim
38 |
39 | def make_generation_fast_(self):
40 |
41 | def remove_weight_norm(m):
42 | try:
43 | nn.utils.remove_weight_norm(m)
44 | except ValueError: # this module didn't have weight norm
45 | return
46 | self.apply(remove_weight_norm)
47 |
48 | def get_trainable_parameters(self):
49 | freezed_param_ids = set()
50 |
51 | encoder, decoder = self.seq2seq.encoder, self.seq2seq.decoder
52 |
53 | # Avoid updating the position encoding
54 | if not self.trainable_positional_encodings:
55 | pe_query_param_ids = set(map(id, decoder.embed_query_positions.parameters()))
56 | pe_keys_param_ids = set(map(id, decoder.embed_keys_positions.parameters()))
57 | freezed_param_ids |= (pe_query_param_ids | pe_keys_param_ids)
58 | # Avoid updating the text embedding
59 | if self.freeze_embedding:
60 | embed_param_ids = set(map(id, encoder.embed_tokens.parameters()))
61 | freezed_param_ids |= embed_param_ids
62 |
63 | return (p for p in self.parameters() if id(p) not in freezed_param_ids)
64 |
65 | def forward(self, text_sequences, mel_targets=None, speaker_ids=None,
66 | text_positions=None, frame_positions=None, input_lengths=None):
67 | B = text_sequences.size(0)
68 |
69 | if speaker_ids is not None:
70 | assert self.n_speakers > 1
71 | speaker_embed = self.embed_speakers(speaker_ids)
72 | else:
73 | speaker_embed = None
74 |
75 | # Apply seq2seq
76 | # (B, T//r, mel_dim*r)
77 | mel_outputs, alignments, done, decoder_states = self.seq2seq(
78 | text_sequences, mel_targets, speaker_embed,
79 | text_positions, frame_positions, input_lengths)
80 |
81 | # Reshape
82 | # (B, T, mel_dim)
83 | mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
84 |
85 | # Prepare postnet inputs
86 | if self.use_decoder_state_for_postnet_input:
87 | postnet_inputs = decoder_states.view(B, mel_outputs.size(1), -1)
88 | else:
89 | postnet_inputs = mel_outputs
90 |
91 | # (B, T, linear_dim)
92 | # Convert coarse mel-spectrogram (or decoder hidden states) to
93 | # high resolution spectrogram
94 | linear_outputs = self.postnet(postnet_inputs, speaker_embed)
95 | assert linear_outputs.size(-1) == self.linear_dim
96 |
97 | return mel_outputs, linear_outputs, alignments, done
98 |
99 |
100 | class AttentionSeq2Seq(nn.Module):
101 | """Encoder + Decoder with attention
102 | """
103 |
104 | def __init__(self, encoder, decoder):
105 | super(AttentionSeq2Seq, self).__init__()
106 | self.encoder = encoder
107 | self.decoder = decoder
108 | if isinstance(self.decoder.attention, nn.ModuleList):
109 | self.encoder.num_attention_layers = sum(
110 | [layer is not None for layer in decoder.attention])
111 |
112 | def forward(self, text_sequences, mel_targets=None, speaker_embed=None,
113 | text_positions=None, frame_positions=None, input_lengths=None):
114 | # (B, T, text_embed_dim)
115 | encoder_outputs = self.encoder(
116 | text_sequences, lengths=input_lengths, speaker_embed=speaker_embed)
117 |
118 | # Mel: (B, T//r, mel_dim*r)
119 | # Alignments: (N, B, T_target, T_input)
120 | # Done: (B, T//r, 1)
121 | mel_outputs, alignments, done, decoder_states = self.decoder(
122 | encoder_outputs, mel_targets,
123 | text_positions=text_positions, frame_positions=frame_positions,
124 | speaker_embed=speaker_embed, lengths=input_lengths)
125 |
126 | return mel_outputs, alignments, done, decoder_states
127 |
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/__pycache__/builder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/builder.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/__pycache__/conv.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/conv.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/__pycache__/deepvoice3.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/deepvoice3.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/__pycache__/modules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/modules.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/__pycache__/version.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/version.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/builder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from dv3.deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
5 |
6 |
7 | def deepvoice3(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4,
8 | downsample_step=1,
9 | n_speakers=1, speaker_embed_dim=16, padding_idx=0,
10 | dropout=(1 - 0.95), kernel_size=5,
11 | encoder_channels=128,
12 | decoder_channels=256,
13 | converter_channels=256,
14 | query_position_rate=1.0,
15 | key_position_rate=1.29,
16 | use_memory_mask=False,
17 | trainable_positional_encodings=False,
18 | force_monotonic_attention=True,
19 | use_decoder_state_for_postnet_input=True,
20 | max_positions=512,
21 | embedding_weight_std=0.1,
22 | speaker_embedding_weight_std=0.01,
23 | freeze_embedding=False,
24 | window_ahead=3,
25 | window_backward=1,
26 | key_projection=False,
27 | value_projection=False,
28 | ):
29 | """Build deepvoice3
30 | """
31 | from dv3.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter
32 |
33 | time_upsampling = max(downsample_step // r, 1)
34 |
35 | # Seq2seq
36 | h = encoder_channels # hidden dim (channels)
37 | k = kernel_size # kernel size
38 | encoder = Encoder(
39 | n_vocab, embed_dim, padding_idx=padding_idx,
40 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
41 | dropout=dropout, max_positions=max_positions,
42 | embedding_weight_std=embedding_weight_std,
43 | # (channels, kernel_size, dilation)
44 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
45 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
46 | (h, k, 1), (h, k, 3)],
47 | )
48 |
49 | h = decoder_channels
50 | decoder = Decoder(
51 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx,
52 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
53 | dropout=dropout, max_positions=max_positions,
54 | preattention=[(h, k, 1), (h, k, 3)],
55 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
56 | (h, k, 1)],
57 | attention=[True, False, False, False, True],
58 | force_monotonic_attention=force_monotonic_attention,
59 | query_position_rate=query_position_rate,
60 | key_position_rate=key_position_rate,
61 | use_memory_mask=use_memory_mask,
62 | window_ahead=window_ahead,
63 | window_backward=window_backward,
64 | key_projection=key_projection,
65 | value_projection=value_projection,
66 | )
67 |
68 | seq2seq = AttentionSeq2Seq(encoder, decoder)
69 |
70 | # Post net
71 | if use_decoder_state_for_postnet_input:
72 | in_dim = h // r
73 | else:
74 | in_dim = mel_dim
75 | h = converter_channels
76 | converter = Converter(
77 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
78 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout,
79 | time_upsampling=time_upsampling,
80 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)],
81 | )
82 |
83 | # Seq2seq + post net
84 | model = MultiSpeakerTTSModel(
85 | seq2seq, converter, padding_idx=padding_idx,
86 | mel_dim=mel_dim, linear_dim=linear_dim,
87 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
88 | trainable_positional_encodings=trainable_positional_encodings,
89 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
90 | speaker_embedding_weight_std=speaker_embedding_weight_std,
91 | freeze_embedding=freeze_embedding)
92 |
93 | return model
94 |
95 |
96 | def nyanko(n_vocab, embed_dim=128, mel_dim=80, linear_dim=513, r=1,
97 | downsample_step=4,
98 | n_speakers=1, speaker_embed_dim=16, padding_idx=0,
99 | dropout=(1 - 0.95), kernel_size=3,
100 | encoder_channels=256,
101 | decoder_channels=256,
102 | converter_channels=512,
103 | query_position_rate=1.0,
104 | key_position_rate=1.29,
105 | use_memory_mask=False,
106 | trainable_positional_encodings=False,
107 | force_monotonic_attention=True,
108 | use_decoder_state_for_postnet_input=False,
109 | max_positions=512, embedding_weight_std=0.01,
110 | speaker_embedding_weight_std=0.01,
111 | freeze_embedding=False,
112 | window_ahead=3,
113 | window_backward=1,
114 | key_projection=False,
115 | value_projection=False,
116 | ):
117 | from dv3.deepvoice3_pytorch.nyanko import Encoder, Decoder, Converter
118 | assert encoder_channels == decoder_channels
119 |
120 | if n_speakers != 1:
121 | raise ValueError("Multi-speaker is not supported")
122 | if not (downsample_step == 4 and r == 1):
123 | raise ValueError("Not supported. You need to change hardcoded parameters")
124 |
125 | # Seq2seq
126 | encoder = Encoder(
127 | n_vocab, embed_dim, channels=encoder_channels, kernel_size=kernel_size,
128 | padding_idx=padding_idx,
129 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
130 | dropout=dropout, embedding_weight_std=embedding_weight_std,
131 | )
132 |
133 | decoder = Decoder(
134 | embed_dim, in_dim=mel_dim, r=r, channels=decoder_channels,
135 | kernel_size=kernel_size, padding_idx=padding_idx,
136 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
137 | dropout=dropout, max_positions=max_positions,
138 | force_monotonic_attention=force_monotonic_attention,
139 | query_position_rate=query_position_rate,
140 | key_position_rate=key_position_rate,
141 | use_memory_mask=use_memory_mask,
142 | window_ahead=window_ahead,
143 | window_backward=window_backward,
144 | key_projection=key_projection,
145 | value_projection=value_projection,
146 | )
147 |
148 | seq2seq = AttentionSeq2Seq(encoder, decoder)
149 |
150 | if use_decoder_state_for_postnet_input:
151 | in_dim = decoder_channels // r
152 | else:
153 | in_dim = mel_dim
154 |
155 | converter = Converter(
156 | in_dim=in_dim, out_dim=linear_dim, channels=converter_channels,
157 | kernel_size=kernel_size, dropout=dropout)
158 |
159 | # Seq2seq + post net
160 | model = MultiSpeakerTTSModel(
161 | seq2seq, converter, padding_idx=padding_idx,
162 | mel_dim=mel_dim, linear_dim=linear_dim,
163 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
164 | trainable_positional_encodings=trainable_positional_encodings,
165 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
166 | speaker_embedding_weight_std=speaker_embedding_weight_std,
167 | freeze_embedding=freeze_embedding)
168 |
169 | return model
170 |
171 |
172 | def deepvoice3_multispeaker(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4,
173 | downsample_step=1,
174 | n_speakers=1, speaker_embed_dim=16, padding_idx=0,
175 | dropout=(1 - 0.95), kernel_size=5,
176 | encoder_channels=128,
177 | decoder_channels=256,
178 | converter_channels=256,
179 | query_position_rate=1.0,
180 | key_position_rate=1.29,
181 | use_memory_mask=False,
182 | trainable_positional_encodings=False,
183 | force_monotonic_attention=True,
184 | use_decoder_state_for_postnet_input=True,
185 | max_positions=512,
186 | embedding_weight_std=0.1,
187 | speaker_embedding_weight_std=0.01,
188 | freeze_embedding=False,
189 | window_ahead=3,
190 | window_backward=1,
191 | key_projection=True,
192 | value_projection=True,
193 | ):
194 | """Build multi-speaker deepvoice3
195 | """
196 | from dv3.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter
197 |
198 | time_upsampling = max(downsample_step // r, 1)
199 |
200 | # Seq2seq
201 | h = encoder_channels # hidden dim (channels)
202 | k = kernel_size # kernel size
203 | encoder = Encoder(
204 | n_vocab, embed_dim, padding_idx=padding_idx,
205 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
206 | dropout=dropout, max_positions=max_positions,
207 | embedding_weight_std=embedding_weight_std,
208 | # (channels, kernel_size, dilation)
209 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
210 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
211 | (h, k, 1), (h, k, 3)],
212 | )
213 |
214 | h = decoder_channels
215 | decoder = Decoder(
216 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx,
217 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
218 | dropout=dropout, max_positions=max_positions,
219 | preattention=[(h, k, 1)],
220 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27),
221 | (h, k, 1)],
222 | attention=[True, False, False, False, False],
223 | force_monotonic_attention=force_monotonic_attention,
224 | query_position_rate=query_position_rate,
225 | key_position_rate=key_position_rate,
226 | use_memory_mask=use_memory_mask,
227 | window_ahead=window_ahead,
228 | window_backward=window_backward,
229 | key_projection=key_projection,
230 | value_projection=value_projection,
231 | )
232 |
233 | seq2seq = AttentionSeq2Seq(encoder, decoder)
234 |
235 | # Post net
236 | if use_decoder_state_for_postnet_input:
237 | in_dim = h // r
238 | else:
239 | in_dim = mel_dim
240 | h = converter_channels
241 | converter = Converter(
242 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
243 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout,
244 | time_upsampling=time_upsampling,
245 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)],
246 | )
247 |
248 | # Seq2seq + post net
249 | model = MultiSpeakerTTSModel(
250 | seq2seq, converter, padding_idx=padding_idx,
251 | mel_dim=mel_dim, linear_dim=linear_dim,
252 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
253 | trainable_positional_encodings=trainable_positional_encodings,
254 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
255 | speaker_embedding_weight_std=speaker_embedding_weight_std,
256 | freeze_embedding=freeze_embedding)
257 |
258 | return model
259 |
--------------------------------------------------------------------------------
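The factory functions above build the complete seq2seq + converter graph; dv3/__init__.py sets hparams.builder = "deepvoice3_multispeaker" before calling build_model(). A direct-construction sketch; n_vocab and n_speakers below are placeholder values, since in the repo they come from the text frontend and the VCTK preset:

```python
from dv3.deepvoice3_pytorch.builder import deepvoice3_multispeaker

model = deepvoice3_multispeaker(n_vocab=150, n_speakers=108, speaker_embed_dim=16)
print(sum(p.numel() for p in model.parameters()))   # total parameter count
```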
/dv3/deepvoice3_pytorch/conv.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import torch
3 | from torch import nn
4 | from torch.autograd import Variable
5 | from torch.nn import functional as F
6 |
7 |
8 | class Conv1d(nn.Conv1d):
9 | """Extended nn.Conv1d for incremental dilated convolutions
10 |
11 |     currently limited to odd kernel sizes
12 | """
13 |
14 | def __init__(self, *args, **kwargs):
15 | super().__init__(*args, **kwargs)
16 | self.clear_buffer()
17 | self._linearized_weight = None
18 | self.register_backward_hook(self._clear_linearized_weight)
19 |
20 | def incremental_forward(self, input):
21 | # input: (B, T, C)
22 | if self.training:
23 | raise RuntimeError('incremental_forward only supports eval mode')
24 |
25 | # run forward pre hooks (e.g., weight norm)
26 | for hook in self._forward_pre_hooks.values():
27 | hook(self, input)
28 |
29 | # reshape weight
30 | weight = self._get_linearized_weight()
31 | kw = self.kernel_size[0]
32 | dilation = self.dilation[0]
33 |
34 | bsz = input.size(0) # input: bsz x len x dim
35 | if kw > 1:
36 | assert kw % 2 == 1
37 | input = input.data
38 | if self.input_buffer is None:
39 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2))
40 | self.input_buffer.zero_()
41 | else:
42 | # shift buffer
43 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone()
44 | # append next input
45 | self.input_buffer[:, -1, :] = input[:, -1, :]
46 | input = torch.autograd.Variable(self.input_buffer, volatile=True)
47 | if dilation > 1:
48 | input = input[:, 0::dilation, :].contiguous()
49 | output = F.linear(input.view(bsz, -1), weight, self.bias)
50 | return output.view(bsz, 1, -1)
51 |
52 | def clear_buffer(self):
53 | self.input_buffer = None
54 |
55 | def _get_linearized_weight(self):
56 | if self._linearized_weight is None:
57 | kw = self.kernel_size[0]
58 | # nn.Conv1d
59 | if self.weight.size() == (self.out_channels, self.in_channels, kw):
60 | weight = self.weight.transpose(1, 2).contiguous()
61 | else:
62 | # fairseq.modules.conv_tbc.ConvTBC
63 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous()
64 | assert weight.size() == (self.out_channels, kw, self.in_channels)
65 | self._linearized_weight = weight.view(self.out_channels, -1)
66 | return self._linearized_weight
67 |
68 | def _clear_linearized_weight(self, *args):
69 | self._linearized_weight = None
70 |
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/deepvoice3.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 | from torch.autograd import Variable
7 | import math
8 | import numpy as np
9 |
10 | from .modules import Conv1d, ConvTranspose1d, Embedding, Linear, GradMultiply
11 | from .modules import get_mask_from_lengths, SinusoidalEncoding, Conv1dGLU
12 |
13 |
14 | def expand_speaker_embed(inputs_btc, speaker_embed=None, tdim=1):
15 | if speaker_embed is None:
16 | return None
17 | # expand speaker embedding for all time steps
18 | # (B, N) -> (B, T, N)
19 | ss = speaker_embed.size()
20 | speaker_embed_btc = speaker_embed.unsqueeze(1).expand(
21 | ss[0], inputs_btc.size(tdim), ss[-1])
22 | return speaker_embed_btc
23 |
24 |
25 | class Encoder(nn.Module):
26 | def __init__(self, n_vocab, embed_dim, n_speakers, speaker_embed_dim,
27 | padding_idx=None, embedding_weight_std=0.1,
28 | convolutions=((64, 5, .1),) * 7,
29 | max_positions=512, dropout=0.1, apply_grad_scaling=False):
30 | super(Encoder, self).__init__()
31 | self.dropout = dropout
32 | self.num_attention_layers = None
33 | self.apply_grad_scaling = apply_grad_scaling
34 |
35 | # Text input embeddings
36 | self.embed_tokens = Embedding(
37 | n_vocab, embed_dim, padding_idx, embedding_weight_std)
38 |
39 | # Speaker embedding
40 | if n_speakers > 1:
41 | self.speaker_fc1 = Linear(speaker_embed_dim, embed_dim, dropout=dropout)
42 | self.speaker_fc2 = Linear(speaker_embed_dim, embed_dim, dropout=dropout)
43 | self.n_speakers = n_speakers
44 |
45 |         # Non-causal convolution blocks
46 | in_channels = embed_dim
47 | self.convolutions = nn.ModuleList()
48 | std_mul = 1.0
49 | for (out_channels, kernel_size, dilation) in convolutions:
50 | if in_channels != out_channels:
51 | # Conv1d + ReLU
52 | self.convolutions.append(
53 | Conv1d(in_channels, out_channels, kernel_size=1, padding=0,
54 | dilation=1, std_mul=std_mul))
55 | self.convolutions.append(nn.ReLU(inplace=True))
56 | in_channels = out_channels
57 | std_mul = 2.0
58 | self.convolutions.append(
59 | Conv1dGLU(n_speakers, speaker_embed_dim,
60 | in_channels, out_channels, kernel_size, causal=False,
61 | dilation=dilation, dropout=dropout, std_mul=std_mul,
62 | residual=True))
63 | in_channels = out_channels
64 | std_mul = 4.0
65 | # Last 1x1 convolution
66 | self.convolutions.append(Conv1d(in_channels, embed_dim, kernel_size=1,
67 | padding=0, dilation=1, std_mul=std_mul,
68 | dropout=dropout))
69 |
70 | def forward(self, text_sequences, text_positions=None, lengths=None,
71 | speaker_embed=None):
72 | assert self.n_speakers == 1 or speaker_embed is not None
73 |
74 | # embed text_sequences
75 | x = self.embed_tokens(text_sequences)
76 | x = F.dropout(x, p=self.dropout, training=self.training)
77 |
78 | # expand speaker embedding for all time steps
79 | speaker_embed_btc = expand_speaker_embed(x, speaker_embed)
80 | if speaker_embed_btc is not None:
81 | speaker_embed_btc = F.dropout(speaker_embed_btc, p=self.dropout, training=self.training)
82 | x = x + F.softsign(self.speaker_fc1(speaker_embed_btc))
83 |
84 | input_embedding = x
85 |
86 | # B x T x C -> B x C x T
87 | x = x.transpose(1, 2)
88 |
89 | # 1D conv blocks
90 | for f in self.convolutions:
91 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x)
92 |
93 | # Back to B x T x C
94 | keys = x.transpose(1, 2)
95 |
96 | if speaker_embed_btc is not None:
97 | keys = keys + F.softsign(self.speaker_fc2(speaker_embed_btc))
98 |
99 | # scale gradients (this only affects backward, not forward)
100 | if self.apply_grad_scaling and self.num_attention_layers is not None:
101 | keys = GradMultiply.apply(keys, 1.0 / (2.0 * self.num_attention_layers))
102 |
103 | # add output to input embedding for attention
104 | values = (keys + input_embedding) * math.sqrt(0.5)
105 |
106 | return keys, values
107 |
108 |
109 | class AttentionLayer(nn.Module):
110 | def __init__(self, conv_channels, embed_dim, dropout=0.1,
111 | window_ahead=3, window_backward=1,
112 | key_projection=True, value_projection=True):
113 | super(AttentionLayer, self).__init__()
114 | self.query_projection = Linear(conv_channels, embed_dim)
115 | if key_projection:
116 | self.key_projection = Linear(embed_dim, embed_dim)
117 |             # According to the DeepVoice3 paper, initialize weights to the same values
118 | # TODO: Does this really work well? not sure..
119 | if conv_channels == embed_dim:
120 | self.key_projection.weight.data = self.query_projection.weight.data.clone()
121 | else:
122 | self.key_projection = None
123 | if value_projection:
124 | self.value_projection = Linear(embed_dim, embed_dim)
125 | else:
126 | self.value_projection = None
127 |
128 | self.out_projection = Linear(embed_dim, conv_channels)
129 | self.dropout = dropout
130 | self.window_ahead = window_ahead
131 | self.window_backward = window_backward
132 |
133 | def forward(self, query, encoder_out, mask=None, last_attended=None):
134 | keys, values = encoder_out
135 | residual = query
136 | if self.value_projection is not None:
137 | values = self.value_projection(values)
138 | # TODO: yes, this is inefficient
139 | if self.key_projection is not None:
140 | keys = self.key_projection(keys.transpose(1, 2)).transpose(1, 2)
141 |
142 | # attention
143 | x = self.query_projection(query)
144 | x = torch.bmm(x, keys)
145 |
146 | mask_value = -float("inf")
147 | if mask is not None:
148 | mask = mask.view(query.size(0), 1, -1)
149 | x.data.masked_fill_(mask, mask_value)
150 |
151 | if last_attended is not None:
152 | backward = last_attended - self.window_backward
153 | if backward > 0:
154 | x[:, :, :backward] = mask_value
155 | ahead = last_attended + self.window_ahead
156 | if ahead < x.size(-1):
157 | x[:, :, ahead:] = mask_value
158 |
159 | # softmax over last dim
160 | # (B, tgt_len, src_len)
161 | sz = x.size()
162 | x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
163 | x = x.view(sz)
164 | attn_scores = x
165 |
166 | x = F.dropout(x, p=self.dropout, training=self.training)
167 |
168 | x = torch.bmm(x, values)
169 |
170 | # scale attention output
171 | s = values.size(1)
172 | x = x * (s * math.sqrt(1.0 / s))
173 |
174 | # project back
175 | x = self.out_projection(x)
176 | x = (x + residual) * math.sqrt(0.5)
177 | return x, attn_scores
178 |
179 |
180 | class Decoder(nn.Module):
181 | def __init__(self, embed_dim, n_speakers, speaker_embed_dim,
182 | in_dim=80, r=5,
183 | max_positions=512, padding_idx=None,
184 | preattention=((128, 5, 1),) * 4,
185 | convolutions=((128, 5, 1),) * 4,
186 | attention=True, dropout=0.1,
187 | use_memory_mask=False,
188 | force_monotonic_attention=False,
189 | query_position_rate=1.0,
190 | key_position_rate=1.29,
191 | window_ahead=3,
192 | window_backward=1,
193 | key_projection=True,
194 | value_projection=True,
195 | ):
196 | super(Decoder, self).__init__()
197 | self.dropout = dropout
198 | self.in_dim = in_dim
199 | self.r = r
200 | self.query_position_rate = query_position_rate
201 | self.key_position_rate = key_position_rate
202 |
203 | in_channels = in_dim * r
204 | if isinstance(attention, bool):
205 | # expand True into [True, True, ...] and do the same with False
206 | attention = [attention] * len(convolutions)
207 |
208 | # Position encodings for query (decoder states) and keys (encoder states)
209 | self.embed_query_positions = SinusoidalEncoding(
210 | max_positions, convolutions[0][0], padding_idx)
211 | self.embed_keys_positions = SinusoidalEncoding(
212 | max_positions, embed_dim, padding_idx)
213 | # Used for compute multiplier for positional encodings
214 | if n_speakers > 1:
215 | self.speaker_proj1 = Linear(speaker_embed_dim, 1, dropout=dropout)
216 | self.speaker_proj2 = Linear(speaker_embed_dim, 1, dropout=dropout)
217 | else:
218 | self.speaker_proj1, self.speaker_proj2 = None, None
219 |
220 | # Prenet: causal convolution blocks
221 | self.preattention = nn.ModuleList()
222 | in_channels = in_dim * r
223 | std_mul = 1.0
224 | for out_channels, kernel_size, dilation in preattention:
225 | if in_channels != out_channels:
226 | # Conv1d + ReLU
227 | self.preattention.append(
228 | Conv1d(in_channels, out_channels, kernel_size=1, padding=0,
229 | dilation=1, std_mul=std_mul))
230 | self.preattention.append(nn.ReLU(inplace=True))
231 | in_channels = out_channels
232 | std_mul = 2.0
233 | self.preattention.append(
234 | Conv1dGLU(n_speakers, speaker_embed_dim,
235 | in_channels, out_channels, kernel_size, causal=True,
236 | dilation=dilation, dropout=dropout, std_mul=std_mul,
237 | residual=True))
238 | in_channels = out_channels
239 | std_mul = 4.0
240 |
241 | # Causal convolution blocks + attention layers
242 | self.convolutions = nn.ModuleList()
243 | self.attention = nn.ModuleList()
244 |
245 | for i, (out_channels, kernel_size, dilation) in enumerate(convolutions):
246 | assert in_channels == out_channels
247 | self.convolutions.append(
248 | Conv1dGLU(n_speakers, speaker_embed_dim,
249 | in_channels, out_channels, kernel_size, causal=True,
250 | dilation=dilation, dropout=dropout, std_mul=std_mul,
251 | residual=False))
252 | self.attention.append(
253 | AttentionLayer(out_channels, embed_dim,
254 | dropout=dropout,
255 | window_ahead=window_ahead,
256 | window_backward=window_backward,
257 | key_projection=key_projection,
258 | value_projection=value_projection)
259 | if attention[i] else None)
260 | in_channels = out_channels
261 | std_mul = 4.0
262 | # Last 1x1 convolution
263 | self.last_conv = Conv1d(in_channels, in_dim * r, kernel_size=1,
264 | padding=0, dilation=1, std_mul=std_mul,
265 | dropout=dropout)
266 |
267 | # Mel-spectrogram (before sigmoid) -> Done binary flag
268 | self.fc = Linear(in_dim * r, 1)
269 |
270 | self.max_decoder_steps = 200
271 | self.min_decoder_steps = 10
272 | self.use_memory_mask = use_memory_mask
273 | if isinstance(force_monotonic_attention, bool):
274 | self.force_monotonic_attention = [force_monotonic_attention] * len(convolutions)
275 | else:
276 | self.force_monotonic_attention = force_monotonic_attention
277 |
278 | def forward(self, encoder_out, inputs=None,
279 | text_positions=None, frame_positions=None,
280 | speaker_embed=None, lengths=None):
281 | if inputs is None:
282 | assert text_positions is not None
283 | self.start_fresh_sequence()
284 | outputs = self.incremental_forward(encoder_out, text_positions, speaker_embed)
285 | return outputs
286 |
287 | # Grouping multiple frames if necessary
288 | if inputs.size(-1) == self.in_dim:
289 | inputs = inputs.view(inputs.size(0), inputs.size(1) // self.r, -1)
290 | assert inputs.size(-1) == self.in_dim * self.r
291 |
292 | # expand speaker embedding for all time steps
293 | speaker_embed_btc = expand_speaker_embed(inputs, speaker_embed)
294 | if speaker_embed_btc is not None:
295 | speaker_embed_btc = F.dropout(speaker_embed_btc, p=self.dropout, training=self.training)
296 |
297 | keys, values = encoder_out
298 |
299 | if self.use_memory_mask and lengths is not None:
300 | mask = get_mask_from_lengths(keys, lengths)
301 | else:
302 | mask = None
303 |
304 | # position encodings
305 | if text_positions is not None:
306 | w = self.key_position_rate
307 | # TODO: may be useful to have projection per attention layer
308 | if self.speaker_proj1 is not None:
309 | w = w * F.sigmoid(self.speaker_proj1(speaker_embed)).view(-1)
310 | text_pos_embed = self.embed_keys_positions(text_positions, w)
311 | keys = keys + text_pos_embed
312 | if frame_positions is not None:
313 | w = self.query_position_rate
314 | if self.speaker_proj2 is not None:
315 | w = w * F.sigmoid(self.speaker_proj2(speaker_embed)).view(-1)
316 | frame_pos_embed = self.embed_query_positions(frame_positions, w)
317 |
318 | # transpose only once to speed up attention layers
319 | keys = keys.transpose(1, 2).contiguous()
320 |
321 | x = inputs
322 | x = F.dropout(x, p=self.dropout, training=self.training)
323 |
324 | # Generic case: B x T x C -> B x C x T
325 | x = x.transpose(1, 2)
326 |
327 | # Prenet
328 | for f in self.preattention:
329 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x)
330 |
331 |         # Causal convolutions + Multi-hop attentions
332 | alignments = []
333 | for f, attention in zip(self.convolutions, self.attention):
334 | residual = x
335 |
336 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x)
337 |
338 | # Feed conv output to attention layer as query
339 | if attention is not None:
340 | assert isinstance(f, Conv1dGLU)
341 | # (B x T x C)
342 | x = x.transpose(1, 2)
343 | x = x if frame_positions is None else x + frame_pos_embed
344 | x, alignment = attention(x, (keys, values), mask=mask)
345 | # (T x B x C)
346 | x = x.transpose(1, 2)
347 | alignments += [alignment]
348 |
349 | if isinstance(f, Conv1dGLU):
350 | x = (x + residual) * math.sqrt(0.5)
351 |
352 | # decoder state (B x T x C):
353 |         # internal representation before being compressed to the output dimension
354 | decoder_states = x.transpose(1, 2).contiguous()
355 | x = self.last_conv(x)
356 |
357 | # Back to B x T x C
358 | x = x.transpose(1, 2)
359 |
360 |         # project to mel-spectrogram
361 | outputs = F.sigmoid(x)
362 |
363 | # Done flag
364 | done = F.sigmoid(self.fc(x))
365 |
366 | return outputs, torch.stack(alignments), done, decoder_states
367 |
368 | def incremental_forward(self, encoder_out, text_positions, speaker_embed=None,
369 | initial_input=None, test_inputs=None):
370 | keys, values = encoder_out
371 | B = keys.size(0)
372 |
373 | # position encodings
374 | w = self.key_position_rate
375 | # TODO: may be useful to have projection per attention layer
376 | if self.speaker_proj1 is not None:
377 | w = w * F.sigmoid(self.speaker_proj1(speaker_embed)).view(-1)
378 | text_pos_embed = self.embed_keys_positions(text_positions, w)
379 | keys = keys + text_pos_embed
380 |
381 | # transpose only once to speed up attention layers
382 | keys = keys.transpose(1, 2).contiguous()
383 |
384 | decoder_states = []
385 | outputs = []
386 | alignments = []
387 | dones = []
388 |         # initially set to zeros
389 | last_attended = [None] * len(self.attention)
390 | for idx, v in enumerate(self.force_monotonic_attention):
391 | last_attended[idx] = 0 if v else None
392 |
393 | num_attention_layers = sum([layer is not None for layer in self.attention])
394 | t = 0
395 | if initial_input is None:
396 | initial_input = Variable(
397 | keys.data.new(B, 1, self.in_dim * self.r).zero_())
398 | current_input = initial_input
399 | while True:
400 | # frame positions start at 1.
401 | frame_pos = Variable(keys.data.new(B, 1).fill_(t + 1)).long()
402 | w = self.query_position_rate
403 | if self.speaker_proj2 is not None:
404 | w = w * F.sigmoid(self.speaker_proj2(speaker_embed)).view(-1)
405 | frame_pos_embed = self.embed_query_positions(frame_pos, w)
406 |
407 | if test_inputs is not None:
408 | if t >= test_inputs.size(1):
409 | break
410 | current_input = test_inputs[:, t, :].unsqueeze(1)
411 | else:
412 | if t > 0:
413 | current_input = outputs[-1]
414 | x = current_input
415 | x = F.dropout(x, p=self.dropout, training=self.training)
416 |
417 | # Prenet
418 | for f in self.preattention:
419 | if isinstance(f, Conv1dGLU):
420 | x = f.incremental_forward(x, speaker_embed)
421 | else:
422 | try:
423 | x = f.incremental_forward(x)
424 | except AttributeError as e:
425 | x = f(x)
426 |
427 | # Causal convolutions + multi-hop attention
428 | ave_alignment = None
429 | for idx, (f, attention) in enumerate(zip(self.convolutions,
430 | self.attention)):
431 | residual = x
432 | if isinstance(f, Conv1dGLU):
433 | x = f.incremental_forward(x, speaker_embed)
434 | else:
435 | try:
436 | x = f.incremental_forward(x)
437 | except AttributeError as e:
438 | x = f(x)
439 |
440 | # attention
441 | if attention is not None:
442 | assert isinstance(f, Conv1dGLU)
443 | x = x + frame_pos_embed
444 | x, alignment = attention(x, (keys, values),
445 | last_attended=last_attended[idx])
446 | if self.force_monotonic_attention[idx]:
447 | last_attended[idx] = alignment.max(-1)[1].view(-1).data[0]
448 | if ave_alignment is None:
449 | ave_alignment = alignment
450 | else:
451 | ave_alignment = ave_alignment + alignment  # accumulate across attention layers
452 |
453 | # residual
454 | if isinstance(f, Conv1dGLU):
455 | x = (x + residual) * math.sqrt(0.5)
456 |
457 | decoder_state = x
458 | x = self.last_conv.incremental_forward(x)
459 | ave_alignment = ave_alignment.div_(num_attention_layers)
460 |
461 | # Output & done flag predictions
462 | output = F.sigmoid(x)
463 | done = F.sigmoid(self.fc(x))
464 |
465 | decoder_states += [decoder_state]
466 | outputs += [output]
467 | alignments += [ave_alignment]
468 | dones += [done]
469 |
470 | t += 1
471 | if test_inputs is None:
472 | if (done > 0.5).all() and t > self.min_decoder_steps:
473 | break
474 | elif t > self.max_decoder_steps:
475 | break
476 |
477 | # Remove 1-element time axis
478 | alignments = list(map(lambda x: x.squeeze(1), alignments))
479 | decoder_states = list(map(lambda x: x.squeeze(1), decoder_states))
480 | outputs = list(map(lambda x: x.squeeze(1), outputs))
481 |
482 | # Combine outputs for all time steps
483 | alignments = torch.stack(alignments).transpose(0, 1)
484 | decoder_states = torch.stack(decoder_states).transpose(0, 1).contiguous()
485 | outputs = torch.stack(outputs).transpose(0, 1).contiguous()
486 |
487 | return outputs, alignments, dones, decoder_states
488 |
489 | def start_fresh_sequence(self):
490 | for conv in self.convolutions:
491 | conv.clear_buffer()
492 |
493 |
494 | class Converter(nn.Module):
495 | def __init__(self, n_speakers, speaker_embed_dim,
496 | in_dim, out_dim, convolutions=((256, 5, 1),) * 4,
497 | time_upsampling=1,
498 | dropout=0.1):
499 | super(Converter, self).__init__()
500 | self.dropout = dropout
501 | self.in_dim = in_dim
502 | self.out_dim = out_dim
503 | self.n_speakers = n_speakers
504 |
505 | # Non-causal convolution blocks
506 | in_channels = convolutions[0][0]
507 | # Idea from nyanko
508 | if time_upsampling == 4:
509 | self.convolutions = nn.ModuleList([
510 | Conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1,
511 | std_mul=1.0),
512 | ConvTranspose1d(in_channels, in_channels, kernel_size=2,
513 | padding=0, stride=2, std_mul=1.0),
514 | Conv1dGLU(n_speakers, speaker_embed_dim,
515 | in_channels, in_channels, kernel_size=3, causal=False,
516 | dilation=1, dropout=dropout, std_mul=1.0, residual=True),
517 | Conv1dGLU(n_speakers, speaker_embed_dim,
518 | in_channels, in_channels, kernel_size=3, causal=False,
519 | dilation=3, dropout=dropout, std_mul=4.0, residual=True),
520 | ConvTranspose1d(in_channels, in_channels, kernel_size=2,
521 | padding=0, stride=2, std_mul=4.0),
522 | Conv1dGLU(n_speakers, speaker_embed_dim,
523 | in_channels, in_channels, kernel_size=3, causal=False,
524 | dilation=1, dropout=dropout, std_mul=1.0, residual=True),
525 | Conv1dGLU(n_speakers, speaker_embed_dim,
526 | in_channels, in_channels, kernel_size=3, causal=False,
527 | dilation=3, dropout=dropout, std_mul=4.0, residual=True),
528 | ])
529 | elif time_upsampling == 2:
530 | self.convolutions = nn.ModuleList([
531 | Conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1,
532 | std_mul=1.0),
533 | ConvTranspose1d(in_channels, in_channels, kernel_size=2,
534 | padding=0, stride=2, std_mul=1.0),
535 | Conv1dGLU(n_speakers, speaker_embed_dim,
536 | in_channels, in_channels, kernel_size=3, causal=False,
537 | dilation=1, dropout=dropout, std_mul=1.0, residual=True),
538 | Conv1dGLU(n_speakers, speaker_embed_dim,
539 | in_channels, in_channels, kernel_size=3, causal=False,
540 | dilation=3, dropout=dropout, std_mul=4.0, residual=True),
541 | ])
542 | elif time_upsampling == 1:
543 | self.convolutions = nn.ModuleList([
544 | # 1x1 convolution first
545 | Conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1,
546 | std_mul=1.0),
547 | Conv1dGLU(n_speakers, speaker_embed_dim,
548 | in_channels, in_channels, kernel_size=3, causal=False,
549 | dilation=3, dropout=dropout, std_mul=4.0, residual=True),
550 | ])
551 | else:
552 | raise ValueError("Not supported")
553 |
554 | std_mul = 4.0
555 | for (out_channels, kernel_size, dilation) in convolutions:
556 | if in_channels != out_channels:
557 | self.convolutions.append(
558 | Conv1d(in_channels, out_channels, kernel_size=1, padding=0,
559 | dilation=1, std_mul=std_mul))
560 | self.convolutions.append(nn.ReLU(inplace=True))
561 | in_channels = out_channels
562 | std_mul = 2.0
563 | self.convolutions.append(
564 | Conv1dGLU(n_speakers, speaker_embed_dim,
565 | in_channels, out_channels, kernel_size, causal=False,
566 | dilation=dilation, dropout=dropout, std_mul=std_mul,
567 | residual=True))
568 | in_channels = out_channels
569 | std_mul = 4.0
570 | # Last 1x1 convolution
571 | self.convolutions.append(Conv1d(in_channels, out_dim, kernel_size=1,
572 | padding=0, dilation=1, std_mul=std_mul,
573 | dropout=dropout))
574 |
575 | def forward(self, x, speaker_embed=None):
576 | assert self.n_speakers == 1 or speaker_embed is not None
577 |
578 | # expand speaker embedding for all time steps
579 | speaker_embed_btc = expand_speaker_embed(x, speaker_embed)
580 | if speaker_embed_btc is not None:
581 | speaker_embed_btc = F.dropout(speaker_embed_btc, p=self.dropout, training=self.training)
582 |
583 | # Generic case: B x T x C -> B x C x T
584 | x = x.transpose(1, 2)
585 |
586 | for f in self.convolutions:
587 | # Case for upsampling
588 | if speaker_embed_btc is not None and speaker_embed_btc.size(1) != x.size(-1):
589 | speaker_embed_btc = expand_speaker_embed(x, speaker_embed, tdim=-1)
590 | speaker_embed_btc = F.dropout(
591 | speaker_embed_btc, p=self.dropout, training=self.training)
592 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x)
593 |
594 | # Back to B x T x C
595 | x = x.transpose(1, 2)
596 |
597 | return F.sigmoid(x)
598 |
--------------------------------------------------------------------------------
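Both the decoder and the converter above begin by expanding the per-utterance speaker embedding to every time step (the expand_speaker_embed(...) calls) before applying dropout. That helper is defined elsewhere in this file; the snippet below is only an illustrative sketch of the broadcasting it performs, not the repository's implementation.

import torch

def expand_speaker_embed_sketch(inputs, speaker_embed, tdim=1):
    # Illustrative only: broadcast a (B, embed_dim) speaker embedding
    # across the time axis of inputs, giving (B, T, embed_dim).
    if speaker_embed is None:
        return None
    T = inputs.size(tdim)
    return speaker_embed.unsqueeze(1).expand(-1, T, -1)

x = torch.randn(2, 50, 256)   # (B, T, C) decoder inputs
s = torch.randn(2, 16)        # (B, speaker_embed_dim)
print(expand_speaker_embed_sketch(x, s).shape)   # torch.Size([2, 50, 16])
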
/dv3/deepvoice3_pytorch/frontend/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | """Text processing frontend
4 |
5 | All frontend module should have the following functions:
6 |
7 | - text_to_sequence(text, p)
8 | - sequence_to_text(sequence)
9 |
10 | and the property:
11 |
12 | - n_vocab
13 |
14 | """
15 | from dv3.deepvoice3_pytorch.frontend import en
16 |
17 | # optional Japanese frontend
18 | try:
19 | from dv3.deepvoice3_pytorch.frontend import jp
20 | except ImportError:
21 | jp = None
22 |
--------------------------------------------------------------------------------
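The docstring above defines a duck-typed frontend interface (text_to_sequence, sequence_to_text, n_vocab), so a caller can pick a frontend module by name. A minimal sketch, assuming the package is importable:

from dv3.deepvoice3_pytorch import frontend

_frontend = getattr(frontend, "en")   # "jp" is only available if MeCab/jaconv are installed
assert hasattr(_frontend, "text_to_sequence")
assert hasattr(_frontend, "sequence_to_text")
print(_frontend.n_vocab)              # vocabulary size exposed by the selected frontend
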
/dv3/deepvoice3_pytorch/frontend/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/en/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from dv3.deepvoice3_pytorch.frontend.text.symbols import symbols
3 |
4 | import nltk
5 | from random import random
6 |
7 | n_vocab = len(symbols)
8 |
9 | _arphabet = nltk.corpus.cmudict.dict()
10 |
11 |
12 | def _maybe_get_arpabet(word, p):
13 | try:
14 | phonemes = _arphabet[word][0]
15 | phonemes = " ".join(phonemes)
16 | except KeyError:
17 | return word
18 |
19 | return '{%s}' % phonemes if random() < p else word
20 |
21 |
22 | def mix_pronunciation(text, p):
23 | text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' '))
24 | return text
25 |
26 |
27 | def text_to_sequence(text, p=0.0):
28 | if p >= 0:
29 | text = mix_pronunciation(text, p)
30 | from dv3.deepvoice3_pytorch.frontend.text import text_to_sequence
31 | text = text_to_sequence(text, ["english_cleaners"])
32 | return text
33 |
34 |
35 | from dv3.deepvoice3_pytorch.frontend.text import sequence_to_text
36 |
--------------------------------------------------------------------------------
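A small usage sketch of the English frontend. It assumes NLTK's cmudict corpus has been downloaded before the module is imported (the ARPAbet dictionary is loaded at import time); with p > 0, words may be replaced by their brace-enclosed ARPAbet pronunciation before being mapped to symbol IDs.

import nltk
nltk.download("cmudict", quiet=True)   # must happen before importing the frontend

from dv3.deepvoice3_pytorch.frontend import en

seq = en.text_to_sequence("Hello world.", p=0.5)   # list of integer symbol IDs
print(seq)
print(en.sequence_to_text(seq))   # e.g. "hello world.~" or "{HH AH0 L OW1} world.~"
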
/dv3/deepvoice3_pytorch/frontend/en/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/en/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/jp/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 |
4 | import MeCab
5 | import jaconv
6 | from random import random
7 |
8 | n_vocab = 0xffff
9 |
10 | _eos = 1
11 | _pad = 0
12 | _tagger = None
13 |
14 |
15 | def _yomi(mecab_result):
16 | tokens = []
17 | yomis = []
18 | for line in mecab_result.split("\n")[:-1]:
19 | s = line.split("\t")
20 | if len(s) == 1:
21 | break
22 | token, rest = s
23 | rest = rest.split(",")
24 | tokens.append(token)
25 | yomi = rest[7] if len(rest) > 7 else None
26 | yomi = None if yomi == "*" else yomi
27 | yomis.append(yomi)
28 |
29 | return tokens, yomis
30 |
31 |
32 | def _mix_pronunciation(tokens, yomis, p):
33 | return "".join(
34 | yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx]
35 | for idx in range(len(tokens)))
36 |
37 |
38 | def mix_pronunciation(text, p):
39 | global _tagger
40 | if _tagger is None:
41 | _tagger = MeCab.Tagger("")
42 | tokens, yomis = _yomi(_tagger.parse(text))
43 | return _mix_pronunciation(tokens, yomis, p)
44 |
45 |
46 | def add_punctuation(text):
47 | last = text[-1]
48 | if last not in [".", ",", "、", "。", "!", "?", "!", "?"]:
49 | text = text + "。"
50 | return text
51 |
52 |
53 | def normalize_delimitor(text):
54 | text = text.replace(",", "、")
55 | text = text.replace(".", "。")
56 | text = text.replace(",", "、")
57 | text = text.replace(".", "。")
58 | return text
59 |
60 |
61 | def text_to_sequence(text, p=0.0):
62 | for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】",
63 | "(", ")", "(", ")"]:
64 | text = text.replace(c, "")
65 | text = text.replace("!", "!")
66 | text = text.replace("?", "?")
67 |
68 | text = normalize_delimitor(text)
69 | text = jaconv.normalize(text)
70 | if p > 0:
71 | text = mix_pronunciation(text, p)
72 | text = jaconv.hira2kata(text)
73 | text = add_punctuation(text)
74 |
75 | return [ord(c) for c in text] + [_eos] # EOS
76 |
77 |
78 | def sequence_to_text(seq):
79 | return "".join(chr(n) for n in seq)
80 |
--------------------------------------------------------------------------------
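The Japanese frontend simply maps each normalized character to its Unicode code point, with 1 as the EOS id. A minimal round-trip sketch, assuming MeCab and jaconv are installed (MeCab is imported at module load but only invoked when p > 0):

from dv3.deepvoice3_pytorch.frontend import jp

seq = jp.text_to_sequence("こんにちは", p=0.0)
print(seq[-1])                   # 1 (EOS)
print(jp.sequence_to_text(seq))  # the normalized text (with "。" appended) followed by chr(1)
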
/dv3/deepvoice3_pytorch/frontend/jp/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/jp/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/text/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | from dv3.deepvoice3_pytorch.frontend.text import cleaners
3 | from dv3.deepvoice3_pytorch.frontend.text.symbols import symbols
4 |
5 |
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 |
10 | # Regular expression matching text enclosed in curly braces:
11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
12 |
13 |
14 | def text_to_sequence(text, cleaner_names):
15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16 |
17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded
18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
19 |
20 | Args:
21 | text: string to convert to a sequence
22 | cleaner_names: names of the cleaner functions to run the text through
23 |
24 | Returns:
25 | List of integers corresponding to the symbols in the text
26 | '''
27 | sequence = []
28 |
29 | # Check for curly braces and treat their contents as ARPAbet:
30 | while len(text):
31 | m = _curly_re.match(text)
32 | if not m:
33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
34 | break
35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
36 | sequence += _arpabet_to_sequence(m.group(2))
37 | text = m.group(3)
38 |
39 | # Append EOS token
40 | sequence.append(_symbol_to_id['~'])
41 | return sequence
42 |
43 |
44 | def sequence_to_text(sequence):
45 | '''Converts a sequence of IDs back to a string'''
46 | result = ''
47 | for symbol_id in sequence:
48 | if symbol_id in _id_to_symbol:
49 | s = _id_to_symbol[symbol_id]
50 | # Enclose ARPAbet back in curly braces:
51 | if len(s) > 1 and s[0] == '@':
52 | s = '{%s}' % s[1:]
53 | result += s
54 | return result.replace('}{', ' ')
55 |
56 |
57 | def _clean_text(text, cleaner_names):
58 | for name in cleaner_names:
59 | cleaner = getattr(cleaners, name)
60 | if not cleaner:
61 | raise Exception('Unknown cleaner: %s' % name)
62 | text = cleaner(text)
63 | return text
64 |
65 |
66 | def _symbols_to_sequence(symbols):
67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
68 |
69 |
70 | def _arpabet_to_sequence(text):
71 | return _symbols_to_sequence(['@' + s for s in text.split()])
72 |
73 |
74 | def _should_keep_symbol(s):
75 | return s in _symbol_to_id and s != '_' and s != '~'
76 |
--------------------------------------------------------------------------------
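A minimal sketch of the curly-brace handling described in text_to_sequence's docstring: plain text is run through the named cleaners and mapped to character symbols, while brace-enclosed ARPAbet is mapped to the '@'-prefixed phoneme symbols and restored on the way back.

from dv3.deepvoice3_pytorch.frontend.text import text_to_sequence, sequence_to_text

seq = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.", ["english_cleaners"])
print(seq)                    # integer IDs, ending with the EOS symbol '~'
print(sequence_to_text(seq))  # cleaned text with the ARPAbet chunk re-enclosed in braces
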
/dv3/deepvoice3_pytorch/frontend/text/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/text/__pycache__/cleaners.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/cleaners.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/text/__pycache__/cmudict.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/cmudict.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/text/__pycache__/numbers.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/numbers.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/text/__pycache__/symbols.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/symbols.cpython-36.pyc
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/frontend/text/cleaners.py:
--------------------------------------------------------------------------------
1 | '''
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 |
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 | 1. "english_cleaners" for English text
7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 | the symbols in symbols.py to match your data).
11 | '''
12 |
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 |
17 |
18 | # Regular expression matching whitespace:
19 | _whitespace_re = re.compile(r'\s+')
20 |
21 | # List of (regular expression, replacement) pairs for abbreviations:
22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
23 | ('mrs', 'misess'),
24 | ('mr', 'mister'),
25 | ('dr', 'doctor'),
26 | ('st', 'saint'),
27 | ('co', 'company'),
28 | ('jr', 'junior'),
29 | ('maj', 'major'),
30 | ('gen', 'general'),
31 | ('drs', 'doctors'),
32 | ('rev', 'reverend'),
33 | ('lt', 'lieutenant'),
34 | ('hon', 'honorable'),
35 | ('sgt', 'sergeant'),
36 | ('capt', 'captain'),
37 | ('esq', 'esquire'),
38 | ('ltd', 'limited'),
39 | ('col', 'colonel'),
40 | ('ft', 'fort'),
41 | ]]
42 |
43 |
44 | def expand_abbreviations(text):
45 | for regex, replacement in _abbreviations:
46 | text = re.sub(regex, replacement, text)
47 | return text
48 |
49 |
50 | def expand_numbers(text):
51 | return normalize_numbers(text)
52 |
53 |
54 | def lowercase(text):
55 | return text.lower()
56 |
57 |
58 | def collapse_whitespace(text):
59 | return re.sub(_whitespace_re, ' ', text)
60 |
61 |
62 | def convert_to_ascii(text):
63 | return unidecode(text)
64 |
65 |
66 | def add_punctuation(text):
67 | if len(text) == 0:
68 | return text
69 | if text[-1] not in '!,.:;?':
70 | text = text + '.' # without this the decoder is confused about when to output EOS
71 | return text
72 |
73 |
74 | def basic_cleaners(text):
75 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
76 | text = lowercase(text)
77 | text = collapse_whitespace(text)
78 | return text
79 |
80 |
81 | def transliteration_cleaners(text):
82 | '''Pipeline for non-English text that transliterates to ASCII.'''
83 | text = convert_to_ascii(text)
84 | text = lowercase(text)
85 | text = collapse_whitespace(text)
86 | return text
87 |
88 |
89 | def english_cleaners(text):
90 | '''Pipeline for English text, including number and abbreviation expansion.'''
91 | text = convert_to_ascii(text)
92 | text = add_punctuation(text)
93 | text = lowercase(text)
94 | text = expand_numbers(text)
95 | text = expand_abbreviations(text)
96 | text = collapse_whitespace(text)
97 | return text
98 |
--------------------------------------------------------------------------------
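A few example inputs for the pipelines above (unidecode and inflect required); english_cleaners adds terminal punctuation, lowercases, expands numbers and abbreviations, and collapses whitespace.

from dv3.deepvoice3_pytorch.frontend.text.cleaners import english_cleaners, basic_cleaners

print(english_cleaners("Dr. Smith paid $3.50 on Jan 1st"))
# -> "doctor smith paid three dollars, fifty cents on jan first."
print(basic_cleaners("  Hello   World  "))
# -> " hello world "
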
/dv3/deepvoice3_pytorch/frontend/text/cmudict.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | valid_symbols = [
5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
12 | ]
13 |
14 | _valid_symbol_set = set(valid_symbols)
15 |
16 |
17 | class CMUDict:
18 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
19 |
20 | def __init__(self, file_or_path, keep_ambiguous=True):
21 | if isinstance(file_or_path, str):
22 | with open(file_or_path, encoding='latin-1') as f:
23 | entries = _parse_cmudict(f)
24 | else:
25 | entries = _parse_cmudict(file_or_path)
26 | if not keep_ambiguous:
27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
28 | self._entries = entries
29 |
30 | def __len__(self):
31 | return len(self._entries)
32 |
33 | def lookup(self, word):
34 | '''Returns list of ARPAbet pronunciations of the given word.'''
35 | return self._entries.get(word.upper())
36 |
37 |
38 | _alt_re = re.compile(r'\([0-9]+\)')
39 |
40 |
41 | def _parse_cmudict(file):
42 | cmudict = {}
43 | for line in file:
44 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
45 | parts = line.split(' ')
46 | word = re.sub(_alt_re, '', parts[0])
47 | pronunciation = _get_pronunciation(parts[1])
48 | if pronunciation:
49 | if word in cmudict:
50 | cmudict[word].append(pronunciation)
51 | else:
52 | cmudict[word] = [pronunciation]
53 | return cmudict
54 |
55 |
56 | def _get_pronunciation(s):
57 | parts = s.strip().split(' ')
58 | for part in parts:
59 | if part not in _valid_symbol_set:
60 | return None
61 | return ' '.join(parts)
62 |
--------------------------------------------------------------------------------
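A minimal sketch of the CMUDict wrapper, assuming a local copy of the dictionary file (e.g. cmudict-0.7b, latin-1 encoded) at a hypothetical path:

from dv3.deepvoice3_pytorch.frontend.text.cmudict import CMUDict

cmu = CMUDict("cmudict-0.7b")   # hypothetical local path to the dictionary file
print(len(cmu))                 # number of entries kept
print(cmu.lookup("hello"))      # list of ARPAbet pronunciations, e.g. ['HH AH0 L OW1', ...]
print(cmu.lookup("qqqq"))       # None for out-of-vocabulary words
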
/dv3/deepvoice3_pytorch/frontend/text/numbers.py:
--------------------------------------------------------------------------------
1 | import inflect
2 | import re
3 |
4 |
5 | _inflect = inflect.engine()
6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
11 | _number_re = re.compile(r'[0-9]+')
12 |
13 |
14 | def _remove_commas(m):
15 | return m.group(1).replace(',', '')
16 |
17 |
18 | def _expand_decimal_point(m):
19 | return m.group(1).replace('.', ' point ')
20 |
21 |
22 | def _expand_dollars(m):
23 | match = m.group(1)
24 | parts = match.split('.')
25 | if len(parts) > 2:
26 | return match + ' dollars' # Unexpected format
27 | dollars = int(parts[0]) if parts[0] else 0
28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
29 | if dollars and cents:
30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
31 | cent_unit = 'cent' if cents == 1 else 'cents'
32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
33 | elif dollars:
34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
35 | return '%s %s' % (dollars, dollar_unit)
36 | elif cents:
37 | cent_unit = 'cent' if cents == 1 else 'cents'
38 | return '%s %s' % (cents, cent_unit)
39 | else:
40 | return 'zero dollars'
41 |
42 |
43 | def _expand_ordinal(m):
44 | return _inflect.number_to_words(m.group(0))
45 |
46 |
47 | def _expand_number(m):
48 | num = int(m.group(0))
49 | if num > 1000 and num < 3000:
50 | if num == 2000:
51 | return 'two thousand'
52 | elif num > 2000 and num < 2010:
53 | return 'two thousand ' + _inflect.number_to_words(num % 100)
54 | elif num % 100 == 0:
55 | return _inflect.number_to_words(num // 100) + ' hundred'
56 | else:
57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
58 | else:
59 | return _inflect.number_to_words(num, andword='')
60 |
61 |
62 | def normalize_numbers(text):
63 | text = re.sub(_comma_number_re, _remove_commas, text)
64 | text = re.sub(_pounds_re, r'\1 pounds', text)
65 | text = re.sub(_dollars_re, _expand_dollars, text)
66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
67 | text = re.sub(_ordinal_re, _expand_ordinal, text)
68 | text = re.sub(_number_re, _expand_number, text)
69 | return text
70 |
--------------------------------------------------------------------------------
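Representative inputs for normalize_numbers (inflect required):

from dv3.deepvoice3_pytorch.frontend.text.numbers import normalize_numbers

print(normalize_numbers("I owe you $2"))     # -> "I owe you two dollars"
print(normalize_numbers("£1,000 in 1999"))   # -> "one thousand pounds in nineteen ninety-nine"
print(normalize_numbers("3.14 on the 2nd"))  # -> "three point fourteen on the second"
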
/dv3/deepvoice3_pytorch/frontend/text/symbols.py:
--------------------------------------------------------------------------------
1 | '''
2 | Defines the set of symbols used in text input to the model.
3 |
4 | The default is a set of ASCII characters that works well for English or text that has been run
5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
6 | '''
7 | from .cmudict import valid_symbols
8 |
9 | _pad = '_'
10 | _eos = '~'
11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
12 |
13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
14 | _arpabet = ['@' + s for s in valid_symbols]
15 |
16 | # Export all symbols:
17 | symbols = [_pad, _eos] + list(_characters) + _arpabet
18 |
--------------------------------------------------------------------------------
/dv3/deepvoice3_pytorch/modules.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import torch
4 | from torch import nn
5 | import math
6 | import numpy as np
7 | from torch.nn import functional as F
8 |
9 |
10 | def position_encoding_init(n_position, d_pos_vec, position_rate=1.0,
11 | sinusoidal=True):
12 | ''' Initialize the sinusoidal position encoding table '''
13 |
14 | # keep dim 0 for padding token position encoding zero vector
15 | position_enc = np.array([
16 | [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) for i in range(d_pos_vec)]
17 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
18 |
19 | position_enc = torch.from_numpy(position_enc).float()
20 | if sinusoidal:
21 | position_enc[1:, 0::2] = torch.sin(position_enc[1:, 0::2]) # dim 2i
22 | position_enc[1:, 1::2] = torch.cos(position_enc[1:, 1::2]) # dim 2i+1
23 |
24 | return position_enc
25 |
26 |
27 | def sinusoidal_encode(x, w):
28 | y = w * x
29 | y[1:, 0::2] = torch.sin(y[1:, 0::2].clone())
30 | y[1:, 1::2] = torch.cos(y[1:, 1::2].clone())
31 | return y
32 |
33 |
34 | class SinusoidalEncoding(nn.Embedding):
35 | def __init__(self, num_embeddings, embedding_dim, padding_idx=0,
36 | *args, **kwargs):
37 | super(SinusoidalEncoding, self).__init__(num_embeddings, embedding_dim,
38 | padding_idx, *args, **kwargs)
39 | self.weight.data = position_encoding_init(num_embeddings, embedding_dim,
40 | position_rate=1.0,
41 | sinusoidal=False)
42 |
43 | def forward(self, x, w=1.0):
44 | is_scalar = np.isscalar(w)
45 | padding_idx = self.padding_idx
46 | if padding_idx is None:
47 | padding_idx = -1
48 |
49 | if is_scalar or w.size(0) == 1:
50 | weight = sinusoidal_encode(self.weight, w)
51 | return self._backend.Embedding.apply(
52 | x, weight,
53 | padding_idx, self.max_norm, self.norm_type,
54 | self.scale_grad_by_freq, self.sparse
55 | )
56 | else:
57 | # TODO: cannot simply apply this to a whole batch;
58 | # better to implement an efficient batched function
59 | pe = []
60 | for batch_idx, we in enumerate(w):
61 | weight = sinusoidal_encode(self.weight, we)
62 | pe.append(self._backend.Embedding.apply(
63 | x[batch_idx], weight,
64 | padding_idx, self.max_norm, self.norm_type,
65 | self.scale_grad_by_freq, self.sparse
66 | ))
67 | pe = torch.stack(pe)
68 | return pe
69 |
70 |
71 | class GradMultiply(torch.autograd.Function):
72 | @staticmethod
73 | def forward(ctx, x, scale):
74 | ctx.scale = scale
75 | res = x.new(x)
76 | ctx.mark_shared_storage((x, res))
77 | return res
78 |
79 | @staticmethod
80 | def backward(ctx, grad):
81 | return grad * ctx.scale, None
82 |
83 |
84 | def Linear(in_features, out_features, dropout=0):
85 | """Weight-normalized Linear layer (input: N x T x C)"""
86 | m = nn.Linear(in_features, out_features)
87 | m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
88 | m.bias.data.zero_()
89 | return nn.utils.weight_norm(m)
90 |
91 |
92 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01):
93 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
94 | m.weight.data.normal_(0, std)
95 | return m
96 |
97 |
98 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs):
99 | from .conv import Conv1d
100 | m = Conv1d(in_channels, out_channels, kernel_size, **kwargs)
101 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
102 | m.weight.data.normal_(mean=0, std=std)
103 | m.bias.data.zero_()
104 | return nn.utils.weight_norm(m)
105 |
106 |
107 | def ConvTranspose1d(in_channels, out_channels, kernel_size, dropout=0,
108 | std_mul=1.0, **kwargs):
109 | m = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, **kwargs)
110 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
111 | m.weight.data.normal_(mean=0, std=std)
112 | m.bias.data.zero_()
113 | return nn.utils.weight_norm(m)
114 |
115 |
116 | class Conv1dGLU(nn.Module):
117 | """(Dilated) Conv1d + Gated linear unit + (optionally) speaker embedding
118 | """
119 |
120 | def __init__(self, n_speakers, speaker_embed_dim,
121 | in_channels, out_channels, kernel_size,
122 | dropout, padding=None, dilation=1, causal=False, residual=False,
123 | *args, **kwargs):
124 | super(Conv1dGLU, self).__init__()
125 | self.dropout = dropout
126 | self.residual = residual
127 | if padding is None:
128 | # no future time stamps available
129 | if causal:
130 | padding = (kernel_size - 1) * dilation
131 | else:
132 | padding = (kernel_size - 1) // 2 * dilation
133 | self.causal = causal
134 |
135 | self.conv = Conv1d(in_channels, 2 * out_channels, kernel_size,
136 | dropout=dropout, padding=padding, dilation=dilation,
137 | *args, **kwargs)
138 | if n_speakers > 1:
139 | self.speaker_proj = Linear(speaker_embed_dim, out_channels)
140 | else:
141 | self.speaker_proj = None
142 |
143 | def forward(self, x, speaker_embed=None):
144 | return self._forward(x, speaker_embed, False)
145 |
146 | def incremental_forward(self, x, speaker_embed=None):
147 | return self._forward(x, speaker_embed, True)
148 |
149 | def _forward(self, x, speaker_embed, is_incremental):
150 | residual = x
151 | x = F.dropout(x, p=self.dropout, training=self.training)
152 | if is_incremental:
153 | splitdim = -1
154 | x = self.conv.incremental_forward(x)
155 | else:
156 | splitdim = 1
157 | x = self.conv(x)
158 | # remove future time steps
159 | x = x[:, :, :residual.size(-1)] if self.causal else x
160 |
161 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
162 | if self.speaker_proj is not None:
163 | softsign = F.softsign(self.speaker_proj(speaker_embed))
164 | # Since conv layer assumes BCT, we need to transpose
165 | softsign = softsign if is_incremental else softsign.transpose(1, 2)
166 | a = a + softsign
167 | x = a * F.sigmoid(b)
168 | return (x + residual) * math.sqrt(0.5) if self.residual else x
169 |
170 | def clear_buffer(self):
171 | self.conv.clear_buffer()
172 |
173 |
174 | class HighwayConv1d(nn.Module):
175 | """Weight normzlized Conv1d + Highway network (support incremental forward)
176 | """
177 |
178 | def __init__(self, in_channels, out_channels, kernel_size=1, padding=None,
179 | dilation=1, causal=False, dropout=0, std_mul=None, glu=False):
180 | super(HighwayConv1d, self).__init__()
181 | if std_mul is None:
182 | std_mul = 4.0 if glu else 1.0
183 | if padding is None:
184 | # no future time stamps available
185 | if causal:
186 | padding = (kernel_size - 1) * dilation
187 | else:
188 | padding = (kernel_size - 1) // 2 * dilation
189 | self.causal = causal
190 | self.dropout = dropout
191 | self.glu = glu
192 |
193 | self.conv = Conv1d(in_channels, 2 * out_channels,
194 | kernel_size=kernel_size, padding=padding,
195 | dilation=dilation, dropout=dropout,
196 | std_mul=std_mul)
197 |
198 | def forward(self, x):
199 | return self._forward(x, False)
200 |
201 | def incremental_forward(self, x):
202 | return self._forward(x, True)
203 |
204 | def _forward(self, x, is_incremental):
205 | """Forward
206 |
207 | Args:
208 | x: (B, in_channels, T)
209 | returns:
210 | (B, out_channels, T)
211 | """
212 |
213 | residual = x
214 | x = F.dropout(x, p=self.dropout, training=self.training)
215 | if is_incremental:
216 | splitdim = -1
217 | x = self.conv.incremental_forward(x)
218 | else:
219 | splitdim = 1
220 | x = self.conv(x)
221 | # remove future time steps
222 | x = x[:, :, :residual.size(-1)] if self.causal else x
223 |
224 | if self.glu:
225 | x = F.glu(x, dim=splitdim)
226 | return (x + residual) * math.sqrt(0.5)
227 | else:
228 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
229 | T = F.sigmoid(b)
230 | return (T * a + (1 - T) * residual)
231 |
232 | def clear_buffer(self):
233 | self.conv.clear_buffer()
234 |
235 |
236 | def get_mask_from_lengths(memory, memory_lengths):
237 | """Get mask tensor from list of length
238 | Args:
239 | memory: (batch, max_time, dim)
240 | memory_lengths: array like
241 | """
242 | mask = memory.data.new(memory.size(0), memory.size(1)).byte().zero_()
243 | for idx, l in enumerate(memory_lengths):
244 | mask[idx][:l] = 1
245 | return ~mask
246 |
--------------------------------------------------------------------------------
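A small sketch of the position-encoding table and the length mask defined above. position_encoding_init returns an (n_position, d_pos_vec) table whose row 0 is reserved for padding; get_mask_from_lengths flags padded positions so attention can ignore them.

import torch
from dv3.deepvoice3_pytorch.modules import position_encoding_init, get_mask_from_lengths

pe = position_encoding_init(8, 4, position_rate=1.0, sinusoidal=True)
print(pe.shape)   # torch.Size([8, 4])

memory = torch.zeros(2, 5, 3)                # (batch, max_time, dim)
mask = get_mask_from_lengths(memory, [5, 3])
print(mask.shape)  # (2, 5); positions beyond each sequence's length are flagged
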
/dv3/deepvoice3_pytorch/nyanko.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 | from torch.autograd import Variable
7 | import math
8 | import numpy as np
9 |
10 | from .modules import Embedding, Linear, Conv1d, ConvTranspose1d
11 | from .modules import HighwayConv1d, get_mask_from_lengths
12 | from .modules import position_encoding_init
13 | from .deepvoice3 import AttentionLayer
14 |
15 |
16 | class Encoder(nn.Module):
17 | def __init__(self, n_vocab, embed_dim, channels, kernel_size=3,
18 | n_speakers=1, speaker_embed_dim=16, embedding_weight_std=0.01,
19 | padding_idx=None, dropout=0.1):
20 | super(Encoder, self).__init__()
21 | self.dropout = dropout
22 |
23 | # Text input embeddings
24 | self.embed_tokens = Embedding(
25 | n_vocab, embed_dim, padding_idx, embedding_weight_std)
26 |
27 | E = embed_dim
28 | D = channels
29 | self.convnet = nn.Sequential(
30 | Conv1d(E, 2 * D, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
31 | nn.ReLU(inplace=True),
32 | Conv1d(2 * D, 2 * D, kernel_size=1, padding=0, dilation=1, std_mul=2.0),
33 |
34 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
35 | dilation=1, std_mul=1.0, dropout=dropout),
36 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
37 | dilation=3, std_mul=1.0, dropout=dropout),
38 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
39 | dilation=9, std_mul=1.0, dropout=dropout),
40 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
41 | dilation=27, std_mul=1.0, dropout=dropout),
42 |
43 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
44 | dilation=1, std_mul=1.0, dropout=dropout),
45 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
46 | dilation=3, std_mul=1.0, dropout=dropout),
47 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
48 | dilation=9, std_mul=1.0, dropout=dropout),
49 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
50 | dilation=27, std_mul=1.0, dropout=dropout),
51 |
52 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
53 | dilation=1, std_mul=1.0, dropout=dropout),
54 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None,
55 | dilation=1, std_mul=1.0, dropout=dropout),
56 |
57 | HighwayConv1d(2 * D, 2 * D, kernel_size=1, padding=0,
58 | dilation=1, std_mul=1.0, dropout=dropout),
59 | )
60 |
61 | def forward(self, text_sequences, text_positions=None, lengths=None,
62 | speaker_embed=None):
63 | # embed text_sequences
64 | # (B, T, E)
65 | x = self.embed_tokens(text_sequences)
66 |
67 | x = self.convnet(x.transpose(1, 2)).transpose(1, 2)
68 |
69 | # (B, T, D) and (B, T, D)
70 | keys, values = x.split(x.size(-1) // 2, dim=-1)
71 |
72 | return keys, values
73 |
74 |
75 | class Decoder(nn.Module):
76 | def __init__(self, embed_dim, in_dim=80, r=5, channels=256, kernel_size=3,
77 | n_speakers=1, speaker_embed_dim=16,
78 | max_positions=512, padding_idx=None,
79 | dropout=0.1,
80 | use_memory_mask=False,
81 | force_monotonic_attention=False,
82 | query_position_rate=1.0,
83 | key_position_rate=1.29,
84 | window_ahead=3,
85 | window_backward=1,
86 | key_projection=False,
87 | value_projection=False,
88 | ):
89 | super(Decoder, self).__init__()
90 | self.dropout = dropout
91 | self.in_dim = in_dim
92 | self.r = r
93 |
94 | D = channels
95 | F = in_dim * r # should be r = 1 to replicate
96 | self.audio_encoder_modules = nn.ModuleList([
97 | Conv1d(F, D, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
98 | nn.ReLU(inplace=True),
99 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0),
100 | nn.ReLU(inplace=True),
101 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0),
102 |
103 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
104 | dilation=1, causal=True, std_mul=1.0, dropout=dropout),
105 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
106 | dilation=3, causal=True, std_mul=1.0, dropout=dropout),
107 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
108 | dilation=9, causal=True, std_mul=1.0, dropout=dropout),
109 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
110 | dilation=27, causal=True, std_mul=1.0, dropout=dropout),
111 |
112 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
113 | dilation=1, causal=True, std_mul=1.0, dropout=dropout),
114 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
115 | dilation=3, causal=True, std_mul=1.0, dropout=dropout),
116 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
117 | dilation=9, causal=True, std_mul=1.0, dropout=dropout),
118 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
119 | dilation=27, causal=True, std_mul=1.0, dropout=dropout),
120 |
121 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
122 | dilation=3, causal=True, std_mul=1.0, dropout=dropout),
123 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
124 | dilation=3, causal=True, std_mul=1.0, dropout=dropout),
125 | ])
126 |
127 | self.attention = AttentionLayer(D, D, dropout=dropout,
128 | window_ahead=window_ahead,
129 | window_backward=window_backward,
130 | key_projection=key_projection,
131 | value_projection=value_projection)
132 |
133 | self.audio_decoder_modules = nn.ModuleList([
134 | Conv1d(2 * D, D, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
135 |
136 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
137 | dilation=1, causal=True, std_mul=1.0, dropout=dropout),
138 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
139 | dilation=3, causal=True, std_mul=1.0, dropout=dropout),
140 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
141 | dilation=9, causal=True, std_mul=1.0, dropout=dropout),
142 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
143 | dilation=27, causal=True, std_mul=1.0, dropout=dropout),
144 |
145 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
146 | dilation=1, causal=True, std_mul=1.0, dropout=dropout),
147 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None,
148 | dilation=1, causal=True, std_mul=1.0, dropout=dropout),
149 |
150 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
151 | nn.ReLU(inplace=True),
152 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0),
153 | nn.ReLU(inplace=True),
154 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0),
155 | nn.ReLU(inplace=True),
156 | ])
157 | self.last_conv = Conv1d(D, F, kernel_size=1, padding=0, dilation=1, std_mul=2.0)
158 |
159 | # Done prediction
160 | self.fc = Linear(F, 1)
161 |
162 | # Position encodings for query (decoder states) and keys (encoder states)
163 | self.embed_query_positions = Embedding(
164 | max_positions, D, padding_idx)
165 | self.embed_query_positions.weight.data = position_encoding_init(
166 | max_positions, D, position_rate=query_position_rate, sinusoidal=True)
167 | self.embed_keys_positions = Embedding(
168 | max_positions, D, padding_idx)
169 | self.embed_keys_positions.weight.data = position_encoding_init(
170 | max_positions, D, position_rate=key_position_rate, sinusoidal=True)
171 |
172 | # options
173 | self.max_decoder_steps = 200
174 | self.min_decoder_steps = 10
175 | self.use_memory_mask = use_memory_mask
176 | self.force_monotonic_attention = force_monotonic_attention
177 |
178 | def forward(self, encoder_out, inputs=None,
179 | text_positions=None, frame_positions=None,
180 | speaker_embed=None, lengths=None):
181 |
182 | if inputs is None:
183 | assert text_positions is not None
184 | self.start_fresh_sequence()
185 | outputs = self.incremental_forward(encoder_out, text_positions)
186 | return outputs
187 |
188 | # Grouping multiple frames if necessary
189 | if inputs.size(-1) == self.in_dim:
190 | inputs = inputs.view(inputs.size(0), inputs.size(1) // self.r, -1)
191 | assert inputs.size(-1) == self.in_dim * self.r
192 |
193 | keys, values = encoder_out
194 |
195 | if self.use_memory_mask and lengths is not None:
196 | mask = get_mask_from_lengths(keys, lengths)
197 | else:
198 | mask = None
199 |
200 | # position encodings
201 | if text_positions is not None:
202 | text_pos_embed = self.embed_keys_positions(text_positions)
203 | keys = keys + text_pos_embed
204 | if frame_positions is not None:
205 | frame_pos_embed = self.embed_query_positions(frame_positions)
206 |
207 | # transpose only once to speed up attention layers
208 | keys = keys.transpose(1, 2).contiguous()
209 |
210 | # (B, T, C)
211 | x = inputs
212 |
213 | # (B, C, T)
214 | x = x.transpose(1, 2)
215 |
216 | # Apply audio encoder
217 | for f in self.audio_encoder_modules:
218 | x = f(x)
219 | Q = x
220 |
221 | # Attention modules assume query as (B, T, C)
222 | x = x.transpose(1, 2)
223 | x = x if frame_positions is None else x + frame_pos_embed
224 | R, alignments = self.attention(x, (keys, values), mask=mask)
225 | R = R.transpose(1, 2)
226 |
227 | # (B, C*2, T)
228 | Rd = torch.cat((R, Q), dim=1)
229 | x = Rd
230 |
231 | # Apply audio decoder
232 | for f in self.audio_decoder_modules:
233 | x = f(x)
234 | decoder_states = x.transpose(1, 2).contiguous()
235 | x = self.last_conv(x)
236 |
237 | # (B, T, C)
238 | x = x.transpose(1, 2)
239 |
240 | # Mel
241 | outputs = F.sigmoid(x)
242 |
243 | # Done prediction
244 | done = F.sigmoid(self.fc(x))
245 |
246 | # Add an extra dim for convenience
247 | alignments = alignments.unsqueeze(0)
248 |
249 | return outputs, alignments, done, decoder_states
250 |
251 | def incremental_forward(self, encoder_out, text_positions,
252 | initial_input=None, test_inputs=None):
253 | keys, values = encoder_out
254 | B = keys.size(0)
255 |
256 | # position encodings
257 | if text_positions is not None:
258 | text_pos_embed = self.embed_keys_positions(text_positions)
259 | keys = keys + text_pos_embed
260 |
261 | # transpose only once to speed up attention layers
262 | keys = keys.transpose(1, 2).contiguous()
263 |
264 | decoder_states = []
265 | outputs = []
266 | alignments = []
267 | dones = []
268 | # initially set to zeros
269 | last_attended = 0 if self.force_monotonic_attention else None
270 |
271 | t = 0
272 | if initial_input is None:
273 | initial_input = Variable(
274 | keys.data.new(B, 1, self.in_dim * self.r).zero_())
275 | current_input = initial_input
276 | while True:
277 | # frame positions start at 1.
278 | frame_pos = Variable(keys.data.new(B, 1).fill_(t + 1)).long()
279 | frame_pos_embed = self.embed_query_positions(frame_pos)
280 |
281 | if test_inputs is not None:
282 | if t >= test_inputs.size(1):
283 | break
284 | current_input = test_inputs[:, t, :].unsqueeze(1)
285 | else:
286 | if t > 0:
287 | current_input = outputs[-1]
288 |
289 | # (B, 1, C)
290 | x = current_input
291 |
292 | for f in self.audio_encoder_modules:
293 | try:
294 | x = f.incremental_forward(x)
295 | except AttributeError as e:
296 | x = f(x)
297 | Q = x
298 |
299 | R, alignment = self.attention(
300 | x + frame_pos_embed, (keys, values), last_attended=last_attended)
301 | if self.force_monotonic_attention:
302 | last_attended = alignment.max(-1)[1].view(-1).data[0]
303 |
304 | Rd = torch.cat((R, Q), dim=-1)
305 | x = Rd
306 | for f in self.audio_decoder_modules:
307 | try:
308 | x = f.incremental_forward(x)
309 | except AttributeError as e:
310 | x = f(x)
311 | decoder_state = x
312 | x = self.last_conv.incremental_forward(x)
313 |
314 | # Output & done flag predictions
315 | output = F.sigmoid(x)
316 | done = F.sigmoid(self.fc(x))
317 |
318 | decoder_states += [decoder_state]
319 | outputs += [output]
320 | alignments += [alignment]
321 | dones += [done]
322 |
323 | t += 1
324 | if test_inputs is None:
325 | if (done > 0.5).all() and t > self.min_decoder_steps:
326 | break
327 | elif t > self.max_decoder_steps:
328 | break
329 |
330 | # Remove 1-element time axis
331 | alignments = list(map(lambda x: x.squeeze(1), alignments))
332 | decoder_states = list(map(lambda x: x.squeeze(1), decoder_states))
333 | outputs = list(map(lambda x: x.squeeze(1), outputs))
334 |
335 | # Combine outputs for all time steps
336 | alignments = torch.stack(alignments).transpose(0, 1)
337 | decoder_states = torch.stack(decoder_states).transpose(0, 1).contiguous()
338 | outputs = torch.stack(outputs).transpose(0, 1).contiguous()
339 |
340 | return outputs, alignments, dones, decoder_states
341 |
342 | def start_fresh_sequence(self):
343 | _clear_modules(self.audio_encoder_modules)
344 | _clear_modules(self.audio_decoder_modules)
345 |
346 |
347 | def _clear_modules(modules):
348 | for m in modules:
349 | try:
350 | m.clear_buffer()
351 | except AttributeError as e:
352 | pass
353 |
354 |
355 | class Converter(nn.Module):
356 | def __init__(self, in_dim, out_dim, channels=512, kernel_size=3, dropout=0.1):
357 | super(Converter, self).__init__()
358 | self.dropout = dropout
359 | self.in_dim = in_dim
360 | self.out_dim = out_dim
361 |
362 | F = in_dim
363 | Fd = out_dim
364 | C = channels
365 | self.convnet = nn.Sequential(
366 | Conv1d(F, C, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
367 |
368 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None,
369 | dilation=1, std_mul=1.0, dropout=dropout),
370 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None,
371 | dilation=3, std_mul=1.0, dropout=dropout),
372 |
373 | ConvTranspose1d(C, C, kernel_size=2, padding=0, stride=2, std_mul=1.0),
374 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None,
375 | dilation=1, std_mul=1.0, dropout=dropout),
376 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None,
377 | dilation=3, std_mul=1.0, dropout=dropout),
378 | ConvTranspose1d(C, C, kernel_size=2, padding=0, stride=2, std_mul=1.0),
379 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None,
380 | dilation=1, std_mul=1.0, dropout=dropout),
381 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None,
382 | dilation=3, std_mul=1.0, dropout=dropout),
383 |
384 | Conv1d(C, 2 * C, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
385 |
386 | HighwayConv1d(2 * C, 2 * C, kernel_size=kernel_size, padding=None,
387 | dilation=1, std_mul=1.0, dropout=dropout),
388 | HighwayConv1d(2 * C, 2 * C, kernel_size=kernel_size, padding=None,
389 | dilation=1, std_mul=1.0, dropout=dropout),
390 |
391 | Conv1d(2 * C, Fd, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
392 |
393 | Conv1d(Fd, Fd, kernel_size=1, padding=0, dilation=1, std_mul=1.0),
394 | nn.ReLU(inplace=True),
395 | Conv1d(Fd, Fd, kernel_size=1, padding=0, dilation=1, std_mul=2.0),
396 | nn.ReLU(inplace=True),
397 |
398 | Conv1d(Fd, Fd, kernel_size=1, padding=0, dilation=1, std_mul=2.0),
399 | nn.Sigmoid(),
400 | )
401 |
402 | def forward(self, x, speaker_embed=None):
403 | return self.convnet(x.transpose(1, 2)).transpose(1, 2)
404 |
--------------------------------------------------------------------------------
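A shape-level sketch of the nyanko Converter above, with illustrative dimensions (e.g. decoder-state width 256 in, 513 linear-spectrogram bins out to match fft_size=1024); the two stride-2 ConvTranspose1d stages upsample the time axis by 4x.

import torch
from dv3.deepvoice3_pytorch.nyanko import Converter

conv = Converter(in_dim=256, out_dim=513, channels=512)
x = torch.randn(2, 40, 256)   # (B, T, in_dim)
y = conv(x)
print(y.shape)                # expected (2, 160, 513): time axis upsampled 4x
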
/dv3/deepvoice3_pytorch/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.1+6645b31'
2 |
--------------------------------------------------------------------------------
/dv3/deepvoice3_vctk.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "deepvoice3",
3 | "frontend": "en",
4 | "replace_pronunciation_prob": 0.5,
5 | "builder": "deepvoice3_multispeaker",
6 | "n_speakers": 108,
7 | "speaker_embed_dim": 512,
8 | "num_mels": 80,
9 | "fmin": 125,
10 | "fmax": 7600,
11 | "fft_size": 1024,
12 | "hop_size": 256,
13 | "sample_rate": 22050,
14 | "preemphasis": 0.97,
15 | "min_level_db": -100,
16 | "ref_level_db": 20,
17 | "rescaling": false,
18 | "rescaling_max": 0.999,
19 | "allow_clipping_in_normalization": true,
20 | "downsample_step": 4,
21 | "outputs_per_step": 1,
22 | "embedding_weight_std": 0.1,
23 | "speaker_embedding_weight_std": 0.05,
24 | "padding_idx": 0,
25 | "max_positions": 1024,
26 | "dropout": 0.050000000000000044,
27 | "kernel_size": 3,
28 | "text_embed_dim": 256,
29 | "encoder_channels": 512,
30 | "decoder_channels": 256,
31 | "converter_channels": 256,
32 | "query_position_rate": 2.0,
33 | "key_position_rate": 7.6,
34 | "key_projection": true,
35 | "value_projection": true,
36 | "use_memory_mask": true,
37 | "trainable_positional_encodings": false,
38 | "freeze_embedding": false,
39 | "use_decoder_state_for_postnet_input": true,
40 | "pin_memory": true,
41 | "num_workers": 2,
42 | "masked_loss_weight": 0.5,
43 | "priority_freq": 3000,
44 | "priority_freq_weight": 0.0,
45 | "binary_divergence_weight": 0.1,
46 | "use_guided_attention": true,
47 | "guided_attention_sigma": 0.4,
48 | "batch_size": 16,
49 | "adam_beta1": 0.5,
50 | "adam_beta2": 0.9,
51 | "adam_eps": 1e-06,
52 | "initial_learning_rate": 0.0005,
53 | "lr_schedule": "noam_learning_rate_decay",
54 | "lr_schedule_kwargs": {},
55 | "nepochs": 2000,
56 | "weight_decay": 0.0,
57 | "clip_thresh": 0.1,
58 | "checkpoint_interval": 10000,
59 | "eval_interval": 10000,
60 | "save_optimizer_state": true,
61 | "force_monotonic_attention": true,
62 | "window_ahead": 3,
63 | "window_backward": 1,
64 | "power": 1.4
65 | }
--------------------------------------------------------------------------------
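This JSON mirrors the "deepvoice3_vctk" preset in dv3/hparams.py. A minimal sketch of loading it and overriding the default hyperparameters, assuming the TF 1.x HParams API (values/set_hparam) used by dv3/hparams.py:

import json
from dv3.hparams import hparams

with open("dv3/deepvoice3_vctk.json") as f:
    config = json.load(f)

# apply only keys the default HParams object already defines (skip dict-valued ones)
for key, value in config.items():
    if key in hparams.values() and not isinstance(value, dict):
        hparams.set_hparam(key, value)

print(hparams.builder, hparams.n_speakers)   # deepvoice3_multispeaker 108
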
/dv3/hparams.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | # NOTE: If you want full control over the model architecture, please take a look
4 | # at the code and change whatever you want. Some hyperparameters are hardcoded.
5 |
6 | # Default hyperparameters:
7 | hparams = tf.contrib.training.HParams(
8 | name="deepvoice3",
9 |
10 | # Text:
11 | # [en, jp]
12 | frontend='en',
13 |
14 | # Replace words with their pronunciations with a fixed probability.
15 | # e.g., 'hello' to 'HH AH0 L OW1'
16 | # [en, jp]
17 | # en: Word -> pronunciation using CMUDict
18 | # jp: Word -> pronunciation using MeCab
19 | # [0 ~ 1.0]: 0 means no replacement happens.
20 | replace_pronunciation_prob=0.5,
21 |
22 | # Convenient model builder
23 | # [deepvoice3, deepvoice3_multispeaker, nyanko]
24 | # Definitions can be found at deepvoice3_pytorch/builder.py
25 | # deepvoice3: DeepVoice3 https://arxiv.org/abs/1710.07654
26 | # deepvoice3_multispeaker: Multi-speaker version of DeepVoice3
27 | # nyanko: https://arxiv.org/abs/1710.08969
28 | builder="deepvoice3",
29 |
30 | # Must be configured depending on the dataset and model you use
31 | n_speakers=1,
32 | speaker_embed_dim=128,
33 |
34 | # Presets known to work well.
35 | # NOTE: If specified, hyperparameters are overridden with the preset
36 | preset="",
37 | presets={
38 | "deepvoice3_ljspeech": {
39 | "n_speakers": 1,
40 | "downsample_step": 4,
41 | "outputs_per_step": 1,
42 | "embedding_weight_std": 0.1,
43 | "dropout": 1 - 0.95,
44 | "kernel_size": 3,
45 | "text_embed_dim": 256,
46 | "encoder_channels": 512,
47 | "decoder_channels": 256,
48 | "converter_channels": 256,
49 | "use_guided_attention": True,
50 | "guided_attention_sigma": 0.2,
51 | "binary_divergence_weight": 0.1,
52 | "use_decoder_state_for_postnet_input": True,
53 | "max_positions": 512,
54 | "query_position_rate": 1.0,
55 | "key_position_rate": 1.385,
56 | "key_projection": True,
57 | "value_projection": True,
58 | "clip_thresh": 0.1,
59 | "initial_learning_rate": 5e-4,
60 | },
61 | "deepvoice3_vctk": {
62 | "n_speakers": 108,
63 | "speaker_embed_dim": 512,
64 | "downsample_step": 4,
65 | "outputs_per_step": 1,
66 | "embedding_weight_std": 0.1,
67 | "speaker_embedding_weight_std": 0.05,
68 | "dropout": 1 - 0.95,
69 | "kernel_size": 3,
70 | "text_embed_dim": 256,
71 | "encoder_channels": 512,
72 | "decoder_channels": 256,
73 | "converter_channels": 256,
74 | "use_guided_attention": True,
75 | "guided_attention_sigma": 0.4,
76 | "binary_divergence_weight": 0.1,
77 | "use_decoder_state_for_postnet_input": True,
78 | "max_positions": 1024,
79 | "query_position_rate": 2.0,
80 | "key_position_rate": 7.6,
81 | "key_projection": True,
82 | "value_projection": True,
83 | "clip_thresh": 0.1,
84 | "initial_learning_rate": 5e-4,
85 | },
86 | "deepvoice3_speaker_adaptation_vctk": {
87 | "n_speakers": 1,
88 | "speaker_embed_dim": 128,
89 | "downsample_step": 4,
90 | "outputs_per_step": 1,
91 | "embedding_weight_std": 0.1,
92 | "speaker_embedding_weight_std": 0.05,
93 | "dropout": 1 - 0.95,
94 | "kernel_size": 3,
95 | "text_embed_dim": 256,
96 | "encoder_channels": 512,
97 | "decoder_channels": 256,
98 | "converter_channels": 256,
99 | "use_guided_attention": True,
100 | "guided_attention_sigma": 0.4,
101 | "binary_divergence_weight": 0.1,
102 | "use_decoder_state_for_postnet_input": True,
103 | "max_positions": 1024,
104 | "query_position_rate": 2.0,
105 | "key_position_rate": 7.6,
106 | "key_projection": True,
107 | "value_projection": True,
108 | "clip_thresh": 0.1,
109 | "initial_learning_rate": 5e-4,
110 | },
111 | "deepvoice3_speaker_adaptation_libri": {
112 | "n_speakers": 9026,
113 | "speaker_embed_dim": 128,
114 | "downsample_step": 4,
115 | "outputs_per_step": 1,
116 | "embedding_weight_std": 0.1,
117 | "speaker_embedding_weight_std": 0.05,
118 | "dropout": 1 - 0.95,
119 | "kernel_size": 3,
120 | "text_embed_dim": 256,
121 | "encoder_channels": 512,
122 | "decoder_channels": 256,
123 | "converter_channels": 256,
124 | "use_guided_attention": True,
125 | "guided_attention_sigma": 0.4,
126 | "binary_divergence_weight": 0.1,
127 | "use_decoder_state_for_postnet_input": True,
128 | "max_positions": 1024,
129 | "query_position_rate": 2.0,
130 | "key_position_rate": 7.6,
131 | "key_projection": True,
132 | "value_projection": True,
133 | "clip_thresh": 0.1,
134 | "initial_learning_rate": 5e-4,
135 | },
136 | "nyanko_ljspeech": {
137 | "n_speakers": 1,
138 | "downsample_step": 4,
139 | "outputs_per_step": 1,
140 | "embedding_weight_std": 0.01,
141 | "dropout": 1 - 0.95,
142 | "kernel_size": 3,
143 | "text_embed_dim": 128,
144 | "encoder_channels": 256,
145 | "decoder_channels": 256,
146 | "converter_channels": 256,
147 | "use_guided_attention": True,
148 | "guided_attention_sigma": 0.2,
149 | "binary_divergence_weight": 0.1,
150 | "use_decoder_state_for_postnet_input": True,
151 | "max_positions": 512,
152 | "query_position_rate": 1.0,
153 | "key_position_rate": 1.385,
154 | "key_projection": False,
155 | "value_projection": False,
156 | "clip_thresh": 0.1,
157 | "initial_learning_rate": 5e-4,
158 | },
159 | },
160 |
161 | # Audio:
162 | num_mels=80,
163 | fft_size=1024,
164 | hop_size=256,
165 | sample_rate=22050,
166 | preemphasis=0.97,
167 | min_level_db=-100,
168 | ref_level_db=20,
169 |
170 | # Model:
171 | downsample_step=4, # must be 4 when builder="nyanko"
172 | outputs_per_step=1, # must be 1 when builder="nyanko"
173 | embedding_weight_std=0.1,
174 | speaker_embedding_weight_std=0.01,
175 | padding_idx=0,
176 |     # Maximum length of the input text (in positions).
177 |     # Try a larger value if you want to feed very long text inputs.
178 | max_positions=512,
179 | dropout=1 - 0.95,
180 | kernel_size=3,
181 | text_embed_dim=128,
182 | encoder_channels=256,
183 | decoder_channels=256,
184 |     # Note: a large number of converter channels incurs a significant computational cost.
185 | converter_channels=256,
186 | query_position_rate=1.0,
187 | key_position_rate=1.385, # 2.37 for jsut
188 | key_projection=False,
189 | value_projection=False,
190 | use_memory_mask=True,
191 | trainable_positional_encodings=False,
192 | freeze_embedding=False,
193 | # If True, use decoder's internal representation for postnet inputs,
194 | # otherwise use mel-spectrogram.
195 | use_decoder_state_for_postnet_input=True,
196 |
197 | # Data loader
198 | pin_memory=True,
199 | num_workers=2,
200 |
201 | # Loss
202 | masked_loss_weight=0.5, # (1-w)*loss + w * masked_loss
203 |     priority_freq=3000,  # heuristic: prioritize [0 ~ priority_freq] for linear loss
204 | priority_freq_weight=0.0, # (1-w)*linear_loss + w*priority_linear_loss
205 | # https://arxiv.org/pdf/1710.08969.pdf
206 |     # Adding the divergence to the loss stabilizes training, especially for
207 |     # very deep (> 10 layers) networks.
208 |     # The binary divergence loss has roughly 10x the scale of the L1 loss, so 0.1 is chosen.
209 | binary_divergence_weight=0.1, # set 0 to disable
210 | use_guided_attention=True,
211 | guided_attention_sigma=0.2,
212 |
213 | # Training:
214 | batch_size=16,
215 | adam_beta1=0.5,
216 | adam_beta2=0.9,
217 | adam_eps=1e-6,
218 | initial_learning_rate=5e-4, # 0.001,
219 | lr_schedule="noam_learning_rate_decay",
220 | lr_schedule_kwargs={},
221 | nepochs=2000,
222 | weight_decay=0.0,
223 | clip_thresh=0.1,
224 |
225 | # Save
226 | checkpoint_interval=10000,
227 | eval_interval=10000,
228 | save_optimizer_state=True,
229 |
230 | # Eval:
231 |     # This can be a list for multiple layers of attention,
232 | # e.g., [True, False, False, False, True]
233 | force_monotonic_attention=True,
234 | # Attention constraint for incremental decoding
235 | window_ahead=3,
236 |     # 0 tends to prevent word repetition, but sometimes causes skipped words
237 | window_backward=1,
238 | power=1.4, # Power to raise magnitudes to prior to phase retrieval
239 | )
240 |
241 |
242 | def hparams_debug_string():
243 | values = hparams.values()
244 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)]
245 | return 'Hyperparameters:\n' + '\n'.join(hp)
--------------------------------------------------------------------------------
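A note on how the presets above are meant to be used: nothing in hparams.py applies them automatically; synthesis.py (further down) looks the selected preset up by name and re-parses it on top of the defaults. A minimal sketch of that flow, assuming only the HParams-style `parse`/`parse_json` API that synthesis.py itself calls:

```
# Sketch of the preset-override flow used by synthesis.py (and the training scripts).
# Assumes dv3.hparams exposes the HParams-style parse()/parse_json() methods seen
# in synthesis.py; this is a condensed example, not new API.
import json
from dv3.hparams import hparams

hparams.parse("preset=deepvoice3_vctk")        # choose a preset by name
preset = hparams.presets[hparams.preset]       # look up its override dict
hparams.parse_json(json.dumps(preset))         # apply it on top of the defaults

print(hparams.n_speakers)                      # -> 108 for deepvoice3_vctk
```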
/dv3/jsut.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | from functools import partial
3 | import numpy as np
4 | import os
5 | import dv3.audio
6 | from nnmnkwii.datasets import jsut
7 | from nnmnkwii.io import hts
8 | from dv3.hparams import hparams
9 | from os.path import exists
10 | import librosa
11 |
12 |
13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
14 | executor = ProcessPoolExecutor(max_workers=num_workers)
15 | futures = []
16 |
17 | transcriptions = jsut.TranscriptionDataSource(
18 | in_dir, subsets=jsut.available_subsets).collect_files()
19 | wav_paths = jsut.WavFileDataSource(
20 | in_dir, subsets=jsut.available_subsets).collect_files()
21 |
22 | for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)):
23 | futures.append(executor.submit(
24 | partial(_process_utterance, out_dir, index + 1, wav_path, text)))
25 | return [future.result() for future in tqdm(futures)]
26 |
27 |
28 | def _process_utterance(out_dir, index, wav_path, text):
29 | sr = hparams.sample_rate
30 |
31 | # Load the audio to a numpy array:
32 | wav = dv3.audio.load_wav(wav_path)
33 |
34 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")
35 |
36 | # Trim silence from hts labels if available
37 | if exists(lab_path):
38 | labels = hts.load(lab_path)
39 | assert labels[0][-1] == "silB"
40 | assert labels[-1][-1] == "silE"
41 | b = int(labels[0][1] * 1e-7 * sr)
42 | e = int(labels[-1][0] * 1e-7 * sr)
43 | wav = wav[b:e]
44 | else:
45 | wav, _ = librosa.effects.trim(wav, top_db=30)
46 |
47 | # Compute the linear-scale spectrogram from the wav:
48 |     spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
49 | n_frames = spectrogram.shape[1]
50 |
51 | # Compute a mel-scale spectrogram from the wav:
52 | mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)
53 |
54 | # Write the spectrograms to disk:
55 | spectrogram_filename = 'jsut-spec-%05d.npy' % index
56 | mel_filename = 'jsut-mel-%05d.npy' % index
57 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
58 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
59 |
60 | # Return a tuple describing this training example:
61 | return (spectrogram_filename, mel_filename, n_frames, text)
--------------------------------------------------------------------------------
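The `1e-7` factor in `_process_utterance` converts HTK-style label times, which are integers counted in 100 ns ticks, into seconds before scaling by the sample rate. A quick worked example with an illustrative label time:

```
# HTK label times are integers in units of 100 ns (1e-7 s).
sr = 22050            # hparams.sample_rate
label_time = 2850000  # illustrative end time of a leading silence, i.e. 0.285 s
sample_index = int(label_time * 1e-7 * sr)
print(sample_index)   # -> 6284; wav[:6284] is the silence that gets trimmed
```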
/dv3/ljspeech.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | from functools import partial
3 | import numpy as np
4 | import os
5 | import dv3.audio
6 |
7 |
8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory.
10 |
11 | Args:
12 | in_dir: The directory where you have downloaded the LJ Speech dataset
13 | out_dir: The directory to write the output into
14 | num_workers: Optional number of worker processes to parallelize across
15 | tqdm: You can optionally pass tqdm to get a nice progress bar
16 |
17 | Returns:
18 | A list of tuples describing the training examples. This should be written to train.txt
19 | '''
20 |
21 |     # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you
22 | # can omit it and just call _process_utterance on each input if you want.
23 | executor = ProcessPoolExecutor(max_workers=num_workers)
24 | futures = []
25 | index = 1
26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
27 | for line in f:
28 | parts = line.strip().split('|')
29 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
30 | text = parts[2]
31 | futures.append(executor.submit(
32 | partial(_process_utterance, out_dir, index, wav_path, text)))
33 | index += 1
34 | return [future.result() for future in tqdm(futures)]
35 |
36 |
37 | def _process_utterance(out_dir, index, wav_path, text):
38 | '''Preprocesses a single utterance audio/text pair.
39 |
40 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write
41 | to the train.txt file.
42 |
43 | Args:
44 | out_dir: The directory to write the spectrograms into
45 | index: The numeric index to use in the spectrogram filenames.
46 | wav_path: Path to the audio file containing the speech input
47 | text: The text spoken in the input audio file
48 |
49 | Returns:
50 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
51 | '''
52 |
53 | # Load the audio to a numpy array:
54 | wav = dv3.audio.load_wav(wav_path)
55 |
56 | # Compute the linear-scale spectrogram from the wav:
57 | spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
58 | n_frames = spectrogram.shape[1]
59 |
60 | # Compute a mel-scale spectrogram from the wav:
61 | mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)
62 |
63 | # Write the spectrograms to disk:
64 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
65 | mel_filename = 'ljspeech-mel-%05d.npy' % index
66 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
67 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
68 |
69 | # Return a tuple describing this training example:
70 | return (spectrogram_filename, mel_filename, n_frames, text)
--------------------------------------------------------------------------------
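The spectrograms are saved transposed, so each `.npy` file is time-major, and the returned tuples become pipe-delimited lines in `train.txt` (written by `preprocess.py` below). A small sketch of reading one example back; the output directory name here is only illustrative:

```
# Sketch: loading one preprocessed LJSpeech example back from disk.
import os
import numpy as np

out_dir = "./data/ljspeech"  # hypothetical preprocessing output directory
with open(os.path.join(out_dir, "train.txt"), encoding="utf-8") as f:
    spec_name, mel_name, n_frames, text = f.readline().rstrip("\n").split("|", 3)

spec = np.load(os.path.join(out_dir, spec_name))  # (n_frames, num_freq) after the .T above
mel = np.load(os.path.join(out_dir, mel_name))    # (n_frames, num_mels)
assert spec.shape[0] == int(n_frames)
print(text, spec.shape, mel.shape)
```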
/dv3/lrschedule.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329
5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000):
6 | # Noam scheme from tensor2tensor:
7 | warmup_steps = float(warmup_steps)
8 | step = global_step + 1.
9 | lr = init_lr * warmup_steps**0.5 * np.minimum(
10 | step * warmup_steps**-1.5, step**-0.5)
11 | return lr
12 |
13 |
14 | def step_learning_rate_decay(init_lr, global_step,
15 | anneal_rate=0.98,
16 | anneal_interval=30000):
17 | return init_lr * anneal_rate ** (global_step // anneal_interval)
18 |
19 |
20 | def cyclic_cosine_annealing(init_lr, global_step, T, M):
21 | """Cyclic cosine annealing
22 |
23 | https://arxiv.org/pdf/1704.00109.pdf
24 |
25 | Args:
26 | init_lr (float): Initial learning rate
27 | global_step (int): Current iteration number
28 |         T (int): Total number of iterations (i.e., nepochs)
29 | M (int): Number of ensembles we want
30 |
31 | Returns:
32 | float: Annealed learning rate
33 | """
34 | TdivM = T // M
35 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0)
--------------------------------------------------------------------------------
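These schedules return a plain learning-rate value per step; the training script is expected to look one up by name via `hparams.lr_schedule` and write the result into the optimizer's parameter groups. A minimal sketch under that assumption (the model and loop below are placeholders, not the actual train.py code):

```
# Sketch: driving a PyTorch optimizer with noam_learning_rate_decay.
import torch
from torch import nn, optim
import dv3.lrschedule as lrschedule

model = nn.Linear(10, 10)  # placeholder model
optimizer = optim.Adam(model.parameters(), lr=5e-4, betas=(0.5, 0.9), eps=1e-6)

lr_schedule = getattr(lrschedule, "noam_learning_rate_decay")  # hparams.lr_schedule
for global_step in range(1000):
    current_lr = lr_schedule(5e-4, global_step)  # linear warmup, then ~1/sqrt(step) decay
    for param_group in optimizer.param_groups:
        param_group["lr"] = current_lr
    # ... forward / backward / optimizer.step() would go here ...
```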
/dv3/preprocess.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Preprocess dataset
4 | usage: preprocess.py [options] <name> <in_dir> <out_dir>
5 | options:
6 |     --num_workers=<n>        Num workers.
7 |     -h, --help               Show help message.
8 | """
9 | from docopt import docopt
10 | import os
11 | from multiprocessing import cpu_count
12 | from tqdm import tqdm
13 | import importlib
14 | from dv3.hparams import hparams
15 |
16 |
17 | def preprocess(mod, in_dir, out_dir, num_workers):
18 | os.makedirs(out_dir, exist_ok=True)
19 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm)
20 | write_metadata(metadata, out_dir)
21 |
22 |
23 | def write_metadata(metadata, out_dir):
24 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
25 | for m in metadata:
26 | f.write('|'.join([str(x) for x in m]) + '\n')
27 | frames = sum([m[2] for m in metadata])
28 | frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000
29 | hours = frames * frame_shift_ms / (3600 * 1000)
30 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
31 | print('Max input length: %d' % max(len(m[3]) for m in metadata))
32 | print('Max output length: %d' % max(m[2] for m in metadata))
33 |
34 |
35 | if __name__ == "__main__":
36 | args = docopt(__doc__)
37 |     name = args["<name>"]
38 |     in_dir = args["<in_dir>"]
39 |     out_dir = args["<out_dir>"]
40 | num_workers = args["--num_workers"]
41 |     num_workers = cpu_count() if num_workers is None else int(num_workers)
42 |
43 | assert name in ["jsut", "ljspeech", "vctk"]
44 | mod = importlib.import_module(name)
45 | preprocess(mod, in_dir, out_dir, num_workers)
--------------------------------------------------------------------------------
/dv3/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from setuptools import setup, find_packages
3 | import setuptools.command.develop
4 | import setuptools.command.build_py
5 | import os
6 | import subprocess
7 |
8 | version = '0.0.1'
9 |
10 | # Adapted from https://github.com/pytorch/pytorch
11 | cwd = os.path.dirname(os.path.abspath(__file__))
12 | if os.getenv('TACOTRON_BUILD_VERSION'):
13 | version = os.getenv('TACOTRON_BUILD_VERSION')
14 | else:
15 | try:
16 | sha = subprocess.check_output(
17 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
18 | version += '+' + sha[:7]
19 | except subprocess.CalledProcessError:
20 | pass
21 |
22 |
23 | class build_py(setuptools.command.build_py.build_py):
24 |
25 | def run(self):
26 | self.create_version_file()
27 | setuptools.command.build_py.build_py.run(self)
28 |
29 | @staticmethod
30 | def create_version_file():
31 | global version, cwd
32 | print('-- Building version ' + version)
33 | version_path = os.path.join(cwd, 'deepvoice3_pytorch', 'version.py')
34 | with open(version_path, 'w') as f:
35 | f.write("__version__ = '{}'\n".format(version))
36 |
37 |
38 | class develop(setuptools.command.develop.develop):
39 |
40 | def run(self):
41 | build_py.create_version_file()
42 | setuptools.command.develop.develop.run(self)
43 |
44 |
45 | setup(name='deepvoice3_pytorch',
46 | version=version,
47 |       description='PyTorch implementation of the Deep Voice 3 speech synthesis model.',
48 | packages=find_packages(),
49 | cmdclass={
50 | 'build_py': build_py,
51 | 'develop': develop,
52 | },
53 | install_requires=[
54 | "numpy",
55 | "scipy",
56 | "unidecode",
57 | "inflect",
58 | "librosa",
59 | "numba",
60 | "lws <= 1.0",
61 | ],
62 | extras_require={
63 | "train": [
64 | "docopt",
65 | "tqdm",
66 | "tensorboardX",
67 | "nnmnkwii >= 0.0.9",
68 | "nltk",
69 | ],
70 | "test": [
71 | "nose",
72 | ],
73 | "jp": [
74 | "jaconv",
75 | "mecab-python3",
76 | ],
77 | })
--------------------------------------------------------------------------------
/dv3/synthesis.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Synthesis waveform from trained model.
4 |
5 | usage: synthesis.py [options] <checkpoint> <text_list_file> <dst_dir>
6 |
7 | options:
8 |     --hparams=<params>                  Hyper parameters [default: ].
9 |     --checkpoint-seq2seq=<path>         Load seq2seq model from checkpoint path.
10 |     --checkpoint-postnet=<path>         Load postnet model from checkpoint path.
11 |     --file-name-suffix=<s>              File name suffix [default: ].
12 |     --max-decoder-steps=<N>             Max decoder steps [default: 500].
13 |     --replace_pronunciation_prob=<N>    Prob [default: 0.0].
14 |     --speaker_id=<id>                   Speaker ID (for multi-speaker model).
15 | --output-html Output html for blog post.
16 | -h, --help Show help message.
17 | """
18 | from docopt import docopt
19 |
20 | import sys
21 | import os
22 | from os.path import dirname, join, basename, splitext
23 |
24 | import dv3.audio
25 |
26 | import torch
27 | from torch.autograd import Variable
28 | import numpy as np
29 | import nltk
30 |
31 | # The deepvoice3 model
32 | from dv3.deepvoice3_pytorch import frontend
33 | from dv3.hparams import hparams
34 |
35 | from tqdm import tqdm
36 |
37 | use_cuda = torch.cuda.is_available()
38 | _frontend = None # to be set later
39 |
40 |
41 | def tts(model, text, p=0, speaker_id=None, fast=False):
42 | """Convert text to speech waveform given a deepvoice3 model.
43 |
44 | Args:
45 | text (str) : Input text to be synthesized
46 |         p (float) : Probability of replacing a word with its pronunciation. Default is 0.
47 | """
48 | if use_cuda:
49 | model = model.cuda()
50 | model.eval()
51 | if fast:
52 | model.make_generation_fast_()
53 |
54 | sequence = np.array(_frontend.text_to_sequence(text, p=p))
55 | sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
56 | text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
57 | text_positions = Variable(text_positions)
58 | speaker_ids = None if speaker_id is None else Variable(torch.LongTensor([speaker_id]))
59 | if use_cuda:
60 | sequence = sequence.cuda()
61 | text_positions = text_positions.cuda()
62 | speaker_ids = None if speaker_ids is None else speaker_ids.cuda()
63 |
64 | # Greedy decoding
65 | mel_outputs, linear_outputs, alignments, done = model(
66 | sequence, text_positions=text_positions, speaker_ids=speaker_ids)
67 |
68 | linear_output = linear_outputs[0].cpu().data.numpy()
69 | spectrogram = dv3.audio._denormalize(linear_output)
70 | alignment = alignments[0].cpu().data.numpy()
71 | mel = mel_outputs[0].cpu().data.numpy()
72 | mel = dv3.audio._denormalize(mel)
73 |
74 | # Predicted audio signal
75 | waveform = dv3.audio.inv_spectrogram(linear_output.T)
76 |
77 | return waveform, alignment, spectrogram, mel
78 |
79 |
80 | if __name__ == "__main__":
81 | args = docopt(__doc__)
82 | print("Command line args:\n", args)
83 |     checkpoint_path = args["<checkpoint>"]
84 |     text_list_file_path = args["<text_list_file>"]
85 |     dst_dir = args["<dst_dir>"]
86 | checkpoint_seq2seq_path = args["--checkpoint-seq2seq"]
87 | checkpoint_postnet_path = args["--checkpoint-postnet"]
88 | max_decoder_steps = int(args["--max-decoder-steps"])
89 | file_name_suffix = args["--file-name-suffix"]
90 | replace_pronunciation_prob = float(args["--replace_pronunciation_prob"])
91 | output_html = args["--output-html"]
92 | speaker_id = args["--speaker_id"]
93 | if speaker_id is not None:
94 | speaker_id = int(speaker_id)
95 |
96 | # Override hyper parameters
97 | hparams.parse(args["--hparams"])
98 | assert hparams.name == "deepvoice3"
99 |
100 | # Presets
101 | if hparams.preset is not None and hparams.preset != "":
102 | preset = hparams.presets[hparams.preset]
103 | import json
104 | hparams.parse_json(json.dumps(preset))
105 | print("Override hyper parameters with preset \"{}\": {}".format(
106 | hparams.preset, json.dumps(preset, indent=4)))
107 |
108 | _frontend = getattr(frontend, hparams.frontend)
109 | import dv3.train
110 | dv3.train._frontend = _frontend
111 | from dv3.train import plot_alignment, build_model
112 |
113 | # Model
114 | model = build_model()
115 |
116 | # Load checkpoints separately
117 | if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None:
118 | checkpoint = torch.load(checkpoint_seq2seq_path)
119 | model.seq2seq.load_state_dict(checkpoint["state_dict"])
120 | checkpoint = torch.load(checkpoint_postnet_path)
121 | model.postnet.load_state_dict(checkpoint["state_dict"])
122 | checkpoint_name = splitext(basename(checkpoint_seq2seq_path))[0]
123 | else:
124 | checkpoint = torch.load(checkpoint_path)
125 | model.load_state_dict(checkpoint["state_dict"])
126 | checkpoint_name = splitext(basename(checkpoint_path))[0]
127 |
128 | model.seq2seq.decoder.max_decoder_steps = max_decoder_steps
129 |
130 | os.makedirs(dst_dir, exist_ok=True)
131 | with open(text_list_file_path, "rb") as f:
132 | lines = f.readlines()
133 | for idx, line in enumerate(lines):
134 | text = line.decode("utf-8")[:-1]
135 | words = nltk.word_tokenize(text)
136 | waveform, alignment, _, _ = tts(
137 | model, text, p=replace_pronunciation_prob, speaker_id=speaker_id, fast=True)
138 | dst_wav_path = join(dst_dir, "{}_{}{}.wav".format(
139 | idx, checkpoint_name, file_name_suffix))
140 | dst_alignment_path = join(
141 | dst_dir, "{}_{}{}_alignment.png".format(idx, checkpoint_name,
142 | file_name_suffix))
143 | plot_alignment(alignment.T, dst_alignment_path,
144 | info="{}, {}".format(hparams.builder, basename(checkpoint_path)))
145 | dv3.audio.save_wav(waveform, dst_wav_path)
146 | from os.path import basename, splitext
147 | name = splitext(basename(text_list_file_path))[0]
148 | if output_html:
149 | print("""
150 | {}
151 |
152 | ({} chars, {} words)
153 |
154 | <audio controls="controls" >
155 | <source src="/audio/{}/{}/{}" autoplay/>
156 | Your browser does not support the audio element.
157 | </audio>
158 | 
159 | <div align="center"><img src="/audio/{}/{}/{}" /></div>
160 | """.format(text, len(text), len(words),
161 | hparams.builder, name, basename(dst_wav_path),
162 | hparams.builder, name, basename(dst_alignment_path)))
163 | else:
164 | print(idx, ": {}\n ({} chars, {} words)".format(text, len(text), len(words)))
165 |
166 | print("Finished! Check out {} for generated audio samples.".format(dst_dir))
167 | sys.exit(0)
--------------------------------------------------------------------------------
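Besides the CLI above, `tts()` can be called directly once a model is built and a checkpoint loaded. A hedged sketch of that, assuming `dv3.train` can be imported and exposes `build_model()` as the `__main__` block above does, and using a hypothetical checkpoint path:

```
# Sketch: programmatic synthesis with a trained checkpoint.
import torch
import dv3.audio
import dv3.train
import dv3.synthesis as synthesis
from dv3.deepvoice3_pytorch import frontend
from dv3.hparams import hparams

# Both modules keep a module-level _frontend; set it before building the model.
synthesis._frontend = dv3.train._frontend = getattr(frontend, hparams.frontend)
model = dv3.train.build_model()

checkpoint = torch.load("checkpoints/checkpoint_step000265000.pth")  # hypothetical path
model.load_state_dict(checkpoint["state_dict"])

waveform, alignment, spectrogram, mel = synthesis.tts(
    model, "Hello world.", p=0.0, speaker_id=None, fast=True)
dv3.audio.save_wav(waveform, "hello.wav")
```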
/dv3/tests/test_conv.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import with_statement, print_function, absolute_import
3 |
4 | import torch
5 | from torch import nn
6 | from torch.autograd import Variable
7 | from torch.nn import functional as F
8 | from dv3.deepvoice3_pytorch.conv import Conv1d
9 |
10 |
11 | def test_conv1d_incremental():
12 |     def __test(kernel_size, dilation, T, B, C, causal=True):
13 |         kernel_size = 3
14 |         dilation = (dilation,)
15 | 
16 |         # dilation = (4,)
17 |         # causal convolution
18 |         assert causal
19 |         if causal:
20 | padding = (kernel_size - 1) * dilation[0]
21 | else:
22 | padding = (kernel_size - 1) // 2 * dilation[0]
23 |
24 | # weight: (Cout, Cin, K)
25 | conv = nn.Conv1d(
26 | C, C * 2, kernel_size=kernel_size, padding=padding,
27 | dilation=dilation).eval()
28 | conv.weight.data.fill_(1.0)
29 | conv.bias.data.zero_()
30 |
31 | # weight: (K, Cin, Cout)
32 | # weight (linearized): (Cout*K, Cin)
33 | conv_online = Conv1d(
34 | C, C * 2, kernel_size=kernel_size, padding=padding,
35 | dilation=dilation).eval()
36 | conv_online.weight.data.fill_(1.0)
37 | conv_online.bias.data.zero_()
38 |
39 | # (B, C, T)
40 | bct = Variable(torch.zeros(B, C, T) + torch.arange(0, T))
41 | output_conv = conv(bct)
42 |
43 | # Remove future time stamps
44 | output_conv = output_conv[:, :, :T]
45 |
46 | output_conv_online = []
47 |
48 | # B, T, C
49 | btc = bct.transpose(1, 2).contiguous()
50 | for t in range(btc.size(1)):
51 | input = btc[:, t, :].contiguous().view(B, -1, C)
52 | output = conv_online.incremental_forward(input)
53 | output_conv_online += [output]
54 |
55 | output_conv_online = torch.stack(output_conv_online).squeeze(2)
56 | output_conv_online = output_conv_online.transpose(0, 1).transpose(1, 2)
57 |
58 | assert (output_conv == output_conv_online).all()
59 |
60 | for B in [1, 16]:
61 | for T in [10, 20, 30]:
62 | for C in [1, 2, 4]:
63 | for kernel_size in [3, 5, 9]:
64 | for dilation in [1, 2, 3, 4, 5, 6, 7, 8, 9, 27]:
65 |                         yield __test, kernel_size, dilation, T, B, C
--------------------------------------------------------------------------------
/dv3/tests/test_deepvoice3.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import with_statement, print_function, absolute_import
3 |
4 | import sys
5 | from os.path import dirname, join, exists
6 |
7 | from dv3.deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab
8 |
9 | import torch
10 | from torch.autograd import Variable
11 | from torch import nn
12 | import numpy as np
13 |
14 | from nose.plugins.attrib import attr
15 |
16 | from dv3.deepvoice3_pytorch.builder import deepvoice3
17 | from dv3.deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
18 |
19 |
20 | use_cuda = torch.cuda.is_available() and False
21 | num_mels = 80
22 | num_freq = 513
23 | outputs_per_step = 4
24 | padding_idx = 0
25 |
26 |
27 | def _get_model(n_speakers=1, speaker_embed_dim=None,
28 | force_monotonic_attention=False,
29 | use_decoder_state_for_postnet_input=False):
30 | model = deepvoice3(n_vocab=n_vocab,
31 | embed_dim=256,
32 | mel_dim=num_mels,
33 | linear_dim=num_freq,
34 | r=outputs_per_step,
35 | padding_idx=padding_idx,
36 | n_speakers=n_speakers,
37 | speaker_embed_dim=speaker_embed_dim,
38 | dropout=1 - 0.95,
39 | kernel_size=5,
40 | encoder_channels=128,
41 | decoder_channels=256,
42 | converter_channels=256,
43 | force_monotonic_attention=force_monotonic_attention,
44 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input,
45 | )
46 | return model
47 |
48 |
49 | def _pad(seq, max_len):
50 | return np.pad(seq, (0, max_len - len(seq)),
51 | mode='constant', constant_values=0)
52 |
53 |
54 | def _test_data():
55 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."]
56 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts]
57 | input_lengths = np.array([len(s) for s in seqs])
58 | max_len = np.max(input_lengths)
59 | seqs = np.array([_pad(s, max_len) for s in seqs])
60 |
61 | # Test encoder
62 | x = Variable(torch.LongTensor(seqs))
63 | y = Variable(torch.rand(x.size(0), 12, 80))
64 |
65 | return x, y
66 |
67 |
68 | def _deepvoice3(n_vocab, embed_dim=256, mel_dim=80,
69 | linear_dim=4096, r=5,
70 | n_speakers=1, speaker_embed_dim=16,
71 | padding_idx=None,
72 | dropout=(1 - 0.95), dilation=1):
73 |
74 | from dv3.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter
75 | h = 128
76 | encoder = Encoder(
77 | n_vocab, embed_dim, padding_idx=padding_idx,
78 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
79 | dropout=dropout,
80 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation),
81 | (h, 3, dilation), (h, 3, dilation)],
82 | )
83 |
84 | h = 256
85 | decoder = Decoder(
86 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx,
87 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
88 | dropout=dropout,
89 | preattention=[(h, 3, 1)],
90 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation),
91 | (h, 3, dilation), (h, 3, dilation)],
92 | attention=[True, False, False, False, True],
93 | force_monotonic_attention=False)
94 |
95 | seq2seq = AttentionSeq2Seq(encoder, decoder)
96 |
97 | in_dim = mel_dim
98 | h = 256
99 | converter = Converter(n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim,
100 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout,
101 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation),
102 | (h, 3, dilation), (h, 3, dilation)])
103 |
104 | model = MultiSpeakerTTSModel(
105 | seq2seq, converter, padding_idx=padding_idx,
106 | mel_dim=mel_dim, linear_dim=linear_dim,
107 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim)
108 |
109 | return model
110 |
111 |
112 | def test_single_speaker_deepvoice3():
113 | x, y = _test_data()
114 |
115 | for v in [False, True]:
116 | model = _get_model(use_decoder_state_for_postnet_input=v)
117 | mel_outputs, linear_outputs, alignments, done = model(x, y)
118 |
119 |
120 | def _pad_2d(x, max_len, b_pad=0):
121 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)],
122 | mode="constant", constant_values=0)
123 | return x
124 |
125 |
126 | def test_multi_speaker_deepvoice3():
127 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."]
128 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts]
129 | input_lengths = np.array([len(s) for s in seqs])
130 | max_len = np.max(input_lengths)
131 | seqs = np.array([_pad(s, max_len) for s in seqs])
132 |
133 | # Test encoder
134 | x = Variable(torch.LongTensor(seqs))
135 | y = Variable(torch.rand(x.size(0), 4 * 33, 80))
136 | model = _get_model(n_speakers=32, speaker_embed_dim=16)
137 | speaker_ids = Variable(torch.LongTensor([1, 2, 3]))
138 |
139 | mel_outputs, linear_outputs, alignments, done = model(x, y, speaker_ids=speaker_ids)
140 | print("Input text:", x.size())
141 | print("Input mel:", y.size())
142 | print("Mel:", mel_outputs.size())
143 | print("Linear:", linear_outputs.size())
144 | print("Alignments:", alignments.size())
145 | print("Done:", done.size())
146 |
147 |
148 | @attr("local_only")
149 | def test_incremental_correctness():
150 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
151 | seqs = np.array([text_to_sequence(t) for t in texts])
152 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
153 |
154 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
155 | max_target_len = mel.shape[0]
156 | r = 4
157 | mel_dim = 80
158 | if max_target_len % r != 0:
159 | max_target_len += r - max_target_len % r
160 | assert max_target_len % r == 0
161 | mel = _pad_2d(mel, max_target_len)
162 | mel = Variable(torch.from_numpy(mel))
163 | mel_reshaped = mel.view(1, -1, mel_dim * r)
164 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))
165 |
166 | x = Variable(torch.LongTensor(seqs))
167 | text_positions = Variable(torch.LongTensor(text_positions))
168 | frame_positions = Variable(torch.LongTensor(frame_positions))
169 |
170 | for model, speaker_ids in [
171 | (_get_model(force_monotonic_attention=False), None),
172 | (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), Variable(torch.LongTensor([1])))]:
173 | model.eval()
174 |
175 | if speaker_ids is not None:
176 | speaker_embed = model.embed_speakers(speaker_ids)
177 | else:
178 | speaker_embed = None
179 |
180 | # Encoder
181 | encoder_outs = model.seq2seq.encoder(x, speaker_embed=speaker_embed)
182 |
183 | # Off line decoding
184 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
185 | encoder_outs, mel_reshaped, speaker_embed=speaker_embed,
186 | text_positions=text_positions, frame_positions=frame_positions)
187 |
188 | # Online decoding with test inputs
189 | model.seq2seq.decoder.start_fresh_sequence()
190 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward(
191 | encoder_outs, text_positions, speaker_embed=speaker_embed,
192 | test_inputs=mel_reshaped)
193 |
194 | # Should get same result
195 | c = (mel_outputs_offline - mel_outputs_online).abs()
196 | print(c.mean(), c.max())
197 |
198 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
199 | mel_outputs_online.cpu().data.numpy(), atol=1e-5)
200 |
201 |
202 | @attr("local_only")
203 | def test_incremental_forward():
204 | checkpoint_path = join(dirname(__file__), "../test_whole/checkpoint_step000265000.pth")
205 | if not exists(checkpoint_path):
206 | return
207 | model = _get_model()
208 |
209 | use_cuda = False
210 |
211 | checkpoint = torch.load(checkpoint_path)
212 | model.load_state_dict(checkpoint["state_dict"])
213 | model.make_generation_fast_()
214 | model = model.cuda() if use_cuda else model
215 |
216 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
217 | seqs = np.array([text_to_sequence(t) for t in texts])
218 | input_lengths = [len(s) for s in seqs]
219 |
220 | use_manual_padding = False
221 | if use_manual_padding:
222 |         max_input_len = np.max(input_lengths) + 10  # manual padding
223 |         seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int)
224 |         input_lengths = torch.LongTensor(input_lengths)
225 |         input_lengths = input_lengths.cuda() if use_cuda else input_lengths
226 | else:
227 | input_lengths = None
228 |
229 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
230 |
231 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
232 | max_target_len = mel.shape[0]
233 | r = 4
234 | mel_dim = 80
235 | if max_target_len % r != 0:
236 | max_target_len += r - max_target_len % r
237 | assert max_target_len % r == 0
238 | mel = _pad_2d(mel, max_target_len)
239 | mel = Variable(torch.from_numpy(mel))
240 | mel_reshaped = mel.view(1, -1, mel_dim * r)
241 |
242 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))
243 |
244 | x = Variable(torch.LongTensor(seqs))
245 | text_positions = Variable(torch.LongTensor(text_positions))
246 | frame_positions = Variable(torch.LongTensor(frame_positions))
247 |
248 | if use_cuda:
249 | x = x.cuda()
250 | text_positions = text_positions.cuda()
251 | frame_positions = frame_positions.cuda()
252 | mel_reshaped = mel_reshaped.cuda()
253 |
254 | model.eval()
255 |
256 | def _plot(mel, mel_predicted, alignments):
257 | from matplotlib import pylab as plt
258 | plt.figure(figsize=(16, 10))
259 | plt.subplot(3, 1, 1)
260 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma")
261 | plt.colorbar()
262 |
263 | plt.subplot(3, 1, 2)
264 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
265 | origin="lower bottom", aspect="auto", cmap="magma")
266 | plt.colorbar()
267 |
268 | plt.subplot(3, 1, 3)
269 | if alignments.dim() == 4:
270 | alignments = alignments.mean(0)
271 | plt.imshow(alignments[0].data.cpu(
272 | ).numpy().T, origin="lower bottom", aspect="auto")
273 | plt.colorbar()
274 | plt.show()
275 |
276 | # Encoder
277 | encoder_outs = model.seq2seq.encoder(x, lengths=input_lengths)
278 |
279 | # Off line decoding
280 | mel_output_offline, alignments_offline, done = model.seq2seq.decoder(
281 | encoder_outs, mel_reshaped,
282 | text_positions=text_positions, frame_positions=frame_positions,
283 | lengths=input_lengths)
284 |
285 | _plot(mel, mel_output_offline, alignments_offline)
286 |
287 | # Online decoding
288 | test_inputs = None
289 | # test_inputs = mel_reshaped
290 | model.seq2seq.decoder.start_fresh_sequence()
291 | mel_outputs, alignments, dones_online = model.seq2seq.decoder.incremental_forward(
292 | encoder_outs, text_positions,
293 | # initial_input=mel_reshaped[:, :1, :],
294 | test_inputs=test_inputs)
295 |
296 | if test_inputs is not None:
297 | c = (mel_output_offline - mel_outputs).abs()
298 | print(c.mean(), c.max())
299 | _plot(mel, c, alignments)
300 |
301 | _plot(mel, mel_outputs, alignments)
--------------------------------------------------------------------------------
/dv3/tests/test_embedding.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import with_statement, print_function, absolute_import
3 |
4 | import torch
5 | from torch import nn
6 | from torch.autograd import Variable
7 | from dv3.deepvoice3_pytorch.modules import SinusoidalEncoding, position_encoding_init
8 | import numpy as np
9 |
10 |
11 | def test_sinusoidal():
12 | num_embedding = 512
13 | embedding_dim = 128
14 | padding_idx = 0
15 |
16 | for w in [1.0, 0.5, 2.0, 10.0, 20.0]:
17 | a = nn.Embedding(num_embedding, embedding_dim, padding_idx=padding_idx)
18 | a.weight.data = position_encoding_init(
19 | num_embedding, embedding_dim, position_rate=w)
20 |
21 | b = SinusoidalEncoding(num_embedding, embedding_dim, padding_idx=padding_idx)
22 |
23 | x = Variable(torch.arange(0, 128).long())
24 | ax = a(x).data.numpy()
25 | bx = b(x, w).data.numpy()
26 |
27 | print(w, np.abs(ax - bx).mean())
28 | try:
29 | assert np.allclose(ax, bx)
30 | except:
31 |             print("TODO: small numerical errors?")
32 | assert np.abs(ax - bx).mean() < 1e-5
33 |
--------------------------------------------------------------------------------
/dv3/tests/test_frontend.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import with_statement, print_function, absolute_import
3 |
4 | from dv3.deepvoice3_pytorch import frontend
5 | from nose.plugins.attrib import attr
6 |
7 | eos = 1
8 |
9 |
10 | def test_en():
11 | f = getattr(frontend, "en")
12 | seq = f.text_to_sequence("hello world.")
13 | assert seq[-1] == eos
14 | t = f.sequence_to_text(seq)
15 | assert t == "hello world.~"
16 |
17 |
18 | def test_ja():
19 | f = getattr(frontend, "jp")
20 | seq = f.text_to_sequence("こんにちわ")
21 | assert seq[-1] == eos
22 | t = f.sequence_to_text(seq)
23 | assert t[:-1] == "コンニチワ。"
24 |
25 |
26 | @attr("local_only")
27 | def test_en_lj():
28 | f = getattr(frontend, "en")
29 | from nnmnkwii.datasets import ljspeech
30 | from tqdm import trange
31 | import jaconv
32 |
33 | d = ljspeech.TranscriptionDataSource("/home/ryuichi/data/LJSpeech-1.0")
34 | texts = d.collect_files()
35 |
36 | for p in [0.0, 0.5, 1.0]:
37 | for idx in trange(len(texts)):
38 | text = texts[idx]
39 | seq = f.text_to_sequence(text, p=p)
40 | assert seq[-1] == eos
41 | t = f.sequence_to_text(seq)
42 |
43 | if idx < 10:
44 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t))
45 |
46 |
47 | @attr("local_only")
48 | def test_ja_jsut():
49 | f = getattr(frontend, "jp")
50 | from nnmnkwii.datasets import jsut
51 | from tqdm import trange
52 | import jaconv
53 |
54 | d = jsut.TranscriptionDataSource("/home/ryuichi/data/jsut_ver1.1/",
55 | subsets=jsut.available_subsets)
56 | texts = d.collect_files()
57 |
58 | for p in [0.0, 0.5, 1.0]:
59 | for idx in trange(len(texts)):
60 | text = texts[idx]
61 | seq = f.text_to_sequence(text, p=p)
62 | assert seq[-1] == eos
63 | t = f.sequence_to_text(seq)
64 |
65 | if idx < 10:
66 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t))
--------------------------------------------------------------------------------
/dv3/tests/test_nyanko.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import with_statement, print_function, absolute_import
3 |
4 | import sys
5 | from os.path import dirname, join, exists
6 |
7 | from dv3.deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab
8 |
9 | import torch
10 | from torch.autograd import Variable
11 | from torch import nn
12 | import numpy as np
13 |
14 | from nose.plugins.attrib import attr
15 |
16 | from dv3.deepvoice3_pytorch.builder import nyanko
17 | from dv3.deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq
18 |
19 | use_cuda = torch.cuda.is_available()
20 | num_mels = 80
21 | num_freq = 513
22 | outputs_per_step = 4
23 | padding_idx = 0
24 |
25 |
26 | def _pad(seq, max_len):
27 | return np.pad(seq, (0, max_len - len(seq)),
28 | mode='constant', constant_values=0)
29 |
30 |
31 | def _test_data():
32 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."]
33 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts]
34 | input_lengths = np.array([len(s) for s in seqs])
35 | max_len = np.max(input_lengths)
36 | seqs = np.array([_pad(s, max_len) for s in seqs])
37 |
38 | # Test encoder
39 | x = Variable(torch.LongTensor(seqs))
40 | y = Variable(torch.rand(x.size(0), 12, 80))
41 |
42 | return x, y
43 |
44 |
45 | def _pad_2d(x, max_len, b_pad=0):
46 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)],
47 | mode="constant", constant_values=0)
48 | return x
49 |
50 |
51 | def test_nyanko_basics():
52 | x, y = _test_data()
53 |
54 | for v in [False, True]:
55 | model = nyanko(n_vocab, mel_dim=num_mels, linear_dim=num_freq, r=1, downsample_step=4,
56 | use_decoder_state_for_postnet_input=v)
57 | mel_outputs, linear_outputs, alignments, done = model(x, y)
58 |
59 |
60 | @attr("local_only")
61 | def test_incremental_correctness():
62 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
63 | seqs = np.array([text_to_sequence(t) for t in texts])
64 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
65 |
66 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
67 | max_target_len = mel.shape[0]
68 | r = 1
69 | mel_dim = 80
70 | if max_target_len % r != 0:
71 | max_target_len += r - max_target_len % r
72 | assert max_target_len % r == 0
73 | mel = _pad_2d(mel, max_target_len)
74 | mel = Variable(torch.from_numpy(mel))
75 | mel_reshaped = mel.view(1, -1, mel_dim * r)
76 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))
77 |
78 | x = Variable(torch.LongTensor(seqs))
79 | text_positions = Variable(torch.LongTensor(text_positions))
80 | frame_positions = Variable(torch.LongTensor(frame_positions))
81 |
82 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
83 | r=r, force_monotonic_attention=False)
84 | model.eval()
85 |
86 | # Encoder
87 | encoder_outs = model.seq2seq.encoder(x)
88 |
89 | # Off line decoding
90 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
91 | encoder_outs, mel_reshaped,
92 | text_positions=text_positions, frame_positions=frame_positions)
93 |
94 | # Online decoding with test inputs
95 | model.seq2seq.decoder.start_fresh_sequence()
96 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward(
97 | encoder_outs, text_positions,
98 | test_inputs=mel_reshaped)
99 |
100 | # Should get same result
101 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
102 | mel_outputs_online.cpu().data.numpy())
103 |
104 |
105 | @attr("local_only")
106 | def test_nyanko():
107 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."]
108 | seqs = np.array([text_to_sequence(t) for t in texts])
109 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))
110 |
111 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy")
112 | max_target_len = mel.shape[0]
113 | r = 1
114 | mel_dim = 80
115 | if max_target_len % r != 0:
116 | max_target_len += r - max_target_len % r
117 | assert max_target_len % r == 0
118 | mel = _pad_2d(mel, max_target_len)
119 | mel = Variable(torch.from_numpy(mel))
120 | mel_reshaped = mel.view(1, -1, mel_dim * r)
121 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))
122 |
123 | x = Variable(torch.LongTensor(seqs))
124 | text_positions = Variable(torch.LongTensor(text_positions))
125 | frame_positions = Variable(torch.LongTensor(frame_positions))
126 |
127 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
128 | r=r, force_monotonic_attention=False)
129 | model.eval()
130 |
131 | def _plot(mel, mel_predicted, alignments):
132 | from matplotlib import pylab as plt
133 | plt.figure(figsize=(16, 10))
134 | plt.subplot(3, 1, 1)
135 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma")
136 | plt.colorbar()
137 |
138 | plt.subplot(3, 1, 2)
139 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
140 | origin="lower bottom", aspect="auto", cmap="magma")
141 | plt.colorbar()
142 |
143 | plt.subplot(3, 1, 3)
144 | if alignments.dim() == 4:
145 | alignments = alignments.mean(0)
146 | plt.imshow(alignments[0].data.cpu(
147 | ).numpy().T, origin="lower bottom", aspect="auto")
148 | plt.colorbar()
149 | plt.show()
150 |
151 | seq2seq = model.seq2seq
152 |
153 | # Encoder
154 | encoder_outs = seq2seq.encoder(x)
155 |
156 | # Off line decoding
157 | print("Offline decoding")
158 | mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder(
159 | encoder_outs, mel_reshaped,
160 | text_positions=text_positions, frame_positions=frame_positions)
161 |
162 | _plot(mel, mel_outputs_offline, alignments_offline)
163 |
164 | # Online decoding with test inputs
165 | print("Online decoding")
166 | seq2seq.decoder.start_fresh_sequence()
167 | mel_outputs_online, alignments, dones_online, _ = seq2seq.decoder.incremental_forward(
168 | encoder_outs, text_positions,
169 | test_inputs=mel_reshaped)
170 |
171 | a = mel_outputs_offline.cpu().data.numpy()
172 | b = mel_outputs_online.cpu().data.numpy()
173 | c = (mel_outputs_offline - mel_outputs_online).abs()
174 | print(c.mean(), c.max())
175 |
176 | _plot(mel, mel_outputs_offline, alignments_offline)
177 | _plot(mel, mel_outputs_online, alignments)
178 | _plot(mel, c, alignments)
179 |
180 | # Should get same result
181 | assert np.allclose(a, b)
182 |
183 | postnet = model.postnet
184 |
185 | linear_outputs = postnet(mel_outputs_offline)
186 | print(linear_outputs.size())
187 |
--------------------------------------------------------------------------------
/dv3/vctk.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | from functools import partial
3 | import numpy as np
4 | import os
5 | import dv3.audio
6 | from nnmnkwii.datasets import vctk
7 | from nnmnkwii.io import hts
8 | from dv3.hparams import hparams
9 | from os.path import exists
10 | import librosa
11 |
12 |
13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
14 | executor = ProcessPoolExecutor(max_workers=num_workers)
15 | futures = []
16 |
17 | speakers = vctk.available_speakers
18 |
19 | td = vctk.TranscriptionDataSource(in_dir, speakers=speakers)
20 | transcriptions = td.collect_files()
21 | speaker_ids = td.labels
22 | wav_paths = vctk.WavFileDataSource(
23 | in_dir, speakers=speakers).collect_files()
24 |
25 | for index, (speaker_id, text, wav_path) in enumerate(
26 | zip(speaker_ids, transcriptions, wav_paths)):
27 | futures.append(executor.submit(
28 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text)))
29 | return [future.result() for future in tqdm(futures)]
30 |
31 |
32 | def start_at(labels):
33 | has_silence = labels[0][-1] == "pau"
34 | if not has_silence:
35 | return labels[0][0]
36 | for i in range(1, len(labels)):
37 | if labels[i][-1] != "pau":
38 | return labels[i][0]
39 | assert False
40 |
41 |
42 | def end_at(labels):
43 | has_silence = labels[-1][-1] == "pau"
44 | if not has_silence:
45 | return labels[-1][1]
46 | for i in range(len(labels) - 2, 0, -1):
47 | if labels[i][-1] != "pau":
48 | return labels[i][1]
49 | assert False
50 |
51 |
52 | def _process_utterance(out_dir, index, speaker_id, wav_path, text):
53 | sr = hparams.sample_rate
54 |
55 | # Load the audio to a numpy array:
56 | wav = dv3.audio.load_wav(wav_path)
57 |
58 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
59 |
60 | # Trim silence from hts labels if available
61 | if exists(lab_path):
62 | labels = hts.load(lab_path)
63 | b = int(start_at(labels) * 1e-7 * sr)
64 | e = int(end_at(labels) * 1e-7 * sr)
65 | wav = wav[b:e]
66 | wav, _ = librosa.effects.trim(wav, top_db=25)
67 | else:
68 | wav, _ = librosa.effects.trim(wav, top_db=15)
69 |
70 | # Compute the linear-scale spectrogram from the wav:
71 | spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
72 | n_frames = spectrogram.shape[1]
73 |
74 | # Compute a mel-scale spectrogram from the wav:
75 | mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)
76 |
77 | # Write the spectrograms to disk:
78 | spectrogram_filename = 'vctk-spec-%05d.npy' % index
79 | mel_filename = 'vctk-mel-%05d.npy' % index
80 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
81 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
82 |
83 | # Return a tuple describing this training example:
84 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
--------------------------------------------------------------------------------
/dv3/vctk_preprocess/.gitignore:
--------------------------------------------------------------------------------
1 | latest_features
2 | tts_env.sh
3 |
--------------------------------------------------------------------------------
/dv3/vctk_preprocess/README.md:
--------------------------------------------------------------------------------
1 | # Preprocessing for VCTK
2 |
3 | Wav files in VCTK contain lots of long silences, which hurts training of char-level seq2seq models. To deal with the problem, we will
4 |
5 | - **Prepare phoneme alignments for all utterances** (code in the directory)
6 | - Cut silences during preprocessing (code in the parent directory)
7 |
8 | ## Note
9 |
10 | Code in this directory relies heavily on https://gist.github.com/kastnerkyle/cc0ac48d34860c5bb3f9112f4d9a0300 (which is hard-copied into the repo). If you run into any issues, please first make sure that you can successfully run that script.
11 |
12 | ## Steps
13 |
14 | 1. Download VCTK: http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html
15 | 2. Install HTK/speech_tools/festival/merlin and prepare `tts_env.sh`. If you don't have the speech tools, you can install them via https://gist.github.com/kastnerkyle/001a58a58d090658ee5350cb6129f857. For reference, my `tts_env.sh` is:
16 | ```
17 | export ESTDIR=/home/ryuichi/Dropbox/sp/speech_tools/
18 | export FESTDIR=/home/ryuichi/Dropbox/sp/festival/
19 | export FESTVOXDIR=/home/ryuichi/Dropbox/sp/festvox/
20 | export VCTKDIR=/home/ryuichi/data/VCTK-Corpus/
21 | export HTKDIR=/usr/local/HTS-2.3/bin/
22 | export SPTKDIR=/usr/local/bin/
23 | export MERLINDIR=/home/ryuichi/Dropbox/sp/merlin_pr/
24 | ```
25 | 3. Run the script (takes ~24 hours)
26 | ```
27 | python prepare_vctk_labels.py ${your_vctk_dir} ${dst_dir}
28 | ```
29 | This will process all utterances of VCTK and copy HTK-style alignments to `${dst_dir}`.
30 | It is recommended to copy the alignments to the top of the VCTK corpus, i.e.,
31 | ```
32 | python prepare_vctk_labels.py ~/data/VCTK-Corpus ~/data/VCTK-Corpus/lab
33 | ```
34 |
35 | After the above steps, you will get alignments as follows:
36 |
37 | ```
38 | tree ~/data/VCTK-Corpus/lab/ | head /home/ryuichi/data/VCTK-Corpus/lab/
39 | ├── p225
40 | │ ├── p225_001.lab
41 | │ ├── p225_002.lab
42 | │ ├── p225_003.lab
43 | │ ├── p225_004.lab
44 | │ ├── p225_005.lab
45 | │ ├── p225_006.lab
46 | │ ├── p225_007.lab
47 | │ ├── p225_008.lab
48 | ```
49 |
50 | ```
51 | cat ~/data/VCTK-Corpus/lab/p225/p225_001.lab
52 |
53 | 0 850000 pau
54 | 850000 2850000 pau
55 | 2850000 3600000 p
56 | 3600000 3900000 l
57 | 3900000 6000000 iy
58 | 6000000 8450000 z
59 | 8450000 8600000 k
60 | 8600000 11300000 ao
61 | 11300000 11450000 l
62 | 11450000 12800000 s
63 | 12800000 13099999 t
64 | 13099999 15800000 eh
65 | 15800000 16050000 l
66 | 16050000 17600000 ax
67 | 17600000 20400000 pau
68 | ```
69 |
70 | ## Using Gentle?
71 |
72 | `prepare_htk_alignments_vctk.py` does the same thing as above using [Gentle](https://github.com/lowerquality/gentle), but the results turned out to be not very good. The code is left here in case it can be improved in the future.
73 |
--------------------------------------------------------------------------------
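For reference, the `.lab` files shown above are plain text with start/end times in 100 ns units; the preprocessing scripts parse them with nnmnkwii's `hts.load`, but a dependency-free sketch of reading one looks like this (the path is the example from the README):

```
# Sketch: parsing an HTK-style .lab file into (start, end, phone) tuples.
def read_lab(path):
    labels = []
    with open(path) as f:
        for line in f:
            start, end, phone = line.split()
            labels.append((int(start), int(end), phone))
    return labels

labels = read_lab("/home/ryuichi/data/VCTK-Corpus/lab/p225/p225_001.lab")
print(labels[0])  # -> (0, 850000, 'pau'), i.e. 0.085 s of leading silence
```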
/dv3/vctk_preprocess/prepare_htk_alignments_vctk.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Script to do forced alignment with gentle for VCTK. This script takes approx.
4 | 40 hours to finish. It processes all utterances in VCTK.
5 |
6 | NOTE: Must be run with Python2, since gentle doesn't work with Python3.
7 |
8 | Usage:
9 | 1. Install https://github.com/lowerquality/gentle
10 | 2. Download VCTK http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html
11 |
12 | and then run the script by:
13 |
14 | python2 prepare_htk_alignments_vctk.py ${your_vctk_data_path}
15 |
16 | After running the script, you will see alignment files in `lab` directory as
17 | follows:
18 |
19 | > tree ~/data/VCTK-Corpus/ -d -L 1
20 |
21 | /home/ryuichi/data/VCTK-Corpus/
22 | ├── lab
23 | ├── txt
24 | └── wav48
25 | """
26 | import argparse
27 | import logging
28 | import multiprocessing
29 | import os
30 | import sys
31 | from tqdm import tqdm
32 | import json
33 | from os.path import join, basename, dirname, exists
34 | import numpy as np
35 |
36 | import gentle
37 | import librosa
38 | from nnmnkwii.datasets import vctk
39 |
40 |
41 | def on_progress(p):
42 | for k, v in p.items():
43 | logging.debug("%s: %s" % (k, v))
44 |
45 |
46 | def write_hts_label(labels, lab_path):
47 | lab = ""
48 | for s, e, l in labels:
49 | s, e = float(s) * 1e7, float(e) * 1e7
50 | s, e = int(s), int(e)
51 | lab += "{} {} {}\n".format(s, e, l)
52 | print(lab)
53 | with open(lab_path, "w") as f:
54 | f.write(lab)
55 |
56 |
57 | def json2hts(data):
58 | emit_bos = False
59 | emit_eos = False
60 |
61 | phone_start = 0
62 | phone_end = None
63 | labels = []
64 |
65 | for word in data["words"]:
66 | case = word["case"]
67 | if case != "success":
68 | raise RuntimeError("Alignment failed")
69 | start = float(word["start"])
70 | word_end = float(word["end"])
71 |
72 | if not emit_bos:
73 | labels.append((phone_start, start, "silB"))
74 | emit_bos = True
75 |
76 | phone_start = start
77 | phone_end = None
78 | for phone in word["phones"]:
79 | ph = str(phone["phone"][:-2])
80 | duration = float(phone["duration"])
81 | phone_end = phone_start + duration
82 | labels.append((phone_start, phone_end, ph))
83 | phone_start += duration
84 | assert np.allclose(phone_end, word_end)
85 | if not emit_eos:
86 | labels.append((phone_start, phone_end, "silE"))
87 | emit_eos = True
88 |
89 | return labels
90 |
91 |
92 | if __name__ == "__main__":
93 | parser = argparse.ArgumentParser(
94 | description='Do force alignment for VCTK and save HTK-style alignments')
95 | parser.add_argument(
96 | '--nthreads', default=multiprocessing.cpu_count(), type=int,
97 | help='number of alignment threads')
98 | parser.add_argument(
99 | '--conservative', dest='conservative', action='store_true',
100 | help='conservative alignment')
101 | parser.set_defaults(conservative=False)
102 | parser.add_argument(
103 | '--disfluency', dest='disfluency', action='store_true',
104 | help='include disfluencies (uh, um) in alignment')
105 | parser.set_defaults(disfluency=False)
106 | parser.add_argument(
107 | '--log', default="INFO",
108 | help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)')
109 | parser.add_argument('data_root', type=str, help='Data root')
110 |
111 | args = parser.parse_args()
112 |
113 | log_level = args.log.upper()
114 | logging.getLogger().setLevel(log_level)
115 | disfluencies = set(['uh', 'um'])
116 |
117 | data_root = args.data_root
118 |
119 | # Do for all speakers
120 | speakers = vctk.available_speakers
121 |
122 | # Collect all transcripts/wav files
123 | td = vctk.TranscriptionDataSource(data_root, speakers=speakers)
124 | transcriptions = td.collect_files()
125 | wav_paths = vctk.WavFileDataSource(
126 | data_root, speakers=speakers).collect_files()
127 |
128 | # Save dir
129 | save_dir = join(data_root, "lab")
130 | if not exists(save_dir):
131 | os.makedirs(save_dir)
132 |
133 | resources = gentle.Resources()
134 |
135 | for idx in tqdm(range(len(wav_paths))):
136 | transcript = transcriptions[idx]
137 | audiofile = wav_paths[idx]
138 | lab_path = audiofile.replace("wav48/", "lab/").replace(".wav", ".lab")
139 | print(transcript)
140 | print(audiofile)
141 | print(lab_path)
142 | lab_dir = dirname(lab_path)
143 | if not exists(lab_dir):
144 | os.makedirs(lab_dir)
145 |
146 | logging.info("converting audio to 8K sampled wav")
147 | with gentle.resampled(audiofile) as wavfile:
148 | logging.info("starting alignment")
149 | aligner = gentle.ForcedAligner(resources, transcript,
150 | nthreads=args.nthreads,
151 | disfluency=args.disfluency,
152 | conservative=args.conservative,
153 | disfluencies=disfluencies)
154 | result = aligner.transcribe(
155 | wavfile, progress_cb=on_progress, logging=logging)
156 |
157 | # convert to htk format
158 | a = json.loads(result.to_json())
159 | try:
160 | labels = json2hts(a)
161 | except RuntimeError as e:
162 | from warnings import warn
163 | warn(str(e))
164 | continue
165 |
166 | # Insert end time
167 | x, sr = librosa.load(wavfile, sr=8000)
168 | endtime = float(len(x)) / sr
169 | labels[-1] = (labels[-1][0], endtime, labels[-1][-1])
170 |
171 | # write to file
172 | write_hts_label(labels, lab_path)
173 |
--------------------------------------------------------------------------------
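`json2hts()` only relies on the Gentle result fields read in the loop above (`case`, `start`, `end`, and per-phone `phone`/`duration`). A tiny hedged example, run in the same module, with a hand-built, made-up result showing the label tuples it produces (in the script, the last label's end time is then replaced with the true audio length):

```
# Illustrative Gentle-style result; the numbers are invented for the example.
fake_result = {
    "words": [
        {
            "case": "success",
            "start": 0.30,
            "end": 0.55,
            "phones": [
                {"phone": "hh_B", "duration": 0.10},
                {"phone": "ay_E", "duration": 0.15},
            ],
        },
    ]
}
labels = json2hts(fake_result)
# -> approximately [(0, 0.30, 'silB'), (0.30, 0.40, 'hh'), (0.40, 0.55, 'ay'), (0.55, 0.55, 'silE')]
write_hts_label(labels, "/tmp/example.lab")  # writes times as 100-ns integers
```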
/dv3/vctk_preprocess/prepare_vctk_labels.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Prepare HTS alignments for VCTK.
4 |
5 | usage: prepare_vctk_labels.py [options] <data_root> <out_dir>
6 |
7 | options:
8 | -h, --help Show help message.
9 | """
10 | from docopt import docopt
11 | import os
12 | from nnmnkwii.datasets import vctk
13 | from os.path import join, exists, splitext, basename
14 | import sys
15 | from glob import glob
16 |
17 | from subprocess import Popen, PIPE
18 | from tqdm import tqdm
19 |
20 |
21 | def do(cmd):
22 | print(cmd)
23 | p = Popen(cmd, shell=True)
24 | p.wait()
25 |
26 |
27 | if __name__ == "__main__":
28 | args = docopt(__doc__)
29 |     data_root = args["<data_root>"]
30 |     out_dir = args["<out_dir>"]
31 |
32 | for idx in tqdm(range(len(vctk.available_speakers))):
33 | speaker = vctk.available_speakers[idx]
34 |
35 | wav_root = join(data_root, "wav48/p{}".format(speaker))
36 | txt_root = join(data_root, "txt/p{}".format(speaker))
37 | assert exists(wav_root)
38 | assert exists(txt_root)
39 | print(wav_root, txt_root)
40 |
41 | # Do alignments
42 | cmd = "python ./extract_feats.py -w {} -t {}".format(wav_root, txt_root)
43 | do(cmd)
44 |
45 | # Copy
46 | lab_dir = join(out_dir, "p{}".format(speaker))
47 | if not exists(lab_dir):
48 | os.makedirs(lab_dir)
49 | cmd = "cp ./latest_features/merlin/misc/scripts/alignment/phone_align/full-context-labels/mono/*.lab {}".format(
50 | lab_dir)
51 | do(cmd)
52 |
53 | # Remove
54 | do("rm -rf ./latest_features")
55 |
56 | sys.exit(0)
57 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup, find_packages
4 | import setuptools.command.develop
5 | import setuptools.command.build_py
6 | import os
7 | import subprocess
8 |
9 | version = '0.0.1'
10 |
11 | # Adapted from https://github.com/pytorch/pytorch
12 | cwd = os.path.dirname(os.path.abspath(__file__))
13 | if os.getenv('TACOTRON_BUILD_VERSION'):
14 | version = os.getenv('TACOTRON_BUILD_VERSION')
15 | else:
16 | try:
17 | sha = subprocess.check_output(
18 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
19 | version += '+' + sha[:7]
20 | except subprocess.CalledProcessError:
21 | pass
22 |
23 |
24 | class build_py(setuptools.command.build_py.build_py):
25 |
26 | def run(self):
27 | self.create_version_file()
28 | setuptools.command.build_py.build_py.run(self)
29 |
30 | @staticmethod
31 | def create_version_file():
32 | global version, cwd
33 | print('-- Building version ' + version)
34 | version_path = os.path.join(cwd, 'deepvoice3_pytorch', 'version.py')
35 | with open(version_path, 'w') as f:
36 | f.write("__version__ = '{}'\n".format(version))
37 |
38 |
39 | class develop(setuptools.command.develop.develop):
40 |
41 | def run(self):
42 | build_py.create_version_file()
43 | setuptools.command.develop.develop.run(self)
44 |
45 |
46 | setup(name='deepvoice3_pytorch',
47 | version=version,
48 | description='PyTorch implementation of the Deep Voice 3 speech synthesis model.',
49 | packages=find_packages(),
50 | cmdclass={
51 | 'build_py': build_py,
52 | 'develop': develop,
53 | },
54 | install_requires=[
55 | "numpy",
56 | "scipy",
57 | "unidecode",
58 | "inflect",
59 | "librosa",
60 | "numba",
61 | "lws <= 1.0",
62 | ],
63 | extras_require={
64 | "train": [
65 | "docopt",
66 | "tqdm",
67 | "tensorboardX",
68 | "nnmnkwii >= 0.0.9",
69 | "nltk",
70 | ],
71 | "test": [
72 | "nose",
73 | ],
74 | "jp": [
75 | "jaconv",
76 | "mecab-python3",
77 | ],
78 | })
79 |
--------------------------------------------------------------------------------
/train_encoder.py:
--------------------------------------------------------------------------------
1 | from docopt import docopt
2 | import sys
3 | from os.path import dirname, join
4 | from tqdm import tqdm, trange
5 | from datetime import datetime
6 |
7 | import pickle
8 |
9 | import torch
10 | from torch.autograd import Variable
11 | from torch.utils.data import Dataset, DataLoader
12 | from torch.utils import data as data_utils
13 | from torch import nn
14 | from torch import optim
15 | import torch.backends.cudnn as cudnn
16 | from torch.utils import data as data_utils
17 | from torch.utils.data.sampler import Sampler
18 | import numpy as np
19 | from numba import jit
20 |
21 |
22 | from utils import generate_cloned_samples, Speech_Dataset
23 | import dv3
24 |
25 | import sys
26 | import os
27 |
28 | # sys.path.append('./deepvoice3_pytorch')
29 | from dv3 import build_deepvoice_3
30 | from Encoder import Encoder
31 |
32 | # print(hparams)
33 | batch_size_encoder = 16
34 |
35 |
36 | global_step = 0
37 | global_epoch = 0
38 | use_cuda = torch.cuda.is_available()
39 | if use_cuda:
40 | cudnn.benchmark = False
41 |
42 | def get_cloned_voices(model,no_speakers = 108,no_cloned_texts = 23):
43 | try:
44 | with open("./Cloning_Audio/speakers_cloned_voices_mel.p" , "rb") as fp:
45 | cloned_voices = pickle.load(fp)
46 | except (OSError, EOFError, pickle.UnpicklingError):
47 | cloned_voices = generate_cloned_samples(model)
48 | if(np.array(cloned_voices).shape != (no_speakers , no_cloned_texts)):
49 | cloned_voices = generate_cloned_samples(model,"./Cloning_Audio/cloning_text.txt" ,no_speakers,True,0)
50 | print("Cloned_voices Loaded!")
51 | return cloned_voices
52 |
53 | # Assumes that only Deep Voice 3 is given
54 | def get_speaker_embeddings(model):
55 | '''
56 | return the speaker embeddings and its shape from deep voice 3
57 | '''
58 | embed = model.embed_speakers.weight.data
59 | # shape = embed.shape
60 | return embed
61 |
62 | def build_encoder():
63 | encoder = Encoder()
64 | return encoder
65 |
66 |
67 | def save_checkpoint(model, optimizer, checkpoint_path, epoch):
68 |
69 | optimizer_state = optimizer.state_dict()
70 | torch.save({
71 | "state_dict": model.state_dict(),
72 | "optimizer": optimizer_state,
73 | "global_epoch": epoch,
74 | "epoch":epoch+1,
75 |
76 | }, checkpoint_path)
77 | print("Saved checkpoint:", checkpoint_path)
78 |
79 | def load_checkpoint(encoder, optimizer, path='checkpoints/encoder_checkpoint.pth'):
80 |
81 | checkpoint = torch.load(path)
82 |
83 | encoder.load_state_dict(checkpoint["state_dict"])
84 |
85 | print('Encoder state restored')
86 |
87 | optimizer.load_state_dict(checkpoint["optimizer"])
88 |
89 | print('Optimizer state restored')
90 |
91 | return encoder, optimizer
92 |
93 | def my_collate(batch):
94 | data = [item[0] for item in batch]
95 | samples = [text.shape[0] for text in data]
96 | max_size = data[0].shape[1]
97 | max_samples = np.amax(np.array(samples))
98 | for i, i_element in enumerate(data):
99 | final = torch.zeros(int(max_samples), max_size, 80)
100 | final[:data[i].shape[0], :, :] += torch.from_numpy(i_element).type(torch.FloatTensor)
101 | data[i]=torch.unsqueeze(final, 0)
102 | data = torch.cat(data, 0)
103 | target = np.stack([item[1] for item in batch], 0)
104 | target = torch.from_numpy(target)
105 | return [data, target]
106 |
107 | def train_encoder(encoder, data, optimizer, scheduler, criterion, epochs=100000, after_epoch_download=1000):
108 |
109 | #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.6)
110 |
111 | for i in range(epochs):
112 |
113 | epoch_loss=0.0
114 |
115 | for i_element, element in enumerate(data):
116 |
117 | voice, embed = element[0], element[1]
118 |
119 | input_to_encoder = Variable(voice.type(torch.cuda.FloatTensor))
120 |
121 | optimizer.zero_grad()
122 |
123 | output_from_encoder = encoder(input_to_encoder)
124 |
125 | embeddings = Variable(embed.type(torch.cuda.FloatTensor))
126 |
127 | loss = criterion(output_from_encoder,embeddings)
128 |
129 | loss.backward()
130 | 
131 | optimizer.step()
132 | scheduler.step()
133 | 
134 | epoch_loss += loss.data[0]
135 |
136 |
137 | if i%100==99:
138 | save_checkpoint(encoder,optimizer,"checkpoints/encoder_checkpoint.pth",i)
139 | print(i, ' done')
140 | print('Loss for epoch ', i, ' is ', epoch_loss)
141 |
142 | def download_file(file_name=None):
143 | from google.colab import files
144 | files.download(file_name)
145 |
146 |
147 | batch_size=64
148 |
149 | if __name__ == "__main__":
150 |
151 | #Load Deep Voice 3
152 | # Pre Trained Model
153 | print("start")
154 | dv3_model = build_deepvoice_3(True)
155 | print("dv3 built")
156 | all_speakers = get_cloned_voices(dv3_model)
157 | print("Cloning Texts are produced")
158 |
159 | speaker_embed = get_speaker_embeddings(dv3_model)
160 |
161 | encoder = build_encoder()
162 |
163 | print("Encoder is built!")
164 |
165 |
166 | speech_data = Speech_Dataset(all_speakers, speaker_embed, sampler=True)
167 |
168 | criterion = nn.L1Loss()
169 |
170 | optimizer = torch.optim.SGD(encoder.parameters(),lr=0.0006)
171 |
172 | lambda1 = lambda epoch: 0.6 if epoch%8000==7999 else 1
173 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)
174 |
175 |
176 | data_loader = DataLoader(speech_data, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn = my_collate)
177 | # Training The Encoder
178 |
179 | encoder = encoder.cuda()
180 |
181 | if os.path.isfile('checkpoints/encoder_checkpoint.pth'):
182 | encoder, optimizer = load_checkpoint(encoder, optimizer)
183 |
184 | try:
185 | train_encoder(encoder, data_loader, optimizer, scheduler, criterion, epochs=100000)
186 | except KeyboardInterrupt:
187 | print("KeyboardInterrupt")
188 |
189 | print("Finished")
190 | sys.exit(0)
191 |
--------------------------------------------------------------------------------
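Note on train_encoder.py above: my_collate exists because, with sampler=True, each speaker in a batch contributes a different number of cloning samples, so the batch cannot be stacked directly. It zero-pads the sample dimension up to the largest count in the batch and returns one 4-D tensor plus the stacked target embeddings. A minimal sketch on synthetic data (the shapes and the 16-dimensional embedding are illustrative assumptions; it assumes the repository root is on PYTHONPATH with its dependencies installed):

    import numpy as np
    from train_encoder import my_collate  # assumes the repo root is importable

    T, n_mels, embed_dim = 120, 80, 16     # illustrative sizes
    batch = [
        (np.random.rand(3, T, n_mels).astype(np.float32), np.random.rand(embed_dim).astype(np.float32)),
        (np.random.rand(7, T, n_mels).astype(np.float32), np.random.rand(embed_dim).astype(np.float32)),
    ]
    voices, embeddings = my_collate(batch)
    print(voices.shape)      # torch.Size([2, 7, 120, 80]) -- padded to the largest sample count
    print(embeddings.shape)  # torch.Size([2, 16])
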
/train_whole.py:
--------------------------------------------------------------------------------
1 | """usage: train_whole.py [options]\n\noptions:\n    --data-root=<dir>            Directory containing preprocessed features.\n    --checkpoints-dv3=<dir>      Directory to save / load Deep Voice 3 checkpoints.\n    --checkpoint-encoder=<path>  Path to an encoder checkpoint to restore.\n    --train-dv3                  Train only the Deep Voice 3 model.\n    --train-encoder              Train only the speaker encoder.\n    -h, --help                   Show this help message and exit.\n"""
2 | from docopt import docopt
3 | import sys
4 | from os.path import dirname, join
5 | from tqdm import tqdm, trange
6 | from datetime import datetime
7 |
8 | import pickle
9 |
10 | import torch
11 | from torch.autograd import Variable
12 | from torch.utils.data import Dataset, DataLoader
13 | from torch.utils import data as data_utils
14 | from torch import nn
15 | from torch import optim
16 | import torch.backends.cudnn as cudnn
17 | from torch.utils import data as data_utils
18 | from torch.utils.data.sampler import Sampler
19 | import numpy as np
20 | from numba import jit
21 | from nnmnkwii.datasets import FileSourceDataset, FileDataSource
22 | from os.path import join, expanduser
23 |
24 |
25 | # import requirements for dv3
26 | from utils import generate_cloned_samples, Speech_Dataset
27 | import dv3
28 | from dv3 import build_deepvoice_3
29 | from dv3.hparams import hparams, hparams_debug_string
30 | from dv3.train import train as train_dv3
31 | from dv3.train import TextDataSource,MelSpecDataSource,LinearSpecDataSource,\
32 | PyTorchDataset,PartialyRandomizedSimilarTimeLengthSampler
33 | from dv3.train import collate_fn
34 | from dv3.deepvoice3_pytorch import frontend
35 | from dv3.train import sequence_mask, spec_loss, eval_model, guided_attentions
36 | from dv3.train import save_checkpoint as save_checkpoint_dv3
37 | from dv3.train import save_states as save_states_dv3
38 | from tensorboardX import SummaryWriter
39 |
40 | # requirements for encoder
41 | from utils import generate_cloned_samples, Speech_Dataset
42 | from Encoder import Encoder
43 | from train_encoder import get_cloned_voices,build_encoder,get_speaker_embeddings
44 | from train_encoder import load_checkpoint as load_checkpoint_encoder
45 | from train_encoder import save_checkpoint as save_checkpoint_encoder
46 | from train_encoder import train_encoder
47 |
48 |
49 | import sys
50 | import os
51 |
52 | # sys.path.append('./deepvoice3_pytorch')
53 |
54 | # print(hparams)
55 | batch_size_encoder = 16
56 |
57 |
58 | global_step = 0
59 | global_epoch = 0
60 | use_cuda = torch.cuda.is_available()
61 | if use_cuda:
62 | cudnn.benchmark = False
63 |
64 |
65 | def train(model_dv3,model_encoder,
66 | data_loader_dv3,
67 | optimizer_dv3,
68 | init_lr_dv3=0.002,
69 | checkpoint_dir_dv3=None,
70 | clip_thresh = 1.0,
71 | data_loader_encoder=None,
72 | optimizer_encoder=None,
73 | scheduler_encoder=None,
74 | checkpoint_interval=None,
75 | writer=None, nepochs=None):
76 | # this training function is to train the combined model
77 |
78 | grads = {}
79 | def save_grad(name):
80 | def hook(grad):
81 | grads[name] = grad
82 | return hook
83 |
84 | # to remember the embeddings of the speakers
85 | model_dv3.embed_speakers.weight.register_hook(save_grad('embeddings'))
86 |
87 | if use_cuda:
88 | model_dv3 = model_dv3.cuda()
89 | model_encoder = model_encoder.cuda()
90 | linear_dim = model_dv3.linear_dim
91 | r = hparams.outputs_per_step
92 | downsample_step = hparams.downsample_step
93 | current_lr = init_lr_dv3
94 |
95 | binary_criterion_dv3 = nn.BCELoss()
96 |
97 | global global_step, global_epoch
98 | while global_epoch < nepochs:
99 | running_loss = 0.0
100 | for step, (x, input_lengths, mel, y, positions, done, target_lengths,
101 | speaker_ids) \
102 | in tqdm(enumerate(data_loader_dv3)):
103 |
104 |
105 | model_dv3.zero_grad()
106 | model_encoder.zero_grad()
107 |
108 | #Declaring Requirements
109 | model_dv3.train()
110 | ismultispeaker = speaker_ids is not None
111 | # Learning rate schedule
112 | if hparams.lr_schedule is not None:
113 | lr_schedule_f = getattr(dv3.lrschedule, hparams.lr_schedule)
114 | current_lr = lr_schedule_f(
115 | init_lr_dv3, global_step, **hparams.lr_schedule_kwargs)
116 | for param_group in optimizer_dv3.param_groups:
117 | param_group['lr'] = current_lr
118 | optimizer_dv3.zero_grad()
119 |
120 | # Used for Position encoding
121 | text_positions, frame_positions = positions
122 |
123 | # Downsample mel spectrogram
124 | if downsample_step > 1:
125 | mel = mel[:, 0::downsample_step, :].contiguous()
126 |
127 | # Lengths
128 | input_lengths = input_lengths.long().numpy()
129 | decoder_lengths = target_lengths.long().numpy() // r // downsample_step
130 |
131 | voice_encoder = mel.view(mel.shape[0],1,mel.shape[1],mel.shape[2])
132 | # Feed data
133 | x, mel, y = Variable(x), Variable(mel), Variable(y)
134 | voice_encoder = Variable(voice_encoder)
135 | text_positions = Variable(text_positions)
136 | frame_positions = Variable(frame_positions)
137 | done = Variable(done)
138 | target_lengths = Variable(target_lengths)
139 | speaker_ids = Variable(speaker_ids) if ismultispeaker else None
140 | if use_cuda:
141 | x = x.cuda()
142 | text_positions = text_positions.cuda()
143 | frame_positions = frame_positions.cuda()
144 | y = y.cuda()
145 | mel = mel.cuda()
146 | voice_encoder = voice_encoder.cuda()
147 | done, target_lengths = done.cuda(), target_lengths.cuda()
148 | speaker_ids = speaker_ids.cuda() if ismultispeaker else None
149 |
150 | # Create mask if we use masked loss
151 | if hparams.masked_loss_weight > 0:
152 | # decoder output domain mask
153 | decoder_target_mask = sequence_mask(
154 | target_lengths / (r * downsample_step),
155 | max_len=mel.size(1)).unsqueeze(-1)
156 | if downsample_step > 1:
157 | # spectrogram-domain mask
158 | target_mask = sequence_mask(
159 | target_lengths, max_len=y.size(1)).unsqueeze(-1)
160 | else:
161 | target_mask = decoder_target_mask
162 | # shift mask
163 | decoder_target_mask = decoder_target_mask[:, r:, :]
164 | target_mask = target_mask[:, r:, :]
165 | else:
166 | decoder_target_mask, target_mask = None, None
167 |
168 | # apply encoder model to the mel spectrograms to predict speaker embeddings
169 | encoder_out = model_encoder(voice_encoder)
170 | 
171 | 
172 | model_dv3.embed_speakers.weight.data = (encoder_out).data
173 | # Apply dv3 model
174 | mel_outputs, linear_outputs, attn, done_hat = model_dv3(
175 | x, mel, speaker_ids=speaker_ids,
176 | text_positions=text_positions, frame_positions=frame_positions,
177 | input_lengths=input_lengths)
178 |
179 |
180 |
181 | # Losses
182 | w = hparams.binary_divergence_weight
183 |
184 | # mel:
185 | mel_l1_loss, mel_binary_div = spec_loss(
186 | mel_outputs[:, :-r, :], mel[:, r:, :], decoder_target_mask)
187 | mel_loss = (1 - w) * mel_l1_loss + w * mel_binary_div
188 |
189 | # done:
190 | done_loss = binary_criterion_dv3(done_hat, done)
191 |
192 | # linear:
193 | n_priority_freq = int(hparams.priority_freq / (hparams.sample_rate * 0.5) * linear_dim)
194 | linear_l1_loss, linear_binary_div = spec_loss(
195 | linear_outputs[:, :-r, :], y[:, r:, :], target_mask,
196 | priority_bin=n_priority_freq,
197 | priority_w=hparams.priority_freq_weight)
198 | linear_loss = (1 - w) * linear_l1_loss + w * linear_binary_div
199 |
200 | # Combine losses
201 | loss_dv3 = mel_loss + linear_loss + done_loss
202 | # loss_dv3 = mel_loss + done_loss
203 | # loss_dv3 = linear_loss
204 |
205 | # attention
206 | if hparams.use_guided_attention:
207 | soft_mask = guided_attentions(input_lengths, decoder_lengths,
208 | attn.size(-2),
209 | g=hparams.guided_attention_sigma)
210 | soft_mask = Variable(torch.from_numpy(soft_mask))
211 | soft_mask = soft_mask.cuda() if use_cuda else soft_mask
212 | attn_loss = (attn * soft_mask).mean()
213 | loss_dv3 += attn_loss
214 |
215 | if global_step > 0 and global_step % checkpoint_interval == 0:
216 | save_states_dv3(
217 | global_step, writer, mel_outputs, linear_outputs, attn,
218 | mel, y, input_lengths, checkpoint_dir_dv3)
219 | save_checkpoint_dv3(
220 | model_dv3, optimizer_dv3, global_step, checkpoint_dir_dv3, global_epoch,
221 | True, True)
222 |
223 | if global_step > 0 and global_step % hparams.eval_interval == 0:
224 | eval_model(global_step, writer, model_dv3, checkpoint_dir_dv3, ismultispeaker)
225 |
226 | # Update
227 | loss_dv3.backward()
228 | encoder_out.backward(grads['embeddings'])
229 |
230 | optimizer_dv3.step()
231 | optimizer_encoder.step()
232 |
233 | # if clip_thresh> 0:
234 | # grad_norm = torch.nn.utils.clip_grad_norm(
235 | # model.get_trainable_parameters(), clip_thresh)
236 | global_step += 1
237 | running_loss += loss_dv3.data[0]
238 |
239 | averaged_loss = running_loss / (len(data_loader_dv3))
240 | 
241 | print("Loss: {}".format(averaged_loss))
242 |
243 | global_epoch += 1
244 |
245 |
246 | # dv3 loss function
247 | # backward on that
248 | # mel_outputs.backward()
249 | # dv3_model.embed_speakers.weight.data = (encoder_out).data
250 |
251 |
252 | if __name__ == "__main__":
253 |
254 | args = docopt(__doc__)
255 | print("Command line args:\n",args)
256 |
257 | checkpoint_dv3 = args["--checkpoints-dv3"]
258 | checkpoint_encoder = args["--checkpoint-encoder"]
259 | speaker_id = None
260 | dv3_preset =None
261 |
262 | data_root = args["--data-root"]
263 | if data_root is None:
264 | data_root = join(dirname(__file__), "data", "ljspeech")
265 |
266 |
267 |
268 | train_dv3_v = args["--train-dv3"]
269 | train_encoder_v = args["--train-encoder"]
270 |
271 |
272 | if not train_dv3_v and not train_encoder_v:
273 | print("Training whole model")
274 | train_dv3_v,train_encoder_v= True,True
275 | if train_dv3_v:
276 | print("Training deep voice 3 model")
277 | elif train_encoder_v:
278 | print("Training encoder model")
279 | else:
280 | assert False, "Wrong arguments specified"
281 |
282 | if checkpoint_dv3 is not None: os.makedirs(checkpoint_dv3, exist_ok=True)
283 |
284 | # Input dataset definitions
285 | X = FileSourceDataset(TextDataSource(data_root, speaker_id))
286 | Mel = FileSourceDataset(MelSpecDataSource(data_root, speaker_id))
287 | Y = FileSourceDataset(LinearSpecDataSource(data_root, speaker_id))
288 |
289 | # Prepare sampler
290 | frame_lengths = Mel.file_data_source.frame_lengths
291 | sampler = PartialyRandomizedSimilarTimeLengthSampler(
292 | frame_lengths, batch_size=hparams.batch_size)
293 |
294 | # Dataset and Dataloader setup
295 | dataset = PyTorchDataset(X, Mel, Y)
296 | data_loader_dv3 = data_utils.DataLoader(
297 | dataset, batch_size=hparams.batch_size,
298 | num_workers=hparams.num_workers, sampler=sampler,
299 | collate_fn=collate_fn, pin_memory=hparams.pin_memory)
300 | print("dataloader for dv3 prepared")
301 |
302 | dv3.train._frontend = getattr(frontend, hparams.frontend)
303 | dv3_model = build_deepvoice_3(dv3_preset , checkpoint_dv3)
304 | print("Built dv3!")
305 |
306 | if use_cuda:
307 | dv3_model = dv3_model.cuda()
308 |
309 | dv3_optimizer = optim.Adam(dv3_model.get_trainable_parameters(),
310 | lr=hparams.initial_learning_rate, betas=(
311 | hparams.adam_beta1, hparams.adam_beta2),
312 | eps=hparams.adam_eps, weight_decay=hparams.weight_decay)
313 |
314 | log_event_path = "log/run-test" + str(datetime.now()).replace(" ", "_")
315 | print("Log event path for dv3: {}".format(log_event_path))
316 | writer_dv3 = SummaryWriter(log_dir=log_event_path)
317 |
318 | # ENCODER
319 | all_speakers = get_cloned_voices(dv3_model)
320 | print("Cloning Texts are produced")
321 |
322 | speaker_embed = get_speaker_embeddings(dv3_model)
323 |
324 | encoder = build_encoder()
325 |
326 | print("Encoder is built!")
327 |
328 | speech_data_encoder = Speech_Dataset(all_speakers, speaker_embed)
329 |
330 | criterion_encoder = nn.L1Loss()
331 |
332 | optimizer_encoder = torch.optim.SGD(encoder.parameters(),lr=0.0006)
333 |
334 | lambda1_encoder = lambda epoch: 0.6 if epoch%8000==7999 else 1
335 | scheduler_encoder = torch.optim.lr_scheduler.LambdaLR(optimizer_encoder, lr_lambda=lambda1_encoder)
336 |
337 | data_loader_encoder = data_utils.DataLoader(speech_data_encoder, batch_size=batch_size_encoder, shuffle=True, drop_last=True)
338 | # Training The Encoder
339 | dataiter_encoder = iter(data_loader_encoder)
340 |
341 | if use_cuda:
342 | encoder = encoder.cuda()
343 |
344 | if checkpoint_encoder is not None and os.path.isfile(checkpoint_encoder):
345 | encoder, optimizer_encoder = load_checkpoint_encoder(encoder, optimizer_encoder, checkpoint_encoder)
346 |
347 | if train_encoder_v and train_dv3_v:
348 | try:
349 | train(dv3_model, encoder, data_loader_dv3, dv3_optimizer, init_lr_dv3=hparams.initial_learning_rate, checkpoint_dir_dv3=checkpoint_dv3, clip_thresh=hparams.clip_thresh, data_loader_encoder=data_loader_encoder, optimizer_encoder=optimizer_encoder, scheduler_encoder=scheduler_encoder, checkpoint_interval=hparams.checkpoint_interval, writer=writer_dv3, nepochs=hparams.nepochs)
350 | except KeyboardInterrupt:
351 | print("KeyboardInterrupt")
352 | elif train_encoder_v:
353 | try:
354 | train_encoder(encoder , data_loader_encoder , optimizer_encoder,scheduler_encoder,criterion_encoder,epochs=100000)
355 | except KeyboardInterrupt:
356 |
357 | print("KeyboardInterrupt")
358 |
359 | elif train_dv3_v:
360 | try:
361 | train_dv3(dv3_model ,data_loader_dv3, dv3_optimizer, writer_dv3,
362 | init_lr=hparams.initial_learning_rate,
363 | checkpoint_dir=checkpoint_dv3,
364 | checkpoint_interval=hparams.checkpoint_interval,
365 | nepochs=hparams.nepochs,
366 | clip_thresh=hparams.clip_thresh,
367 | train_seq2seq=True, train_postnet=True)
368 | except KeyboardInterrupt:
369 |
370 | print("KeyboardInterrupt")
371 | else:
372 | assert False , "Wrongs arguments specified"
373 |
374 | print("Finished")
375 | sys.exit(0)
376 |
--------------------------------------------------------------------------------
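Note on train_whole.py above: the joint training step relies on gradient routing. A hook registered on model_dv3.embed_speakers.weight stores the gradient that the synthesis loss produces on the speaker-embedding table, and encoder_out.backward(grads['embeddings']) then pushes that same gradient back through the encoder, so both networks are updated from one synthesis loss. A self-contained toy of that mechanism (stand-in modules and arbitrary sizes, not the repository's models):

    import torch
    from torch import nn

    grads = {}

    def save_grad(name):
        def hook(grad):
            grads[name] = grad
        return hook

    embed = nn.Embedding(4, 8)    # stands in for model_dv3.embed_speakers
    encoder = nn.Linear(10, 8)    # stands in for the speaker encoder
    embed.weight.register_hook(save_grad('embeddings'))

    # The encoder predicts an embedding table, which is copied into the embedding weight.
    encoder_out = encoder(torch.randn(4, 10))
    embed.weight.data = encoder_out.data

    # A dummy "synthesis" loss that uses the embeddings puts a gradient on embed.weight ...
    loss = embed(torch.tensor([0, 1, 2, 3])).pow(2).sum()
    loss.backward()

    # ... which the hook captured, so it can now be propagated into the encoder's parameters.
    encoder_out.backward(grads['embeddings'])
    print(encoder.weight.grad.shape)   # torch.Size([8, 10])
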
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import exists, join, expanduser
3 |
4 | import torch
5 | import numpy as np
6 | import librosa
7 | import librosa.display
8 | from torch.utils.data import Dataset
9 |
10 | # need this for English text processing frontend
11 | import nltk
12 | import pickle
13 |
14 | import IPython.display
15 | from IPython.display import Audio
16 | from matplotlib.pyplot import figure, subplot, imshow, xlabel, ylabel, colorbar, tight_layout
17 | from dv3.hparams import hparams
18 | fs, hop_length = hparams.sample_rate, hparams.hop_size  # assumes dv3.hparams defines these
19 | from dv3.synthesis import tts as _tts
20 |
21 |
22 | def tts(model, text, p=0, speaker_id=0, fast=True, figures=True):
23 | from dv3.synthesis import tts as _tts
24 | waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id, fast)
25 | if figures:
26 | visualize(alignment, spectrogram)
27 | IPython.display.display(Audio(waveform, rate=fs))
28 |
29 | def visualize(alignment, spectrogram):
30 | label_fontsize = 16
31 | figure(figsize=(16,16))
32 |
33 | subplot(2,1,1)
34 | imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
35 | xlabel("Decoder timestamp", fontsize=label_fontsize)
36 | ylabel("Encoder timestamp", fontsize=label_fontsize)
37 | colorbar()
38 |
39 | subplot(2,1,2)
40 | librosa.display.specshow(spectrogram.T, sr=fs,
41 | hop_length=hop_length, x_axis="time", y_axis="linear")
42 | xlabel("Time", fontsize=label_fontsize)
43 | ylabel("Hz", fontsize=label_fontsize)
44 | tight_layout()
45 | colorbar()
46 |
47 |
48 | def generate_cloned_samples(model,cloning_text_path = None, no_speakers = 108 , fast = True, p =0 ):
49 |
50 | #cloning_texts = ["this is the first" , "this is the second"]
51 | if cloning_text_path is None:
52 | cloning_text_path = "./Cloning_Audio/cloning_text.txt"
53 | 
54 | cloning_texts = open(cloning_text_path).read().splitlines()
55 | # no_cloning_texts = len(cloning_texts)
56 |
57 | all_speakers = []
58 |
59 | for speaker_id in range(no_speakers):
60 | speaker_cloning_mel = []
61 | print("The Speaker being cloned speaker-{}".format(speaker_id))
62 | for text in cloning_texts:
63 | waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id, fast)
64 | speaker_cloning_mel.append(mel)
65 | #print(np.array(speaker_cloning_mel).shape)
66 | all_speakers.append(speaker_cloning_mel)
67 | with open("./Cloning_Audio/speakers_cloned_voices_mel.p", "wb") as fp: #Pickling
68 | pickle.dump(all_speakers, fp)
69 | # print("")
70 |
71 | print("Shape of all speakers:",np.array(all_speakers).shape)
72 | # print(all_speakers.shape)
73 |
74 |
75 | # all speakers[speaker_id][cloned_audio_number]
76 | # print(all_speakers[0][1].shape)
77 | return all_speakers
78 |
79 | class Speech_Dataset(Dataset):
80 | def __init__(self, mfccs, embeddings, sampler=False):
81 | '''Mfccs have to be list of lists of numpy arrays. Each of these numpy arrays will be a mel spectrogram'''
82 | self.voices = mfccs
83 | temp = [spec.shape[0] for text in self.voices for spec in text]
84 | largest_size = np.amax(np.array(temp))
85 | self._pad(largest_size)
86 | self.embeddings = embeddings
87 | # remember whether __getitem__ should randomly subsample the cloning texts
88 | self.sampler = sampler
89 |
90 | def _pad(self, maximum_size):
91 | '''Input:
92 | Specs: Mel Spectrograms with 80 channels but the length of each channel is not the same.
93 | maximum_size: Largest channel length. Others are padded to this length
94 |
95 | Padding with 0 won't affect the convolutions because anyway the neurons corresponding to the states have to
96 | be dead if they are not padded. Putting 0 will also make those neurons dead. And later an average is taken along
97 | this dimension too.
98 |
99 | Returns: A padded array of arrays of spectrograms.'''
100 |
101 | for i, i_element in enumerate(self.voices):
102 | for j, j_element in enumerate(i_element):
103 | final = np.zeros((maximum_size, 80))
104 | final[:self.voices[i][j].shape[0], :] += j_element
105 | self.voices[i][j]=final
106 | self.voices = np.array(self.voices)
107 | print(self.voices.shape)
108 |
109 | def __len__(self):
110 | '''Returns total number of speakers'''
111 | return len(self.voices)
112 |
113 | def __getitem__(self, idx):
114 | if self.sampler==False:
115 | return (self.voices[idx], self.embeddings[idx])
116 | elif self.sampler==True:
117 | sample = np.random.randint(1, 23, size=int(np.random.randint(1, 10, size=1)))
118 | return (self.voices[idx, sample, :, :], self.embeddings[idx])
119 |
--------------------------------------------------------------------------------
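Note on utils.py above: Speech_Dataset._pad zero-pads every mel spectrogram along the time axis to the longest one, so the whole collection stacks into a single (speakers, texts, max_T, 80) array that __getitem__ can index per speaker; as its docstring notes, the zero frames are harmless because an average is later taken along that dimension. A tiny synthetic illustration of the same padding (sizes made up):

    import numpy as np

    speakers, texts, n_mels = 3, 5, 80
    voices = [[np.random.rand(np.random.randint(50, 120), n_mels) for _ in range(texts)]
              for _ in range(speakers)]

    max_T = max(mel.shape[0] for speaker in voices for mel in speaker)
    padded = np.zeros((speakers, texts, max_T, n_mels))
    for i, speaker in enumerate(voices):
        for j, mel in enumerate(speaker):
            padded[i, j, :mel.shape[0], :] = mel

    print(padded.shape)   # (3, 5, max_T, 80), mirroring self.voices after _pad
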