├── AudioSamples ├── step000009300_predicted.wav ├── step000009400_predicted.wav └── step000009500_predicted.wav ├── Cloning_Audio ├── cloning_text.txt └── speakers_cloned_voices_mel.p ├── Encoder.py ├── Img ├── Epoch Loss.png └── Workflow.png ├── Modules ├── Attention.py ├── CloningSamplesAttention.py ├── Conv1dGLU.py ├── MultiHeadAttention.py ├── SpectralProcessing.py └── TemporalProcessing.py ├── README.md ├── checkpoints └── encoder_checkpoint.pth ├── dv3 ├── __init__.py ├── audio.py ├── compute_timestamp_ratio.py ├── deepvoice3_pytorch │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── builder.cpython-36.pyc │ │ ├── conv.cpython-36.pyc │ │ ├── deepvoice3.cpython-36.pyc │ │ ├── modules.cpython-36.pyc │ │ └── version.cpython-36.pyc │ ├── builder.py │ ├── conv.py │ ├── deepvoice3.py │ ├── frontend │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ └── __init__.cpython-36.pyc │ │ ├── en │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ │ └── __init__.cpython-36.pyc │ │ ├── jp │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ │ └── __init__.cpython-36.pyc │ │ └── text │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── cleaners.cpython-36.pyc │ │ │ ├── cmudict.cpython-36.pyc │ │ │ ├── numbers.cpython-36.pyc │ │ │ └── symbols.cpython-36.pyc │ │ │ ├── cleaners.py │ │ │ ├── cmudict.py │ │ │ ├── numbers.py │ │ │ └── symbols.py │ ├── modules.py │ ├── nyanko.py │ └── version.py ├── deepvoice3_vctk.json ├── hparams.py ├── jsut.py ├── ljspeech.py ├── lrschedule.py ├── preprocess.py ├── setup.py ├── synthesis.py ├── tests │ ├── test_conv.py │ ├── test_deepvoice3.py │ ├── test_embedding.py │ ├── test_frontend.py │ └── test_nyanko.py ├── train.py ├── vctk.py └── vctk_preprocess │ ├── .gitignore │ ├── README.md │ ├── extract_feats.py │ ├── prepare_htk_alignments_vctk.py │ └── prepare_vctk_labels.py ├── setup.py ├── speaker_adaptation.py ├── train_dv3.py ├── train_encoder.py ├── train_whole.py └── utils.py /AudioSamples/step000009300_predicted.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/AudioSamples/step000009300_predicted.wav -------------------------------------------------------------------------------- /AudioSamples/step000009400_predicted.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/AudioSamples/step000009400_predicted.wav -------------------------------------------------------------------------------- /AudioSamples/step000009500_predicted.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/AudioSamples/step000009500_predicted.wav -------------------------------------------------------------------------------- /Cloning_Audio/cloning_text.txt: -------------------------------------------------------------------------------- 1 | Prosecuters have opened a massive investigation into allegations of fixiing games and illegal betting. 2 | Different telescope designs perfor differently and have different strengths and weaknesses. 3 | We can continue to strengthen the education of good lawyers. 4 | Feedback must be timely and accurate throughout the project. 
5 | Humans should also judge the distance by using relative sizes of the objects. 6 | Churches should not encourage it or make it look harmless. 7 | Learn about setting up wireless network confriguration. 8 | You can eat them fresh cooked or fermented. 9 | If this is true then those who tend to think cretively really are somehow different. 10 | She will likely jump for joy and want to skip straight for the honeymoon. 11 | The sugar syrup should create very fine strands of sugar that drape of the handles. 12 | But really in the grand scheme of things this information is insignificant. 13 | I let the positive overrule the negetive. 14 | He wiped his brow with his forearm. 15 | Instead of fixing it they give it a nickname. 16 | About half the people who are infected also lose wheight. 17 | The second half of the book focuses on argument and essay writing. 18 | We have the means to help ourselves. 19 | The large items are put into containers for disposal. 20 | He loves to watch me drink this stuff. 21 | Still it is an odd fashion choice. 22 | Funding is always an issue after the fact. 23 | Let us encourage each other. 24 | -------------------------------------------------------------------------------- /Cloning_Audio/speakers_cloned_voices_mel.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/Cloning_Audio/speakers_cloned_voices_mel.p -------------------------------------------------------------------------------- /Encoder.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | import librosa 6 | import torch.nn.functional as F 7 | from Modules.SpectralProcessing import SpectralProcessing 8 | from Modules.TemporalProcessing import TemporalProcessing 9 | from Modules.CloningSamplesAttention import CloningSamplesAttention 10 | 11 | 12 | class Encoder(nn.Module): 13 | global batch_size 14 | global N_samples 15 | def __init__(self): 16 | super(Encoder, self).__init__() 17 | self.spectral_layer = SpectralProcessing(80) 18 | self.temporal_layer = TemporalProcessing() 19 | self.cloning_attention_layer = CloningSamplesAttention() 20 | 21 | def forward(self, x): 22 | #print(x) 23 | x = self.spectral_layer(x) 24 | x = self.temporal_layer(x) 25 | x = self.cloning_attention_layer(x) 26 | 27 | print(x.size()) 28 | 29 | return x 30 | 31 | 32 | 33 | #def Temp_Masking(x): 34 | #Create function for temporal masking. Use librosa.decompose.hpss. Split and concatinate dimensions to make it 2D. 
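# A minimal sketch of the Temp_Masking helper described in the comment above,
# assuming the encoder input is a (batch, N_samples, T, 80) tensor of normalized
# (non-negative) mel frames: flatten to 2-D so librosa.decompose.hpss can run,
# keep the harmonic component as the temporally masked signal, and restore the
# original layout. The shapes and the choice of the harmonic part are assumptions,
# not taken from the original repository.
def Temp_Masking(x):
    b, n, t, f = x.shape
    flat = x.detach().cpu().numpy().reshape(b * n * t, f).T    # 2-D: (mel bins, frames)
    harmonic, _ = librosa.decompose.hpss(flat)                 # harmonic / percussive split
    masked = torch.from_numpy(harmonic.T.reshape(b, n, t, f))  # back to (b, N_samples, T, 80)
    return masked.to(x.device, dtype=x.dtype)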
-------------------------------------------------------------------------------- /Img/Epoch Loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/Img/Epoch Loss.png -------------------------------------------------------------------------------- /Img/Workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/Img/Workflow.png -------------------------------------------------------------------------------- /Modules/Attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import sys 5 | 6 | from Modules.MultiHeadAttention import MultiHeadAttention 7 | 8 | class Attention(nn.Module): 9 | def __init__(self, dim): 10 | super(Attention, self).__init__() 11 | 12 | self.encoders = self._build_model(dim) 13 | 14 | def _build_model(self, dim): 15 | layers = [] 16 | dim = dim 17 | layers.append(MultiHeadAttention(dim, dim, dim)) 18 | 19 | return nn.ModuleList(layers) 20 | 21 | def forward(self, inputs): 22 | net_inputs = inputs 23 | net_inputs.contiguous() 24 | for enc in self.encoders: 25 | net_inputs = enc(net_inputs, net_inputs) 26 | return net_inputs 27 | -------------------------------------------------------------------------------- /Modules/CloningSamplesAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import sys 5 | 6 | from Modules.Attention import Attention 7 | 8 | class CloningSamplesAttention(nn.Module): 9 | ''' 10 | Implementation of the the last Cloning sample attention part. 11 | Implementation includes residual linear connection,Multiheadattentionlayer, 12 | and linear layers. 13 | ''' 14 | 15 | def __init__(self): 16 | super(CloningSamplesAttention,self).__init__() 17 | self.residual_linear_layer = nn.Linear(128,512) 18 | self.attention = Attention(128) 19 | self.fc_after_attention = nn.Linear(128,1) 20 | 21 | def forward(self,x): 22 | 23 | residual_linear_x = self.residual_linear_layer(x) 24 | x.contiguous() 25 | # attention layer 26 | x = self.attention(x) 27 | # linear layers 28 | x = self.fc_after_attention(x) 29 | x = torch.squeeze(x) 30 | x = F.softsign(x) 31 | x = F.normalize(x, dim = 1) 32 | x = torch.unsqueeze(x, dim=2) 33 | x = torch.bmm(x.transpose(1,2), residual_linear_x) 34 | x = torch.squeeze(x) 35 | 36 | return x 37 | -------------------------------------------------------------------------------- /Modules/Conv1dGLU.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | import sys 7 | 8 | class Conv1dGLU(nn.Module): 9 | ''' 10 | Implementation of the Conv1d + GLU(Gated Linear Unit) 11 | with residual connection. 12 | For GLU refer to https://arxiv.org/abs/1612.08083 paper. 
13 | ''' 14 | def __init__(self, in_channels=128, out_channels=128,padding = None, 15 | dilation = 2,kernel_size=12,*args, **kwargs): 16 | super(Conv1dGLU, self).__init__() 17 | if padding == None: 18 | padding = int(((kernel_size-1)/2)*dilation) 19 | self.conv1 = nn.Conv1d(in_channels, out_channels=2 * out_channels, 20 | padding=padding, dilation = dilation, 21 | kernel_size=kernel_size) 22 | 23 | def forward(self, x): 24 | residual = x 25 | x = self.conv1(x) 26 | x1, x2 = torch.split(x, split_size_or_sections = 128, dim = 1) 27 | x = x1 * torch.sigmoid(x2) 28 | x += residual 29 | x *= math.sqrt(0.5) 30 | return x 31 | 32 | -------------------------------------------------------------------------------- /Modules/MultiHeadAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import torch.nn.parameter as parameter 6 | 7 | class MultiHeadAttention(nn.Module): 8 | def __init__(self, query_dim, key_dim, num_units, dropout_p=0.5, h=2, is_masked=False): 9 | super(MultiHeadAttention, self).__init__() 10 | 11 | if query_dim != key_dim: 12 | raise ValueError("query_dim and key_dim must be the same") 13 | if num_units % h != 0: 14 | raise ValueError("num_units must be dividable by h") 15 | if query_dim != num_units: 16 | raise ValueError("to employ residual connection, the number of query_dim and num_units must be the same") 17 | self.cuda = False 18 | if torch.cuda.is_available(): 19 | self.cuda=True 20 | 21 | self._num_units = num_units 22 | self._h = h 23 | if self.cuda: 24 | self._key_dim = Variable(torch.cuda.FloatTensor([key_dim])) 25 | else: 26 | self._key_dim = Variable(torch.FloatTensor([key_dim])) 27 | self._dropout_p = dropout_p 28 | self._is_masked = is_masked 29 | 30 | self.query_layer = nn.Linear(query_dim, num_units, bias=False) 31 | self.key_layer = nn.Linear(key_dim, num_units, bias=False) 32 | self.value_layer = nn.Linear(key_dim, num_units, bias=False) 33 | #self.bn = nn.BatchNorm1d(num_units) 34 | 35 | def forward(self, query, keys): 36 | Q = F.elu(self.query_layer(query)) 37 | K = F.elu(self.key_layer(keys)) 38 | V = F.elu(self.value_layer(keys)) 39 | 40 | chunk_size = int(self._num_units / self._h) 41 | Q = torch.cat(Q.split(split_size=chunk_size, dim=2), dim=0) 42 | K = torch.cat(K.split(split_size=chunk_size, dim=2), dim=0) 43 | V = torch.cat(V.split(split_size=chunk_size, dim=2), dim=0) 44 | 45 | attention = torch.matmul(Q, K.transpose(1, 2)) 46 | attention = attention / torch.sqrt(self._key_dim) 47 | 48 | if self._is_masked: 49 | diag_vals = attention[0].sign().abs() 50 | diag_mat = diag_vals.tril() 51 | diag_mat = diag_mat.unsqueeze(0).expand(attention.size()) 52 | 53 | mask = Variable( 54 | torch.ones(diag_mat.size()).cuda.FloatTensor * (-2**32 + 1), requires_grad=False) 55 | 56 | attention = (attention * diag_mat) + (mask * (diag_mat-1).abs()) 57 | attention = F.softmax(attention, dim=-1) 58 | attention = F.dropout(attention, self._dropout_p) 59 | attention = torch.matmul(attention, V) 60 | restore_chunk_size = int(attention.size(0) / self._h) 61 | attention = torch.cat( 62 | attention.split(split_size=restore_chunk_size, dim=0), dim=2) 63 | attention += query 64 | attention = attention.transpose(1, 2) 65 | attention.contiguous() 66 | #attention = self.bn(attention).transpose(1, 2) 67 | 68 | attention = F.normalize(attention, dim = 1).transpose(1, 2) 69 | return attention 
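# Shape sketch, assuming the encoder's setting where query == keys (self-attention
# over the cloning-sample embeddings): the 23 samples and 128-d embeddings match the
# values used elsewhere in this repository, while the batch size of 4 is illustrative.
if __name__ == "__main__":
    mha = MultiHeadAttention(query_dim=128, key_dim=128, num_units=128, h=2)
    x = Variable(torch.randn(4, 23, 128))    # (batch, N_samples, embedding dim)
    out = mha(x, x)                          # residual multi-head self-attention
    print(out.size())                        # torch.Size([4, 23, 128])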
-------------------------------------------------------------------------------- /Modules/SpectralProcessing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class PreNet(nn.Module): 7 | ''' 8 | 2-layer prenet 9 | 1st is the linear layer.2nd is the elu activation layer 10 | ''' 11 | 12 | def __init__(self , f_mel=80,f_mapped=128): 13 | super(PreNet,self).__init__() 14 | self.linear_1 = nn.Linear(f_mel,f_mapped) 15 | 16 | def forward(self,x): 17 | x = F.elu(self.linear_1(x)) 18 | return x 19 | 20 | class SpectralProcessing(nn.Module): 21 | ''' 22 | Spectral Transformation layer that transforms mel 23 | spectogram to size 128 24 | ''' 25 | def __init__(self,f_mel=80): 26 | super(SpectralProcessing,self).__init__() 27 | self.prenet_1 = PreNet(f_mel,128) 28 | 29 | def forward(self,x): 30 | mapped_x = self.prenet_1(x) 31 | 32 | return mapped_x 33 | -------------------------------------------------------------------------------- /Modules/TemporalProcessing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | from Modules.Conv1dGLU import Conv1dGLU 7 | 8 | N_samples = 23 9 | 10 | def Temp_Masking(x): 11 | ''' 12 | Create function for temporal masking. Use librosa.decompose.hpss. 13 | Split and concatinate dimensions to make it 2D. 14 | 15 | ''' 16 | pass 17 | 18 | 19 | class TemporalProcessing(nn.Module): 20 | ''' 21 | Implementation of Temporal Processing Layers 22 | ''' 23 | 24 | def __init__(self,in_channels=128, out_channels=128,padding = None, 25 | dilation = 2,kernel_size=12): 26 | super(TemporalProcessing,self).__init__() 27 | self.conv1d_glu = Conv1dGLU(in_channels,out_channels,padding,dilation, 28 | kernel_size) 29 | 30 | 31 | 32 | def forward(self,x): 33 | batch_size = x.size(0) 34 | # transpose to do operation on the temporal dimension 35 | x = x.view(batch_size*N_samples, x.size(2), x.size(3)).transpose(1,2) 36 | x = self.conv1d_glu(x) 37 | x = x.transpose(1,2) 38 | 39 | x.contiguous() 40 | x = x.view(batch_size,N_samples,x.size(1),x.size(2)) 41 | #x = librosa.decompose.hpss(x)[0] 42 | # temporal masking on x 43 | x = x.mean(dim=2) 44 | 45 | return x -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural_Voice_Cloning 2 | 1. Baidu Research **[Link](https://arxiv.org/pdf/1802.06006.pdf)** 3 | 2. Tested Speaker Audio **[Link](https://visionbrain.github.io/voicecloning.github.io/)** 4 | 5 | ### Abstract : 6 | * **Voice cloning is a highly desired feature for personalized speech interfaces. We introduce a neural voice cloning system that learns to synthesize a person’s voice from only a few audio samples. System that learns to synthesize a person’s voice from only a few audio samples. We study two approaches: speaker adaptation and speaker encoding.** 7 | * **Speaker adaptation is based on fine-tuning a multi-speaker generative model. Speaker encoding is based on training a separate model to directly infer a new speaker embedding, which will be applied to a multi-speaker generative model. 
While speaker adaptation can achieve slightly better naturalness and similarity, the cloning time and required memory for the speaker encoding approach are significantly less, making it more favorable for low-resource deployment.** 8 | 9 | ### Steps : 10 |
![Image](Img/Workflow.png) 11 | 12 | 
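A minimal sketch of the speaker-encoding path, assuming a trained speaker encoder (`checkpoints/encoder_checkpoint.pth`) and a trained multi-speaker DeepVoice3 model; the layout of `Cloning_Audio/speakers_cloned_voices_mel.p`, the placeholder `dv3_checkpoint.pth` path, and the way the predicted embedding is injected into the generative model are assumptions, not the exact logic of `train_encoder.py` / `speaker_adaptation.py` / `train_whole.py`:

```python
import pickle
import torch

from Encoder import Encoder
from dv3 import build_deepvoice_3

# 1. Speaker encoder: a few mel-spectrogram cloning samples -> one speaker embedding.
encoder = Encoder()
state = torch.load("checkpoints/encoder_checkpoint.pth", map_location="cpu")
encoder.load_state_dict(state.get("state_dict", state))  # checkpoint layout assumed
encoder.eval()

# Assumed pickle layout: mel frames shaped (batch, N_samples=23, T, 80).
with open("Cloning_Audio/speakers_cloned_voices_mel.p", "rb") as f:
    cloning_mels = pickle.load(f)

with torch.no_grad():
    speaker_embedding = encoder(torch.as_tensor(cloning_mels).float())

# 2. Multi-speaker DeepVoice3 (dv3/): synthesize new text conditioned on the speaker.
#    "dv3_checkpoint.pth" is a placeholder for a trained DeepVoice3 checkpoint; the
#    predicted embedding stands in for the model's looked-up speaker embedding
#    (MultiSpeakerTTSModel.embed_speakers) during synthesis.
dv3_model = build_deepvoice_3(checkpoint_path="dv3_checkpoint.pth")
```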
13 | 14 | ### Audio : 15 | Tested Speaker Audio **[Link](https://visionbrain.github.io/voicecloning.github.io/)** 16 | * But don't expect anything right. 17 | * I won't make an official complaint. 18 | * They make a selective perception process. 19 | 20 | ### Made By- 21 | * **[VisionBrain](https://visionbrain.org) & Team** 22 | * **Project Lead - [Aryan Karn](https://github.com/Aryan05)** 23 | -------------------------------------------------------------------------------- /checkpoints/encoder_checkpoint.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/checkpoints/encoder_checkpoint.pth -------------------------------------------------------------------------------- /dv3/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import librosa 4 | import librosa.display 5 | # import IPython 6 | # from IPython.display import Audio 7 | # need this for English text processing frontend 8 | import nltk 9 | 10 | import dv3.train 11 | import dv3.synthesis 12 | # print(os.getcwd()) 13 | 14 | import dv3.hparams 15 | from dv3.hparams import hparams, hparams_debug_string 16 | import json 17 | 18 | from dv3.train import build_model 19 | from dv3.train import restore_parts, load_checkpoint 20 | from dv3.synthesis import tts as _tts 21 | 22 | 23 | from dv3.deepvoice3_pytorch import frontend 24 | 25 | # print(os.getcwd()) 26 | 27 | 28 | def build_deepvoice_3(preset = None ,checkpoint_path = None): 29 | if preset is None: 30 | preset = "./dv3/deepvoice3_vctk.json" 31 | 32 | # Newly added params. Need to inject dummy values 33 | for dummy, v in [("fmin", 0), ("fmax", 0), 34 | ("rescaling", False), 35 | ("rescaling_max", 0.999), 36 | ("allow_clipping_in_normalization", False)]: 37 | 38 | if hparams.get(dummy) is None: 39 | hparams.add_hparam(dummy, v) 40 | # Load parameters from preset 41 | with open(preset) as f: 42 | hparams.parse_json(f.read()) 43 | 44 | # Tell we are using multi-speaker DeepVoice3 45 | hparams.builder = "deepvoice3_multispeaker" 46 | 47 | # Inject frontend text processor 48 | dv3.synthesis._frontend = getattr(frontend, "en") 49 | dv3.train._frontend = getattr(frontend, "en") 50 | 51 | # alises 52 | fs = hparams.sample_rate 53 | hop_length = hparams.hop_size 54 | model = build_model() 55 | 56 | if checkpoint_path is not None: 57 | model = load_checkpoint(checkpoint_path, model, None, True) 58 | 59 | 60 | 61 | return model 62 | # model = build_deepvoice_3() 63 | -------------------------------------------------------------------------------- /dv3/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from dv3.hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def preemphasis(x): 22 | from nnmnkwii.preprocessing import preemphasis 23 | return preemphasis(x, hparams.preemphasis) 24 | 25 | 26 | def inv_preemphasis(x): 27 | from nnmnkwii.preprocessing import inv_preemphasis 28 | return inv_preemphasis(x, hparams.preemphasis) 29 | 
30 | 31 | def spectrogram(y): 32 | D = _lws_processor().stft(preemphasis(y)).T 33 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 34 | return _normalize(S) 35 | 36 | 37 | def inv_spectrogram(spectrogram): 38 | '''Converts spectrogram to waveform using librosa''' 39 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 40 | processor = _lws_processor() 41 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 42 | y = processor.istft(D).astype(np.float32) 43 | return inv_preemphasis(y) 44 | 45 | 46 | def melspectrogram(y): 47 | D = _lws_processor().stft(preemphasis(y)).T 48 | S = _amp_to_db(_linear_to_mel(np.abs(D))) 49 | return _normalize(S) 50 | 51 | 52 | def _lws_processor(): 53 | return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech") 54 | 55 | 56 | # Conversions: 57 | 58 | 59 | _mel_basis = None 60 | 61 | 62 | def _linear_to_mel(spectrogram): 63 | global _mel_basis 64 | if _mel_basis is None: 65 | _mel_basis = _build_mel_basis() 66 | return np.dot(_mel_basis, spectrogram) 67 | 68 | 69 | def _build_mel_basis(): 70 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels) 71 | 72 | 73 | def _amp_to_db(x): 74 | return 20 * np.log10(np.maximum(1e-5, x)) 75 | 76 | 77 | def _db_to_amp(x): 78 | return np.power(10.0, x * 0.05) 79 | 80 | 81 | def _normalize(S): 82 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 83 | 84 | 85 | def _denormalize(S): 86 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 87 | -------------------------------------------------------------------------------- /dv3/compute_timestamp_ratio.py: -------------------------------------------------------------------------------- 1 | """Compute output/input timestamp ratio. 2 | 3 | usage: compute_timestamp_ratio.py [options] 4 | 5 | options: 6 | --hparams= Hyper parameters [default: ]. 
7 | -h, --help Show this help message and exit 8 | """ 9 | from docopt import docopt 10 | import sys 11 | import numpy as np 12 | from dv3.hparams import hparams, hparams_debug_string 13 | import dv3.train 14 | from dv3.train import TextDataSource, MelSpecDataSource 15 | from nnmnkwii.datasets import FileSourceDataset 16 | from tqdm import trange 17 | from dv3.deepvoice3_pytorch import frontend 18 | 19 | if __name__ == "__main__": 20 | args = docopt(__doc__) 21 | data_root = args[""] 22 | 23 | # Override hyper parameters 24 | hparams.parse(args["--hparams"]) 25 | assert hparams.name == "deepvoice3" 26 | 27 | train._frontend = getattr(frontend, hparams.frontend) 28 | 29 | # Code below 30 | X = FileSourceDataset(TextDataSource(data_root)) 31 | Mel = FileSourceDataset(MelSpecDataSource(data_root)) 32 | 33 | in_sizes = [] 34 | out_sizes = [] 35 | for i in trange(len(X)): 36 | x, m = X[i], Mel[i] 37 | if X.file_data_source.multi_speaker: 38 | x = x[0] 39 | in_sizes.append(x.shape[0]) 40 | out_sizes.append(m.shape[0]) 41 | 42 | in_sizes = np.array(in_sizes) 43 | out_sizes = np.array(out_sizes) 44 | 45 | input_timestamps = np.sum(in_sizes) 46 | output_timestamps = np.sum(out_sizes) / hparams.outputs_per_step / hparams.downsample_step 47 | 48 | print(input_timestamps, output_timestamps, output_timestamps / input_timestamps) 49 | sys.exit(0) -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from .version import __version__ 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from .modules import Embedding 9 | 10 | 11 | class MultiSpeakerTTSModel(nn.Module): 12 | """Attention seq2seq model + post processing network 13 | """ 14 | 15 | def __init__(self, seq2seq, postnet, 16 | mel_dim=80, linear_dim=513, 17 | n_speakers=1, speaker_embed_dim=16, padding_idx=None, 18 | trainable_positional_encodings=False, 19 | use_decoder_state_for_postnet_input=False, 20 | speaker_embedding_weight_std=0.01, 21 | freeze_embedding=False): 22 | super(MultiSpeakerTTSModel, self).__init__() 23 | self.seq2seq = seq2seq 24 | self.postnet = postnet # referred as "Converter" in DeepVoice3 25 | self.mel_dim = mel_dim 26 | self.linear_dim = linear_dim 27 | self.trainable_positional_encodings = trainable_positional_encodings 28 | self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input 29 | self.freeze_embedding = freeze_embedding 30 | 31 | # Speaker embedding 32 | if n_speakers > 1: 33 | self.embed_speakers = Embedding( 34 | n_speakers, speaker_embed_dim, padding_idx=None, 35 | std=speaker_embedding_weight_std) 36 | self.n_speakers = n_speakers 37 | self.speaker_embed_dim = speaker_embed_dim 38 | 39 | def make_generation_fast_(self): 40 | 41 | def remove_weight_norm(m): 42 | try: 43 | nn.utils.remove_weight_norm(m) 44 | except ValueError: # this module didn't have weight norm 45 | return 46 | self.apply(remove_weight_norm) 47 | 48 | def get_trainable_parameters(self): 49 | freezed_param_ids = set() 50 | 51 | encoder, decoder = self.seq2seq.encoder, self.seq2seq.decoder 52 | 53 | # Avoid updating the position encoding 54 | if not self.trainable_positional_encodings: 55 | pe_query_param_ids = set(map(id, decoder.embed_query_positions.parameters())) 56 | pe_keys_param_ids = set(map(id, decoder.embed_keys_positions.parameters())) 57 | freezed_param_ids |= (pe_query_param_ids | pe_keys_param_ids) 58 | # Avoid updating the text embedding 
59 | if self.freeze_embedding: 60 | embed_param_ids = set(map(id, encoder.embed_tokens.parameters())) 61 | freezed_param_ids |= embed_param_ids 62 | 63 | return (p for p in self.parameters() if id(p) not in freezed_param_ids) 64 | 65 | def forward(self, text_sequences, mel_targets=None, speaker_ids=None, 66 | text_positions=None, frame_positions=None, input_lengths=None): 67 | B = text_sequences.size(0) 68 | 69 | if speaker_ids is not None: 70 | assert self.n_speakers > 1 71 | speaker_embed = self.embed_speakers(speaker_ids) 72 | else: 73 | speaker_embed = None 74 | 75 | # Apply seq2seq 76 | # (B, T//r, mel_dim*r) 77 | mel_outputs, alignments, done, decoder_states = self.seq2seq( 78 | text_sequences, mel_targets, speaker_embed, 79 | text_positions, frame_positions, input_lengths) 80 | 81 | # Reshape 82 | # (B, T, mel_dim) 83 | mel_outputs = mel_outputs.view(B, -1, self.mel_dim) 84 | 85 | # Prepare postnet inputs 86 | if self.use_decoder_state_for_postnet_input: 87 | postnet_inputs = decoder_states.view(B, mel_outputs.size(1), -1) 88 | else: 89 | postnet_inputs = mel_outputs 90 | 91 | # (B, T, linear_dim) 92 | # Convert coarse mel-spectrogram (or decoder hidden states) to 93 | # high resolution spectrogram 94 | linear_outputs = self.postnet(postnet_inputs, speaker_embed) 95 | assert linear_outputs.size(-1) == self.linear_dim 96 | 97 | return mel_outputs, linear_outputs, alignments, done 98 | 99 | 100 | class AttentionSeq2Seq(nn.Module): 101 | """Encoder + Decoder with attention 102 | """ 103 | 104 | def __init__(self, encoder, decoder): 105 | super(AttentionSeq2Seq, self).__init__() 106 | self.encoder = encoder 107 | self.decoder = decoder 108 | if isinstance(self.decoder.attention, nn.ModuleList): 109 | self.encoder.num_attention_layers = sum( 110 | [layer is not None for layer in decoder.attention]) 111 | 112 | def forward(self, text_sequences, mel_targets=None, speaker_embed=None, 113 | text_positions=None, frame_positions=None, input_lengths=None): 114 | # (B, T, text_embed_dim) 115 | encoder_outputs = self.encoder( 116 | text_sequences, lengths=input_lengths, speaker_embed=speaker_embed) 117 | 118 | # Mel: (B, T//r, mel_dim*r) 119 | # Alignments: (N, B, T_target, T_input) 120 | # Done: (B, T//r, 1) 121 | mel_outputs, alignments, done, decoder_states = self.decoder( 122 | encoder_outputs, mel_targets, 123 | text_positions=text_positions, frame_positions=frame_positions, 124 | speaker_embed=speaker_embed, lengths=input_lengths) 125 | 126 | return mel_outputs, alignments, done, decoder_states 127 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/__pycache__/builder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/builder.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/__pycache__/conv.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/conv.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/__pycache__/deepvoice3.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/deepvoice3.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/__pycache__/version.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/__pycache__/version.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from dv3.deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 5 | 6 | 7 | def deepvoice3(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, 8 | downsample_step=1, 9 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 10 | dropout=(1 - 0.95), kernel_size=5, 11 | encoder_channels=128, 12 | decoder_channels=256, 13 | converter_channels=256, 14 | query_position_rate=1.0, 15 | key_position_rate=1.29, 16 | use_memory_mask=False, 17 | trainable_positional_encodings=False, 18 | force_monotonic_attention=True, 19 | use_decoder_state_for_postnet_input=True, 20 | max_positions=512, 21 | embedding_weight_std=0.1, 22 | speaker_embedding_weight_std=0.01, 23 | freeze_embedding=False, 24 | window_ahead=3, 25 | window_backward=1, 26 | key_projection=False, 27 | value_projection=False, 28 | ): 29 | """Build deepvoice3 30 | """ 31 | from dv3.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 32 | 33 | time_upsampling = max(downsample_step // r, 1) 34 | 35 | # Seq2seq 36 | h = encoder_channels # hidden dim (channels) 37 | k = kernel_size # kernel size 38 | encoder = Encoder( 39 | n_vocab, embed_dim, padding_idx=padding_idx, 40 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 41 | dropout=dropout, max_positions=max_positions, 42 | embedding_weight_std=embedding_weight_std, 43 | # (channels, kernel_size, dilation) 44 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 45 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 46 | (h, k, 1), (h, k, 3)], 47 | ) 48 | 49 | h = decoder_channels 50 | decoder = Decoder( 51 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 52 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 53 | dropout=dropout, max_positions=max_positions, 54 | preattention=[(h, k, 1), (h, k, 3)], 55 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 56 | (h, k, 1)], 57 | attention=[True, False, False, False, True], 58 | force_monotonic_attention=force_monotonic_attention, 
59 | query_position_rate=query_position_rate, 60 | key_position_rate=key_position_rate, 61 | use_memory_mask=use_memory_mask, 62 | window_ahead=window_ahead, 63 | window_backward=window_backward, 64 | key_projection=key_projection, 65 | value_projection=value_projection, 66 | ) 67 | 68 | seq2seq = AttentionSeq2Seq(encoder, decoder) 69 | 70 | # Post net 71 | if use_decoder_state_for_postnet_input: 72 | in_dim = h // r 73 | else: 74 | in_dim = mel_dim 75 | h = converter_channels 76 | converter = Converter( 77 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 78 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 79 | time_upsampling=time_upsampling, 80 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], 81 | ) 82 | 83 | # Seq2seq + post net 84 | model = MultiSpeakerTTSModel( 85 | seq2seq, converter, padding_idx=padding_idx, 86 | mel_dim=mel_dim, linear_dim=linear_dim, 87 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 88 | trainable_positional_encodings=trainable_positional_encodings, 89 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 90 | speaker_embedding_weight_std=speaker_embedding_weight_std, 91 | freeze_embedding=freeze_embedding) 92 | 93 | return model 94 | 95 | 96 | def nyanko(n_vocab, embed_dim=128, mel_dim=80, linear_dim=513, r=1, 97 | downsample_step=4, 98 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 99 | dropout=(1 - 0.95), kernel_size=3, 100 | encoder_channels=256, 101 | decoder_channels=256, 102 | converter_channels=512, 103 | query_position_rate=1.0, 104 | key_position_rate=1.29, 105 | use_memory_mask=False, 106 | trainable_positional_encodings=False, 107 | force_monotonic_attention=True, 108 | use_decoder_state_for_postnet_input=False, 109 | max_positions=512, embedding_weight_std=0.01, 110 | speaker_embedding_weight_std=0.01, 111 | freeze_embedding=False, 112 | window_ahead=3, 113 | window_backward=1, 114 | key_projection=False, 115 | value_projection=False, 116 | ): 117 | from dv3.deepvoice3_pytorch.nyanko import Encoder, Decoder, Converter 118 | assert encoder_channels == decoder_channels 119 | 120 | if n_speakers != 1: 121 | raise ValueError("Multi-speaker is not supported") 122 | if not (downsample_step == 4 and r == 1): 123 | raise ValueError("Not supported. 
You need to change hardcoded parameters") 124 | 125 | # Seq2seq 126 | encoder = Encoder( 127 | n_vocab, embed_dim, channels=encoder_channels, kernel_size=kernel_size, 128 | padding_idx=padding_idx, 129 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 130 | dropout=dropout, embedding_weight_std=embedding_weight_std, 131 | ) 132 | 133 | decoder = Decoder( 134 | embed_dim, in_dim=mel_dim, r=r, channels=decoder_channels, 135 | kernel_size=kernel_size, padding_idx=padding_idx, 136 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 137 | dropout=dropout, max_positions=max_positions, 138 | force_monotonic_attention=force_monotonic_attention, 139 | query_position_rate=query_position_rate, 140 | key_position_rate=key_position_rate, 141 | use_memory_mask=use_memory_mask, 142 | window_ahead=window_ahead, 143 | window_backward=window_backward, 144 | key_projection=key_projection, 145 | value_projection=value_projection, 146 | ) 147 | 148 | seq2seq = AttentionSeq2Seq(encoder, decoder) 149 | 150 | if use_decoder_state_for_postnet_input: 151 | in_dim = decoder_channels // r 152 | else: 153 | in_dim = mel_dim 154 | 155 | converter = Converter( 156 | in_dim=in_dim, out_dim=linear_dim, channels=converter_channels, 157 | kernel_size=kernel_size, dropout=dropout) 158 | 159 | # Seq2seq + post net 160 | model = MultiSpeakerTTSModel( 161 | seq2seq, converter, padding_idx=padding_idx, 162 | mel_dim=mel_dim, linear_dim=linear_dim, 163 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 164 | trainable_positional_encodings=trainable_positional_encodings, 165 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 166 | speaker_embedding_weight_std=speaker_embedding_weight_std, 167 | freeze_embedding=freeze_embedding) 168 | 169 | return model 170 | 171 | 172 | def deepvoice3_multispeaker(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, 173 | downsample_step=1, 174 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 175 | dropout=(1 - 0.95), kernel_size=5, 176 | encoder_channels=128, 177 | decoder_channels=256, 178 | converter_channels=256, 179 | query_position_rate=1.0, 180 | key_position_rate=1.29, 181 | use_memory_mask=False, 182 | trainable_positional_encodings=False, 183 | force_monotonic_attention=True, 184 | use_decoder_state_for_postnet_input=True, 185 | max_positions=512, 186 | embedding_weight_std=0.1, 187 | speaker_embedding_weight_std=0.01, 188 | freeze_embedding=False, 189 | window_ahead=3, 190 | window_backward=1, 191 | key_projection=True, 192 | value_projection=True, 193 | ): 194 | """Build multi-speaker deepvoice3 195 | """ 196 | from dv3.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 197 | 198 | time_upsampling = max(downsample_step // r, 1) 199 | 200 | # Seq2seq 201 | h = encoder_channels # hidden dim (channels) 202 | k = kernel_size # kernel size 203 | encoder = Encoder( 204 | n_vocab, embed_dim, padding_idx=padding_idx, 205 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 206 | dropout=dropout, max_positions=max_positions, 207 | embedding_weight_std=embedding_weight_std, 208 | # (channels, kernel_size, dilation) 209 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 210 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 211 | (h, k, 1), (h, k, 3)], 212 | ) 213 | 214 | h = decoder_channels 215 | decoder = Decoder( 216 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 217 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 218 | dropout=dropout, max_positions=max_positions, 219 | 
preattention=[(h, k, 1)], 220 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 221 | (h, k, 1)], 222 | attention=[True, False, False, False, False], 223 | force_monotonic_attention=force_monotonic_attention, 224 | query_position_rate=query_position_rate, 225 | key_position_rate=key_position_rate, 226 | use_memory_mask=use_memory_mask, 227 | window_ahead=window_ahead, 228 | window_backward=window_backward, 229 | key_projection=key_projection, 230 | value_projection=value_projection, 231 | ) 232 | 233 | seq2seq = AttentionSeq2Seq(encoder, decoder) 234 | 235 | # Post net 236 | if use_decoder_state_for_postnet_input: 237 | in_dim = h // r 238 | else: 239 | in_dim = mel_dim 240 | h = converter_channels 241 | converter = Converter( 242 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 243 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 244 | time_upsampling=time_upsampling, 245 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], 246 | ) 247 | 248 | # Seq2seq + post net 249 | model = MultiSpeakerTTSModel( 250 | seq2seq, converter, padding_idx=padding_idx, 251 | mel_dim=mel_dim, linear_dim=linear_dim, 252 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 253 | trainable_positional_encodings=trainable_positional_encodings, 254 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 255 | speaker_embedding_weight_std=speaker_embedding_weight_std, 256 | freeze_embedding=freeze_embedding) 257 | 258 | return model 259 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Variable 5 | from torch.nn import functional as F 6 | 7 | 8 | class Conv1d(nn.Conv1d): 9 | """Extended nn.Conv1d for incremental dilated convolutions 10 | 11 | currently limited for odd number kernel sizes 12 | """ 13 | 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | self.clear_buffer() 17 | self._linearized_weight = None 18 | self.register_backward_hook(self._clear_linearized_weight) 19 | 20 | def incremental_forward(self, input): 21 | # input: (B, T, C) 22 | if self.training: 23 | raise RuntimeError('incremental_forward only supports eval mode') 24 | 25 | # run forward pre hooks (e.g., weight norm) 26 | for hook in self._forward_pre_hooks.values(): 27 | hook(self, input) 28 | 29 | # reshape weight 30 | weight = self._get_linearized_weight() 31 | kw = self.kernel_size[0] 32 | dilation = self.dilation[0] 33 | 34 | bsz = input.size(0) # input: bsz x len x dim 35 | if kw > 1: 36 | assert kw % 2 == 1 37 | input = input.data 38 | if self.input_buffer is None: 39 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2)) 40 | self.input_buffer.zero_() 41 | else: 42 | # shift buffer 43 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 44 | # append next input 45 | self.input_buffer[:, -1, :] = input[:, -1, :] 46 | input = torch.autograd.Variable(self.input_buffer, volatile=True) 47 | if dilation > 1: 48 | input = input[:, 0::dilation, :].contiguous() 49 | output = F.linear(input.view(bsz, -1), weight, self.bias) 50 | return output.view(bsz, 1, -1) 51 | 52 | def clear_buffer(self): 53 | self.input_buffer = None 54 | 55 | def _get_linearized_weight(self): 56 | if self._linearized_weight is None: 57 | kw = self.kernel_size[0] 58 | # nn.Conv1d 59 | if 
self.weight.size() == (self.out_channels, self.in_channels, kw): 60 | weight = self.weight.transpose(1, 2).contiguous() 61 | else: 62 | # fairseq.modules.conv_tbc.ConvTBC 63 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 64 | assert weight.size() == (self.out_channels, kw, self.in_channels) 65 | self._linearized_weight = weight.view(self.out_channels, -1) 66 | return self._linearized_weight 67 | 68 | def _clear_linearized_weight(self, *args): 69 | self._linearized_weight = None 70 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/deepvoice3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from torch.autograd import Variable 7 | import math 8 | import numpy as np 9 | 10 | from .modules import Conv1d, ConvTranspose1d, Embedding, Linear, GradMultiply 11 | from .modules import get_mask_from_lengths, SinusoidalEncoding, Conv1dGLU 12 | 13 | 14 | def expand_speaker_embed(inputs_btc, speaker_embed=None, tdim=1): 15 | if speaker_embed is None: 16 | return None 17 | # expand speaker embedding for all time steps 18 | # (B, N) -> (B, T, N) 19 | ss = speaker_embed.size() 20 | speaker_embed_btc = speaker_embed.unsqueeze(1).expand( 21 | ss[0], inputs_btc.size(tdim), ss[-1]) 22 | return speaker_embed_btc 23 | 24 | 25 | class Encoder(nn.Module): 26 | def __init__(self, n_vocab, embed_dim, n_speakers, speaker_embed_dim, 27 | padding_idx=None, embedding_weight_std=0.1, 28 | convolutions=((64, 5, .1),) * 7, 29 | max_positions=512, dropout=0.1, apply_grad_scaling=False): 30 | super(Encoder, self).__init__() 31 | self.dropout = dropout 32 | self.num_attention_layers = None 33 | self.apply_grad_scaling = apply_grad_scaling 34 | 35 | # Text input embeddings 36 | self.embed_tokens = Embedding( 37 | n_vocab, embed_dim, padding_idx, embedding_weight_std) 38 | 39 | # Speaker embedding 40 | if n_speakers > 1: 41 | self.speaker_fc1 = Linear(speaker_embed_dim, embed_dim, dropout=dropout) 42 | self.speaker_fc2 = Linear(speaker_embed_dim, embed_dim, dropout=dropout) 43 | self.n_speakers = n_speakers 44 | 45 | # Non causual convolution blocks 46 | in_channels = embed_dim 47 | self.convolutions = nn.ModuleList() 48 | std_mul = 1.0 49 | for (out_channels, kernel_size, dilation) in convolutions: 50 | if in_channels != out_channels: 51 | # Conv1d + ReLU 52 | self.convolutions.append( 53 | Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 54 | dilation=1, std_mul=std_mul)) 55 | self.convolutions.append(nn.ReLU(inplace=True)) 56 | in_channels = out_channels 57 | std_mul = 2.0 58 | self.convolutions.append( 59 | Conv1dGLU(n_speakers, speaker_embed_dim, 60 | in_channels, out_channels, kernel_size, causal=False, 61 | dilation=dilation, dropout=dropout, std_mul=std_mul, 62 | residual=True)) 63 | in_channels = out_channels 64 | std_mul = 4.0 65 | # Last 1x1 convolution 66 | self.convolutions.append(Conv1d(in_channels, embed_dim, kernel_size=1, 67 | padding=0, dilation=1, std_mul=std_mul, 68 | dropout=dropout)) 69 | 70 | def forward(self, text_sequences, text_positions=None, lengths=None, 71 | speaker_embed=None): 72 | assert self.n_speakers == 1 or speaker_embed is not None 73 | 74 | # embed text_sequences 75 | x = self.embed_tokens(text_sequences) 76 | x = F.dropout(x, p=self.dropout, training=self.training) 77 | 78 | # expand speaker embedding for all time steps 79 | speaker_embed_btc = 
expand_speaker_embed(x, speaker_embed) 80 | if speaker_embed_btc is not None: 81 | speaker_embed_btc = F.dropout(speaker_embed_btc, p=self.dropout, training=self.training) 82 | x = x + F.softsign(self.speaker_fc1(speaker_embed_btc)) 83 | 84 | input_embedding = x 85 | 86 | # B x T x C -> B x C x T 87 | x = x.transpose(1, 2) 88 | 89 | # 1D conv blocks 90 | for f in self.convolutions: 91 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 92 | 93 | # Back to B x T x C 94 | keys = x.transpose(1, 2) 95 | 96 | if speaker_embed_btc is not None: 97 | keys = keys + F.softsign(self.speaker_fc2(speaker_embed_btc)) 98 | 99 | # scale gradients (this only affects backward, not forward) 100 | if self.apply_grad_scaling and self.num_attention_layers is not None: 101 | keys = GradMultiply.apply(keys, 1.0 / (2.0 * self.num_attention_layers)) 102 | 103 | # add output to input embedding for attention 104 | values = (keys + input_embedding) * math.sqrt(0.5) 105 | 106 | return keys, values 107 | 108 | 109 | class AttentionLayer(nn.Module): 110 | def __init__(self, conv_channels, embed_dim, dropout=0.1, 111 | window_ahead=3, window_backward=1, 112 | key_projection=True, value_projection=True): 113 | super(AttentionLayer, self).__init__() 114 | self.query_projection = Linear(conv_channels, embed_dim) 115 | if key_projection: 116 | self.key_projection = Linear(embed_dim, embed_dim) 117 | # According to the DeepVoice3 paper, intiailize weights to same values 118 | # TODO: Does this really work well? not sure.. 119 | if conv_channels == embed_dim: 120 | self.key_projection.weight.data = self.query_projection.weight.data.clone() 121 | else: 122 | self.key_projection = None 123 | if value_projection: 124 | self.value_projection = Linear(embed_dim, embed_dim) 125 | else: 126 | self.value_projection = None 127 | 128 | self.out_projection = Linear(embed_dim, conv_channels) 129 | self.dropout = dropout 130 | self.window_ahead = window_ahead 131 | self.window_backward = window_backward 132 | 133 | def forward(self, query, encoder_out, mask=None, last_attended=None): 134 | keys, values = encoder_out 135 | residual = query 136 | if self.value_projection is not None: 137 | values = self.value_projection(values) 138 | # TODO: yes, this is inefficient 139 | if self.key_projection is not None: 140 | keys = self.key_projection(keys.transpose(1, 2)).transpose(1, 2) 141 | 142 | # attention 143 | x = self.query_projection(query) 144 | x = torch.bmm(x, keys) 145 | 146 | mask_value = -float("inf") 147 | if mask is not None: 148 | mask = mask.view(query.size(0), 1, -1) 149 | x.data.masked_fill_(mask, mask_value) 150 | 151 | if last_attended is not None: 152 | backward = last_attended - self.window_backward 153 | if backward > 0: 154 | x[:, :, :backward] = mask_value 155 | ahead = last_attended + self.window_ahead 156 | if ahead < x.size(-1): 157 | x[:, :, ahead:] = mask_value 158 | 159 | # softmax over last dim 160 | # (B, tgt_len, src_len) 161 | sz = x.size() 162 | x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1) 163 | x = x.view(sz) 164 | attn_scores = x 165 | 166 | x = F.dropout(x, p=self.dropout, training=self.training) 167 | 168 | x = torch.bmm(x, values) 169 | 170 | # scale attention output 171 | s = values.size(1) 172 | x = x * (s * math.sqrt(1.0 / s)) 173 | 174 | # project back 175 | x = self.out_projection(x) 176 | x = (x + residual) * math.sqrt(0.5) 177 | return x, attn_scores 178 | 179 | 180 | class Decoder(nn.Module): 181 | def __init__(self, embed_dim, n_speakers, speaker_embed_dim, 182 | in_dim=80, r=5, 
183 | max_positions=512, padding_idx=None, 184 | preattention=((128, 5, 1),) * 4, 185 | convolutions=((128, 5, 1),) * 4, 186 | attention=True, dropout=0.1, 187 | use_memory_mask=False, 188 | force_monotonic_attention=False, 189 | query_position_rate=1.0, 190 | key_position_rate=1.29, 191 | window_ahead=3, 192 | window_backward=1, 193 | key_projection=True, 194 | value_projection=True, 195 | ): 196 | super(Decoder, self).__init__() 197 | self.dropout = dropout 198 | self.in_dim = in_dim 199 | self.r = r 200 | self.query_position_rate = query_position_rate 201 | self.key_position_rate = key_position_rate 202 | 203 | in_channels = in_dim * r 204 | if isinstance(attention, bool): 205 | # expand True into [True, True, ...] and do the same with False 206 | attention = [attention] * len(convolutions) 207 | 208 | # Position encodings for query (decoder states) and keys (encoder states) 209 | self.embed_query_positions = SinusoidalEncoding( 210 | max_positions, convolutions[0][0], padding_idx) 211 | self.embed_keys_positions = SinusoidalEncoding( 212 | max_positions, embed_dim, padding_idx) 213 | # Used for compute multiplier for positional encodings 214 | if n_speakers > 1: 215 | self.speaker_proj1 = Linear(speaker_embed_dim, 1, dropout=dropout) 216 | self.speaker_proj2 = Linear(speaker_embed_dim, 1, dropout=dropout) 217 | else: 218 | self.speaker_proj1, self.speaker_proj2 = None, None 219 | 220 | # Prenet: causal convolution blocks 221 | self.preattention = nn.ModuleList() 222 | in_channels = in_dim * r 223 | std_mul = 1.0 224 | for out_channels, kernel_size, dilation in preattention: 225 | if in_channels != out_channels: 226 | # Conv1d + ReLU 227 | self.preattention.append( 228 | Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 229 | dilation=1, std_mul=std_mul)) 230 | self.preattention.append(nn.ReLU(inplace=True)) 231 | in_channels = out_channels 232 | std_mul = 2.0 233 | self.preattention.append( 234 | Conv1dGLU(n_speakers, speaker_embed_dim, 235 | in_channels, out_channels, kernel_size, causal=True, 236 | dilation=dilation, dropout=dropout, std_mul=std_mul, 237 | residual=True)) 238 | in_channels = out_channels 239 | std_mul = 4.0 240 | 241 | # Causal convolution blocks + attention layers 242 | self.convolutions = nn.ModuleList() 243 | self.attention = nn.ModuleList() 244 | 245 | for i, (out_channels, kernel_size, dilation) in enumerate(convolutions): 246 | assert in_channels == out_channels 247 | self.convolutions.append( 248 | Conv1dGLU(n_speakers, speaker_embed_dim, 249 | in_channels, out_channels, kernel_size, causal=True, 250 | dilation=dilation, dropout=dropout, std_mul=std_mul, 251 | residual=False)) 252 | self.attention.append( 253 | AttentionLayer(out_channels, embed_dim, 254 | dropout=dropout, 255 | window_ahead=window_ahead, 256 | window_backward=window_backward, 257 | key_projection=key_projection, 258 | value_projection=value_projection) 259 | if attention[i] else None) 260 | in_channels = out_channels 261 | std_mul = 4.0 262 | # Last 1x1 convolution 263 | self.last_conv = Conv1d(in_channels, in_dim * r, kernel_size=1, 264 | padding=0, dilation=1, std_mul=std_mul, 265 | dropout=dropout) 266 | 267 | # Mel-spectrogram (before sigmoid) -> Done binary flag 268 | self.fc = Linear(in_dim * r, 1) 269 | 270 | self.max_decoder_steps = 200 271 | self.min_decoder_steps = 10 272 | self.use_memory_mask = use_memory_mask 273 | if isinstance(force_monotonic_attention, bool): 274 | self.force_monotonic_attention = [force_monotonic_attention] * len(convolutions) 275 | else: 276 | 
self.force_monotonic_attention = force_monotonic_attention 277 | 278 | def forward(self, encoder_out, inputs=None, 279 | text_positions=None, frame_positions=None, 280 | speaker_embed=None, lengths=None): 281 | if inputs is None: 282 | assert text_positions is not None 283 | self.start_fresh_sequence() 284 | outputs = self.incremental_forward(encoder_out, text_positions, speaker_embed) 285 | return outputs 286 | 287 | # Grouping multiple frames if necessary 288 | if inputs.size(-1) == self.in_dim: 289 | inputs = inputs.view(inputs.size(0), inputs.size(1) // self.r, -1) 290 | assert inputs.size(-1) == self.in_dim * self.r 291 | 292 | # expand speaker embedding for all time steps 293 | speaker_embed_btc = expand_speaker_embed(inputs, speaker_embed) 294 | if speaker_embed_btc is not None: 295 | speaker_embed_btc = F.dropout(speaker_embed_btc, p=self.dropout, training=self.training) 296 | 297 | keys, values = encoder_out 298 | 299 | if self.use_memory_mask and lengths is not None: 300 | mask = get_mask_from_lengths(keys, lengths) 301 | else: 302 | mask = None 303 | 304 | # position encodings 305 | if text_positions is not None: 306 | w = self.key_position_rate 307 | # TODO: may be useful to have projection per attention layer 308 | if self.speaker_proj1 is not None: 309 | w = w * F.sigmoid(self.speaker_proj1(speaker_embed)).view(-1) 310 | text_pos_embed = self.embed_keys_positions(text_positions, w) 311 | keys = keys + text_pos_embed 312 | if frame_positions is not None: 313 | w = self.query_position_rate 314 | if self.speaker_proj2 is not None: 315 | w = w * F.sigmoid(self.speaker_proj2(speaker_embed)).view(-1) 316 | frame_pos_embed = self.embed_query_positions(frame_positions, w) 317 | 318 | # transpose only once to speed up attention layers 319 | keys = keys.transpose(1, 2).contiguous() 320 | 321 | x = inputs 322 | x = F.dropout(x, p=self.dropout, training=self.training) 323 | 324 | # Generic case: B x T x C -> B x C x T 325 | x = x.transpose(1, 2) 326 | 327 | # Prenet 328 | for f in self.preattention: 329 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 330 | 331 | # Casual convolutions + Multi-hop attentions 332 | alignments = [] 333 | for f, attention in zip(self.convolutions, self.attention): 334 | residual = x 335 | 336 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 337 | 338 | # Feed conv output to attention layer as query 339 | if attention is not None: 340 | assert isinstance(f, Conv1dGLU) 341 | # (B x T x C) 342 | x = x.transpose(1, 2) 343 | x = x if frame_positions is None else x + frame_pos_embed 344 | x, alignment = attention(x, (keys, values), mask=mask) 345 | # (T x B x C) 346 | x = x.transpose(1, 2) 347 | alignments += [alignment] 348 | 349 | if isinstance(f, Conv1dGLU): 350 | x = (x + residual) * math.sqrt(0.5) 351 | 352 | # decoder state (B x T x C): 353 | # internal representation before compressed to output dimention 354 | decoder_states = x.transpose(1, 2).contiguous() 355 | x = self.last_conv(x) 356 | 357 | # Back to B x T x C 358 | x = x.transpose(1, 2) 359 | 360 | # project to mel-spectorgram 361 | outputs = F.sigmoid(x) 362 | 363 | # Done flag 364 | done = F.sigmoid(self.fc(x)) 365 | 366 | return outputs, torch.stack(alignments), done, decoder_states 367 | 368 | def incremental_forward(self, encoder_out, text_positions, speaker_embed=None, 369 | initial_input=None, test_inputs=None): 370 | keys, values = encoder_out 371 | B = keys.size(0) 372 | 373 | # position encodings 374 | w = self.key_position_rate 375 | # TODO: may be 
useful to have projection per attention layer 376 | if self.speaker_proj1 is not None: 377 | w = w * F.sigmoid(self.speaker_proj1(speaker_embed)).view(-1) 378 | text_pos_embed = self.embed_keys_positions(text_positions, w) 379 | keys = keys + text_pos_embed 380 | 381 | # transpose only once to speed up attention layers 382 | keys = keys.transpose(1, 2).contiguous() 383 | 384 | decoder_states = [] 385 | outputs = [] 386 | alignments = [] 387 | dones = [] 388 | # intially set to zeros 389 | last_attended = [None] * len(self.attention) 390 | for idx, v in enumerate(self.force_monotonic_attention): 391 | last_attended[idx] = 0 if v else None 392 | 393 | num_attention_layers = sum([layer is not None for layer in self.attention]) 394 | t = 0 395 | if initial_input is None: 396 | initial_input = Variable( 397 | keys.data.new(B, 1, self.in_dim * self.r).zero_()) 398 | current_input = initial_input 399 | while True: 400 | # frame pos start with 1. 401 | frame_pos = Variable(keys.data.new(B, 1).fill_(t + 1)).long() 402 | w = self.query_position_rate 403 | if self.speaker_proj2 is not None: 404 | w = w * F.sigmoid(self.speaker_proj2(speaker_embed)).view(-1) 405 | frame_pos_embed = self.embed_query_positions(frame_pos, w) 406 | 407 | if test_inputs is not None: 408 | if t >= test_inputs.size(1): 409 | break 410 | current_input = test_inputs[:, t, :].unsqueeze(1) 411 | else: 412 | if t > 0: 413 | current_input = outputs[-1] 414 | x = current_input 415 | x = F.dropout(x, p=self.dropout, training=self.training) 416 | 417 | # Prenet 418 | for f in self.preattention: 419 | if isinstance(f, Conv1dGLU): 420 | x = f.incremental_forward(x, speaker_embed) 421 | else: 422 | try: 423 | x = f.incremental_forward(x) 424 | except AttributeError as e: 425 | x = f(x) 426 | 427 | # Casual convolutions + Multi-hop attentions 428 | ave_alignment = None 429 | for idx, (f, attention) in enumerate(zip(self.convolutions, 430 | self.attention)): 431 | residual = x 432 | if isinstance(f, Conv1dGLU): 433 | x = f.incremental_forward(x, speaker_embed) 434 | else: 435 | try: 436 | x = f.incremental_forward(x) 437 | except AttributeError as e: 438 | x = f(x) 439 | 440 | # attention 441 | if attention is not None: 442 | assert isinstance(f, Conv1dGLU) 443 | x = x + frame_pos_embed 444 | x, alignment = attention(x, (keys, values), 445 | last_attended=last_attended[idx]) 446 | if self.force_monotonic_attention[idx]: 447 | last_attended[idx] = alignment.max(-1)[1].view(-1).data[0] 448 | if ave_alignment is None: 449 | ave_alignment = alignment 450 | else: 451 | ave_alignment = ave_alignment + ave_alignment 452 | 453 | # residual 454 | if isinstance(f, Conv1dGLU): 455 | x = (x + residual) * math.sqrt(0.5) 456 | 457 | decoder_state = x 458 | x = self.last_conv.incremental_forward(x) 459 | ave_alignment = ave_alignment.div_(num_attention_layers) 460 | 461 | # Ooutput & done flag predictions 462 | output = F.sigmoid(x) 463 | done = F.sigmoid(self.fc(x)) 464 | 465 | decoder_states += [decoder_state] 466 | outputs += [output] 467 | alignments += [ave_alignment] 468 | dones += [done] 469 | 470 | t += 1 471 | if test_inputs is None: 472 | if (done > 0.5).all() and t > self.min_decoder_steps: 473 | break 474 | elif t > self.max_decoder_steps: 475 | break 476 | 477 | # Remove 1-element time axis 478 | alignments = list(map(lambda x: x.squeeze(1), alignments)) 479 | decoder_states = list(map(lambda x: x.squeeze(1), decoder_states)) 480 | outputs = list(map(lambda x: x.squeeze(1), outputs)) 481 | 482 | # Combine outputs for all time steps 483 | 
alignments = torch.stack(alignments).transpose(0, 1) 484 | decoder_states = torch.stack(decoder_states).transpose(0, 1).contiguous() 485 | outputs = torch.stack(outputs).transpose(0, 1).contiguous() 486 | 487 | return outputs, alignments, dones, decoder_states 488 | 489 | def start_fresh_sequence(self): 490 | for conv in self.convolutions: 491 | conv.clear_buffer() 492 | 493 | 494 | class Converter(nn.Module): 495 | def __init__(self, n_speakers, speaker_embed_dim, 496 | in_dim, out_dim, convolutions=((256, 5, 1),) * 4, 497 | time_upsampling=1, 498 | dropout=0.1): 499 | super(Converter, self).__init__() 500 | self.dropout = dropout 501 | self.in_dim = in_dim 502 | self.out_dim = out_dim 503 | self.n_speakers = n_speakers 504 | 505 | # Non causual convolution blocks 506 | in_channels = convolutions[0][0] 507 | # Idea from nyanko 508 | if time_upsampling == 4: 509 | self.convolutions = nn.ModuleList([ 510 | Conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1, 511 | std_mul=1.0), 512 | ConvTranspose1d(in_channels, in_channels, kernel_size=2, 513 | padding=0, stride=2, std_mul=1.0), 514 | Conv1dGLU(n_speakers, speaker_embed_dim, 515 | in_channels, in_channels, kernel_size=3, causal=False, 516 | dilation=1, dropout=dropout, std_mul=1.0, residual=True), 517 | Conv1dGLU(n_speakers, speaker_embed_dim, 518 | in_channels, in_channels, kernel_size=3, causal=False, 519 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 520 | ConvTranspose1d(in_channels, in_channels, kernel_size=2, 521 | padding=0, stride=2, std_mul=4.0), 522 | Conv1dGLU(n_speakers, speaker_embed_dim, 523 | in_channels, in_channels, kernel_size=3, causal=False, 524 | dilation=1, dropout=dropout, std_mul=1.0, residual=True), 525 | Conv1dGLU(n_speakers, speaker_embed_dim, 526 | in_channels, in_channels, kernel_size=3, causal=False, 527 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 528 | ]) 529 | elif time_upsampling == 2: 530 | self.convolutions = nn.ModuleList([ 531 | Conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1, 532 | std_mul=1.0), 533 | ConvTranspose1d(in_channels, in_channels, kernel_size=2, 534 | padding=0, stride=2, std_mul=1.0), 535 | Conv1dGLU(n_speakers, speaker_embed_dim, 536 | in_channels, in_channels, kernel_size=3, causal=False, 537 | dilation=1, dropout=dropout, std_mul=1.0, residual=True), 538 | Conv1dGLU(n_speakers, speaker_embed_dim, 539 | in_channels, in_channels, kernel_size=3, causal=False, 540 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 541 | ]) 542 | elif time_upsampling == 1: 543 | self.convolutions = nn.ModuleList([ 544 | # 1x1 convolution first 545 | Conv1d(in_dim, in_channels, kernel_size=1, padding=0, dilation=1, 546 | std_mul=1.0), 547 | Conv1dGLU(n_speakers, speaker_embed_dim, 548 | in_channels, in_channels, kernel_size=3, causal=False, 549 | dilation=3, dropout=dropout, std_mul=4.0, residual=True), 550 | ]) 551 | else: 552 | raise ValueError("Not supported") 553 | 554 | std_mul = 4.0 555 | for (out_channels, kernel_size, dilation) in convolutions: 556 | if in_channels != out_channels: 557 | self.convolutions.append( 558 | Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 559 | dilation=1, std_mul=std_mul)) 560 | self.convolutions.append(nn.ReLU(inplace=True)) 561 | in_channels = out_channels 562 | std_mul = 2.0 563 | self.convolutions.append( 564 | Conv1dGLU(n_speakers, speaker_embed_dim, 565 | in_channels, out_channels, kernel_size, causal=False, 566 | dilation=dilation, dropout=dropout, std_mul=std_mul, 567 | residual=True)) 
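# Illustrative sketch (not from the repository): the Converter's
# time-upsampling branches above rely on stride-2 transposed convolutions,
# each of which doubles the time axis, so the two used in the
# time_upsampling == 4 branch give 4x upsampling overall. The plain
# nn.ConvTranspose1d below stands in for the repository's weight-normalized
# ConvTranspose1d wrapper, purely to show the shape arithmetic.
import torch
import torch.nn as nn

_x = torch.randn(1, 256, 50)                                  # (B, C, T)
_up = nn.ConvTranspose1d(256, 256, kernel_size=2, stride=2)
print(_up(_x).shape)                                          # torch.Size([1, 256, 100]) -- T doubled
print(_up(_up(_x)).shape)                                     # torch.Size([1, 256, 200]) -- 4x overall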
568 | in_channels = out_channels 569 | std_mul = 4.0 570 | # Last 1x1 convolution 571 | self.convolutions.append(Conv1d(in_channels, out_dim, kernel_size=1, 572 | padding=0, dilation=1, std_mul=std_mul, 573 | dropout=dropout)) 574 | 575 | def forward(self, x, speaker_embed=None): 576 | assert self.n_speakers == 1 or speaker_embed is not None 577 | 578 | # expand speaker embedding for all time steps 579 | speaker_embed_btc = expand_speaker_embed(x, speaker_embed) 580 | if speaker_embed_btc is not None: 581 | speaker_embed_btc = F.dropout(speaker_embed_btc, p=self.dropout, training=self.training) 582 | 583 | # Generic case: B x T x C -> B x C x T 584 | x = x.transpose(1, 2) 585 | 586 | for f in self.convolutions: 587 | # Case for upsampling 588 | if speaker_embed_btc is not None and speaker_embed_btc.size(1) != x.size(-1): 589 | speaker_embed_btc = expand_speaker_embed(x, speaker_embed, tdim=-1) 590 | speaker_embed_btc = F.dropout( 591 | speaker_embed_btc, p=self.dropout, training=self.training) 592 | x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) 593 | 594 | # Back to B x T x C 595 | x = x.transpose(1, 2) 596 | 597 | return F.sigmoid(x) 598 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """Text processing frontend 4 | 5 | All frontend module should have the following functions: 6 | 7 | - text_to_sequence(text, p) 8 | - sequence_to_text(sequence) 9 | 10 | and the property: 11 | 12 | - n_vocab 13 | 14 | """ 15 | from dv3.deepvoice3_pytorch.frontend import en 16 | 17 | # optinoal Japanese frontend 18 | try: 19 | from dv3.deepvoice3_pytorch.frontend import jp 20 | except ImportError: 21 | jp = None 22 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/en/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from dv3.deepvoice3_pytorch.frontend.text.symbols import symbols 3 | 4 | import nltk 5 | from random import random 6 | 7 | n_vocab = len(symbols) 8 | 9 | _arphabet = nltk.corpus.cmudict.dict() 10 | 11 | 12 | def _maybe_get_arpabet(word, p): 13 | try: 14 | phonemes = _arphabet[word][0] 15 | phonemes = " ".join(phonemes) 16 | except KeyError: 17 | return word 18 | 19 | return '{%s}' % phonemes if random() < p else word 20 | 21 | 22 | def mix_pronunciation(text, p): 23 | text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' ')) 24 | return text 25 | 26 | 27 | def text_to_sequence(text, p=0.0): 28 | if p >= 0: 29 | text = mix_pronunciation(text, p) 30 | from dv3.deepvoice3_pytorch.frontend.text import text_to_sequence 31 | text = text_to_sequence(text, ["english_cleaners"]) 32 | return text 33 | 34 | 35 | from dv3.deepvoice3_pytorch.frontend.text import sequence_to_text 36 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/en/__pycache__/__init__.cpython-36.pyc: 
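# Illustrative sketch (not from the repository): the English frontend above
# replaces a word with its ARPAbet pronunciation, wrapped in curly braces,
# with probability p. The toy dictionary below is a stand-in for
# nltk.corpus.cmudict.dict(), which the real module loads, just to show the
# contract of _maybe_get_arpabet / mix_pronunciation.
from random import random

_toy_cmudict = {"hello": [["HH", "AH0", "L", "OW1"]]}

def _toy_maybe_get_arpabet(word, p):
    prons = _toy_cmudict.get(word.lower())
    if prons is None:
        return word                                   # unknown words pass through unchanged
    return "{%s}" % " ".join(prons[0]) if random() < p else word

print(_toy_maybe_get_arpabet("hello", p=1.0))         # {HH AH0 L OW1}
print(_toy_maybe_get_arpabet("world", p=1.0))         # world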
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/en/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/jp/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | import MeCab 5 | import jaconv 6 | from random import random 7 | 8 | n_vocab = 0xffff 9 | 10 | _eos = 1 11 | _pad = 0 12 | _tagger = None 13 | 14 | 15 | def _yomi(mecab_result): 16 | tokens = [] 17 | yomis = [] 18 | for line in mecab_result.split("\n")[:-1]: 19 | s = line.split("\t") 20 | if len(s) == 1: 21 | break 22 | token, rest = s 23 | rest = rest.split(",") 24 | tokens.append(token) 25 | yomi = rest[7] if len(rest) > 7 else None 26 | yomi = None if yomi == "*" else yomi 27 | yomis.append(yomi) 28 | 29 | return tokens, yomis 30 | 31 | 32 | def _mix_pronunciation(tokens, yomis, p): 33 | return "".join( 34 | yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx] 35 | for idx in range(len(tokens))) 36 | 37 | 38 | def mix_pronunciation(text, p): 39 | global _tagger 40 | if _tagger is None: 41 | _tagger = MeCab.Tagger("") 42 | tokens, yomis = _yomi(_tagger.parse(text)) 43 | return _mix_pronunciation(tokens, yomis, p) 44 | 45 | 46 | def add_punctuation(text): 47 | last = text[-1] 48 | if last not in [".", ",", "、", "。", "!", "?", "!", "?"]: 49 | text = text + "。" 50 | return text 51 | 52 | 53 | def normalize_delimitor(text): 54 | text = text.replace(",", "、") 55 | text = text.replace(".", "。") 56 | text = text.replace(",", "、") 57 | text = text.replace(".", "。") 58 | return text 59 | 60 | 61 | def text_to_sequence(text, p=0.0): 62 | for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", 63 | "(", ")", "(", ")"]: 64 | text = text.replace(c, "") 65 | text = text.replace("!", "!") 66 | text = text.replace("?", "?") 67 | 68 | text = normalize_delimitor(text) 69 | text = jaconv.normalize(text) 70 | if p > 0: 71 | text = mix_pronunciation(text, p) 72 | text = jaconv.hira2kata(text) 73 | text = add_punctuation(text) 74 | 75 | return [ord(c) for c in text] + [_eos] # EOS 76 | 77 | 78 | def sequence_to_text(seq): 79 | return "".join(chr(n) for n in seq) 80 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/jp/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/jp/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dv3.deepvoice3_pytorch.frontend.text import cleaners 3 | from dv3.deepvoice3_pytorch.frontend.text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a 
string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/__pycache__/cleaners.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/cleaners.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/__pycache__/cmudict.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/cmudict.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/__pycache__/numbers.cpython-36.pyc: -------------------------------------------------------------------------------- 
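# Illustrative sketch (not from the repository): text_to_sequence above walks
# the input string with _curly_re, treating anything inside curly braces as an
# ARPAbet segment and everything else as plain text to be cleaned. The regex
# below is the same pattern the module defines.
import re

_curly = re.compile(r'(.*?)\{(.+?)\}(.*)')

_m = _curly.match("Turn left on {HH AW1 S S T AH0 N} Street.")
print(repr(_m.group(1)))   # 'Turn left on '
print(repr(_m.group(2)))   # 'HH AW1 S S T AH0 N'
print(repr(_m.group(3)))   # ' Street.'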
https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/numbers.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/__pycache__/symbols.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VisionBrain/Neural_Voice_Cloning/a0bed8a73dbaf745820758f9c0fc91709a0c1de9/dv3/deepvoice3_pytorch/frontend/text/__pycache__/symbols.cpython-36.pyc -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, ' ', text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def add_punctuation(text): 67 | if len(text) == 0: 68 | return text 69 | if text[-1] not in '!,.:;?': 70 | text = text + '.' 
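# Illustrative sketch (not from the repository): a trimmed-down version of
# this module's english_cleaners pipeline (defined further down), restricted
# to the steps that need no external packages -- the real pipeline also runs
# unidecode and number expansion. The three-entry abbreviation table stands in
# for the full one defined above.
import re

_abbrev = [(re.compile(r'\b%s\.' % abbr, re.IGNORECASE), full)
           for abbr, full in [('mr', 'mister'), ('dr', 'doctor'), ('st', 'saint')]]
_ws = re.compile(r'\s+')

def _toy_english_cleaners(text):
    if text and text[-1] not in '!,.:;?':
        text = text + '.'                       # add_punctuation: give the decoder a clear EOS cue
    text = text.lower()
    for regex, replacement in _abbrev:
        text = re.sub(regex, replacement, text)
    return re.sub(_ws, ' ', text)

print(_toy_english_cleaners("Dr. Smith lives on  St. Mary Road"))
# doctor smith lives on saint mary road.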
# without this decoder is confused when to output EOS 71 | return text 72 | 73 | 74 | def basic_cleaners(text): 75 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def transliteration_cleaners(text): 82 | '''Pipeline for non-English text that transliterates to ASCII.''' 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = collapse_whitespace(text) 86 | return text 87 | 88 | 89 | def english_cleaners(text): 90 | '''Pipeline for English text, including number and abbreviation expansion.''' 91 | text = convert_to_ascii(text) 92 | text = add_punctuation(text) 93 | text = lowercase(text) 94 | text = expand_numbers(text) 95 | text = expand_abbreviations(text) 96 | text = collapse_whitespace(text) 97 | return text 98 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | _alt_re = re.compile(r'\([0-9]+\)') 39 | 40 | 41 | def _parse_cmudict(file): 42 | cmudict = {} 43 | for line in file: 44 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 45 | parts = line.split(' ') 46 | word = re.sub(_alt_re, '', parts[0]) 47 | pronunciation = _get_pronunciation(parts[1]) 48 | if pronunciation: 49 | if word in cmudict: 50 | cmudict[word].append(pronunciation) 51 | else: 52 | cmudict[word] = [pronunciation] 53 | return cmudict 54 | 55 | 56 | def _get_pronunciation(s): 57 | parts = s.strip().split(' ') 58 | for part in parts: 59 | if part not in _valid_symbol_set: 60 | return None 61 | return ' '.join(parts) 62 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = 
re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/frontend/text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from .cmudict import valid_symbols 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | import math 6 | import numpy as np 7 | from torch.nn import functional as F 8 | 9 | 10 | def position_encoding_init(n_position, d_pos_vec, position_rate=1.0, 11 | sinusoidal=True): 12 | ''' Init the sinusoid position encoding table ''' 13 | 14 | # keep dim 0 for padding token position encoding zero vector 15 | position_enc = np.array([ 16 | [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) for i in range(d_pos_vec)] 17 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) 18 | 19 | position_enc = torch.from_numpy(position_enc).float() 20 | if sinusoidal: 21 | position_enc[1:, 0::2] = torch.sin(position_enc[1:, 0::2]) # dim 2i 22 | position_enc[1:, 1::2] = torch.cos(position_enc[1:, 1::2]) # dim 2i+1 23 | 24 | return position_enc 25 | 26 | 27 | def sinusoidal_encode(x, w): 28 | y = w * x 29 | y[1:, 0::2] = torch.sin(y[1:, 0::2].clone()) 30 | y[1:, 1::2] = torch.cos(y[1:, 1::2].clone()) 31 | return y 32 | 33 | 34 | class SinusoidalEncoding(nn.Embedding): 35 | def __init__(self, num_embeddings, embedding_dim, padding_idx=0, 36 | *args, **kwargs): 37 | super(SinusoidalEncoding, self).__init__(num_embeddings, embedding_dim, 38 | padding_idx, *args, **kwargs) 39 | self.weight.data = position_encoding_init(num_embeddings, embedding_dim, 40 | position_rate=1.0, 41 | sinusoidal=False) 42 | 43 | def forward(self, x, w=1.0): 44 | isscaler = np.isscalar(w) 45 | padding_idx = self.padding_idx 46 | if padding_idx is None: 47 | padding_idx = -1 48 | 49 | if isscaler or w.size(0) == 1: 50 | weight = sinusoidal_encode(self.weight, w) 51 | return self._backend.Embedding.apply( 52 | x, weight, 53 | padding_idx, self.max_norm, self.norm_type, 54 | self.scale_grad_by_freq, self.sparse 55 | ) 56 | else: 57 | # TODO: cannot simply apply for batch 58 | # better to implement efficient function 59 | pe = [] 60 | for batch_idx, we in enumerate(w): 61 | weight = sinusoidal_encode(self.weight, we) 62 | pe.append(self._backend.Embedding.apply( 63 | x[batch_idx], weight, 64 | padding_idx, self.max_norm, self.norm_type, 65 | self.scale_grad_by_freq, 
self.sparse 66 | )) 67 | pe = torch.stack(pe) 68 | return pe 69 | 70 | 71 | class GradMultiply(torch.autograd.Function): 72 | @staticmethod 73 | def forward(ctx, x, scale): 74 | ctx.scale = scale 75 | res = x.new(x) 76 | ctx.mark_shared_storage((x, res)) 77 | return res 78 | 79 | @staticmethod 80 | def backward(ctx, grad): 81 | return grad * ctx.scale, None 82 | 83 | 84 | def Linear(in_features, out_features, dropout=0): 85 | """Weight-normalized Linear layer (input: N x T x C)""" 86 | m = nn.Linear(in_features, out_features) 87 | m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) 88 | m.bias.data.zero_() 89 | return nn.utils.weight_norm(m) 90 | 91 | 92 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 93 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 94 | m.weight.data.normal_(0, std) 95 | return m 96 | 97 | 98 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 99 | from .conv import Conv1d 100 | m = Conv1d(in_channels, out_channels, kernel_size, **kwargs) 101 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 102 | m.weight.data.normal_(mean=0, std=std) 103 | m.bias.data.zero_() 104 | return nn.utils.weight_norm(m) 105 | 106 | 107 | def ConvTranspose1d(in_channels, out_channels, kernel_size, dropout=0, 108 | std_mul=1.0, **kwargs): 109 | m = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, **kwargs) 110 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 111 | m.weight.data.normal_(mean=0, std=std) 112 | m.bias.data.zero_() 113 | return nn.utils.weight_norm(m) 114 | 115 | 116 | class Conv1dGLU(nn.Module): 117 | """(Dilated) Conv1d + Gated linear unit + (optionally) speaker embedding 118 | """ 119 | 120 | def __init__(self, n_speakers, speaker_embed_dim, 121 | in_channels, out_channels, kernel_size, 122 | dropout, padding=None, dilation=1, causal=False, residual=False, 123 | *args, **kwargs): 124 | super(Conv1dGLU, self).__init__() 125 | self.dropout = dropout 126 | self.residual = residual 127 | if padding is None: 128 | # no future time stamps available 129 | if causal: 130 | padding = (kernel_size - 1) * dilation 131 | else: 132 | padding = (kernel_size - 1) // 2 * dilation 133 | self.causal = causal 134 | 135 | self.conv = Conv1d(in_channels, 2 * out_channels, kernel_size, 136 | dropout=dropout, padding=padding, dilation=dilation, 137 | *args, **kwargs) 138 | if n_speakers > 1: 139 | self.speaker_proj = Linear(speaker_embed_dim, out_channels) 140 | else: 141 | self.speaker_proj = None 142 | 143 | def forward(self, x, speaker_embed=None): 144 | return self._forward(x, speaker_embed, False) 145 | 146 | def incremental_forward(self, x, speaker_embed=None): 147 | return self._forward(x, speaker_embed, True) 148 | 149 | def _forward(self, x, speaker_embed, is_incremental): 150 | residual = x 151 | x = F.dropout(x, p=self.dropout, training=self.training) 152 | if is_incremental: 153 | splitdim = -1 154 | x = self.conv.incremental_forward(x) 155 | else: 156 | splitdim = 1 157 | x = self.conv(x) 158 | # remove future time steps 159 | x = x[:, :, :residual.size(-1)] if self.causal else x 160 | 161 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 162 | if self.speaker_proj is not None: 163 | softsign = F.softsign(self.speaker_proj(speaker_embed)) 164 | # Since conv layer assumes BCT, we need to transpose 165 | softsign = softsign if is_incremental else softsign.transpose(1, 2) 166 | a = a + softsign 167 | x = a * 
F.sigmoid(b) 168 | return (x + residual) * math.sqrt(0.5) if self.residual else x 169 | 170 | def clear_buffer(self): 171 | self.conv.clear_buffer() 172 | 173 | 174 | class HighwayConv1d(nn.Module): 175 | """Weight normzlized Conv1d + Highway network (support incremental forward) 176 | """ 177 | 178 | def __init__(self, in_channels, out_channels, kernel_size=1, padding=None, 179 | dilation=1, causal=False, dropout=0, std_mul=None, glu=False): 180 | super(HighwayConv1d, self).__init__() 181 | if std_mul is None: 182 | std_mul = 4.0 if glu else 1.0 183 | if padding is None: 184 | # no future time stamps available 185 | if causal: 186 | padding = (kernel_size - 1) * dilation 187 | else: 188 | padding = (kernel_size - 1) // 2 * dilation 189 | self.causal = causal 190 | self.dropout = dropout 191 | self.glu = glu 192 | 193 | self.conv = Conv1d(in_channels, 2 * out_channels, 194 | kernel_size=kernel_size, padding=padding, 195 | dilation=dilation, dropout=dropout, 196 | std_mul=std_mul) 197 | 198 | def forward(self, x): 199 | return self._forward(x, False) 200 | 201 | def incremental_forward(self, x): 202 | return self._forward(x, True) 203 | 204 | def _forward(self, x, is_incremental): 205 | """Forward 206 | 207 | Args: 208 | x: (B, in_channels, T) 209 | returns: 210 | (B, out_channels, T) 211 | """ 212 | 213 | residual = x 214 | x = F.dropout(x, p=self.dropout, training=self.training) 215 | if is_incremental: 216 | splitdim = -1 217 | x = self.conv.incremental_forward(x) 218 | else: 219 | splitdim = 1 220 | x = self.conv(x) 221 | # remove future time steps 222 | x = x[:, :, :residual.size(-1)] if self.causal else x 223 | 224 | if self.glu: 225 | x = F.glu(x, dim=splitdim) 226 | return (x + residual) * math.sqrt(0.5) 227 | else: 228 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 229 | T = F.sigmoid(b) 230 | return (T * a + (1 - T) * residual) 231 | 232 | def clear_buffer(self): 233 | self.conv.clear_buffer() 234 | 235 | 236 | def get_mask_from_lengths(memory, memory_lengths): 237 | """Get mask tensor from list of length 238 | Args: 239 | memory: (batch, max_time, dim) 240 | memory_lengths: array like 241 | """ 242 | mask = memory.data.new(memory.size(0), memory.size(1)).byte().zero_() 243 | for idx, l in enumerate(memory_lengths): 244 | mask[idx][:l] = 1 245 | return ~mask 246 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/nyanko.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from torch.autograd import Variable 7 | import math 8 | import numpy as np 9 | 10 | from .modules import Embedding, Linear, Conv1d, ConvTranspose1d 11 | from .modules import HighwayConv1d, get_mask_from_lengths 12 | from .modules import position_encoding_init 13 | from .deepvoice3 import AttentionLayer 14 | 15 | 16 | class Encoder(nn.Module): 17 | def __init__(self, n_vocab, embed_dim, channels, kernel_size=3, 18 | n_speakers=1, speaker_embed_dim=16, embedding_weight_std=0.01, 19 | padding_idx=None, dropout=0.1): 20 | super(Encoder, self).__init__() 21 | self.dropout = dropout 22 | 23 | # Text input embeddings 24 | self.embed_tokens = Embedding( 25 | n_vocab, embed_dim, padding_idx, embedding_weight_std) 26 | 27 | E = embed_dim 28 | D = channels 29 | self.convnet = nn.Sequential( 30 | Conv1d(E, 2 * D, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 31 | nn.ReLU(inplace=True), 32 | Conv1d(2 * D, 2 * D, 
kernel_size=1, padding=0, dilation=1, std_mul=2.0), 33 | 34 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 35 | dilation=1, std_mul=1.0, dropout=dropout), 36 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 37 | dilation=3, std_mul=1.0, dropout=dropout), 38 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 39 | dilation=9, std_mul=1.0, dropout=dropout), 40 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 41 | dilation=27, std_mul=1.0, dropout=dropout), 42 | 43 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 44 | dilation=1, std_mul=1.0, dropout=dropout), 45 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 46 | dilation=3, std_mul=1.0, dropout=dropout), 47 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 48 | dilation=9, std_mul=1.0, dropout=dropout), 49 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 50 | dilation=27, std_mul=1.0, dropout=dropout), 51 | 52 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 53 | dilation=1, std_mul=1.0, dropout=dropout), 54 | HighwayConv1d(2 * D, 2 * D, kernel_size=kernel_size, padding=None, 55 | dilation=1, std_mul=1.0, dropout=dropout), 56 | 57 | HighwayConv1d(2 * D, 2 * D, kernel_size=1, padding=0, 58 | dilation=1, std_mul=1.0, dropout=dropout), 59 | ) 60 | 61 | def forward(self, text_sequences, text_positions=None, lengths=None, 62 | speaker_embed=None): 63 | # embed text_sequences 64 | # (B, T, E) 65 | x = self.embed_tokens(text_sequences) 66 | 67 | x = self.convnet(x.transpose(1, 2)).transpose(1, 2) 68 | 69 | # (B, T, D) and (B, T, D) 70 | keys, values = x.split(x.size(-1) // 2, dim=-1) 71 | 72 | return keys, values 73 | 74 | 75 | class Decoder(nn.Module): 76 | def __init__(self, embed_dim, in_dim=80, r=5, channels=256, kernel_size=3, 77 | n_speakers=1, speaker_embed_dim=16, 78 | max_positions=512, padding_idx=None, 79 | dropout=0.1, 80 | use_memory_mask=False, 81 | force_monotonic_attention=False, 82 | query_position_rate=1.0, 83 | key_position_rate=1.29, 84 | window_ahead=3, 85 | window_backward=1, 86 | key_projection=False, 87 | value_projection=False, 88 | ): 89 | super(Decoder, self).__init__() 90 | self.dropout = dropout 91 | self.in_dim = in_dim 92 | self.r = r 93 | 94 | D = channels 95 | F = in_dim * r # should be r = 1 to replicate 96 | self.audio_encoder_modules = nn.ModuleList([ 97 | Conv1d(F, D, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 98 | nn.ReLU(inplace=True), 99 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0), 100 | nn.ReLU(inplace=True), 101 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0), 102 | 103 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 104 | dilation=1, causal=True, std_mul=1.0, dropout=dropout), 105 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 106 | dilation=3, causal=True, std_mul=1.0, dropout=dropout), 107 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 108 | dilation=9, causal=True, std_mul=1.0, dropout=dropout), 109 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 110 | dilation=27, causal=True, std_mul=1.0, dropout=dropout), 111 | 112 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 113 | dilation=1, causal=True, std_mul=1.0, dropout=dropout), 114 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 115 | dilation=3, causal=True, std_mul=1.0, dropout=dropout), 116 | HighwayConv1d(D, D, kernel_size=kernel_size, 
padding=None, 117 | dilation=9, causal=True, std_mul=1.0, dropout=dropout), 118 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 119 | dilation=27, causal=True, std_mul=1.0, dropout=dropout), 120 | 121 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 122 | dilation=3, causal=True, std_mul=1.0, dropout=dropout), 123 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 124 | dilation=3, causal=True, std_mul=1.0, dropout=dropout), 125 | ]) 126 | 127 | self.attention = AttentionLayer(D, D, dropout=dropout, 128 | window_ahead=window_ahead, 129 | window_backward=window_backward, 130 | key_projection=key_projection, 131 | value_projection=value_projection) 132 | 133 | self.audio_decoder_modules = nn.ModuleList([ 134 | Conv1d(2 * D, D, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 135 | 136 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 137 | dilation=1, causal=True, std_mul=1.0, dropout=dropout), 138 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 139 | dilation=3, causal=True, std_mul=1.0, dropout=dropout), 140 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 141 | dilation=9, causal=True, std_mul=1.0, dropout=dropout), 142 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 143 | dilation=27, causal=True, std_mul=1.0, dropout=dropout), 144 | 145 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 146 | dilation=1, causal=True, std_mul=1.0, dropout=dropout), 147 | HighwayConv1d(D, D, kernel_size=kernel_size, padding=None, 148 | dilation=1, causal=True, std_mul=1.0, dropout=dropout), 149 | 150 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 151 | nn.ReLU(inplace=True), 152 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0), 153 | nn.ReLU(inplace=True), 154 | Conv1d(D, D, kernel_size=1, padding=0, dilation=1, std_mul=2.0), 155 | nn.ReLU(inplace=True), 156 | ]) 157 | self.last_conv = Conv1d(D, F, kernel_size=1, padding=0, dilation=1, std_mul=2.0) 158 | 159 | # Done prediction 160 | self.fc = Linear(F, 1) 161 | 162 | # Position encodings for query (decoder states) and keys (encoder states) 163 | self.embed_query_positions = Embedding( 164 | max_positions, D, padding_idx) 165 | self.embed_query_positions.weight.data = position_encoding_init( 166 | max_positions, D, position_rate=query_position_rate, sinusoidal=True) 167 | self.embed_keys_positions = Embedding( 168 | max_positions, D, padding_idx) 169 | self.embed_keys_positions.weight.data = position_encoding_init( 170 | max_positions, D, position_rate=key_position_rate, sinusoidal=True) 171 | 172 | # options 173 | self.max_decoder_steps = 200 174 | self.min_decoder_steps = 10 175 | self.use_memory_mask = use_memory_mask 176 | self.force_monotonic_attention = force_monotonic_attention 177 | 178 | def forward(self, encoder_out, inputs=None, 179 | text_positions=None, frame_positions=None, 180 | speaker_embed=None, lengths=None): 181 | 182 | if inputs is None: 183 | assert text_positions is not None 184 | self.start_fresh_sequence() 185 | outputs = self.incremental_forward(encoder_out, text_positions) 186 | return outputs 187 | 188 | # Grouping multiple frames if necessary 189 | if inputs.size(-1) == self.in_dim: 190 | inputs = inputs.view(inputs.size(0), inputs.size(1) // self.r, -1) 191 | assert inputs.size(-1) == self.in_dim * self.r 192 | 193 | keys, values = encoder_out 194 | 195 | if self.use_memory_mask and lengths is not None: 196 | mask = get_mask_from_lengths(keys, lengths) 197 | else: 198 | mask = None 199 | 200 | 
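# Illustrative sketch (not from the repository): the memory mask built above
# marks padded encoder positions so the attention layers can ignore them.
# The helper below is a boolean-tensor equivalent of get_mask_from_lengths
# (which, under the older PyTorch API this code targets, returns a ByteTensor
# with 1 at padded positions); the name _padding_mask is made up here.
import torch

def _padding_mask(lengths, max_len):
    positions = torch.arange(max_len).unsqueeze(0)           # (1, max_len)
    return positions >= torch.tensor(lengths).unsqueeze(1)   # (batch, max_len), True at padding

print(_padding_mask([3, 5], max_len=5))
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False, False]])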
# position encodings 201 | if text_positions is not None: 202 | text_pos_embed = self.embed_keys_positions(text_positions) 203 | keys = keys + text_pos_embed 204 | if frame_positions is not None: 205 | frame_pos_embed = self.embed_query_positions(frame_positions) 206 | 207 | # transpose only once to speed up attention layers 208 | keys = keys.transpose(1, 2).contiguous() 209 | 210 | # (B, T, C) 211 | x = inputs 212 | 213 | # (B, C, T) 214 | x = x.transpose(1, 2) 215 | 216 | # Apply audio encoder 217 | for f in self.audio_encoder_modules: 218 | x = f(x) 219 | Q = x 220 | 221 | # Attention modules assume query as (B, T, C) 222 | x = x.transpose(1, 2) 223 | x = x if frame_positions is None else x + frame_pos_embed 224 | R, alignments = self.attention(x, (keys, values), mask=mask) 225 | R = R.transpose(1, 2) 226 | 227 | # (B, C*2, T) 228 | Rd = torch.cat((R, Q), dim=1) 229 | x = Rd 230 | 231 | # Apply audio decoder 232 | for f in self.audio_decoder_modules: 233 | x = f(x) 234 | decoder_states = x.transpose(1, 2).contiguous() 235 | x = self.last_conv(x) 236 | 237 | # (B, T, C) 238 | x = x.transpose(1, 2) 239 | 240 | # Mel 241 | outputs = F.sigmoid(x) 242 | 243 | # Done prediction 244 | done = F.sigmoid(self.fc(x)) 245 | 246 | # Adding extra dim for convenient 247 | alignments = alignments.unsqueeze(0) 248 | 249 | return outputs, alignments, done, decoder_states 250 | 251 | def incremental_forward(self, encoder_out, text_positions, 252 | initial_input=None, test_inputs=None): 253 | keys, values = encoder_out 254 | B = keys.size(0) 255 | 256 | # position encodings 257 | if text_positions is not None: 258 | text_pos_embed = self.embed_keys_positions(text_positions) 259 | keys = keys + text_pos_embed 260 | 261 | # transpose only once to speed up attention layers 262 | keys = keys.transpose(1, 2).contiguous() 263 | 264 | decoder_states = [] 265 | outputs = [] 266 | alignments = [] 267 | dones = [] 268 | # intially set to zeros 269 | last_attended = 0 if self.force_monotonic_attention else None 270 | 271 | t = 0 272 | if initial_input is None: 273 | initial_input = Variable( 274 | keys.data.new(B, 1, self.in_dim * self.r).zero_()) 275 | current_input = initial_input 276 | while True: 277 | # frame pos start with 1. 
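# Illustrative sketch (not from the repository): frame positions are 1-based
# because index 0 of the position-embedding table is reserved as the padding
# row (position_encoding_init keeps row 0 at zero). A plain nn.Embedding with
# padding_idx=0 stands in for the repository's position-encoding helpers.
import torch
import torch.nn as nn

_pe = nn.Embedding(8, 4, padding_idx=0)                      # row 0 is the all-zero padding vector
_B, _t = 2, 0
_frame_pos = torch.full((_B, 1), _t + 1, dtype=torch.long)   # tensor([[1], [1]])
print(_pe(_frame_pos).shape)                                 # torch.Size([2, 1, 4])
print(_pe.weight[0])                                         # tensor([0., 0., 0., 0.], ...)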
278 | frame_pos = Variable(keys.data.new(B, 1).fill_(t + 1)).long() 279 | frame_pos_embed = self.embed_query_positions(frame_pos) 280 | 281 | if test_inputs is not None: 282 | if t >= test_inputs.size(1): 283 | break 284 | current_input = test_inputs[:, t, :].unsqueeze(1) 285 | else: 286 | if t > 0: 287 | current_input = outputs[-1] 288 | 289 | # (B, 1, C) 290 | x = current_input 291 | 292 | for f in self.audio_encoder_modules: 293 | try: 294 | x = f.incremental_forward(x) 295 | except AttributeError as e: 296 | x = f(x) 297 | Q = x 298 | 299 | R, alignment = self.attention( 300 | x + frame_pos_embed, (keys, values), last_attended=last_attended) 301 | if self.force_monotonic_attention: 302 | last_attended = alignment.max(-1)[1].view(-1).data[0] 303 | 304 | Rd = torch.cat((R, Q), dim=-1) 305 | x = Rd 306 | for f in self.audio_decoder_modules: 307 | try: 308 | x = f.incremental_forward(x) 309 | except AttributeError as e: 310 | x = f(x) 311 | decoder_state = x 312 | x = self.last_conv.incremental_forward(x) 313 | 314 | # Ooutput & done flag predictions 315 | output = F.sigmoid(x) 316 | done = F.sigmoid(self.fc(x)) 317 | 318 | decoder_states += [decoder_state] 319 | outputs += [output] 320 | alignments += [alignment] 321 | dones += [done] 322 | 323 | t += 1 324 | if test_inputs is None: 325 | if (done > 0.5).all() and t > self.min_decoder_steps: 326 | break 327 | elif t > self.max_decoder_steps: 328 | break 329 | 330 | # Remove 1-element time axis 331 | alignments = list(map(lambda x: x.squeeze(1), alignments)) 332 | decoder_states = list(map(lambda x: x.squeeze(1), decoder_states)) 333 | outputs = list(map(lambda x: x.squeeze(1), outputs)) 334 | 335 | # Combine outputs for all time steps 336 | alignments = torch.stack(alignments).transpose(0, 1) 337 | decoder_states = torch.stack(decoder_states).transpose(0, 1).contiguous() 338 | outputs = torch.stack(outputs).transpose(0, 1).contiguous() 339 | 340 | return outputs, alignments, dones, decoder_states 341 | 342 | def start_fresh_sequence(self): 343 | _clear_modules(self.audio_encoder_modules) 344 | _clear_modules(self.audio_decoder_modules) 345 | 346 | 347 | def _clear_modules(modules): 348 | for m in modules: 349 | try: 350 | m.clear_buffer() 351 | except AttributeError as e: 352 | pass 353 | 354 | 355 | class Converter(nn.Module): 356 | def __init__(self, in_dim, out_dim, channels=512, kernel_size=3, dropout=0.1): 357 | super(Converter, self).__init__() 358 | self.dropout = dropout 359 | self.in_dim = in_dim 360 | self.out_dim = out_dim 361 | 362 | F = in_dim 363 | Fd = out_dim 364 | C = channels 365 | self.convnet = nn.Sequential( 366 | Conv1d(F, C, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 367 | 368 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None, 369 | dilation=1, std_mul=1.0, dropout=dropout), 370 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None, 371 | dilation=3, std_mul=1.0, dropout=dropout), 372 | 373 | ConvTranspose1d(C, C, kernel_size=2, padding=0, stride=2, std_mul=1.0), 374 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None, 375 | dilation=1, std_mul=1.0, dropout=dropout), 376 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None, 377 | dilation=3, std_mul=1.0, dropout=dropout), 378 | ConvTranspose1d(C, C, kernel_size=2, padding=0, stride=2, std_mul=1.0), 379 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None, 380 | dilation=1, std_mul=1.0, dropout=dropout), 381 | HighwayConv1d(C, C, kernel_size=kernel_size, padding=None, 382 | dilation=3, std_mul=1.0, dropout=dropout), 383 
| 384 | Conv1d(C, 2 * C, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 385 | 386 | HighwayConv1d(2 * C, 2 * C, kernel_size=kernel_size, padding=None, 387 | dilation=1, std_mul=1.0, dropout=dropout), 388 | HighwayConv1d(2 * C, 2 * C, kernel_size=kernel_size, padding=None, 389 | dilation=1, std_mul=1.0, dropout=dropout), 390 | 391 | Conv1d(2 * C, Fd, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 392 | 393 | Conv1d(Fd, Fd, kernel_size=1, padding=0, dilation=1, std_mul=1.0), 394 | nn.ReLU(inplace=True), 395 | Conv1d(Fd, Fd, kernel_size=1, padding=0, dilation=1, std_mul=2.0), 396 | nn.ReLU(inplace=True), 397 | 398 | Conv1d(Fd, Fd, kernel_size=1, padding=0, dilation=1, std_mul=2.0), 399 | nn.Sigmoid(), 400 | ) 401 | 402 | def forward(self, x, speaker_embed=None): 403 | return self.convnet(x.transpose(1, 2)).transpose(1, 2) 404 | -------------------------------------------------------------------------------- /dv3/deepvoice3_pytorch/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1+6645b31' 2 | -------------------------------------------------------------------------------- /dv3/deepvoice3_vctk.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "en", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3_multispeaker", 6 | "n_speakers": 108, 7 | "speaker_embed_dim": 512, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.05, 24 | "padding_idx": 0, 25 | "max_positions": 1024, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 2.0, 33 | "key_position_rate": 7.6, 34 | "key_projection": true, 35 | "value_projection": true, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.4, 48 | "batch_size": 16, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } -------------------------------------------------------------------------------- /dv3/hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # NOTE: If you want full control for model architecture. please take a look 4 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 
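# Illustrative sketch (not from the repository): hparams below is a
# tf.contrib.training.HParams object, so individual values can be overridden
# at run time through its standard parse() hook, which takes a comma-separated
# string of name=value pairs. This snippet assumes TensorFlow 1.x, since
# tf.contrib was removed in TensorFlow 2.x; the override string is only an
# example.
import tensorflow as tf

_hp = tf.contrib.training.HParams(batch_size=16, initial_learning_rate=5e-4)
_hp.parse("batch_size=32,initial_learning_rate=1e-4")
print(_hp.batch_size, _hp.initial_learning_rate)   # 32 0.0001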
5 | 6 | # Default hyperparameters: 7 | hparams = tf.contrib.training.HParams( 8 | name="deepvoice3", 9 | 10 | # Text: 11 | # [en, jp] 12 | frontend='en', 13 | 14 | # Replace words to its pronunciation with fixed probability. 15 | # e.g., 'hello' to 'HH AH0 L OW1' 16 | # [en, jp] 17 | # en: Word -> pronunciation using CMUDict 18 | # jp: Word -> pronounciation usnig MeCab 19 | # [0 ~ 1.0]: 0 means no replacement happens. 20 | replace_pronunciation_prob=0.5, 21 | 22 | # Convenient model builder 23 | # [deepvoice3, deepvoice3_multispeaker, nyanko] 24 | # Definitions can be found at deepvoice3_pytorch/builder.py 25 | # deepvoice3: DeepVoice3 https://arxiv.org/abs/1710.07654 26 | # deepvoice3_multispeaker: Multi-speaker version of DeepVoice3 27 | # nyanko: https://arxiv.org/abs/1710.08969 28 | builder="deepvoice3", 29 | 30 | # Must be configured depends on the dataset and model you use 31 | n_speakers=1, 32 | speaker_embed_dim=128, 33 | 34 | # Presets known to work good. 35 | # NOTE: If specified, override hyper parameters with preset 36 | preset="", 37 | presets={ 38 | "deepvoice3_ljspeech": { 39 | "n_speakers": 1, 40 | "downsample_step": 4, 41 | "outputs_per_step": 1, 42 | "embedding_weight_std": 0.1, 43 | "dropout": 1 - 0.95, 44 | "kernel_size": 3, 45 | "text_embed_dim": 256, 46 | "encoder_channels": 512, 47 | "decoder_channels": 256, 48 | "converter_channels": 256, 49 | "use_guided_attention": True, 50 | "guided_attention_sigma": 0.2, 51 | "binary_divergence_weight": 0.1, 52 | "use_decoder_state_for_postnet_input": True, 53 | "max_positions": 512, 54 | "query_position_rate": 1.0, 55 | "key_position_rate": 1.385, 56 | "key_projection": True, 57 | "value_projection": True, 58 | "clip_thresh": 0.1, 59 | "initial_learning_rate": 5e-4, 60 | }, 61 | "deepvoice3_vctk": { 62 | "n_speakers": 108, 63 | "speaker_embed_dim": 512, 64 | "downsample_step": 4, 65 | "outputs_per_step": 1, 66 | "embedding_weight_std": 0.1, 67 | "speaker_embedding_weight_std": 0.05, 68 | "dropout": 1 - 0.95, 69 | "kernel_size": 3, 70 | "text_embed_dim": 256, 71 | "encoder_channels": 512, 72 | "decoder_channels": 256, 73 | "converter_channels": 256, 74 | "use_guided_attention": True, 75 | "guided_attention_sigma": 0.4, 76 | "binary_divergence_weight": 0.1, 77 | "use_decoder_state_for_postnet_input": True, 78 | "max_positions": 1024, 79 | "query_position_rate": 2.0, 80 | "key_position_rate": 7.6, 81 | "key_projection": True, 82 | "value_projection": True, 83 | "clip_thresh": 0.1, 84 | "initial_learning_rate": 5e-4, 85 | }, 86 | "deepvoice3_speaker_adaptation_vctk": { 87 | "n_speakers": 1, 88 | "speaker_embed_dim": 128, 89 | "downsample_step": 4, 90 | "outputs_per_step": 1, 91 | "embedding_weight_std": 0.1, 92 | "speaker_embedding_weight_std": 0.05, 93 | "dropout": 1 - 0.95, 94 | "kernel_size": 3, 95 | "text_embed_dim": 256, 96 | "encoder_channels": 512, 97 | "decoder_channels": 256, 98 | "converter_channels": 256, 99 | "use_guided_attention": True, 100 | "guided_attention_sigma": 0.4, 101 | "binary_divergence_weight": 0.1, 102 | "use_decoder_state_for_postnet_input": True, 103 | "max_positions": 1024, 104 | "query_position_rate": 2.0, 105 | "key_position_rate": 7.6, 106 | "key_projection": True, 107 | "value_projection": True, 108 | "clip_thresh": 0.1, 109 | "initial_learning_rate": 5e-4, 110 | }, 111 | "deepvoice3_speaker_adaptation_libri": { 112 | "n_speakers": 9026, 113 | "speaker_embed_dim": 128, 114 | "downsample_step": 4, 115 | "outputs_per_step": 1, 116 | "embedding_weight_std": 0.1, 117 | "speaker_embedding_weight_std": 
0.05, 118 | "dropout": 1 - 0.95, 119 | "kernel_size": 3, 120 | "text_embed_dim": 256, 121 | "encoder_channels": 512, 122 | "decoder_channels": 256, 123 | "converter_channels": 256, 124 | "use_guided_attention": True, 125 | "guided_attention_sigma": 0.4, 126 | "binary_divergence_weight": 0.1, 127 | "use_decoder_state_for_postnet_input": True, 128 | "max_positions": 1024, 129 | "query_position_rate": 2.0, 130 | "key_position_rate": 7.6, 131 | "key_projection": True, 132 | "value_projection": True, 133 | "clip_thresh": 0.1, 134 | "initial_learning_rate": 5e-4, 135 | }, 136 | "nyanko_ljspeech": { 137 | "n_speakers": 1, 138 | "downsample_step": 4, 139 | "outputs_per_step": 1, 140 | "embedding_weight_std": 0.01, 141 | "dropout": 1 - 0.95, 142 | "kernel_size": 3, 143 | "text_embed_dim": 128, 144 | "encoder_channels": 256, 145 | "decoder_channels": 256, 146 | "converter_channels": 256, 147 | "use_guided_attention": True, 148 | "guided_attention_sigma": 0.2, 149 | "binary_divergence_weight": 0.1, 150 | "use_decoder_state_for_postnet_input": True, 151 | "max_positions": 512, 152 | "query_position_rate": 1.0, 153 | "key_position_rate": 1.385, 154 | "key_projection": False, 155 | "value_projection": False, 156 | "clip_thresh": 0.1, 157 | "initial_learning_rate": 5e-4, 158 | }, 159 | }, 160 | 161 | # Audio: 162 | num_mels=80, 163 | fft_size=1024, 164 | hop_size=256, 165 | sample_rate=22050, 166 | preemphasis=0.97, 167 | min_level_db=-100, 168 | ref_level_db=20, 169 | 170 | # Model: 171 | downsample_step=4, # must be 4 when builder="nyanko" 172 | outputs_per_step=1, # must be 1 when builder="nyanko" 173 | embedding_weight_std=0.1, 174 | speaker_embedding_weight_std=0.01, 175 | padding_idx=0, 176 | # Maximum number of input text length 177 | # try setting larger value if you want to give very long text input 178 | max_positions=512, 179 | dropout=1 - 0.95, 180 | kernel_size=3, 181 | text_embed_dim=128, 182 | encoder_channels=256, 183 | decoder_channels=256, 184 | # Note: large converter channels requires significant computational cost 185 | converter_channels=256, 186 | query_position_rate=1.0, 187 | key_position_rate=1.385, # 2.37 for jsut 188 | key_projection=False, 189 | value_projection=False, 190 | use_memory_mask=True, 191 | trainable_positional_encodings=False, 192 | freeze_embedding=False, 193 | # If True, use decoder's internal representation for postnet inputs, 194 | # otherwise use mel-spectrogram. 195 | use_decoder_state_for_postnet_input=True, 196 | 197 | # Data loader 198 | pin_memory=True, 199 | num_workers=2, 200 | 201 | # Loss 202 | masked_loss_weight=0.5, # (1-w)*loss + w * masked_loss 203 | priority_freq=3000, # heuristic: priotrize [0 ~ priotiry_freq] for linear loss 204 | priority_freq_weight=0.0, # (1-w)*linear_loss + w*priority_linear_loss 205 | # https://arxiv.org/pdf/1710.08969.pdf 206 | # Adding the divergence to the loss stabilizes training, expecially for 207 | # very deep (> 10 layers) networks. 208 | # Binary div loss seems has approx 10x scale compared to L1 loss, so I choose 0.1. 
209 | binary_divergence_weight=0.1, # set 0 to disable 210 | use_guided_attention=True, 211 | guided_attention_sigma=0.2, 212 | 213 | # Training: 214 | batch_size=16, 215 | adam_beta1=0.5, 216 | adam_beta2=0.9, 217 | adam_eps=1e-6, 218 | initial_learning_rate=5e-4, # 0.001, 219 | lr_schedule="noam_learning_rate_decay", 220 | lr_schedule_kwargs={}, 221 | nepochs=2000, 222 | weight_decay=0.0, 223 | clip_thresh=0.1, 224 | 225 | # Save 226 | checkpoint_interval=10000, 227 | eval_interval=10000, 228 | save_optimizer_state=True, 229 | 230 | # Eval: 231 | # this can be list for multple layers of attention 232 | # e.g., [True, False, False, False, True] 233 | force_monotonic_attention=True, 234 | # Attention constraint for incremental decoding 235 | window_ahead=3, 236 | # 0 tends to prevent word repretetion, but sometime causes skip words 237 | window_backward=1, 238 | power=1.4, # Power to raise magnitudes to prior to phase retrieval 239 | ) 240 | 241 | 242 | def hparams_debug_string(): 243 | values = hparams.values() 244 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 245 | return 'Hyperparameters:\n' + '\n'.join(hp) -------------------------------------------------------------------------------- /dv3/jsut.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import dv3.audio 6 | from nnmnkwii.datasets import jsut 7 | from nnmnkwii.io import hts 8 | from dv3.hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | 13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 14 | executor = ProcessPoolExecutor(max_workers=num_workers) 15 | futures = [] 16 | 17 | transcriptions = jsut.TranscriptionDataSource( 18 | in_dir, subsets=jsut.available_subsets).collect_files() 19 | wav_paths = jsut.WavFileDataSource( 20 | in_dir, subsets=jsut.available_subsets).collect_files() 21 | 22 | for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)): 23 | futures.append(executor.submit( 24 | partial(_process_utterance, out_dir, index + 1, wav_path, text))) 25 | return [future.result() for future in tqdm(futures)] 26 | 27 | 28 | def _process_utterance(out_dir, index, wav_path, text): 29 | sr = hparams.sample_rate 30 | 31 | # Load the audio to a numpy array: 32 | wav = dv3.audio.load_wav(wav_path) 33 | 34 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 35 | 36 | # Trim silence from hts labels if available 37 | if exists(lab_path): 38 | labels = hts.load(lab_path) 39 | assert labels[0][-1] == "silB" 40 | assert labels[-1][-1] == "silE" 41 | b = int(labels[0][1] * 1e-7 * sr) 42 | e = int(labels[-1][0] * 1e-7 * sr) 43 | wav = wav[b:e] 44 | else: 45 | wav, _ = librosa.effects.trim(wav, top_db=30) 46 | 47 | # Compute the linear-scale spectrogram from the wav: 48 | spectrogram =dv3.audio.spectrogram(wav).astype(np.float32) 49 | n_frames = spectrogram.shape[1] 50 | 51 | # Compute a mel-scale spectrogram from the wav: 52 | mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32) 53 | 54 | # Write the spectrograms to disk: 55 | spectrogram_filename = 'jsut-spec-%05d.npy' % index 56 | mel_filename = 'jsut-mel-%05d.npy' % index 57 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 58 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 59 | 60 | # Return a tuple describing this training 
example: 61 | return (spectrogram_filename, mel_filename, n_frames, text) -------------------------------------------------------------------------------- /dv3/ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import dv3.audio 6 | 7 | 8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 10 | 11 | Args: 12 | in_dir: The directory where you have downloaded the LJ Speech dataset 13 | out_dir: The directory to write the output into 14 | num_workers: Optional number of worker processes to parallelize across 15 | tqdm: You can optionally pass tqdm to get a nice progress bar 16 | 17 | Returns: 18 | A list of tuples describing the training examples. This should be written to train.txt 19 | ''' 20 | 21 | # We use ProcessPoolExecutor to parallize across processes. This is just an optimization and you 22 | # can omit it and just call _process_utterance on each input if you want. 23 | executor = ProcessPoolExecutor(max_workers=num_workers) 24 | futures = [] 25 | index = 1 26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 27 | for line in f: 28 | parts = line.strip().split('|') 29 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 30 | text = parts[2] 31 | futures.append(executor.submit( 32 | partial(_process_utterance, out_dir, index, wav_path, text))) 33 | index += 1 34 | return [future.result() for future in tqdm(futures)] 35 | 36 | 37 | def _process_utterance(out_dir, index, wav_path, text): 38 | '''Preprocesses a single utterance audio/text pair. 39 | 40 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 41 | to the train.txt file. 42 | 43 | Args: 44 | out_dir: The directory to write the spectrograms into 45 | index: The numeric index to use in the spectrogram filenames. 
46 | wav_path: Path to the audio file containing the speech input
47 | text: The text spoken in the input audio file
48 |
49 | Returns:
50 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
51 | '''
52 |
53 | # Load the audio to a numpy array:
54 | wav = dv3.audio.load_wav(wav_path)
55 |
56 | # Compute the linear-scale spectrogram from the wav:
57 | spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
58 | n_frames = spectrogram.shape[1]
59 |
60 | # Compute a mel-scale spectrogram from the wav:
61 | mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)
62 |
63 | # Write the spectrograms to disk:
64 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
65 | mel_filename = 'ljspeech-mel-%05d.npy' % index
66 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
67 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
68 |
69 | # Return a tuple describing this training example:
70 | return (spectrogram_filename, mel_filename, n_frames, text)
--------------------------------------------------------------------------------
/dv3/lrschedule.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329
5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000):
6 | # Noam scheme from tensor2tensor:
7 | warmup_steps = float(warmup_steps)
8 | step = global_step + 1.
9 | lr = init_lr * warmup_steps**0.5 * np.minimum(
10 | step * warmup_steps**-1.5, step**-0.5)
11 | return lr
12 |
13 |
14 | def step_learning_rate_decay(init_lr, global_step,
15 | anneal_rate=0.98,
16 | anneal_interval=30000):
17 | return init_lr * anneal_rate ** (global_step // anneal_interval)
18 |
19 |
20 | def cyclic_cosine_annealing(init_lr, global_step, T, M):
21 | """Cyclic cosine annealing
22 |
23 | https://arxiv.org/pdf/1704.00109.pdf
24 |
25 | Args:
26 | init_lr (float): Initial learning rate
27 | global_step (int): Current iteration number
28 | T (int): Total iteration number (i.e., nepochs)
29 | M (int): Number of ensembles we want
30 |
31 | Returns:
32 | float: Annealed learning rate
33 | """
34 | TdivM = T // M
35 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0)
--------------------------------------------------------------------------------
/dv3/preprocess.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Preprocess dataset
4 | usage: preprocess.py [options] <name> <in_dir> <out_dir>
5 | options:
6 | --num_workers=<n> Num workers.
7 | -h, --help Show help message.
8 | """
9 | from docopt import docopt
10 | import os
11 | from multiprocessing import cpu_count
12 | from tqdm import tqdm
13 | import importlib
14 | from dv3.hparams import hparams
15 |
16 |
17 | def preprocess(mod, in_dir, out_dir, num_workers):
18 | os.makedirs(out_dir, exist_ok=True)
19 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm)
20 | write_metadata(metadata, out_dir)
21 |
22 |
23 | def write_metadata(metadata, out_dir):
24 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
25 | for m in metadata:
26 | f.write('|'.join([str(x) for x in m]) + '\n')
27 | frames = sum([m[2] for m in metadata])
28 | frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000
29 | hours = frames * frame_shift_ms / (3600 * 1000)
30 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
31 | print('Max input length: %d' % max(len(m[3]) for m in metadata))
32 | print('Max output length: %d' % max(m[2] for m in metadata))
33 |
34 |
35 | if __name__ == "__main__":
36 | args = docopt(__doc__)
37 | name = args["<name>"]
38 | in_dir = args["<in_dir>"]
39 | out_dir = args["<out_dir>"]
40 | num_workers = args["--num_workers"]
41 | num_workers = cpu_count() if num_workers is None else int(num_workers)
42 |
43 | assert name in ["jsut", "ljspeech", "vctk"]
44 | mod = importlib.import_module(name)
45 | preprocess(mod, in_dir, out_dir, num_workers)
--------------------------------------------------------------------------------
/dv3/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from setuptools import setup, find_packages
3 | import setuptools.command.develop
4 | import setuptools.command.build_py
5 | import os
6 | import subprocess
7 |
8 | version = '0.0.1'
9 |
10 | # Adapted from https://github.com/pytorch/pytorch
11 | cwd = os.path.dirname(os.path.abspath(__file__))
12 | if os.getenv('TACOTRON_BUILD_VERSION'):
13 | version = os.getenv('TACOTRON_BUILD_VERSION')
14 | else:
15 | try:
16 | sha = subprocess.check_output(
17 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
18 | version += '+' + sha[:7]
19 | except subprocess.CalledProcessError:
20 | pass
21 |
22 |
23 | class build_py(setuptools.command.build_py.build_py):
24 |
25 | def run(self):
26 | self.create_version_file()
27 | setuptools.command.build_py.build_py.run(self)
28 |
29 | @staticmethod
30 | def create_version_file():
31 | global version, cwd
32 | print('-- Building version ' + version)
33 | version_path = os.path.join(cwd, 'deepvoice3_pytorch', 'version.py')
34 | with open(version_path, 'w') as f:
35 | f.write("__version__ = '{}'\n".format(version))
36 |
37 |
38 | class develop(setuptools.command.develop.develop):
39 |
40 | def run(self):
41 | build_py.create_version_file()
42 | setuptools.command.develop.develop.run(self)
43 |
44 |
45 | setup(name='deepvoice3_pytorch',
46 | version=version,
47 | description='PyTorch implementation of Tacotron speech synthesis model.',
48 | packages=find_packages(),
49 | cmdclass={
50 | 'build_py': build_py,
51 | 'develop': develop,
52 | },
53 | install_requires=[
54 | "numpy",
55 | "scipy",
56 | "unidecode",
57 | "inflect",
58 | "librosa",
59 | "numba",
60 | "lws <= 1.0",
61 | ],
62 | extras_require={
63 | "train": [
64 | "docopt",
65 | "tqdm",
66 | "tensorboardX",
67 | "nnmnkwii >= 0.0.9",
68 | "nltk",
69 | ],
70 | "test": [
71 | "nose",
72 | ],
73 | "jp": [
74 | "jaconv",
75 | "mecab-python3",
76 | ],
77 | })
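The preprocessing entry point above is a thin docopt wrapper around each dataset module's `build_from_path`. A minimal sketch of driving it directly from Python for LJSpeech follows; the two paths are illustrative assumptions, not values shipped with the repository.

```
# Sketch: mirrors what dv3/preprocess.py's __main__ block does for "ljspeech".
import os
from tqdm import tqdm
from dv3 import ljspeech  # the dataset module preprocess.py resolves by name via importlib

in_dir = "/path/to/LJSpeech"   # assumed location of metadata.csv and wavs/
out_dir = "./data/ljspeech"    # assumed output directory for the .npy files and train.txt

os.makedirs(out_dir, exist_ok=True)
metadata = ljspeech.build_from_path(in_dir, out_dir, num_workers=4, tqdm=tqdm)
# Each entry is (spectrogram_filename, mel_filename, n_frames, text);
# write_metadata() joins these with '|' and writes them to out_dir/train.txt.
```

The equivalent command line would be something like `python dv3/preprocess.py ljspeech /path/to/LJSpeech ./data/ljspeech --num_workers=4`, assuming the repository root is on `PYTHONPATH` so that the `dv3` package resolves.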
--------------------------------------------------------------------------------
/dv3/synthesis.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Synthesis waveform from trained model.
4 |
5 | usage: synthesis.py [options] <checkpoint> <text_list_file> <dst_dir>
6 |
7 | options:
8 | --hparams=<params> Hyper parameters [default: ].
9 | --checkpoint-seq2seq=<path> Load seq2seq model from checkpoint path.
10 | --checkpoint-postnet=<path> Load postnet model from checkpoint path.
11 | --file-name-suffix=<s> File name suffix [default: ].
12 | --max-decoder-steps=<N> Max decoder steps [default: 500].
13 | --replace_pronunciation_prob=<N> Prob [default: 0.0].
14 | --speaker_id=<id> Speaker ID (for multi-speaker model).
15 | --output-html Output html for blog post.
16 | -h, --help Show help message.
17 | """
18 | from docopt import docopt
19 |
20 | import sys
21 | import os
22 | from os.path import dirname, join, basename, splitext
23 |
24 | import dv3.audio
25 |
26 | import torch
27 | from torch.autograd import Variable
28 | import numpy as np
29 | import nltk
30 |
31 | # The deepvoice3 model
32 | from dv3.deepvoice3_pytorch import frontend
33 | from dv3.hparams import hparams
34 |
35 | from tqdm import tqdm
36 |
37 | use_cuda = torch.cuda.is_available()
38 | _frontend = None # to be set later
39 |
40 |
41 | def tts(model, text, p=0, speaker_id=None, fast=False):
42 | """Convert text to speech waveform given a deepvoice3 model.
43 |
44 | Args:
45 | text (str) : Input text to be synthesized
46 | p (float) : Replace a word with its pronunciation if p > 0. Default is 0.
47 | """
48 | if use_cuda:
49 | model = model.cuda()
50 | model.eval()
51 | if fast:
52 | model.make_generation_fast_()
53 |
54 | sequence = np.array(_frontend.text_to_sequence(text, p=p))
55 | sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
56 | text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
57 | text_positions = Variable(text_positions)
58 | speaker_ids = None if speaker_id is None else Variable(torch.LongTensor([speaker_id]))
59 | if use_cuda:
60 | sequence = sequence.cuda()
61 | text_positions = text_positions.cuda()
62 | speaker_ids = None if speaker_ids is None else speaker_ids.cuda()
63 |
64 | # Greedy decoding
65 | mel_outputs, linear_outputs, alignments, done = model(
66 | sequence, text_positions=text_positions, speaker_ids=speaker_ids)
67 |
68 | linear_output = linear_outputs[0].cpu().data.numpy()
69 | spectrogram = dv3.audio._denormalize(linear_output)
70 | alignment = alignments[0].cpu().data.numpy()
71 | mel = mel_outputs[0].cpu().data.numpy()
72 | mel = dv3.audio._denormalize(mel)
73 |
74 | # Predicted audio signal
75 | waveform = dv3.audio.inv_spectrogram(linear_output.T)
76 |
77 | return waveform, alignment, spectrogram, mel
78 |
79 |
80 | if __name__ == "__main__":
81 | args = docopt(__doc__)
82 | print("Command line args:\n", args)
83 | checkpoint_path = args["<checkpoint>"]
84 | text_list_file_path = args["<text_list_file>"]
85 | dst_dir = args["<dst_dir>"]
86 | checkpoint_seq2seq_path = args["--checkpoint-seq2seq"]
87 | checkpoint_postnet_path = args["--checkpoint-postnet"]
88 | max_decoder_steps = int(args["--max-decoder-steps"])
89 | file_name_suffix = args["--file-name-suffix"]
90 | replace_pronunciation_prob = float(args["--replace_pronunciation_prob"])
91 | output_html = args["--output-html"]
92 | speaker_id = args["--speaker_id"]
93 | if speaker_id is not None:
94 | speaker_id = int(speaker_id)
95 |
96 | # Override hyper parameters
97 | hparams.parse(args["--hparams"])
98 | assert hparams.name == "deepvoice3"
99 |
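# Note on the override above: hparams is a tf.contrib.training.HParams-style object,
# so --hparams takes a comma-separated list of name=value pairs, for example
# --hparams="batch_size=8,power=1.2" (illustrative values, not defaults).
# Overriding "name" itself would trip the assert on the preceding line.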
100 | # Presets 101 | if hparams.preset is not None and hparams.preset != "": 102 | preset = hparams.presets[hparams.preset] 103 | import json 104 | hparams.parse_json(json.dumps(preset)) 105 | print("Override hyper parameters with preset \"{}\": {}".format( 106 | hparams.preset, json.dumps(preset, indent=4))) 107 | 108 | _frontend = getattr(frontend, hparams.frontend) 109 | import dv3.train 110 | dv3.train._frontend = _frontend 111 | from dv3.train import plot_alignment, build_model 112 | 113 | # Model 114 | model = build_model() 115 | 116 | # Load checkpoints separately 117 | if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None: 118 | checkpoint = torch.load(checkpoint_seq2seq_path) 119 | model.seq2seq.load_state_dict(checkpoint["state_dict"]) 120 | checkpoint = torch.load(checkpoint_postnet_path) 121 | model.postnet.load_state_dict(checkpoint["state_dict"]) 122 | checkpoint_name = splitext(basename(checkpoint_seq2seq_path))[0] 123 | else: 124 | checkpoint = torch.load(checkpoint_path) 125 | model.load_state_dict(checkpoint["state_dict"]) 126 | checkpoint_name = splitext(basename(checkpoint_path))[0] 127 | 128 | model.seq2seq.decoder.max_decoder_steps = max_decoder_steps 129 | 130 | os.makedirs(dst_dir, exist_ok=True) 131 | with open(text_list_file_path, "rb") as f: 132 | lines = f.readlines() 133 | for idx, line in enumerate(lines): 134 | text = line.decode("utf-8")[:-1] 135 | words = nltk.word_tokenize(text) 136 | waveform, alignment, _, _ = tts( 137 | model, text, p=replace_pronunciation_prob, speaker_id=speaker_id, fast=True) 138 | dst_wav_path = join(dst_dir, "{}_{}{}.wav".format( 139 | idx, checkpoint_name, file_name_suffix)) 140 | dst_alignment_path = join( 141 | dst_dir, "{}_{}{}_alignment.png".format(idx, checkpoint_name, 142 | file_name_suffix)) 143 | plot_alignment(alignment.T, dst_alignment_path, 144 | info="{}, {}".format(hparams.builder, basename(checkpoint_path))) 145 | dv3.audio.save_wav(waveform, dst_wav_path) 146 | from os.path import basename, splitext 147 | name = splitext(basename(text_list_file_path))[0] 148 | if output_html: 149 | print(""" 150 | {} 151 | 152 | ({} chars, {} words) 153 | 154 | 158 | 159 |
160 | """.format(text, len(text), len(words), 161 | hparams.builder, name, basename(dst_wav_path), 162 | hparams.builder, name, basename(dst_alignment_path))) 163 | else: 164 | print(idx, ": {}\n ({} chars, {} words)".format(text, len(text), len(words))) 165 | 166 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 167 | sys.exit(0) -------------------------------------------------------------------------------- /dv3/tests/test_conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | from torch.nn import functional as F 8 | from dv3.deepvoice3_pytorch.conv import Conv1d 9 | 10 | 11 | def test_conv1d_incremental(): 12 | def __test(kernel_size, dilation, T, B, C, causual=True): 13 | kernel_size = 3 14 | dilation = (dilation,) 15 | 16 | # dilation = (4,) 17 | # causual 18 | assert causual 19 | if causual: 20 | padding = (kernel_size - 1) * dilation[0] 21 | else: 22 | padding = (kernel_size - 1) // 2 * dilation[0] 23 | 24 | # weight: (Cout, Cin, K) 25 | conv = nn.Conv1d( 26 | C, C * 2, kernel_size=kernel_size, padding=padding, 27 | dilation=dilation).eval() 28 | conv.weight.data.fill_(1.0) 29 | conv.bias.data.zero_() 30 | 31 | # weight: (K, Cin, Cout) 32 | # weight (linearized): (Cout*K, Cin) 33 | conv_online = Conv1d( 34 | C, C * 2, kernel_size=kernel_size, padding=padding, 35 | dilation=dilation).eval() 36 | conv_online.weight.data.fill_(1.0) 37 | conv_online.bias.data.zero_() 38 | 39 | # (B, C, T) 40 | bct = Variable(torch.zeros(B, C, T) + torch.arange(0, T)) 41 | output_conv = conv(bct) 42 | 43 | # Remove future time stamps 44 | output_conv = output_conv[:, :, :T] 45 | 46 | output_conv_online = [] 47 | 48 | # B, T, C 49 | btc = bct.transpose(1, 2).contiguous() 50 | for t in range(btc.size(1)): 51 | input = btc[:, t, :].contiguous().view(B, -1, C) 52 | output = conv_online.incremental_forward(input) 53 | output_conv_online += [output] 54 | 55 | output_conv_online = torch.stack(output_conv_online).squeeze(2) 56 | output_conv_online = output_conv_online.transpose(0, 1).transpose(1, 2) 57 | 58 | assert (output_conv == output_conv_online).all() 59 | 60 | for B in [1, 16]: 61 | for T in [10, 20, 30]: 62 | for C in [1, 2, 4]: 63 | for kernel_size in [3, 5, 9]: 64 | for dilation in [1, 2, 3, 4, 5, 6, 7, 8, 9, 27]: 65 | __test, kernel_size, dilation, T, B, C -------------------------------------------------------------------------------- /dv3/tests/test_deepvoice3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join, exists 6 | 7 | from dv3.deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab 8 | 9 | import torch 10 | from torch.autograd import Variable 11 | from torch import nn 12 | import numpy as np 13 | 14 | from nose.plugins.attrib import attr 15 | 16 | from dv3.deepvoice3_pytorch.builder import deepvoice3 17 | from dv3.deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 18 | 19 | 20 | use_cuda = torch.cuda.is_available() and False 21 | num_mels = 80 22 | num_freq = 513 23 | outputs_per_step = 4 24 | padding_idx = 0 25 | 26 | 27 | def _get_model(n_speakers=1, speaker_embed_dim=None, 28 | force_monotonic_attention=False, 29 | 
use_decoder_state_for_postnet_input=False): 30 | model = deepvoice3(n_vocab=n_vocab, 31 | embed_dim=256, 32 | mel_dim=num_mels, 33 | linear_dim=num_freq, 34 | r=outputs_per_step, 35 | padding_idx=padding_idx, 36 | n_speakers=n_speakers, 37 | speaker_embed_dim=speaker_embed_dim, 38 | dropout=1 - 0.95, 39 | kernel_size=5, 40 | encoder_channels=128, 41 | decoder_channels=256, 42 | converter_channels=256, 43 | force_monotonic_attention=force_monotonic_attention, 44 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 45 | ) 46 | return model 47 | 48 | 49 | def _pad(seq, max_len): 50 | return np.pad(seq, (0, max_len - len(seq)), 51 | mode='constant', constant_values=0) 52 | 53 | 54 | def _test_data(): 55 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 56 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 57 | input_lengths = np.array([len(s) for s in seqs]) 58 | max_len = np.max(input_lengths) 59 | seqs = np.array([_pad(s, max_len) for s in seqs]) 60 | 61 | # Test encoder 62 | x = Variable(torch.LongTensor(seqs)) 63 | y = Variable(torch.rand(x.size(0), 12, 80)) 64 | 65 | return x, y 66 | 67 | 68 | def _deepvoice3(n_vocab, embed_dim=256, mel_dim=80, 69 | linear_dim=4096, r=5, 70 | n_speakers=1, speaker_embed_dim=16, 71 | padding_idx=None, 72 | dropout=(1 - 0.95), dilation=1): 73 | 74 | from dv3.deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 75 | h = 128 76 | encoder = Encoder( 77 | n_vocab, embed_dim, padding_idx=padding_idx, 78 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 79 | dropout=dropout, 80 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 81 | (h, 3, dilation), (h, 3, dilation)], 82 | ) 83 | 84 | h = 256 85 | decoder = Decoder( 86 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 87 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 88 | dropout=dropout, 89 | preattention=[(h, 3, 1)], 90 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 91 | (h, 3, dilation), (h, 3, dilation)], 92 | attention=[True, False, False, False, True], 93 | force_monotonic_attention=False) 94 | 95 | seq2seq = AttentionSeq2Seq(encoder, decoder) 96 | 97 | in_dim = mel_dim 98 | h = 256 99 | converter = Converter(n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 100 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 101 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 102 | (h, 3, dilation), (h, 3, dilation)]) 103 | 104 | model = MultiSpeakerTTSModel( 105 | seq2seq, converter, padding_idx=padding_idx, 106 | mel_dim=mel_dim, linear_dim=linear_dim, 107 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim) 108 | 109 | return model 110 | 111 | 112 | def test_single_speaker_deepvoice3(): 113 | x, y = _test_data() 114 | 115 | for v in [False, True]: 116 | model = _get_model(use_decoder_state_for_postnet_input=v) 117 | mel_outputs, linear_outputs, alignments, done = model(x, y) 118 | 119 | 120 | def _pad_2d(x, max_len, b_pad=0): 121 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)], 122 | mode="constant", constant_values=0) 123 | return x 124 | 125 | 126 | def test_multi_speaker_deepvoice3(): 127 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 128 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 129 | input_lengths = np.array([len(s) for s in seqs]) 130 | max_len = np.max(input_lengths) 131 | seqs = np.array([_pad(s, max_len) for s in seqs]) 132 | 133 | # Test encoder 134 | x = 
Variable(torch.LongTensor(seqs)) 135 | y = Variable(torch.rand(x.size(0), 4 * 33, 80)) 136 | model = _get_model(n_speakers=32, speaker_embed_dim=16) 137 | speaker_ids = Variable(torch.LongTensor([1, 2, 3])) 138 | 139 | mel_outputs, linear_outputs, alignments, done = model(x, y, speaker_ids=speaker_ids) 140 | print("Input text:", x.size()) 141 | print("Input mel:", y.size()) 142 | print("Mel:", mel_outputs.size()) 143 | print("Linear:", linear_outputs.size()) 144 | print("Alignments:", alignments.size()) 145 | print("Done:", done.size()) 146 | 147 | 148 | @attr("local_only") 149 | def test_incremental_correctness(): 150 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 151 | seqs = np.array([text_to_sequence(t) for t in texts]) 152 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 153 | 154 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 155 | max_target_len = mel.shape[0] 156 | r = 4 157 | mel_dim = 80 158 | if max_target_len % r != 0: 159 | max_target_len += r - max_target_len % r 160 | assert max_target_len % r == 0 161 | mel = _pad_2d(mel, max_target_len) 162 | mel = Variable(torch.from_numpy(mel)) 163 | mel_reshaped = mel.view(1, -1, mel_dim * r) 164 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 165 | 166 | x = Variable(torch.LongTensor(seqs)) 167 | text_positions = Variable(torch.LongTensor(text_positions)) 168 | frame_positions = Variable(torch.LongTensor(frame_positions)) 169 | 170 | for model, speaker_ids in [ 171 | (_get_model(force_monotonic_attention=False), None), 172 | (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), Variable(torch.LongTensor([1])))]: 173 | model.eval() 174 | 175 | if speaker_ids is not None: 176 | speaker_embed = model.embed_speakers(speaker_ids) 177 | else: 178 | speaker_embed = None 179 | 180 | # Encoder 181 | encoder_outs = model.seq2seq.encoder(x, speaker_embed=speaker_embed) 182 | 183 | # Off line decoding 184 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( 185 | encoder_outs, mel_reshaped, speaker_embed=speaker_embed, 186 | text_positions=text_positions, frame_positions=frame_positions) 187 | 188 | # Online decoding with test inputs 189 | model.seq2seq.decoder.start_fresh_sequence() 190 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( 191 | encoder_outs, text_positions, speaker_embed=speaker_embed, 192 | test_inputs=mel_reshaped) 193 | 194 | # Should get same result 195 | c = (mel_outputs_offline - mel_outputs_online).abs() 196 | print(c.mean(), c.max()) 197 | 198 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(), 199 | mel_outputs_online.cpu().data.numpy(), atol=1e-5) 200 | 201 | 202 | @attr("local_only") 203 | def test_incremental_forward(): 204 | checkpoint_path = join(dirname(__file__), "../test_whole/checkpoint_step000265000.pth") 205 | if not exists(checkpoint_path): 206 | return 207 | model = _get_model() 208 | 209 | use_cuda = False 210 | 211 | checkpoint = torch.load(checkpoint_path) 212 | model.load_state_dict(checkpoint["state_dict"]) 213 | model.make_generation_fast_() 214 | model = model.cuda() if use_cuda else model 215 | 216 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 217 | seqs = np.array([text_to_sequence(t) for t in texts]) 218 | input_lengths = [len(s) for s in seqs] 219 | 220 | use_manual_padding = False 
221 | if use_manual_padding: 222 | max_input_len = np.max(input_lengths) + 10 # manuall padding 223 | seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int) 224 | input_lengths = torch.LongTensor(input_lengths) 225 | input_lengths = input_lengths.cuda() if use_cuda else input_lenghts 226 | else: 227 | input_lengths = None 228 | 229 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 230 | 231 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 232 | max_target_len = mel.shape[0] 233 | r = 4 234 | mel_dim = 80 235 | if max_target_len % r != 0: 236 | max_target_len += r - max_target_len % r 237 | assert max_target_len % r == 0 238 | mel = _pad_2d(mel, max_target_len) 239 | mel = Variable(torch.from_numpy(mel)) 240 | mel_reshaped = mel.view(1, -1, mel_dim * r) 241 | 242 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 243 | 244 | x = Variable(torch.LongTensor(seqs)) 245 | text_positions = Variable(torch.LongTensor(text_positions)) 246 | frame_positions = Variable(torch.LongTensor(frame_positions)) 247 | 248 | if use_cuda: 249 | x = x.cuda() 250 | text_positions = text_positions.cuda() 251 | frame_positions = frame_positions.cuda() 252 | mel_reshaped = mel_reshaped.cuda() 253 | 254 | model.eval() 255 | 256 | def _plot(mel, mel_predicted, alignments): 257 | from matplotlib import pylab as plt 258 | plt.figure(figsize=(16, 10)) 259 | plt.subplot(3, 1, 1) 260 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma") 261 | plt.colorbar() 262 | 263 | plt.subplot(3, 1, 2) 264 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, 265 | origin="lower bottom", aspect="auto", cmap="magma") 266 | plt.colorbar() 267 | 268 | plt.subplot(3, 1, 3) 269 | if alignments.dim() == 4: 270 | alignments = alignments.mean(0) 271 | plt.imshow(alignments[0].data.cpu( 272 | ).numpy().T, origin="lower bottom", aspect="auto") 273 | plt.colorbar() 274 | plt.show() 275 | 276 | # Encoder 277 | encoder_outs = model.seq2seq.encoder(x, lengths=input_lengths) 278 | 279 | # Off line decoding 280 | mel_output_offline, alignments_offline, done = model.seq2seq.decoder( 281 | encoder_outs, mel_reshaped, 282 | text_positions=text_positions, frame_positions=frame_positions, 283 | lengths=input_lengths) 284 | 285 | _plot(mel, mel_output_offline, alignments_offline) 286 | 287 | # Online decoding 288 | test_inputs = None 289 | # test_inputs = mel_reshaped 290 | model.seq2seq.decoder.start_fresh_sequence() 291 | mel_outputs, alignments, dones_online = model.seq2seq.decoder.incremental_forward( 292 | encoder_outs, text_positions, 293 | # initial_input=mel_reshaped[:, :1, :], 294 | test_inputs=test_inputs) 295 | 296 | if test_inputs is not None: 297 | c = (mel_output_offline - mel_outputs).abs() 298 | print(c.mean(), c.max()) 299 | _plot(mel, c, alignments) 300 | 301 | _plot(mel, mel_outputs, alignments) -------------------------------------------------------------------------------- /dv3/tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | from dv3.deepvoice3_pytorch.modules import SinusoidalEncoding, position_encoding_init 8 | import numpy as np 9 | 10 | 11 | def test_sinusoidal(): 12 | num_embedding = 512 13 | embedding_dim = 128 14 
| padding_idx = 0 15 | 16 | for w in [1.0, 0.5, 2.0, 10.0, 20.0]: 17 | a = nn.Embedding(num_embedding, embedding_dim, padding_idx=padding_idx) 18 | a.weight.data = position_encoding_init( 19 | num_embedding, embedding_dim, position_rate=w) 20 | 21 | b = SinusoidalEncoding(num_embedding, embedding_dim, padding_idx=padding_idx) 22 | 23 | x = Variable(torch.arange(0, 128).long()) 24 | ax = a(x).data.numpy() 25 | bx = b(x, w).data.numpy() 26 | 27 | print(w, np.abs(ax - bx).mean()) 28 | try: 29 | assert np.allclose(ax, bx) 30 | except: 31 | print("TODO: has little numerical errors?") 32 | assert np.abs(ax - bx).mean() < 1e-5 33 | -------------------------------------------------------------------------------- /dv3/tests/test_frontend.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | from dv3.deepvoice3_pytorch import frontend 5 | from nose.plugins.attrib import attr 6 | 7 | eos = 1 8 | 9 | 10 | def test_en(): 11 | f = getattr(frontend, "en") 12 | seq = f.text_to_sequence("hello world.") 13 | assert seq[-1] == eos 14 | t = f.sequence_to_text(seq) 15 | assert t == "hello world.~" 16 | 17 | 18 | def test_ja(): 19 | f = getattr(frontend, "jp") 20 | seq = f.text_to_sequence("こんにちわ") 21 | assert seq[-1] == eos 22 | t = f.sequence_to_text(seq) 23 | assert t[:-1] == "コンニチワ。" 24 | 25 | 26 | @attr("local_only") 27 | def test_en_lj(): 28 | f = getattr(frontend, "en") 29 | from nnmnkwii.datasets import ljspeech 30 | from tqdm import trange 31 | import jaconv 32 | 33 | d = ljspeech.TranscriptionDataSource("/home/ryuichi/data/LJSpeech-1.0") 34 | texts = d.collect_files() 35 | 36 | for p in [0.0, 0.5, 1.0]: 37 | for idx in trange(len(texts)): 38 | text = texts[idx] 39 | seq = f.text_to_sequence(text, p=p) 40 | assert seq[-1] == eos 41 | t = f.sequence_to_text(seq) 42 | 43 | if idx < 10: 44 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t)) 45 | 46 | 47 | @attr("local_only") 48 | def test_ja_jsut(): 49 | f = getattr(frontend, "jp") 50 | from nnmnkwii.datasets import jsut 51 | from tqdm import trange 52 | import jaconv 53 | 54 | d = jsut.TranscriptionDataSource("/home/ryuichi/data/jsut_ver1.1/", 55 | subsets=jsut.available_subsets) 56 | texts = d.collect_files() 57 | 58 | for p in [0.0, 0.5, 1.0]: 59 | for idx in trange(len(texts)): 60 | text = texts[idx] 61 | seq = f.text_to_sequence(text, p=p) 62 | assert seq[-1] == eos 63 | t = f.sequence_to_text(seq) 64 | 65 | if idx < 10: 66 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t)) -------------------------------------------------------------------------------- /dv3/tests/test_nyanko.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join, exists 6 | 7 | from dv3.deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab 8 | 9 | import torch 10 | from torch.autograd import Variable 11 | from torch import nn 12 | import numpy as np 13 | 14 | from nose.plugins.attrib import attr 15 | 16 | from dv3.deepvoice3_pytorch.builder import nyanko 17 | from dv3.deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 18 | 19 | use_cuda = torch.cuda.is_available() 20 | num_mels = 80 21 | num_freq = 513 22 | outputs_per_step = 4 23 | padding_idx = 0 24 | 25 | 26 | def _pad(seq, max_len): 27 | return np.pad(seq, (0, max_len - len(seq)), 28 
| mode='constant', constant_values=0) 29 | 30 | 31 | def _test_data(): 32 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 33 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 34 | input_lengths = np.array([len(s) for s in seqs]) 35 | max_len = np.max(input_lengths) 36 | seqs = np.array([_pad(s, max_len) for s in seqs]) 37 | 38 | # Test encoder 39 | x = Variable(torch.LongTensor(seqs)) 40 | y = Variable(torch.rand(x.size(0), 12, 80)) 41 | 42 | return x, y 43 | 44 | 45 | def _pad_2d(x, max_len, b_pad=0): 46 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)], 47 | mode="constant", constant_values=0) 48 | return x 49 | 50 | 51 | def test_nyanko_basics(): 52 | x, y = _test_data() 53 | 54 | for v in [False, True]: 55 | model = nyanko(n_vocab, mel_dim=num_mels, linear_dim=num_freq, r=1, downsample_step=4, 56 | use_decoder_state_for_postnet_input=v) 57 | mel_outputs, linear_outputs, alignments, done = model(x, y) 58 | 59 | 60 | @attr("local_only") 61 | def test_incremental_correctness(): 62 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 63 | seqs = np.array([text_to_sequence(t) for t in texts]) 64 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 65 | 66 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 67 | max_target_len = mel.shape[0] 68 | r = 1 69 | mel_dim = 80 70 | if max_target_len % r != 0: 71 | max_target_len += r - max_target_len % r 72 | assert max_target_len % r == 0 73 | mel = _pad_2d(mel, max_target_len) 74 | mel = Variable(torch.from_numpy(mel)) 75 | mel_reshaped = mel.view(1, -1, mel_dim * r) 76 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 77 | 78 | x = Variable(torch.LongTensor(seqs)) 79 | text_positions = Variable(torch.LongTensor(text_positions)) 80 | frame_positions = Variable(torch.LongTensor(frame_positions)) 81 | 82 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, 83 | r=r, force_monotonic_attention=False) 84 | model.eval() 85 | 86 | # Encoder 87 | encoder_outs = model.seq2seq.encoder(x) 88 | 89 | # Off line decoding 90 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( 91 | encoder_outs, mel_reshaped, 92 | text_positions=text_positions, frame_positions=frame_positions) 93 | 94 | # Online decoding with test inputs 95 | model.seq2seq.decoder.start_fresh_sequence() 96 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( 97 | encoder_outs, text_positions, 98 | test_inputs=mel_reshaped) 99 | 100 | # Should get same result 101 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(), 102 | mel_outputs_online.cpu().data.numpy()) 103 | 104 | 105 | @attr("local_only") 106 | def test_nyanko(): 107 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 108 | seqs = np.array([text_to_sequence(t) for t in texts]) 109 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 110 | 111 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 112 | max_target_len = mel.shape[0] 113 | r = 1 114 | mel_dim = 80 115 | if max_target_len % r != 0: 116 | max_target_len += r - max_target_len % r 117 | assert max_target_len % r == 0 118 | mel = _pad_2d(mel, max_target_len) 119 | mel = Variable(torch.from_numpy(mel)) 120 | mel_reshaped = mel.view(1, -1, mel_dim * r) 121 | frame_positions = 
np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 122 | 123 | x = Variable(torch.LongTensor(seqs)) 124 | text_positions = Variable(torch.LongTensor(text_positions)) 125 | frame_positions = Variable(torch.LongTensor(frame_positions)) 126 | 127 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, 128 | r=r, force_monotonic_attention=False) 129 | model.eval() 130 | 131 | def _plot(mel, mel_predicted, alignments): 132 | from matplotlib import pylab as plt 133 | plt.figure(figsize=(16, 10)) 134 | plt.subplot(3, 1, 1) 135 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma") 136 | plt.colorbar() 137 | 138 | plt.subplot(3, 1, 2) 139 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, 140 | origin="lower bottom", aspect="auto", cmap="magma") 141 | plt.colorbar() 142 | 143 | plt.subplot(3, 1, 3) 144 | if alignments.dim() == 4: 145 | alignments = alignments.mean(0) 146 | plt.imshow(alignments[0].data.cpu( 147 | ).numpy().T, origin="lower bottom", aspect="auto") 148 | plt.colorbar() 149 | plt.show() 150 | 151 | seq2seq = model.seq2seq 152 | 153 | # Encoder 154 | encoder_outs = seq2seq.encoder(x) 155 | 156 | # Off line decoding 157 | print("Offline decoding") 158 | mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder( 159 | encoder_outs, mel_reshaped, 160 | text_positions=text_positions, frame_positions=frame_positions) 161 | 162 | _plot(mel, mel_outputs_offline, alignments_offline) 163 | 164 | # Online decoding with test inputs 165 | print("Online decoding") 166 | seq2seq.decoder.start_fresh_sequence() 167 | mel_outputs_online, alignments, dones_online, _ = seq2seq.decoder.incremental_forward( 168 | encoder_outs, text_positions, 169 | test_inputs=mel_reshaped) 170 | 171 | a = mel_outputs_offline.cpu().data.numpy() 172 | b = mel_outputs_online.cpu().data.numpy() 173 | c = (mel_outputs_offline - mel_outputs_online).abs() 174 | print(c.mean(), c.max()) 175 | 176 | _plot(mel, mel_outputs_offline, alignments_offline) 177 | _plot(mel, mel_outputs_online, alignments) 178 | _plot(mel, c, alignments) 179 | 180 | # Should get same result 181 | assert np.allclose(a, b) 182 | 183 | postnet = model.postnet 184 | 185 | linear_outputs = postnet(mel_outputs_offline) 186 | print(linear_outputs.size()) 187 | -------------------------------------------------------------------------------- /dv3/vctk.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import dv3.audio 6 | from nnmnkwii.datasets import vctk 7 | from nnmnkwii.io import hts 8 | from dv3.hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | 13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 14 | executor = ProcessPoolExecutor(max_workers=num_workers) 15 | futures = [] 16 | 17 | speakers = vctk.available_speakers 18 | 19 | td = vctk.TranscriptionDataSource(in_dir, speakers=speakers) 20 | transcriptions = td.collect_files() 21 | speaker_ids = td.labels 22 | wav_paths = vctk.WavFileDataSource( 23 | in_dir, speakers=speakers).collect_files() 24 | 25 | for index, (speaker_id, text, wav_path) in enumerate( 26 | zip(speaker_ids, transcriptions, wav_paths)): 27 | futures.append(executor.submit( 28 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text))) 29 | return [future.result() for future in tqdm(futures)] 30 | 31 
| 32 | def start_at(labels): 33 | has_silence = labels[0][-1] == "pau" 34 | if not has_silence: 35 | return labels[0][0] 36 | for i in range(1, len(labels)): 37 | if labels[i][-1] != "pau": 38 | return labels[i][0] 39 | assert False 40 | 41 | 42 | def end_at(labels): 43 | has_silence = labels[-1][-1] == "pau" 44 | if not has_silence: 45 | return labels[-1][1] 46 | for i in range(len(labels) - 2, 0, -1): 47 | if labels[i][-1] != "pau": 48 | return labels[i][1] 49 | assert False 50 | 51 | 52 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 53 | sr = hparams.sample_rate 54 | 55 | # Load the audio to a numpy array: 56 | wav = dv3.audio.load_wav(wav_path) 57 | 58 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") 59 | 60 | # Trim silence from hts labels if available 61 | if exists(lab_path): 62 | labels = hts.load(lab_path) 63 | b = int(start_at(labels) * 1e-7 * sr) 64 | e = int(end_at(labels) * 1e-7 * sr) 65 | wav = wav[b:e] 66 | wav, _ = librosa.effects.trim(wav, top_db=25) 67 | else: 68 | wav, _ = librosa.effects.trim(wav, top_db=15) 69 | 70 | # Compute the linear-scale spectrogram from the wav: 71 | spectrogram = dv3.audio.spectrogram(wav).astype(np.float32) 72 | n_frames = spectrogram.shape[1] 73 | 74 | # Compute a mel-scale spectrogram from the wav: 75 | mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32) 76 | 77 | # Write the spectrograms to disk: 78 | spectrogram_filename = 'vctk-spec-%05d.npy' % index 79 | mel_filename = 'vctk-mel-%05d.npy' % index 80 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 81 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 82 | 83 | # Return a tuple describing this training example: 84 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id) -------------------------------------------------------------------------------- /dv3/vctk_preprocess/.gitignore: -------------------------------------------------------------------------------- 1 | latest_features 2 | tts_env.sh 3 | -------------------------------------------------------------------------------- /dv3/vctk_preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing for VCTK 2 | 3 | Wav files in VCTK contains lots of long silences, which affects training char-level seq2seq models. To deal with the problem, we will 4 | 5 | - **Prepare phoneme alignments for all utterances** (code in the directory) 6 | - Cut silences during preprocessing (code in the parent directory) 7 | 8 | ## Note 9 | 10 | Code in the directory heavily relies on https://gist.github.com/kastnerkyle/cc0ac48d34860c5bb3f9112f4d9a0300 (which is hard copied in the repo). If you have any issues, please make sure that you can successfully run the script. 11 | 12 | ## Steps 13 | 14 | 1. Download VCTK: http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html 15 | 2. Install HTK/speech_tools/festival/merlin and prepare `tts_env.sh`. If you don't have speech tools, you can install them by https://gist.github.com/kastnerkyle/001a58a58d090658ee5350cb6129f857. 
For the reference, `tts_env.sh` of mine is: 16 | ``` 17 | export ESTDIR=/home/ryuichi/Dropbox/sp/speech_tools/ 18 | export FESTDIR=/home/ryuichi/Dropbox/sp/festival/ 19 | export FESTVOXDIR=/home/ryuichi/Dropbox/sp/festvox/ 20 | export VCTKDIR=/home/ryuichi/data/VCTK-Corpus/ 21 | export HTKDIR=/usr/local/HTS-2.3/bin/ 22 | export SPTKDIR=/usr/local/bin/ 23 | export MERLINDIR=/home/ryuichi/Dropbox/sp/merlin_pr/ 24 | ``` 25 | 3. Run the script (takes ~24 hours) 26 | ``` 27 | python prepare_vctk_labels.py ${your_vctk_dir} ${dst_dir} 28 | ``` 29 | This will process all utterances of VCTK and copy HTK-style alignments to `${dst_dir}`. 30 | It is recommended to copy alignments to the top of VCTK corpus. i.e., 31 | ``` 32 | python prepare_vctk_labels.py ~/data/VCTK-Corpus ~/data/VCTK-Corpus/lab 33 | ``` 34 | 35 | After the above steps, you will get alignments as follows: 36 | 37 | ``` 38 | tree ~/data/VCTK-Corpus/lab/ | head /home/ryuichi/data/VCTK-Corpus/lab/ 39 | ├── p225 40 | │   ├── p225_001.lab 41 | │   ├── p225_002.lab 42 | │   ├── p225_003.lab 43 | │   ├── p225_004.lab 44 | │   ├── p225_005.lab 45 | │   ├── p225_006.lab 46 | │   ├── p225_007.lab 47 | │   ├── p225_008.lab 48 | ``` 49 | 50 | ``` 51 | cat ~/data/VCTK-Corpus/lab/p225/p225_001.lab 52 | 53 | 0 850000 pau 54 | 850000 2850000 pau 55 | 2850000 3600000 p 56 | 3600000 3900000 l 57 | 3900000 6000000 iy 58 | 6000000 8450000 z 59 | 8450000 8600000 k 60 | 8600000 11300000 ao 61 | 11300000 11450000 l 62 | 11450000 12800000 s 63 | 12800000 13099999 t 64 | 13099999 15800000 eh 65 | 15800000 16050000 l 66 | 16050000 17600000 ax 67 | 17600000 20400000 pau 68 | ``` 69 | 70 | ## Using Gentle? 71 | 72 | `prepare_htk_alignments_vctk.py` do the same things above using [Gentle](https://github.com/lowerquality/gentle), but turned out it seems not very good. Leaving code for future possibility if we can improve. 73 | -------------------------------------------------------------------------------- /dv3/vctk_preprocess/prepare_htk_alignments_vctk.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Script for do force alignment by gentle for VCTK. This script takes approx 4 | ~40 hours to finish. It processes all utterances in VCTK. 5 | 6 | NOTE: Must be run with Python2, since gentle doesn't work with Python3. 7 | 8 | Usage: 9 | 1. Install https://github.com/lowerquality/gentle 10 | 2. 
Download VCTK http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html 11 | 12 | and then run the script by: 13 | 14 | python2 prepare_htk_alignments_vctk.py ${your_vctk_data_path} 15 | 16 | After running the script, you will see alignment files in `lab` directory as 17 | follows: 18 | 19 | > tree ~/data/VCTK-Corpus/ -d -L 20 | 21 | /home/ryuichi/data/VCTK-Corpus/ 22 | ├── lab 23 | ├── txt 24 | └── wav48 25 | """ 26 | import argparse 27 | import logging 28 | import multiprocessing 29 | import os 30 | import sys 31 | from tqdm import tqdm 32 | import json 33 | from os.path import join, basename, dirname, exists 34 | import numpy as np 35 | 36 | import gentle 37 | import librosa 38 | from nnmnkwii.datasets import vctk 39 | 40 | 41 | def on_progress(p): 42 | for k, v in p.items(): 43 | logging.debug("%s: %s" % (k, v)) 44 | 45 | 46 | def write_hts_label(labels, lab_path): 47 | lab = "" 48 | for s, e, l in labels: 49 | s, e = float(s) * 1e7, float(e) * 1e7 50 | s, e = int(s), int(e) 51 | lab += "{} {} {}\n".format(s, e, l) 52 | print(lab) 53 | with open(lab_path, "w") as f: 54 | f.write(lab) 55 | 56 | 57 | def json2hts(data): 58 | emit_bos = False 59 | emit_eos = False 60 | 61 | phone_start = 0 62 | phone_end = None 63 | labels = [] 64 | 65 | for word in data["words"]: 66 | case = word["case"] 67 | if case != "success": 68 | raise RuntimeError("Alignment failed") 69 | start = float(word["start"]) 70 | word_end = float(word["end"]) 71 | 72 | if not emit_bos: 73 | labels.append((phone_start, start, "silB")) 74 | emit_bos = True 75 | 76 | phone_start = start 77 | phone_end = None 78 | for phone in word["phones"]: 79 | ph = str(phone["phone"][:-2]) 80 | duration = float(phone["duration"]) 81 | phone_end = phone_start + duration 82 | labels.append((phone_start, phone_end, ph)) 83 | phone_start += duration 84 | assert np.allclose(phone_end, word_end) 85 | if not emit_eos: 86 | labels.append((phone_start, phone_end, "silE")) 87 | emit_eos = True 88 | 89 | return labels 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser( 94 | description='Do force alignment for VCTK and save HTK-style alignments') 95 | parser.add_argument( 96 | '--nthreads', default=multiprocessing.cpu_count(), type=int, 97 | help='number of alignment threads') 98 | parser.add_argument( 99 | '--conservative', dest='conservative', action='store_true', 100 | help='conservative alignment') 101 | parser.set_defaults(conservative=False) 102 | parser.add_argument( 103 | '--disfluency', dest='disfluency', action='store_true', 104 | help='include disfluencies (uh, um) in alignment') 105 | parser.set_defaults(disfluency=False) 106 | parser.add_argument( 107 | '--log', default="INFO", 108 | help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)') 109 | parser.add_argument('data_root', type=str, help='Data root') 110 | 111 | args = parser.parse_args() 112 | 113 | log_level = args.log.upper() 114 | logging.getLogger().setLevel(log_level) 115 | disfluencies = set(['uh', 'um']) 116 | 117 | data_root = args.data_root 118 | 119 | # Do for all speakers 120 | speakers = vctk.available_speakers 121 | 122 | # Collect all transcripts/wav files 123 | td = vctk.TranscriptionDataSource(data_root, speakers=speakers) 124 | transcriptions = td.collect_files() 125 | wav_paths = vctk.WavFileDataSource( 126 | data_root, speakers=speakers).collect_files() 127 | 128 | # Save dir 129 | save_dir = join(data_root, "lab") 130 | if not exists(save_dir): 131 | os.makedirs(save_dir) 132 | 133 | resources = gentle.Resources() 134 | 
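# Summary of the loop below: for each VCTK utterance, resample the wav with gentle,
# run forced alignment against its transcript, convert the aligner's JSON output to
# HTK-style labels with json2hts(), extend the final label to the true end of the
# audio, and write the result as a .lab file under the corpus' lab/ directory.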
135 | for idx in tqdm(range(len(wav_paths))):
136 | transcript = transcriptions[idx]
137 | audiofile = wav_paths[idx]
138 | lab_path = audiofile.replace("wav48/", "lab/").replace(".wav", ".lab")
139 | print(transcript)
140 | print(audiofile)
141 | print(lab_path)
142 | lab_dir = dirname(lab_path)
143 | if not exists(lab_dir):
144 | os.makedirs(lab_dir)
145 |
146 | logging.info("converting audio to 8K sampled wav")
147 | with gentle.resampled(audiofile) as wavfile:
148 | logging.info("starting alignment")
149 | aligner = gentle.ForcedAligner(resources, transcript,
150 | nthreads=args.nthreads,
151 | disfluency=args.disfluency,
152 | conservative=args.conservative,
153 | disfluencies=disfluencies)
154 | result = aligner.transcribe(
155 | wavfile, progress_cb=on_progress, logging=logging)
156 |
157 | # convert to htk format
158 | a = json.loads(result.to_json())
159 | try:
160 | labels = json2hts(a)
161 | except RuntimeError as e:
162 | from warnings import warn
163 | warn(str(e))
164 | continue
165 |
166 | # Insert end time
167 | x, sr = librosa.load(wavfile, sr=8000)
168 | endtime = float(len(x)) / sr
169 | labels[-1] = (labels[-1][0], endtime, labels[-1][-1])
170 |
171 | # write to file
172 | write_hts_label(labels, lab_path)
173 |
--------------------------------------------------------------------------------
/dv3/vctk_preprocess/prepare_vctk_labels.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Prepare HTS alignments for VCTK.
4 |
5 | usage: prepare_vctk_labels.py [options] <data_root> <out_dir>
6 |
7 | options:
8 | -h, --help Show help message.
9 | """
10 | from docopt import docopt
11 | import os
12 | from nnmnkwii.datasets import vctk
13 | from os.path import join, exists, splitext, basename
14 | import sys
15 | from glob import glob
16 |
17 | from subprocess import Popen, PIPE
18 | from tqdm import tqdm
19 |
20 |
21 | def do(cmd):
22 | print(cmd)
23 | p = Popen(cmd, shell=True)
24 | p.wait()
25 |
26 |
27 | if __name__ == "__main__":
28 | args = docopt(__doc__)
29 | data_root = args["<data_root>"]
30 | out_dir = args["<out_dir>"]
31 |
32 | for idx in tqdm(range(len(vctk.available_speakers))):
33 | speaker = vctk.available_speakers[idx]
34 |
35 | wav_root = join(data_root, "wav48/p{}".format(speaker))
36 | txt_root = join(data_root, "txt/p{}".format(speaker))
37 | assert exists(wav_root)
38 | assert exists(txt_root)
39 | print(wav_root, txt_root)
40 |
41 | # Do alignments
42 | cmd = "python ./extract_feats.py -w {} -t {}".format(wav_root, txt_root)
43 | do(cmd)
44 |
45 | # Copy
46 | lab_dir = join(out_dir, "p{}".format(speaker))
47 | if not exists(lab_dir):
48 | os.makedirs(lab_dir)
49 | cmd = "cp ./latest_features/merlin/misc/scripts/alignment/phone_align/full-context-labels/mono/*.lab {}".format(
50 | lab_dir)
51 | do(cmd)
52 |
53 | # Remove
54 | do("rm -rf ./latest_features")
55 |
56 | sys.exit(0)
57 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup, find_packages
4 | import setuptools.command.develop
5 | import setuptools.command.build_py
6 | import os
7 | import subprocess
8 |
9 | version = '0.0.1'
10 |
11 | # Adapted from https://github.com/pytorch/pytorch
12 | cwd = os.path.dirname(os.path.abspath(__file__))
13 | if os.getenv('TACOTRON_BUILD_VERSION'):
14 | version = os.getenv('TACOTRON_BUILD_VERSION')
15 | else:
16 | try:
17 | sha = subprocess.check_output(
18 | 
['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 19 | version += '+' + sha[:7] 20 | except subprocess.CalledProcessError: 21 | pass 22 | 23 | 24 | class build_py(setuptools.command.build_py.build_py): 25 | 26 | def run(self): 27 | self.create_version_file() 28 | setuptools.command.build_py.build_py.run(self) 29 | 30 | @staticmethod 31 | def create_version_file(): 32 | global version, cwd 33 | print('-- Building version ' + version) 34 | version_path = os.path.join(cwd, 'deepvoice3_pytorch', 'version.py') 35 | with open(version_path, 'w') as f: 36 | f.write("__version__ = '{}'\n".format(version)) 37 | 38 | 39 | class develop(setuptools.command.develop.develop): 40 | 41 | def run(self): 42 | build_py.create_version_file() 43 | setuptools.command.develop.develop.run(self) 44 | 45 | 46 | setup(name='deepvoice3_pytorch', 47 | version=version, 48 | description='PyTorch implementation of Tacotron speech synthesis model.', 49 | packages=find_packages(), 50 | cmdclass={ 51 | 'build_py': build_py, 52 | 'develop': develop, 53 | }, 54 | install_requires=[ 55 | "numpy", 56 | "scipy", 57 | "unidecode", 58 | "inflect", 59 | "librosa", 60 | "numba", 61 | "lws <= 1.0", 62 | ], 63 | extras_require={ 64 | "train": [ 65 | "docopt", 66 | "tqdm", 67 | "tensorboardX", 68 | "nnmnkwii >= 0.0.9", 69 | "nltk", 70 | ], 71 | "test": [ 72 | "nose", 73 | ], 74 | "jp": [ 75 | "jaconv", 76 | "mecab-python3", 77 | ], 78 | }) 79 | -------------------------------------------------------------------------------- /train_encoder.py: -------------------------------------------------------------------------------- 1 | from docopt import docopt 2 | import sys 3 | from os.path import dirname, join 4 | from tqdm import tqdm, trange 5 | from datetime import datetime 6 | 7 | import pickle 8 | 9 | import torch 10 | from torch.autograd import Variable 11 | from torch.utils.data import Dataset, DataLoader 12 | from torch.utils import data as data_utils 13 | from torch import nn 14 | from torch import optim 15 | import torch.backends.cudnn as cudnn 16 | from torch.utils import data as data_utils 17 | from torch.utils.data.sampler import Sampler 18 | import numpy as np 19 | from numba import jit 20 | 21 | 22 | from utils import generate_cloned_samples, Speech_Dataset 23 | import dv3 24 | 25 | import sys 26 | import os 27 | 28 | # sys.path.append('./deepvoice3_pytorch') 29 | from dv3 import build_deepvoice_3 30 | from Encoder import Encoder 31 | 32 | # print(hparams) 33 | batch_size_encoder = 16 34 | 35 | 36 | global_step = 0 37 | global_epoch = 0 38 | use_cuda = torch.cuda.is_available() 39 | if use_cuda: 40 | cudnn.benchmark = False 41 | 42 | def get_cloned_voices(model,no_speakers = 108,no_cloned_texts = 23): 43 | try: 44 | with open("./Cloning_Audio/speakers_cloned_voices_mel.p" , "rb") as fp: 45 | cloned_voices = pickle.load(fp) 46 | except: 47 | cloned_voices = generate_cloned_samples(model) 48 | if(np.array(cloned_voices).shape != (no_speakers , no_cloned_texts)): 49 | cloned_voices = generate_cloned_samples(model,"./Cloning_Audio/cloning_text.txt" ,no_speakers,True,0) 50 | print("Cloned_voices Loaded!") 51 | return cloned_voices 52 | 53 | # Assumes that only Deep Voice 3 is given 54 | def get_speaker_embeddings(model): 55 | ''' 56 | return the speaker embeddings and its shape from deep voice 3 57 | ''' 58 | embed = model.embed_speakers.weight.data 59 | # shape = embed.shape 60 | return embed 61 | 62 | def build_encoder(): 63 | encoder = Encoder() 64 | return encoder 65 | 66 | 67 | def save_checkpoint(model, optimizer, 
checkpoint_path, epoch): 68 | 69 | optimizer_state = optimizer.state_dict() 70 | torch.save({ 71 | "state_dict": model.state_dict(), 72 | "optimizer": optimizer_state, 73 | "global_epoch": epoch, 74 | "epoch":epoch+1, 75 | 76 | }, checkpoint_path) 77 | print("Saved checkpoint:", checkpoint_path) 78 | 79 | def load_checkpoint(encoder, optimizer, path='checkpoints/encoder_checkpoint.pth'): 80 | 81 | checkpoint = torch.load(path) 82 | 83 | encoder.load_state_dict(checkpoint["state_dict"]) 84 | 85 | print('Encoder state restored') 86 | 87 | optimizer.load_state_dict(checkpoint["optimizer"]) 88 | 89 | print('Optimizer state restored') 90 | 91 | return encoder, optimizer 92 | 93 | def my_collate(batch): 94 | data = [item[0] for item in batch] 95 | samples = [text.shape[0] for text in data] 96 | max_size = data[0].shape[1] 97 | max_samples = np.amax(np.array(samples)) 98 | for i, i_element in enumerate(data): 99 | final = torch.zeros(int(max_samples), max_size, 80) 100 | final[:data[i].shape[0], :, :] += torch.from_numpy(i_element).type(torch.FloatTensor) 101 | data[i]=torch.unsqueeze(final, 0) 102 | data = torch.cat(data, 0) 103 | target = np.stack([item[1] for item in batch], 0) 104 | target = torch.from_numpy(target) 105 | return [data, target] 106 | 107 | def train_encoder(encoder, data, optimizer, scheduler, criterion, epochs=100000, after_epoch_download=1000): 108 | 109 | #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.6) 110 | 111 | for i in range(epochs): 112 | 113 | epoch_loss=0.0 114 | 115 | for i_element, element in enumerate(data): 116 | 117 | voice, embed = element[0], element[1] 118 | 119 | input_to_encoder = Variable(voice.type(torch.cuda.FloatTensor)) 120 | 121 | optimizer.zero_grad() 122 | 123 | output_from_encoder = encoder(input_to_encoder) 124 | 125 | embeddings = Variable(embed.type(torch.cuda.FloatTensor)) 126 | 127 | loss = criterion(output_from_encoder,embeddings) 128 | 129 | loss.backward() 130 | 131 | scheduler.step() 132 | optimizer.step() 133 | 134 | epoch_loss+=loss 135 | 136 | 137 | if i%100==99: 138 | save_checkpoint(encoder,optimizer,"encoder_checkpoint.pth",i) 139 | print(i, ' done') 140 | print('Loss for epoch ', i, ' is ', loss) 141 | 142 | def download_file(file_name=None): 143 | from google.colab import files 144 | files.download(file_name) 145 | 146 | 147 | batch_size=64 148 | 149 | if __name__ == "__main__": 150 | 151 | #Load Deep Voice 3 152 | # Pre Trained Model 153 | print("start") 154 | dv3_model = build_deepvoice_3(True) 155 | print("dv3 built") 156 | all_speakers = get_cloned_voices(dv3_model) 157 | print("Cloning Texts are produced") 158 | 159 | speaker_embed = get_speaker_embeddings(dv3_model) 160 | 161 | encoder = build_encoder() 162 | 163 | print("Encoder is built!") 164 | 165 | 166 | speech_data = Speech_Dataset(all_speakers, speaker_embed, sampler=True) 167 | 168 | criterion = nn.L1Loss() 169 | 170 | optimizer = torch.optim.SGD(encoder.parameters(),lr=0.0006) 171 | 172 | lambda1 = lambda epoch: 0.6 if epoch%8000==7999 else 1 173 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1) 174 | 175 | 176 | data_loader = DataLoader(speech_data, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn = my_collate) 177 | # Training The Encoder 178 | 179 | encoder = encoder.cuda() 180 | 181 | if os.path.isfile('checkpoints/encoder_checkpoint.pth'): 182 | encoder, optimizer = load_checkpoint(encoder, optimizer) 183 | 184 | try: 185 | train_encoder(encoder, data_loader, optimizer, scheduler, criterion, 
--------------------------------------------------------------------------------
/train_whole.py:
--------------------------------------------------------------------------------
"""Trains the combined voice-cloning model (Deep Voice 3 + speaker encoder).

usage: train_whole.py [options]

options:
    --data-root=<dir>            Directory containing preprocessed features.
    --checkpoint-dir=<dir>       Directory where to save checkpoints [default: checkpoints].
    --checkpoints-dv3=<path>     Restore Deep Voice 3 from this checkpoint.
    --checkpoint-encoder=<path>  Restore the encoder from this checkpoint.
    --train-dv3                  Train only the Deep Voice 3 model.
    --train-encoder              Train only the encoder model.
    -h, --help                   Show this help message and exit.
"""
from docopt import docopt
import sys
import os
from os.path import dirname, join, expanduser
from tqdm import tqdm, trange
from datetime import datetime

import pickle

import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils import data as data_utils
from torch import nn
from torch import optim
import torch.backends.cudnn as cudnn
from torch.utils.data.sampler import Sampler
import numpy as np
from numba import jit
from nnmnkwii.datasets import FileSourceDataset, FileDataSource

# requirements for dv3
from utils import generate_cloned_samples, Speech_Dataset
import dv3
from dv3 import build_deepvoice_3
from dv3.hparams import hparams, hparams_debug_string
from dv3.train import train as train_dv3
from dv3.train import TextDataSource, MelSpecDataSource, LinearSpecDataSource, \
    PyTorchDataset, PartialyRandomizedSimilarTimeLengthSampler
from dv3.train import collate_fn
from dv3.deepvoice3_pytorch import frontend
from dv3.train import sequence_mask, spec_loss, guided_attentions, eval_model
from dv3.train import save_checkpoint as save_checkpoint_dv3
from dv3.train import save_states as save_states_dv3
from tensorboardX import SummaryWriter

# requirements for encoder
from Encoder import Encoder
from train_encoder import get_cloned_voices, build_encoder, get_speaker_embeddings
from train_encoder import load_checkpoint as load_checkpoint_encoder
from train_encoder import save_checkpoint as save_checkpoint_encoder
from train_encoder import train_encoder

batch_size_encoder = 16

global_step = 0
global_epoch = 0
use_cuda = torch.cuda.is_available()
if use_cuda:
    cudnn.benchmark = False


def train(model_dv3, model_encoder,
          data_loader_dv3,
          optimizer_dv3,
          init_lr_dv3=0.002,
          checkpoint_dir_dv3=None,
          clip_thresh=1.0,
          data_loader_encoder=None,
          optimizer_encoder=None,
          scheduler_encoder=None,
          checkpoint_interval=None,
          nepochs=None,
          writer=None):
    # This training function trains the combined model: the encoder predicts the
    # speaker embedding table that Deep Voice 3 conditions on, and the Deep Voice 3
    # losses are backpropagated through that table into the encoder.

    grads = {}

    def save_grad(name):
        def hook(grad):
            grads[name] = grad
        return hook

    # remember the gradient that reaches the speaker embeddings
    model_dv3.embed_speakers.weight.register_hook(save_grad('embeddings'))

    if use_cuda:
        model_dv3 = model_dv3.cuda()
        model_encoder = model_encoder.cuda()
    linear_dim = model_dv3.linear_dim
    r = hparams.outputs_per_step
    downsample_step = hparams.downsample_step
    current_lr = init_lr_dv3

    binary_criterion_dv3 = nn.BCELoss()

    global global_step, global_epoch
    while global_epoch < nepochs:
        running_loss = 0.0
        for step, (x, input_lengths, mel, y, positions, done, target_lengths,
                   speaker_ids) in tqdm(enumerate(data_loader_dv3)):
            model_dv3.zero_grad()
            model_encoder.zero_grad()

            # Declaring Requirements
            model_dv3.train()
            ismultispeaker = speaker_ids is not None
            # Learning rate schedule
            if hparams.lr_schedule is not None:
                lr_schedule_f = getattr(dv3.lrschedule, hparams.lr_schedule)
                current_lr = lr_schedule_f(
                    init_lr_dv3, global_step, **hparams.lr_schedule_kwargs)
                for param_group in optimizer_dv3.param_groups:
                    param_group['lr'] = current_lr
            optimizer_dv3.zero_grad()

            # Used for Position encoding
            text_positions, frame_positions = positions

            # Downsample mel spectrogram
            if downsample_step > 1:
                mel = mel[:, 0::downsample_step, :].contiguous()

            # Lengths
            input_lengths = input_lengths.long().numpy()
            decoder_lengths = target_lengths.long().numpy() // r // downsample_step

            # One mel spectrogram per utterance, reshaped as a single cloning
            # sample for the encoder: (batch, 1, frames, mel channels)
            voice_encoder = mel.view(mel.shape[0], 1, mel.shape[1], mel.shape[2])

            # Feed data
            x, mel, y = Variable(x), Variable(mel), Variable(y)
            voice_encoder = Variable(voice_encoder)
            text_positions = Variable(text_positions)
            frame_positions = Variable(frame_positions)
            done = Variable(done)
            target_lengths = Variable(target_lengths)
            speaker_ids = Variable(speaker_ids) if ismultispeaker else None
            if use_cuda:
                x = x.cuda()
                text_positions = text_positions.cuda()
                frame_positions = frame_positions.cuda()
                y = y.cuda()
                mel = mel.cuda()
                voice_encoder = voice_encoder.cuda()
                done, target_lengths = done.cuda(), target_lengths.cuda()
                speaker_ids = speaker_ids.cuda() if ismultispeaker else None

            # Create mask if we use masked loss
            if hparams.masked_loss_weight > 0:
                # decoder output domain mask
                decoder_target_mask = sequence_mask(
                    target_lengths / (r * downsample_step),
                    max_len=mel.size(1)).unsqueeze(-1)
                if downsample_step > 1:
                    # spectrogram-domain mask
                    target_mask = sequence_mask(
                        target_lengths, max_len=y.size(1)).unsqueeze(-1)
                else:
                    target_mask = decoder_target_mask
                # shift mask
                decoder_target_mask = decoder_target_mask[:, r:, :]
                target_mask = target_mask[:, r:, :]
            else:
                decoder_target_mask, target_mask = None, None

            # Apply the encoder model and overwrite the speaker embedding table
            # with the embeddings it predicts for this batch
            encoder_out = model_encoder(voice_encoder)
            model_dv3.embed_speakers.weight.data = encoder_out.data

            # Apply dv3 model
            mel_outputs, linear_outputs, attn, done_hat = model_dv3(
                x, mel, speaker_ids=speaker_ids,
                text_positions=text_positions, frame_positions=frame_positions,
                input_lengths=input_lengths)

            # Losses
            w = hparams.binary_divergence_weight

            # mel:
            mel_l1_loss, mel_binary_div = spec_loss(
                mel_outputs[:, :-r, :], mel[:, r:, :], decoder_target_mask)
            mel_loss = (1 - w) * mel_l1_loss + w * mel_binary_div

            # done:
            done_loss = binary_criterion_dv3(done_hat, done)

            # linear:
            n_priority_freq = int(hparams.priority_freq / (hparams.sample_rate * 0.5) * linear_dim)
            linear_l1_loss, linear_binary_div = spec_loss(
                linear_outputs[:, :-r, :], y[:, r:, :], target_mask,
                priority_bin=n_priority_freq,
                priority_w=hparams.priority_freq_weight)
            linear_loss = (1 - w) * linear_l1_loss + w * linear_binary_div

            # Combine losses
            loss_dv3 = mel_loss + linear_loss + done_loss
            # attention
            if hparams.use_guided_attention:
                soft_mask = guided_attentions(input_lengths, decoder_lengths,
                                              attn.size(-2),
                                              g=hparams.guided_attention_sigma)
                soft_mask = Variable(torch.from_numpy(soft_mask))
                soft_mask = soft_mask.cuda() if use_cuda else soft_mask
                attn_loss = (attn * soft_mask).mean()
                loss_dv3 += attn_loss

            if global_step > 0 and global_step % checkpoint_interval == 0:
                save_states_dv3(
                    global_step, writer, mel_outputs, linear_outputs, attn,
                    mel, y, input_lengths, checkpoint_dir_dv3)
                save_checkpoint_dv3(
                    model_dv3, optimizer_dv3, global_step, checkpoint_dir_dv3,
                    global_epoch, True, True)

            if global_step > 0 and global_step % hparams.eval_interval == 0:
                eval_model(global_step, writer, model_dv3, checkpoint_dir_dv3, ismultispeaker)

            # Update: backprop the dv3 loss, then push the gradient that reached
            # the speaker embedding table back through the encoder.
            loss_dv3.backward()
            encoder_out.backward(grads['embeddings'])

            optimizer_dv3.step()
            optimizer_encoder.step()

            # if clip_thresh > 0:
            #     grad_norm = torch.nn.utils.clip_grad_norm(
            #         model_dv3.get_trainable_parameters(), clip_thresh)
            global_step += 1
            running_loss += loss_dv3.data[0]

        averaged_loss = running_loss / (len(data_loader_dv3))

        print("Loss: {}".format(averaged_loss))

        global_epoch += 1


if __name__ == "__main__":

    args = docopt(__doc__)
    print("Command line args:\n", args)

    checkpoint_dir = args["--checkpoint-dir"]
    checkpoint_dv3 = args["--checkpoints-dv3"]
    checkpoint_encoder = args["--checkpoint-encoder"]
    speaker_id = None
    dv3_preset = None

    data_root = args["--data-root"]
    if data_root is None:
        data_root = join(dirname(__file__), "data", "ljspeech")

    train_dv3_v = args["--train-dv3"]
    train_encoder_v = args["--train-encoder"]

    if not train_dv3_v and not train_encoder_v:
        print("Training whole model")
        train_dv3_v, train_encoder_v = True, True
    elif train_dv3_v:
        print("Training deep voice 3 model")
    elif train_encoder_v:
        print("Training encoder model")

    os.makedirs(checkpoint_dir, exist_ok=True)

    # Input dataset definitions
    X = FileSourceDataset(TextDataSource(data_root, speaker_id))
    Mel = FileSourceDataset(MelSpecDataSource(data_root, speaker_id))
    Y = FileSourceDataset(LinearSpecDataSource(data_root, speaker_id))

    # Prepare sampler
    frame_lengths = Mel.file_data_source.frame_lengths
    sampler = PartialyRandomizedSimilarTimeLengthSampler(
        frame_lengths, batch_size=hparams.batch_size)

    # Dataset and Dataloader setup
    dataset = PyTorchDataset(X, Mel, Y)
    data_loader_dv3 = data_utils.DataLoader(
        dataset, batch_size=hparams.batch_size,
        num_workers=hparams.num_workers, sampler=sampler,
        collate_fn=collate_fn, pin_memory=hparams.pin_memory)
    print("dataloader for dv3 prepared")

    dv3.train._frontend = getattr(frontend, hparams.frontend)
    dv3_model = build_deepvoice_3(dv3_preset, checkpoint_dv3)
    print("Built dv3!")

    if use_cuda:
        dv3_model = dv3_model.cuda()

    dv3_optimizer = optim.Adam(dv3_model.get_trainable_parameters(),
                               lr=hparams.initial_learning_rate,
                               betas=(hparams.adam_beta1, hparams.adam_beta2),
                               eps=hparams.adam_eps,
                               weight_decay=hparams.weight_decay)
    log_event_path = "log/run-test" + str(datetime.now()).replace(" ", "_")
    print("Log event path for dv3: {}".format(log_event_path))
    writer_dv3 = SummaryWriter(log_dir=log_event_path)

    # ENCODER
    all_speakers = get_cloned_voices(dv3_model)
    print("Cloning Texts are produced")

    speaker_embed = get_speaker_embeddings(dv3_model)

    encoder = build_encoder()
    print("Encoder is built!")

    speech_data_encoder = Speech_Dataset(all_speakers, speaker_embed, sampler=False)

    criterion_encoder = nn.L1Loss()

    optimizer_encoder = torch.optim.SGD(encoder.parameters(), lr=0.0006)

    # Decay the encoder learning rate by a factor of 0.6 every 8000 epochs.
    lambda1_encoder = lambda epoch: 0.6 ** (epoch // 8000)
    scheduler_encoder = torch.optim.lr_scheduler.LambdaLR(optimizer_encoder, lr_lambda=lambda1_encoder)

    data_loader_encoder = data_utils.DataLoader(speech_data_encoder, batch_size=batch_size_encoder,
                                                shuffle=True, drop_last=True)
    # Training The Encoder
    dataiter_encoder = iter(data_loader_encoder)

    if use_cuda:
        encoder = encoder.cuda()

    if checkpoint_encoder is not None and os.path.isfile(checkpoint_encoder):
        encoder, optimizer_encoder = load_checkpoint_encoder(encoder, optimizer_encoder, checkpoint_encoder)

    if train_encoder_v and train_dv3_v:
        try:
            train(dv3_model, encoder,
                  data_loader_dv3,
                  dv3_optimizer,
                  init_lr_dv3=hparams.initial_learning_rate,
                  checkpoint_dir_dv3=checkpoint_dir,
                  clip_thresh=hparams.clip_thresh,
                  data_loader_encoder=data_loader_encoder,
                  optimizer_encoder=optimizer_encoder,
                  scheduler_encoder=scheduler_encoder,
                  checkpoint_interval=hparams.checkpoint_interval,
                  nepochs=hparams.nepochs,
                  writer=writer_dv3)
        except KeyboardInterrupt:
            print("KeyboardInterrupt")
    elif train_encoder_v:
        try:
            train_encoder(encoder, data_loader_encoder, optimizer_encoder,
                          scheduler_encoder, criterion_encoder, epochs=100000)
        except KeyboardInterrupt:
            print("KeyboardInterrupt")
    elif train_dv3_v:
        try:
            train_dv3(dv3_model, data_loader_dv3, dv3_optimizer, writer_dv3,
                      init_lr=hparams.initial_learning_rate,
                      checkpoint_dir=checkpoint_dir,
                      checkpoint_interval=hparams.checkpoint_interval,
                      nepochs=hparams.nepochs,
                      clip_thresh=hparams.clip_thresh,
                      train_seq2seq=True, train_postnet=True)
        except KeyboardInterrupt:
            print("KeyboardInterrupt")
    else:
        assert False, "Wrong arguments specified"

    print("Finished")
    sys.exit(0)
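The coupling between the encoder and Deep Voice 3 in train() above relies on a gradient hook: the speaker-embedding table is overwritten with the encoder's output, the Deep Voice 3 loss is backpropagated to capture the gradient that lands on that table, and that captured gradient is then pushed through the encoder with a second backward call. A standalone toy sketch of that pattern (made-up dimensions, current tensor API instead of Variable):

# Toy sketch of the gradient-coupling trick used in train(), not the repo's model.
import torch
import torch.nn as nn

encoder = nn.Linear(10, 4)          # stands in for the speaker encoder
embed_table = nn.Embedding(3, 4)    # stands in for dv3's embed_speakers

grads = {}
embed_table.weight.register_hook(lambda g: grads.update(embeddings=g))

x = torch.randn(3, 10)
encoder_out = encoder(x)                    # (3, 4) predicted embeddings
embed_table.weight.data = encoder_out.data  # overwrite the table (no graph link)

ids = torch.tensor([0, 1, 2])
loss = embed_table(ids).pow(2).mean()       # downstream "synthesis" loss
loss.backward()                             # hook stores d(loss)/d(table)

encoder_out.backward(grads['embeddings'])   # route that gradient into the encoder
print(encoder.weight.grad.shape)            # torch.Size([4, 10])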
origin="lower", interpolation=None) 35 | xlabel("Decoder timestamp", fontsize=label_fontsize) 36 | ylabel("Encoder timestamp", fontsize=label_fontsize) 37 | colorbar() 38 | 39 | subplot(2,1,2) 40 | librosa.display.specshow(spectrogram.T, sr=fs, 41 | hop_length=hop_length, x_axis="time", y_axis="linear") 42 | xlabel("Time", fontsize=label_fontsize) 43 | ylabel("Hz", fontsize=label_fontsize) 44 | tight_layout() 45 | colorbar() 46 | 47 | 48 | def generate_cloned_samples(model,cloning_text_path = None, no_speakers = 108 , fast = True, p =0 ): 49 | 50 | #cloning_texts = ["this is the first" , "this is the second"] 51 | if(cloning_text_path == None): 52 | cloning_text_path = "./Cloning_Audio/cloning_text.txt" 53 | 54 | cloning_texts = open("./Cloning_Audio/cloning_text.txt").read().splitlines() 55 | # no_cloning_texts = len(cloning_texts) 56 | 57 | all_speakers = [] 58 | 59 | for speaker_id in range(no_speakers): 60 | speaker_cloning_mel = [] 61 | print("The Speaker being cloned speaker-{}".format(speaker_id)) 62 | for text in cloning_texts: 63 | waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id, fast) 64 | speaker_cloning_mel.append([speaker_id, mel]) 65 | #print(np.array(speaker_cloning_mel).shape) 66 | all_speakers.append(speaker_cloning_mel) 67 | with open("./Cloning_Audio/speakers_cloned_voices_mel.p", "wb") as fp: #Pickling 68 | pickle.dump(all_speakers, fp) 69 | # print("") 70 | 71 | print("Shape of all speakers:",np.array(all_speakers).shape) 72 | # print(all_speakers.shape) 73 | 74 | 75 | # all speakers[speaker_id][cloned_audio_number] 76 | # print(all_speakers[0][1].shape) 77 | return all_speakers 78 | 79 | class Speech_Dataset(Dataset): 80 | def __init__(self, mfccs, embeddings, sampler): 81 | '''Mfccs have to be list of lists of numpy arrays. Each of these numpy arrays will be a mel spectrogram''' 82 | self.voices = mfccs 83 | temp = [spec.shape[0] for text in self.voices for spec in text] 84 | largest_size = np.amax(np.array(temp)) 85 | self._pad(largest_size) 86 | self.embeddings = embeddings 87 | if sampler==True: 88 | self.sampler = True 89 | 90 | def _pad(self, maximum_size): 91 | '''Input: 92 | Specs: Mel Spectrograms with 80 channels but the length of each channel is not the same. 93 | maximum_size: Largest channel length. Others are padded to this length 94 | 95 | Padding with 0 won't affect the convolutions because anyway the neurons corresponding to the states have to 96 | be dead if they are not padded. Putting 0 will also make those neurons dead. And later an average is taken along 97 | this dimension too. 98 | 99 | Returns: A padded array of arrays of spectrograms.''' 100 | 101 | for i, i_element in enumerate(self.voices): 102 | for j, j_element in enumerate(i_element): 103 | final = np.zeros((maximum_size, 80)) 104 | final[:self.voices[i][j].shape[0], :] += j_element 105 | self.voices[i][j]=final 106 | self.voices = np.array(self.voices) 107 | print(self.voices.shape) 108 | 109 | def __len__(self): 110 | '''Returns total number of speakers''' 111 | return len(self.voices) 112 | 113 | def __getitem__(self, idx): 114 | if self.sampler==False: 115 | return (self.voices[idx], self.embeddings[idx]) 116 | elif self.sampler==True: 117 | sample = np.random.random_integers(1, 22, size=int(np.random.randint(1, 10, size=1))) 118 | return (self.voices[idx, sample, :, :], self.embeddings[idx]) 119 | --------------------------------------------------------------------------------