├── .gitattributes ├── LICENSE ├── README.md ├── datasets ├── __init__.py ├── audio.py ├── preprocessor.py └── wavenet_preprocessor.py ├── dictionary.txt ├── docker └── Dockerfile ├── frozen_tacotron2.py ├── gen_serving_model_tacotron2.py ├── gitignore ├── griffin_lim_synthesis_tool.ipynb ├── hparams.py ├── inference_melgan.py ├── inference_tacotron2_melgan.py ├── infolog.py ├── melgan_vocoder ├── .gitignore ├── LICENSE ├── README.md ├── assets │ ├── gd.png │ ├── lj-tensorboard-v0.3-alpha.png │ └── lj-tensorboard.png ├── config │ └── default.yaml ├── data │ └── test │ │ └── mel │ │ └── mel_out.npy ├── datasets │ ├── __init__.py │ └── dataloader.py ├── hubconf.py ├── melgan2onnx.py ├── model │ ├── __init__.py │ ├── discriminator.py │ ├── generator.py │ ├── identity.py │ ├── multiscale.py │ └── res_stack.py ├── onnx2trt.py ├── preprocess.py ├── requirements.txt ├── train_melgan.py └── utils │ ├── __init__.py │ ├── audio_processing.py │ ├── hparams.py │ ├── plotting.py │ ├── stft.py │ ├── train.py │ ├── utils.py │ ├── validation.py │ └── writer.py ├── paper_hparams.py ├── preprocess.py ├── requirements.txt ├── sentences_phone.txt ├── symbols.py ├── synthesize.py ├── tacotron ├── __init__.py ├── feeder.py ├── models │ ├── Architecture_wrappers.py │ ├── __init__.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ └── tacotron.py ├── synthesize.py ├── synthesizer.py ├── train.py └── utils │ ├── __init__.py │ ├── cleaners.py │ ├── cmudict.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ └── text.py ├── tacotron2_client.py └── train_tacotron.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py linguist-language=python 2 | *.ipynb linguist-language=python 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rayhane Mama 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tacotron-2(tensorflow) + melgan(pytorch) chinese TTS: 2 | 3 | 4 | [melgan](https://github.com/seungwonpark/melgan) is very faster than other vocoders and the quality is not so bad. 
This project re-implements the [split_func](./tacotron/models/tacotron.py) in Tacotron-2, which TensorFlow Serving does not support in its original form, and re-implements [nn.ReflectionPad1d](./melgan_vocoder/model/res_stack.py), which TensorRT does not support. It also changes
5 | MelGAN's input range from [-12, 2] to [-4, 4] so that it matches Tacotron-2's output.
6 | 
7 | Python 3.7, BiaoBei Chinese dataset. Tacotron-2 supports training on Chinese pinyin or Chinese phone + rhythm labels (the default is phone + rhythm); edit [symbols.py](./tacotron/utils/symbols.py) and [text.py](./tacotron/utils/text.py) accordingly:
8 | 
9 | pinyin:
10 | 000001,ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
11 | 000002,jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
12 | 000003,bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
13 | 
14 | phone + rhythm (dictionary.txt):
15 | 000001,k a2 er2 p u3 #2 p ei2 uai4 s uen1 #1 uan2 h ua2 t i1 #4 。
16 | 000002,j ia2 v3 c uen1 ian2 #2 b ie2 z ai4 #1 iong1 b ao4 uo3 #4 。
17 | 000003,b ao2 m a3 #1 p ei4 g ua4 #1 b o3 l uo2 an1 #3 , d iao1 ch an2 #1 van4 zh en3 #2 d ong3 ueng1 t a4 #4 。
18 | 
19 | # tacotron2
20 | - Step **(0)**: set tf_pyfunc = False in [tacotron.py](./tacotron/models/tacotron.py), then run python frozen_tacotron2.py to freeze the model.
21 | - Step **(1)**: run python gen_serving_model_tacotron2.py to convert the frozen pb file to a SavedModel.
22 | 
23 | # melgan
24 | - Step **(0)**: run python melgan2onnx.py, which uses MyRefPad1d() instead of nn.ReflectionPad1d() (a minimal sketch of such a layer is shown below, after the training steps), to convert the pt checkpoint to ONNX.
25 | - Step **(1)**: run python onnx2trt.py to convert the ONNX model to TensorRT; the TensorRT engine supports dynamic input shapes.
26 | 
27 | # Training and Inference:
28 | 
29 | gta:
30 | - Step **(0)**: run python preprocess.py to process the audio for Tacotron-2 and MelGAN training.
31 | - Step **(1)**: run python train_tacotron.py; when Tacotron-2 training finishes, it generates the GTA data.
32 | - Step **(2)**: cd melgan, then copy audio-xxx.npy and mel-xxx.npy (GTA data) into MelGAN's training/validation data path.
33 | - Step **(3)**: run python train_melgan.py.
34 | 
35 | real mel:
36 | - Step **(0)**: run python preprocess.py to process the audio for Tacotron-2 and MelGAN training.
37 | - Step **(1)**: copy audio-xxx.npy and mel-xxx.npy (real mel) into MelGAN's training/validation data path.
38 | - Step **(2)**: run python train_tacotron.py.
39 | - Step **(3)**: run python train_melgan.py to train MelGAN on the real mel data.
40 | 
41 | Also, run inference_melgan.py if you are only interested in the vocoder.
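The repository's actual MyRefPad1d (used in [melgan2onnx.py](./melgan_vocoder/melgan2onnx.py) and [res_stack.py](./melgan_vocoder/model/res_stack.py)) is not reproduced here; the snippet below is only a minimal sketch of how reflection padding can be rebuilt from slice/flip/concat ops that ONNX export and TensorRT both handle, assuming an input of shape (batch, channels, time).

```python
import torch
import torch.nn as nn


class MyRefPad1d(nn.Module):
    """Reflection padding built from slice/flip/cat so the exported graph
    avoids the ReflectionPad1d op that TensorRT rejects.
    NOTE: illustrative sketch only; the class in model/res_stack.py may differ.
    """

    def __init__(self, pad):
        super().__init__()
        self.pad = pad

    def forward(self, x):
        # x: (batch, channels, time). Mirror the samples next to each edge,
        # excluding the edge sample itself, like nn.ReflectionPad1d does.
        left = torch.flip(x[:, :, 1:self.pad + 1], dims=[2])
        right = torch.flip(x[:, :, -self.pad - 1:-1], dims=[2])
        return torch.cat([left, x, right], dim=2)
```

Comparing such a layer against nn.ReflectionPad1d(pad) on a few random tensors before exporting is a cheap way to confirm the replacement is equivalent.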
42 | 
43 | 
44 | # reference:
45 | https://github.com/Rayhane-mamah/Tacotron-2
46 | https://github.com/seungwonpark/melgan
47 | 
48 | 
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/datasets/preprocessor.py:
--------------------------------------------------------------------------------
1 | import os
2 | from concurrent.futures import ProcessPoolExecutor
3 | from functools import partial
4 | 
5 | import numpy as np
6 | from datasets import audio
7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize
8 | 
9 | 
10 | def build_from_path_mydata(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
11 | 
12 | 	executor = ProcessPoolExecutor(max_workers=n_jobs)
13 | 	futures = []
14 | 	index = 1
15 | 	data_set = 'biaobei'
16 | 	# biaobei
17 | 	if (data_set == 'biaobei'):
18 | 		for input_dir in input_dirs:
19 | 			with open(os.path.join(input_dir, 'ProsodyLabeling', 'biaobei_transcript_phone.txt'), encoding='utf-8') as f:
20 | 				for line in f:
21 | 					parts = line.strip().split(',')
22 | 					basename = parts[0]
23 | 					wav_path = os.path.join(input_dir, 'Wave', '001', '{}.wav'.format(basename))
24 | 					text = parts[-1]
25 | 					futures.append(executor.submit(
26 | 						partial(_process_utterance, mel_dir, linear_dir, wav_dir, basename, wav_path, text, hparams)))
27 | 					index += 1
28 | 
29 | 	return [future.result() for future in tqdm(futures) if future.result() is not None]
30 | 
31 | 
32 | 
33 | 
34 | def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
35 | 	"""
36 | 	Preprocesses the speech dataset from a given input path to given output directories
37 | 
38 | 	Args:
39 | 		- hparams: hyper parameters
40 | 		- input_dir: input directory that contains the files to preprocess
41 | 		- mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
42 | 		- linear_dir: output directory of the preprocessed speech linear-spectrogram dataset
43 | 		- wav_dir: output directory of the preprocessed speech audio dataset
44 | 		- n_jobs: Optional, number of worker processes to parallelize across
45 | 		- tqdm: Optional, provides a nice progress bar
46 | 
47 | 	Returns:
48 | 		- A list of tuples describing the train examples;
this should be written to train.txt 49 | """ 50 | 51 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 52 | # optimization purposes and it can be omited 53 | executor = ProcessPoolExecutor(max_workers=n_jobs) 54 | futures = [] 55 | index = 1 56 | for input_dir in input_dirs: 57 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 58 | for line in f: 59 | parts = line.strip().split('|') 60 | basename = parts[0] 61 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(basename)) 62 | text = parts[2] 63 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, basename, wav_path, text, hparams))) 64 | index += 1 65 | 66 | return [future.result() for future in tqdm(futures) if future.result() is not None] 67 | 68 | 69 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): 70 | """ 71 | Preprocesses a single utterance wav/text pair 72 | 73 | this writes the mel scale spectogram to disk and return a tuple to write 74 | to the train.txt file 75 | 76 | Args: 77 | - mel_dir: the directory to write the mel spectograms into 78 | - linear_dir: the directory to write the linear spectrograms into 79 | - wav_dir: the directory to write the preprocessed wav into 80 | - index: the numeric index to use in the spectogram filename 81 | - wav_path: path to the audio file containing the speech input 82 | - text: text spoken in the input audio file 83 | - hparams: hyper parameters 84 | 85 | Returns: 86 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 87 | """ 88 | try: 89 | # Load the audio as numpy array 90 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 91 | except FileNotFoundError: #catch missing wav exception 92 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 93 | wav_path)) 94 | return None 95 | 96 | #Trim lead/trail silences 97 | if hparams.trim_silence: 98 | wav = audio.trim_silence(wav, hparams) 99 | 100 | #Pre-emphasize 101 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 102 | 103 | #rescale wav 104 | if hparams.rescale: 105 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 106 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max 107 | 108 | #Assert all audio is in [-1, 1] 109 | if (wav > 1.).any() or (wav < -1.).any(): 110 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 111 | if (preem_wav > 1.).any() or (preem_wav < -1.).any(): 112 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 113 | 114 | #Mu-law quantize 115 | if is_mulaw_quantize(hparams.input_type): 116 | #[0, quantize_channels) 117 | out = mulaw_quantize(wav, hparams.quantize_channels) 118 | 119 | #Trim silences 120 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 121 | wav = wav[start: end] 122 | preem_wav = preem_wav[start: end] 123 | out = out[start: end] 124 | 125 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 126 | out_dtype = np.int16 127 | 128 | elif is_mulaw(hparams.input_type): 129 | #[-1, 1] 130 | out = mulaw(wav, hparams.quantize_channels) 131 | constant_values = mulaw(0., hparams.quantize_channels) 132 | out_dtype = np.float32 133 | 134 | else: 135 | #[-1, 1] 136 | out = wav 137 | constant_values = 0. 
138 | out_dtype = np.float32 139 | 140 | # Compute the mel scale spectrogram from the wav 141 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) 142 | mel_frames = mel_spectrogram.shape[1] 143 | 144 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 145 | return None 146 | 147 | #Compute the linear scale spectrogram from the wav 148 | linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32) 149 | linear_frames = linear_spectrogram.shape[1] 150 | 151 | #sanity check 152 | assert linear_frames == mel_frames 153 | 154 | if hparams.use_lws: 155 | #Ensure time resolution adjustement between audio and mel-spectrogram 156 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 157 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 158 | 159 | #Zero pad audio signal 160 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 161 | else: 162 | #Ensure time resolution adjustement between audio and mel-spectrogram 163 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides) 164 | 165 | #Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency) 166 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) 167 | 168 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 169 | 170 | #time resolution adjustement 171 | #ensure length of raw audio is multiple of hop size so that we can use 172 | #transposed convolution to upsample 173 | out = out[:mel_frames * audio.get_hop_size(hparams)] 174 | assert len(out) % audio.get_hop_size(hparams) == 0 175 | time_steps = len(out) 176 | 177 | # Write the spectrogram and audio to disk 178 | audio_filename = 'audio-{}.npy'.format(index) 179 | mel_filename = 'mel-{}.npy'.format(index) 180 | linear_filename = 'linear-{}.npy'.format(index) 181 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 182 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 183 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 184 | 185 | # Return a tuple describing this training example 186 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) 187 | -------------------------------------------------------------------------------- /datasets/wavenet_preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | 5 | import numpy as np 6 | from datasets import audio 7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize 8 | 9 | 10 | def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - hparams: hyper parameters 16 | - input_dir: input directory that contains the files to prerocess 17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 19 | - wav_dir: output directory of the preprocessed speech audio dataset 20 | - n_jobs: Optional, number of worker process to parallelize across 21 | - tqdm: Optional, provides a nice progress bar 22 | 23 | Returns: 
24 | - A list of tuple describing the train examples. this should be written to train.txt 25 | """ 26 | 27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 28 | # optimization purposes and it can be omited 29 | executor = ProcessPoolExecutor(max_workers=n_jobs) 30 | futures = [] 31 | 32 | #for file in os.listdir(input_dir): 33 | #wav_path = os.path.join(input_dir, file) 34 | #basename = os.path.basename(wav_path).replace('.wav', '') 35 | #futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams))) 36 | for tmp1 in os.listdir(input_dir): 37 | tmp_path = os.path.join(input_dir, tmp1) 38 | for file in os.listdir(tmp_path): 39 | if(file.split('.')[-1] == 'wav'): 40 | wav_path = os.path.join(tmp_path, file) 41 | basename = os.path.basename(wav_path).replace('.wav', '') 42 | futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams))) 43 | 44 | return [future.result() for future in tqdm(futures) if future.result() is not None] 45 | 46 | 47 | def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams): 48 | """ 49 | Preprocesses a single utterance wav/text pair 50 | 51 | this writes the mel scale spectogram to disk and return a tuple to write 52 | to the train.txt file 53 | 54 | Args: 55 | - mel_dir: the directory to write the mel spectograms into 56 | - linear_dir: the directory to write the linear spectrograms into 57 | - wav_dir: the directory to write the preprocessed wav into 58 | - index: the numeric index to use in the spectrogram filename 59 | - wav_path: path to the audio file containing the speech input 60 | - text: text spoken in the input audio file 61 | - hparams: hyper parameters 62 | 63 | Returns: 64 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 65 | """ 66 | try: 67 | # Load the audio as numpy array 68 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 69 | except FileNotFoundError: #catch missing wav exception 70 | print('file {} present in csv metadata is not present in wav folder. 
skipping!'.format( 71 | wav_path)) 72 | return None 73 | 74 | #M-AILABS extra silence specific 75 | if hparams.trim_silence: 76 | wav = audio.trim_silence(wav, hparams) 77 | 78 | #Pre-emphasize 79 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 80 | 81 | #rescale wav 82 | if hparams.rescale: 83 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 84 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max 85 | 86 | #Assert all audio is in [-1, 1] 87 | if (wav > 1.).any() or (wav < -1.).any(): 88 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 89 | if (preem_wav > 1.).any() or (preem_wav < -1.).any(): 90 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 91 | 92 | #Mu-law quantize 93 | if is_mulaw_quantize(hparams.input_type): 94 | #[0, quantize_channels) 95 | out = mulaw_quantize(wav, hparams.quantize_channels) 96 | 97 | #Trim silences 98 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 99 | wav = wav[start: end] 100 | preem_wav = preem_wav[start: end] 101 | out = out[start: end] 102 | 103 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 104 | out_dtype = np.int16 105 | 106 | elif is_mulaw(hparams.input_type): 107 | #[-1, 1] 108 | out = mulaw(wav, hparams.quantize_channels) 109 | constant_values = mulaw(0., hparams.quantize_channels) 110 | out_dtype = np.float32 111 | 112 | else: 113 | #[-1, 1] 114 | out = wav 115 | constant_values = 0. 116 | out_dtype = np.float32 117 | 118 | # Compute the mel scale spectrogram from the wav 119 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) 120 | mel_frames = mel_spectrogram.shape[1] 121 | 122 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 123 | return None 124 | 125 | if hparams.use_lws: 126 | #Ensure time resolution adjustement between audio and mel-spectrogram 127 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 128 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 129 | 130 | #Zero pad audio signal 131 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 132 | else: 133 | #Ensure time resolution adjustement between audio and mel-spectrogram 134 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) 135 | 136 | #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) 137 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) 138 | 139 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 140 | 141 | #time resolution adjustement 142 | #ensure length of raw audio is multiple of hop size so that we can use 143 | #transposed convolution to upsample 144 | out = out[:mel_frames * audio.get_hop_size(hparams)] 145 | assert len(out) % audio.get_hop_size(hparams) == 0 146 | time_steps = len(out) 147 | 148 | # Write the spectrogram and audio to disk 149 | audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index)) 150 | mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index)) 151 | np.save(audio_filename, out.astype(out_dtype), allow_pickle=False) 152 | np.save(mel_filename, mel_spectrogram.T, allow_pickle=False) 153 | 154 | #global condition features 155 | if hparams.gin_channels > 0: 156 | raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training') 157 | speaker_id = '' #put the rule to 
determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable) 158 | else: 159 | speaker_id = '' 160 | 161 | # Return a tuple describing this training example 162 | return (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames) 163 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda3:latest 2 | FROM tensorflow/tensorflow:latest-gpu-py3 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools wget git vim 6 | 7 | RUN wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 8 | RUN tar -jxvf LJSpeech-1.1.tar.bz2 9 | 10 | RUN git clone https://github.com/Rayhane-mamah/Tacotron-2.git 11 | 12 | WORKDIR Tacotron-2 13 | RUN ln -s ../LJSpeech-1.1 . 14 | RUN pip install -r requirements.txt -------------------------------------------------------------------------------- /frozen_tacotron2.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | 3 | import os,time 4 | import numpy as np 5 | import tensorflow as tf 6 | from tacotron.utils.text import text_to_sequence 7 | from hparams import hparams 8 | from datasets import audio 9 | from tacotron.models import create_model 10 | import re 11 | import symbols 12 | syms = symbols.symbols 13 | 14 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 16 | 17 | # set tf_pyfunc = False when freeze model in tacotron/models/tacotron.py 18 | 19 | class Synthesizer: 20 | def load(self, hparams, src_model_path=None,des_model_path=None): 21 | 22 | gta = False 23 | model_name = 'Tacotron' 24 | 25 | #Force the batch size to be known in order to use attention masking in batch synthesis 26 | inputs = tf.placeholder(tf.int32, (None, None), name='inputs') 27 | input_lengths = tf.placeholder(tf.int32, (None), name='input_lengths') 28 | targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets') 29 | split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos') 30 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 31 | self.model = create_model(model_name, hparams) 32 | if gta: 33 | self.model.initialize(inputs, input_lengths, targets, gta=gta, split_infos=split_infos) 34 | else: 35 | self.model.initialize(inputs, input_lengths, split_infos=split_infos) 36 | 37 | self.mel_outputs = self.model.tower_mel_outputs 38 | self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None 39 | self.alignments = self.model.tower_alignments 40 | self.stop_token_prediction = self.model.tower_stop_token_prediction 41 | self.targets = targets 42 | 43 | hparams.GL_on_GPU = False 44 | if hparams.GL_on_GPU: 45 | self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') 46 | self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') 47 | 48 | self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, hparams) 49 | self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(self.GLGPU_lin_inputs, hparams) 50 | 51 | self.gta = gta 52 | self._hparams = hparams 53 | #pad input sequences with the 0 ( _ ) 54 | self._pad = 0 55 | #explicitely setting the 
padding to a value that doesn't originally exist in the spectogram 56 | #to avoid any possible conflicts, without affecting the output range of the model too much 57 | if hparams.symmetric_mels: 58 | self._target_pad = -hparams.max_abs_value 59 | else: 60 | self._target_pad = 0. 61 | 62 | self.inputs = inputs 63 | self.input_lengths = input_lengths 64 | self.targets = targets 65 | self.split_infos = split_infos 66 | 67 | config = tf.ConfigProto() 68 | config.gpu_options.allow_growth = True 69 | config.allow_soft_placement = True 70 | 71 | session = tf.Session(config=config) 72 | session.run(tf.global_variables_initializer()) 73 | 74 | saver = tf.train.Saver() 75 | saver.restore(session, src_model_path) 76 | # re-save model 77 | saver.save(session, des_model_path) 78 | 79 | 80 | def ckpt2pb(ckpt_model,pb_model): 81 | saver = tf.train.import_meta_graph(ckpt_model + '.meta', clear_devices=True) 82 | with tf.Session() as sess: 83 | tf.global_variables_initializer().run() 84 | saver.restore(sess, ckpt_model) 85 | 86 | # # 打印节点信息 87 | # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node] 88 | # for tensor_name in tensor_name_list: 89 | # print(tensor_name) 90 | 91 | # mel_outputs linear_outputs alignments stop_token_prediction 92 | output_graph_def = tf.graph_util.convert_variables_to_constants( 93 | sess, 94 | tf.get_default_graph().as_graph_def(), 95 | ["Tacotron_model/inference/Minimum_1", 96 | # "Tacotron_model/inference/Minimum_2", 97 | # "Tacotron_model/inference/transpose", 98 | "Tacotron_model/inference/Reshape_2", 99 | ]) 100 | 101 | with tf.gfile.GFile(pb_model, "wb") as f: 102 | f.write(output_graph_def.SerializeToString()) 103 | 104 | return 105 | 106 | 107 | def prepare_inputs(inputs): 108 | max_len = max([len(x) for x in inputs]) 109 | return np.stack([pad_input(x, max_len) for x in inputs]), max_len 110 | 111 | def pad_input(x, length): 112 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=0) 113 | 114 | 115 | 116 | def inference_pb(model_pb): 117 | texts = 'k a2 er2 p u3 #2 p ei2 uai4 s uen1 #1 uan2 h ua2 t i1 #4 。' 118 | # texts = 'b ao2 m a3 #1 p ei4 g ua4 #1 b o3 l uo2 an1 #3 , d iao1 ch an2 #1 van4 zh en3 #2 d ong3 ueng1 t a4 #4 。' 119 | s = [] 120 | texts_split = re.split("( )", texts) 121 | for i in texts_split: 122 | if (i in syms): 123 | index = syms.index(i) 124 | s.append(index) 125 | seqs = np.asarray(s) 126 | 127 | seqs_lengths = len(seqs) 128 | input_lengths_np = np.asarray(seqs_lengths, dtype=np.int32).reshape(1) 129 | 130 | input_seqs = seqs[np.newaxis].astype(np.int32) 131 | max_seq_len = seqs_lengths 132 | split_infos_np = np.asarray([max_seq_len, 0, 0, 0], dtype=np.int32)[np.newaxis] 133 | print('input_seqs:', input_seqs.shape) 134 | print('input_lengths_np:', input_lengths_np.shape) 135 | print('split_infos_np:', split_infos_np.shape) 136 | 137 | ############################# 138 | # texts = ['k a2 er2 p u3 #2 p ei2 uai4 s uen1 #1 uan2 h ua2 t i1 #4 。'] 139 | # t2_hparams = hparams.parse('') 140 | # cleaner_names = [x.strip() for x in t2_hparams.cleaners.split(',')] 141 | # seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 142 | # input_lengths_np = [len(seq) for seq in seqs] 143 | # input_lengths_np = np.asarray(input_lengths_np, dtype=np.int32) 144 | # 145 | # size_per_device = len(seqs) // t2_hparams.tacotron_num_gpus 146 | # 147 | # # Pad inputs according to each GPU max length 148 | # input_seqs = None 149 | # split_infos_np = [] 150 | # for i in 
range(t2_hparams.tacotron_num_gpus): 151 | # device_input = seqs[size_per_device * i: size_per_device * (i + 1)] 152 | # device_input, max_seq_len = prepare_inputs(device_input) 153 | # input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input 154 | # input_seqs = input_seqs.astype(np.int32) 155 | # split_infos_np.append([max_seq_len, 0, 0, 0]) 156 | # split_infos_np = np.asarray(split_infos_np, dtype=np.int32) 157 | # print('input_seqs:', input_seqs.shape) 158 | # print('input_lengths_np:', input_lengths_np.shape) 159 | # print('split_infos_np:', split_infos_np.shape) 160 | 161 | ################# 162 | with tf.gfile.FastGFile(model_pb, 'rb') as f: 163 | graph_def = tf.GraphDef() 164 | graph_def.ParseFromString(f.read()) 165 | tf.import_graph_def(graph_def, name='') 166 | 167 | with tf.Session() as sess: 168 | in_tensor = sess.graph.get_tensor_by_name('inputs:0') 169 | in_length_tensor = sess.graph.get_tensor_by_name('input_lengths:0') 170 | split_infos_tensor = sess.graph.get_tensor_by_name('split_infos:0') 171 | 172 | mel_output_tensor = sess.graph.get_tensor_by_name('Tacotron_model/inference/Minimum_1:0') 173 | # linear_output_tensor = sess.graph.get_tensor_by_name('Tacotron_model/inference/Minimum_2:0') 174 | # alignments_output_tensor = sess.graph.get_tensor_by_name('Tacotron_model/inference/transpose:0') 175 | stop_token_output_tensor = sess.graph.get_tensor_by_name('Tacotron_model/inference/Reshape_2:0') 176 | 177 | feed_dict = {in_tensor: input_seqs, 178 | in_length_tensor: input_lengths_np, 179 | split_infos_tensor: split_infos_np} 180 | 181 | mel_out,stop_token_output = sess.run([mel_output_tensor,stop_token_output_tensor], feed_dict=feed_dict) 182 | 183 | # postprocess 184 | mel_out = np.squeeze(mel_out, 0) 185 | target_length = 0 186 | stop_tokens_list = np.round(stop_token_output).tolist() 187 | for row in stop_tokens_list: 188 | if 1 in row: 189 | target_length = row.index(1) 190 | else: 191 | target_length = len(row) 192 | 193 | # Take off the batch wise padding 194 | mel_out = mel_out[:target_length, :] 195 | 196 | mel_out = np.clip(mel_out, -4, 4) 197 | print(mel_out.shape) 198 | np.save('mel_out.npy',mel_out) 199 | 200 | 201 | 202 | 203 | if __name__ == '__main__': 204 | 205 | src_model_path = './logs-Tacotron-2_phone/taco_pretrained/tacotron_model.ckpt-250000' 206 | des_model_path = './logs-Tacotron-2_phone/taco_pretrained1/new_model' 207 | 208 | pb_file = 'tacotron2.pb' 209 | 210 | # 1.re-save model 211 | # t2_hparams = hparams.parse('') 212 | # synth = Synthesizer() 213 | # synth.load(t2_hparams,src_model_path,des_model_path) 214 | 215 | # 2.frozen_model 216 | # ckpt2pb(des_model_path, pb_file) 217 | 218 | # 3.test 219 | # t1 = time.time() 220 | # inference_pb(pb_file) 221 | # print('time:',time.time() - t1) 222 | print('done') 223 | 224 | -------------------------------------------------------------------------------- /gen_serving_model_tacotron2.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import argparse 5 | import os 6 | import shutil 7 | 8 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 9 | 10 | export_model = './save_model' 11 | parser = argparse.ArgumentParser(description='Generate a saved model.') 12 | parser.add_argument('--export_model_dir', type=str, default=export_model, help='export model directory') 13 | parser.add_argument('--model_version', type=int, default=1, help='model version') 14 | 
parser.add_argument('--model', type=str, default='tacotron2.pb', help='model pb file') 15 | 16 | args = parser.parse_args() 17 | 18 | if os.path.exists(export_model): 19 | shutil.rmtree(export_model) 20 | 21 | if __name__ == '__main__': 22 | 23 | #----------------- 24 | with tf.Session() as sess: 25 | with tf.gfile.GFile(args.model, "rb") as f: 26 | restored_graph_def = tf.GraphDef() 27 | restored_graph_def.ParseFromString(f.read()) 28 | tf.import_graph_def( 29 | restored_graph_def, 30 | input_map=None, 31 | return_elements=None, 32 | name="") 33 | 34 | # #打印节点信息 35 | # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node] 36 | # for tensor_name in tensor_name_list: 37 | # print(tensor_name) 38 | 39 | export_path_base = args.export_model_dir 40 | export_path = os.path.join(tf.compat.as_bytes(export_path_base), 41 | tf.compat.as_bytes(str(args.model_version))) 42 | print('Exporting trained model to', export_path) 43 | builder = tf.saved_model.builder.SavedModelBuilder(export_path) 44 | 45 | # input 46 | in_tensor = tf.get_default_graph().get_tensor_by_name('inputs:0') 47 | in_length_tensor = tf.get_default_graph().get_tensor_by_name('input_lengths:0') 48 | split_infos_tensor = tf.get_default_graph().get_tensor_by_name('split_infos:0') 49 | 50 | # output 51 | mel_output_tensor = tf.get_default_graph().get_tensor_by_name('Tacotron_model/inference/Minimum_1:0') 52 | stop_token_output_tensor = tf.get_default_graph().get_tensor_by_name('Tacotron_model/inference/Reshape_2:0') 53 | 54 | # postprocess 55 | mel_output_tensor = tf.squeeze(mel_output_tensor, 0) 56 | stop_token_output_round = tf.round(stop_token_output_tensor) 57 | keep = tf.where(tf.squeeze(stop_token_output_round, 0) >= 1) 58 | keep_index = tf.squeeze(tf.cast(keep, tf.int32)) 59 | output_lengths = tf.cond(tf.equal(tf.size(keep_index), 0), lambda: tf.shape(stop_token_output_round)[1], 60 | lambda: keep_index) 61 | mel_output_tensor = mel_output_tensor[:output_lengths, :] 62 | mel_output_tensor = tf.clip_by_value(mel_output_tensor, -4, 4) 63 | 64 | 65 | #build_tensor_info 66 | tensor_in = tf.saved_model.utils.build_tensor_info(in_tensor) 67 | tensor_in_length = tf.saved_model.utils.build_tensor_info(in_length_tensor) 68 | tensor_split_infos = tf.saved_model.utils.build_tensor_info(split_infos_tensor) 69 | 70 | model_out_mel = tf.saved_model.utils.build_tensor_info(mel_output_tensor) 71 | 72 | prediction_signature = ( 73 | tf.saved_model.signature_def_utils.build_signature_def( 74 | inputs={'inputs': tensor_in, 75 | 'input_lengths': tensor_in_length, 76 | 'split_infos': tensor_split_infos, 77 | }, 78 | outputs={'mel': model_out_mel, 79 | }, 80 | method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME 81 | )) 82 | 83 | builder.add_meta_graph_and_variables( 84 | sess, [tf.saved_model.tag_constants.SERVING], 85 | signature_def_map={ 86 | 'predict':prediction_signature, 87 | }) 88 | 89 | builder.save(as_text=False) 90 | print('Done exporting!') 91 | 92 | -------------------------------------------------------------------------------- /gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | *.pb 3 | *.ckpt 4 | *.trt 5 | *.onnx 6 | -------------------------------------------------------------------------------- /griffin_lim_synthesis_tool.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | 
}, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "from datasets.audio import *\n", 13 | "import os\n", 14 | "from hparams import hparams\n", 15 | "\n", 16 | "n_sample = 0 #Change n_steps here\n", 17 | "mel_folder = 'logs-Tacotron/mel-spectrograms' #Or change file path\n", 18 | "mel_file = 'mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n", 19 | "out_dir = 'wav_out'\n", 20 | "\n", 21 | "os.makedirs(out_dir, exist_ok=True)\n", 22 | "\n", 23 | "#mel_file = os.path.join(mel_folder, mel_file)\n", 24 | "mel_file = 'training_data/mels/mel-LJ001-0001.npy'\n", 25 | "mel_spectro = np.load(mel_file)\n", 26 | "mel_spectro.shape" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n", 36 | "#save the wav under test__\n", 37 | "save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 38 | " sr=hparams.sample_rate)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from tacotron.utils.plot import *\n", 48 | "\n", 49 | "plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "lin_file = 'training_data/linear/linear-LJ001-0001.npy'\n", 59 | "lin_spectro = np.load(lin_file)\n", 60 | "lin_spectro.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "wav = inv_linear_spectrogram(lin_spectro.T, hparams)\n", 70 | "save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 71 | " sr=hparams.sample_rate)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 81 | " auto_aspect=True)" 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.6.4" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /inference_melgan.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | import glob 3 | import tqdm 4 | import torch 5 | import argparse 6 | from scipy.io.wavfile import write 7 | import numpy as np 8 | from melgan_vocoder.model.generator import Generator 9 | from melgan_vocoder.utils.hparams import HParam, load_hparam_str 10 | import os 11 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | def main(args): 16 | checkpoint = 
torch.load(args.checkpoint_path) 17 | if args.config is not None: 18 | hp = HParam(args.config) 19 | else: 20 | hp = load_hparam_str(checkpoint['hp_str']) 21 | 22 | model = Generator(hp.audio.n_mel_channels).cuda() 23 | model.load_state_dict(checkpoint['model_g']) 24 | model.eval(inference=False) 25 | 26 | # torch.save(model, 'genertor1.pt') # 保存和加载整个模型 27 | # torch.save(model.state_dict(), 'genertor2.pt') # 仅保存和加载模型参数(推荐使用) 28 | 29 | num = 0 30 | with torch.no_grad(): 31 | for melpath in tqdm.tqdm(glob.glob(os.path.join(args.input_folder, '*.npy'))): 32 | 33 | t2_mel = np.load(melpath) 34 | t2_mel = np.transpose(t2_mel,[1,0]) 35 | t2_mel = t2_mel[np.newaxis, :] 36 | mel = torch.from_numpy(t2_mel) 37 | mel = mel.cuda() 38 | mel_np = mel.cpu().numpy() 39 | 40 | audio = model.inference(mel) 41 | 42 | audio = audio.cpu().detach().numpy() 43 | 44 | out_path = args.save_path + str(num) + ('_reconstructed_epoch%04d.wav' % checkpoint['epoch']) 45 | write(out_path, hp.audio.sampling_rate, audio) 46 | num += 1 47 | 48 | 49 | if __name__ == '__main__': 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('-c', '--config', type=str, default=None, 52 | help="yaml file for config. will use hp_str from checkpoint if not given.") 53 | parser.add_argument('-p', '--checkpoint_path', type=str, default='./melgan_vocoder/chkpt/biaobei/biaobei_aca5990_3125.pt', 54 | help="path of checkpoint pt file for evaluation") 55 | parser.add_argument('-i', '--input_folder', type=str, default= './melgan_vocoder/data/test/mel/' , 56 | help="directory of mel-spectrograms to invert into raw audio. ") 57 | parser.add_argument('-s', '--save_path', type=str, default='./melgan_vocoder/data/test/wav/') 58 | args = parser.parse_args() 59 | 60 | main(args) 61 | -------------------------------------------------------------------------------- /inference_tacotron2_melgan.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | 3 | import argparse 4 | import os 5 | from tqdm import tqdm 6 | import numpy as np 7 | 8 | import tensorflow as tf 9 | from hparams import hparams, hparams_debug_string 10 | from infolog import log 11 | from tacotron.synthesizer import Synthesizer 12 | 13 | # 14 | import torch 15 | from scipy.io.wavfile import write 16 | from melgan_vocoder.model.generator import Generator 17 | from melgan_vocoder.utils.hparams import HParam, load_hparam_str 18 | 19 | import warnings 20 | warnings.filterwarnings("ignore") 21 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 22 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 23 | 24 | MAX_WAV_VALUE = 32768.0 25 | 26 | 27 | def get_sentences(args): 28 | if args.text_list != '': 29 | with open(args.text_list, 'rb') as f: 30 | sentences = list(map(lambda l: l.decode("utf-8")[:-1], f.readlines())) 31 | else: 32 | sentences = hparams.sentences 33 | return sentences 34 | 35 | def init_tacotron2(args): 36 | # t2 37 | print('\n#####################################') 38 | if args.model == 'Tacotron': 39 | print('\nInitialising Tacotron Model...\n') 40 | t2_hparams = hparams.parse(args.hparams) 41 | try: 42 | checkpoint_path = tf.train.get_checkpoint_state(args.taco_checkpoint).model_checkpoint_path 43 | log('loaded model at {}'.format(checkpoint_path)) 44 | except: 45 | raise RuntimeError('Failed to load checkpoint at {}'.format(args.taco_checkpoint)) 46 | 47 | output_dir = 'tacotron_' + args.output_dir 48 | eval_dir = os.path.join(output_dir, 'eval') 49 | log_dir = os.path.join(output_dir, 'logs-eval') 50 | print('eval_dir:', 
eval_dir) 51 | print('args.mels_dir:', args.mels_dir) 52 | 53 | # Create output path if it doesn't exist 54 | os.makedirs(eval_dir, exist_ok=True) 55 | os.makedirs(log_dir, exist_ok=True) 56 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 57 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 58 | log(hparams_debug_string()) 59 | synth = Synthesizer() 60 | synth.load(checkpoint_path, t2_hparams) 61 | 62 | return synth,eval_dir,log_dir 63 | 64 | def init_melgan(args): 65 | # melgan 66 | print('\n#####################################') 67 | checkpoint = torch.load(args.vocoder_checkpoint) 68 | if args.vocoder_config is not None: 69 | hp = HParam(args.config) 70 | else: 71 | hp = load_hparam_str(checkpoint['hp_str']) 72 | 73 | melgan_model = Generator(hp.audio.n_mel_channels).cuda() 74 | melgan_model.load_state_dict(checkpoint['model_g']) 75 | melgan_model.eval(inference=False) 76 | 77 | # torch.save(model, 'genertor1.pt') # 保存和加载整个模型 78 | # torch.save(model.state_dict(), 'genertor2.pt') # 仅保存和加载模型参数(推荐使用) 79 | 80 | return melgan_model,hp,checkpoint 81 | 82 | 83 | def main(): 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument('--taco_checkpoint', 86 | default='./logs-Tacotron-2_phone/taco_pretrained/',help='Path to model checkpoint') 87 | parser.add_argument('--hparams', default='', 88 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 89 | parser.add_argument('--model', default='Tacotron') 90 | parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet') 91 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 92 | parser.add_argument('--text_list', default='sentences_phone.txt', help='Text file contains list of texts to be synthesized. Valid if mode=eval') 93 | parser.add_argument('--speaker_id', default=None, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids') 94 | 95 | # melgan 96 | parser.add_argument('--vocoder_config', type=str, default=None, 97 | help="yaml file for config. 
will use hp_str from checkpoint if not given.") 98 | parser.add_argument('--vocoder_checkpoint', type=str, default='./melgan_vocoder/chkpt/biaobei/biaobei_aca5990_3125.pt', 99 | help="path of checkpoint pt file for evaluation") 100 | 101 | args = parser.parse_args() 102 | sentences = get_sentences(args) 103 | 104 | ############################ 105 | synth, eval_dir, log_dir = init_tacotron2(args) 106 | 107 | voc_model,voc_hp,voc_checkpoint = init_melgan(args) 108 | output_melgan_dir = 'tacotron_' + args.output_dir + 'melgan/' 109 | os.makedirs(output_melgan_dir, exist_ok=True) 110 | 111 | # ################################### 112 | # Set inputs batch wise 113 | sentences = [sentences[i: i + hparams.tacotron_synthesis_batch_size] for i in 114 | range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 115 | 116 | log('Starting Synthesis') 117 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 118 | for i, texts in enumerate(tqdm(sentences)): 119 | print('\nsynthesis mel:' + str(i)) 120 | basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))] 121 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 122 | for elems in zip(texts, mel_filenames, speaker_ids): 123 | file.write('|'.join([str(x) for x in elems]) + '\n') 124 | print('\nsynthesis mel done') 125 | 126 | # melgan 127 | with torch.no_grad(): 128 | mel_filenames = mel_filenames[0] 129 | t2_mel = np.load(mel_filenames) 130 | t2_mel = np.transpose(t2_mel, [1, 0]) 131 | t2_mel = t2_mel[np.newaxis, :] 132 | mel = torch.from_numpy(t2_mel) 133 | mel = mel.cuda() 134 | mel_np = mel.cpu().numpy() 135 | 136 | audio = voc_model.inference(mel) 137 | 138 | audio = audio.cpu().detach().numpy() 139 | 140 | out_path = output_melgan_dir + str(i) + ('_melgan_epoch%04d.wav' % voc_checkpoint['epoch']) 141 | write(out_path, voc_hp.audio.sampling_rate, audio) 142 | 143 | print('\nmelgan done') 144 | print('#####################\n') 145 | 146 | log('\nsynthesized done at {}'.format(output_melgan_dir)) 147 | 148 | 149 | if __name__ == '__main__': 150 | main() 151 | -------------------------------------------------------------------------------- /infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import json 3 | from datetime import datetime 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | _format = '%Y-%m-%d %H:%M:%S.%f' 8 | _file = None 9 | _run_name = None 10 | _slack_url = None 11 | 12 | 13 | def init(filename, run_name, slack_url=None): 14 | global _file, _run_name, _slack_url 15 | _close_logfile() 16 | _file = open(filename, 'a') 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new {} training run\n'.format(run_name)) 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end='\n', slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | 
urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /melgan_vocoder/.gitignore: -------------------------------------------------------------------------------- 1 | # IDE configuration 2 | .idea/ 3 | 4 | # configuration 5 | config/* 6 | !config/default.yaml 7 | temp-restore.yaml 8 | 9 | # logs, checkpoints 10 | chkpt/ 11 | logs/ 12 | 13 | # just a temporary folder 14 | temp/ 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | -------------------------------------------------------------------------------- /melgan_vocoder/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Seungwon Park 박승원 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /melgan_vocoder/README.md: -------------------------------------------------------------------------------- 1 | # MelGAN 2 | Unofficial PyTorch implementation of [MelGAN vocoder](https://arxiv.org/abs/1910.06711) 3 | 4 | ## Key Features 5 | 6 | - MelGAN is lighter, faster, and better at generalizing to unseen speakers than [WaveGlow](https://github.com/NVIDIA/waveglow). 7 | - This repository use identical mel-spectrogram function from [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2), so this can be directly used to convert output from NVIDIA's tacotron2 into raw-audio. 8 | - Pretrained model on LJSpeech-1.1 via [PyTorch Hub](https://pytorch.org/hub). 9 | 10 | ![](./assets/gd.png) 11 | 12 | ## Prerequisites 13 | 14 | Tested on Python 3.6 15 | ```bash 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | ## Prepare Dataset 20 | 21 | - Download dataset for training. This can be any wav files with sample rate 22050Hz. (e.g. LJSpeech was used in paper) 22 | - preprocess: `python preprocess.py -c config/default.yaml -d [data's root path]` 23 | - Edit configuration `yaml` file 24 | 25 | ## Train & Tensorboard 26 | 27 | - `python trainer.py -c [config yaml file] -n [name of the run]` 28 | - `cp config/default.yaml config/config.yaml` and then edit `config.yaml` 29 | - Write down the root path of train/validation files to 2nd/3rd line. 30 | - Each path should contain pairs of `*.wav` with corresponding (preprocessed) `*.mel` file. 31 | - The data loader parses list of files within the path recursively. 32 | - `tensorboard --logdir logs/` 33 | 34 | ## Pretrained model 35 | 36 | Try with Google Colab: TODO 37 | 38 | ```python 39 | import torch 40 | vocoder = torch.hub.load('seungwonpark/melgan', 'melgan') 41 | vocoder.eval() 42 | mel = torch.randn(1, 80, 234) # use your own mel-spectrogram here 43 | 44 | if torch.cuda.is_available(): 45 | vocoder = vocoder.cuda() 46 | mel = mel.cuda() 47 | 48 | with torch.no_grad(): 49 | audio = vocoder.inference(mel) 50 | ``` 51 | 52 | ## Inference 53 | 54 | - `python inference.py -p [checkpoint path] -i [input mel path]` 55 | 56 | ## Results 57 | 58 | See audio samples at: http://swpark.me/melgan/. 59 | Model was trained at V100 GPU for 14 days using LJSpeech-1.1. 60 | 61 | ![](./assets/lj-tensorboard-v0.3-alpha.png) 62 | 63 | 64 | ## Implementation Authors 65 | 66 | - [Seungwon Park](http://swpark.me) @ MINDsLab Inc. (yyyyy@snu.ac.kr, swpark@mindslab.ai) 67 | - Myunchul Joe @ MINDsLab Inc. 68 | - [Rishikesh](https://github.com/rishikksh20) @ DeepSync Technologies Pvt Ltd. 69 | 70 | ## License 71 | 72 | BSD 3-Clause License. 
73 | 74 | - [utils/stft.py](./utils/stft.py) by Prem Seetharaman (BSD 3-Clause License) 75 | - [datasets/mel2samp.py](./datasets/mel2samp.py) from https://github.com/NVIDIA/waveglow (BSD 3-Clause License) 76 | - [utils/hparams.py](./utils/hparams.py) from https://github.com/HarryVolek/PyTorch_Speaker_Verification (No License specified) 77 | 78 | ## Useful resources 79 | 80 | - [How to Train a GAN? Tips and tricks to make GANs work](https://github.com/soumith/ganhacks) by Soumith Chintala 81 | - [Official MelGAN implementation by original authors](https://github.com/descriptinc/melgan-neurips) 82 | - [Reproduction of MelGAN - NeurIPS 2019 Reproducibility Challenge (Ablation Track)](https://openreview.net/pdf?id=9jTbNbBNw0) by Yifei Zhao, Yichao Yang, and Yang Gao 83 | - "replacing the average pooling layer with max pooling layer and replacing reflection padding with replication padding improves the performance significantly, while combining them produces worse results" 84 | -------------------------------------------------------------------------------- /melgan_vocoder/assets/gd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wqt2019/tacotron-2_melgan/6c8a1e2ddc257aec79d63c1157370b15fe5ca781/melgan_vocoder/assets/gd.png -------------------------------------------------------------------------------- /melgan_vocoder/assets/lj-tensorboard-v0.3-alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wqt2019/tacotron-2_melgan/6c8a1e2ddc257aec79d63c1157370b15fe5ca781/melgan_vocoder/assets/lj-tensorboard-v0.3-alpha.png -------------------------------------------------------------------------------- /melgan_vocoder/assets/lj-tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wqt2019/tacotron-2_melgan/6c8a1e2ddc257aec79d63c1157370b15fe5ca781/melgan_vocoder/assets/lj-tensorboard.png -------------------------------------------------------------------------------- /melgan_vocoder/config/default.yaml: -------------------------------------------------------------------------------- 1 | data: # root path of train/validation data (either relative/absoulte path is ok) 2 | train: './data/train/wav/' 3 | validation: './data/valid/wav/' 4 | --- 5 | train: 6 | rep_discriminator: 1 7 | num_workers: 2 #32 8 | batch_size: 2 #16 9 | optimizer: 'adam' 10 | adam: 11 | lr: 0.0001 12 | beta1: 0.5 13 | beta2: 0.9 14 | --- 15 | audio: 16 | n_mel_channels: 80 17 | segment_length: 16000 18 | pad_short: 2000 19 | filter_length: 1024 20 | hop_length: 256 # WARNING: this can't be changed. 
21 | win_length: 1024 22 | sampling_rate: 22050 23 | mel_fmin: 0.0 24 | mel_fmax: 8000.0 25 | --- 26 | model: 27 | feat_match: 10.0 28 | --- 29 | log: 30 | summary_interval: 1 31 | validation_interval: 10 32 | save_interval: 25 33 | chkpt_dir: 'chkpt' 34 | log_dir: 'logs' 35 | -------------------------------------------------------------------------------- /melgan_vocoder/data/test/mel/mel_out.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wqt2019/tacotron-2_melgan/6c8a1e2ddc257aec79d63c1157370b15fe5ca781/melgan_vocoder/data/test/mel/mel_out.npy -------------------------------------------------------------------------------- /melgan_vocoder/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wqt2019/tacotron-2_melgan/6c8a1e2ddc257aec79d63c1157370b15fe5ca781/melgan_vocoder/datasets/__init__.py -------------------------------------------------------------------------------- /melgan_vocoder/datasets/dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import torch 4 | import random 5 | import numpy as np 6 | from torch.utils.data import Dataset, DataLoader 7 | 8 | from utils.utils import read_wav_np,read_wav_np_librosa 9 | 10 | 11 | def create_dataloader(hp, args, train): 12 | dataset = MelFromDisk(hp, args, train) 13 | 14 | if train: 15 | return DataLoader(dataset=dataset, batch_size=hp.train.batch_size, shuffle=True, 16 | num_workers=hp.train.num_workers, pin_memory=True, drop_last=True) 17 | else: 18 | return DataLoader(dataset=dataset, batch_size=1, shuffle=False, 19 | num_workers=hp.train.num_workers, pin_memory=True, drop_last=False) 20 | 21 | 22 | class MelFromDisk(Dataset): 23 | def __init__(self, hp, args, train): 24 | self.hp = hp 25 | self.args = args 26 | self.train = train 27 | self.path = hp.data.train if train else hp.data.validation 28 | # self.wav_list = glob.glob(os.path.join(self.path, '**', '*.wav'), recursive=True) 29 | self.wav_list = glob.glob(os.path.join(self.path, '**', 'audio*'), recursive=True) 30 | self.mel_segment_length = hp.audio.segment_length // hp.audio.hop_length + 2 31 | self.mapping = [i for i in range(len(self.wav_list))] 32 | 33 | def __len__(self): 34 | return len(self.wav_list) 35 | 36 | def __getitem__(self, idx): 37 | if self.train: 38 | idx1 = idx 39 | idx2 = self.mapping[idx1] 40 | return self.my_getitem(idx1), self.my_getitem(idx2) 41 | else: 42 | return self.my_getitem(idx) 43 | 44 | def shuffle_mapping(self): 45 | random.shuffle(self.mapping) 46 | 47 | def my_getitem(self, idx): 48 | wavpath = self.wav_list[idx] 49 | # melpath = wavpath.replace('.wav', '.mel') 50 | 51 | # tmp = os.path.split(wavpath) 52 | # melpath = tmp[0] + '/mel-' + tmp[1].replace('.wav', '.npy') 53 | 54 | melpath = wavpath.replace('audio', 'mel') 55 | 56 | # sr, audio = read_wav_np(wavpath) 57 | # sr, audio = read_wav_np_librosa(wavpath, self.hp.audio.sampling_rate) 58 | 59 | sr = self.hp.audio.sampling_rate 60 | audio = np.load(wavpath) 61 | 62 | if len(audio) < self.hp.audio.segment_length + self.hp.audio.pad_short: 63 | audio = np.pad(audio, (0, self.hp.audio.segment_length + self.hp.audio.pad_short - len(audio)), \ 64 | mode='constant', constant_values=0.0) 65 | 66 | audio = torch.from_numpy(audio).unsqueeze(0) 67 | # mel = torch.load(melpath).squeeze(0) 68 | 69 | mel_np = np.load(melpath) 70 | mel_np = np.transpose(mel_np, [1, 0]) 71 | 
mel = torch.from_numpy(mel_np) 72 | 73 | 74 | if self.train: 75 | max_mel_start = mel.size(1) - self.mel_segment_length 76 | mel_start = random.randint(0, max_mel_start) 77 | mel_end = mel_start + self.mel_segment_length 78 | mel = mel[:, mel_start:mel_end] 79 | 80 | audio_start = mel_start * self.hp.audio.hop_length 81 | audio = audio[:, audio_start:audio_start+self.hp.audio.segment_length] 82 | 83 | audio = audio + (1/32768) * torch.randn_like(audio) 84 | return mel, audio 85 | -------------------------------------------------------------------------------- /melgan_vocoder/hubconf.py: -------------------------------------------------------------------------------- 1 | dependencies = ['torch'] 2 | import torch 3 | from model.generator import Generator 4 | 5 | model_params = { 6 | 'nvidia_tacotron2_LJ11_epoch6400': { 7 | 'mel_channel': 80, 8 | 'model_url': 'https://github.com/seungwonpark/melgan/releases/download/v0.3-alpha/nvidia_tacotron2_LJ11_epoch6400.pt', 9 | }, 10 | } 11 | 12 | 13 | def melgan(model_name='nvidia_tacotron2_LJ11_epoch6400', pretrained=True, progress=True): 14 | params = model_params[model_name] 15 | model = Generator(params['mel_channel']) 16 | 17 | if pretrained: 18 | state_dict = torch.hub.load_state_dict_from_url(params['model_url'], 19 | progress=progress) 20 | model.load_state_dict(state_dict['model_g']) 21 | 22 | model.eval(inference=True) 23 | 24 | return model 25 | 26 | 27 | if __name__ == '__main__': 28 | vocoder = torch.hub.load('seungwonpark/melgan', 'melgan') 29 | mel = torch.randn(1, 80, 234) # use your own mel-spectrogram here 30 | 31 | print('Input mel-spectrogram shape: {}'.format(mel.shape)) 32 | 33 | if torch.cuda.is_available(): 34 | print('Moving data & model to GPU') 35 | vocoder = vocoder.cuda() 36 | mel = mel.cuda() 37 | 38 | with torch.no_grad(): 39 | audio = vocoder.inference(mel) 40 | 41 | print('Output audio shape: {}'.format(audio.shape)) 42 | -------------------------------------------------------------------------------- /melgan_vocoder/melgan2onnx.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | 3 | import onnx # 'import onnx' must before 'import torch' 4 | import torch 5 | from torch.autograd import Variable 6 | import onnxruntime 7 | import numpy as np 8 | from scipy.io.wavfile import write 9 | from model.generator import Generator 10 | 11 | # use MyRefPad1d() instead nn.ReflectionPad1d() in generator.py and res_stack.py 12 | # tensorrt not support nn.ReflectionPad1d() 13 | 14 | # python -m onnxsim melgan_dynamic.onnx melgan_new_dynamic.onnx --input-shape "1,80,100" 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | n_mel_channels = 80 18 | hop_length = 256 19 | sampling_rate = 22050 20 | test_time_step = 100 21 | 22 | export_onnx_file = 'melgan_dynamic.onnx' 23 | mel_file = './data/test/t2_mel/mel-batch_11_sentence_0.npy' 24 | checkpoint_path = './chkpt/biaobei/biaobei_aca5990_3125.pt' 25 | 26 | 27 | def torch2onnx(export_onnx_file): 28 | 29 | input_name = ['input'] 30 | output_name = ['output'] 31 | 32 | input = Variable(torch.randn(1, 80, test_time_step)) 33 | 34 | checkpoint = torch.load(checkpoint_path) 35 | model = Generator(n_mel_channels) 36 | model.load_state_dict(checkpoint['model_g']) 37 | model.eval(inference=True) # onnx-inference must true 38 | 39 | torch.onnx.export(model, input, export_onnx_file, 40 | input_names=input_name, 41 | output_names=output_name, 42 | verbose=True, 43 | opset_version=10, 44 | export_params=True, 45 | keep_initializers_as_inputs=True, 46 | 
dynamic_axes={"input": {2: "time_step"}, 47 | "output": {2: "time_step"}} 48 | ) 49 | 50 | test = onnx.load(export_onnx_file) 51 | onnx.checker.check_model(test) 52 | 53 | print("Producer Name:", test.producer_name) 54 | print("Producer Version:", test.producer_version) 55 | print("Opset", test.opset_import[0]) 56 | 57 | print("==> Passed") 58 | 59 | #--------------------------- 60 | 61 | 62 | def to_numpy(tensor): 63 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 64 | 65 | 66 | def onnxruntime_infer(mel_file,export_onnx_file): 67 | 68 | t2_mel = np.load(mel_file) 69 | t2_mel = np.transpose(t2_mel, [1, 0]) 70 | t2_mel = t2_mel[np.newaxis, :] 71 | mel = torch.from_numpy(t2_mel) 72 | zero = torch.full((1, 80, 10), -4) 73 | mel = torch.cat((mel, zero), dim=2) 74 | mel_np = mel.cpu().numpy() 75 | # mel_np = mel.cpu().numpy()[:, :, :test_time_step] 76 | 77 | 78 | ort_session = onnxruntime.InferenceSession(export_onnx_file) 79 | 80 | for input_meta in ort_session.get_inputs(): 81 | print(input_meta) 82 | for output_meta in ort_session.get_outputs(): 83 | print(output_meta) 84 | 85 | ort_inputs = {ort_session.get_inputs()[0].name: mel_np} 86 | ort_outs = ort_session.run(["output"], ort_inputs) 87 | 88 | audio = ort_outs[0].squeeze() 89 | audio = torch.from_numpy(audio) 90 | audio = audio[:-(hop_length * 10)] 91 | audio = MAX_WAV_VALUE * audio 92 | audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1) 93 | audio = audio.short() 94 | 95 | audio = audio.cpu().detach().numpy() 96 | out_path = 'save.wav' 97 | write(out_path, sampling_rate, audio) 98 | 99 | print('done') 100 | 101 | if __name__ == '__main__': 102 | 103 | torch2onnx(export_onnx_file) 104 | onnxruntime_infer(mel_file , export_onnx_file) 105 | 106 | 107 | -------------------------------------------------------------------------------- /melgan_vocoder/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wqt2019/tacotron-2_melgan/6c8a1e2ddc257aec79d63c1157370b15fe5ca781/melgan_vocoder/model/__init__.py -------------------------------------------------------------------------------- /melgan_vocoder/model/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Discriminator(nn.Module): 7 | def __init__(self): 8 | super(Discriminator, self).__init__() 9 | 10 | self.discriminator = nn.ModuleList([ 11 | nn.Sequential( 12 | nn.ReflectionPad1d(7), 13 | nn.utils.weight_norm(nn.Conv1d(1, 16, kernel_size=15, stride=1)), 14 | nn.LeakyReLU(0.2, inplace=True), 15 | ), 16 | nn.Sequential( 17 | nn.utils.weight_norm(nn.Conv1d(16, 64, kernel_size=41, stride=4, padding=20, groups=4)), 18 | nn.LeakyReLU(0.2, inplace=True), 19 | ), 20 | nn.Sequential( 21 | nn.utils.weight_norm(nn.Conv1d(64, 256, kernel_size=41, stride=4, padding=20, groups=16)), 22 | nn.LeakyReLU(0.2, inplace=True), 23 | ), 24 | nn.Sequential( 25 | nn.utils.weight_norm(nn.Conv1d(256, 1024, kernel_size=41, stride=4, padding=20, groups=64)), 26 | nn.LeakyReLU(0.2, inplace=True), 27 | ), 28 | nn.Sequential( 29 | nn.utils.weight_norm(nn.Conv1d(1024, 1024, kernel_size=41, stride=4, padding=20, groups=256)), 30 | nn.LeakyReLU(0.2, inplace=True), 31 | ), 32 | nn.Sequential( 33 | nn.utils.weight_norm(nn.Conv1d(1024, 1024, kernel_size=5, stride=1, padding=2)), 34 | nn.LeakyReLU(0.2, inplace=True), 35 | ), 36 | 
nn.utils.weight_norm(nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=1)), 37 | ]) 38 | 39 | def forward(self, x): 40 | ''' 41 | returns: (list of 6 features, discriminator score) 42 | we directly predict score without last sigmoid function 43 | since we're using Least Squares GAN (https://arxiv.org/abs/1611.04076) 44 | ''' 45 | features = list() 46 | for module in self.discriminator: 47 | x = module(x) 48 | features.append(x) 49 | return features[:-1], features[-1] 50 | 51 | 52 | if __name__ == '__main__': 53 | model = Discriminator() 54 | 55 | x = torch.randn(3, 1, 22050) 56 | print(x.shape) 57 | 58 | features, score = model(x) 59 | for feat in features: 60 | print(feat.shape) 61 | print(score.shape) 62 | 63 | pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 64 | print(pytorch_total_params) -------------------------------------------------------------------------------- /melgan_vocoder/model/generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .res_stack import ResStack , MyRefPad1d 6 | # from res_stack import ResStack 7 | 8 | MAX_WAV_VALUE = 32768.0 9 | 10 | class Generator(nn.Module): 11 | def __init__(self, mel_channel): 12 | super(Generator, self).__init__() 13 | self.mel_channel = mel_channel 14 | 15 | self.generator = nn.Sequential( 16 | nn.ReflectionPad1d(3), # tensorrt not support 17 | # MyRefPad1d(3), 18 | nn.utils.weight_norm(nn.Conv1d(mel_channel, 512, kernel_size=7, stride=1)), 19 | 20 | nn.LeakyReLU(0.2), 21 | nn.utils.weight_norm(nn.ConvTranspose1d(512, 256, kernel_size=16, stride=8, padding=4)), 22 | 23 | ResStack(256), 24 | 25 | nn.LeakyReLU(0.2), 26 | nn.utils.weight_norm(nn.ConvTranspose1d(256, 128, kernel_size=16, stride=8, padding=4)), 27 | 28 | ResStack(128), 29 | 30 | nn.LeakyReLU(0.2), 31 | nn.utils.weight_norm(nn.ConvTranspose1d(128, 64, kernel_size=4, stride=2, padding=1)), 32 | 33 | ResStack(64), 34 | 35 | nn.LeakyReLU(0.2), 36 | nn.utils.weight_norm(nn.ConvTranspose1d(64, 32, kernel_size=4, stride=2, padding=1)), 37 | 38 | ResStack(32), 39 | 40 | nn.LeakyReLU(0.2), 41 | nn.ReflectionPad1d(3), 42 | # MyRefPad1d(3), 43 | nn.utils.weight_norm(nn.Conv1d(32, 1, kernel_size=7, stride=1)), 44 | nn.Tanh(), 45 | ) 46 | 47 | def forward(self, mel): 48 | mel = (mel + 5.0) / 5.0 # roughly normalize spectrogram 49 | return self.generator(mel) 50 | 51 | def eval(self, inference=False): 52 | super(Generator, self).eval() 53 | 54 | # don't remove weight norm while validation in training loop 55 | if inference: 56 | self.remove_weight_norm() 57 | 58 | def remove_weight_norm(self): 59 | for idx, layer in enumerate(self.generator): 60 | if len(layer.state_dict()) != 0: 61 | try: 62 | nn.utils.remove_weight_norm(layer) 63 | except: 64 | layer.remove_weight_norm() 65 | 66 | def inference(self, mel): 67 | hop_length = 256 68 | # pad input mel with zeros to cut artifact 69 | # see https://github.com/seungwonpark/melgan/issues/8 70 | # zero = torch.full((1, self.mel_channel, 10), -11.5129).to(mel.device) 71 | zero = torch.full((1, self.mel_channel, 10), -4).to(mel.device) 72 | mel = torch.cat((mel, zero), dim=2) 73 | 74 | audio = self.forward(mel) 75 | audio = audio.squeeze() # collapse all dimension except time axis 76 | audio = audio[:-(hop_length*10)] 77 | audio = MAX_WAV_VALUE * audio 78 | audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1) 79 | audio = audio.short() 80 | 81 | return audio 82 | 83 | 84 
| ''' 85 | to run this, fix 86 | from . import ResStack 87 | into 88 | from res_stack import ResStack 89 | ''' 90 | if __name__ == '__main__': 91 | model = Generator(80) 92 | 93 | x = torch.randn(3, 80, 10) 94 | print(x.shape) 95 | 96 | y = model(x) 97 | print(y.shape) 98 | assert y.shape == torch.Size([3, 1, 2560]) 99 | 100 | pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 101 | print(pytorch_total_params) -------------------------------------------------------------------------------- /melgan_vocoder/model/identity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Identity(nn.Module): 7 | def __init__(self): 8 | super(Identity, self).__init__() 9 | 10 | def forward(self, x): 11 | return x 12 | 13 | -------------------------------------------------------------------------------- /melgan_vocoder/model/multiscale.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .discriminator import Discriminator 6 | from .identity import Identity 7 | 8 | 9 | class MultiScaleDiscriminator(nn.Module): 10 | def __init__(self): 11 | super(MultiScaleDiscriminator, self).__init__() 12 | 13 | self.discriminators = nn.ModuleList( 14 | [Discriminator() for _ in range(3)] 15 | ) 16 | 17 | self.pooling = nn.ModuleList( 18 | [Identity()] + 19 | [nn.AvgPool1d(kernel_size=4, stride=2, padding=1, count_include_pad=False) for _ in range(1, 3)] 20 | ) 21 | 22 | def forward(self, x): 23 | ret = list() 24 | 25 | for pool, disc in zip(self.pooling, self.discriminators): 26 | x = pool(x) 27 | ret.append(disc(x)) 28 | 29 | return ret # [(feat, score), (feat, score), (feat, score)] 30 | -------------------------------------------------------------------------------- /melgan_vocoder/model/res_stack.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | 7 | class MyRefPad1d(nn.Module): 8 | def __init__(self, value): 9 | super(MyRefPad1d, self).__init__() 10 | self.value = value 11 | 12 | def forward(self, x): 13 | input_size = x.size() 14 | # input_size = [1,80,100] 15 | # ---------- 16 | ref_pad1d = x.clone() 17 | for i in range(self.value): 18 | tmp1 = x[:, :, i + 1].unsqueeze(2) 19 | tmp2 = x[:, :, input_size[2] - i - 2].unsqueeze(2) 20 | ref_pad1d = torch.cat((tmp1, ref_pad1d), 2) 21 | ref_pad1d = torch.cat((ref_pad1d, tmp2), 2) 22 | 23 | return ref_pad1d 24 | 25 | 26 | class ResStack(nn.Module): 27 | def __init__(self, channel): 28 | super(ResStack, self).__init__() 29 | 30 | self.blocks = nn.ModuleList([ 31 | nn.Sequential( 32 | nn.LeakyReLU(0.2), 33 | nn.ReflectionPad1d(3**i), 34 | # MyRefPad1d(3**i), 35 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=3, dilation=3**i)), 36 | nn.LeakyReLU(0.2), 37 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)), 38 | ) 39 | for i in range(3) 40 | ]) 41 | 42 | self.shortcuts = nn.ModuleList([ 43 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)) 44 | for i in range(3) 45 | ]) 46 | 47 | def forward(self, x): 48 | for block, shortcut in zip(self.blocks, self.shortcuts): 49 | x = shortcut(x) + block(x) 50 | return x 51 | 52 | def remove_weight_norm(self): 53 | for block, shortcut in zip(self.blocks, self.shortcuts): 
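            # block[2] and block[4] are the weight-normalized Conv1d layers inside each Sequential block defined above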
54 | nn.utils.remove_weight_norm(block[2]) 55 | nn.utils.remove_weight_norm(block[4]) 56 | nn.utils.remove_weight_norm(shortcut) 57 | -------------------------------------------------------------------------------- /melgan_vocoder/onnx2trt.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | 3 | import onnx 4 | import pycuda.autoinit 5 | import numpy as np 6 | import pycuda.driver as cuda 7 | import tensorrt as trt 8 | import torch 9 | import os 10 | import time 11 | 12 | test_time_step = 100 13 | MAX_WAV_VALUE = 32768.0 14 | n_mel_channels = 80 15 | hop_length = 256 16 | sampling_rate = 22050 17 | min_dynamic_shape = (1,n_mel_channels,10) 18 | opt_dynamic_shape = (1,n_mel_channels,1000) 19 | max_dynamic_shape = (1,n_mel_channels,2000) 20 | 21 | TRT_LOGGER = trt.Logger(trt.Logger.INFO) # This logger is required to build an engine 22 | 23 | class HostDeviceMem(object): 24 | def __init__(self, host_mem, device_mem): 25 | """Within this context, host_mom means the cpu memory and device means the GPU memory 26 | """ 27 | self.host = host_mem 28 | self.device = device_mem 29 | 30 | def __str__(self): 31 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 32 | 33 | def __repr__(self): 34 | return self.__str__() 35 | 36 | 37 | def allocate_buffers(engine,melgan_time_step): 38 | inputs = [] 39 | outputs = [] 40 | bindings = [] 41 | stream = cuda.Stream() 42 | for binding in engine: 43 | # size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 44 | if(binding == 'input'): 45 | size = 1 * n_mel_channels * melgan_time_step 46 | if (binding == 'output'): 47 | size = hop_length * melgan_time_step 48 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 49 | # Allocate host and device buffers 50 | host_mem = cuda.pagelocked_empty(size, dtype) 51 | device_mem = cuda.mem_alloc(host_mem.nbytes) 52 | # Append the device buffer to device bindings. 53 | bindings.append(int(device_mem)) 54 | # Append to the appropriate list. 
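        # input bindings get a pinned host buffer for the host-to-device copy in do_inference(); output bindings get one for the device-to-host copy back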
55 | if engine.binding_is_input(binding): 56 | inputs.append(HostDeviceMem(host_mem, device_mem)) 57 | else: 58 | outputs.append(HostDeviceMem(host_mem, device_mem)) 59 | return inputs, outputs, bindings, stream 60 | 61 | 62 | def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", \ 63 | fp16_mode=False, int8_mode=False, save_engine=False,): 64 | """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.""" 65 | 66 | def build_engine(max_batch_size, save_engine): 67 | """Takes an ONNX file and creates a TensorRT engine to run inference with""" 68 | 69 | explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 70 | with trt.Builder(TRT_LOGGER) as builder, \ 71 | builder.create_network(explicit_batch) as network, \ 72 | trt.OnnxParser(network, TRT_LOGGER) as parser: 73 | 74 | builder.max_workspace_size = 1 << 30 # Your workspace size 75 | builder.max_batch_size = max_batch_size 76 | builder.fp16_mode = fp16_mode # Default: False 77 | builder.int8_mode = int8_mode # Default: False 78 | 79 | if int8_mode: 80 | # To be updated 81 | raise NotImplementedError 82 | 83 | # Parse model file 84 | if not os.path.exists(onnx_file_path): 85 | quit('ONNX file {} not found'.format(onnx_file_path)) 86 | 87 | print('Loading ONNX file from path {}...'.format(onnx_file_path)) 88 | with open(onnx_file_path, 'rb') as model: 89 | print('Beginning ONNX file parsing') 90 | parser_flag = parser.parse(model.read()) 91 | print(parser_flag) 92 | 93 | if not parser_flag: 94 | print('ERROR: Failed to parse the ONNX file.') 95 | for error in range(parser.num_errors): 96 | print(parser.get_error(error)) 97 | return None 98 | 99 | last_layer = network.get_layer(network.num_layers - 1) 100 | # Check if last layer recognizes it's output 101 | if not last_layer.get_output(0): 102 | # If not, then mark the output using TensorRT API 103 | network.mark_output(last_layer.get_output(0)) 104 | 105 | print('Completed parsing of ONNX file') 106 | print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) 107 | 108 | # dynamic shape 109 | profile = builder.create_optimization_profile() 110 | input_name = network.get_input(0).name 111 | profile.set_shape(input_name, min=min_dynamic_shape, opt=opt_dynamic_shape, max=max_dynamic_shape) 112 | config = builder.create_builder_config() 113 | config.max_workspace_size = 2 ** 30 # 1GiB 114 | config.add_optimization_profile(profile) 115 | 116 | engine = builder.build_engine(network,config) 117 | 118 | # fixed shape 119 | # engine = builder.build_cuda_engine(network) 120 | 121 | print("Completed creating Engine") 122 | 123 | if save_engine: 124 | with open(engine_file_path, "wb") as f: 125 | f.write(engine.serialize()) 126 | return engine 127 | 128 | if os.path.exists(engine_file_path): 129 | # If a serialized engine exists, load it instead of building a new one. 130 | print("Reading engine from file {}".format(engine_file_path)) 131 | with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: 132 | return runtime.deserialize_cuda_engine(f.read()) 133 | else: 134 | return build_engine(max_batch_size, save_engine) 135 | 136 | 137 | def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): 138 | # Transfer data from CPU to the GPU. 139 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 140 | # Run inference. 
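    # execute_async enqueues the engine asynchronously on the CUDA stream, reading from / writing to the device buffers in `bindings`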
141 | context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) 142 | # Transfer predictions back from the GPU. 143 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 144 | # Synchronize the stream 145 | stream.synchronize() 146 | # Return only the host outputs. 147 | return [out.host for out in outputs] 148 | 149 | 150 | if __name__ == '__main__': 151 | 152 | onnx_model_path = 'melgan_dynamic.onnx' 153 | save_file = 'save_trt.wav' 154 | mel_file = './data/test/t2_mel/mel-batch_11_sentence_0.npy' 155 | 156 | #preprocess 157 | t2_mel = np.load(mel_file) 158 | t2_mel = np.transpose(t2_mel, [1, 0]) 159 | t2_mel = t2_mel[np.newaxis, :] 160 | mel = torch.from_numpy(t2_mel) 161 | zero = torch.full((1, 80, 10), -4) 162 | mel = torch.cat((mel, zero), dim=2) 163 | mel_np = mel.cpu().numpy() 164 | real_time_step = mel_np.shape[2] 165 | # mel_np = mel.cpu().numpy()[:, :, :test_time_step] 166 | 167 | # These two modes are dependent on hardwares 168 | fp16_mode = False 169 | int8_mode = False 170 | save_engine = True 171 | trt_engine_path = 'melgan_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode) 172 | # Build an engine 173 | engine = get_engine(1, onnx_model_path, trt_engine_path, fp16_mode, int8_mode, save_engine) 174 | # Create the context for this engine 175 | context = engine.create_execution_context() 176 | context.set_binding_shape(0, (1, n_mel_channels, real_time_step)) # important 177 | 178 | # Allocate buffers for input and output 179 | inputs, outputs, bindings, stream = allocate_buffers(engine,real_time_step) # input, output: host # bindings 180 | 181 | # Do inference 182 | shape_of_output = (1, hop_length * real_time_step) 183 | # Load data to the buffer 184 | inputs[0].host = np.array(mel_np.reshape(-1)) 185 | # inputs[1].host = ... 
for multiple input 186 | 187 | t1 = time.time() 188 | trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data 189 | t2 = time.time() 190 | print('time:',t2-t1) 191 | 192 | # postprocess 193 | audio = trt_outputs[0].squeeze() 194 | audio = torch.from_numpy(audio) 195 | audio = audio[:-(hop_length * 10)] 196 | audio = MAX_WAV_VALUE * audio 197 | audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1) 198 | audio = audio.short() 199 | audio = audio.cpu().detach().numpy() 200 | 201 | from scipy.io.wavfile import write 202 | write(save_file, sampling_rate, audio) 203 | 204 | print('All completed!') 205 | 206 | -------------------------------------------------------------------------------- /melgan_vocoder/preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import tqdm 4 | import torch 5 | import argparse 6 | import numpy as np 7 | 8 | from utils.stft import TacotronSTFT 9 | from utils.hparams import HParam 10 | from utils.utils import read_wav_np,read_wav_np_librosa 11 | 12 | 13 | def main(hp, args): 14 | stft = TacotronSTFT(filter_length=hp.audio.filter_length, 15 | hop_length=hp.audio.hop_length, 16 | win_length=hp.audio.win_length, 17 | n_mel_channels=hp.audio.n_mel_channels, 18 | sampling_rate=hp.audio.sampling_rate, 19 | mel_fmin=hp.audio.mel_fmin, 20 | mel_fmax=hp.audio.mel_fmax) 21 | 22 | wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True) 23 | 24 | for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'): 25 | # sr, wav = read_wav_np(wavpath) 26 | sr, wav = read_wav_np_librosa(wavpath, hp.audio.sampling_rate) 27 | assert sr == hp.audio.sampling_rate, \ 28 | "sample rate mismatch. 
expected %d, got %d at %s" % \ 29 | (hp.audio.sampling_rate, sr, wavpath) 30 | 31 | if len(wav) < hp.audio.segment_length + hp.audio.pad_short: 32 | wav = np.pad(wav, (0, hp.audio.segment_length + hp.audio.pad_short - len(wav)), \ 33 | mode='constant', constant_values=0.0) 34 | 35 | wav = torch.from_numpy(wav).unsqueeze(0) 36 | mel = stft.mel_spectrogram(wav) 37 | 38 | melpath = wavpath.replace('.wav', '.mel') 39 | torch.save(mel, melpath) 40 | 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('-c', '--config', type=str, default='config/default.yaml', 45 | help="yaml file for config.") 46 | parser.add_argument('-d', '--data_path', type=str, default='data/wav2/', 47 | help="root directory of wav files") 48 | args = parser.parse_args() 49 | hp = HParam(args.config) 50 | 51 | main(hp, args) 52 | -------------------------------------------------------------------------------- /melgan_vocoder/requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | matplotlib 3 | numpy 4 | scipy 5 | tensorboardX 6 | torch 7 | tqdm 8 | pillow 9 | pyyaml 10 | -------------------------------------------------------------------------------- /melgan_vocoder/train_melgan.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import logging 4 | import argparse 5 | 6 | from utils.train import train 7 | from utils.hparams import HParam 8 | from utils.writer import MyWriter 9 | from datasets.dataloader import create_dataloader 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-c', '--config', type=str, default= 'config/config.yaml', 15 | help="yaml file for configuration") 16 | parser.add_argument('-p', '--checkpoint_path', type=str, default=None, 17 | help="path of checkpoint pt file to resume training") 18 | parser.add_argument('-n', '--name', type=str, default='biaobei_train', 19 | help="name of the model for logging, saving checkpoint") 20 | args = parser.parse_args() 21 | 22 | hp = HParam(args.config) 23 | with open(args.config, 'r') as f: 24 | hp_str = ''.join(f.readlines()) 25 | 26 | pt_dir = os.path.join(hp.log.chkpt_dir, args.name) 27 | log_dir = os.path.join(hp.log.log_dir, args.name) 28 | os.makedirs(pt_dir, exist_ok=True) 29 | os.makedirs(log_dir, exist_ok=True) 30 | 31 | logging.basicConfig( 32 | level=logging.INFO, 33 | format='%(asctime)s - %(levelname)s - %(message)s', 34 | handlers=[ 35 | logging.FileHandler(os.path.join(log_dir, 36 | '%s-%d.log' % (args.name, time.time()))), 37 | logging.StreamHandler() 38 | ] 39 | ) 40 | logger = logging.getLogger() 41 | 42 | writer = MyWriter(hp, log_dir) 43 | 44 | assert hp.audio.hop_length == 256, \ 45 | 'hp.audio.hop_length must be equal to 256, got %d' % hp.audio.hop_length 46 | assert hp.data.train != '' and hp.data.validation != '', \ 47 | 'hp.data.train and hp.data.validation can\'t be empty: please fix %s' % args.config 48 | 49 | trainloader = create_dataloader(hp, args, True) 50 | valloader = create_dataloader(hp, args, False) 51 | 52 | train(args, pt_dir, args.checkpoint_path, trainloader, valloader, writer, logger, hp, hp_str) 53 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wqt2019/tacotron-2_melgan/6c8a1e2ddc257aec79d63c1157370b15fe5ca781/melgan_vocoder/utils/__init__.py -------------------------------------------------------------------------------- /melgan_vocoder/utils/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.signal import get_window 4 | import librosa.util as librosa_util 5 | 6 | 7 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 8 | n_fft=800, dtype=np.float32, norm=None): 9 | """ 10 | # from librosa 0.6 11 | Compute the sum-square envelope of a window function at a given hop length. 12 | 13 | This is used to estimate modulation effects induced by windowing 14 | observations in short-time fourier transforms. 15 | 16 | Parameters 17 | ---------- 18 | window : string, tuple, number, callable, or list-like 19 | Window specification, as in `get_window` 20 | 21 | n_frames : int > 0 22 | The number of analysis frames 23 | 24 | hop_length : int > 0 25 | The number of samples to advance between frames 26 | 27 | win_length : [optional] 28 | The length of the window function. By default, this matches `n_fft`. 29 | 30 | n_fft : int > 0 31 | The length of each analysis frame. 32 | 33 | dtype : np.dtype 34 | The data type of the output 35 | 36 | Returns 37 | ------- 38 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 39 | The sum-squared envelope of the window function 40 | """ 41 | if win_length is None: 42 | win_length = n_fft 43 | 44 | n = n_fft + hop_length * (n_frames - 1) 45 | x = np.zeros(n, dtype=dtype) 46 | 47 | # Compute the squared window at the desired length 48 | win_sq = get_window(window, win_length, fftbins=True) 49 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 50 | win_sq = librosa_util.pad_center(win_sq, n_fft) 51 | 52 | # Fill the envelope 53 | for i in range(n_frames): 54 | sample = i * hop_length 55 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 56 | return x 57 | 58 | 59 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 60 | """ 61 | PARAMS 62 | ------ 63 | magnitudes: spectrogram magnitudes 64 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 65 | """ 66 | 67 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 68 | angles = angles.astype(np.float32) 69 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 70 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 71 | 72 | for i in range(n_iters): 73 | _, angles = stft_fn.transform(signal) 74 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 75 | return signal 76 | 77 | 78 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 79 | """ 80 | PARAMS 81 | ------ 82 | C: compression factor 83 | """ 84 | return torch.log(torch.clamp(x, min=clip_val) * C) 85 | 86 | 87 | def dynamic_range_decompression(x, C=1): 88 | """ 89 | PARAMS 90 | ------ 91 | C: compression factor used to compress 92 | """ 93 | return torch.exp(x) / C 94 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/hparams.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/HarryVolek/PyTorch_Speaker_Verification 2 | 3 | import os 4 | import yaml 5 | 6 | 7 | def load_hparam_str(hp_str): 8 | path = 'temp-restore.yaml' 9 | with open(path, 'w') as f: 10 | f.write(hp_str) 11 | ret = HParam(path) 12 | os.remove(path) 13 | 
return ret 14 | 15 | 16 | def load_hparam(filename): 17 | stream = open(filename, 'r') 18 | docs = yaml.load_all(stream, Loader=yaml.Loader) 19 | hparam_dict = dict() 20 | for doc in docs: 21 | for k, v in doc.items(): 22 | hparam_dict[k] = v 23 | return hparam_dict 24 | 25 | 26 | def merge_dict(user, default): 27 | if isinstance(user, dict) and isinstance(default, dict): 28 | for k, v in default.items(): 29 | if k not in user: 30 | user[k] = v 31 | else: 32 | user[k] = merge_dict(user[k], v) 33 | return user 34 | 35 | 36 | class Dotdict(dict): 37 | """ 38 | a dictionary that supports dot notation 39 | as well as dictionary access notation 40 | usage: d = DotDict() or d = DotDict({'val1':'first'}) 41 | set attributes: d.val2 = 'second' or d['val2'] = 'second' 42 | get attributes: d.val2 or d['val2'] 43 | """ 44 | __getattr__ = dict.__getitem__ 45 | __setattr__ = dict.__setitem__ 46 | __delattr__ = dict.__delitem__ 47 | 48 | def __init__(self, dct=None): 49 | dct = dict() if not dct else dct 50 | for key, value in dct.items(): 51 | if hasattr(value, 'keys'): 52 | value = Dotdict(value) 53 | self[key] = value 54 | 55 | 56 | class HParam(Dotdict): 57 | 58 | def __init__(self, file): 59 | super(Dotdict, self).__init__() 60 | hp_dict = load_hparam(file) 61 | hp_dotdict = Dotdict(hp_dict) 62 | for k, v in hp_dotdict.items(): 63 | setattr(self, k, v) 64 | 65 | __getattr__ = Dotdict.__getitem__ 66 | __setattr__ = Dotdict.__setitem__ 67 | __delattr__ = Dotdict.__delitem__ 68 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use("Agg") 3 | import matplotlib.pylab as plt 4 | import numpy as np 5 | 6 | 7 | def save_figure_to_numpy(fig): 8 | # save it to a numpy array. 9 | data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 10 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 11 | data = np.transpose(data, (2, 0, 1)) 12 | return data 13 | 14 | 15 | def plot_waveform_to_numpy(waveform): 16 | fig, ax = plt.subplots(figsize=(12, 3)) 17 | ax.plot() 18 | ax.plot(range(len(waveform)), waveform, 19 | linewidth=0.1, alpha=0.7, color='blue') 20 | 21 | plt.xlabel("Samples") 22 | plt.ylabel("Amplitude") 23 | plt.ylim(-1, 1) 24 | plt.tight_layout() 25 | 26 | fig.canvas.draw() 27 | data = save_figure_to_numpy(fig) 28 | plt.close() 29 | return data 30 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window 38 | from librosa.util import pad_center, tiny 39 | from .audio_processing import window_sumsquare, dynamic_range_compression, dynamic_range_decompression 40 | from librosa.filters import mel as librosa_mel_fn 41 | 42 | 43 | class STFT(torch.nn.Module): 44 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 45 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 46 | window='hann'): 47 | super(STFT, self).__init__() 48 | self.filter_length = filter_length 49 | self.hop_length = hop_length 50 | self.win_length = win_length 51 | self.window = window 52 | self.forward_transform = None 53 | scale = self.filter_length / self.hop_length 54 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 55 | 56 | cutoff = int((self.filter_length / 2 + 1)) 57 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 58 | np.imag(fourier_basis[:cutoff, :])]) 59 | 60 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 61 | inverse_basis = torch.FloatTensor( 62 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 63 | 64 | if window is not None: 65 | assert(filter_length >= win_length) 66 | # get window and zero center pad it to filter_length 67 | fft_window = get_window(window, win_length, fftbins=True) 68 | fft_window = pad_center(fft_window, filter_length) 69 | fft_window = torch.from_numpy(fft_window).float() 70 | 71 | # window the bases 72 | forward_basis *= fft_window 73 | inverse_basis *= fft_window 74 | 75 | self.register_buffer('forward_basis', forward_basis.float()) 76 | self.register_buffer('inverse_basis', inverse_basis.float()) 77 | 78 | def transform(self, input_data): 79 | num_batches = input_data.size(0) 80 | num_samples = input_data.size(1) 81 | 82 | self.num_samples = num_samples 83 | 84 | # similar to librosa, reflect-pad the input 85 | input_data = input_data.view(num_batches, 1, num_samples) 86 | input_data = F.pad( 87 | input_data.unsqueeze(1), 88 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 89 | mode='reflect') 90 | input_data = input_data.squeeze(1) 91 | 92 | # https://github.com/NVIDIA/tacotron2/issues/125 93 | forward_transform = F.conv1d( 94 | input_data, 95 | # input_data.cuda(), 96 | Variable(self.forward_basis, requires_grad=False), 97 | # Variable(self.forward_basis, requires_grad=False).cuda(), 98 | stride=self.hop_length, 99 | padding=0).cpu() 100 | 101 | cutoff = int((self.filter_length / 2) + 1) 102 | real_part = forward_transform[:, :cutoff, :] 103 | imag_part = forward_transform[:, cutoff:, :] 104 | 105 | magnitude = 
torch.sqrt(real_part**2 + imag_part**2) 106 | phase = torch.autograd.Variable( 107 | torch.atan2(imag_part.data, real_part.data)) 108 | 109 | return magnitude, phase 110 | 111 | def inverse(self, magnitude, phase): 112 | recombine_magnitude_phase = torch.cat( 113 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 114 | 115 | inverse_transform = F.conv_transpose1d( 116 | recombine_magnitude_phase, 117 | Variable(self.inverse_basis, requires_grad=False), 118 | stride=self.hop_length, 119 | padding=0) 120 | 121 | if self.window is not None: 122 | window_sum = window_sumsquare( 123 | self.window, magnitude.size(-1), hop_length=self.hop_length, 124 | win_length=self.win_length, n_fft=self.filter_length, 125 | dtype=np.float32) 126 | # remove modulation effects 127 | approx_nonzero_indices = torch.from_numpy( 128 | np.where(window_sum > tiny(window_sum))[0]) 129 | window_sum = torch.autograd.Variable( 130 | torch.from_numpy(window_sum), requires_grad=False) 131 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 132 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 133 | 134 | # scale by hop ratio 135 | inverse_transform *= float(self.filter_length) / self.hop_length 136 | 137 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 138 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 139 | 140 | return inverse_transform 141 | 142 | def forward(self, input_data): 143 | self.magnitude, self.phase = self.transform(input_data) 144 | reconstruction = self.inverse(self.magnitude, self.phase) 145 | return reconstruction 146 | 147 | 148 | class TacotronSTFT(torch.nn.Module): 149 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, 150 | n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, 151 | mel_fmax=None): 152 | super(TacotronSTFT, self).__init__() 153 | self.n_mel_channels = n_mel_channels 154 | self.sampling_rate = sampling_rate 155 | self.stft_fn = STFT(filter_length, hop_length, win_length) 156 | mel_basis = librosa_mel_fn( 157 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) 158 | mel_basis = torch.from_numpy(mel_basis).float() 159 | self.register_buffer('mel_basis', mel_basis) 160 | 161 | def spectral_normalize(self, magnitudes): 162 | output = dynamic_range_compression(magnitudes) 163 | return output 164 | 165 | def spectral_de_normalize(self, magnitudes): 166 | output = dynamic_range_decompression(magnitudes) 167 | return output 168 | 169 | def mel_spectrogram(self, y): 170 | """Computes mel-spectrograms from a batch of waves 171 | PARAMS 172 | ------ 173 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 174 | 175 | RETURNS 176 | ------- 177 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 178 | """ 179 | assert(torch.min(y.data) >= -1) 180 | assert(torch.max(y.data) <= 1) 181 | 182 | magnitudes, phases = self.stft_fn.transform(y) 183 | magnitudes = magnitudes.data 184 | mel_output = torch.matmul(self.mel_basis, magnitudes) 185 | mel_output = self.spectral_normalize(mel_output) 186 | return mel_output 187 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import tqdm 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import itertools 8 | import traceback 9 | 10 | from model.generator import 
Generator 11 | from model.multiscale import MultiScaleDiscriminator 12 | from .utils import get_commit_hash 13 | from .validation import validate 14 | 15 | 16 | def train(args, pt_dir, chkpt_path, trainloader, valloader, writer, logger, hp, hp_str): 17 | model_g = Generator(hp.audio.n_mel_channels).cuda() 18 | model_d = MultiScaleDiscriminator().cuda() 19 | 20 | optim_g = torch.optim.Adam(model_g.parameters(), 21 | lr=hp.train.adam.lr, betas=(hp.train.adam.beta1, hp.train.adam.beta2)) 22 | optim_d = torch.optim.Adam(model_d.parameters(), 23 | lr=hp.train.adam.lr, betas=(hp.train.adam.beta1, hp.train.adam.beta2)) 24 | 25 | githash = get_commit_hash() 26 | 27 | init_epoch = -1 28 | step = 0 29 | 30 | if chkpt_path is not None: 31 | logger.info("Resuming from checkpoint: %s" % chkpt_path) 32 | checkpoint = torch.load(chkpt_path) 33 | model_g.load_state_dict(checkpoint['model_g']) 34 | model_d.load_state_dict(checkpoint['model_d']) 35 | optim_g.load_state_dict(checkpoint['optim_g']) 36 | optim_d.load_state_dict(checkpoint['optim_d']) 37 | step = checkpoint['step'] 38 | init_epoch = checkpoint['epoch'] 39 | 40 | if hp_str != checkpoint['hp_str']: 41 | logger.warning("New hparams is different from checkpoint. Will use new.") 42 | 43 | if githash != checkpoint['githash']: 44 | logger.warning("Code might be different: git hash is different.") 45 | logger.warning("%s -> %s" % (checkpoint['githash'], githash)) 46 | 47 | else: 48 | logger.info("Starting new training run.") 49 | 50 | # this accelerates training when the size of minibatch is always consistent. 51 | # if not consistent, it'll horribly slow down. 52 | torch.backends.cudnn.benchmark = True 53 | 54 | try: 55 | model_g.train() 56 | model_d.train() 57 | for epoch in itertools.count(init_epoch+1): 58 | if epoch % hp.log.validation_interval == 0: 59 | with torch.no_grad(): 60 | validate(hp, args, model_g, model_d, valloader, writer, step) 61 | a = 0 62 | 63 | trainloader.dataset.shuffle_mapping() 64 | loader = tqdm.tqdm(trainloader, desc='Loading train data') 65 | for (melG, audioG), (melD, audioD) in loader: 66 | melG = melG.cuda() 67 | audioG = audioG.cuda() 68 | melD = melD.cuda() 69 | audioD = audioD.cuda() 70 | 71 | # generator 72 | optim_g.zero_grad() 73 | fake_audio = model_g(melG)[:, :, :hp.audio.segment_length] 74 | disc_fake = model_d(fake_audio) 75 | disc_real = model_d(audioG) 76 | loss_g = 0.0 77 | for (feats_fake, score_fake), (feats_real, _) in zip(disc_fake, disc_real): 78 | loss_g += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2])) 79 | for feat_f, feat_r in zip(feats_fake, feats_real): 80 | loss_g += hp.model.feat_match * torch.mean(torch.abs(feat_f - feat_r)) 81 | 82 | loss_g.backward() 83 | optim_g.step() 84 | 85 | # discriminator 86 | fake_audio = model_g(melD)[:, :, :hp.audio.segment_length] 87 | fake_audio = fake_audio.detach() 88 | loss_d_sum = 0.0 89 | for _ in range(hp.train.rep_discriminator): 90 | optim_d.zero_grad() 91 | disc_fake = model_d(fake_audio) 92 | disc_real = model_d(audioD) 93 | loss_d = 0.0 94 | for (_, score_fake), (_, score_real) in zip(disc_fake, disc_real): 95 | loss_d += torch.mean(torch.sum(torch.pow(score_real - 1.0, 2), dim=[1, 2])) 96 | loss_d += torch.mean(torch.sum(torch.pow(score_fake, 2), dim=[1, 2])) 97 | 98 | loss_d.backward() 99 | optim_d.step() 100 | loss_d_sum += loss_d 101 | 102 | step += 1 103 | # logging 104 | loss_g = loss_g.item() 105 | loss_d_avg = loss_d_sum / hp.train.rep_discriminator 106 | loss_d_avg = loss_d_avg.item() 107 | if any([loss_g > 1e8, 
math.isnan(loss_g), loss_d_avg > 1e8, math.isnan(loss_d_avg)]): 108 | logger.error("loss_g %.01f loss_d_avg %.01f at step %d!" % (loss_g, loss_d_avg, step)) 109 | raise Exception("Loss exploded") 110 | 111 | if step % hp.log.summary_interval == 0: 112 | writer.log_training(loss_g, loss_d_avg, step) 113 | loader.set_description("g %.04f d %.04f | step %d" % (loss_g, loss_d_avg, step)) 114 | 115 | if epoch % hp.log.save_interval == 0: 116 | save_path = os.path.join(pt_dir, '%s_%s_%04d.pt' 117 | % (args.name, githash, epoch)) 118 | torch.save({ 119 | 'model_g': model_g.state_dict(), 120 | 'model_d': model_d.state_dict(), 121 | 'optim_g': optim_g.state_dict(), 122 | 'optim_d': optim_d.state_dict(), 123 | 'step': step, 124 | 'epoch': epoch, 125 | 'hp_str': hp_str, 126 | 'githash': githash, 127 | }, save_path) 128 | logger.info("Saved checkpoint to: %s" % save_path) 129 | 130 | except Exception as e: 131 | logger.info("Exiting due to exception: %s" % e) 132 | traceback.print_exc() 133 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import subprocess 3 | import numpy as np 4 | from scipy.io.wavfile import read 5 | import librosa 6 | 7 | def get_commit_hash(): 8 | message = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]) 9 | return message.strip().decode('utf-8') 10 | 11 | def read_wav_np(path): 12 | sr, wav = read(path) 13 | 14 | if len(wav.shape) == 2: 15 | wav = wav[:, 0] 16 | 17 | if wav.dtype == np.int16: 18 | wav = wav / 32768.0 19 | elif wav.dtype == np.int32: 20 | wav = wav / 2147483648.0 21 | elif wav.dtype == np.uint8: 22 | wav = (wav - 128) / 128.0 23 | 24 | wav = wav.astype(np.float32) 25 | 26 | return sr, wav 27 | 28 | 29 | def read_wav_np_librosa(path,sampling_rate): 30 | wav, sr = librosa.load(path, sr=sampling_rate) 31 | return sr, wav 32 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/validation.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import torch 3 | 4 | 5 | def validate(hp, args, generator, discriminator, valloader, writer, step): 6 | generator.eval() 7 | discriminator.eval() 8 | torch.backends.cudnn.benchmark = False 9 | 10 | loader = tqdm.tqdm(valloader, desc='Validation loop') 11 | loss_g_sum = 0.0 12 | loss_d_sum = 0.0 13 | for mel, audio in loader: 14 | mel = mel.cuda() 15 | audio = audio.cuda() 16 | 17 | # generator 18 | fake_audio = generator(mel) 19 | if(audio.size(2) < fake_audio.size(2)): 20 | disc_fake = discriminator(fake_audio[:, :, :audio.size(2)]) 21 | disc_real = discriminator(audio) 22 | else: 23 | disc_fake = discriminator(fake_audio) 24 | disc_real = discriminator(audio[:, :, :fake_audio.size(2)]) 25 | 26 | loss_g = 0.0 27 | loss_d = 0.0 28 | for (feats_fake, score_fake), (feats_real, score_real) in zip(disc_fake, disc_real): 29 | loss_g += torch.mean(torch.sum(torch.pow(score_fake - 1.0, 2), dim=[1, 2])) 30 | for feat_f, feat_r in zip(feats_fake, feats_real): 31 | loss_g += hp.model.feat_match * torch.mean(torch.abs(feat_f - feat_r)) 32 | loss_d += torch.mean(torch.sum(torch.pow(score_real - 1.0, 2), dim=[1, 2])) 33 | loss_d += torch.mean(torch.sum(torch.pow(score_fake, 2), dim=[1, 2])) 34 | 35 | loss_g_sum += loss_g.item() 36 | loss_d_sum += loss_d.item() 37 | 38 | loss_g_avg = loss_g_sum / len(valloader.dataset) 39 | loss_d_avg = loss_d_sum / 
len(valloader.dataset) 40 | 41 | audio = audio[0][0].cpu().detach().numpy() 42 | fake_audio = fake_audio[0][0].cpu().detach().numpy() 43 | 44 | writer.log_validation(loss_g_avg, loss_d_avg, generator, discriminator, audio, fake_audio, step) 45 | 46 | torch.backends.cudnn.benchmark = True 47 | -------------------------------------------------------------------------------- /melgan_vocoder/utils/writer.py: -------------------------------------------------------------------------------- 1 | from tensorboardX import SummaryWriter 2 | 3 | from .plotting import plot_waveform_to_numpy 4 | 5 | 6 | class MyWriter(SummaryWriter): 7 | def __init__(self, hp, logdir): 8 | super(MyWriter, self).__init__(logdir) 9 | self.sample_rate = hp.audio.sampling_rate 10 | self.is_first = True 11 | 12 | def log_training(self, g_loss, d_loss, step): 13 | self.add_scalar('train.g_loss', g_loss, step) 14 | self.add_scalar('train.d_loss', d_loss, step) 15 | 16 | def log_validation(self, g_loss, d_loss, generator, discriminator, target, prediction, step): 17 | self.add_scalar('validation.g_loss', g_loss, step) 18 | self.add_scalar('validation.d_loss', d_loss, step) 19 | 20 | self.add_audio('raw_audio_predicted', prediction, step, self.sample_rate) 21 | self.add_image('waveform_predicted', plot_waveform_to_numpy(prediction), step) 22 | 23 | self.log_histogram(generator, step) 24 | self.log_histogram(discriminator, step) 25 | 26 | if self.is_first: 27 | self.add_audio('raw_audio_target', target, step, self.sample_rate) 28 | self.add_image('waveform_target', plot_waveform_to_numpy(target), step) 29 | self.is_first = False 30 | 31 | def log_histogram(self, model, step): 32 | for tag, value in model.named_parameters(): 33 | self.add_histogram(tag.replace('.', '/'), value.cpu().detach().numpy(), step) 34 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | 5 | from datasets import preprocessor 6 | from hparams import hparams 7 | from tqdm import tqdm 8 | 9 | 10 | def preprocess(args, input_folders, out_dir, hparams): 11 | mel_dir = os.path.join(out_dir, 'mels') 12 | wav_dir = os.path.join(out_dir, 'audio') 13 | linear_dir = os.path.join(out_dir, 'linear') 14 | os.makedirs(mel_dir, exist_ok=True) 15 | os.makedirs(wav_dir, exist_ok=True) 16 | os.makedirs(linear_dir, exist_ok=True) 17 | metadata = preprocessor.build_from_path_mydata(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs,tqdm=tqdm) 18 | # metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm) 19 | write_metadata(metadata, out_dir) 20 | 21 | def write_metadata(metadata, out_dir): 22 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 23 | for m in metadata: 24 | f.write('|'.join([str(x) for x in m]) + '\n') 25 | mel_frames = sum([int(m[4]) for m in metadata]) 26 | timesteps = sum([int(m[3]) for m in metadata]) 27 | sr = hparams.sample_rate 28 | hours = timesteps / sr / 3600 29 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format( 30 | len(metadata), mel_frames, timesteps, hours)) 31 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata))) 32 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata))) 33 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 34 
| 35 | def norm_data(args): 36 | 37 | merge_books = (args.merge_books=='True') 38 | 39 | print('Selecting data folders..') 40 | supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS'] 41 | if args.dataset not in supported_datasets: 42 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format( 43 | args.dataset, supported_datasets)) 44 | 45 | if args.dataset.startswith('LJSpeech'): 46 | return [os.path.join(args.base_dir, args.dataset)] 47 | 48 | 49 | if args.dataset == 'M-AILABS': 50 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 51 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA'] 52 | if args.language not in supported_languages: 53 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format( 54 | supported_languages)) 55 | 56 | supported_voices = ['female', 'male', 'mix'] 57 | if args.voice not in supported_voices: 58 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format( 59 | supported_voices)) 60 | 61 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice) 62 | supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 63 | if args.reader not in supported_readers: 64 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format( 65 | supported_readers)) 66 | 67 | path = os.path.join(path, args.reader) 68 | supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 69 | if merge_books: 70 | return [os.path.join(path, book) for book in supported_books] 71 | 72 | else: 73 | if args.book not in supported_books: 74 | raise ValueError('Please enter a valid book for your reader settings! 
\n{}'.format( 75 | supported_books)) 76 | 77 | return [os.path.join(path, args.book)] 78 | 79 | 80 | def run_preprocess(args, hparams): 81 | # input_folders = norm_data(args) 82 | # output_folder = os.path.join(args.base_dir, args.output) 83 | 84 | input_folders = ['/xxx/BZNSYP/'] 85 | output_folder = '/xxx/tacotron2_wavernn/training_data' 86 | 87 | preprocess(args, input_folders, output_folder, hparams) 88 | 89 | 90 | def main(): 91 | print('initializing preprocessing..') 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('--base_dir', default='./') 94 | parser.add_argument('--hparams', default='', 95 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 96 | parser.add_argument('--dataset', default='LJSpeech-1.1') 97 | parser.add_argument('--language', default='en_US') 98 | parser.add_argument('--voice', default='female') 99 | parser.add_argument('--reader', default='mary_ann') 100 | parser.add_argument('--merge_books', default='False') 101 | parser.add_argument('--book', default='northandsouth') 102 | parser.add_argument('--output', default='training_data') 103 | parser.add_argument('--n_jobs', type=int, default=4) 104 | #parser.add_argument('--n_jobs', type=int, default=cpu_count()) 105 | args = parser.parse_args() 106 | 107 | modified_hp = hparams.parse(args.hparams) 108 | 109 | assert args.merge_books in ('False', 'True') 110 | 111 | run_preprocess(args, modified_hp) 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | falcon==1.2.0 2 | inflect==0.2.5 3 | audioread==2.1.5 4 | librosa==0.5.1 5 | matplotlib==2.0.2 6 | numpy==1.14.0 7 | scipy==1.0.0 8 | tqdm==4.11.2 9 | Unidecode==0.4.20 10 | pyaudio==0.2.11 11 | sounddevice==0.3.10 12 | lws 13 | keras -------------------------------------------------------------------------------- /sentences_phone.txt: -------------------------------------------------------------------------------- 1 | k a2 er2 p u3 #2 p ei2 uai4 s uen1 #1 uan2 h ua2 t i1 #4 。 2 | b ao2 m a3 #1 p ei4 g ua4 #1 b o3 l uo2 an1 #3 , d iao1 ch an2 #1 van4 zh en3 #2 d ong3 ueng1 t a4 #4 。 3 | uei4 sh en3 m e5 #1 k an4 j ian4 #1 n i3 #1 d i4 x iong1 #2 ian3 zh ong1 #1 iou3 c ii4 #4 , q ve4 b u4 x iang3 #1 z ii4 j i3 #1 ian3 zh ong1 #2 iou3 #1 l iang2 m u4 n e5 #4 ? 
4 | r en2 sh eng1 #2 x iang4 #1 i4 j ie2 #1 m u4 t ou5 #4 , h uo4 zh e3 #1 x van3 z e2 #2 x iong2 x iong2 #1 r an2 sh ao1 #4 , h uo4 zh e3 #1 x van3 z e2 #2 m an4 m an4 #1 f u3 x iou3 #4 。 5 | sh iii4 uo3 #2 n ve4 d ai4 #1 zh ang4 f u5 l o5 #4 , d an4 t a1 #2 h ai2 sh iii4 #1 ai4 uo3 d ie5 #4 。 6 | ing1 g ai1 sh uo1 #4 , d uo1 van2 h ua4 #1 j ing1 ing2 #1 zh an4 l ve4 #3 sh iii4 #1 d a4 x ing2 #1 q i3 ie4 #1 j i2 t uan2 #2 f a1 zh an3 d e5 #2 zh ong4 iao4 #1 zh an4 l ve4 #1 x van3 z e2 #4 。 7 | er2 q ie3 #2 j v1 r an2 #1 z ai4 #1 uei2 x i1 d e5 #1 q i3 h ao4 x ia4 #3 l uen2 uei2 #1 q in1 l ve4 zh e3 d e5 #1 b ang1 x iong1 #4 。 8 | b ei4 #1 h ou4 r en2 #1 f eng4 uei2 #2 b ing1 sh eng4 d e5 #2 s uen1 z ii3 b ing1 f a3 #2 j i2 q i2 #1 j vn1 sh iii4 #1 m ou2 l ve4 #4 。 9 | b en3 c ong2 sh u1 #2 s uo3 x van3 q v3 d e5 #1 sh iii2 q i1 g e4 #1 r en2 u4 #4 , zh uo2 ian3 v2 #2 l i4 ch ao2 #1 l i4 d ai4 #2 j iao4 uei2 #1 t u1 ch u1 d e5 #2 j vn1 sh iii4 #1 m ou2 l ve4 #1 r en2 u4 #4 。 10 | zh ong1 g uo2 #1 b ing1 sh u1 #3 j i2 zh ong1 zh e3 #2 zh ong1 g uo2 #1 b ing1 j ia1 #1 x ian1 r en2 d e5 #3 m ou2 l ve4 #1 s ii1 x iang3 #1 j ing1 c uei4 #4 。 11 | m u4 d i4 #1 sh iii4 #3 h u1 v4 #1 sh e4 h uei4 #1 g e4 j ie4 #2 b ang1 zh u4 #1 u2 j ia1 k e3 g uei1 zh e3 #3 、 q iong2 r en2 #2 h e2 #1 sh eng1 h uo2 #1 iou3 k uen4 n an2 d e5 #1 r en2 #4 。 12 | g uan1 v2 #1 x i1 z ang4 d e5 #1 ch uan2 sh uo1 #2 iou3 #1 h en3 #1 d uo1 #3 , l i4 l ai2 #1 d ou1 sh iii4 #2 ch ao2 sh eng4 zh e3 d e5 #2 t ian1 t ang2 #4 。 13 | er2 #1 z uo4 uei2 #2 zh ong1 g uo2 #1 x i1 n an2 #2 b ian1 ch uei2 #1 zh ong4 d i4 #3 , ie3 #1 d ou1 sh iii4 #2 zh ong1 g uo2 #1 l ing3 t u3 #2 b u4 k e3 f en1 g e1 d e5 #2 i2 b u4 f en1 #4 。 -------------------------------------------------------------------------------- /symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | # from . import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | 12 | label = 'phone' # pingyin phone 13 | 14 | if(label == 'pinyin'): 15 | #_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? ' 16 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890@!\'(),-.:;? 
' 17 | 18 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 19 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 20 | # Export all symbols: 21 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 22 | 23 | 24 | if(label == 'phone'): 25 | _characters = [ 26 | 'breath','cough','noise','smack','um','sil','sp1', 27 | 'a5','a1','a4','ai5','ai1','ai2','ai3','ai4','an1', 28 | 'an2','an3','an4','ang1','ang2','ang4','ao1','ao2', 29 | 'ao3','ao4','b','a2','a3','ang5','ang3','ao5','ei5', 30 | 'ei1','ei2','ei3','ei4','en5','en1','en2','en3','en4', 31 | 'eng1','eng2','eng4','i1','i2','i3','i4','ian5','ian1', 32 | 'ian2','ian3','ian4','iao1','iao2','iao3','iao4','ie1', 33 | 'ie2','ie3','ie4','in1','in4','ing1','ing2','ing3','ing4', 34 | 'o5','o1','o2','o3','o4','u5','u2','u3','u4','c','e4', 35 | 'ch','an5','e1','e2','e3','eng5','eng3','iii5','iii1', 36 | 'iii2','iii3','iii4','ong1','ong2','ong3','ong4','ou5', 37 | 'ou1','ou2','ou3','ou4','u1','uai1','uai3','uai4','uan5', 38 | 'uan1','uan2','uan3','uan4','uang5','uang1','uang2','uang3', 39 | 'uang4','uei5','uei1','uei2','uen1','uen2','uen3','uo1', 40 | 'uo4','ii1','ii2','ii3','ii4','ong5','uei3','uei4','uen4', 41 | 'uo2','d','e5','i5','ia2','ia3','iao5','ie5','iou1','uo5', 42 | 'uo3','eer2','er5','er2','er3','er4','f','g','ua5','ua1', 43 | 'ua2','ua3','ua4','uai2','h','uen5','j','ia5','ia1','ia4', 44 | 'iang5','iang1','iang2','iang3','iang4','in5','in2','in3', 45 | 'ing5','iong2','iong3','iou5','iou2','iou3','iou4','v5', 46 | 'v1','v2','v3','v4','van1','van2','van3','van4','ve1', 47 | 've2','ve4','vn5','vn1','vn4','k','uai5','l','m','n', 48 | 'ng1','p','q','van5','vn2','r','s','sh','ii5','t','ueng1', 49 | 'ueng2','ueng3','ueng4','x','iong5','iong1','ve3','io5', 50 | 'io1','iong4','ve5','vn3','z','zh',',','!','。','?', 51 | '、',':','#1','#2','#3','#4','#',' ' 52 | ] 53 | symbols = [_pad, _eos] + _characters 54 | -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from warnings import warn 4 | from time import sleep 5 | 6 | import tensorflow as tf 7 | 8 | from hparams import hparams 9 | from infolog import log 10 | from tacotron.synthesize import tacotron_synthesize 11 | from wavenet_vocoder.synthesize import wavenet_synthesize 12 | # os.environ["CUDA_VISIBLE_DEVICES"] = '0' 13 | 14 | def prepare_run(args): 15 | modified_hp = hparams.parse(args.hparams) 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 17 | 18 | run_name = args.name or args.tacotron_name or args.model 19 | tmp_path = '/xxx/tacotron_2_biaobei/' 20 | taco_checkpoint = os.path.join(tmp_path,'logs-' + run_name, 'taco_' + args.checkpoint) 21 | 22 | run_name = args.name or args.wavenet_name or args.model 23 | wave_checkpoint = os.path.join(tmp_path,'logs-' + run_name, 'wave_' + args.checkpoint) 24 | return taco_checkpoint, wave_checkpoint, modified_hp 25 | 26 | def get_sentences(args): 27 | if args.text_list != '': 28 | with open(args.text_list, 'rb') as f: 29 | sentences = list(map(lambda l: l.decode("utf-8")[:-1], f.readlines())) 30 | else: 31 | sentences = hparams.sentences 32 | return sentences 33 | 34 | def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences): 35 | log('Running End-to-End TTS Evaluation. 
Model: {}'.format(args.name or args.model)) 36 | log('Synthesizing mel-spectrograms from text..') 37 | wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 38 | #Delete Tacotron model from graph 39 | tf.reset_default_graph() 40 | #Sleep 1/2 second to let previous graph close and avoid error messages while Wavenet is synthesizing 41 | sleep(0.5) 42 | log('Synthesizing audio from mel-spectrograms.. (This may take a while)') 43 | wavenet_synthesize(args, hparams, wave_checkpoint) 44 | log('Tacotron-2 TTS synthesis complete!') 45 | 46 | def main(): 47 | accepted_modes = ['eval', 'synthesis', 'live'] 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint') 50 | parser.add_argument('--hparams', default='', 51 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 52 | parser.add_argument('--name', help='Name of logging directory if the two models were trained together.') 53 | parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately') 54 | parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. If trained separately') 55 | parser.add_argument('--model', default='Tacotron-2') 56 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets') 57 | parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet') 58 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 59 | parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes)) 60 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode') 61 | parser.add_argument('--text_list', default='sentences.txt', help='Text file contains list of texts to be synthesized. Valid if mode=eval') 62 | parser.add_argument('--speaker_id', default=None, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids') 63 | args = parser.parse_args() 64 | 65 | accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2'] 66 | 67 | if args.model not in accepted_models: 68 | raise ValueError('please enter a valid model to synthesize with: {}'.format(accepted_models)) 69 | 70 | if args.mode not in accepted_modes: 71 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode)) 72 | 73 | if args.mode == 'live' and args.model == 'Wavenet': 74 | raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!') 75 | 76 | if args.GTA not in ('True', 'False'): 77 | raise ValueError('GTA option must be either True or False') 78 | 79 | if args.model == 'Tacotron-2': 80 | if args.mode == 'live': 81 | warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!') 82 | if args.mode == 'synthesis': 83 | raise ValueError('I don\'t recommend running WaveNet on entire dataset.. 
The world might end before the synthesis :) (only eval allowed)') 84 | 85 | taco_checkpoint, wave_checkpoint, hparams = prepare_run(args) 86 | sentences = get_sentences(args) 87 | 88 | if args.model == 'Tacotron': 89 | _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 90 | elif args.model == 'WaveNet': 91 | wavenet_synthesize(args, hparams, wave_checkpoint) 92 | elif args.model == 'Tacotron-2': 93 | synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) 94 | else: 95 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models)) 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /tacotron/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/feeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | import traceback 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from infolog import log 9 | from sklearn.model_selection import train_test_split 10 | from tacotron.utils.text import text_to_sequence 11 | 12 | _batches_per_group = 64 13 | 14 | class Feeder: 15 | """ 16 | Feeds batches of data into queue on a background thread. 17 | """ 18 | 19 | def __init__(self, coordinator, metadata_filename, hparams): 20 | super(Feeder, self).__init__() 21 | self._coord = coordinator 22 | self._hparams = hparams 23 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 24 | self._train_offset = 0 25 | self._test_offset = 0 26 | 27 | # Load metadata 28 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels') 29 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear') 30 | with open(metadata_filename, encoding='utf-8') as f: 31 | self._metadata = [line.strip().split('|') for line in f] 32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 33 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 34 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours)) 35 | 36 | #Train test split 37 | if hparams.tacotron_test_size is None: 38 | assert hparams.tacotron_test_batches is not None 39 | 40 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 41 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size) 42 | indices = np.arange(len(self._metadata)) 43 | train_indices, test_indices = train_test_split(indices, 44 | test_size=test_size, random_state=hparams.tacotron_data_random_state) 45 | 46 | #Make sure test_indices is a multiple of batch_size else round down 47 | len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size) 48 | extra_test = test_indices[len_test_indices:] 49 | test_indices = test_indices[:len_test_indices] 50 | train_indices = np.concatenate([train_indices, extra_test]) 51 | 52 | self._train_meta = list(np.array(self._metadata)[train_indices]) 53 | self._test_meta = list(np.array(self._metadata)[test_indices]) 54 | 55 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size 56 | 57 | if hparams.tacotron_test_size is None: 58 | assert hparams.tacotron_test_batches == self.test_steps 59 | 60 | #pad input sequences with the 0 ( _ ) 61 | self._pad = 0 62 | #explicitely setting the padding to a value that doesn't originally exist 
in the spectogram 63 | #to avoid any possible conflicts, without affecting the output range of the model too much 64 | if hparams.symmetric_mels: 65 | self._target_pad = -hparams.max_abs_value 66 | else: 67 | self._target_pad = 0. 68 | #Mark finished sequences with 1s 69 | self._token_pad = 1. 70 | 71 | with tf.device('/cpu:0'): 72 | # Create placeholders for inputs and targets. Don't specify batch size because we want 73 | # to be able to feed different batch sizes at eval time. 74 | self._placeholders = [ 75 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'), 76 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 77 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'), 78 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'), 79 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'), 80 | tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'), 81 | tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos'), 82 | ] 83 | 84 | # Create queue for buffering data 85 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='input_queue') 86 | self._enqueue_op = queue.enqueue(self._placeholders) 87 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths, self.split_infos = queue.dequeue() 88 | 89 | self.inputs.set_shape(self._placeholders[0].shape) 90 | self.input_lengths.set_shape(self._placeholders[1].shape) 91 | self.mel_targets.set_shape(self._placeholders[2].shape) 92 | self.token_targets.set_shape(self._placeholders[3].shape) 93 | self.linear_targets.set_shape(self._placeholders[4].shape) 94 | self.targets_lengths.set_shape(self._placeholders[5].shape) 95 | self.split_infos.set_shape(self._placeholders[6].shape) 96 | 97 | # Create eval queue for buffering eval data 98 | eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='eval_queue') 99 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 100 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \ 101 | self.eval_linear_targets, self.eval_targets_lengths, self.eval_split_infos = eval_queue.dequeue() 102 | 103 | self.eval_inputs.set_shape(self._placeholders[0].shape) 104 | self.eval_input_lengths.set_shape(self._placeholders[1].shape) 105 | self.eval_mel_targets.set_shape(self._placeholders[2].shape) 106 | self.eval_token_targets.set_shape(self._placeholders[3].shape) 107 | self.eval_linear_targets.set_shape(self._placeholders[4].shape) 108 | self.eval_targets_lengths.set_shape(self._placeholders[5].shape) 109 | self.eval_split_infos.set_shape(self._placeholders[6].shape) 110 | 111 | def start_threads(self, session): 112 | self._session = session 113 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 114 | thread.daemon = True #Thread will close when parent quits 115 | thread.start() 116 | 117 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 118 | thread.daemon = True #Thread will close when parent quits 119 | thread.start() 120 | 121 | def _get_test_groups(self): 122 | meta = self._test_meta[self._test_offset] 123 | self._test_offset += 1 124 | 125 | text = meta[5] 126 | 127 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 128 | mel_target = 
np.load(os.path.join(self._mel_dir, meta[1])) 129 | #Create parallel sequences containing zeros to represent a non finished sequence 130 | token_target = np.asarray([0.] * (len(mel_target) - 1)) 131 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 132 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 133 | 134 | def make_test_batches(self): 135 | start = time.time() 136 | 137 | # Read a group of examples 138 | n = self._hparams.tacotron_batch_size 139 | r = self._hparams.outputs_per_step 140 | 141 | #Test on entire test set 142 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 143 | 144 | # Bucket examples based on similar output sequence length for efficiency 145 | examples.sort(key=lambda x: x[-1]) 146 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 147 | np.random.shuffle(batches) 148 | 149 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 150 | return batches, r 151 | 152 | def _enqueue_next_train_group(self): 153 | while not self._coord.should_stop(): 154 | start = time.time() 155 | 156 | # Read a group of examples 157 | n = self._hparams.tacotron_batch_size 158 | r = self._hparams.outputs_per_step 159 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 160 | 161 | # Bucket examples based on similar output sequence length for efficiency 162 | examples.sort(key=lambda x: x[-1]) 163 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 164 | np.random.shuffle(batches) 165 | 166 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 167 | for batch in batches: 168 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 169 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 170 | 171 | def _enqueue_next_test_group(self): 172 | #Create test batches once and evaluate on them for all test steps 173 | test_batches, r = self.make_test_batches() 174 | while not self._coord.should_stop(): 175 | for batch in test_batches: 176 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 177 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 178 | 179 | def _get_next_example(self): 180 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 181 | """ 182 | if self._train_offset >= len(self._train_meta): 183 | self._train_offset = 0 184 | np.random.shuffle(self._train_meta) 185 | 186 | meta = self._train_meta[self._train_offset] 187 | self._train_offset += 1 188 | 189 | text = meta[5] 190 | 191 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 192 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 193 | #Create parallel sequences containing zeros to represent a non finished sequence 194 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 195 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 196 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 197 | 198 | def _prepare_batch(self, batches, outputs_per_step): 199 | assert 0 == len(batches) % self._hparams.tacotron_num_gpus 200 | size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus) 201 | np.random.shuffle(batches) 202 | 203 | inputs = None 204 | mel_targets = None 205 | token_targets = None 206 | linear_targets = None 207 | targets_lengths = None 208 | split_infos = [] 209 | 210 | targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss 211 | input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32) 212 | 213 | #Produce inputs/targets of variables lengths for different GPUs 214 | for i in range(self._hparams.tacotron_num_gpus): 215 | batch = batches[size_per_device * i: size_per_device * (i + 1)] 216 | input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch]) 217 | inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device 218 | mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step) 219 | mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device 220 | 221 | #Pad sequences with 1 to infer that the sequence is done 222 | token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) 223 | token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is not None else token_target_cur_device 224 | linear_targets_cur_device, linear_target_max_len = self._prepare_targets([x[3] for x in batch], outputs_per_step) 225 | linear_targets = np.concatenate((linear_targets, linear_targets_cur_device), axis=1) if linear_targets is not None else linear_targets_cur_device 226 | split_infos.append([input_max_len, mel_target_max_len, token_target_max_len, linear_target_max_len]) 227 | 228 | split_infos = np.asarray(split_infos, dtype=np.int32) 229 | return (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths, split_infos) 230 | 231 | def _prepare_inputs(self, inputs): 232 | max_len = max([len(x) for x in inputs]) 233 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 234 | 235 | def _prepare_targets(self, targets, alignment): 236 | max_len = max([len(t) for t in targets]) 237 | data_len = self._round_up(max_len, alignment) 238 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 239 | 240 | def _prepare_token_targets(self, targets, alignment): 241 | max_len = max([len(t) for t in targets]) + 1 242 | data_len = self._round_up(max_len, alignment) 243 | return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len 244 | 245 | def _pad_input(self, x, length): 246 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 247 | 248 | def _pad_target(self, t, length): 249 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 250 | 251 | def _pad_token_target(self, t, length): 252 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=self._token_pad) 253 | 254 | def _round_up(self, x, multiple): 255 | remainder = x % multiple 256 | return x if remainder == 0 else x + multiple - remainder 257 | 
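# The rounding helpers above/below keep batch shapes consistent: _round_up pads
# every mel/linear/token target length to a multiple of outputs_per_step, while
# _round_down trims the test split to a whole number of batches. A minimal
# standalone sketch of the same arithmetic (hypothetical values, not taken from
# this repository):
#
#   def round_up(x, multiple):
#       remainder = x % multiple
#       return x if remainder == 0 else x + multiple - remainder
#
#   def round_down(x, multiple):
#       return x - x % multiple
#
#   round_up(37, 2)      # -> 38: pad 37 mel frames so the decoder emits r=2 frames per step
#   round_down(103, 32)  # -> 96: keep an exact number of size-32 test batches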
258 | def _round_down(self, x, multiple): 259 | remainder = x % multiple 260 | return x if remainder == 0 else x - remainder 261 | -------------------------------------------------------------------------------- /tacotron/models/Architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers useful for tacotron 2 architecture 2 | All notations and variable names were used in concordance with the original tensorflow implementation 3 | """ 4 | import collections 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from tacotron.models.attention import _compute_attention 9 | from tensorflow.contrib.rnn import RNNCell 10 | from tensorflow.python.framework import ops, tensor_shape 11 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 12 | from tensorflow.python.util import nest 13 | 14 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 15 | 16 | 17 | 18 | class TacotronEncoderCell(RNNCell): 19 | """Tacotron 2 Encoder Cell 20 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 21 | layer to predict the hidden representation vector (or memory) 22 | """ 23 | 24 | def __init__(self, convolutional_layers, lstm_layer): 25 | """Initialize encoder parameters 26 | 27 | Args: 28 | convolutional_layers: Encoder convolutional block class 29 | lstm_layer: encoder bidirectional lstm layer class 30 | """ 31 | super(TacotronEncoderCell, self).__init__() 32 | #Initialize encoder layers 33 | self._convolutions = convolutional_layers 34 | self._cell = lstm_layer 35 | 36 | def __call__(self, inputs, input_lengths=None): 37 | #Pass input sequence through a stack of convolutional layers 38 | conv_output = self._convolutions(inputs) 39 | 40 | #Extract hidden representation from encoder lstm cells 41 | hidden_representation = self._cell(conv_output, input_lengths) 42 | 43 | #For shape visualization 44 | self.conv_output_shape = conv_output.shape 45 | return hidden_representation 46 | 47 | 48 | class TacotronDecoderCellState( 49 | collections.namedtuple("TacotronDecoderCellState", 50 | ("cell_state", "attention", "time", "alignments", 51 | "alignment_history", "max_attentions"))): 52 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 53 | Contains: 54 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 55 | step. 56 | - `attention`: The attention emitted at the previous time step. 57 | - `time`: int32 scalar containing the current time step. 58 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 59 | emitted at the previous time step for each attention mechanism. 60 | - `alignment_history`: a single or tuple of `TensorArray`(s) 61 | containing alignment matrices from all time steps for each attention 62 | mechanism. Call `stack()` on each to convert to a `Tensor`. 63 | """ 64 | def replace(self, **kwargs): 65 | """Clones the current state while overwriting components provided by kwargs.
66 | """ 67 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 68 | 69 | class TacotronDecoderCell(RNNCell): 70 | """Tactron 2 Decoder Cell 71 | Decodes encoder output and previous mel frames into next r frames 72 | 73 | Decoder Step i: 74 | 1) Prenet to compress last output information 75 | 2) Concat compressed inputs with previous context vector (input feeding) * 76 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 77 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 78 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 79 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 80 | 81 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper, 82 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 83 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 84 | tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only. 85 | """ 86 | 87 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 88 | """Initialize decoder parameters 89 | 90 | Args: 91 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 92 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 93 | learn encoder-decoder alignments 94 | rnn_cell: Instance of RNNCell, main body of the decoder 95 | frame_projection: tensorflow fully connected layer with r * num_mels output units 96 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 97 | and through a sigmoid activation 98 | mask_finished: Boolean, Whether to mask decoder frames after the 99 | """ 100 | super(TacotronDecoderCell, self).__init__() 101 | #Initialize decoder layers 102 | self._prenet = prenet 103 | self._attention_mechanism = attention_mechanism 104 | self._cell = rnn_cell 105 | self._frame_projection = frame_projection 106 | self._stop_projection = stop_projection 107 | 108 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 109 | 110 | def _batch_size_checks(self, batch_size, error_message): 111 | return [check_ops.assert_equal(batch_size, 112 | self._attention_mechanism.batch_size, 113 | message=error_message)] 114 | 115 | @property 116 | def output_size(self): 117 | return self._frame_projection.shape 118 | 119 | @property 120 | def state_size(self): 121 | """The `state_size` property of `TacotronDecoderCell`. 122 | 123 | Returns: 124 | An `TacotronDecoderCell` tuple containing shapes used by this object. 125 | """ 126 | return TacotronDecoderCellState( 127 | cell_state=self._cell._cell.state_size, 128 | time=tensor_shape.TensorShape([]), 129 | attention=self._attention_layer_size, 130 | alignments=self._attention_mechanism.alignments_size, 131 | alignment_history=(), 132 | max_attentions=()) 133 | 134 | def zero_state(self, batch_size, dtype): 135 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 136 | 137 | Args: 138 | batch_size: `0D` integer tensor: the batch size. 139 | dtype: The internal state data type. 140 | Returns: 141 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 142 | possibly, empty `TensorArray` objects. 
143 | Raises: 144 | ValueError: (or, possibly at runtime, InvalidArgument), if 145 | `batch_size` does not match the output size of the encoder passed 146 | to the wrapper object at initialization time. 147 | """ 148 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 149 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 150 | error_message = ( 151 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 152 | "Non-matching batch sizes between the memory " 153 | "(encoder output) and the requested batch size.") 154 | with ops.control_dependencies( 155 | self._batch_size_checks(batch_size, error_message)): 156 | cell_state = nest.map_structure( 157 | lambda s: array_ops.identity(s, name="checked_cell_state"), 158 | cell_state) 159 | return TacotronDecoderCellState( 160 | cell_state=cell_state, 161 | time=array_ops.zeros([], dtype=tf.int32), 162 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 163 | dtype), 164 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 165 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 166 | dynamic_size=True), 167 | max_attentions=tf.zeros((batch_size, ), dtype=tf.int32)) 168 | 169 | def __call__(self, inputs, state): 170 | #Information bottleneck (essential for learning attention) 171 | prenet_output = self._prenet(inputs) 172 | 173 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 174 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 175 | 176 | #Unidirectional LSTM layers 177 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 178 | 179 | 180 | #Compute the attention (context) vector and alignments using 181 | #the new decoder cell hidden state as query vector 182 | #and cumulative alignments to extract location features 183 | #The choice of the new cell hidden state (s_{i}) of the last 184 | #decoder RNN Cell is based on Luong et Al. 
(2015): 185 | #https://arxiv.org/pdf/1508.04025.pdf 186 | previous_alignments = state.alignments 187 | previous_alignment_history = state.alignment_history 188 | context_vector, alignments, cumulated_alignments, max_attentions = _compute_attention(self._attention_mechanism, 189 | LSTM_output, 190 | previous_alignments, 191 | attention_layer=None, 192 | prev_max_attentions=state.max_attentions) 193 | 194 | #Concat LSTM outputs and context vector to form projections inputs 195 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 196 | 197 | #Compute predicted frames and predicted 198 | cell_outputs = self._frame_projection(projections_input) 199 | stop_tokens = self._stop_projection(projections_input) 200 | 201 | #Save alignment history 202 | alignment_history = previous_alignment_history.write(state.time, alignments) 203 | 204 | #Prepare next decoder state 205 | next_state = TacotronDecoderCellState( 206 | time=state.time + 1, 207 | cell_state=next_cell_state, 208 | attention=context_vector, 209 | alignments=cumulated_alignments, 210 | alignment_history=alignment_history, 211 | max_attentions=max_attentions) 212 | 213 | return (cell_outputs, stop_tokens), next_state 214 | -------------------------------------------------------------------------------- /tacotron/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'Tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /tacotron/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def _compute_attention(attention_mechanism, cell_output, attention_state, 11 | attention_layer, prev_max_attentions): 12 | """Computes the attention and alignments for a given attention_mechanism.""" 13 | alignments, next_attention_state, max_attentions = attention_mechanism( 14 | cell_output, state=attention_state, prev_max_attentions=prev_max_attentions) 15 | 16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 17 | expanded_alignments = array_ops.expand_dims(alignments, 1) 18 | # Context is the inner product of alignments and values along the 19 | # memory time dimension. 20 | # alignments shape is 21 | # [batch_size, 1, memory_time] 22 | # attention_mechanism.values shape is 23 | # [batch_size, memory_time, memory_size] 24 | # the batched matmul is over memory_time, so the output shape is 25 | # [batch_size, 1, memory_size]. 26 | # we then squeeze out the singleton dim. 
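# A small NumPy sketch of the same contraction with made-up shapes, purely to
# make the shape comments above concrete (illustration only, not part of the model):
#
#   import numpy as np
#   alignments = np.full((8, 120), 1.0 / 120)              # [batch_size, memory_time]
#   values = np.random.rand(8, 120, 512)                   # [batch_size, memory_time, memory_size]
#   context = np.matmul(alignments[:, None, :], values)    # [batch_size, 1, memory_size]
#   context = context[:, 0, :]                             # squeeze -> [batch_size, memory_size]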
27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 28 | context = array_ops.squeeze(context, [1]) 29 | 30 | if attention_layer is not None: 31 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 32 | else: 33 | attention = context 34 | 35 | return attention, alignments, next_attention_state, max_attentions 36 | 37 | 38 | def _location_sensitive_score(W_query, W_fil, W_keys): 39 | """Implements Bahdanau-style (cumulative) scoring function. 40 | This attention is described in: 41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 42 | gio, “Attention-based models for speech recognition,” in Ad- 43 | vances in Neural Information Processing Systems, 2015, pp. 44 | 577–585. 45 | 46 | ############################################################################# 47 | hybrid attention (content-based + location-based) 48 | f = F * α_{i-1} 49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 50 | ############################################################################# 51 | 52 | Args: 53 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 54 | W_fil: processed previous alignments into location features, shape '[batch_size, max_time, attention_dim]' 55 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 56 | Returns: 57 | A '[batch_size, max_time]' attention score (energy) 58 | """ 59 | # Get the number of hidden units from the trailing dimension of keys 60 | dtype = W_query.dtype 61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 62 | 63 | v_a = tf.get_variable( 64 | 'attention_variable_projection', shape=[num_units], dtype=dtype, 65 | initializer=tf.contrib.layers.xavier_initializer()) 66 | b_a = tf.get_variable( 67 | 'attention_bias', shape=[num_units], dtype=dtype, 68 | initializer=tf.zeros_initializer()) 69 | 70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 71 | 72 | def _smoothing_normalization(e): 73 | """Applies a smoothing normalization function instead of softmax 74 | Introduced in: 75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 76 | gio, “Attention-based models for speech recognition,” in Ad- 77 | vances in Neural Information Processing Systems, 2015, pp. 78 | 577–585. 79 | 80 | ############################################################################ 81 | Smoothing normalization function 82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 83 | ############################################################################ 84 | 85 | Args: 86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 87 | values of an attention mechanism 88 | Returns: 89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 90 | attendance to multiple memory time steps. 91 | """ 92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 93 | 94 | 95 | class LocationSensitiveAttention(BahdanauAttention): 96 | """Implements Bahdanau-style (cumulative) scoring function. 97 | Usually referred to as "hybrid" attention (content-based + location-based) 98 | Extends the additive attention described in: 99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 100 | tion by jointly learning to align and translate,” in Proceedings 101 | of ICLR, 2015." 102 | to use previous alignments as additional location features. 103 | 104 | This attention is described in: 105 | J. K.
Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 106 | gio, “Attention-based models for speech recognition,” in Ad- 107 | vances in Neural Information Processing Systems, 2015, pp. 108 | 577–585. 109 | """ 110 | 111 | def __init__(self, 112 | num_units, 113 | memory, 114 | hparams, 115 | is_training, 116 | mask_encoder=True, 117 | memory_sequence_length=None, 118 | smoothing=False, 119 | cumulate_weights=True, 120 | name='LocationSensitiveAttention'): 121 | """Construct the Attention mechanism. 122 | Args: 123 | num_units: The depth of the query mechanism. 124 | memory: The memory to query; usually the output of an RNN encoder. This 125 | tensor should be shaped `[batch_size, max_time, ...]`. 126 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 127 | memory_sequence_length (optional): Sequence lengths for the batch entries 128 | in memory. If provided, the memory tensor rows are masked with zeros 129 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 130 | smoothing (optional): Boolean. Determines which normalization function to use. 131 | Default normalization function (probablity_fn) is softmax. If smoothing is 132 | enabled, we replace softmax with: 133 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 134 | Introduced in: 135 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 136 | gio, “Attention-based models for speech recognition,” in Ad- 137 | vances in Neural Information Processing Systems, 2015, pp. 138 | 577–585. 139 | This is mainly used if the model wants to attend to multiple input parts 140 | at the same decoding step. We probably won't be using it since multiple sound 141 | frames may depend on the same character/phone, probably not the way around. 142 | Note: 143 | We still keep it implemented in case we want to test it. They used it in the 144 | paper in the context of speech recognition, where one phoneme may depend on 145 | multiple subsequent sound frames. 146 | name: Name to use when creating ops. 147 | """ 148 | #Create normalization function 149 | #Setting it to None defaults in using softmax 150 | normalization_function = _smoothing_normalization if (smoothing == True) else None 151 | memory_length = memory_sequence_length if (mask_encoder==True) else None 152 | super(LocationSensitiveAttention, self).__init__( 153 | num_units=num_units, 154 | memory=memory, 155 | memory_sequence_length=memory_length, 156 | probability_fn=normalization_function, 157 | name=name) 158 | 159 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 160 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True, 161 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 162 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 163 | dtype=tf.float32, name='location_features_layer') 164 | self._cumulate = cumulate_weights 165 | self.synthesis_constraint = hparams.synthesis_constraint and not is_training 166 | self.attention_win_size = tf.convert_to_tensor(hparams.attention_win_size, dtype=tf.int32) 167 | self.constraint_type = hparams.synthesis_constraint_type 168 | 169 | def __call__(self, query, state, prev_max_attentions): 170 | """Score the query based on the keys and values. 171 | Args: 172 | query: Tensor of dtype matching `self.values` and shape 173 | `[batch_size, query_depth]`. 
174 | state (previous alignments): Tensor of dtype matching `self.values` and shape 175 | `[batch_size, alignments_size]` 176 | (`alignments_size` is memory's `max_time`). 177 | Returns: 178 | alignments: Tensor of dtype matching `self.values` and shape 179 | `[batch_size, alignments_size]` (`alignments_size` is memory's 180 | `max_time`). 181 | """ 182 | previous_alignments = state 183 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 184 | 185 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 186 | processed_query = self.query_layer(query) if self.query_layer else query 187 | # -> [batch_size, 1, attention_dim] 188 | processed_query = tf.expand_dims(processed_query, 1) 189 | 190 | # processed_location_features shape [batch_size, max_time, attention dimension] 191 | # [batch_size, max_time] -> [batch_size, max_time, 1] 192 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 193 | # location features [batch_size, max_time, filters] 194 | f = self.location_convolution(expanded_alignments) 195 | # Projected location features [batch_size, max_time, attention_dim] 196 | processed_location_features = self.location_layer(f) 197 | 198 | # energy shape [batch_size, max_time] 199 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 200 | 201 | if self.synthesis_constraint: 202 | Tx = tf.shape(energy)[-1] 203 | # prev_max_attentions = tf.squeeze(prev_max_attentions, [-1]) 204 | if self.constraint_type == 'monotonic': 205 | key_masks = tf.sequence_mask(prev_max_attentions, Tx) 206 | reverse_masks = tf.sequence_mask(Tx - self.attention_win_size - prev_max_attentions, Tx)[:, ::-1] 207 | else: 208 | assert self.constraint_type == 'window' 209 | key_masks = tf.sequence_mask(prev_max_attentions - (self.attention_win_size // 2 + (self.attention_win_size % 2 != 0)), Tx) 210 | reverse_masks = tf.sequence_mask(Tx - (self.attention_win_size // 2) - prev_max_attentions, Tx)[:, ::-1] 211 | 212 | masks = tf.logical_or(key_masks, reverse_masks) 213 | paddings = tf.ones_like(energy) * (-2 ** 32 + 1) # (N, Ty/r, Tx) 214 | energy = tf.where(tf.equal(masks, False), energy, paddings) 215 | 216 | # alignments shape = energy shape = [batch_size, max_time] 217 | alignments = self._probability_fn(energy, previous_alignments) 218 | max_attentions = tf.argmax(alignments, -1, output_type=tf.int32) # (N, Ty/r) 219 | 220 | # Cumulate alignments 221 | if self._cumulate: 222 | next_state = alignments + previous_alignments 223 | else: 224 | next_state = alignments 225 | 226 | return alignments, next_state, max_attentions 227 | -------------------------------------------------------------------------------- /tacotron/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import collections 4 | 5 | import tensorflow as tf 6 | from tacotron.models.helpers import TacoTestHelper, TacoTrainingHelper 7 | from tensorflow.contrib.seq2seq.python.ops import decoder 8 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 9 | from tensorflow.python.framework import ops, tensor_shape 10 | from tensorflow.python.layers import base as layers_base 11 | from tensorflow.python.ops import rnn_cell_impl 12 | from tensorflow.python.util import nest 13 | 14 | 15 | class CustomDecoderOutput( 16 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 
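# Fields (inferred from CustomDecoder.step below): `rnn_output` holds the decoder's
# projected output frames, `token_output` the <stop_token> predictions, and
# `sample_id` the (unused) sample ids returned by the helper.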
17 | pass 18 | 19 | 20 | class CustomDecoder(decoder.Decoder): 21 | """Custom sampling decoder. 22 | 23 | Allows for stop token prediction at inference time 24 | and returns equivalent loss in training time. 25 | 26 | Note: 27 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 28 | """ 29 | 30 | def __init__(self, cell, helper, initial_state, output_layer=None): 31 | """Initialize CustomDecoder. 32 | Args: 33 | cell: An `RNNCell` instance. 34 | helper: A `Helper` instance. 35 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 36 | The initial state of the RNNCell. 37 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 38 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 39 | to storing the result or sampling. 40 | Raises: 41 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 42 | """ 43 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 44 | if not isinstance(helper, helper_py.Helper): 45 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 46 | if (output_layer is not None 47 | and not isinstance(output_layer, layers_base.Layer)): 48 | raise TypeError( 49 | "output_layer must be a Layer, received: %s" % type(output_layer)) 50 | self._cell = cell 51 | self._helper = helper 52 | self._initial_state = initial_state 53 | self._output_layer = output_layer 54 | 55 | @property 56 | def batch_size(self): 57 | return self._helper.batch_size 58 | 59 | def _rnn_output_size(self): 60 | size = self._cell.output_size 61 | if self._output_layer is None: 62 | return size 63 | else: 64 | # To use layer's compute_output_shape, we need to convert the 65 | # RNNCell's output_size entries into shapes with an unknown 66 | # batch size. We then pass this through the layer's 67 | # compute_output_shape and read off all but the first (batch) 68 | # dimensions to get the output size of the rnn with the layer 69 | # applied to the top. 70 | output_shape_with_unknown_batch = nest.map_structure( 71 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 72 | size) 73 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 74 | output_shape_with_unknown_batch) 75 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 76 | 77 | @property 78 | def output_size(self): 79 | # Return the cell output and the id 80 | return CustomDecoderOutput( 81 | rnn_output=self._rnn_output_size(), 82 | token_output=self._helper.token_output_size, 83 | sample_id=self._helper.sample_ids_shape) 84 | 85 | @property 86 | def output_dtype(self): 87 | # Assume the dtype of the cell is the output_size structure 88 | # containing the input_state's first component's dtype. 89 | # Return that structure and the sample_ids_dtype from the helper. 90 | dtype = nest.flatten(self._initial_state)[0].dtype 91 | return CustomDecoderOutput( 92 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 93 | tf.float32, 94 | self._helper.sample_ids_dtype) 95 | 96 | def initialize(self, name=None): 97 | """Initialize the decoder. 98 | Args: 99 | name: Name scope for any created operations. 100 | Returns: 101 | `(finished, first_inputs, initial_state)`. 102 | """ 103 | return self._helper.initialize() + (self._initial_state,) 104 | 105 | def step(self, time, inputs, state, name=None): 106 | """Perform a custom decoding step. 107 | Enables for dyanmic prediction 108 | Args: 109 | time: scalar `int32` tensor. 110 | inputs: A (structure of) input tensors. 
111 | state: A (structure of) state tensors and TensorArrays. 112 | name: Name scope for any created operations. 113 | Returns: 114 | `(outputs, next_state, next_inputs, finished)`. 115 | """ 116 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 117 | #Call outputprojection wrapper cell 118 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 119 | 120 | #apply output_layer (if existant) 121 | if self._output_layer is not None: 122 | cell_outputs = self._output_layer(cell_outputs) 123 | sample_ids = self._helper.sample( 124 | time=time, outputs=cell_outputs, state=cell_state) 125 | 126 | (finished, next_inputs, next_state) = self._helper.next_inputs( 127 | time=time, 128 | outputs=cell_outputs, 129 | state=cell_state, 130 | sample_ids=sample_ids, 131 | stop_token_prediction=stop_token) 132 | 133 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 134 | return (outputs, next_state, next_inputs, finished) 135 | -------------------------------------------------------------------------------- /tacotron/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope('TacoTestHelper'): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn't 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended 53 | else: 54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option 55 | 56 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope('TacoTrainingHelper'): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.tacotron_natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) #Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == 'scheduled': 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or 'TacoTrainingHelper'): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | '''Returns all-zero frames for a given batch size and output dimension''' 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = init 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]init, final[ 143 | # decay reach minimal value at step ~40k 144 | 145 | # Phase 3: tfr = final 146 | # clip by minimal teacher forcing ratio 
value (step >~ 40k) 147 | ################################################################# 148 | #Pick final teacher forcing rate value 149 | if hparams.tacotron_teacher_forcing_final_ratio is not None: 150 | alpha = float(hparams.tacotron_teacher_forcing_final_ratio / hparams.tacotron_teacher_forcing_init_ratio) 151 | 152 | else: 153 | assert hparams.tacotron_teacher_forcing_decay_alpha is not None 154 | alpha = hparams.tacotron_teacher_forcing_decay_alpha 155 | 156 | #Compute natural cosine decay 157 | tfr = tf.train.cosine_decay(init_tfr, 158 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr ~= init at step 10k 159 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr ~= final at step ~40k 160 | alpha=alpha, #tfr = alpha% of init_tfr as final value 161 | name='tfr_cosine_decay') 162 | 163 | #force teacher forcing ratio to take initial value when global step < start decay step. 164 | narrow_tfr = tf.cond( 165 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 166 | lambda: tf.convert_to_tensor(init_tfr), 167 | lambda: tfr) 168 | 169 | return narrow_tfr -------------------------------------------------------------------------------- /tacotron/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import time 5 | from time import sleep 6 | 7 | import tensorflow as tf 8 | from hparams import hparams, hparams_debug_string 9 | from infolog import log 10 | from tacotron.synthesizer import Synthesizer 11 | from tqdm import tqdm 12 | 13 | 14 | def generate_fast(model, text): 15 | model.synthesize([text], None, None, None, None) 16 | 17 | 18 | def run_live(args, checkpoint_path, hparams): 19 | #Log to Terminal without keeping any records in files 20 | log(hparams_debug_string()) 21 | synth = Synthesizer() 22 | synth.load(checkpoint_path, hparams) 23 | 24 | #Generate fast greeting message 25 | greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!' 26 | log(greetings) 27 | generate_fast(synth, greetings) 28 | 29 | #Interaction loop 30 | while True: 31 | try: 32 | text = input() 33 | generate_fast(synth, text) 34 | 35 | except KeyboardInterrupt: 36 | leave = 'Thank you for testing our features. see you soon.' 
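#On Ctrl+C: read the goodbye message aloud, give playback a couple of seconds to finish, then leave the interaction loop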
37 | log(leave) 38 | generate_fast(synth, leave) 39 | sleep(2) 40 | break 41 | 42 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 43 | eval_dir = os.path.join(output_dir, 'eval') 44 | log_dir = os.path.join(output_dir, 'logs-eval') 45 | print('eval_dir:',eval_dir) 46 | print('args.mels_dir:',args.mels_dir) 47 | 48 | if args.model == 'Tacotron-2': 49 | assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir) 50 | 51 | #Create output path if it doesn't exist 52 | os.makedirs(eval_dir, exist_ok=True) 53 | os.makedirs(log_dir, exist_ok=True) 54 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 55 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 56 | 57 | log(hparams_debug_string()) 58 | synth = Synthesizer() 59 | synth.load(checkpoint_path, hparams) 60 | 61 | #Set inputs batch wise 62 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 63 | 64 | log('Starting Synthesis') 65 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 66 | for i, texts in enumerate(tqdm(sentences)): 67 | start = time.time() 68 | basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))] 69 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 70 | 71 | for elems in zip(texts, mel_filenames, speaker_ids): 72 | file.write('|'.join([str(x) for x in elems]) + '\n') 73 | log('synthesized mel spectrograms at {}'.format(eval_dir)) 74 | return eval_dir 75 | 76 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 77 | GTA = (args.GTA == 'True') 78 | if GTA: 79 | synth_dir = os.path.join(output_dir, 'gta') 80 | 81 | #Create output path if it doesn't exist 82 | os.makedirs(synth_dir, exist_ok=True) 83 | else: 84 | synth_dir = os.path.join(output_dir, 'natural') 85 | 86 | #Create output path if it doesn't exist 87 | os.makedirs(synth_dir, exist_ok=True) 88 | 89 | metadata_filename = os.path.join(args.input_dir, 'train.txt') 90 | log(hparams_debug_string()) 91 | synth = Synthesizer() 92 | synth.load(checkpoint_path, hparams, gta=GTA) 93 | with open(metadata_filename, encoding='utf-8') as f: 94 | metadata = [line.strip().split('|') for line in f] 95 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 96 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600) 97 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 98 | 99 | #Set inputs batch wise 100 | metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)] 101 | 102 | log('Starting Synthesis') 103 | mel_dir = os.path.join(args.input_dir, 'mels') 104 | wav_dir = os.path.join(args.input_dir, 'audio') 105 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: 106 | for i, meta in enumerate(tqdm(metadata)): 107 | texts = [m[5] for m in meta] 108 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta] 109 | wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta] 110 | basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames] 111 | mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames) 112 | 113 | for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts): 114 | file.write('|'.join([str(x) for x in elems]) + '\n') 115 | log('synthesized mel spectrograms at {}'.format(synth_dir)) 116 | return 
os.path.join(synth_dir, 'map.txt') 117 | 118 | def tacotron_synthesize(args, hparams, checkpoint, sentences=None): 119 | output_dir = 'tacotron_' + args.output_dir 120 | try: 121 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 122 | log('loaded model at {}'.format(checkpoint_path)) 123 | except Exception: 124 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) 125 | 126 | if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus: 127 | raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format( 128 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus)) 129 | 130 | if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0: 131 | raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format( 132 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus)) 133 | 134 | if args.mode == 'eval': 135 | return run_eval(args, checkpoint_path, output_dir, hparams, sentences) 136 | elif args.mode == 'synthesis': 137 | output_dir = args.gta_output 138 | return run_synthesis(args, checkpoint_path, output_dir, hparams) 139 | else: 140 | run_live(args, checkpoint_path, hparams) 141 | -------------------------------------------------------------------------------- /tacotron/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wave 3 | from datetime import datetime 4 | import platform #needed for platform.system() when playing synthesized audio in live mode 5 | import numpy as np 6 | # import pyaudio 7 | # import sounddevice as sd 8 | import tensorflow as tf 9 | from datasets import audio 10 | from infolog import log 11 | from librosa import effects 12 | from tacotron.models import create_model 13 | from tacotron.utils import plot 14 | from tacotron.utils.text import text_to_sequence 15 | 16 | 17 | class Synthesizer: 18 | def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'): 19 | log('Constructing model: %s' % model_name) 20 | #Force the batch size to be known in order to use attention masking in batch synthesis 21 | inputs = tf.placeholder(tf.int32, (None, None), name='inputs') 22 | input_lengths = tf.placeholder(tf.int32, (None), name='input_lengths') 23 | targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets') 24 | split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos') 25 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 26 | self.model = create_model(model_name, hparams) 27 | if gta: 28 | self.model.initialize(inputs, input_lengths, targets, gta=gta, split_infos=split_infos) 29 | else: 30 | self.model.initialize(inputs, input_lengths, split_infos=split_infos) 31 | 32 | self.mel_outputs = self.model.tower_mel_outputs 33 | self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None 34 | self.alignments = self.model.tower_alignments 35 | self.stop_token_prediction = self.model.tower_stop_token_prediction 36 | self.targets = targets 37 | 38 | if hparams.GL_on_GPU: 39 | self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') 40 | self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') 41 | 42 | self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, hparams) 43 | 
self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(self.GLGPU_lin_inputs, hparams) 44 | 45 | self.gta = gta 46 | self._hparams = hparams 47 | #pad input sequences with the 0 ( _ ) 48 | self._pad = 0 49 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 50 | #to avoid any possible conflicts, without affecting the output range of the model too much 51 | if hparams.symmetric_mels: 52 | self._target_pad = -hparams.max_abs_value 53 | else: 54 | self._target_pad = 0. 55 | 56 | self.inputs = inputs 57 | self.input_lengths = input_lengths 58 | self.targets = targets 59 | self.split_infos = split_infos 60 | 61 | log('Loading checkpoint: %s' % checkpoint_path) 62 | #Memory allocation on the GPUs as needed 63 | config = tf.ConfigProto() 64 | config.gpu_options.allow_growth = True 65 | config.allow_soft_placement = True 66 | 67 | self.session = tf.Session(config=config) 68 | self.session.run(tf.global_variables_initializer()) 69 | 70 | saver = tf.train.Saver() 71 | saver.restore(self.session, checkpoint_path) 72 | 73 | def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames): 74 | hparams = self._hparams 75 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 76 | #[-max, max] or [0,max] 77 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value) 78 | 79 | #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario) 80 | while len(texts) % hparams.tacotron_synthesis_batch_size != 0: 81 | texts.append(texts[-1]) 82 | basenames.append(basenames[-1]) 83 | if mel_filenames is not None: 84 | mel_filenames.append(mel_filenames[-1]) 85 | 86 | assert 0 == len(texts) % self._hparams.tacotron_num_gpus 87 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 88 | input_lengths = [len(seq) for seq in seqs] 89 | 90 | size_per_device = len(seqs) // self._hparams.tacotron_num_gpus 91 | 92 | #Pad inputs according to each GPU max length 93 | input_seqs = None 94 | split_infos = [] 95 | for i in range(self._hparams.tacotron_num_gpus): 96 | device_input = seqs[size_per_device*i: size_per_device*(i+1)] 97 | device_input, max_seq_len = self._prepare_inputs(device_input) 98 | input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input 99 | split_infos.append([max_seq_len, 0, 0, 0]) 100 | 101 | feed_dict = { 102 | self.inputs: input_seqs, 103 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32), 104 | } 105 | 106 | if self.gta: 107 | np_targets = [np.load(mel_filename) for mel_filename in mel_filenames] 108 | target_lengths = [len(np_target) for np_target in np_targets] 109 | 110 | #pad targets according to each GPU max length 111 | target_seqs = None 112 | for i in range(self._hparams.tacotron_num_gpus): 113 | device_target = np_targets[size_per_device*i: size_per_device*(i+1)] 114 | device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step) 115 | target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target 116 | split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe? 
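#Each device's GTA targets are padded to that device's own max length (rounded up to outputs_per_step) and concatenated along the time axis (axis 1); split_infos records [max_seq_len, max_target_len, 0, 0] per GPU so the model can split the concatenated batch back per device inside the graph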
117 | 118 | feed_dict[self.targets] = target_seqs 119 | assert len(np_targets) == len(texts) 120 | 121 | feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32) 122 | 123 | if self.gta or not hparams.predict_linear: 124 | mels, alignments, stop_tokens = self.session.run([self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict) 125 | 126 | #Linearize outputs (n_gpus -> 1D) 127 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 128 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 129 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 130 | 131 | if not self.gta: 132 | #Natural batch synthesis 133 | #Get Mel lengths for the entire batch from stop_tokens predictions 134 | target_lengths = self._get_output_lengths(stop_tokens) 135 | 136 | #Take off the batch wise padding 137 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 138 | assert len(mels) == len(texts) 139 | 140 | else: 141 | linears, mels, alignments, stop_tokens = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict) 142 | 143 | #Linearize outputs (1D arrays) 144 | linears = [linear for gpu_linear in linears for linear in gpu_linear] 145 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 146 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 147 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 148 | 149 | #Natural batch synthesis 150 | #Get Mel/Linear lengths for the entire batch from stop_tokens predictions 151 | target_lengths = self._get_output_lengths(stop_tokens) 152 | 153 | #Take off the batch wise padding 154 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 155 | linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)] 156 | linears = np.clip(linears, T2_output_range[0], T2_output_range[1]) 157 | assert len(mels) == len(linears) == len(texts) 158 | 159 | mels = np.clip(mels, T2_output_range[0], T2_output_range[1]) 160 | 161 | if basenames is None: 162 | #Generate wav and read it 163 | if hparams.GL_on_GPU: 164 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mels[0]}) 165 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 166 | else: 167 | wav = audio.inv_mel_spectrogram(mels[0].T, hparams) 168 | audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way 169 | 170 | if platform.system() == 'Linux': 171 | #Linux wav reader 172 | os.system('aplay temp.wav') 173 | 174 | elif platform.system() == 'Windows': 175 | #windows wav reader 176 | os.system('start /min mplay32 /play /close temp.wav') 177 | 178 | else: 179 | raise RuntimeError('Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!') 180 | 181 | return 182 | 183 | 184 | saved_mels_paths = [] 185 | speaker_ids = [] 186 | for i, mel in enumerate(mels): 187 | #Get speaker id for global conditioning (only used with GTA generally) 188 | if hparams.gin_channels > 0: 189 | raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.') 190 | speaker_id = '' #set the rule to determine speaker id. By using the file basename maybe? 
(basenames are inside "basenames" variable) 191 | speaker_ids.append(speaker_id) #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker) 192 | else: 193 | speaker_id = '' 194 | speaker_ids.append(speaker_id) 195 | 196 | # Write the spectrogram to disk 197 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 198 | mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i])) 199 | np.save(mel_filename, mel, allow_pickle=False) 200 | saved_mels_paths.append(mel_filename) 201 | 202 | if log_dir is not None: 203 | #save wav (mel -> wav) 204 | if hparams.GL_on_GPU: 205 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mel}) 206 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 207 | else: 208 | wav = audio.inv_mel_spectrogram(mel.T, hparams) 209 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), sr=hparams.sample_rate) 210 | 211 | #save alignments 212 | plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])), 213 | title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i]) 214 | 215 | #save mel spectrogram plot 216 | plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])), 217 | title='{}'.format(texts[i]), split_title=True) 218 | 219 | if hparams.predict_linear: 220 | #save wav (linear -> wav) 221 | if hparams.GL_on_GPU: 222 | wav = self.session.run(self.GLGPU_lin_outputs, feed_dict={self.GLGPU_lin_inputs: linears[i]}) 223 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 224 | else: 225 | wav = audio.inv_linear_spectrogram(linears[i].T, hparams) 226 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), sr=hparams.sample_rate) 227 | 228 | #save linear spectrogram plot 229 | plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])), 230 | title='{}'.format(texts[i]), split_title=True, auto_aspect=True) 231 | 232 | return saved_mels_paths, speaker_ids 233 | 234 | def _round_up(self, x, multiple): 235 | remainder = x % multiple 236 | return x if remainder == 0 else x + multiple - remainder 237 | 238 | def _prepare_inputs(self, inputs): 239 | max_len = max([len(x) for x in inputs]) 240 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 241 | 242 | def _pad_input(self, x, length): 243 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 244 | 245 | def _prepare_targets(self, targets, alignment): 246 | max_len = max([len(t) for t in targets]) 247 | data_len = self._round_up(max_len, alignment) 248 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 249 | 250 | def _pad_target(self, t, length): 251 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 252 | 253 | def _get_output_lengths(self, stop_tokens): 254 | #Determine each mel length by the stop token predictions. 
(len = first occurence of 1 in stop_tokens row wise) 255 | output_lengths = [row.index(1) if 1 in row else len(row) for row in np.round(stop_tokens).tolist()] 256 | 257 | # output_lengths1 = [] 258 | # stop_tokens_list = np.round(stop_tokens).tolist() 259 | # for row in stop_tokens_list: 260 | # if 1 in row : 261 | # output_lengths1 = [row.index(1)] 262 | # else : 263 | # output_lengths1 = [len(row)] 264 | 265 | return output_lengths 266 | -------------------------------------------------------------------------------- /tacotron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /tacotron/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | 15 | from unidecode import unidecode 16 | 17 | from .numbers import normalize_numbers 18 | 19 | # Regular expression matching whitespace: 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | '''lowercase input tokens. 
57 | ''' 58 | return text.lower() 59 | 60 | 61 | def collapse_whitespace(text): 62 | return re.sub(_whitespace_re, ' ', text) 63 | 64 | 65 | def convert_to_ascii(text): 66 | return unidecode(text) 67 | 68 | 69 | def basic_cleaners(text): 70 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 71 | text = lowercase(text) 72 | text = collapse_whitespace(text) 73 | return text 74 | 75 | 76 | def transliteration_cleaners(text): 77 | '''Pipeline for non-English text that transliterates to ASCII.''' 78 | text = convert_to_ascii(text) 79 | text = lowercase(text) 80 | text = collapse_whitespace(text) 81 | return text 82 | 83 | 84 | def english_cleaners(text): 85 | '''Pipeline for English text, including number and abbreviation expansion.''' 86 | text = convert_to_ascii(text) 87 | # text = lowercase(text) 88 | text = expand_numbers(text) 89 | text = expand_abbreviations(text) 90 | text = collapse_whitespace(text) 91 | return text 92 | -------------------------------------------------------------------------------- /tacotron/utils/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_symbols = [ 4 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 5 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 6 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 7 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 8 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 9 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 10 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 11 | ] 12 | 13 | _valid_symbol_set = set(valid_symbols) 14 | 15 | 16 | class CMUDict: 17 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 18 | def __init__(self, file_or_path, keep_ambiguous=True): 19 | if isinstance(file_or_path, str): 20 | with open(file_or_path, encoding='latin-1') as f: 21 | entries = _parse_cmudict(f) 22 | else: 23 | entries = _parse_cmudict(file_or_path) 24 | if not keep_ambiguous: 25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 26 | self._entries = entries 27 | 28 | 29 | def __len__(self): 30 | return len(self._entries) 31 | 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | 39 | _alt_re = re.compile(r'\([0-9]+\)') 40 | 41 | 42 | def _parse_cmudict(file): 43 | cmudict = {} 44 | for line in file: 45 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 46 | parts = line.split(' ') 47 | word = re.sub(_alt_re, '', parts[0]) 48 | pronunciation = _get_pronunciation(parts[1]) 49 | if pronunciation: 50 | if word in cmudict: 51 | cmudict[word].append(pronunciation) 52 | else: 53 | cmudict[word] = [pronunciation] 54 | return cmudict 55 | 56 | 57 | def _get_pronunciation(s): 58 | parts = s.strip().split(' ') 59 | for part in parts: 60 | if part not in _valid_symbol_set: 61 | return None 62 | return ' '.join(parts) 63 | -------------------------------------------------------------------------------- /tacotron/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = 
re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /tacotron/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | import numpy as np 6 | 7 | 8 | def split_title_line(title_text, max_words=5): 9 | """ 10 | A function that splits any string based on specific character 11 | (returning it with the string), with maximum number of words on it 12 | """ 13 | seq = title_text.split() 14 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 15 | 16 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): 17 | if max_len is not None: 18 | alignment = alignment[:, :max_len] 19 | 20 | fig = plt.figure(figsize=(8, 6)) 21 | ax = fig.add_subplot(111) 22 | 23 | im = ax.imshow( 24 | alignment, 25 | aspect='auto', 26 | origin='lower', 27 | interpolation='none') 28 | fig.colorbar(im, ax=ax) 29 | xlabel = 'Decoder timestep' 30 | 31 | if split_title: 32 | title = split_title_line(title) 33 | 34 | plt.xlabel(xlabel) 35 | plt.title(title) 36 | plt.ylabel('Encoder timestep') 37 | plt.tight_layout() 38 | plt.savefig(path, format='png') 39 | plt.close() 40 | 41 | 42 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): 43 | if max_len is not None: 44 | target_spectrogram = target_spectrogram[:max_len] 45 | pred_spectrogram = pred_spectrogram[:max_len] 46 | 47 | if split_title: 48 | title = split_title_line(title) 49 | 50 | fig = plt.figure(figsize=(10, 8)) 51 | # Set common labels 52 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16) 53 | 54 | #target spectrogram subplot 55 | if target_spectrogram is not None: 56 | ax1 = fig.add_subplot(311) 57 | ax2 = fig.add_subplot(312) 58 | 59 | if auto_aspect: 60 | im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none') 61 | else: 62 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none') 63 | ax1.set_title('Target Mel-Spectrogram') 64 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1) 65 | ax2.set_title('Predicted Mel-Spectrogram') 66 | else: 67 | ax2 = fig.add_subplot(211) 68 | 69 | if auto_aspect: 70 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none') 71 | else: 72 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none') 73 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2) 74 | 75 | plt.tight_layout() 76 | plt.savefig(path, format='png') 77 | plt.close() 78 | -------------------------------------------------------------------------------- /tacotron/utils/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from . 
import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | 12 | label = 'phone' # pingyin phone 13 | 14 | if(label == 'pinyin'): 15 | #_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? ' 16 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890@!\'(),-.:;? ' 17 | 18 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 19 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 20 | # Export all symbols: 21 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 22 | 23 | 24 | if(label == 'phone'): 25 | _characters = [ 26 | 'breath','cough','noise','smack','um','sil','sp1', 27 | 'a5','a1','a4','ai5','ai1','ai2','ai3','ai4','an1', 28 | 'an2','an3','an4','ang1','ang2','ang4','ao1','ao2', 29 | 'ao3','ao4','b','a2','a3','ang5','ang3','ao5','ei5', 30 | 'ei1','ei2','ei3','ei4','en5','en1','en2','en3','en4', 31 | 'eng1','eng2','eng4','i1','i2','i3','i4','ian5','ian1', 32 | 'ian2','ian3','ian4','iao1','iao2','iao3','iao4','ie1', 33 | 'ie2','ie3','ie4','in1','in4','ing1','ing2','ing3','ing4', 34 | 'o5','o1','o2','o3','o4','u5','u2','u3','u4','c','e4', 35 | 'ch','an5','e1','e2','e3','eng5','eng3','iii5','iii1', 36 | 'iii2','iii3','iii4','ong1','ong2','ong3','ong4','ou5', 37 | 'ou1','ou2','ou3','ou4','u1','uai1','uai3','uai4','uan5', 38 | 'uan1','uan2','uan3','uan4','uang5','uang1','uang2','uang3', 39 | 'uang4','uei5','uei1','uei2','uen1','uen2','uen3','uo1', 40 | 'uo4','ii1','ii2','ii3','ii4','ong5','uei3','uei4','uen4', 41 | 'uo2','d','e5','i5','ia2','ia3','iao5','ie5','iou1','uo5', 42 | 'uo3','eer2','er5','er2','er3','er4','f','g','ua5','ua1', 43 | 'ua2','ua3','ua4','uai2','h','uen5','j','ia5','ia1','ia4', 44 | 'iang5','iang1','iang2','iang3','iang4','in5','in2','in3', 45 | 'ing5','iong2','iong3','iou5','iou2','iou3','iou4','v5', 46 | 'v1','v2','v3','v4','van1','van2','van3','van4','ve1', 47 | 've2','ve4','vn5','vn1','vn4','k','uai5','l','m','n', 48 | 'ng1','p','q','van5','vn2','r','s','sh','ii5','t','ueng1', 49 | 'ueng2','ueng3','ueng4','x','iong5','iong1','ve3','io5', 50 | 'io1','iong4','ve5','vn3','z','zh',',','!','。','?', 51 | '、',':','#1','#2','#3','#4','#',' ' 52 | ] 53 | symbols = [_pad, _eos] + _characters 54 | -------------------------------------------------------------------------------- /tacotron/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from . import cleaners 4 | from .symbols import symbols 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | label = 'phone' # pinyin phone 66 | def _symbols_to_sequence(symbols): 67 | if(label == 'phone'): 68 | symbols = re.split("( )", symbols) 69 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 70 | 71 | def _arpabet_to_sequence(text): 72 | return _symbols_to_sequence(['@' + s for s in text.split()]) 73 | 74 | 75 | def _should_keep_symbol(s): 76 | return s in _symbol_to_id and s is not '_' and s is not '~' 77 | -------------------------------------------------------------------------------- /tacotron2_client.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | 5 | import grpc,time 6 | import tensorflow as tf 7 | import numpy as np 8 | from numpy.core.multiarray import ndarray 9 | from tensorflow_serving.apis import predict_pb2 10 | from tensorflow_serving.apis import prediction_service_pb2_grpc 11 | 12 | from tensorflow.core.framework import tensor_pb2 13 | from tensorflow.core.framework import tensor_shape_pb2 14 | from tensorflow.core.framework import types_pb2 15 | 16 | from tacotron.utils.text import text_to_sequence 17 | from hparams import hparams 18 | 19 | import re 20 | import symbols 21 | 22 | # tensorflow_model_server --port=9001 --model_name=tacotron2 --model_base_path=./tacotron-2_melgan/save_model/ 23 | 24 | # sudo docker run -p 9002:8501 -p 9001:8500 -e CUDA_VISIBLE_DEVICES=0 --mount type=bind,source=./tacotron-2_melgan/save_model/, 25 | # target=/models/tacotron2 -e MODEL_NAME=tacotron2 -t tensorflow/serving:1.13.0-gpu --per_process_gpu_memory_fraction=0.5 26 | 27 | syms = symbols.symbols 28 | 29 | MAX_MESSAGE_LENGTH=-1 30 | 31 | def prepare_inputs(inputs): 32 | max_len = max([len(x) for x in inputs]) 33 | return np.stack([pad_input(x, max_len) for x in inputs]), max_len 34 | 35 | def pad_input(x, length): 36 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=0) 37 | 38 | def predict_tts(): 39 | 40 | texts = 'k a2 er2 p u3 #2 p ei2 uai4 s uen1 #1 uan2 h ua2 t i1 #4 。' 41 | # texts = 'b ao2 m a3 #1 p ei4 g ua4 #1 b o3 l uo2 an1 #3 , d iao1 ch an2 #1 van4 zh en3 #2 d ong3 ueng1 t a4 #4 。' 42 | s = [] 43 | 
texts_split = re.split("( )", texts) 44 | for i in texts_split: 45 | if(i in syms): 46 | index = syms.index(i) 47 | s.append(index) 48 | seqs = np.asarray(s) 49 | 50 | seqs_lengths = len(seqs) 51 | input_lengths_np = np.asarray(seqs_lengths, dtype=np.int32).reshape(1) 52 | 53 | input_seqs = seqs[np.newaxis].astype(np.int32) 54 | max_seq_len = seqs_lengths 55 | split_infos_np = np.asarray([max_seq_len, 0, 0, 0], dtype=np.int32)[np.newaxis] 56 | print('input_seqs:', input_seqs.shape) 57 | print('input_lengths_np:', input_lengths_np.shape) 58 | print('split_infos_np:', split_infos_np.shape) 59 | 60 | ############################# 61 | # texts = ['k a2 er2 p u3 #2 p ei2 uai4 s uen1 #1 uan2 h ua2 t i1 #4 。'] 62 | # t2_hparams = hparams.parse('') 63 | # cleaner_names = [x.strip() for x in t2_hparams.cleaners.split(',')] 64 | # seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 65 | # input_lengths_np = [len(seq) for seq in seqs] 66 | # input_lengths_np = np.asarray(input_lengths_np, dtype=np.int32) 67 | # 68 | # size_per_device = len(seqs) // t2_hparams.tacotron_num_gpus 69 | # 70 | # # Pad inputs according to each GPU max length 71 | # input_seqs = None 72 | # split_infos_np = [] 73 | # for i in range(t2_hparams.tacotron_num_gpus): 74 | # device_input = seqs[size_per_device * i: size_per_device * (i + 1)] 75 | # device_input, max_seq_len = prepare_inputs(device_input) 76 | # input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input 77 | # input_seqs = input_seqs.astype(np.int32) 78 | # split_infos_np.append([max_seq_len, 0, 0, 0]) 79 | # split_infos_np = np.asarray(split_infos_np, dtype=np.int32) 80 | # print('input_seqs:', input_seqs.shape) 81 | # print('input_lengths_np:', input_lengths_np.shape) 82 | # print('split_infos_np:', split_infos_np.shape) 83 | 84 | 85 | #-----------tacotron2------------ 86 | hostport = 'localhost:9001' 87 | 88 | channel = grpc.insecure_channel(hostport, options= 89 | [('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), 90 | ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH)]) 91 | 92 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) 93 | request = predict_pb2.PredictRequest() 94 | request.model_spec.name = 'tacotron2' 95 | request.model_spec.signature_name = 'predict' 96 | 97 | # tensor_proto = tensor_pb2.TensorProto(dtype=types_pb2.DT_STRING,string_val=[img_str]) 98 | # request.inputs['images'].CopyFrom(tensor_proto) 99 | 100 | request.inputs['inputs'].CopyFrom(tf.contrib.util.make_tensor_proto(input_seqs)) 101 | request.inputs['input_lengths'].CopyFrom(tf.contrib.util.make_tensor_proto(input_lengths_np)) 102 | request.inputs['split_infos'].CopyFrom(tf.contrib.util.make_tensor_proto(split_infos_np)) 103 | 104 | t1 = time.time() 105 | result_future = stub.Predict.future(request) 106 | print('time:',time.time() - t1) 107 | 108 | mel_out = result_future.result().outputs['mel'] 109 | mel_out_list = (tf.contrib.util.make_ndarray(mel_out).tolist()) 110 | mel_out_np = np.array(mel_out_list) # type: ndarray 111 | 112 | # mel_out_np = np.squeeze(mel_out_np, 0) 113 | mel_out_np = mel_out_np.astype(np.float32) 114 | print(mel_out_np.shape) 115 | np.save('mel_out1.npy', mel_out_np) 116 | 117 | return 118 | 119 | if __name__ == '__main__': 120 | 121 | # t = time.time() 122 | result = predict_tts() 123 | # print('time:',time.time() - t) 124 | 125 | print('done...') 126 | 127 | -------------------------------------------------------------------------------- /train_tacotron.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from time import sleep 4 | 5 | import infolog 6 | import tensorflow as tf 7 | from hparams import hparams 8 | from infolog import log 9 | from tacotron.synthesize import tacotron_synthesize 10 | from tacotron.train import tacotron_train 11 | 12 | os.environ["CUDA_VISIBLE_DEVICES"] = '1' 13 | 14 | log = infolog.log 15 | 16 | 17 | def save_seq(file, sequence, input_path): 18 | '''Save Tacotron-2 training state to disk. (To skip for future runs) 19 | ''' 20 | sequence = [str(int(s)) for s in sequence] + [input_path] 21 | with open(file, 'w') as f: 22 | f.write('|'.join(sequence)) 23 | 24 | def read_seq(file): 25 | '''Load Tacotron-2 training state from disk. (To skip if not first run) 26 | ''' 27 | if os.path.isfile(file): 28 | with open(file, 'r') as f: 29 | sequence = f.read().split('|') 30 | 31 | return [bool(int(s)) for s in sequence[:-1]], sequence[-1] 32 | else: 33 | return [0, 0, 0], '' 34 | 35 | def prepare_run(args): 36 | modified_hp = hparams.parse(args.hparams) 37 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 38 | run_name = args.name or args.model 39 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name)) 40 | os.makedirs(log_dir, exist_ok=True) 41 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name, args.slack_url) 42 | return log_dir, modified_hp 43 | 44 | def train(args, log_dir, hparams): 45 | state_file = os.path.join(log_dir, 'state_log') 46 | #Get training states 47 | (taco_state, GTA_state, wave_state), input_path = read_seq(state_file) 48 | print('taco_state, GTA_state, wave_state:',taco_state, GTA_state, wave_state) 49 | if not taco_state: 50 | log('\n#############################################################\n') 51 | log('Tacotron Train\n') 52 | log('###########################################################\n') 53 | checkpoint = tacotron_train(args, log_dir, hparams) 54 | tf.reset_default_graph() 55 | #Sleep 1/2 second to let the previous graph close and avoid error messages while synthesizing 56 | sleep(0.5) 57 | if checkpoint is None: 58 | raise RuntimeError('Error occurred while training Tacotron, exiting!') 59 | taco_state = 1 60 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 61 | else: 62 | checkpoint = os.path.join(log_dir, 'taco_pretrained/') 63 | # print('checkpoint:',checkpoint) 64 | log('tacotron_train done!!') 65 | 66 | if not GTA_state: 67 | log('\n#############################################################\n') 68 | log('Tacotron GTA Synthesis\n') 69 | log('###########################################################\n') 70 | input_path = tacotron_synthesize(args, hparams, checkpoint) 71 | tf.reset_default_graph() 72 | #Sleep 1/2 second to let the previous graph close and avoid error messages while the vocoder (MelGAN) is training 73 | sleep(0.5) 74 | GTA_state = 1 75 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 76 | else: 77 | input_path = os.path.join(log_dir, 'tacotron_' + args.output_dir, 'gta', 'map.txt') 78 | # input_path = './tacotron_output/gta/map.txt' 79 | log('Tacotron GTA Synthesis done') 80 | 81 | 82 | def main(): 83 | train_data_base = '/xxx/tacotron2_wavernn/' 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument('--base_dir', default=train_data_base) 86 | parser.add_argument('--hparams', default='', 87 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 88 | parser.add_argument('--tacotron_input', 
default='training_data/train.txt') 89 | parser.add_argument('--name', help='Name of logging directory.') 90 | parser.add_argument('--model', default='Tacotron-2') 91 | parser.add_argument('--input_dir', default=train_data_base + 'training_data', help='folder to contain inputs sentences/targets') 92 | parser.add_argument('--output_dir', default='output', help='folder to contain synthesized mel spectrograms') 93 | parser.add_argument('--mode', default='synthesis', help='mode for synthesis of tacotron after training') 94 | parser.add_argument('--gta_output', default=train_data_base + 'training_data/') 95 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode') 96 | parser.add_argument('--restore', type=bool, default=True, help='Set this to False to do a fresh training') 97 | parser.add_argument('--summary_interval', type=int, default=250, 98 | help='Steps between running summary ops') 99 | parser.add_argument('--embedding_interval', type=int, default=5000, 100 | help='Steps between updating embeddings projection visualization') 101 | parser.add_argument('--checkpoint_interval', type=int, default=5000, 102 | help='Steps between writing checkpoints') 103 | parser.add_argument('--eval_interval', type=int, default=5000, 104 | help='Steps between eval on test data') 105 | parser.add_argument('--tacotron_train_steps', type=int, default=400000, help='total number of tacotron training steps') 106 | parser.add_argument('--tf_log_level', type=int, default=3, help='Tensorflow C++ log level.') 107 | parser.add_argument('--slack_url', default=None, help='slack webhook notification destination link') 108 | args = parser.parse_args() 109 | 110 | accepted_models = ['Tacotron-2'] 111 | 112 | if args.model not in accepted_models: 113 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 114 | 115 | log_dir, hparams = prepare_run(args) 116 | 117 | if args.model == 'Tacotron-2': 118 | train(args, log_dir, hparams) 119 | else: 120 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models)) 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | --------------------------------------------------------------------------------