├── .gitignore ├── .vscode └── settings.json ├── Audio.py ├── Example_Results ├── Figures │ ├── 20200505.214958.IDX_0.PNG │ ├── 20200505.214958.IDX_1.PNG │ ├── 20200505.214958.IDX_2.PNG │ ├── 20200505.214958.IDX_3.PNG │ ├── 20200505.214958.IDX_4.PNG │ ├── 20200505.214958.IDX_5.PNG │ ├── 20200505.214958.IDX_6.PNG │ └── 20200505.214958.IDX_7.PNG ├── GST │ └── 20200506.001527.GST.PNG └── Wav │ ├── 20200505.214958.IDX_0.WAV │ ├── 20200505.214958.IDX_1.WAV │ ├── 20200505.214958.IDX_2.WAV │ ├── 20200505.214958.IDX_3.WAV │ ├── 20200505.214958.IDX_4.WAV │ ├── 20200505.214958.IDX_5.WAV │ ├── 20200505.214958.IDX_6.WAV │ └── 20200505.214958.IDX_7.WAV ├── Feeder.py ├── Figures └── Structure.png ├── Get_Path.py ├── Hyper_Parameters.json ├── Inference_Sentence_for_Training.txt ├── Inference_Wav_for_Training.txt ├── LICENSE ├── LICENSE.txt ├── Model.py ├── Modules ├── Attention │ ├── Layers.py │ ├── Steps.py │ └── __init__.py ├── GST.py ├── Taco2.py └── __init__.py ├── Papers ├── He, Deng, He - 2019 - Robust sequence-to-sequence acoustic modeling with stepwise monotonic attention for neural TTS.pdf ├── Prenger, Valle, Catanzaro - 2019 - Waveglow A Flow-based Generative Network for Speech Synthesis.pdf ├── Shen et al. - 2018 - Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions.pdf ├── Style Tokens Unsupervised Style Modeling Control and Transfer.pdf └── Wang et al. - 2017 - Tacotron Towards end-To-end speech synthesis.pdf ├── Pattern_Generator.py ├── ProgressBar.py ├── README.md ├── R_Script ├── TSNE.R └── VCTK_Outlier_Checker.R ├── Requirements.txt ├── Token_Index_Dict.ENG.json ├── Wav_for_Inference ├── FV.AWB.arctic_a0001.wav ├── FV.BDL.arctic_a0002.wav ├── FV.CLB.arctic_a0003.wav ├── FV.JMK.arctic_a0004.wav ├── FV.KSP.arctic_a0005.wav ├── FV.RMS.arctic_a0006.wav ├── FV.SLT.arctic_a0007.wav └── LJ.LJ050-0278.wav └── vctk_nonoutlier.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | Temp.py 131 | Temp1.py 132 | Temp/stepwise.py 133 | Wav_for_Inference/BC2013.CB-20K1-01-01.wav 134 | Wav_for_Inference/LJ.LJ001-0001.wav 135 | Wav_for_Inference/VCTK.p376_001.wav 136 | .vscode/settings.json 137 | Hyper_Parameters.CP.json 138 | nonoutlier.txt 139 | Sig.Original.png 140 | Sig.RemoveOutlier.png 141 | Split.Original.png 142 | Split.RemoveOutlier.png 143 | Trim.Original.png 144 | Trim.RemoveOutlier.png 145 | VCTK_Length.txt 146 | Bak/.gitignore 147 | Bak/Attention_Modules.py 148 | Bak/Audio.py 149 | Bak/Feeder.py 150 | Bak/Hyper_Parameters.json 151 | Bak/Inference_Sentence_for_Training.txt 152 | Bak/Inference_Wav_for_Training.txt 153 | Bak/LICENSE 154 | Bak/Model.py 155 | Bak/Modules.py 156 | Bak/Pattern_Generator.py 157 | Bak/ProgressBar.py 158 | Bak/README.md 159 | Bak/Token_Index_Dict.ENG.json 160 | Bak/VCTK_Outlier_Checker.R 161 | Bak/DCA.py 162 | Bak/Taco1_Modules.py 163 | Figures/Figure.pptx 164 | RAdam.py 165 | R_Script/Token_analysis.R 166 | Get_Path.py 167 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "C:\\Users\\Heejo\\Anaconda3\\python.exe" 3 | } -------------------------------------------------------------------------------- /Audio.py: -------------------------------------------------------------------------------- 1 | # https://github.com/keithito/tacotron/blob/master/util/audio.py 2 | # https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/audio/__init__.py 3 | # I only changed the hparams to usual parameters from oroginal code. 
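#
# Rough pipeline implemented below: preemphasis -> STFT magnitude -> (optional mel
# projection) -> dB conversion -> clipping/normalization, plus Griffin-Lim for the
# inverse direction. A minimal usage sketch, assuming a 16 kHz mono wav and the
# values from Hyper_Parameters.json ('example.wav' is a hypothetical file name):
#
#   y, _ = librosa.load('example.wav', sr= 16000)
#   mel = melspectrogram(
#       y, num_freq= 513, hop_length= 256, win_length= 1024,
#       num_mels= 80, sample_rate= 16000, max_abs_value= 4
#       )
#   # mel.shape == (80, n_frames); values are clipped to [-4, 4] by _symmetric_normalize.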
4 | 5 | import numpy as np 6 | from scipy import signal 7 | import librosa.filters 8 | import librosa 9 | 10 | 11 | def preemphasis(x, preemphasis = 0.97): 12 | return signal.lfilter([1, -preemphasis], [1], x) 13 | 14 | def inv_preemphasis(x, preemphasis = 0.97): 15 | return signal.lfilter([1], [1, -preemphasis], x) 16 | 17 | 18 | def spectrogram(y, num_freq, hop_length, win_length, sample_rate, ref_level_db = 20, max_abs_value = None, spectral_subtract= False): 19 | M = _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract) 20 | S = _amp_to_db(M) - ref_level_db 21 | return _normalize(S) if max_abs_value is None else _symmetric_normalize(S, max_abs_value= max_abs_value) 22 | 23 | def inv_spectrogram(spectrogram, num_freq, hop_length, win_length, sample_rate, ref_level_db = 20, power = 1.5, max_abs_value = None, griffin_lim_iters= 60): 24 | '''Converts spectrogram to waveform using librosa''' 25 | spectrogram = _denormalize(spectrogram) if max_abs_value is None else _symmetric_denormalize(spectrogram, max_abs_value= max_abs_value) 26 | S = _db_to_amp(spectrogram + ref_level_db) # Convert back to linear 27 | return inv_preemphasis(_griffin_lim(S ** power, num_freq, hop_length, win_length, sample_rate, griffin_lim_iters= griffin_lim_iters)) # Reconstruct phase 28 | 29 | def melspectrogram(y, num_freq, hop_length, win_length, num_mels, sample_rate, max_abs_value = None, spectral_subtract= False): 30 | M = _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract) 31 | S = _amp_to_db(_linear_to_mel(M, num_freq, num_mels, sample_rate)) 32 | return _normalize(S) if max_abs_value is None else _symmetric_normalize(S, max_abs_value= max_abs_value) 33 | 34 | def spectrogram_and_mel(y, num_freq, hop_length, win_length, sample_rate, spect_ref_level_db = 20, num_mels= 80, max_abs_mels = None, spectral_subtract= False): 35 | M = _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract) 36 | spect_S = _normalize(_amp_to_db(M) - spect_ref_level_db) 37 | mel_S = _amp_to_db(_linear_to_mel(M, num_freq, num_mels, sample_rate)) 38 | mel_S = _normalize(mel_S) if max_abs_mels is None else _symmetric_normalize(mel_S, max_abs_value= max_abs_mels) 39 | 40 | return spect_S, mel_S 41 | 42 | def mfcc(y, num_freq, num_mfcc, hop_length, win_length, sample_rate, use_energy= False): 43 | n_fft = (num_freq - 1) * 2 44 | mfcc_Array = librosa.feature.mfcc(y, sr= sample_rate, n_mfcc= num_mfcc + 1, n_fft= n_fft, hop_length= hop_length, win_length= win_length) 45 | mfcc_Array = mfcc_Array[:-1] if use_energy else mfcc_Array[1:] 46 | 47 | return mfcc_Array 48 | 49 | def _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract= False): 50 | D = _stft(preemphasis(y), num_freq, hop_length, win_length, sample_rate) 51 | M = np.abs(D) 52 | if spectral_subtract: 53 | M = np.clip(M - np.mean(M, axis= 1, keepdims= True) / 10, a_min= 0.0, a_max= np.inf) 54 | 55 | return M 56 | 57 | def _griffin_lim(S, num_freq, hop_length, win_length, sample_rate, griffin_lim_iters = 60): 58 | '''librosa implementation of Griffin-Lim 59 | Based on https://github.com/librosa/librosa/issues/434 60 | ''' 61 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 62 | S_complex = np.abs(S).astype(np.complex) 63 | y = _istft(S_complex * angles, num_freq, hop_length, win_length, sample_rate) 64 | 65 | for _ in range(griffin_lim_iters): 66 | angles = np.exp(1j * np.angle(_stft(y, num_freq, hop_length, win_length, sample_rate))) 67 | y = _istft(S_complex * angles, num_freq, 
hop_length, win_length, sample_rate) 68 | return y 69 | 70 | def _stft(y, num_freq, hop_length, win_length, sample_rate): 71 | n_fft = (num_freq - 1) * 2 72 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 73 | 74 | def _istft(y, num_freq, hop_length, win_length, sample_rate): 75 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 76 | 77 | def _linear_to_mel(spectrogram, num_freq, num_mels, sample_rate): 78 | _mel_basis = _build_mel_basis(num_freq, num_mels, sample_rate) 79 | return np.dot(_mel_basis, spectrogram) 80 | 81 | def _build_mel_basis(num_freq, num_mels, sample_rate): 82 | n_fft = (num_freq - 1) * 2 83 | return librosa.filters.mel(sample_rate, n_fft, n_mels=num_mels) 84 | 85 | 86 | def _amp_to_db(x): 87 | return 20 * np.log10(np.maximum(1e-5, x)) 88 | 89 | def _db_to_amp(x): 90 | return np.power(10.0, x * 0.05) 91 | 92 | def _normalize(S, min_level_db = -100): 93 | return np.clip((S - min_level_db) / -min_level_db, 0, 1) 94 | 95 | def _symmetric_normalize(S, min_level_db = -100, max_abs_value = 4): 96 | return np.clip((2 * max_abs_value) * ((S - min_level_db) / (-min_level_db)) - max_abs_value, -max_abs_value, max_abs_value) 97 | 98 | def _denormalize(S, min_level_db = -100): 99 | return (np.clip(S, 0, 1) * -min_level_db) + min_level_db 100 | 101 | def _symmetric_denormalize(S, min_level_db = -100, max_abs_value = 4): 102 | return ((np.clip(S, -max_abs_value, max_abs_value) + max_abs_value) / (2 * max_abs_value) * -min_level_db) + min_level_db -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_0.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_0.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_1.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_2.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_3.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_4.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_5.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_5.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_6.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_7.PNG -------------------------------------------------------------------------------- /Example_Results/GST/20200506.001527.GST.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/GST/20200506.001527.GST.PNG -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_0.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_0.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_1.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_1.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_2.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_2.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_3.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_3.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_4.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_4.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_5.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_5.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_6.WAV: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_6.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_7.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_7.WAV -------------------------------------------------------------------------------- /Feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json, os, time, pickle, librosa 3 | from collections import deque 4 | from threading import Thread 5 | from random import shuffle 6 | 7 | from Pattern_Generator import Mel_Generate 8 | 9 | 10 | with open('Hyper_Parameters.json', 'r') as f: 11 | hp_Dict = json.load(f) 12 | 13 | class Feeder: 14 | def __init__(self, is_Training= False): 15 | self.is_Training = is_Training 16 | 17 | self.Metadata_Load() 18 | 19 | if self.is_Training: 20 | self.pattern_Queue = deque() 21 | pattern_Generate_Thread = Thread(target= self.Pattern_Generate) 22 | pattern_Generate_Thread.daemon = True 23 | pattern_Generate_Thread.start() 24 | 25 | def Metadata_Load(self): 26 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 27 | self.token_Index_Dict = json.load(f) 28 | 29 | if self.is_Training: 30 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], hp_Dict['Train']['Metadata_File']).replace('\\', '/'), 'rb') as f: 31 | self.metadata_Dict = pickle.load(f) 32 | 33 | if not all([ 34 | self.token_Index_Dict[key] == self.metadata_Dict['Token_Index_Dict'][key] 35 | for key in self.token_Index_Dict.keys() 36 | ]): 37 | raise ValueError('The token information of metadata information and hyper parameter is not consistent.') 38 | elif not all([ 39 | self.metadata_Dict['Spectrogram_Dim'] == hp_Dict['Sound']['Spectrogram_Dim'], 40 | self.metadata_Dict['Mel_Dim'] == hp_Dict['Sound']['Mel_Dim'], 41 | self.metadata_Dict['Frame_Shift'] == hp_Dict['Sound']['Frame_Shift'], 42 | self.metadata_Dict['Frame_Length'] == hp_Dict['Sound']['Frame_Length'], 43 | self.metadata_Dict['Sample_Rate'] == hp_Dict['Sound']['Sample_Rate'], 44 | self.metadata_Dict['Max_Abs_Mel'] == hp_Dict['Sound']['Max_Abs_Mel'], 45 | ]): 46 | raise ValueError('The metadata information and hyper parameter setting are not consistent.') 47 | 48 | def Pattern_Generate(self): 49 | min_Mel_Length = hp_Dict['Train']['Min_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000 50 | max_Mel_Length = hp_Dict['Train']['Max_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000 51 | 52 | path_List = [ 53 | (path, self.metadata_Dict['Mel_Length_Dict'][path]) 54 | for path in self.metadata_Dict['File_List'] 55 | if self.metadata_Dict['Mel_Length_Dict'][path] >= min_Mel_Length and self.metadata_Dict['Mel_Length_Dict'][path] <= max_Mel_Length 56 | ] 57 | 58 | print( 59 | 'Train pattern info', '\n', 60 | 'Total pattern count: {}'.format(len(self.metadata_Dict['Mel_Length_Dict'])), '\n', 61 | 'Use pattern count: {}'.format(len(path_List)), '\n', 62 | 'Excluded pattern count: {}'.format(len(self.metadata_Dict['Mel_Length_Dict']) - len(path_List)) 63 | ) 64 | 65 | if hp_Dict['Train']['Pattern_Sorting']: 66 | path_List = [file_Name for file_Name, _ in sorted(path_List, key=lambda x: x[1])] 67 | else: 68 | path_List = [file_Name 
for file_Name, _ in path_List] 69 | 70 | while True: 71 | if not hp_Dict['Train']['Pattern_Sorting']: 72 | shuffle(path_List) 73 | 74 | path_Batch_List = [ 75 | path_List[x:x + hp_Dict['Train']['Batch_Size']] 76 | for x in range(0, len(path_List), hp_Dict['Train']['Batch_Size']) 77 | ] 78 | if hp_Dict['Train']['Sequential_Pattern']: 79 | path_Batch_List = path_Batch_List[0:2] + list(reversed(path_Batch_List)) #Batch size의 적절성을 위한 코드. 10회 이상 되면 문제 없음 80 | else: 81 | shuffle(path_Batch_List) 82 | 83 | batch_Index = 0 84 | while batch_Index < len(path_Batch_List): 85 | if len(self.pattern_Queue) >= hp_Dict['Train']['Max_Pattern_Queue']: 86 | time.sleep(0.1) 87 | continue 88 | 89 | pattern_Count = len(path_Batch_List[batch_Index]) 90 | 91 | mel_List = [] 92 | token_List = [] 93 | spectrogram_List = [] 94 | 95 | for file_Path in path_Batch_List[batch_Index]: 96 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], file_Path).replace('\\', '/'), 'rb') as f: 97 | pattern_Dict = pickle.load(f) 98 | 99 | mel_List.append(pattern_Dict['Mel']) 100 | token_List.append(pattern_Dict['Token']) 101 | spectrogram_List.append(pattern_Dict['Spectrogram']) 102 | 103 | max_Mel_Length = max([mel.shape[0] for mel in mel_List]) 104 | max_Token_Length = max([token.shape[0] for token in token_List]) 105 | max_Spectrogram_Length = max([spect.shape[0] for spect in spectrogram_List]) 106 | 107 | new_Mel_Pattern = np.zeros( 108 | shape=(pattern_Count, max_Mel_Length, hp_Dict['Sound']['Mel_Dim']), 109 | dtype= np.float32 110 | ) 111 | new_Token_Pattern = np.zeros( 112 | shape=(pattern_Count, max_Token_Length), 113 | dtype= np.int32 114 | ) + self.token_Index_Dict[''] 115 | new_Spectrogram_Pattern = np.zeros( 116 | shape=(pattern_Count, max_Spectrogram_Length, hp_Dict['Sound']['Spectrogram_Dim']), 117 | dtype= np.float32 118 | ) 119 | 120 | for pattern_Index, (mel, token, spect) in enumerate(zip(mel_List, token_List, spectrogram_List)): 121 | new_Mel_Pattern[pattern_Index, :mel.shape[0]] = mel 122 | new_Token_Pattern[pattern_Index, :token.shape[0]] = token 123 | new_Spectrogram_Pattern[pattern_Index, :spect.shape[0]] = spect 124 | 125 | new_Mel_Pattern = np.hstack([ 126 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32), 127 | new_Mel_Pattern 128 | ]) #initial frame 129 | new_Spectrogram_Pattern = np.hstack([ 130 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Spectrogram_Dim']), dtype= np.float32), 131 | new_Spectrogram_Pattern 132 | ]) #initial frame 133 | 134 | padded_Length = np.maximum(new_Mel_Pattern.shape[1], new_Spectrogram_Pattern.shape[1]) 135 | padded_Length = int(np.ceil(padded_Length / hp_Dict['Step_Reduction']) * hp_Dict['Step_Reduction']) 136 | new_Mel_Pattern = np.hstack([ 137 | new_Mel_Pattern, 138 | np.zeros(shape=(pattern_Count, padded_Length - new_Mel_Pattern.shape[1] + 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32) 139 | ]) # +1 is initial frame. This frame is removed when loss calc. 140 | new_Spectrogram_Pattern = np.hstack([ 141 | new_Spectrogram_Pattern, 142 | np.zeros(shape=(pattern_Count, padded_Length - new_Spectrogram_Pattern.shape[1] + 1, hp_Dict['Sound']['Spectrogram_Dim']), dtype= np.float32), 143 | ]) # +1 is initial frame. This frame is removed when loss calc. 
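                # Worked example of the padding above (illustrative numbers, not from the
                # shipped config, where Step_Reduction is 1): with Step_Reduction == 2 and a
                # longest mel of 99 frames, the prepended initial frame makes
                # new_Mel_Pattern.shape[1] == 100, padded_Length == ceil(100 / 2) * 2 == 100,
                # and the final width is padded_Length + 1 == 101; the extra frame is the
                # initial frame that is dropped again when the loss is computed.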
144 | 145 | self.pattern_Queue.append({ 146 | 'mels': new_Mel_Pattern, 147 | 'mel_lengths': np.array([mel.shape[0] for mel in mel_List], dtype=np.int32), 148 | 'tokens': new_Token_Pattern, 149 | 'token_lengths': np.array([token.shape[0] for token in token_List], dtype=np.int32), 150 | 'spectrograms': new_Spectrogram_Pattern, 151 | 'spectrogram_lengths': np.array([spect.shape[0] for spect in spectrogram_List], dtype=np.int32), 152 | }) 153 | 154 | batch_Index += 1 155 | 156 | def Get_Pattern(self): 157 | while len(self.pattern_Queue) == 0: #When training speed is faster than making pattern, model should be wait. 158 | time.sleep(0.01) 159 | return self.pattern_Queue.popleft() 160 | 161 | def Get_Inference_Pattern(self, sentence_List, wav_List_for_GST= None): 162 | pattern_Count = len(sentence_List) 163 | 164 | sentence_List = [sentence.upper().strip() for sentence in sentence_List] 165 | 166 | token_List = [ 167 | np.array( 168 | [self.token_Index_Dict['']] + 169 | [self.token_Index_Dict[letter] for letter in sentence] + 170 | [self.token_Index_Dict['']], 171 | dtype= np.int32 172 | ) 173 | for sentence in sentence_List 174 | ] 175 | max_Token_Length = max([token.shape[0] for token in token_List]) 176 | 177 | new_Token_Pattern = np.zeros( 178 | shape=(pattern_Count, max_Token_Length), 179 | dtype= np.int32 180 | ) + self.token_Index_Dict[''] 181 | 182 | new_Initial_Mel_Pattern = np.zeros( 183 | shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), 184 | dtype= np.float32 185 | ) 186 | 187 | for pattern_Index, token in enumerate(token_List): 188 | new_Token_Pattern[pattern_Index, :token.shape[0]] = token 189 | 190 | pattern_Dict = { 191 | 'tokens': new_Token_Pattern, 192 | 'token_lengths': np.array([token.shape[0] for token in token_List], dtype=np.int32), 193 | 'initial_mels': new_Initial_Mel_Pattern 194 | } 195 | 196 | if hp_Dict['GST']['Use']: 197 | if wav_List_for_GST is None: 198 | print('GST is enabled, but no wav information.') 199 | return 200 | if not len(wav_List_for_GST) in [1, pattern_Count]: 201 | print('The length of wav_List_for_GST must be 1 or same to the length of sentence_List and wav_List_for_GST must be same.') 202 | return 203 | 204 | if len(wav_List_for_GST) == 1: 205 | mel = Mel_Generate(wav_List_for_GST[0], top_db= 60, range_Ignore= True) 206 | new_Mel_Pattern_for_GST = np.stack([mel] * pattern_Count, axis= 0) 207 | new_Mel_Length_for_GST = np.array([mel.shape[0]] * pattern_Count, dtype= np.int32) 208 | else: 209 | mel_List = [Mel_Generate(path, top_db= 15, range_Ignore= True) for path in wav_List_for_GST] 210 | max_Mel_Length = max([mel.shape[0] for mel in mel_List]) 211 | new_Mel_Pattern_for_GST = np.zeros( 212 | shape=(pattern_Count, max_Mel_Length, hp_Dict['Sound']['Mel_Dim']), 213 | dtype= np.float32 214 | ) 215 | for pattern_Index, mel in enumerate(mel_List): 216 | new_Mel_Pattern_for_GST[pattern_Index, :mel.shape[0]] = mel 217 | 218 | new_Mel_Length_for_GST = np.array([mel.shape[0] for mel in mel_List], dtype=np.int32) 219 | 220 | # GST does not need an initial frame. 
But for the same pattern input as the training, I add an initial frame 221 | pattern_Dict['mels_for_gst'] = np.hstack([ 222 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32), 223 | new_Mel_Pattern_for_GST 224 | ]) 225 | pattern_Dict['mel_lengths_for_gst'] = new_Mel_Length_for_GST 226 | 227 | return pattern_Dict 228 | 229 | def Get_Inference_GST_Pattern(self, wav_List): 230 | pattern_Count = len(wav_List) 231 | 232 | mel_List = [Mel_Generate(path, top_db= 60, range_Ignore= True) for path in wav_List] 233 | max_Mel_Length = max([mel.shape[0] for mel in mel_List]) 234 | new_Mel_Pattern = np.zeros( 235 | shape=(pattern_Count, max_Mel_Length, hp_Dict['Sound']['Mel_Dim']), 236 | dtype= np.float32 237 | ) 238 | for pattern_Index, mel in enumerate(mel_List): 239 | new_Mel_Pattern[pattern_Index, :mel.shape[0]] = mel 240 | 241 | new_Mel_Length = np.array([mel.shape[0] for mel in mel_List], dtype=np.int32) 242 | 243 | # GST does not need an initial frame. But for the same pattern input as the training, I add an initial frame 244 | pattern_Dict = { 245 | 'mels_for_gst': np.hstack([ 246 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32), 247 | new_Mel_Pattern 248 | ]), 249 | 'mel_lengths_for_gst': new_Mel_Length 250 | } 251 | 252 | return pattern_Dict 253 | 254 | 255 | if __name__ == "__main__": 256 | new_Feeder = Feeder(is_Training= True) 257 | x = new_Feeder.Get_Pattern() 258 | 259 | print(x['mels'].shape) 260 | print(x['spectrograms'].shape) 261 | print(x['tokens'].shape) 262 | print(x['mel_lengths'].shape) 263 | print(x['spectrogram_lengths'].shape) 264 | print(x['token_lengths'].shape) 265 | print(x['tokens']) 266 | 267 | print('######################################################') 268 | 269 | x = new_Feeder.Get_Inference_Pattern(sentence_List= [ 270 | 'The grass is always greener on the other side of the fence.', 271 | 'Strike while the iron is hot.' 
272 | ]) 273 | print(x['initial_mels'].shape) 274 | print(x['tokens'].shape) 275 | print(x['token_lengths'].shape) 276 | print(x['tokens']) 277 | 278 | # while True: 279 | # time.sleep(1) 280 | # print(new_Feeder.Get_Pattern()) 281 | -------------------------------------------------------------------------------- /Figures/Structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Figures/Structure.png -------------------------------------------------------------------------------- /Get_Path.py: -------------------------------------------------------------------------------- 1 | import os 2 | from random import sample 3 | 4 | def Get_Path(sample_count= 50): 5 | path_List = [ 6 | ('LJ(F)', 'D:/Pattern/ENG/LJSpeech/wavs'), 7 | ('CLB(F)', 'D:/Pattern/ENG/FastVox/cmu_us_clb_arctic/wav'), 8 | ('SLT(F)', 'D:/Pattern/ENG/FastVox/cmu_us_slt_arctic/wav'), 9 | ('AWB(M)', 'D:/Pattern/ENG/FastVox/cmu_us_awb_arctic/wav'), 10 | ('BDL(M)', 'D:/Pattern/ENG/FastVox/cmu_us_bdl_arctic/wav'), 11 | ('JMK(M)', 'D:/Pattern/ENG/FastVox/cmu_us_jmk_arctic/wav'), 12 | ('KSP(M)', 'D:/Pattern/ENG/FastVox/cmu_us_ksp_arctic/wav'), 13 | ('RMS(M)', 'D:/Pattern/ENG/FastVox/cmu_us_rms_arctic/wav'), 14 | ] 15 | 16 | wav_List = [] 17 | tag_List = [] 18 | for tag, path in path_List: 19 | for root, _, files in os.walk(path): 20 | for file in sample(files, sample_count): 21 | wav_List.append(os.path.join(root, file).replace('\\', '/')) 22 | tag_List.append(tag) 23 | 24 | return wav_List, tag_List 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /Hyper_Parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Sound": { 3 | "Spectrogram_Dim": 513, 4 | "Mel_Dim": 80, 5 | "Frame_Length": 1024, 6 | "Frame_Shift": 256, 7 | "Sample_Rate": 16000, 8 | "Max_Abs_Mel": 4 9 | }, 10 | 11 | "Token_JSON_Path": "Token_Index_Dict.ENG.json", 12 | 13 | "GST": { 14 | "Use": true, 15 | "Reference_Encoder": { 16 | "Conv": { 17 | "Filters": [32, 32, 64, 64, 128, 128], 18 | "Kernel_Size": [3, 3, 3, 3, 3, 3], 19 | "Strides": [2, 2, 2, 2, 2, 2] 20 | }, 21 | "RNN": { 22 | "Size": 128 23 | }, 24 | "Dense": { 25 | "Size": 128 26 | } 27 | }, 28 | "Style_Token": { 29 | "Size": 16, 30 | "Embedding": { 31 | "Size": 256 32 | }, 33 | "Attention": { 34 | "Head": 4, 35 | "Size": 128 36 | } 37 | } 38 | }, 39 | 40 | "Tacotron1": { 41 | "Encoder": { 42 | "Embedding": { 43 | "Size": 256 44 | }, 45 | "Prenet": { 46 | "Size": [256, 128], 47 | "Dropout_Rate": 0.5 48 | }, 49 | "CBHG": { 50 | "Conv_Bank": { 51 | "Stack_Count": 16, 52 | "Filters": 128 53 | }, 54 | "Pool": { 55 | "Pool_Size": 2, 56 | "Strides": 1 57 | }, 58 | "Conv1D": { 59 | "Filters": [128, 128], 60 | "Kernel_Size": [3, 3] 61 | }, 62 | "Highwaynet": { 63 | "Count": 4, 64 | "Size": 128 65 | }, 66 | "RNN": { 67 | "Size": 128, 68 | "Zoneout": 0.0 69 | } 70 | } 71 | }, 72 | "Decoder": { 73 | "Prenet": { 74 | "Size": [256, 128], 75 | "Dropout_Rate": 0.5 76 | }, 77 | "Pre_RNN": { 78 | "Size": [256], 79 | "Zoneout": 0.0 80 | }, 81 | "Attention": { 82 | "Type": ["SMA"], 83 | "Size": [128] 84 | }, 85 | "Post_RNN": { 86 | "Count": 2, 87 | "Size": 256, 88 | "Zoneout": 0.0 89 | } 90 | } 91 | }, 92 | 93 | "Tacotron2": { 94 | "Encoder": { 95 | "Embedding": { 96 | "Size": 512 97 | }, 98 | "Conv": { 99 | "Filters": [512, 512, 512], 100 | "Kernel_Size": [5, 5, 5], 101 | "Strides": [1, 1, 
1], 102 | "Dropout_Rate": 0.5 103 | }, 104 | "RNN": { 105 | "Size": 256, 106 | "Zoneout": 0.0 107 | } 108 | }, 109 | "Decoder": { 110 | "Prenet": { 111 | "Size": [256, 256], 112 | "Dropout_Rate": 0.5 113 | }, 114 | "RNN": { 115 | "Size": [1024, 1024], 116 | "Zoneout": 0.0 117 | }, 118 | "Attention": { 119 | "Type": "SMA", 120 | "Size": 128 121 | }, 122 | "Conv": { 123 | "Filters": [512, 512, 512, 512], 124 | "Kernel_Size": [5, 5, 5, 5], 125 | "Strides": [1, 1, 1, 1], 126 | "Dropout_Rate": 0.5 127 | } 128 | } 129 | }, 130 | 131 | "Step_Reduction": 1, 132 | "Max_Step": 1000, 133 | 134 | "Vocoder_Taco1": { 135 | "CBHG": { 136 | "Conv_Bank": { 137 | "Stack_Count": 8, 138 | "Filters": 256 139 | }, 140 | "Pool": { 141 | "Pool_Size": 2, 142 | "Strides": 1 143 | }, 144 | "Conv1D": { 145 | "Filters": [128, 128], 146 | "Kernel_Size": [3, 3] 147 | }, 148 | "Highwaynet": { 149 | "Count": 4, 150 | "Size": 128 151 | }, 152 | "RNN": { 153 | "Size": 256, 154 | "Zoneout": 0.0 155 | } 156 | }, 157 | "Griffin-Lim_Iter": 60 158 | }, 159 | 160 | "Train": { 161 | "Pattern_Path": "C:/Pattern/GST.Pattern.LJFV", 162 | "Metadata_File": "METADATA.PICKLE", 163 | "Batch_Size": 24, 164 | "Pattern_Sorting": true, 165 | "Min_Wav_Length": 500, 166 | "Max_Wav_Length": 10000, 167 | "Max_Pattern_Queue": 50, 168 | "Initial_Learning_Rate": 1e-3, 169 | "Min_Learning_Rate": 1e-5, 170 | "ADAM": { 171 | "Beta1": 0.9, 172 | "Beta2": 0.999, 173 | "Epsilon": 1e-7 174 | }, 175 | "Use_L2_Loss": true, 176 | "Inference_Timing": 1000, 177 | "Checkpoint_Save_Timing": 1000, 178 | 179 | "Sequential_Pattern": false, 180 | "Initial_Inference": true 181 | }, 182 | 183 | 184 | "Taco_Version": 2, 185 | "Use_Mixed_Precision": false, 186 | "Inference_Cut": true, 187 | "Inference_Path": "D:/GST.Results/Inference", 188 | "Checkpoint_Path": "D:/GST.Results/Checkpoint", 189 | "Device": "0" 190 | } -------------------------------------------------------------------------------- /Inference_Sentence_for_Training.txt: -------------------------------------------------------------------------------- 1 | The grass is always greener on the other side of the fence. 2 | Strike while the iron is hot. 3 | A creative artist works on his next composition because he was not satisfied with his previous one. 4 | You cannot make an omelet without breaking a few eggs. 5 | Death is like a fisherman who catches fish in his net and leaves them for a while in the water. The fish is still swimming but the net is around him, and the fisherman will draw him up. 6 | A man who marries a woman to educate her falls a victim to the same fallacy as the woman who marries a man to reform him. 7 | Birds of a feather flock together. 8 | Too many cooks in the kitchen spoil the broth. 
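The 'Sound' and 'Train' blocks of Hyper_Parameters.json above determine the framing quantities that Audio.py and Feeder.py derive at run time; a small sketch of those derivations, with the values implied by the configuration shown:

import json

with open('Hyper_Parameters.json', 'r') as f:
    hp_Dict = json.load(f)

# STFT size used in Audio.py: (513 - 1) * 2 == 1024
n_fft = (hp_Dict['Sound']['Spectrogram_Dim'] - 1) * 2

# Mel-frame bounds used by Feeder.Pattern_Generate (wav lengths are in milliseconds):
# 500 * 16000 / 256 / 1000 == 31.25 frames, 10000 * 16000 / 256 / 1000 == 625.0 frames
min_Mel_Length = hp_Dict['Train']['Min_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000
max_Mel_Length = hp_Dict['Train']['Max_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000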
-------------------------------------------------------------------------------- /Inference_Wav_for_Training.txt: -------------------------------------------------------------------------------- 1 | ./Wav_for_Inference/FV.AWB.arctic_a0001.wav 2 | ./Wav_for_Inference/FV.BDL.arctic_a0002.wav 3 | ./Wav_for_Inference/FV.CLB.arctic_a0003.wav 4 | ./Wav_for_Inference/FV.JMK.arctic_a0004.wav 5 | ./Wav_for_Inference/FV.KSP.arctic_a0005.wav 6 | ./Wav_for_Inference/FV.RMS.arctic_a0006.wav 7 | ./Wav_for_Inference/FV.SLT.arctic_a0007.wav 8 | ./Wav_for_Inference/LJ.LJ050-0278.wav -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Heejo You 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Heejo You 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.mixed_precision import experimental as mixed_precision 3 | import numpy as np 4 | import json, os, time, argparse 5 | from threading import Thread 6 | import matplotlib 7 | matplotlib.use('agg') 8 | import matplotlib.pyplot as plt 9 | from datetime import datetime 10 | 11 | from ProgressBar import progress 12 | from Feeder import Feeder 13 | from Modules.GST import Style_Token_Layer, GST_Concated_Encoder 14 | from Audio import inv_spectrogram 15 | from scipy.io import wavfile 16 | 17 | with open('Hyper_Parameters.json', 'r') as f: 18 | hp_Dict = json.load(f) 19 | 20 | # if hp_Dict['Taco_Version'] == 1: 21 | # import Modules_Taco1 as Modules 22 | # elif hp_Dict['Taco_Version'] == 2: 23 | # import Modules_Taco2 as Modules 24 | # else: 25 | # raise ValueError('Unexpected tactoron version hyperparameters: {}'.format(hp_Dict['Version'])) 26 | from Modules import Taco2 as Modules 27 | 28 | if not hp_Dict['Device'] is None: 29 | os.environ["CUDA_VISIBLE_DEVICES"]= hp_Dict['Device'] 30 | 31 | if hp_Dict['Use_Mixed_Precision']: 32 | policy = mixed_precision.Policy('mixed_float16') 33 | else: 34 | policy = mixed_precision.Policy('float32') 35 | mixed_precision.set_policy(policy) 36 | 37 | class GST_Tacotron: 38 | def __init__(self, is_Training= False): 39 | self.feeder = Feeder(is_Training= is_Training) 40 | self.Model_Generate() 41 | 42 | def Model_Generate(self): 43 | input_Dict = {} 44 | layer_Dict = {} 45 | tensor_Dict = {} 46 | 47 | input_Dict['Mel'] = tf.keras.layers.Input( 48 | shape=[None, hp_Dict['Sound']['Mel_Dim']], 49 | dtype= tf.as_dtype(policy.compute_dtype) 50 | ) 51 | input_Dict['Mel_Length'] = tf.keras.layers.Input( 52 | shape=[], 53 | dtype= tf.int32 54 | ) 55 | input_Dict['Token'] = tf.keras.layers.Input( 56 | shape=[None,], 57 | dtype= tf.int32 58 | ) 59 | input_Dict['Token_Length'] = tf.keras.layers.Input( 60 | shape=[], 61 | dtype= tf.int32 62 | ) 63 | input_Dict['Spectrogram'] = tf.keras.layers.Input( 64 | shape=[None, hp_Dict['Sound']['Spectrogram_Dim']], 65 | dtype= tf.as_dtype(policy.compute_dtype) 66 | ) 67 | input_Dict['Spectrogram_Length'] = tf.keras.layers.Input( 68 | shape=[], 69 | dtype= tf.int32 70 | ) 71 | if hp_Dict['GST']['Use']: 72 | input_Dict['GST_Mel'] = tf.keras.layers.Input( 73 | shape=[None, hp_Dict['Sound']['Mel_Dim']], 74 | dtype= tf.as_dtype(policy.compute_dtype) 75 | ) 76 | 77 | layer_Dict['Encoder'] = Modules.Encoder() 78 | layer_Dict['Decoder'] = Modules.Decoder() 79 | layer_Dict['Vocoder_Taco1'] = Modules.Vocoder_Taco1() 80 | if hp_Dict['GST']['Use']: 81 | layer_Dict['Style_Token_Layer'] = Style_Token_Layer() 82 | layer_Dict['GST_Concated_Encoder'] = GST_Concated_Encoder() 83 | 84 | 85 | tensor_Dict['Train', 'Encoder'] = layer_Dict['Encoder']( 86 | input_Dict['Token'], 87 | training= True 88 | ) 89 | if hp_Dict['GST']['Use']: 90 | tensor_Dict['Train', 'GST'] = layer_Dict['Style_Token_Layer']([ 91 | input_Dict['GST_Mel'], 92 | input_Dict['Mel_Length'] 93 | ]) 94 | tensor_Dict['Train', 'Encoder'] = layer_Dict['GST_Concated_Encoder']([ 95 | tensor_Dict['Train', 'Encoder'], 96 | tensor_Dict['Train', 'GST'] 97 | ]) 98 | 99 | tensor_Dict['Train', 'Export_Pre_Mel'], tensor_Dict['Train', 'Export_Mel'], tensor_Dict['Train', 'Stop_Token'], _ = layer_Dict['Decoder']( 100 | [tensor_Dict['Train', 'Encoder'], input_Dict['Mel']], 101 | 
training= True 102 | ) 103 | tensor_Dict['Train', 'Export_Spectrogram'] = layer_Dict['Vocoder_Taco1']( 104 | tensor_Dict['Train', 'Export_Mel'], 105 | training= True 106 | ) 107 | 108 | tensor_Dict['Inference', 'Encoder'] = layer_Dict['Encoder']( 109 | input_Dict['Token'], 110 | training= False 111 | ) 112 | if hp_Dict['GST']['Use']: 113 | tensor_Dict['Inference', 'GST'] = layer_Dict['Style_Token_Layer']([ 114 | input_Dict['GST_Mel'], 115 | input_Dict['Mel_Length'] 116 | ]) 117 | tensor_Dict['Inference', 'Encoder'] = layer_Dict['GST_Concated_Encoder']([ 118 | tensor_Dict['Inference', 'Encoder'], 119 | tensor_Dict['Inference', 'GST'] 120 | ]) 121 | 122 | _, tensor_Dict['Inference', 'Export_Mel'], tensor_Dict['Inference', 'Stop_Token'], tensor_Dict['Inference', 'Alignment'] = layer_Dict['Decoder']( 123 | [tensor_Dict['Inference', 'Encoder'], input_Dict['Mel']], 124 | training= False 125 | ) 126 | tensor_Dict['Inference', 'Export_Spectrogram'] = layer_Dict['Vocoder_Taco1']( 127 | tensor_Dict['Inference', 'Export_Mel'], 128 | training= False 129 | ) 130 | 131 | self.model_Dict = {} 132 | self.model_Dict['Train'] = tf.keras.Model( 133 | inputs=[ 134 | input_Dict['Mel'], 135 | input_Dict['Token'], 136 | input_Dict['Spectrogram'] 137 | ] + ([input_Dict['GST_Mel'], input_Dict['Mel_Length']] if hp_Dict['GST']['Use'] else []), 138 | outputs= [ 139 | tensor_Dict['Train', 'Export_Pre_Mel'], 140 | tensor_Dict['Train', 'Export_Mel'], 141 | tensor_Dict['Train', 'Stop_Token'], 142 | tensor_Dict['Train', 'Export_Spectrogram'] 143 | ] 144 | ) 145 | self.model_Dict['Inference'] = tf.keras.Model( 146 | inputs=[ 147 | input_Dict['Mel'], 148 | input_Dict['Token'] 149 | ] + ([input_Dict['GST_Mel'], input_Dict['Mel_Length']] if hp_Dict['GST']['Use'] else []), 150 | outputs= [ 151 | tensor_Dict['Inference', 'Export_Mel'], 152 | tensor_Dict['Inference', 'Stop_Token'], 153 | tensor_Dict['Inference', 'Export_Spectrogram'], 154 | tensor_Dict['Inference', 'Alignment'] 155 | ] 156 | ) 157 | 158 | self.model_Dict['Train'].summary() 159 | self.model_Dict['Inference'].summary() 160 | 161 | if hp_Dict['GST']['Use']: 162 | self.model_Dict['GST'] = tf.keras.Model( 163 | inputs= [ 164 | input_Dict['GST_Mel'], 165 | input_Dict['Mel_Length'] 166 | ], 167 | outputs= tensor_Dict['Inference', 'GST'] 168 | ) 169 | self.model_Dict['GST'].summary() 170 | 171 | learning_Rate = Modules.ExponentialDecay( 172 | initial_learning_rate= hp_Dict['Train']['Initial_Learning_Rate'], 173 | decay_steps= 50000, 174 | decay_rate= 0.1, 175 | min_learning_rate= hp_Dict['Train']['Min_Learning_Rate'], 176 | staircase= False 177 | ) 178 | 179 | self.optimizer = tf.keras.optimizers.Adam( 180 | learning_rate= learning_Rate, 181 | beta_1= hp_Dict['Train']['ADAM']['Beta1'], 182 | beta_2= hp_Dict['Train']['ADAM']['Beta2'], 183 | epsilon= hp_Dict['Train']['ADAM']['Epsilon'], 184 | ) 185 | 186 | self.checkpoint = tf.train.Checkpoint( 187 | optimizer= self.optimizer, 188 | model= self.model_Dict['Train'] 189 | ) 190 | 191 | # @tf.function( 192 | # input_signature=[ 193 | # tf.TensorSpec(shape=[None, None, hp_Dict['Sound']['Mel_Dim']], dtype= tf.as_dtype(policy.compute_dtype)), 194 | # tf.TensorSpec(shape=[None,], dtype=tf.int32), 195 | # tf.TensorSpec(shape=[None, None], dtype=tf.int32), 196 | # tf.TensorSpec(shape=[None,], dtype=tf.int32), 197 | # tf.TensorSpec(shape=[None, None, hp_Dict['Sound']['Spectrogram_Dim']], dtype= tf.as_dtype(policy.compute_dtype)), 198 | # tf.TensorSpec(shape=[None,], dtype=tf.int32) 199 | # ], 200 | # autograph= False, 201 | # 
experimental_relax_shapes= False 202 | # ) 203 | def Train_Step(self, mels, mel_lengths, tokens, token_lengths, spectrograms, spectrogram_lengths): 204 | with tf.GradientTape() as tape: 205 | pre_Mel_Logits, mel_Logits, stop_Logits, spectrogram_Logits = self.model_Dict['Train']( 206 | inputs= [mels, tokens, spectrograms] + ([mels, mel_lengths] if hp_Dict['GST']['Use'] else []), 207 | training= True 208 | ) 209 | 210 | pre_Mel_Loss = tf.reduce_mean(tf.abs(mels[:, 1:] - pre_Mel_Logits), axis= -1) 211 | mel_Loss = tf.reduce_mean(tf.abs(mels[:, 1:] - mel_Logits), axis= -1) 212 | spectrogram_Loss = tf.reduce_mean(tf.abs(spectrograms[:, 1:] - spectrogram_Logits), axis= -1) 213 | if hp_Dict['Train']['Use_L2_Loss']: 214 | mel_Loss += tf.reduce_mean(tf.pow(mels[:, 1:] - mel_Logits, 2), axis= -1) 215 | spectrogram_Loss += tf.reduce_mean(tf.pow(spectrograms[:, 1:] - spectrogram_Logits, 2), axis= -1) 216 | 217 | pre_Mel_Loss *= tf.sequence_mask( 218 | lengths= mel_lengths, 219 | maxlen= tf.shape(mel_Loss)[-1], 220 | dtype= tf.as_dtype(policy.compute_dtype) 221 | ) 222 | mel_Loss *= tf.sequence_mask( 223 | lengths= mel_lengths, 224 | maxlen= tf.shape(mel_Loss)[-1], 225 | dtype= tf.as_dtype(policy.compute_dtype) 226 | ) 227 | stop_Loss = tf.nn.sigmoid_cross_entropy_with_logits( 228 | labels= tf.sequence_mask( 229 | lengths= tf.math.ceil(mel_lengths / hp_Dict['Step_Reduction']), # stop > 0.5: Going, stop < 0.5: Done 230 | maxlen= tf.math.ceil(tf.shape(mel_Loss)[-1] / hp_Dict['Step_Reduction']), 231 | dtype= tf.as_dtype(policy.compute_dtype) 232 | ), 233 | logits= stop_Logits 234 | ) 235 | spectrogram_Loss *= tf.sequence_mask( 236 | lengths= spectrogram_lengths, 237 | maxlen= tf.shape(spectrogram_Loss)[-1], 238 | dtype= tf.as_dtype(policy.compute_dtype) 239 | ) 240 | 241 | loss = tf.reduce_mean(pre_Mel_Loss) + tf.reduce_mean(mel_Loss) + tf.reduce_mean(stop_Loss) + tf.reduce_mean(spectrogram_Loss) 242 | 243 | gradients = tape.gradient(loss, self.model_Dict['Train'].trainable_variables) 244 | self.optimizer.apply_gradients(zip(gradients, self.model_Dict['Train'].trainable_variables)) 245 | 246 | return loss 247 | 248 | # @tf.function 249 | def Inference_Step(self, tokens, token_lengths, initial_mels, mels_for_gst= None, mel_lengths_for_gst= None): 250 | mel_Logits, stop_Logits, spectrogram_Logits, alignments = self.model_Dict['Inference']( 251 | inputs= [initial_mels, tokens] + ([mels_for_gst, mel_lengths_for_gst] if hp_Dict['GST']['Use'] else []), 252 | training= False 253 | ) 254 | 255 | return mel_Logits, stop_Logits, spectrogram_Logits, alignments 256 | 257 | def Inference_GST_Step(self, mels_for_gst, mel_lengths_for_gst): 258 | if not hp_Dict['GST']['Use']: 259 | raise NotImplementedError('GST is not used') 260 | gst = self.model_Dict['GST']( 261 | inputs= [mels_for_gst, mel_lengths_for_gst], 262 | training= False 263 | ) 264 | 265 | return gst 266 | 267 | def Restore(self, checkpoint_File_Path= None): 268 | if checkpoint_File_Path is None: 269 | checkpoint_File_Path = tf.train.latest_checkpoint(hp_Dict['Checkpoint_Path']) 270 | 271 | if not os.path.exists('{}.index'.format(checkpoint_File_Path)): 272 | print('There is no checkpoint.') 273 | return 274 | 275 | self.checkpoint.restore(checkpoint_File_Path) 276 | print('Checkpoint \'{}\' is loaded.'.format(checkpoint_File_Path)) 277 | 278 | def Train(self): 279 | if not os.path.exists(os.path.join(hp_Dict['Inference_Path'], 'Hyper_Parameters.json')): 280 | os.makedirs(hp_Dict['Inference_Path'], exist_ok= True) 281 | with 
open(os.path.join(hp_Dict['Inference_Path'], 'Hyper_Parameters.json').replace("\\", "/"), "w") as f: 282 | json.dump(hp_Dict, f, indent= 4) 283 | 284 | def Save_Checkpoint(): 285 | os.makedirs(os.path.join(hp_Dict['Checkpoint_Path']).replace("\\", "/"), exist_ok= True) 286 | self.checkpoint.save( 287 | os.path.join( 288 | hp_Dict['Checkpoint_Path'], 289 | 'S_{}.CHECKPOINT.H5'.format(self.optimizer.iterations.numpy()) 290 | ).replace('\\', '/') 291 | ) 292 | 293 | def Run_Inference(): 294 | sentence_List = [] 295 | with open('Inference_Sentence_for_Training.txt', 'r') as f: 296 | for line in f.readlines(): 297 | sentence_List.append(line.strip()) 298 | 299 | if hp_Dict['GST']['Use']: 300 | wav_List_for_GST = [] 301 | with open('Inference_Wav_for_Training.txt', 'r') as f: 302 | for line in f.readlines(): 303 | wav_List_for_GST.append(line.strip()) 304 | else: 305 | wav_List_for_GST = None 306 | 307 | self.Inference(sentence_List, wav_List_for_GST) 308 | 309 | def Run_GST_Inference(): 310 | from Get_Path import Get_Path 311 | wav_List, tag_List = Get_Path(100) 312 | self.Inference_GST(wav_List, tag_List) 313 | 314 | # Save_Checkpoint() 315 | if hp_Dict['Train']['Initial_Inference']: 316 | Run_Inference() 317 | Run_GST_Inference() 318 | 319 | while True: 320 | start_Time = time.time() 321 | 322 | loss = self.Train_Step(**self.feeder.Get_Pattern()) 323 | if np.isnan(loss): 324 | raise ValueError('NaN loss') 325 | display_List = [ 326 | 'Time: {:0.3f}'.format(time.time() - start_Time), 327 | 'Step: {}'.format(self.optimizer.iterations.numpy()), 328 | 'LR: {:0.5f}'.format(self.optimizer.lr(self.optimizer.iterations.numpy() - 1)), 329 | 'Loss: {:0.5f}'.format(loss), 330 | ] 331 | print('\t\t'.join(display_List)) 332 | 333 | if self.optimizer.iterations.numpy() % hp_Dict['Train']['Checkpoint_Save_Timing'] == 0: 334 | Save_Checkpoint() 335 | 336 | if self.optimizer.iterations.numpy() % hp_Dict['Train']['Inference_Timing'] == 0: 337 | Run_Inference() 338 | 339 | if self.optimizer.iterations.numpy() % (hp_Dict['Train']['Inference_Timing'] * 10) == 0: 340 | Run_GST_Inference() 341 | 342 | def Inference(self, sentence_List, wav_List_for_GST= None, label= None): 343 | print('Inference running...') 344 | 345 | pattern_Dict = self.feeder.Get_Inference_Pattern(sentence_List, wav_List_for_GST) 346 | if pattern_Dict is None: 347 | print('Inference fail.') 348 | return 349 | mels, stops, spectrograms, alignments = self.Inference_Step( 350 | **pattern_Dict 351 | ) 352 | 353 | export_Inference_Thread = Thread( 354 | target= self.Export_Inference, 355 | args= [ 356 | sentence_List, 357 | mels.numpy(), 358 | stops.numpy(), 359 | spectrograms.numpy(), 360 | alignments.numpy(), 361 | label or datetime.now().strftime("%Y%m%d.%H%M%S") 362 | ] 363 | ) 364 | export_Inference_Thread.daemon = True 365 | export_Inference_Thread.start() 366 | 367 | return mels, stops, spectrograms, alignments 368 | 369 | def Export_Inference(self, sentence_List, mel_List, stop_List, spectrogram_List, alignment_List, label): 370 | os.makedirs(os.path.join(hp_Dict['Inference_Path'], 'Plot').replace("\\", "/"), exist_ok= True) 371 | os.makedirs(os.path.join(hp_Dict['Inference_Path'], 'Wav').replace("\\", "/"), exist_ok= True) 372 | 373 | for index, (sentence, mel, stop, spect, alignment) in enumerate(zip(sentence_List, mel_List, stop_List, spectrogram_List, alignment_List)): 374 | #matplotlib does not supprt float16 375 | mel = mel.astype(np.float32) 376 | stop = stop.astype(np.float32) 377 | spect = spect.astype(np.float32) 378 | alignment = 
alignment.astype(np.float32) 379 | 380 | slice_Index = np.argmax(stop < 0) if any(stop < 0) else stop.shape[0] # Check stop tokens 381 | 382 | new_Figure = plt.figure(figsize=(24, 6 * 5), dpi=100) 383 | plt.subplot2grid((5, 1), (0, 0)) 384 | plt.imshow(np.transpose(mel), aspect='auto', origin='lower') 385 | plt.title('Mel Sentence: {}'.format(sentence)) 386 | plt.colorbar() 387 | plt.subplot2grid((5, 1), (1, 0)) 388 | plt.imshow(np.transpose(spect), aspect='auto', origin='lower') 389 | plt.title('Spectrogram Sentence: {}'.format(sentence)) 390 | plt.colorbar() 391 | plt.subplot2grid((5, 1), (2, 0), rowspan=2) 392 | plt.imshow(np.transpose(alignment), aspect='auto', origin='lower') 393 | plt.title('Alignment Sentence: {}'.format(sentence)) 394 | plt.yticks( 395 | range(alignment.shape[1]), 396 | [''] + list(sentence) + [''], 397 | fontsize = 10 398 | ) 399 | plt.colorbar() 400 | plt.subplot2grid((5, 1), (4, 0)) 401 | plt.plot(stop) 402 | plt.axvline(x= slice_Index, linestyle='--', linewidth=1) 403 | plt.title('Stop token Sentence: {}'.format(sentence)) 404 | plt.colorbar() 405 | 406 | plt.tight_layout() 407 | plt.savefig( 408 | os.path.join(hp_Dict['Inference_Path'], 'Plot', '{}.IDX_{}.PNG'.format(label, index)).replace("\\", "/") 409 | ) 410 | plt.close(new_Figure) 411 | 412 | new_Sig = inv_spectrogram( 413 | spectrogram= np.transpose(spect[:np.maximum(1, slice_Index) * hp_Dict['Step_Reduction']]), 414 | num_freq= hp_Dict['Sound']['Spectrogram_Dim'], 415 | hop_length= hp_Dict['Sound']['Frame_Shift'], 416 | win_length= hp_Dict['Sound']['Frame_Length'], 417 | sample_rate= hp_Dict['Sound']['Sample_Rate'], 418 | max_abs_value= hp_Dict['Sound']['Max_Abs_Mel'], 419 | griffin_lim_iters= hp_Dict['Vocoder_Taco1']['Griffin-Lim_Iter'] 420 | ) 421 | wavfile.write( 422 | filename= os.path.join(hp_Dict['Inference_Path'], 'Wav', '{}.IDX_{}.WAV'.format(label, index)).replace("\\", "/"), 423 | data= (new_Sig * 32768).astype(np.int16), 424 | rate= hp_Dict['Sound']['Sample_Rate'] 425 | ) 426 | 427 | def Inference_GST(self, wav_List, tag_List, label= None): 428 | if not hp_Dict['GST']['Use']: 429 | raise NotImplementedError('GST is not used') 430 | 431 | print('GST Inference running...') 432 | gsts = self.Inference_GST_Step( 433 | **self.feeder.Get_Inference_GST_Pattern(wav_List) 434 | ) 435 | 436 | export_Inference_Thread = Thread( 437 | target= self.Export_GST, 438 | args= [ 439 | wav_List, 440 | tag_List, 441 | gsts, 442 | label or datetime.now().strftime("%Y%m%d.%H%M%S") 443 | ] 444 | ) 445 | export_Inference_Thread.daemon = True 446 | export_Inference_Thread.start() 447 | 448 | def Export_GST(self, wav_List, tag_List, gst_List, label): 449 | os.makedirs(os.path.join(hp_Dict['Inference_Path'], 'GST').replace("\\", "/"), exist_ok= True) 450 | 451 | title_Column_List = ['Wav', 'Tag'] + ['Unit_{}'.format(x) for x in range(gst_List[0].shape[0])] 452 | export_List = ['\t'.join(title_Column_List)] 453 | for wav_Path, tag, gst in zip(wav_List, tag_List, gst_List): 454 | new_Line_List = [wav_Path, tag] + [x for x in gst] 455 | new_Line_List = ['{}'.format(x) for x in new_Line_List] 456 | export_List.append('\t'.join(new_Line_List)) 457 | 458 | with open(os.path.join(hp_Dict['Inference_Path'], 'GST', '{}.GST.TXT'.format(label)).replace("\\", "/"), 'w') as f: 459 | f.write('\n'.join(export_List)) 460 | 461 | if __name__ == '__main__': 462 | new_Model = GST_Tacotron(is_Training= True) 463 | new_Model.Restore() 464 | new_Model.Train() -------------------------------------------------------------------------------- 
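Model.py's __main__ block above only covers training; a minimal sketch of standalone inference with a restored checkpoint, using the repository's own inference lists ('Manual_Test' is just an illustrative label, which otherwise defaults to a timestamp):

from Model import GST_Tacotron

new_Model = GST_Tacotron(is_Training= False)    # is_Training=False skips the training pattern queue
new_Model.Restore()                             # latest checkpoint under hp_Dict['Checkpoint_Path']
new_Model.Inference(
    sentence_List= ['Strike while the iron is hot.'],
    wav_List_for_GST= ['./Wav_for_Inference/LJ.LJ050-0278.wav'],    # required because GST.Use is true
    label= 'Manual_Test'
    )

Note that Export_Inference runs on a daemon thread, so a standalone script should stay alive (for example, with a short sleep loop) until the plots and wavs appear under Inference_Path.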
/Modules/Attention/Layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from scipy.special import comb, beta 4 | 5 | class DotProductAttention(tf.keras.layers.Attention): 6 | ''' 7 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L182-L303 8 | Changes 9 | 1. Attention size managing 10 | 2. Getting the attention history(scores). 11 | ''' 12 | def __init__(self, size, use_scale=False, **kwargs): 13 | super(DotProductAttention, self).__init__(use_scale= use_scale, **kwargs) 14 | self.size = size 15 | self.layer_Dict = { 16 | 'Query': tf.keras.layers.Dense(size), 17 | 'Value': tf.keras.layers.Dense(size), 18 | 'Key': tf.keras.layers.Dense(size) 19 | } 20 | 21 | def call(self, inputs, mask=None): 22 | self._validate_call_args(inputs=inputs, mask=mask) 23 | q = self.layer_Dict['Query'](inputs[0]) 24 | v = self.layer_Dict['Value'](inputs[1]) 25 | k = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else v 26 | q_mask = mask[0] if mask else None 27 | v_mask = mask[1] if mask else None 28 | scores = self._calculate_scores(query=q, key=k) 29 | if v_mask is not None: 30 | # Mask of shape [batch_size, 1, Tv]. 31 | v_mask = tf.expand_dims(v_mask, axis=-2) 32 | if self.causal: 33 | # Creates a lower triangular mask, so position i cannot attend to 34 | # positions j>i. This prevents the flow of information from the future 35 | # into the past. 36 | scores_shape = tf.shape(scores) 37 | # causal_mask_shape = [1, Tq, Tv]. 38 | causal_mask_shape = tf.concat( 39 | [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], 40 | axis=0) 41 | causal_mask = _lower_triangular_mask(causal_mask_shape) 42 | else: 43 | causal_mask = None 44 | scores_mask = _merge_masks(v_mask, causal_mask) 45 | result, attention_distribution = _apply_scores(scores=scores, value=v, scores_mask=scores_mask) 46 | if q_mask is not None: 47 | # Mask of shape [batch_size, Tq, 1]. 48 | q_mask = tf.expand_dims(q_mask, axis=-1) 49 | result *= tf.cast(q_mask, dtype=result.dtype) 50 | 51 | return result, attention_distribution 52 | 53 | def _calculate_scores(self, query, key): 54 | """Calculates attention scores as a query-key dot product. 55 | Args: 56 | query: Query tensor of shape `[batch_size, Tq, dim]`. 57 | key: Key tensor of shape `[batch_size, Tv, dim]`. 58 | Returns: 59 | Tensor of shape `[batch_size, Tq, Tv]`. 60 | """ 61 | scores = tf.matmul(query, key, transpose_b=True) 62 | 63 | if self.scale is not None: 64 | scores *= self.scale 65 | return scores 66 | 67 | class BahdanauAttention(tf.keras.layers.AdditiveAttention): 68 | ''' 69 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L307-L440 70 | This is for attention size managing and getting the attention history(scores). 
71 | ''' 72 | def __init__(self, size, use_scale=False, **kwargs): 73 | super(BahdanauAttention, self).__init__(use_scale= use_scale, **kwargs) 74 | self.size = size 75 | self.layer_Dict = { 76 | 'Query': tf.keras.layers.Dense(size), 77 | 'Value': tf.keras.layers.Dense(size), 78 | 'Key': tf.keras.layers.Dense(size) 79 | } 80 | 81 | def build(self, input_shape): 82 | if self.use_scale: 83 | self.scale = self.add_weight( 84 | name='scale', 85 | shape=[self.size], 86 | initializer= tf.initializers.glorot_uniform(), 87 | dtype=self.dtype, 88 | trainable=True) 89 | else: 90 | self.scale = None 91 | 92 | self.built = True 93 | 94 | def call(self, inputs, mask=None): 95 | self._validate_call_args(inputs=inputs, mask=mask) 96 | q = self.layer_Dict['Query'](inputs[0]) 97 | v = self.layer_Dict['Value'](inputs[1]) 98 | k = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else v 99 | q_mask = mask[0] if mask else None 100 | v_mask = mask[1] if mask else None 101 | scores = self._calculate_scores(query=q, key=k) #[Batch, T_q, T_k] 102 | if v_mask is not None: 103 | # Mask of shape [batch_size, 1, Tv]. 104 | v_mask = tf.expand_dims(v_mask, axis=-2) 105 | if self.causal: 106 | # Creates a lower triangular mask, so position i cannot attend to 107 | # positions j>i. This prevents the flow of information from the future 108 | # into the past. 109 | scores_shape = tf.shape(scores) 110 | # causal_mask_shape = [1, Tq, Tv]. 111 | causal_mask_shape = tf.concat( 112 | [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], 113 | axis=0) 114 | causal_mask = _lower_triangular_mask(causal_mask_shape) 115 | else: 116 | causal_mask = None 117 | scores_mask = _merge_masks(v_mask, causal_mask) 118 | 119 | result, attention_distribution = _apply_scores(scores=scores, value=v, scores_mask=scores_mask) 120 | if q_mask is not None: 121 | # Mask of shape [batch_size, Tq, 1]. 122 | q_mask = tf.expand_dims(q_mask, axis=-1) 123 | result *= tf.cast(q_mask, dtype=result.dtype) 124 | 125 | return result, attention_distribution 126 | 127 | def _calculate_scores(self, query, key): 128 | """Calculates attention scores as a nonlinear sum of query and key. 129 | Args: 130 | query: Query tensor of shape `[batch_size, Tq, dim]`. 131 | key: Key tensor of shape `[batch_size, Tv, dim]`. 132 | Returns: 133 | Tensor of shape `[batch_size, Tq, Tv]`. 134 | """ 135 | # Reshape tensors to enable broadcasting. 136 | # Reshape into [batch_size, Tq, 1, dim]. 137 | q_reshaped = tf.expand_dims(query, axis=-2) 138 | # Reshape into [batch_size, 1, Tv, dim]. 139 | k_reshaped = tf.expand_dims(key, axis=-3) 140 | if self.use_scale: 141 | scale = self.scale 142 | else: 143 | scale = 1. 144 | return tf.reduce_sum( 145 | scale * tf.tanh(q_reshaped + k_reshaped), axis=-1) 146 | 147 | class MultiHeadAttention(tf.keras.layers.Attention): 148 | ''' 149 | Refer1: DotProductAttention 150 | Refer2: https://github.com/Kyubyong/transformer/blob/master/modules.py 151 | ''' 152 | def __init__(self, num_heads, size, use_scale=False, **kwargs): 153 | super(MultiHeadAttention, self).__init__(use_scale= use_scale, **kwargs) 154 | 155 | if size % num_heads != 0: 156 | raise ValueError('size must be divisible by num_heads. 
(\'{}\' % \'{}\' != 0)'.format(size, num_heads)) 157 | 158 | self.num_heads = num_heads 159 | self.size = size 160 | self.use_scale = use_scale 161 | 162 | def build(self, input_shape): 163 | self.layer_Dict = { 164 | 'Query': tf.keras.layers.Dense(self.size), 165 | 'Value': tf.keras.layers.Dense(self.size), 166 | 'Key': tf.keras.layers.Dense(self.size), 167 | 'Layer_Normalization': Layer_Norm() 168 | } 169 | 170 | super(MultiHeadAttention, self).build(input_shape= input_shape) 171 | 172 | def call(self, inputs, mask=None): 173 | self._validate_call_args(inputs=inputs, mask=mask) 174 | q = self.layer_Dict['Query'](inputs[0]) # [batch_size, Tq, Att_Dim] 175 | v = self.layer_Dict['Value'](inputs[1]) # [batch_size, Tv, Att_Dim] 176 | k = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else v # [batch_size, Tv, Att_Dim] 177 | 178 | #Multihead 179 | q_split = tf.concat(tf.split(q, self.num_heads, axis= -1), axis= 0) # [batch_size * Heads, Tq, Att_Dim / Heads] 180 | v_split = tf.concat(tf.split(v, self.num_heads, axis= -1), axis= 0) # [batch_size * Heads, Tv, Att_Dim / Heads] 181 | k_split = tf.concat(tf.split(k, self.num_heads, axis= -1), axis= 0) # [batch_size * Heads, Tv, Att_Dim / Heads] 182 | 183 | q_mask = mask[0] if mask else None 184 | v_mask = mask[1] if mask else None 185 | 186 | scores = self._calculate_scores(query= q_split, key= k_split) 187 | if v_mask is not None: 188 | # Mask of shape [batch_size, 1, Tv]. 189 | v_mask = tf.expand_dims(v_mask, axis= -2) 190 | if self.causal: 191 | # Creates a lower triangular mask, so position i cannot attend to 192 | # positions j>i. This prevents the flow of information from the future 193 | # into the past. 194 | scores_shape = tf.shape(scores) 195 | # causal_mask_shape = [1, Tq, Tv]. 196 | causal_mask_shape = tf.concat( 197 | [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], 198 | axis=0) 199 | causal_mask = _lower_triangular_mask(causal_mask_shape) 200 | else: 201 | causal_mask = None 202 | scores_mask = _merge_masks(v_mask, causal_mask) 203 | result, attention_distribution = _apply_scores(scores=scores, value= v_split, scores_mask=scores_mask) #reslut: [batch_size * Heads, Tq, Att_Dim / Heads], attention_distribution: [batch_size * Heads, Tq, Tv] 204 | if q_mask is not None: 205 | # Mask of shape [batch_size, Tq, 1]. 206 | q_mask = tf.expand_dims(q_mask, axis=-1) 207 | result *= tf.cast(q_mask, dtype=result.dtype) 208 | 209 | result = tf.concat(tf.split(result, self.num_heads, axis= 0), axis= -1) # [batch_size, Tq, Att_Dim] 210 | 211 | result = self.layer_Dict['Layer_Normalization'](result + q) #Residual, layer normalization 212 | attention_distribution = tf.reduce_mean(tf.stack(tf.split(attention_distribution, self.num_heads, axis= 0), axis= 1), axis= 1) # [batch_size * Heads, Tq, Tv] -> [batch_size, Heads, Tq, Tv] -> [batch_size, Tq, Tv] 213 | 214 | return result, attention_distribution 215 | 216 | def _calculate_scores(self, query, key): 217 | """Calculates attention scores as a query-key dot product. 218 | Args: 219 | query: Query tensor of shape `[batch_size, Tq, dim]`. 220 | key: Key tensor of shape `[batch_size, Tv, dim]`. 221 | Returns: 222 | Tensor of shape `[batch_size, Tq, Tv]`. 
223 | """ 224 | scores = tf.matmul(query, key, transpose_b=True) 225 | 226 | if self.scale is not None: 227 | scores *= self.scale 228 | return scores 229 | 230 | def _apply_scores(scores, value, scores_mask=None): 231 | if scores_mask is not None: 232 | padding_mask = tf.logical_not(scores_mask) 233 | # Bias so padding positions do not contribute to attention distribution. 234 | scores -= 1.e9 * tf.cast(padding_mask, dtype= scores.dtype) 235 | attention_distribution = tf.nn.softmax(scores) 236 | 237 | return tf.matmul(attention_distribution, value), attention_distribution 238 | 239 | def _lower_triangular_mask(shape): 240 | """Creates a lower-triangular boolean mask over the last 2 dimensions.""" 241 | row_index = tf.cumsum( 242 | tf.ones(shape=shape, dtype=tf.int32), axis=-2) 243 | col_index = tf.cumsum( 244 | tf.ones(shape=shape, dtype=tf.int32), axis=-1) 245 | return tf.greater_equal(row_index, col_index) 246 | 247 | def _merge_masks(x, y): 248 | if x is None: 249 | return y 250 | if y is None: 251 | return x 252 | return tf.logical_and(x, y) 253 | 254 | class Layer_Norm(tf.keras.layers.Layer): 255 | ''' 256 | There are several restriction in 'tf.keras.layers.LayerNormalization'. 257 | ''' 258 | def __init__(self, epsilon= 1e-8): 259 | super(Layer_Norm, self).__init__() 260 | self.epsilon = epsilon 261 | 262 | def build(self, input_shape): 263 | self.beta = self.add_weight( 264 | name= 'beta', 265 | shape= input_shape[-1:], 266 | initializer= tf.zeros_initializer(), 267 | dtype= self.dtype, 268 | trainable= True 269 | ) 270 | self.gamma = self.add_weight( 271 | name= 'gamma', 272 | shape= input_shape[-1:], 273 | initializer= tf.ones_initializer(), 274 | dtype= self.dtype, 275 | trainable= True 276 | ) 277 | 278 | self.built = True 279 | 280 | def call(self, inputs): 281 | mean, variance = tf.nn.moments(inputs, [-1], keepdims= True) 282 | normalized = (inputs - mean) / ((variance + self.epsilon) ** .5) 283 | outputs = self.gamma * normalized + self.beta 284 | 285 | return outputs 286 | 287 | 288 | # Refer: https://github.com/begeekmyfriend/tacotron/blob/60d6932f510bf591acb25620290868900b5c0a41/models/attention.py 289 | class LocationSensitiveAttention(tf.keras.layers.AdditiveAttention): 290 | ''' 291 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L307-L440 292 | This is for attention size managing and getting the attention history(scores). 
293 | ''' 294 | def __init__( 295 | self, 296 | size, 297 | conv_filters, 298 | conv_kernel_size, 299 | conv_stride, 300 | smoothing= False, 301 | use_scale=False, 302 | cumulate_weights= True, 303 | **kwargs 304 | ): 305 | super(LocationSensitiveAttention, self).__init__(use_scale= use_scale, **kwargs) 306 | 307 | self.size = size 308 | self.smoothing = smoothing 309 | self.cumulate_weights = cumulate_weights 310 | self.layer_Dict = { 311 | 'Query': tf.keras.layers.Dense(size), 312 | 'Value': tf.keras.layers.Dense(size), 313 | 'Key': tf.keras.layers.Dense(size), 314 | 'Alignment_Conv': tf.keras.layers.Conv1D( 315 | filters= conv_filters, 316 | kernel_size= conv_kernel_size, 317 | strides= conv_stride, 318 | padding='same' 319 | ), 320 | 'Alignment_Dense': tf.keras.layers.Dense(size) 321 | } 322 | 323 | def build(self, input_shape): 324 | """Creates scale and bias variable if use_scale==True.""" 325 | if self.use_scale: 326 | self.scale = self.add_weight( 327 | name='scale', 328 | shape=[self.size], 329 | initializer= tf.initializers.glorot_uniform(), 330 | dtype=self.dtype, 331 | trainable=True) 332 | else: 333 | self.scale = None 334 | 335 | self.bias = self.add_weight( 336 | name='bias', 337 | shape=[self.size,], 338 | initializer=tf.zeros_initializer(), 339 | dtype=self.dtype, 340 | trainable=True 341 | ) 342 | 343 | self.bulit = True 344 | 345 | def call(self, inputs): 346 | ''' 347 | inputs: [query, value] or [query, value, key] 348 | I don't implement the mask function now. 349 | ''' 350 | self._validate_call_args(inputs=inputs, mask= None) 351 | query = self.layer_Dict['Query'](inputs[0]) 352 | value = self.layer_Dict['Value'](inputs[1]) 353 | key = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else value 354 | 355 | contexts = tf.zeros(shape= [tf.shape(query)[0], 1, self.size], dtype= query.dtype) #initial attention, [Batch, 1, Att_dim] 356 | alignments = tf.zeros(shape= (tf.shape(query)[0], 1, tf.shape(key)[1]), dtype= query.dtype) #initial alignment, [Batch, 1, T_k] 357 | 358 | initial_Step = tf.constant(0) 359 | def body(step, query, contexts, alignments): 360 | query_Step = tf.expand_dims(query[:, step], axis= 1) #[Batch, 1, Att_dim] 361 | previous_alignment = tf.reduce_sum(alignments, axis= 1) if self.cumulate_weights else alignments[:, -1] 362 | location_features = tf.expand_dims(previous_alignment, axis= -1) #[Batch, T_k, 1] 363 | location_features = self.layer_Dict['Alignment_Conv'](location_features) #[Batch, T_k, Filters] 364 | location_features = self.layer_Dict['Alignment_Dense'](location_features) #[Batch, T_k, Att_dim] 365 | 366 | score = self._calculate_scores(query= query_Step, key= key, location_features= location_features) #[Batch, T_k] 367 | context, alignment = self._apply_scores(score= score, value= value) #[Batch, Att_dim], [Batch, T_v] 368 | 369 | return step + 1, query, tf.concat([contexts, context], axis= 1), tf.concat([alignments, alignment], axis= 1) 370 | 371 | _, _, contexts, alignments = tf.while_loop( 372 | cond= lambda step, query, contexts, alignments: tf.less(step, tf.shape(query)[1]), 373 | body= body, 374 | loop_vars= [initial_Step, query, contexts, alignments], 375 | shape_invariants= [initial_Step.get_shape(), query.get_shape(), tf.TensorShape([None, None, self.size]), tf.TensorShape([None, None, None])] 376 | ) 377 | 378 | # # The following code cannot use now because normal for-loop does not support 'shape_invariants'. 
379 | # for step in tf.range(tf.shape(query)[1]): 380 | # query_Step = tf.expand_dims(query[:, step], axis= 1) #[Batch, 1, Att_dim] 381 | # location_features = tf.expand_dims(alignments[:, -1], axis= -1) #[Batch, T_k, 1] 382 | # location_features = self.layer_Dict['Alignment_Conv'](location_features) #[Batch, T_k, Filters] 383 | # location_features = self.layer_Dict['Alignment_Dense'](location_features) #[Batch, T_k, Att_dim] 384 | 385 | # score = self._calculate_scores(query= query_Step, key= key, location_features= location_features) #[Batch, T_k] 386 | # context, alignment = self._apply_scores(score= score, value= value) #[Batch, Att_dim], [Batch, T_v] 387 | 388 | # contexts = tf.concat([contexts, context], axis= 1) 389 | # alignments = tf.concat([alignments, alignment], axis= 1) 390 | 391 | return contexts[:, 1:], alignments[:, 1:] #Remove initial step 392 | 393 | def _calculate_scores(self, query, key, location_features): 394 | """Calculates attention scores as a nonlinear sum of query and key. 395 | Args: 396 | query: Query tensor of shape `[batch_size, 1, Att_dim]`. 397 | key: Key tensor of shape `[batch_size, T_k, Att_dim]`. 398 | location_features: Location_features of shape `[batch_size, T_k, Att_dim]`. 399 | Returns: 400 | Tensor of shape `[batch_size, T_k]`. 401 | """ 402 | if self.use_scale: 403 | scale = self.scale 404 | else: 405 | scale = 1. 406 | 407 | return tf.reduce_sum(scale * tf.tanh(query + key + location_features + self.bias), axis=-1) #[Batch, T_k, Att_dim] -> [Batch, T_k] 408 | 409 | #In TF1, 'context' is calculated in AttentionWrapper, not attention mechanism. 410 | def _apply_scores(self, score, value): 411 | ''' 412 | score shape: [batch_size, T_k]`. 413 | value shape: [batch_size, T_v, Att_dim]`. 414 | Must T_k == T_v 415 | 416 | Return: [batch_size, Att_dim] 417 | ''' 418 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 419 | probability_fn = self._smoothing_normalization if self.smoothing else tf.nn.softmax 420 | alignment = probability_fn(score) #[Batch_size, 1, T_v] 421 | context = tf.matmul(alignment, value) #[Batch_size, 1, Att_dim] 422 | 423 | #return tf.squeeze(context, axis= 1), tf.squeeze(alignment, axis= 1), #[Batch, Att_dim], [Batch, T_v] 424 | return context, alignment 425 | 426 | def _smoothing_normalization(self, e): 427 | """Applies a smoothing normalization function instead of softmax 428 | Introduced in: 429 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 430 | gio, “Attention-based models for speech recognition,” in Ad- 431 | vances in Neural Information Processing Systems, 2015, pp. 432 | 577–585. 433 | ############################################################################ 434 | Smoothing normalization function 435 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 436 | ############################################################################ 437 | Args: 438 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 439 | values of an attention mechanism 440 | Returns: 441 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 442 | attendance to multiple memory time steps. 
443 | """ 444 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 445 | 446 | class BahdanauMonotonicAttention(tf.keras.layers.AdditiveAttention): 447 | ''' 448 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L307-L440 449 | This is for attention size managing and getting the attention history(scores). 450 | ''' 451 | def __init__( 452 | self, 453 | size, 454 | sigmoid_noise= 0.0, 455 | normalize= False, 456 | **kwargs 457 | ): 458 | super(BahdanauMonotonicAttention, self).__init__(use_scale= False, **kwargs) 459 | 460 | self.size = size 461 | self.sigmoid_noise = sigmoid_noise 462 | self.normalize = normalize 463 | 464 | def build(self, input_shape): 465 | self.layer_Dict = { 466 | 'Query': tf.keras.layers.Dense(self.size), 467 | 'Value': tf.keras.layers.Dense(self.size), 468 | 'Key': tf.keras.layers.Dense(self.size) 469 | } 470 | 471 | self.attention_v = self.add_weight( 472 | name='attention_v', 473 | shape=[self.size,], 474 | initializer='glorot_uniform', 475 | dtype=self.dtype, 476 | trainable=True 477 | ) 478 | 479 | self.attention_score_bias = self.add_weight( 480 | name='attention_score_bias', 481 | shape=[], 482 | initializer=tf.zeros_initializer(), 483 | dtype=self.dtype, 484 | trainable=True 485 | ) 486 | 487 | if self.normalize: 488 | self.attention_g = self.add_weight( 489 | name='attention_g', 490 | shape=[], 491 | initializer= tf.initializers.constant([np.sqrt(1. / self.size),]), 492 | dtype=self.dtype, 493 | trainable=True 494 | ) 495 | 496 | self.attention_b = self.add_weight( 497 | name='attention_b', 498 | shape=[self.size,], 499 | initializer= tf.zeros_initializer(), 500 | dtype=self.dtype, 501 | trainable=True 502 | ) 503 | 504 | self.bulit = True 505 | 506 | def call(self, inputs): 507 | ''' 508 | inputs: [query, value] or [query, value, key] 509 | I don't implement the mask function now. 510 | ''' 511 | self._validate_call_args(inputs=inputs, mask= None) 512 | query = self.layer_Dict['Query'](inputs[0]) 513 | value = self.layer_Dict['Value'](inputs[1]) 514 | key = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else value 515 | 516 | contexts = tf.zeros(shape= [tf.shape(query)[0], 1, self.size], dtype= query.dtype) #initial attention, [Batch, 1, Att_dim] 517 | alignments = tf.expand_dims( 518 | tf.one_hot( 519 | indices= tf.zeros((tf.shape(query)[0]), dtype= tf.int32), 520 | depth= tf.shape(key)[1], 521 | dtype= query.dtype 522 | ), 523 | axis= 1 524 | ) #initial alignment, [Batch, 1, T_k]. This part is different by monotonic or not. 
525 | 526 | initial_Step = tf.constant(0) 527 | def body(step, query, contexts, alignments): 528 | query_Step = tf.expand_dims(query[:, step], axis= 1) #[Batch, 1, Att_dim] 529 | previous_alignment = tf.expand_dims(alignments[:, -1], axis= 1) #[Batch, 1, T_k] 530 | 531 | score = self._calculate_scores(query= query_Step, key= key) #[Batch, T_k] 532 | context, alignment = self._apply_scores(score= score, value= value, previous_alignment= previous_alignment) #[Batch, Att_dim], [Batch, T_v] 533 | 534 | return step + 1, query, tf.concat([contexts, context], axis= 1), tf.concat([alignments, alignment], axis= 1) 535 | 536 | _, _, contexts, alignments = tf.while_loop( 537 | cond= lambda step, query, contexts, alignments: tf.less(step, tf.shape(query)[1]), 538 | body= body, 539 | loop_vars= [initial_Step, query, contexts, alignments], 540 | shape_invariants= [initial_Step.get_shape(), query.get_shape(), tf.TensorShape([None, None, self.size]), tf.TensorShape([None, None, None])] 541 | ) 542 | 543 | return contexts[:, 1:], alignments[:, 1:] #Remove initial step 544 | 545 | def _calculate_scores(self, query, key): 546 | """Calculates attention scores as a nonlinear sum of query and key. 547 | Args: 548 | query: Query tensor of shape `[batch_size, 1, Att_dim]`. 549 | key: Key tensor of shape `[batch_size, T_k, Att_dim]`. 550 | 551 | Returns: 552 | Tensor of shape `[batch_size, T_k]`. 553 | """ 554 | if self.normalize: 555 | norm_v = self.attention_g * self.attention_v * tf.math.rsqrt(tf.reduce_sum(tf.square(self.attention_v))) 556 | return tf.reduce_sum(norm_v * tf.tanh(query + key + self.attention_b), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 557 | else: 558 | return tf.reduce_sum(self.attention_v * tf.tanh(query + key), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 559 | 560 | #In TF1, 'context' is calculated in AttentionWrapper, not attention mechanism. 561 | def _apply_scores(self, score, value, previous_alignment): 562 | ''' 563 | score shape: [batch_size, T_v]`. (Must T_k == T_v) 564 | value shape: [batch_size, T_v, Att_dim]`. 565 | previous_alignment shape: [batch_size, 1, T_v]`. 566 | 567 | 568 | Return: [batch_size, Att_dim] 569 | ''' 570 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 571 | alignment = self._monotonic_probability_fn(score, previous_alignment) #[Batch_size, 1, T_v] 572 | context = tf.matmul(alignment, value) #[Batch_size, 1, Att_dim] 573 | 574 | return context, alignment 575 | 576 | def _monotonic_probability_fn(self, score, previous_alignment): 577 | if self.sigmoid_noise > 0.0: 578 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 579 | p_choose_i = tf.sigmoid(score) 580 | 581 | cumprod_1mp_choose_i = self.safe_cumprod(1 - p_choose_i, axis= 2, exclusive= True) 582 | 583 | alignment = p_choose_i * cumprod_1mp_choose_i * tf.cumsum( 584 | previous_alignment / tf.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.), 585 | axis= 2 586 | ) 587 | 588 | return alignment 589 | 590 | # https://github.com/tensorflow/addons/blob/9e9031133c8362fedf40f2d05f00334b6f7a970b/tensorflow_addons/seq2seq/attention_wrapper.py#L810 591 | def safe_cumprod(self, x, *args, **kwargs): 592 | """Computes cumprod of x in logspace using cumsum to avoid underflow. 593 | The cumprod function and its gradient can result in numerical instabilities 594 | when its argument has very small and/or zero values. 
As long as the 595 | argument is all positive, we can instead compute the cumulative product as 596 | exp(cumsum(log(x))). This function can be called identically to 597 | tf.cumprod. 598 | Args: 599 | x: Tensor to take the cumulative product of. 600 | *args: Passed on to cumsum; these are identical to those in cumprod. 601 | **kwargs: Passed on to cumsum; these are identical to those in cumprod. 602 | Returns: 603 | Cumulative product of x. 604 | """ 605 | x = tf.convert_to_tensor(x, name='x') 606 | tiny = np.finfo(x.dtype.as_numpy_dtype).tiny 607 | return tf.exp(tf.cumsum(tf.math.log(tf.clip_by_value(x, tiny, 1)), *args, **kwargs)) 608 | 609 | class StepwiseMonotonicAttention(BahdanauMonotonicAttention): 610 | ''' 611 | Refer: https://gist.github.com/dy-octa/38a7638f75c21479582d7391490df37c 612 | ''' 613 | def __init__( 614 | self, 615 | size, 616 | sigmoid_noise= 2.0, 617 | normalize= False, 618 | **kwargs 619 | ): 620 | super(StepwiseMonotonicAttention, self).__init__( 621 | size= size, 622 | sigmoid_noise= sigmoid_noise, 623 | normalize= normalize, **kwargs 624 | ) 625 | 626 | def _monotonic_probability_fn(self, score, previous_alignment): 627 | ''' 628 | score: [Batch_size, 1, T_v] 629 | previous_alignment: [batch_size, 1, T_v] 630 | ''' 631 | if self.sigmoid_noise > 0.0: 632 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 633 | p_choose_i = tf.sigmoid(score) # [Batch_size, 1, T_v] 634 | 635 | pad = tf.zeros([tf.shape(p_choose_i)[0], 1, 1], dtype=p_choose_i.dtype) # [Batch_size, 1, 1] 636 | 637 | alignment = previous_alignment * p_choose_i + tf.concat( 638 | [pad, previous_alignment[:, :, :-1] * (1.0 - p_choose_i[:, :, :-1])], axis= -1) 639 | 640 | return alignment 641 | 642 | 643 | class DynamicConvolutionAttention(tf.keras.layers.AdditiveAttention): 644 | ''' 645 | Refer: https://gist.github.com/attitudechunfeng/c162a5ed9b034be8f3f5800652af7c83 646 | ''' 647 | def __init__( 648 | self, 649 | size, 650 | f_conv_filters= 8, 651 | f_conv_kernel_size= 21, 652 | f_conv_stride= 1, 653 | g_conv_filters= 8, 654 | g_conv_kernel_size= 21, 655 | g_conv_stride= [1, 1, 1, 1], 656 | p_conv_size = 11, 657 | p_alpha= 0.1, 658 | p_beta = 2.9, 659 | use_scale=False, 660 | cumulate_weights= False, 661 | **kwargs 662 | ): 663 | super(DynamicConvolutionAttention, self).__init__(use_scale= use_scale, **kwargs) 664 | 665 | self.size = size 666 | self.f_conv_filters= f_conv_filters 667 | self.f_conv_kernel_size= f_conv_kernel_size 668 | self.f_conv_stride= f_conv_stride 669 | self.g_conv_filters= g_conv_filters 670 | self.g_conv_kernel_size= g_conv_kernel_size 671 | self.g_conv_stride= g_conv_stride 672 | self.p_conv_size = p_conv_size 673 | self.p_alpha= p_alpha 674 | self.p_beta = p_beta 675 | self.cumulate_weights = cumulate_weights 676 | 677 | def build(self, input_shape): 678 | self.layer_Dict = {} 679 | self.layer_Dict['Key'] = tf.keras.layers.Dense(self.size) 680 | 681 | self.layer_Dict['F_Conv'] = tf.keras.layers.Conv1D( 682 | filters= self.f_conv_filters, 683 | kernel_size= self.f_conv_kernel_size, 684 | strides= self.f_conv_stride, 685 | padding='same' 686 | ) 687 | self.layer_Dict['F_Dense'] = tf.keras.layers.Dense( 688 | self.size, 689 | use_bias= False 690 | ) 691 | 692 | self.layer_Dict['G_Filter_Dense'] = tf.keras.Sequential() 693 | self.layer_Dict['G_Filter_Dense'].add(tf.keras.layers.Dense( 694 | units= self.g_conv_kernel_size * self.g_conv_filters, 695 | use_bias= True, 696 | activation= 'tanh' 697 | )) 698 | 
self.layer_Dict['G_Filter_Dense'].add(tf.keras.layers.Dense( 699 | units= self.g_conv_kernel_size * self.g_conv_filters, 700 | use_bias= False 701 | )) 702 | self.layer_Dict['G_Dense'] = tf.keras.layers.Dense( 703 | self.size, 704 | use_bias= False 705 | ) 706 | 707 | self.layer_Dict['P_Conv'] = DCA_P_Conv1D( 708 | p_conv_size = self.p_conv_size, 709 | p_alpha= self.p_alpha, 710 | p_beta = self.p_beta, 711 | ) 712 | 713 | """Creates scale and bias variable if use_scale==True.""" 714 | if self.use_scale: 715 | self.scale = self.add_weight( 716 | name='scale', 717 | shape=[self.size], 718 | initializer= tf.initializers.glorot_uniform(), 719 | dtype=self.dtype, 720 | trainable=True) 721 | else: 722 | self.scale = None 723 | 724 | self.bias = self.add_weight( 725 | name='bias', 726 | shape=[self.size,], 727 | initializer=tf.zeros_initializer(), 728 | dtype=self.dtype, 729 | trainable=True 730 | ) 731 | 732 | # self.g_scale = self.add_weight( 733 | # name='g_scale', 734 | # shape=[self.g_conv_kernel_size * self.g_conv_filters,], 735 | # initializer=tf.zeros_initializer(), 736 | # dtype=self.dtype, 737 | # trainable=True 738 | # ) 739 | 740 | self.bulit = True 741 | 742 | def call(self, inputs): 743 | ''' 744 | inputs: [query, key] 745 | I don't implement the mask function now. 746 | ''' 747 | self._validate_call_args(inputs=inputs, mask= None) 748 | query = inputs[0] #[Batch, Q_dim] 749 | key = self.layer_Dict['Key'](inputs[1]) #[Batch, T_k, Att_dim] 750 | 751 | batch_size = tf.shape(query)[0] 752 | contexts = tf.zeros(shape= [tf.shape(query)[0], 1, self.size], dtype= query.dtype) #initial attention, [Batch, 1, Att_dim] 753 | alignments = tf.one_hot( 754 | indices= tf.zeros((tf.shape(query)[0], 1), dtype= tf.int32), 755 | depth= tf.shape(key)[1], 756 | dtype= query.dtype 757 | ) #initial alignment, [Batch, 1, T_k]. This part is different by monotonic or not. 
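        # Dynamic convolution attention is purely location-based: there is no
        # query-key energy term. Each step's score is built from transformations of
        # the previous alignment, via (1) static features from F_Conv/F_Dense,
        # (2) query-conditioned dynamic filters from G_Filter_Dense applied as a
        # depthwise convolution, and (3) the log-domain beta-binomial prior bias
        # from P_Conv, which constrains how far the alignment can move per step.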
758 | 759 | initial_Step = tf.constant(0) 760 | def body(step, query, contexts, alignments): 761 | query_Step = query[:, step] #[Batch, Q_dim] 762 | previous_alignment = tf.reduce_sum(alignments, axis= 1) if self.cumulate_weights else alignments[:, -1] #[Batch, T_k] 763 | previous_alignment = tf.expand_dims(previous_alignment, axis= -1) #[Batch, T_k, 1] 764 | 765 | feature_previous_alignment = self.layer_Dict['F_Conv'](previous_alignment) #[Batch, T_k, Filters] 766 | feature_previous_alignment = self.layer_Dict['F_Dense'](feature_previous_alignment) #[Batch, T_k, Att_dim] 767 | 768 | #dynamic_filter = self.g_scale * self.layer_Dict['G_Filter_Dense'](query_Step) # [Batch, Conv_Size * Conv_Ch] 769 | dynamic_filter = self.layer_Dict['G_Filter_Dense'](query_Step) # [Batch, Conv_Size * Conv_Ch] 770 | dynamic_filter = tf.reshape( 771 | dynamic_filter, 772 | shape= [batch_size, 1, self.g_conv_kernel_size, self.g_conv_filters] 773 | ) # [Batch, 1, Conv_Size, Conv_Ch] 774 | dynamic_filter = tf.transpose( 775 | dynamic_filter, 776 | perm= [1, 2, 0, 3] 777 | ) # [1, Conv_Size, Batch, Conv_Ch] [H(1), W, C_in, C_out] 778 | dynamic_previous_alignment = tf.expand_dims( 779 | tf.transpose( 780 | previous_alignment, 781 | perm= [2, 1, 0] 782 | ), 783 | axis = 0 784 | ) #[N(Batch), W(K_t), C(1)] -> [C(1), W(K_t), N(Batch)] -> [1, C(1), W(K_t), N(Batch)] 785 | dynamic_previous_alignment = tf.nn.depthwise_conv2d( 786 | dynamic_previous_alignment, 787 | filter= dynamic_filter, 788 | strides= self.g_conv_stride, 789 | padding= 'SAME' 790 | ) # [1, 1, K_t, Batch * G_Filter] 791 | dynamic_previous_alignment = tf.squeeze(input= dynamic_previous_alignment, axis= [0, 1]) # [K_t, Batch * G_Filter] 792 | dynamic_previous_alignment = tf.reshape( 793 | dynamic_previous_alignment, 794 | shape= [tf.shape(dynamic_previous_alignment)[0], batch_size, self.g_conv_filters] 795 | ) # [K_t, Batch, G_Filter] 796 | dynamic_previous_alignment = tf.transpose( 797 | dynamic_previous_alignment, 798 | perm= [1, 0, 2] 799 | ) # [Batch, K_t, G_Filter] 800 | dynamic_previous_alignment = self.layer_Dict['G_Dense'](dynamic_previous_alignment) #[Batch, K_t, Att_Dim] 801 | 802 | prior_filter_bias = self.layer_Dict['P_Conv'](previous_alignment) #[Batch, K_t] 803 | 804 | score = self._calculate_scores( 805 | feature_previous_alignment= feature_previous_alignment, 806 | dynamic_previous_alignment= dynamic_previous_alignment, 807 | prior_filter_bias= prior_filter_bias 808 | ) #[Batch, T_k] 809 | context, alignment = self._apply_scores(score= score, key= key) #[Batch, 1, Att_dim], [Batch, 1, T_k] 810 | 811 | return step + 1, query, tf.concat([contexts, context], axis= 1), tf.concat([alignments, alignment], axis= 1) 812 | 813 | _, _, contexts, alignments = tf.while_loop( 814 | cond= lambda step, query, contexts, alignments: tf.less(step, tf.shape(query)[1]), 815 | body= body, 816 | loop_vars= [initial_Step, query, contexts, alignments], 817 | shape_invariants= [initial_Step.get_shape(), query.get_shape(), tf.TensorShape([None, None, self.size]), tf.TensorShape([None, None, None])] 818 | ) #[Batch, T_q + 1, Att_dim], [Batch, T_q + 1, T_k] 819 | 820 | return contexts[:, 1:], alignments[:, 1:] #Remove initial step 821 | 822 | def _calculate_scores(self, feature_previous_alignment, dynamic_previous_alignment, prior_filter_bias): 823 | """Calculates attention scores as a nonlinear sum of query and key. 824 | Args: 825 | feature_previous_alignment: Location_features of shape `[batch_size, T_k, Att_dim]`. 
826 | dynamic_previous_alignment: Dynamic features of shape `[batch_size, T_k, Att_dim]`. 827 | prior_filter_bias: Prior filter bias of shape `[batch_size, T_k]`. 828 | Returns: 829 | Tensor of shape `[batch_size, T_k]`. 830 | """ 831 | if self.use_scale: 832 | scale = self.scale 833 | else: 834 | scale = 1. 835 | score = tf.reduce_sum( 836 | scale * tf.tanh(feature_previous_alignment + dynamic_previous_alignment + self.bias), 837 | axis=-1 838 | ) #[Batch, T_k, Att_dim] -> [Batch, T_k] 839 | return score + prior_filter_bias 840 | 841 | #In TF1, 'context' is calculated in AttentionWrapper, not attention mechanism. 842 | def _apply_scores(self, score, key): 843 | ''' 844 | score shape: [batch_size, T_k]`. 845 | key shape: [batch_size, T_k, Att_dim]`. 846 | Must T_k == T_v 847 | 848 | Return: [batch_size, Att_dim] 849 | ''' 850 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 851 | alignment = tf.nn.softmax(score) #[Batch_size, 1, T_v] 852 | context = tf.matmul(alignment, key) #[Batch_size, 1, Att_dim] 853 | 854 | return context, alignment #[Batch, 1, Att_dim], [Batch, 1, T_v] 855 | 856 | class DCA_P_Conv1D(tf.keras.layers.Conv1D): 857 | def __init__(self, p_conv_size= 11, p_alpha= 0.1, p_beta= 0.9): 858 | self.p_conv_size= p_conv_size 859 | self.p_alpha= p_alpha 860 | self.p_beta= p_beta 861 | 862 | prior_filter = self.beta_binomial(self.p_conv_size, self.p_alpha, self.p_beta) 863 | prior_filter = np.flip(prior_filter, axis= 0) 864 | prior_filter = np.reshape(prior_filter, [self.p_conv_size, 1, 1]) 865 | 866 | super(DCA_P_Conv1D, self).__init__( 867 | filters= 1, 868 | kernel_size= self.p_conv_size, 869 | padding='valid', 870 | use_bias= False, 871 | kernel_initializer= tf.initializers.constant(prior_filter) 872 | ) 873 | 874 | def call(self, inputs): 875 | ''' 876 | inputs: 3D tensor with shape: `(batch_size, steps, input_dim)` 877 | After front padding, call a superior class(Conv1D) 878 | ''' 879 | inputs = tf.pad(inputs, paddings= [[0,0], [self.p_conv_size - 1, 0], [0, 0]]) 880 | new_Tensor = super(DCA_P_Conv1D, self).call(inputs) 881 | new_Tensor = tf.squeeze(new_Tensor, axis= -1) 882 | 883 | return tf.math.log(tf.maximum(new_Tensor, np.finfo(inputs.dtype.as_numpy_dtype).tiny)) 884 | # return tf.maximum(tf.math.log(new_Tensor), -1e+6) # NaN problem. 885 | 886 | def beta_binomial(self, _n, _alpha, _beta): 887 | return [comb(_n,i) * beta(i+_alpha, _n-i+_beta) / beta(_alpha, _beta) for i in range(_n)] -------------------------------------------------------------------------------- /Modules/Attention/Steps.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | ''' 5 | TF 2.0's basic attention layers(Attention and AdditiveAttention) calculate parallelly. 6 | TO USE MONOTONIC FUNCTION, ATTENTION MUST KNOW 'n-1 ALIGNMENT'. 7 | Thus, this parallel versions do not support the monotonic function. 
8 | ''' 9 | 10 | class BahdanauAttention(tf.keras.layers.Layer): 11 | ''' 12 | Refer: https://www.tensorflow.org/tutorials/text/nmt_with_attention 13 | ''' 14 | def __init__(self, size): 15 | super(BahdanauAttention, self).__init__() 16 | self.size = size 17 | 18 | def build(self, input_shapes): 19 | self.layer_Dict = { 20 | 'Query': tf.keras.layers.Dense(self.size), 21 | 'Value': tf.keras.layers.Dense(self.size), 22 | 'V': tf.keras.layers.Dense(1) 23 | } 24 | 25 | self.built = True 26 | 27 | def call(self, inputs): 28 | ''' 29 | inputs: [queries, values] 30 | queries: [Batch, Query_dim] 31 | values: [Batch, T_v, Value_dim] 32 | ''' 33 | queries, values = inputs 34 | 35 | queries = self.layer_Dict['Query'](queries) #[Batch, Att_dim] 36 | values = self.layer_Dict['Value'](values) #[Batch, T_v, Att_dim] 37 | 38 | queries = tf.expand_dims(queries, 1) #[Batch, 1, Att_dim] 39 | 40 | score = self.layer_Dict['V'](tf.nn.tanh(values + queries)) #[Batch, T_v, 1] 41 | 42 | attention_weights = tf.nn.softmax(score - tf.reduce_max(score, axis= 1, keepdims= True), axis=1) #[Batch, T_v, 1] 43 | 44 | context_vector = tf.reduce_sum(attention_weights * values, axis=1) #[Batch, T_v, Att_dim] -> [Batch, Att_dim] 45 | 46 | return context_vector, tf.squeeze(attention_weights, axis= -1) 47 | 48 | def initial_alignment_fn(self, batch_size, key_time, dtype): 49 | return tf.zeros((batch_size, key_time), dtype= dtype) 50 | 51 | class BahdanauMonotonicAttention(tf.keras.layers.Layer): 52 | ''' 53 | Refer 54 | https://www.tensorflow.org/tutorials/text/nmt_with_attention 55 | https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/seq2seq/attention_wrapper.py#L1004-L1175 56 | 57 | ''' 58 | def __init__(self, size, sigmoid_noise= 0.0, normalize= False, **kwargs): 59 | super(BahdanauMonotonicAttention, self).__init__() 60 | 61 | self.size = size 62 | self.sigmoid_noise = sigmoid_noise 63 | self.normalize = normalize 64 | 65 | def build(self, input_shapes): 66 | self.layer_Dict = { 67 | 'Query': tf.keras.layers.Dense(self.size), 68 | 'Value': tf.keras.layers.Dense(self.size), 69 | 'Key': tf.keras.layers.Dense(self.size) 70 | } 71 | 72 | self.attention_v = self.add_weight( 73 | name='attention_v', 74 | shape=[self.size,], 75 | initializer='glorot_uniform', 76 | dtype=self.dtype, 77 | trainable=True 78 | ) 79 | 80 | self.attention_score_bias = self.add_weight( 81 | name='attention_score_bias', 82 | shape=[], 83 | initializer=tf.zeros_initializer(), 84 | dtype=self.dtype, 85 | trainable=True 86 | ) 87 | 88 | if self.normalize: 89 | self.attention_g = self.add_weight( 90 | name='attention_g', 91 | shape=[], 92 | initializer= tf.initializers.constant([np.sqrt(1. 
/ self.size),]), 93 | dtype=self.dtype, 94 | trainable=True 95 | ) 96 | 97 | self.attention_b = self.add_weight( 98 | name='attention_b', 99 | shape=[self.size,], 100 | initializer= tf.zeros_initializer(), 101 | dtype=self.dtype, 102 | trainable=True 103 | ) 104 | 105 | self.bulit = True 106 | 107 | def call(self, inputs): 108 | ''' 109 | inputs: [queries, values, previous_alignments] or [queries, values, keys, previous_alignments] 110 | query: [Batch, Query_dim] 111 | value: [Batch, T_v, Value_dim] 112 | key: [Batch, T_v, Key_dim] 113 | previous_alignment: [Batch, T_v] 114 | ''' 115 | if len(inputs) == 3: 116 | query, value, previous_alignment = inputs 117 | elif len(inputs) == 4: 118 | query, value, key, previous_alignment = inputs 119 | else: 120 | raise ValueError('Unexpected input length') 121 | 122 | query = self.layer_Dict['Query'](query) # [Batch, Att_dim] 123 | value = self.layer_Dict['Value'](value) # [Batch, T_v, Att_dim] 124 | key = self.layer_Dict['Key'](key) if len(inputs) == 4 else value # [Batch, T_v, Att_dim] 125 | 126 | query = tf.expand_dims(query, 1) # [Batch, 1, Att_dim] 127 | previous_alignment = tf.expand_dims(previous_alignment, axis= 1) # [Batch, 1, T_v] 128 | 129 | score = self._calculate_scores(query= query, key= key) 130 | context, alignment = self._apply_scores( 131 | score= score, 132 | value= value, 133 | previous_alignment= previous_alignment 134 | ) # [Batch, Att_dim], [Batch, 1, T_v] 135 | 136 | return context, alignment 137 | 138 | def _calculate_scores(self, query, key): 139 | ''' 140 | Calculates attention scores as a nonlinear sum of query and key. 141 | Args: 142 | query: Query tensor of shape `[batch_size, 1, Att_dim]`. 143 | key: Key tensor of shape `[batch_size, T_k, Att_dim]`. 144 | 145 | Returns: 146 | Tensor of shape `[batch_size, T_k]`. 147 | ''' 148 | if self.normalize: 149 | norm_v = self.attention_g * self.attention_v * tf.math.rsqrt(tf.reduce_sum(tf.square(self.attention_v))) 150 | return tf.reduce_sum(norm_v * tf.tanh(query + key + self.attention_b), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 151 | else: 152 | return tf.reduce_sum(self.attention_v * tf.tanh(query + key), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 153 | 154 | def _apply_scores(self, score, value, previous_alignment): 155 | ''' 156 | score shape: [batch_size, T_v]`. (Must T_k == T_v) 157 | value shape: [batch_size, T_v, Att_dim]`. 158 | previous_alignment shape: [batch_size, 1, T_v]`. 
159 | 160 | Return: [batch_size, Att_dim], [batch_size, T_v] 161 | ''' 162 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 163 | alignment = self._monotonic_probability_fn(score, previous_alignment) #[Batch_size, 1, T_v] 164 | context = tf.matmul(alignment, value) #[Batch_size, 1, Att_dim] 165 | 166 | return tf.squeeze(context, axis= 1), tf.squeeze(alignment, axis= 1) 167 | 168 | def _monotonic_probability_fn(self, score, previous_alignment): 169 | if self.sigmoid_noise > 0.0: 170 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 171 | p_choose_i = tf.sigmoid(score) 172 | 173 | cumprod_1mp_choose_i = self.safe_cumprod(1 - p_choose_i, axis= 2, exclusive= True) 174 | 175 | alignment = p_choose_i * cumprod_1mp_choose_i * tf.cumsum( 176 | previous_alignment / tf.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.), 177 | axis= 2 178 | ) 179 | 180 | return alignment 181 | 182 | # https://github.com/tensorflow/addons/blob/9e9031133c8362fedf40f2d05f00334b6f7a970b/tensorflow_addons/seq2seq/attention_wrapper.py#L810 183 | def safe_cumprod(self, x, *args, **kwargs): 184 | """Computes cumprod of x in logspace using cumsum to avoid underflow. 185 | The cumprod function and its gradient can result in numerical instabilities 186 | when its argument has very small and/or zero values. As long as the 187 | argument is all positive, we can instead compute the cumulative product as 188 | exp(cumsum(log(x))). This function can be called identically to 189 | tf.cumprod. 190 | Args: 191 | x: Tensor to take the cumulative product of. 192 | *args: Passed on to cumsum; these are identical to those in cumprod. 193 | **kwargs: Passed on to cumsum; these are identical to those in cumprod. 194 | Returns: 195 | Cumulative product of x. 
196 | """ 197 | x = tf.convert_to_tensor(x, name='x') 198 | tiny = np.finfo(x.dtype.as_numpy_dtype).tiny 199 | return tf.exp(tf.cumsum(tf.math.log(tf.clip_by_value(x, tiny, 1)), *args, **kwargs)) 200 | 201 | def initial_alignment_fn(self, batch_size, key_time, dtype): 202 | return tf.one_hot( 203 | indices= tf.zeros((batch_size), dtype= tf.int32), 204 | depth= key_time, 205 | dtype= dtype 206 | ) 207 | 208 | class StepwiseMonotonicAttention(BahdanauMonotonicAttention): 209 | ''' 210 | Refer: https://gist.github.com/dy-octa/38a7638f75c21479582d7391490df37c 211 | ''' 212 | def __init__(self, size, sigmoid_noise= 2.0, normalize= False, **kwargs): 213 | super(StepwiseMonotonicAttention, self).__init__(size, sigmoid_noise, normalize, **kwargs) 214 | 215 | def _monotonic_probability_fn(self, score, previous_alignment): 216 | ''' 217 | score: [Batch_size, 1, T_v] 218 | previous_alignment: [batch_size, 1, T_v] 219 | ''' 220 | if self.sigmoid_noise > 0.0: 221 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 222 | p_choose_i = tf.sigmoid(score) # [Batch_size, 1, T_v] 223 | 224 | pad = tf.zeros([tf.shape(p_choose_i)[0], 1, 1], dtype=p_choose_i.dtype) # [Batch_size, 1, 1] 225 | 226 | alignment = previous_alignment * p_choose_i + tf.concat( 227 | [pad, previous_alignment[:, :, :-1] * (1.0 - p_choose_i[:, :, :-1])], axis= -1) 228 | 229 | return alignment -------------------------------------------------------------------------------- /Modules/Attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Modules/Attention/__init__.py -------------------------------------------------------------------------------- /Modules/GST.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import json 3 | from .Attention.Layers import MultiHeadAttention 4 | 5 | 6 | with open('Hyper_Parameters.json', 'r') as f: 7 | hp_Dict = json.load(f) 8 | 9 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 10 | token_Index_Dict = json.load(f) 11 | 12 | class Reference_Encoder(tf.keras.Model): 13 | def __init__(self): 14 | super(Reference_Encoder, self).__init__() 15 | self.layer_Dict = {} 16 | 17 | for index, (filters, kernel_Size, strides) in enumerate(zip( 18 | hp_Dict['GST']['Reference_Encoder']['Conv']['Filters'], 19 | hp_Dict['GST']['Reference_Encoder']['Conv']['Kernel_Size'], 20 | hp_Dict['GST']['Reference_Encoder']['Conv']['Strides'] 21 | )): 22 | self.layer_Dict['Conv2D_{}'.format(index)] = tf.keras.Sequential() 23 | self.layer_Dict['Conv2D_{}'.format(index)].add(tf.keras.layers.Conv2D( 24 | filters= filters, 25 | kernel_size= kernel_Size, 26 | strides= strides, 27 | padding='same', 28 | use_bias= False 29 | )) 30 | self.layer_Dict['Conv2D_{}'.format(index)].add(tf.keras.layers.BatchNormalization()) 31 | self.layer_Dict['Conv2D_{}'.format(index)].add(tf.keras.layers.ReLU()) 32 | 33 | self.layer_Dict['RNN'] = tf.keras.layers.GRU( 34 | units= hp_Dict['GST']['Reference_Encoder']['RNN']['Size'], 35 | return_sequences= True 36 | ) 37 | 38 | self.layer_Dict['Compress_Length'] = tf.keras.layers.Lambda( 39 | lambda x: tf.cast(tf.math.ceil(x / tf.reduce_prod(hp_Dict['GST']['Reference_Encoder']['Conv']['Strides'])), tf.int32) 40 | ) 41 | 42 | self.layer_Dict['Dense'] = tf.keras.layers.Dense( 43 | units= hp_Dict['GST']['Reference_Encoder']['Dense']['Size'], 44 | activation= 'tanh' 45 | ) 46 | 47 | 
def call(self, inputs): 48 | ''' 49 | inputs: [mels, mel_lengths] 50 | mels: [Batch, Time, Mel_Dim] 51 | mel_lengths: [Batch] 52 | ''' 53 | mels, mel_lengths = inputs 54 | new_Tensor = tf.expand_dims(mels, axis= -1) #[Batch, Time, Mel_Dim, 1] 55 | for index in range(len(hp_Dict['GST']['Reference_Encoder']['Conv']['Filters'])): 56 | new_Tensor = self.layer_Dict['Conv2D_{}'.format(index)](new_Tensor) 57 | batch_Size, time_Step = tf.shape(new_Tensor)[0], tf.shape(new_Tensor)[1] 58 | height, width = new_Tensor.get_shape().as_list()[2:] 59 | new_Tensor = tf.reshape( 60 | new_Tensor, 61 | shape= [batch_Size, time_Step, height * width] 62 | ) 63 | new_Tensor = self.layer_Dict['RNN'](new_Tensor) 64 | 65 | new_Tensor = tf.gather_nd( 66 | params= new_Tensor, 67 | indices= tf.stack([tf.range(batch_Size), self.layer_Dict['Compress_Length'](mel_lengths) - 1], axis= 1) 68 | ) 69 | 70 | return self.layer_Dict['Dense'](new_Tensor) 71 | 72 | class Style_Token_Layer(tf.keras.layers.Layer): #Attention which is in layer must be able to access directly. 73 | def __init__(self): 74 | super(Style_Token_Layer, self).__init__() 75 | 76 | def build(self, input_shape): 77 | self.layer_Dict = {} 78 | self.layer_Dict['Reference_Encoder'] = Reference_Encoder() 79 | self.layer_Dict['Attention'] = MultiHeadAttention( 80 | num_heads= hp_Dict['GST']['Style_Token']['Attention']['Head'], 81 | size= hp_Dict['GST']['Style_Token']['Attention']['Size'] 82 | ) 83 | 84 | self.gst_tokens = self.add_weight( 85 | name= 'gst_tokens', 86 | shape= [hp_Dict['GST']['Style_Token']['Size'], hp_Dict['GST']['Style_Token']['Embedding']['Size']], 87 | initializer= tf.keras.initializers.TruncatedNormal(stddev= 0.5), 88 | trainable= True, 89 | ) 90 | 91 | def call(self, inputs): 92 | ''' 93 | inputs: [mels, mel_lengths] 94 | mels: [Batch, Time, Mel_Dim] 95 | mel_lengths: [Batch] 96 | ''' 97 | mels_for_gst, mel_lengths = inputs 98 | new_Tensor = self.layer_Dict['Reference_Encoder']([mels_for_gst[:, 1:], mel_lengths]) #Initial frame deletion 99 | 100 | tiled_GST_Tokens = tf.tile( 101 | tf.expand_dims(tf.tanh(self.gst_tokens), axis=0), 102 | [tf.shape(new_Tensor)[0], 1, 1] 103 | ) #[Token_Dim, Emedding_Dim] -> [Batch, Token_Dim, Emedding_Dim] 104 | new_Tensor = tf.expand_dims(new_Tensor, axis= 1) #[Batch, R_dim] -> [Batch, 1, R_dim] 105 | new_Tensor, _ = self.layer_Dict['Attention']( 106 | inputs= [new_Tensor, tiled_GST_Tokens] #[query, value] 107 | ) #[Batch, 1, Att_dim] 108 | 109 | return tf.squeeze(new_Tensor, axis= 1) 110 | 111 | class GST_Concated_Encoder(tf.keras.layers.Layer): 112 | def __init__(self): 113 | super(GST_Concated_Encoder, self).__init__() 114 | 115 | def call(self, inputs): 116 | ''' 117 | inputs: [encoder, gsts] 118 | ''' 119 | encoders, gsts = inputs 120 | 121 | return tf.concat([ 122 | tf.tile(tf.expand_dims(gsts, axis= 1), [1, tf.shape(encoders)[1], 1]), 123 | encoders 124 | ], axis= -1) -------------------------------------------------------------------------------- /Modules/Taco2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import json 3 | from .Attention.Steps import BahdanauMonotonicAttention, StepwiseMonotonicAttention 4 | 5 | 6 | with open('Hyper_Parameters.json', 'r') as f: 7 | hp_Dict = json.load(f) 8 | 9 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 10 | token_Index_Dict = json.load(f) 11 | 12 | class Encoder(tf.keras.Model): 13 | def __init__(self): 14 | super(Encoder, self).__init__() 15 | 16 | def build(self, input_shapes): 17 | 
self.layer = tf.keras.Sequential() 18 | self.layer.add(tf.keras.layers.Embedding( 19 | input_dim= len(token_Index_Dict), 20 | output_dim= hp_Dict['Tacotron2']['Encoder']['Embedding']['Size'], 21 | )) 22 | for filters, kernel_size, stride in zip( 23 | hp_Dict['Tacotron2']['Encoder']['Conv']['Filters'], 24 | hp_Dict['Tacotron2']['Encoder']['Conv']['Kernel_Size'], 25 | hp_Dict['Tacotron2']['Encoder']['Conv']['Strides'] 26 | ): 27 | self.layer.add(tf.keras.layers.Conv1D( 28 | filters= filters, 29 | kernel_size= kernel_size, 30 | strides= stride, 31 | padding= 'same', 32 | use_bias= False 33 | )) 34 | self.layer.add(tf.keras.layers.BatchNormalization()) 35 | self.layer.add(tf.keras.layers.ReLU()) 36 | self.layer.add(tf.keras.layers.Dropout( 37 | rate= hp_Dict['Tacotron2']['Encoder']['Conv']['Dropout_Rate'] 38 | )) 39 | self.layer.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM( 40 | units= hp_Dict['Tacotron2']['Encoder']['RNN']['Size'], 41 | recurrent_dropout= hp_Dict['Tacotron2']['Encoder']['RNN']['Zoneout'], #Paper is '0.1'. However, TF2.0 cuDNN implementation does not support that yet. 42 | return_sequences= True 43 | ))) 44 | 45 | self.bulit = True 46 | 47 | def call(self, inputs, training): 48 | ''' 49 | inputs: texts 50 | ''' 51 | return self.layer(inputs, training) 52 | 53 | class Decoder_Step(tf.keras.Model): 54 | def __init__(self): 55 | super(Decoder_Step, self).__init__() 56 | 57 | self.build(None) #I want to generate the initial state and alignment functions early. 58 | 59 | def build(self, input_shapes): 60 | self.layer_Dict = {} 61 | self.layer_Dict['Prenet'] = Prenet( 62 | sizes= hp_Dict['Tacotron2']['Decoder']['Prenet']['Size'], 63 | dropout_rate= hp_Dict['Tacotron2']['Decoder']['Prenet']['Dropout_Rate'] 64 | ) 65 | 66 | if hp_Dict['Tacotron2']['Decoder']['Attention']['Type'] == 'BMA': 67 | self.layer_Dict['Attention'] = BahdanauMonotonicAttention( 68 | size= hp_Dict['Tacotron2']['Decoder']['Attention']['Size'] 69 | ) 70 | elif hp_Dict['Tacotron2']['Decoder']['Attention']['Type'] == 'SMA': 71 | self.layer_Dict['Attention'] = StepwiseMonotonicAttention( 72 | size= hp_Dict['Tacotron2']['Decoder']['Attention']['Size'] 73 | ) 74 | else: 75 | raise ValueError('Unsupported attention type: {}'.format(hp_Dict['Tacotron2']['Decoder']['Attention']['Type'])) 76 | 77 | rnn_Cell_List = [] 78 | for size in hp_Dict['Tacotron2']['Decoder']['RNN']['Size']: 79 | rnn_Cell_List.append(tf.keras.layers.LSTMCell( 80 | units= size, 81 | recurrent_dropout= hp_Dict['Tacotron2']['Decoder']['RNN']['Zoneout'], #Paper is '0.1'. However, TF2.0 cuDNN implementation does not support that yet. 
82 | )) 83 | self.layer_Dict['RNN'] = tf.keras.layers.StackedRNNCells( 84 | cells= rnn_Cell_List 85 | ) 86 | 87 | self.layer_Dict['Projection'] = tf.keras.layers.Dense( 88 | units= hp_Dict['Sound']['Mel_Dim'] * hp_Dict['Step_Reduction'] + 1 89 | ) 90 | 91 | self.get_initial_state = self.layer_Dict['RNN'].get_initial_state 92 | self.get_initial_alignment = self.layer_Dict['Attention'].initial_alignment_fn 93 | 94 | self.built = True 95 | 96 | def call(self, inputs, training): 97 | ''' 98 | inputs: [encodings, current_mels, previous_alignments, previous_rnn_states] 99 | encodings: [Batch, T_v, V_dim] 100 | current_mels: [Batch, Mel_dim] 101 | previous_alignments: [Batch, T_v] 102 | previous_rnn_states: A tuple of states 103 | ''' 104 | encodings, mels, previous_alignments, previous_rnn_states = inputs 105 | 106 | new_Tensor = self.layer_Dict['Prenet'](mels) 107 | attentions, alignments = self.layer_Dict['Attention']( 108 | [new_Tensor, encodings, previous_alignments] 109 | ) # [Batch, Att_dim], [Batch, T_v] 110 | new_Tensor = tf.concat([new_Tensor, attentions], axis= -1) # [Batch, Prenet_dim + Att_dim] 111 | new_Tensor, states = self.layer_Dict['RNN'](new_Tensor, states= previous_rnn_states) 112 | new_Tensor = tf.concat([new_Tensor, attentions], axis= -1) # [Batch, RNN_dim + Att_dim] 113 | new_Tensor = self.layer_Dict['Projection'](new_Tensor) # [Batch, Mel_Dim * r + 1] 114 | new_Tensor, stops = tf.split( 115 | new_Tensor, 116 | num_or_size_splits= [new_Tensor.get_shape()[-1] - 1 ,1], 117 | axis= -1 118 | ) # [Batch, Mel_Dim * r], # [Batch, 1] 119 | 120 | return new_Tensor, stops, alignments, states 121 | 122 | class Decoder(tf.keras.Model): 123 | def __init__(self): 124 | super(Decoder, self).__init__() 125 | 126 | def build(self, input_shapes): 127 | self.layer_Dict = {} 128 | 129 | self.layer_Dict['Decoder_Step'] = Decoder_Step() 130 | 131 | self.layer_Dict['Postnet'] = tf.keras.Sequential() # Last filters must be Mel 132 | for index, (filters, kernel_size, stride) in enumerate(zip( 133 | hp_Dict['Tacotron2']['Decoder']['Conv']['Filters'] + [hp_Dict['Sound']['Mel_Dim']], 134 | hp_Dict['Tacotron2']['Decoder']['Conv']['Kernel_Size'] + [5], 135 | hp_Dict['Tacotron2']['Decoder']['Conv']['Strides'] + [1] 136 | )): 137 | self.layer_Dict['Postnet'].add(tf.keras.layers.Conv1D( 138 | filters= filters, 139 | kernel_size= kernel_size, 140 | strides= stride, 141 | padding= 'same', 142 | use_bias= False 143 | )) 144 | self.layer_Dict['Postnet'].add(tf.keras.layers.BatchNormalization()) 145 | if index < len(hp_Dict['Tacotron2']['Decoder']['Conv']['Filters']) - 1: 146 | self.layer_Dict['Postnet'].add(tf.keras.layers.Activation(activation= tf.nn.tanh)) 147 | self.layer_Dict['Postnet'].add(tf.keras.layers.Dropout( 148 | rate= hp_Dict['Tacotron2']['Encoder']['Conv']['Dropout_Rate'] 149 | )) 150 | 151 | self.built = True 152 | 153 | def call(self, inputs, training): 154 | ''' 155 | inputs: [encodings, mels] 156 | encoders: [Batch, T_v, V_dim] 157 | mels: [Batch, T_q, Mel_dim] 158 | ''' 159 | encodings, mels = inputs 160 | 161 | mels = mels[:, 0:-1:hp_Dict['Step_Reduction'], :] #Only use last slices of each reduction for training 162 | decodings = tf.zeros( 163 | shape=[tf.shape(encodings)[0], 1, hp_Dict['Sound']['Mel_Dim']], 164 | dtype= encodings.dtype 165 | ) # [Batch, 1, Mel * r] 166 | stops = tf.zeros( 167 | shape=[tf.shape(encodings)[0], 0], 168 | dtype= encodings.dtype 169 | ) # [Batch, 0] 170 | alignments = tf.expand_dims( # [Batch, 1, T_v] 171 | self.layer_Dict['Decoder_Step'] 
.get_initial_alignment( 172 | tf.shape(encodings)[0], 173 | tf.shape(encodings)[1], 174 | encodings.dtype 175 | ), 176 | axis= 1 177 | ) 178 | initial_state = self.layer_Dict['Decoder_Step'] .get_initial_state( 179 | batch_size= tf.shape(encodings)[0], 180 | dtype= encodings.dtype 181 | ) 182 | def body(step, decodings, stops, alignments, previous_state): 183 | mel_step = tf.cond( 184 | pred= tf.convert_to_tensor(training), 185 | true_fn= lambda: mels[:, step], 186 | false_fn= lambda: decodings[:, -1] 187 | ) 188 | 189 | decoding, stop, alignment, state = self.layer_Dict['Decoder_Step']( 190 | inputs= [encodings, mel_step, alignments[:, -1], previous_state], 191 | training= training 192 | ) 193 | 194 | decoding = tf.reshape( 195 | decoding, 196 | shape= [ 197 | -1, 198 | hp_Dict['Step_Reduction'], 199 | hp_Dict['Sound']['Mel_Dim'] 200 | ] 201 | ) #Reshape to r1 202 | 203 | decodings = tf.concat([decodings, decoding], axis= 1) 204 | stops = tf.concat([stops, stop], axis= -1) 205 | alignments = tf.concat([alignments, tf.expand_dims(alignment, axis=1)], axis= 1) 206 | 207 | return step + 1, decodings, stops, alignments, state 208 | 209 | 210 | max_Step = tf.cond( 211 | pred= tf.convert_to_tensor(training), 212 | true_fn= lambda: tf.shape(mels)[1], 213 | false_fn= lambda: hp_Dict['Max_Step'] // hp_Dict['Step_Reduction'] 214 | ) 215 | _, decodings, stops, alignments, _ = tf.while_loop( 216 | cond= lambda step, decodings, stops, alignments, previous_state: tf.less(step, max_Step), 217 | body= body, 218 | loop_vars= [0, decodings, stops, alignments, initial_state], 219 | shape_invariants= [ 220 | tf.TensorShape([]), 221 | tf.TensorShape([None, None, hp_Dict['Sound']['Mel_Dim']]), 222 | tf.TensorShape([None, None]), 223 | tf.TensorShape([None, None, None]), 224 | tf.nest.map_structure(lambda x: x.get_shape(), initial_state), 225 | ] 226 | ) 227 | decodings = decodings[:, 1:] 228 | alignments = alignments[:, 1:] 229 | 230 | post_decodings = self.layer_Dict['Postnet'](decodings) + decodings 231 | 232 | return decodings, post_decodings, stops, alignments 233 | 234 | class Vocoder_Taco1(tf.keras.Model): 235 | def __init__(self): 236 | super(Vocoder_Taco1, self).__init__() 237 | 238 | def build(self, input_shapes): 239 | self.layer_Dict = {} 240 | self.layer_Dict['CBHG'] = CBHG( 241 | convbank_stack_count= hp_Dict['Vocoder_Taco1']['CBHG']['Conv_Bank']['Stack_Count'], 242 | convbank_filters= hp_Dict['Vocoder_Taco1']['CBHG']['Conv_Bank']['Filters'], 243 | pool_size= hp_Dict['Vocoder_Taco1']['CBHG']['Pool']['Pool_Size'], 244 | pool_strides= hp_Dict['Vocoder_Taco1']['CBHG']['Pool']['Strides'], 245 | project_conv_filters= hp_Dict['Vocoder_Taco1']['CBHG']['Conv1D']['Filters'], 246 | project_conv_kernel_size= hp_Dict['Vocoder_Taco1']['CBHG']['Conv1D']['Kernel_Size'], 247 | highwaynet_count= hp_Dict['Vocoder_Taco1']['CBHG']['Highwaynet']['Count'], 248 | highwaynet_size= hp_Dict['Vocoder_Taco1']['CBHG']['Highwaynet']['Size'], 249 | rnn_size= hp_Dict['Vocoder_Taco1']['CBHG']['RNN']['Size'], 250 | rnn_zoneout_rate= hp_Dict['Vocoder_Taco1']['CBHG']['RNN']['Zoneout'], 251 | ) 252 | self.layer_Dict['Dense'] = tf.keras.layers.Dense( 253 | units= hp_Dict['Sound']['Spectrogram_Dim'] 254 | ) 255 | 256 | self.built = True 257 | 258 | def call(self, inputs, training= False): 259 | new_Tensor = self.layer_Dict['CBHG'](inputs= inputs, training= training) 260 | return self.layer_Dict['Dense'](inputs= new_Tensor) 261 | 262 | class Prenet(tf.keras.layers.Layer): 263 | def __init__(self, sizes, dropout_rate): 264 | 
super(Prenet, self).__init__() 265 | self.prenet_Count = len(sizes) 266 | self.sizes = sizes 267 | self.dropout_rate = dropout_rate 268 | 269 | def build(self, input_shapes): 270 | self.layer = tf.keras.Sequential() 271 | for size in self.sizes: 272 | self.layer.add(tf.keras.layers.Dense( 273 | units= size, 274 | activation='relu' 275 | )) 276 | self.layer.add(tf.keras.layers.Dropout( 277 | rate= self.dropout_rate 278 | )) 279 | 280 | self.built = True 281 | 282 | def call(self, inputs, training): 283 | return self.layer(inputs= inputs, training= True) #Always true 284 | 285 | class CBHG(tf.keras.layers.Layer): 286 | def __init__( 287 | self, 288 | convbank_stack_count, 289 | convbank_filters, 290 | pool_size, 291 | pool_strides, 292 | project_conv_filters, 293 | project_conv_kernel_size, 294 | highwaynet_count, 295 | highwaynet_size, 296 | rnn_size, 297 | rnn_zoneout_rate, 298 | ): 299 | self.convbank_stack_count = convbank_stack_count 300 | self.convbank_filters = convbank_filters 301 | self.pool_size = pool_size 302 | self.pool_strides = pool_strides 303 | self.project_conv_filters = project_conv_filters 304 | self.project_conv_kernel_size = project_conv_kernel_size 305 | self.highwaynet_count = highwaynet_count 306 | self.highwaynet_size = highwaynet_size 307 | self.rnn_size = rnn_size 308 | self.rnn_zoneout_rate = rnn_zoneout_rate 309 | 310 | super(CBHG, self).__init__() 311 | 312 | def build(self, input_shapes): 313 | self.layer_Dict = {} 314 | 315 | self.layer_Dict['ConvBank'] = ConvBank( 316 | stack_count= self.convbank_stack_count, 317 | filters= self.convbank_filters 318 | ) 319 | 320 | self.layer_Dict['Max_Pooling'] = tf.keras.layers.MaxPool1D( 321 | pool_size= self.pool_size, 322 | strides= self.pool_strides, 323 | padding='same' 324 | ) 325 | 326 | self.layer_Dict['Conv1D_Projection'] = tf.keras.Sequential() 327 | for index, (filters, kernel_Size) in enumerate(zip( 328 | self.project_conv_filters, 329 | self.project_conv_kernel_size 330 | )): 331 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.Conv1D( 332 | filters= filters, 333 | kernel_size= kernel_Size, 334 | padding= 'same', 335 | use_bias= False 336 | )) 337 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.BatchNormalization()) 338 | if index < len(self.project_conv_filters) - 1: 339 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.ReLU()) 340 | 341 | if input_shapes[-1] != self.project_conv_filters[-1]: 342 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.Dense( 343 | units= input_shapes[-1] 344 | )) 345 | 346 | self.layer_Dict['Highwaynet'] = tf.keras.Sequential() 347 | if input_shapes[-1] != self.highwaynet_size: 348 | self.layer_Dict['Highwaynet'].add(tf.keras.layers.Dense( 349 | units= self.highwaynet_size 350 | )) 351 | for index in range(self.highwaynet_count): 352 | self.layer_Dict['Highwaynet'].add(Highwaynet( 353 | size= self.highwaynet_size 354 | )) 355 | 356 | self.layer_Dict['RNN'] = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM( 357 | units= self.rnn_size, 358 | recurrent_dropout= self.rnn_zoneout_rate, #Paper is '0.1'. However, TF2.0 cuDNN implementation does not support that yet. 
359 | return_sequences= True 360 | )) 361 | 362 | self.built = True 363 | 364 | def call(self, inputs, training= False): 365 | new_Tensor = inputs 366 | 367 | new_Tensor = self.layer_Dict['ConvBank'](inputs= new_Tensor, training= training) 368 | 369 | new_Tensor = self.layer_Dict['Max_Pooling'](inputs= new_Tensor) 370 | 371 | new_Tensor = self.layer_Dict['Conv1D_Projection'](inputs= new_Tensor, training= training) 372 | new_Tensor = new_Tensor + inputs # Residual 373 | 374 | new_Tensor = self.layer_Dict['Highwaynet'](inputs= new_Tensor, training= training) 375 | 376 | return self.layer_Dict['RNN'](inputs= new_Tensor, training= training) 377 | 378 | 379 | class ConvBank(tf.keras.layers.Layer): 380 | def __init__(self, stack_count, filters): 381 | super(ConvBank, self).__init__() 382 | 383 | self.stack_count = stack_count 384 | self.filters = filters 385 | 386 | def build(self, input_shapes): 387 | self.layer_Dict = {} 388 | for index in range(self.stack_count): 389 | self.layer_Dict['ConvBank_{}'.format(index)] = tf.keras.Sequential() 390 | self.layer_Dict['ConvBank_{}'.format(index)].add(tf.keras.layers.Conv1D( 391 | filters= self.filters, 392 | kernel_size= index + 1, 393 | padding= 'same', 394 | use_bias= False 395 | )) 396 | self.layer_Dict['ConvBank_{}'.format(index)].add(tf.keras.layers.BatchNormalization()) 397 | self.layer_Dict['ConvBank_{}'.format(index)].add(tf.keras.layers.ReLU()) 398 | 399 | self.built = True 400 | 401 | def call(self, inputs): 402 | return tf.concat( 403 | [self.layer_Dict['ConvBank_{}'.format(index)](inputs) for index in range(self.stack_count)], 404 | axis= -1 405 | ) 406 | 407 | class Highwaynet(tf.keras.layers.Layer): 408 | def __init__(self, size): 409 | super(Highwaynet, self).__init__() 410 | self.layer_Dict = { 411 | 'Dense_Relu': tf.keras.layers.Dense( 412 | units= size, 413 | activation= 'relu' 414 | ), 415 | 'Dense_Sigmoid': tf.keras.layers.Dense( 416 | units= size, 417 | activation= 'sigmoid' 418 | ) 419 | } 420 | def call(self, inputs): 421 | h_Tensor = self.layer_Dict['Dense_Relu'](inputs) 422 | t_Tensor = self.layer_Dict['Dense_Sigmoid'](inputs) 423 | 424 | return h_Tensor * t_Tensor + inputs * (1.0 - t_Tensor) 425 | 426 | class ExponentialDecay(tf.keras.optimizers.schedules.ExponentialDecay): 427 | 428 | def __init__( 429 | self, 430 | initial_learning_rate, 431 | decay_steps, 432 | decay_rate, 433 | min_learning_rate= None, 434 | staircase=False, 435 | name=None 436 | ): 437 | super(ExponentialDecay, self).__init__( 438 | initial_learning_rate= initial_learning_rate, 439 | decay_steps= decay_steps, 440 | decay_rate= decay_rate, 441 | staircase= staircase, 442 | name= name 443 | ) 444 | 445 | self.min_learning_rate = min_learning_rate 446 | 447 | def __call__(self, step): 448 | learning_rate = super(ExponentialDecay, self).__call__(step) 449 | if self.min_learning_rate is None: 450 | return learning_rate 451 | 452 | return tf.maximum(learning_rate, self.min_learning_rate) 453 | 454 | def get_config(self): 455 | config_dict = super(ExponentialDecay, self).get_config() 456 | config_dict['min_learning_rate'] = self.min_learning_rate 457 | 458 | return config_dict 459 | 460 | # if __name__ == "__main__": 461 | # mels = tf.keras.layers.Input(shape=[None, 80], dtype= tf.float32) 462 | # tokens = tf.keras.layers.Input(shape=[None], dtype= tf.int32) 463 | # # ref_E = Reference_Encoder()(mels) 464 | # # st_L = Style_Token_Layer()(ref_E) 465 | 466 | # # print(mels) 467 | # # print(ref_E) 468 | # # print(st_L) 469 | 470 | # # enc = 
Tacotron_Encoder()(tokens) 471 | # # dec = Tacotron_Decoder()(inputs=[enc, mels]) 472 | 473 | # import numpy as np 474 | # tokens = np.random.randint(0, 33, size=(3, 52)).astype(np.int32) 475 | # mels = (np.random.rand(3, 50, 80).astype(np.float32) - 0.5) * 8 476 | # enc = Tacotron_Encoder()(inputs= tokens) 477 | # dec, _ = Tacotron_Decoder()(inputs=[enc, mels]) 478 | # spec = Vocoder_Taco1()(inputs= dec) 479 | # print(enc.get_shape()) 480 | # print(dec.get_shape()) 481 | # print(spec.get_shape()) -------------------------------------------------------------------------------- /Modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Modules/__init__.py -------------------------------------------------------------------------------- /Papers/He, Deng, He - 2019 - Robust sequence-to-sequence acoustic modeling with stepwise monotonic attention for neural TTS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/He, Deng, He - 2019 - Robust sequence-to-sequence acoustic modeling with stepwise monotonic attention for neural TTS.pdf -------------------------------------------------------------------------------- /Papers/Prenger, Valle, Catanzaro - 2019 - Waveglow A Flow-based Generative Network for Speech Synthesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Prenger, Valle, Catanzaro - 2019 - Waveglow A Flow-based Generative Network for Speech Synthesis.pdf -------------------------------------------------------------------------------- /Papers/Shen et al. - 2018 - Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Shen et al. - 2018 - Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions.pdf -------------------------------------------------------------------------------- /Papers/Style Tokens Unsupervised Style Modeling Control and Transfer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Style Tokens Unsupervised Style Modeling Control and Transfer.pdf -------------------------------------------------------------------------------- /Papers/Wang et al. - 2017 - Tacotron Towards end-To-end speech synthesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Wang et al. 
- 2017 - Tacotron Towards end-To-end speech synthesis.pdf -------------------------------------------------------------------------------- /Pattern_Generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json, os, time, pickle, librosa, re, argparse 3 | from concurrent.futures import ThreadPoolExecutor as PE 4 | from collections import deque 5 | from threading import Thread 6 | from random import shuffle 7 | 8 | from Audio import melspectrogram, spectrogram, preemphasis, inv_preemphasis 9 | 10 | with open('Hyper_Parameters.json', 'r') as f: 11 | hp_Dict = json.load(f) 12 | 13 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 14 | token_Index_Dict = json.load(f) 15 | 16 | using_Extension = [x.upper() for x in ['.wav', '.m4a', '.flac']] 17 | regex_Checker = re.compile('[A-Z,.?!\-\s]+') 18 | max_Worker= 10 19 | 20 | def Text_Filtering(text): 21 | remove_Letter_List = ['(', ')', '?', '!', '\'', '\"', '[', ']', ':', ';'] 22 | replace_List = [(' ', ' '), (' ,', ',')] 23 | 24 | text = text.upper().strip() 25 | for filter in remove_Letter_List: 26 | text= text.replace(filter, '') 27 | for filter, replace_STR in replace_List: 28 | text= text.replace(filter, replace_STR) 29 | 30 | text= text.strip() 31 | 32 | if len(regex_Checker.findall(text)) > 1: 33 | return None 34 | elif text.startswith('\''): 35 | return None 36 | else: 37 | return regex_Checker.findall(text)[0] 38 | 39 | def Mel_Generate(path, top_db= 60, range_Ignore = False): 40 | sig = librosa.core.load( 41 | path, 42 | sr = hp_Dict['Sound']['Sample_Rate'] 43 | )[0] 44 | sig = preemphasis(sig) 45 | sig = librosa.effects.trim(sig, top_db= top_db, frame_length= 32, hop_length= 16)[0] * 0.99 46 | sig = inv_preemphasis(sig) 47 | 48 | sig_Length = sig.shape[0] / hp_Dict['Sound']['Sample_Rate'] * 1000 #ms 49 | if not range_Ignore and (sig_Length < hp_Dict['Train']['Min_Wav_Length'] or sig_Length > hp_Dict['Train']['Max_Wav_Length']): 50 | return None 51 | 52 | return np.transpose(melspectrogram( 53 | y= sig, 54 | num_freq= hp_Dict['Sound']['Spectrogram_Dim'], 55 | hop_length= hp_Dict['Sound']['Frame_Shift'], 56 | win_length= hp_Dict['Sound']['Frame_Length'], 57 | num_mels= hp_Dict['Sound']['Mel_Dim'], 58 | sample_rate= hp_Dict['Sound']['Sample_Rate'], 59 | max_abs_value= hp_Dict['Sound']['Max_Abs_Mel'] 60 | ).astype(np.float32)) 61 | 62 | def Spectrogram_Generate(path, top_db= 60, range_Ignore = False): 63 | sig = librosa.core.load( 64 | path, 65 | sr = hp_Dict['Sound']['Sample_Rate'] 66 | )[0] 67 | sig = preemphasis(sig) 68 | sig = librosa.effects.trim(sig, top_db= top_db, frame_length= 32, hop_length= 16)[0] * 0.99 69 | sig = inv_preemphasis(sig) 70 | 71 | sig_Length = sig.shape[0] / hp_Dict['Sound']['Sample_Rate'] * 1000 #ms 72 | if not range_Ignore and (sig_Length < hp_Dict['Train']['Min_Wav_Length'] or sig_Length > hp_Dict['Train']['Max_Wav_Length']): 73 | return None 74 | 75 | return np.transpose(spectrogram( 76 | y= sig, 77 | num_freq= hp_Dict['Sound']['Spectrogram_Dim'], 78 | hop_length= hp_Dict['Sound']['Frame_Shift'], 79 | win_length= hp_Dict['Sound']['Frame_Length'], 80 | sample_rate= hp_Dict['Sound']['Sample_Rate'], 81 | max_abs_value= hp_Dict['Sound']['Max_Abs_Mel'] 82 | ).astype(np.float32)) 83 | 84 | def Pattern_File_Generate(path, text, token_Index_Dict, dataset, file_Prefix='', display_Prefix = '', top_db= 60, range_Ignore = False): 85 | mel = Mel_Generate(path, top_db, range_Ignore) 86 | 87 | if mel is None: 88 | print('[{}]'.format(display_Prefix), 
'{}'.format(path), '->', 'Ignored because of length.') 89 | return 90 | 91 | spect = Spectrogram_Generate(path, top_db, range_Ignore) 92 | 93 | token = np.array( 94 | [token_Index_Dict['']] + [token_Index_Dict[letter] for letter in text] + [token_Index_Dict['']], 95 | dtype= np.int32 96 | ) 97 | 98 | new_Pattern_Dict = { 99 | 'Token': token, 100 | 'Mel': mel, 101 | 'Spectrogram': spect, 102 | 'Text': text, 103 | 'Dataset': dataset, 104 | } 105 | 106 | pickle_File_Name = '{}.{}{}.PICKLE'.format(dataset, file_Prefix, os.path.splitext(os.path.basename(path))[0]).upper() 107 | 108 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], pickle_File_Name).replace("\\", "/"), 'wb') as f: 109 | pickle.dump(new_Pattern_Dict, f, protocol=2) 110 | 111 | print('[{}]'.format(display_Prefix), '{}'.format(path), '->', '{}'.format(pickle_File_Name)) 112 | 113 | 114 | def VCTK_Info_Load(vctk_Path, max_Count= None): 115 | vctk_Wav_Path = os.path.join(vctk_Path, 'wav48').replace('\\', '/') 116 | vctk_Txt_Path = os.path.join(vctk_Path, 'txt').replace('\\', '/') 117 | with open(os.path.join(vctk_Path, 'VCTK.NonOutlier.txt').replace('\\', '/'), 'r') as f: 118 | vctk_Non_Outlier_List = [x.strip() for x in f.readlines()] 119 | # try: 120 | # with open(os.path.join(vctk_Path, 'VCTK.NonOutlier.txt').replace('\\', '/'), 'r') as f: 121 | # vctk_Non_Outlier_List = [x.strip() for x in f.readlines()] 122 | # except: 123 | # vctk_Non_Outlier_List = None 124 | 125 | vctk_File_Path_List = [] 126 | vctk_Text_Dict = {} 127 | for root, _, file_Name_List in os.walk(vctk_Wav_Path): 128 | for file_Name in file_Name_List: 129 | if not vctk_Non_Outlier_List is None and not file_Name in vctk_Non_Outlier_List: 130 | continue 131 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 132 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 133 | continue 134 | txt_File_Path = wav_File_Path.replace(vctk_Wav_Path, vctk_Txt_Path).replace('wav', 'txt') 135 | if not os.path.exists(txt_File_Path): 136 | continue 137 | with open(txt_File_Path, 'r') as f: 138 | text = Text_Filtering(f.read().strip()) 139 | if text is None: 140 | continue 141 | vctk_File_Path_List.append(wav_File_Path) 142 | vctk_Text_Dict[wav_File_Path] = text 143 | 144 | if not max_Count is None: 145 | vctk_File_Path_List = vctk_File_Path_List[:max_Count] 146 | 147 | print('VCTK info generated: {}'.format(len(vctk_File_Path_List))) 148 | return vctk_File_Path_List, vctk_Text_Dict 149 | 150 | def LS_Info_Load(ls_Path, max_Count= None): 151 | ls_File_Path_List = [] 152 | ls_Text_Dict = {} 153 | for root, _, file_Name_List in os.walk(ls_Path): 154 | speaker, text_ID = root.replace('\\', '/').split('/')[-2:] 155 | 156 | txt_File_Path = os.path.join(ls_Path, speaker, text_ID, '{}-{}.trans.txt'.format(speaker, text_ID)).replace('\\', '/') 157 | if not os.path.exists(txt_File_Path): 158 | continue 159 | 160 | with open(txt_File_Path, 'r') as f: 161 | text_Data = f.readlines() 162 | 163 | text_Dict = {} 164 | for text_Line in text_Data: 165 | text_Line = text_Line.strip().split(' ') 166 | text_Dict[text_Line[0]] = ' '.join(text_Line[1:]) 167 | 168 | for file_Name in file_Name_List: 169 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 170 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 171 | continue 172 | text = Text_Filtering(text_Dict[os.path.splitext(os.path.basename(wav_File_Path))[0]]) 173 | if text is None: 174 | continue 175 | ls_File_Path_List.append(wav_File_Path) 176 | ls_Text_Dict[wav_File_Path] 
= text 177 | 178 | if not max_Count is None: 179 | ls_File_Path_List = ls_File_Path_List[:max_Count] 180 | 181 | print('LS info generated: {}'.format(len(ls_File_Path_List))) 182 | return ls_File_Path_List, ls_Text_Dict 183 | 184 | def TIMIT_Info_Load(timit_Path, max_Count= None): 185 | timit_File_Path_List = [] 186 | timit_Text_List_Dict = {} 187 | for root, _, file_Name_List in os.walk(timit_Path): 188 | for file_Name in file_Name_List: 189 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 190 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 191 | continue 192 | txt_File_Path = wav_File_Path.replace('WAV', 'TXT') 193 | if not os.path.exists(txt_File_Path): 194 | continue 195 | with open(txt_File_Path, 'r') as f: 196 | text = Text_Filtering(' '.join(f.read().strip().split(' ')[2:]).strip()) 197 | if text is None: 198 | continue 199 | timit_File_Path_List.append(wav_File_Path) 200 | timit_Text_List_Dict[wav_File_Path] = text 201 | 202 | if not max_Count is None: 203 | timit_File_Path_List = timit_File_Path_List[:max_Count] 204 | 205 | print('TIMIT info generated: {}'.format(len(timit_File_Path_List))) 206 | return timit_File_Path_List, timit_Text_List_Dict 207 | 208 | def LJ_Info_Load(lj_Path, max_Count= None): 209 | lj_File_Path_List = [] 210 | lj_Text_Dict = {} 211 | 212 | text_Dict = {} 213 | with open(os.path.join(lj_Path, 'metadata.csv').replace('\\', '/'), 'r', encoding= 'utf-8') as f: 214 | readlines = f.readlines() 215 | 216 | for line in readlines: 217 | key, _, text = line.strip().split('|') 218 | text = Text_Filtering(text) 219 | if text is None: 220 | continue 221 | text_Dict[key.upper()] = text 222 | 223 | for root, _, file_Name_List in os.walk(lj_Path): 224 | for file_Name in file_Name_List: 225 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 226 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 227 | continue 228 | if not os.path.splitext(file_Name)[0].upper() in text_Dict.keys(): 229 | continue 230 | lj_File_Path_List.append(wav_File_Path) 231 | lj_Text_Dict[wav_File_Path] = text_Dict[os.path.splitext(file_Name)[0].upper()] 232 | 233 | if not max_Count is None: 234 | lj_File_Path_List = lj_File_Path_List[:max_Count] 235 | 236 | print('LJ info generated: {}'.format(len(lj_File_Path_List))) 237 | return lj_File_Path_List, lj_Text_Dict 238 | 239 | def BC2013_Info_Load(bc2013_Path, max_Count= None): 240 | text_Path_List = [] 241 | for root, _, files in os.walk(bc2013_Path): 242 | for filename in files: 243 | if os.path.splitext(filename)[1].upper() != '.txt'.upper(): 244 | continue 245 | text_Path_List.append(os.path.join(root, filename).replace('\\', '/')) 246 | 247 | bc2013_File_Path_List = [] 248 | bc2013_Text_Dict = {} 249 | 250 | for text_Path in text_Path_List: 251 | wav_Path = text_Path.replace('txt', 'wav') 252 | if not os.path.exists(wav_Path): 253 | continue 254 | with open(text_Path, 'r') as f: 255 | text = Text_Filtering(f.read().strip()) 256 | if text is None: 257 | continue 258 | 259 | bc2013_File_Path_List.append(wav_Path) 260 | bc2013_Text_Dict[wav_Path] = text 261 | 262 | if not max_Count is None: 263 | bc2013_File_Path_List = bc2013_File_Path_List[:max_Count] 264 | 265 | print('BC2013 info generated: {}'.format(len(bc2013_File_Path_List))) 266 | return bc2013_File_Path_List, bc2013_Text_Dict 267 | 268 | def FV_Info_Load(fv_Path, max_Count= None): 269 | text_Path_List = [] 270 | for root, _, file_Name_List in os.walk(fv_Path): 271 | for file in file_Name_List: 272 | if 
os.path.splitext(file)[1] == '.data': 273 | text_Path_List.append(os.path.join(root, file).replace('\\', '/')) 274 | 275 | fv_File_Path_List = [] 276 | fv_Text_Dict = {} 277 | fv_Speaker_Dict = {} 278 | for text_Path in text_Path_List: 279 | speaker = text_Path.split('/')[-3].split('_')[2].upper() 280 | with open(text_Path, 'r') as f: 281 | lines = f.readlines() 282 | for line in lines: 283 | file_Path, text, _ = line.strip().split('"') 284 | 285 | file_Path = file_Path.strip().split(' ')[1] 286 | wav_File_Path = os.path.join( 287 | os.path.split(text_Path)[0].replace('etc', 'wav'), 288 | '{}.wav'.format(file_Path) 289 | ).replace('\\', '/') 290 | 291 | text = Text_Filtering(text) 292 | if text is None: 293 | continue 294 | fv_File_Path_List.append(wav_File_Path) 295 | fv_Text_Dict[wav_File_Path] = text 296 | fv_Speaker_Dict[wav_File_Path] = speaker 297 | 298 | if not max_Count is None: 299 | fv_File_Path_List = fv_File_Path_List[:max_Count] 300 | 301 | print('FV info generated: {}'.format(len(fv_File_Path_List))) 302 | return fv_File_Path_List, fv_Text_Dict, fv_Speaker_Dict 303 | 304 | 305 | 306 | def Metadata_Generate(token_Index_Dict): 307 | new_Metadata_Dict = { 308 | 'Token_Index_Dict': token_Index_Dict, 309 | 'Spectrogram_Dim': hp_Dict['Sound']['Spectrogram_Dim'], 310 | 'Mel_Dim': hp_Dict['Sound']['Mel_Dim'], 311 | 'Frame_Shift': hp_Dict['Sound']['Frame_Shift'], 312 | 'Frame_Length': hp_Dict['Sound']['Frame_Length'], 313 | 'Sample_Rate': hp_Dict['Sound']['Sample_Rate'], 314 | 'Max_Abs_Mel': hp_Dict['Sound']['Max_Abs_Mel'], 315 | 'File_List': [], 316 | 'Token_Length_Dict': {}, 317 | 'Mel_Length_Dict': {}, 318 | 'Dataset_Dict': {}, 319 | } 320 | 321 | for root, _, files in os.walk(hp_Dict['Train']['Pattern_Path']): 322 | for file in files: 323 | with open(os.path.join(root, file).replace("\\", "/"), "rb") as f: 324 | pattern_Dict = pickle.load(f) 325 | try: 326 | new_Metadata_Dict['Token_Length_Dict'][file] = pattern_Dict['Token'].shape[0] 327 | new_Metadata_Dict['Mel_Length_Dict'][file] = pattern_Dict['Mel'].shape[0] 328 | new_Metadata_Dict['Dataset_Dict'][file] = pattern_Dict['Dataset'] 329 | new_Metadata_Dict['File_List'].append(file) 330 | except: 331 | print('File \'{}\' is not correct pattern file. This file is ignored.'.format(file)) 332 | 333 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], hp_Dict['Train']['Metadata_File'].upper()).replace("\\", "/"), 'wb') as f: 334 | pickle.dump(new_Metadata_Dict, f, protocol=2) 335 | 336 | print('Metadata generate done.') 337 | 338 | if __name__ == '__main__': 339 | argParser = argparse.ArgumentParser() 340 | argParser.add_argument("-lj", "--lj_path", required=False) 341 | argParser.add_argument("-vctk", "--vctk_path", required=False) 342 | argParser.add_argument("-ls", "--ls_path", required=False) 343 | argParser.add_argument("-timit", "--timit_path", required=False) 344 | argParser.add_argument("-bc2013", "--bc2013_path", required=False) 345 | argParser.add_argument("-fv", "--fv_path", required=False) 346 | argParser.add_argument("-all", "--all_save", action='store_true') #When this parameter is False, only correct time range patterns are generated. 
347 | argParser.set_defaults(all_save = False) 348 | argParser.add_argument("-mc", "--max_count", required=False) 349 | argParser.add_argument("-mw", "--max_worker", required=False) 350 | argParser.set_defaults(max_worker = 10) 351 | argument_Dict = vars(argParser.parse_args()) 352 | 353 | if not argument_Dict['max_count'] is None: 354 | argument_Dict['max_count'] = int(argument_Dict['max_count']) 355 | 356 | total_Pattern_Count = 0 357 | 358 | if not argument_Dict['lj_path'] is None: 359 | lj_File_Path_List, lj_Text_Dict = LJ_Info_Load(lj_Path= argument_Dict['lj_path'], max_Count= argument_Dict['max_count']) 360 | total_Pattern_Count += len(lj_File_Path_List) 361 | if not argument_Dict['vctk_path'] is None: 362 | vctk_File_Path_List, vctk_Text_Dict = VCTK_Info_Load(vctk_Path= argument_Dict['vctk_path'], max_Count= argument_Dict['max_count']) 363 | total_Pattern_Count += len(vctk_File_Path_List) 364 | if not argument_Dict['ls_path'] is None: 365 | ls_File_Path_List, ls_Text_Dict = LS_Info_Load(ls_Path= argument_Dict['ls_path'], max_Count= argument_Dict['max_count']) 366 | total_Pattern_Count += len(ls_File_Path_List) 367 | if not argument_Dict['timit_path'] is None: 368 | timit_File_Path_List, timit_Text_List_Dict = TIMIT_Info_Load(timit_Path= argument_Dict['timit_path'], max_Count= argument_Dict['max_count']) 369 | total_Pattern_Count += len(timit_File_Path_List) 370 | if not argument_Dict['bc2013_path'] is None: 371 | bc2013_File_Path_List, bc2013_Text_List_Dict = BC2013_Info_Load(bc2013_Path= argument_Dict['bc2013_path'], max_Count= argument_Dict['max_count']) 372 | total_Pattern_Count += len(bc2013_File_Path_List) 373 | if not argument_Dict['fv_path'] is None: 374 | fv_File_Path_List, fv_Text_List_Dict, fv_Speaker_Dict = FV_Info_Load(fv_Path= argument_Dict['fv_path'], max_Count= argument_Dict['max_count']) 375 | total_Pattern_Count += len(fv_File_Path_List) 376 | 377 | if total_Pattern_Count == 0: 378 | raise ValueError('Total pattern count is zero.') 379 | 380 | os.makedirs(hp_Dict['Train']['Pattern_Path'], exist_ok= True) 381 | total_Generated_Pattern_Count = 0 382 | with PE(max_workers = int(argument_Dict['max_worker'])) as pe: 383 | if not argument_Dict['lj_path'] is None: 384 | for index, file_Path in enumerate(lj_File_Path_List): 385 | pe.submit( 386 | Pattern_File_Generate, 387 | file_Path, 388 | lj_Text_Dict[file_Path], 389 | token_Index_Dict, 390 | 'LJ', 391 | '', 392 | 'LJ {:05d}/{:05d} Total {:05d}/{:05d}'.format( 393 | index, 394 | len(lj_File_Path_List), 395 | total_Generated_Pattern_Count, 396 | total_Pattern_Count 397 | ), 398 | 60, 399 | argument_Dict['all_save'] 400 | ) 401 | total_Generated_Pattern_Count += 1 402 | 403 | if not argument_Dict['vctk_path'] is None: 404 | for index, file_Path in enumerate(vctk_File_Path_List): 405 | pe.submit( 406 | Pattern_File_Generate, 407 | file_Path, 408 | vctk_Text_Dict[file_Path], 409 | token_Index_Dict, 410 | 'VCTK', 411 | '', 412 | 'VCTK {:05d}/{:05d} Total {:05d}/{:05d}'.format( 413 | index, 414 | len(vctk_File_Path_List), 415 | total_Generated_Pattern_Count, 416 | total_Pattern_Count 417 | ), 418 | 15, 419 | argument_Dict['all_save'] 420 | ) 421 | total_Generated_Pattern_Count += 1 422 | 423 | if not argument_Dict['ls_path'] is None: 424 | for index, file_Path in enumerate(ls_File_Path_List): 425 | pe.submit( 426 | Pattern_File_Generate, 427 | file_Path, 428 | ls_Text_Dict[file_Path], 429 | token_Index_Dict, 430 | 'LS', 431 | '', 432 | 'LS {:05d}/{:05d} Total {:05d}/{:05d}'.format( 433 | index, 434 | len(ls_File_Path_List), 
435 | total_Generated_Pattern_Count, 436 | total_Pattern_Count 437 | ), 438 | 60, 439 | argument_Dict['all_save'] 440 | ) 441 | total_Generated_Pattern_Count += 1 442 | 443 | if not argument_Dict['timit_path'] is None: 444 | for index, file_Path in enumerate(timit_File_Path_List): 445 | pe.submit( 446 | Pattern_File_Generate, 447 | file_Path, 448 | timit_Text_List_Dict[file_Path], 449 | token_Index_Dict, 450 | 'TIMIT', 451 | '{}.'.format(file_Path.split('/')[-2]), 452 | 'TIMIT {:05d}/{:05d} Total {:05d}/{:05d}'.format( 453 | index, 454 | len(timit_File_Path_List), 455 | total_Generated_Pattern_Count, 456 | total_Pattern_Count 457 | ), 458 | 60, 459 | argument_Dict['all_save'] 460 | ) 461 | total_Generated_Pattern_Count += 1 462 | 463 | if not argument_Dict['bc2013_path'] is None: 464 | for index, file_Path in enumerate(bc2013_File_Path_List): 465 | pe.submit( 466 | Pattern_File_Generate, 467 | file_Path, 468 | bc2013_Text_List_Dict[file_Path], 469 | token_Index_Dict, 470 | 'BC2013', 471 | '{}.'.format(file_Path.split('/')[-2]), 472 | 'BC2013 {:05d}/{:05d} Total {:05d}/{:05d}'.format( 473 | index, 474 | len(bc2013_File_Path_List), 475 | total_Generated_Pattern_Count, 476 | total_Pattern_Count 477 | ), 478 | 60, 479 | argument_Dict['all_save'] 480 | ) 481 | total_Generated_Pattern_Count += 1 482 | 483 | if not argument_Dict['fv_path'] is None: 484 | for index, file_Path in enumerate(fv_File_Path_List): 485 | pe.submit( 486 | Pattern_File_Generate, 487 | file_Path, 488 | fv_Text_List_Dict[file_Path], 489 | token_Index_Dict, 490 | 'FV', 491 | '{}.'.format(fv_Speaker_Dict[file_Path]), 492 | 'FV {:05d}/{:05d} Total {:05d}/{:05d}'.format( 493 | index, 494 | len(fv_File_Path_List), 495 | total_Generated_Pattern_Count, 496 | total_Pattern_Count 497 | ), 498 | 60, 499 | argument_Dict['all_save'] 500 | ) 501 | total_Generated_Pattern_Count += 1 502 | 503 | Metadata_Generate(token_Index_Dict) -------------------------------------------------------------------------------- /ProgressBar.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright (c) 2016 Vladimir Ignatev 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining 5 | # a copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the Software 9 | # is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included 12 | # in all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 18 | # OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 19 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | 21 | # Script from 'https://gist.github.com/vladignatyev/06860ec2040cb497f0f3' 22 | 23 | import sys 24 | 25 | 26 | def progress(count, total, status=''): 27 | bar_len = 60 28 | filled_len = int(round(bar_len * count / float(total))) 29 | 30 | percents = round(100.0 * count / float(total), 1) 31 | bar = '=' * filled_len + '-' * (bar_len - filled_len) 32 | 33 | sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status)) 34 | sys.stdout.flush() # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GST Tacotron in TF2 2 | 3 | This code is an implementation of the paper 'Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis'. The algorithm is based on the following papers: 4 | 5 | ``` 6 | Wang, Y., Stanton, D., Zhang, Y., Skerry-Ryan, R. J., Battenberg, E., Shor, J., ... & Saurous, R. A. (2018). Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis. arXiv preprint arXiv:1803.09017. 7 | He, M., Deng, Y., & He, L. (2019). Robust Sequence-to-Sequence Acoustic Modeling with Stepwise Monotonic Attention for Neural TTS. arXiv preprint arXiv:1906.00672. 8 | Shen, J., Pang, R., Weiss, R. J., Schuster, M., Jaitly, N., Yang, Z., ... & Saurous, R. A. (2018, April). Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4779-4783). IEEE. 9 | Wang, Y., Skerry-Ryan, R. J., Stanton, D., Wu, Y., Weiss, R. J., Jaitly, N., ... & Le, Q. (2017). Tacotron: Towards end-to-end speech synthesis. arXiv preprint arXiv:1703.10135. 10 | ``` 11 | 12 | # Update 13 | * 2020-05-02 14 | * The BN and ReLU order has been fixed (now 'BN -> ReLU', with no bias). 15 | * Frame shift and frame window are based on samples for compatibility with vocoders. 16 | * `tf.train.Checkpoint` is used to save the optimizer parameters. Thus, the step information is also saved. 17 | 18 | # Requirements 19 | Please see 'Requirements.txt'. 20 | 21 | # Structure 22 | ![Structure](./Figures/Structure.png) 23 | 24 | Currently, the model supports only the Griffin-Lim vocoder. Supporting other vocoders is one of the future works. 25 | 26 | 27 | # Used dataset 28 | The currently uploaded code is compatible with the following datasets. Datasets marked with [O] were actually used for the uploaded results. 29 | 30 | ``` 31 | [O] LJSpeech: https://keithito.com/LJ-Speech-Dataset/ 32 | [X] VCTK: https://datashare.is.ed.ac.uk/handle/10283/2651 33 | [X] LibriSpeech: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ 34 | [X] TIMIT: http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 35 | [X] Blizzard Challenge 2013: http://www.cstr.ed.ac.uk/projects/blizzard/ 36 | [O] FastVox: http://www.festvox.org/cmu_arctic/index.html 37 | ``` 38 | 39 | # Hyper parameters 40 | Before proceeding, please set the pattern, inference, and checkpoint paths in 'Hyper_Parameters.json' according to your environment. 41 | 42 | 43 | 44 | * Sound 45 | * Setting basic sound parameters. 46 | 47 | * Token_JSON_Path 48 | * Setting the path of the text token dictionary. 49 | 50 | * GST 51 | * Setting the global style token modules. 52 | * If 'Use' is false, the model does not use GST and becomes plain Tacotron 2.
53 | * In 'Reference_Encoder/Conv', 'Filters', 'Kernel_Size', and 'Strides' must be lists of the same size. 54 | * In 'Style_Token/Attention', 'Size' must be divisible by 'Head'. 55 | 56 | * Tacotron1 57 | * Setting the parameters of Tacotron 1. 58 | * If 'Taco_Version' is 2, the parameters of this part will be ignored. 59 | * I recommend setting all 'Zoneout' parameters to 0.0 because cuDNN does not support recurrent_dropout yet. See the following reference. 60 | * https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM 61 | * Currently, this hyper parameter is ignored because Tacotron 1 is not implemented. 62 | 63 | * Tacotron2 64 | * Setting the parameters of Tacotron 2. 65 | * If 'Taco_Version' is 1, the parameters of this part will be ignored. 66 | * I recommend setting all 'Zoneout' parameters to 0.0 because cuDNN does not support recurrent_dropout yet. 67 | * See the following reference for details. 68 | * https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM 69 | 70 | * Step_Reduction 71 | * Setting how many output frames are generated at a single decoder step. 72 | 73 | * Max_Step 74 | * Setting the maximum number of mel or spectrogram steps during inference. 75 | 76 | * Vocoder_Taco1 77 | * Setting the parameters of the Griffin-Lim vocoder. 78 | 79 | * Train 80 | * Setting the parameters of training. 81 | 82 | * Taco_Version 83 | * Setting the Tacotron version. 84 | * Currently, this hyper parameter is ignored because Tacotron 1 is not implemented. 85 | * Use_Mixed_Precision 86 | * Setting the usage of mixed precision. 87 | * If used, the tensors are stored in 16 bits instead of 32 bits. 88 | * The weights are stored in 32 bits, so the model is compatible with checkpoints trained under a different mixed-precision setting if the rest of the parameters are the same. 89 | * Usually, this parameter makes it possible to use a larger batch size. 90 | * On unsupported machines, the speed is extremely slow. 91 | * When using it, I recommend increasing the epsilon of ADAM to 1e-4 to prevent underflow problems. 92 | * See the following reference for details. 93 | * https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/experimental/Policy 94 | * Inference_Path 95 | * Setting the inference path. 96 | * Checkpoint_Path 97 | * Setting the checkpoint path. 98 | * Inference_Cut 99 | * The figure and wav files will be cut at the stop token when this parameter is true. 100 | * Device 101 | * Setting which GPU device is used in a multi-GPU environment. 102 | * Or, if using only the CPU, please set '-1'. 103 | 104 | # Generate pattern 105 | 106 | ## Command 107 | ``` 108 | python Pattern_Generator.py [parameters] 109 | ``` 110 | 111 | ## Parameters 112 | 113 | At least one dataset must be specified (an example invocation is shown after the parameter list). 114 | 115 | * -lj 116 | * Set the path of LJSpeech. LJSpeech's patterns are generated. 117 | * -vctk 118 | * Set the path of VCTK. VCTK's patterns are generated. 119 | * -ls 120 | * Set the path of LibriSpeech. LibriSpeech's patterns are generated. 121 | * -timit 122 | * Set the path of TIMIT. TIMIT's patterns are generated. 123 | * -bc2013 124 | * Set the path of Blizzard Challenge 2013. Blizzard Challenge 2013's patterns are generated. 125 | * -fv 126 | * Set the path of FastVox. FastVox's patterns are generated. 127 | * -all 128 | * All save option. 129 | * The generator ignores the 'Train/Min_Wav_Length' and 'Train/Max_Wav_Length' hyper parameters. 130 | * If this option is not set, only patterns matching 'Train/Min_Wav_Length' and 'Train/Max_Wav_Length' are generated. 131 | * -mc 132 | * Limit the number of generated patterns of each dataset to the given value. 133 | * -mw 134 | * The number of threads used to create the patterns.
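For example, a hypothetical invocation that generates LJSpeech and FastVox patterns with ten worker threads could look like the following (the dataset paths are placeholders for your local copies):

```
python Pattern_Generator.py -lj /path/to/LJSpeech-1.1 -fv /path/to/FastVox -mw 10
```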
135 | 136 | # Inference file paths for verification during training 137 | 138 | * Inference_Sentence_for_Training.txt 139 | * The sentence list used for inference during training. 140 | * Inference_Wav_for_Training.txt 141 | * The wav paths used for inference during training. 142 | * If 'GST/Use' is false, this will be ignored. 143 | * The number of paths must be 1 or equal to the number of sentences. 144 | 145 | # Run 146 | 147 | ## Command 148 | ``` 149 | python Model.py [parameters] 150 | ``` 151 | 152 | ## Parameters 153 | 154 | * -s 155 | * Set the start step. 156 | * In TF2, there is no global step. However, to decay the learning rate, the model requires the step value. 157 | * Default is 0. 158 | 159 | 160 | # Inference 161 | 162 | 1. Run 'ipython' in the model's directory. 163 | 2. Run the following commands: 164 | ``` 165 | from Model import GST_Tacotron 166 | new_GST_Tacotron = GST_Tacotron(is_Training= False) 167 | new_GST_Tacotron.Restore() 168 | ``` 169 | 3. Set the wav path list and sentence list as in the following example: 170 | 171 | ``` 172 | sentence_List = [ 173 | 'The grass is always greener on the other side of the fence.', 174 | 'Strike while the iron is hot.', 175 | 'A creative artist works on his next composition because he was not satisfied with his previous one.', 176 | 'You cannot make an omelet without breaking a few eggs.', 177 | ] 178 | wav_List_for_GST = [ 179 | './Wav_for_Inference/FV.AWB.arctic_a0001.wav', 180 | './Wav_for_Inference/FV.JMK.arctic_a0004.wav', 181 | './Wav_for_Inference/FV.SLT.arctic_a0007.wav', 182 | './Wav_for_Inference/LJ.LJ050-0278.wav', 183 | ] 184 | ``` 185 | __※The number of wav paths must be 1 or equal to the number of sentences.__ 186 | 187 | 188 | 4. Run the following command: 189 | ``` 190 | new_GST_Tacotron.Inference( 191 | sentence_List = sentence_List, 192 | wav_List_for_GST = wav_List_for_GST, 193 | label = 'Result' 194 | ) 195 | ``` 196 | 197 | # GST embedding inference 198 | 1. Follow steps 1 and 2 of [Inference](#Inference). 199 | 200 | 2. Set the wav path list and tag list as in the following example: 201 | ``` 202 | wav_List = [ 203 | './Wav_for_Inference/FV.AWB.arctic_a0001.wav', 204 | './Wav_for_Inference/FV.JMK.arctic_a0004.wav', 205 | './Wav_for_Inference/FV.SLT.arctic_a0007.wav', 206 | './Wav_for_Inference/LJ.LJ050-0278.wav', 207 | ] 208 | tag_List = [ 209 | 'AWB', 210 | 'JMK', 211 | 'SLT', 212 | 'LJ', 213 | ] 214 | ``` 215 | __※The two lists must have the same length.__ 216 | 217 | 3. Run the following command: 218 | 219 | * You can take the outputs as numpy arrays. 220 | 221 | ``` 222 | mels, stops, spectrograms, alignments = new_GST_Tacotron.Inference_GST(wav_List, tag_List) 223 | ``` 224 | 225 | 226 | 4. The result is saved as a text file in the inference directory. You can generate the t-SNE analysis graph with the [R script](./R_Script/TSNE.R). 227 | 228 | 229 | # Result 230 | * The following results are based on the checkpoint at 38000 steps with a batch size of 40 (43.77 epochs). 231 | * In the figures, the vertical line marks stop detection. 232 | * All speakers are distinguishable. 233 | * Voice quality is not perfect, but I attribute this to insufficient training steps and the use of Griffin-Lim rather than a neural vocoder. 234 | * I stopped training this model. I will focus on generating and attaching a vocoder. 235 | 236 | ## Mel for GST: FastVox AWB A0001 237 | * Sentence: The grass is always greener on the other side of the fence.
238 | 239 | [Wav_IDX_0](./Example_Results/Wav/20200505.214958.IDX_0.WAV) 240 | ![Figure_IDX_0](./Example_Results/Figures/20200505.214958.IDX_0.PNG) 241 | 242 | ## Mel for GST: FastVox BDL A0002 243 | * Sentence: Strike while the iron is hot. 244 | 245 | [Wav_IDX_1](./Example_Results/Wav/20200505.214958.IDX_1.WAV) 246 | ![Figure_IDX_1](./Example_Results/Figures/20200505.214958.IDX_1.PNG) 247 | 248 | ## Mel for GST: FastVox CLB A0003 249 | * Sentence: A creative artist works on his next composition because he was not satisfied with his previous one. 250 | 251 | [Wav_IDX_2](./Example_Results/Wav/20200505.214958.IDX_2.WAV) 252 | ![Figure_IDX_2](./Example_Results/Figures/20200505.214958.IDX_2.PNG) 253 | 254 | ## Mel for GST: FastVox JMK A0004 255 | * Sentence: You cannot make an omelet without breaking a few eggs. 256 | 257 | [Wav_IDX_3](./Example_Results/Wav/20200505.214958.IDX_3.WAV) 258 | ![Figure_IDX_3](./Example_Results/Figures/20200505.214958.IDX_3.PNG) 259 | 260 | ## Mel for GST: FastVox KSP A0005 261 | * Sentence: Death is like a fisherman who catches fish in his net and leaves them for a while in the water. The fish is still swimming but the net is around him, and the fisherman will draw him up. 262 | 263 | [Wav_IDX_4](./Example_Results/Wav/20200505.214958.IDX_4.WAV) 264 | ![Figure_IDX_4](./Example_Results/Figures/20200505.214958.IDX_4.PNG) 265 | 266 | ## Mel for GST: FastVox RMS A0006 267 | * Sentence: A man who marries a woman to educate her falls a victim to the same fallacy as the woman who marries a man to reform him. 268 | 269 | [Wav_IDX_5](./Example_Results/Wav/20200505.214958.IDX_5.WAV) 270 | ![Figure_IDX_5](./Example_Results/Figures/20200505.214958.IDX_5.PNG) 271 | 272 | ## Mel for GST: FastVox SLT A0007 273 | * Sentence: Birds of a feather flock together. 274 | 275 | [Wav_IDX_6](./Example_Results/Wav/20200505.214958.IDX_6.WAV) 276 | ![Figure_IDX_6](./Example_Results/Figures/20200505.214958.IDX_6.PNG) 277 | 278 | ## Mel for GST: LJSpeech LJ050-0278 279 | * Sentence: Too many cooks in the kitchen spoil the broth. 280 | 281 | [Wav_IDX_7](./Example_Results/Wav/20200505.214958.IDX_7.WAV) 282 | ![Figure_IDX_7](./Example_Results/Figures/20200505.214958.IDX_7.PNG) 283 | 284 | ## GST embedding t-SNE 285 | ![GST_Embedding](./Example_Results/GST/20200506.001527.GST.PNG) 286 | 287 | # Trained checkpoint 288 | 289 | [Checkpoint here](https://drive.google.com/open?id=1qcm_eUS7R2Xa7N5quD1r0Iy2qQl1r6wd) 290 | 291 | * This is the checkpoint at 38000 steps with a batch size of 24 (43.77 epochs). 292 | * The hyper-parameter file for this checkpoint is included in the zip file. 293 | 294 | 295 | # Future works 296 | 1. Vocoder attachment. (I am looking into several vocoders....) 297 | ``` 298 | Prenger, R., Valle, R., & Catanzaro, B. (2019, May). Waveglow: A flow-based generative network for speech synthesis. In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 3617-3621). IEEE. 299 | Oord, A. V. D., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., ... & Kavukcuoglu, K. (2016). Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499. 300 | Kalchbrenner, N., Elsen, E., Simonyan, K., Noury, S., Casagrande, N., Lockhart, E., ... & Kavukcuoglu, K. (2018). Efficient neural audio synthesis. arXiv preprint arXiv:1802.08435. 301 | Yamamoto, R., Song, E., & Kim, J. M. (2020, May). Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram.
In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 6199-6203). IEEE. 302 | Kumar, K., Kumar, R., de Boissiere, T., Gestin, L., Teoh, W. Z., Sotelo, J., ... & Courville, A. C. (2019). Melgan: Generative adversarial networks for conditional waveform synthesis. In Advances in Neural Information Processing Systems (pp. 14881-14892). 303 | ``` 304 | 305 | 2. Tacotron 1 module update 306 | * Original paper used the tacotron 1, not tacotron 2. 307 | * I hope to add tacotron 1 for performance comparison and more. 308 | -------------------------------------------------------------------------------- /R_Script/TSNE.R: -------------------------------------------------------------------------------- 1 | # Refer: https://statkclee.github.io/model/model-tsne.html 2 | 3 | library(tidyverse) 4 | library(Rtsne) 5 | library(tools) 6 | 7 | base_Dir <- 'D:/GST.Results/Inference/GST/' 8 | 9 | list.files(base_Dir) 10 | for(file in list.files(base_Dir)) 11 | { 12 | if (toupper(file_ext(file)) != 'TXT') 13 | { 14 | next 15 | } 16 | else if (file.exists(sprintf('%s%s', base_Dir, str_replace(file, '.TXT', '.PNG')))) 17 | { 18 | next 19 | } 20 | 21 | gst.Data <- read_delim( 22 | sprintf('%s%s', base_Dir, file), 23 | "\t", 24 | escape_double = FALSE, 25 | locale = locale(encoding = "UTF-8"), 26 | trim_ws = TRUE 27 | ) 28 | gst.TSNE <- Rtsne( 29 | gst.Data[,c(-1)], 30 | PCA = TRUE, 31 | check_duplicates = FALSE, 32 | dims = 2, 33 | max_iter = 1000, 34 | perplexity= 5 35 | ) 36 | gst.TSNE.DF <- data.frame( 37 | TSNE_x = gst.TSNE$Y[, 1], 38 | TSNE_y = gst.TSNE$Y[, 2], 39 | Data_Tag = gst.Data$Tag 40 | ) 41 | 42 | 43 | plot <- ggplot(data= gst.TSNE.DF, aes(x= TSNE_x, y= TSNE_y, color= Data_Tag)) + 44 | geom_point() + 45 | #geom_text(aes(label= Data_Tag)) + 46 | labs(title= 'GST t-SNE', x= '', y= '') + 47 | theme_bw() + 48 | theme( 49 | axis.title.x = element_blank(), 50 | axis.title.y = element_blank(), 51 | # axis.text = element_blank(), 52 | strip.text = element_text(size = 20), 53 | panel.grid=element_blank(), 54 | legend.position = 'right', 55 | plot.title = element_text(hjust = 0.5) 56 | ) 57 | 58 | 59 | ggsave( 60 | filename = sprintf('%s%s', base_Dir, str_replace(file, '.TXT', '.PNG')), 61 | plot = plot, 62 | device = "png", width = 12, height = 10, units = "cm", dpi = 300 63 | ) 64 | } -------------------------------------------------------------------------------- /R_Script/VCTK_Outlier_Checker.R: -------------------------------------------------------------------------------- 1 | library(readr) 2 | library(ggplot2) 3 | library(car) 4 | 5 | repeat_Count <- 1000 6 | base_Dir = 'D:/Python_Programming/GST_Tacotron/' 7 | 8 | vctk_Length.Data <- read_delim( 9 | sprintf('%sVCTK_Length.txt', base_Dir), 10 | "\t", 11 | escape_double = FALSE, 12 | locale = locale(encoding = "UTF-8"), 13 | trim_ws = TRUE 14 | ) 15 | 16 | vctk_Length.Sig <- vctk_Length.Data[c(-4,-5)] 17 | vctk_Length.Trim <- vctk_Length.Data[c(-3,-5)] 18 | vctk_Length.Split <- vctk_Length.Data[c(-3,-4)] 19 | 20 | vctk_Length.Sig.Plot <- ggplot(vctk_Length.Sig, aes(x= Sig_Length, y= Text_Length)) + 21 | geom_point() + 22 | labs(title=sprintf('Original Sig count: %s', nrow(vctk_Length.Sig))) + 23 | geom_smooth(method = "lm") 24 | vctk_Length.Trim.Plot <- ggplot(vctk_Length.Trim, aes(x= Trim_Length, y= Text_Length)) + 25 | geom_point() + 26 | labs(title=sprintf('Original Trim count: %s', nrow(vctk_Length.Trim))) + 27 | geom_smooth(method = "lm") 28 | vctk_Length.Split.Plot <- ggplot(vctk_Length.Split, 
aes(x= Split_Length, y= Text_Length)) + 29 | geom_point() + 30 | labs(title=sprintf('Original Split count: %s', nrow(vctk_Length.Split))) + 31 | geom_smooth(method = "lm") 32 | 33 | for (index in seq(repeat_Count)) 34 | { 35 | vctk_Length.Sig$Num <- row.names(vctk_Length.Sig) 36 | vctk_Length.Trim$Num <- row.names(vctk_Length.Trim) 37 | vctk_Length.Split$Num <- row.names(vctk_Length.Split) 38 | 39 | vctk_Length.Sig.LM <- lm( 40 | Sig_Length ~ Text_Length + I(Text_Length^2), 41 | data=vctk_Length.Sig 42 | ) 43 | vctk_Length.Trim.LM <- lm( 44 | Trim_Length ~ Text_Length + I(Text_Length^2), 45 | data=vctk_Length.Trim 46 | ) 47 | vctk_Length.Split.LM <- lm( 48 | Split_Length ~ Text_Length + I(Text_Length^2), 49 | data=vctk_Length.Split 50 | ) 51 | 52 | vctk_Length.Sig.Outlier <- outlierTest(vctk_Length.Sig.LM) 53 | vctk_Length.Trim.Outlier <- outlierTest(vctk_Length.Trim.LM) 54 | vctk_Length.Split.Outlier <- outlierTest(vctk_Length.Split.LM) 55 | 56 | vctk_Length.Sig$Outlier <- vctk_Length.Sig$Num %in% as.numeric(names(vctk_Length.Sig.Outlier$p)) 57 | vctk_Length.Trim$Outlier <- vctk_Length.Trim$Num %in% as.numeric(names(vctk_Length.Trim.Outlier$p)) 58 | vctk_Length.Split$Outlier <- vctk_Length.Split$Num %in% as.numeric(names(vctk_Length.Split.Outlier$p)) 59 | 60 | vctk_Length.Sig <- subset(vctk_Length.Sig, !Outlier) 61 | vctk_Length.Trim <- subset(vctk_Length.Trim, !Outlier) 62 | vctk_Length.Split <- subset(vctk_Length.Split, !Outlier) 63 | } 64 | 65 | vctk_Length.Sig.Plot.Remove_Outlier <- ggplot(vctk_Length.Sig, aes(x= Sig_Length, y= Text_Length)) + 66 | geom_point() + 67 | labs(title=sprintf('Outlier removed Sig count: %s', nrow(vctk_Length.Sig))) + 68 | geom_smooth(method = "lm") 69 | vctk_Length.Trim.Plot.Remove_Outlier <- ggplot(vctk_Length.Trim, aes(x= Trim_Length, y= Text_Length)) + 70 | geom_point() + 71 | labs(title=sprintf('Outlier removed Trim count: %s', nrow(vctk_Length.Trim))) + 72 | geom_smooth(method = "lm") 73 | vctk_Length.Split.Plot.Remove_Outlier <- ggplot(vctk_Length.Split, aes(x= Split_Length, y= Text_Length)) + 74 | geom_point() + 75 | labs(title=sprintf('Outlier removed Split count: %s', nrow(vctk_Length.Split))) + 76 | geom_smooth(method = "lm") 77 | 78 | 79 | 80 | ggsave( 81 | filename = sprintf('%sSig.Original.png', base_Dir), 82 | plot = vctk_Length.Sig.Plot, 83 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 84 | ) 85 | ggsave( 86 | filename = sprintf('%sTrim.Original.png', base_Dir), 87 | plot = vctk_Length.Trim.Plot, 88 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 89 | ) 90 | ggsave( 91 | filename = sprintf('%sSplit.Original.png', base_Dir), 92 | plot = vctk_Length.Split.Plot, 93 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 94 | ) 95 | ggsave( 96 | filename = sprintf('%sSig.RemoveOutlier.png', base_Dir), 97 | plot = vctk_Length.Sig.Plot.Remove_Outlier, 98 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 99 | ) 100 | ggsave( 101 | filename = sprintf('%sTrim.RemoveOutlier.png', base_Dir), 102 | plot = vctk_Length.Trim.Plot.Remove_Outlier, 103 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 104 | ) 105 | ggsave( 106 | filename = sprintf('%sSplit.RemoveOutlier.png', base_Dir), 107 | plot = vctk_Length.Split.Plot.Remove_Outlier, 108 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 109 | ) 110 | 111 | write.table(vctk_Length.Trim[c(1)], sprintf('%svctk_nonoutlier.txt', base_Dir),sep='\t', row.names=FALSE, quote= FALSE) 112 | 
-------------------------------------------------------------------------------- /Requirements.txt: -------------------------------------------------------------------------------- 1 | librosa>=0.7.2 2 | matplotlib>=3.1.1 3 | tensorflow>=2.1.2 4 | -------------------------------------------------------------------------------- /Token_Index_Dict.ENG.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "": 1, 4 | " ": 2, 5 | "!": 3, 6 | ",": 4, 7 | "-": 5, 8 | ".": 6, 9 | "?": 7, 10 | "A": 8, 11 | "B": 9, 12 | "C": 10, 13 | "D": 11, 14 | "E": 12, 15 | "F": 13, 16 | "G": 14, 17 | "H": 15, 18 | "I": 16, 19 | "J": 17, 20 | "K": 18, 21 | "L": 19, 22 | "M": 20, 23 | "N": 21, 24 | "O": 22, 25 | "P": 23, 26 | "Q": 24, 27 | "R": 25, 28 | "S": 26, 29 | "T": 27, 30 | "U": 28, 31 | "V": 29, 32 | "W": 30, 33 | "X": 31, 34 | "Y": 32, 35 | "Z": 33 36 | } -------------------------------------------------------------------------------- /Wav_for_Inference/FV.AWB.arctic_a0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.AWB.arctic_a0001.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.BDL.arctic_a0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.BDL.arctic_a0002.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.CLB.arctic_a0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.CLB.arctic_a0003.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.JMK.arctic_a0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.JMK.arctic_a0004.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.KSP.arctic_a0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.KSP.arctic_a0005.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.RMS.arctic_a0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.RMS.arctic_a0006.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.SLT.arctic_a0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.SLT.arctic_a0007.wav -------------------------------------------------------------------------------- /Wav_for_Inference/LJ.LJ050-0278.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/LJ.LJ050-0278.wav --------------------------------------------------------------------------------
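A minimal sketch for inspecting the generated patterns, assuming pattern generation has already been run with the default 'Hyper_Parameters.json' referenced by the code above. It only relies on the keys that Pattern_Generator.py actually writes ('Token', 'Mel', 'Spectrogram', 'Text', 'Dataset', and the metadata's 'File_List'); it is not part of the repository itself.

```
import json, os, pickle

# Read the same hyper parameter file that Pattern_Generator.py uses.
with open('Hyper_Parameters.json', 'r') as f:
    hp_Dict = json.load(f)

pattern_Path = hp_Dict['Train']['Pattern_Path']

# Metadata pickle written by Metadata_Generate().
with open(os.path.join(pattern_Path, hp_Dict['Train']['Metadata_File'].upper()).replace('\\', '/'), 'rb') as f:
    metadata_Dict = pickle.load(f)

print('Pattern count:', len(metadata_Dict['File_List']))

# Each pattern pickle stores 'Token', 'Mel', 'Spectrogram', 'Text', and 'Dataset'.
with open(os.path.join(pattern_Path, metadata_Dict['File_List'][0]).replace('\\', '/'), 'rb') as f:
    pattern_Dict = pickle.load(f)

print('Text:', pattern_Dict['Text'])
print('Dataset:', pattern_Dict['Dataset'])
print('Token shape:', pattern_Dict['Token'].shape)              # [Token_Length]
print('Mel shape:', pattern_Dict['Mel'].shape)                  # [Mel_Length, Mel_Dim]
print('Spectrogram shape:', pattern_Dict['Spectrogram'].shape)  # [Spec_Length, Spectrogram_Dim]
```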