├── .gitignore ├── .vscode └── settings.json ├── Audio.py ├── Example_Results ├── Figures │ ├── 20200505.214958.IDX_0.PNG │ ├── 20200505.214958.IDX_1.PNG │ ├── 20200505.214958.IDX_2.PNG │ ├── 20200505.214958.IDX_3.PNG │ ├── 20200505.214958.IDX_4.PNG │ ├── 20200505.214958.IDX_5.PNG │ ├── 20200505.214958.IDX_6.PNG │ └── 20200505.214958.IDX_7.PNG ├── GST │ └── 20200506.001527.GST.PNG └── Wav │ ├── 20200505.214958.IDX_0.WAV │ ├── 20200505.214958.IDX_1.WAV │ ├── 20200505.214958.IDX_2.WAV │ ├── 20200505.214958.IDX_3.WAV │ ├── 20200505.214958.IDX_4.WAV │ ├── 20200505.214958.IDX_5.WAV │ ├── 20200505.214958.IDX_6.WAV │ └── 20200505.214958.IDX_7.WAV ├── Feeder.py ├── Figures └── Structure.png ├── Get_Path.py ├── Hyper_Parameters.json ├── Inference_Sentence_for_Training.txt ├── Inference_Wav_for_Training.txt ├── LICENSE ├── LICENSE.txt ├── Model.py ├── Modules ├── Attention │ ├── Layers.py │ ├── Steps.py │ └── __init__.py ├── GST.py ├── Taco2.py └── __init__.py ├── Papers ├── He, Deng, He - 2019 - Robust sequence-to-sequence acoustic modeling with stepwise monotonic attention for neural TTS.pdf ├── Prenger, Valle, Catanzaro - 2019 - Waveglow A Flow-based Generative Network for Speech Synthesis.pdf ├── Shen et al. - 2018 - Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions.pdf ├── Style Tokens Unsupervised Style Modeling Control and Transfer.pdf └── Wang et al. - 2017 - Tacotron Towards end-To-end speech synthesis.pdf ├── Pattern_Generator.py ├── ProgressBar.py ├── README.md ├── R_Script ├── TSNE.R └── VCTK_Outlier_Checker.R ├── Requirements.txt ├── Token_Index_Dict.ENG.json ├── Wav_for_Inference ├── FV.AWB.arctic_a0001.wav ├── FV.BDL.arctic_a0002.wav ├── FV.CLB.arctic_a0003.wav ├── FV.JMK.arctic_a0004.wav ├── FV.KSP.arctic_a0005.wav ├── FV.RMS.arctic_a0006.wav ├── FV.SLT.arctic_a0007.wav └── LJ.LJ050-0278.wav └── vctk_nonoutlier.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | Temp.py 131 | Temp1.py 132 | Temp/stepwise.py 133 | Wav_for_Inference/BC2013.CB-20K1-01-01.wav 134 | Wav_for_Inference/LJ.LJ001-0001.wav 135 | Wav_for_Inference/VCTK.p376_001.wav 136 | .vscode/settings.json 137 | Hyper_Parameters.CP.json 138 | nonoutlier.txt 139 | Sig.Original.png 140 | Sig.RemoveOutlier.png 141 | Split.Original.png 142 | Split.RemoveOutlier.png 143 | Trim.Original.png 144 | Trim.RemoveOutlier.png 145 | VCTK_Length.txt 146 | Bak/.gitignore 147 | Bak/Attention_Modules.py 148 | Bak/Audio.py 149 | Bak/Feeder.py 150 | Bak/Hyper_Parameters.json 151 | Bak/Inference_Sentence_for_Training.txt 152 | Bak/Inference_Wav_for_Training.txt 153 | Bak/LICENSE 154 | Bak/Model.py 155 | Bak/Modules.py 156 | Bak/Pattern_Generator.py 157 | Bak/ProgressBar.py 158 | Bak/README.md 159 | Bak/Token_Index_Dict.ENG.json 160 | Bak/VCTK_Outlier_Checker.R 161 | Bak/DCA.py 162 | Bak/Taco1_Modules.py 163 | Figures/Figure.pptx 164 | RAdam.py 165 | R_Script/Token_analysis.R 166 | Get_Path.py 167 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "C:\\Users\\Heejo\\Anaconda3\\python.exe" 3 | } -------------------------------------------------------------------------------- /Audio.py: -------------------------------------------------------------------------------- 1 | # https://github.com/keithito/tacotron/blob/master/util/audio.py 2 | # https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/audio/__init__.py 3 | # I only changed the hparams to usual parameters from oroginal code. 
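#
# Rough pipeline implemented below: preemphasis -> STFT magnitude -> (optional mel
# projection) -> dB conversion -> clipping/normalization, plus Griffin-Lim for the
# inverse direction. A minimal usage sketch, assuming a 16 kHz mono wav and the
# values from Hyper_Parameters.json ('example.wav' is a hypothetical file name):
#
#   y, _ = librosa.load('example.wav', sr= 16000)
#   mel = melspectrogram(
#       y, num_freq= 513, hop_length= 256, win_length= 1024,
#       num_mels= 80, sample_rate= 16000, max_abs_value= 4
#       )
#   # mel.shape == (80, n_frames); values are clipped to [-4, 4] by _symmetric_normalize.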
4 | 5 | import numpy as np 6 | from scipy import signal 7 | import librosa.filters 8 | import librosa 9 | 10 | 11 | def preemphasis(x, preemphasis = 0.97): 12 | return signal.lfilter([1, -preemphasis], [1], x) 13 | 14 | def inv_preemphasis(x, preemphasis = 0.97): 15 | return signal.lfilter([1], [1, -preemphasis], x) 16 | 17 | 18 | def spectrogram(y, num_freq, hop_length, win_length, sample_rate, ref_level_db = 20, max_abs_value = None, spectral_subtract= False): 19 | M = _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract) 20 | S = _amp_to_db(M) - ref_level_db 21 | return _normalize(S) if max_abs_value is None else _symmetric_normalize(S, max_abs_value= max_abs_value) 22 | 23 | def inv_spectrogram(spectrogram, num_freq, hop_length, win_length, sample_rate, ref_level_db = 20, power = 1.5, max_abs_value = None, griffin_lim_iters= 60): 24 | '''Converts spectrogram to waveform using librosa''' 25 | spectrogram = _denormalize(spectrogram) if max_abs_value is None else _symmetric_denormalize(spectrogram, max_abs_value= max_abs_value) 26 | S = _db_to_amp(spectrogram + ref_level_db) # Convert back to linear 27 | return inv_preemphasis(_griffin_lim(S ** power, num_freq, hop_length, win_length, sample_rate, griffin_lim_iters= griffin_lim_iters)) # Reconstruct phase 28 | 29 | def melspectrogram(y, num_freq, hop_length, win_length, num_mels, sample_rate, max_abs_value = None, spectral_subtract= False): 30 | M = _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract) 31 | S = _amp_to_db(_linear_to_mel(M, num_freq, num_mels, sample_rate)) 32 | return _normalize(S) if max_abs_value is None else _symmetric_normalize(S, max_abs_value= max_abs_value) 33 | 34 | def spectrogram_and_mel(y, num_freq, hop_length, win_length, sample_rate, spect_ref_level_db = 20, num_mels= 80, max_abs_mels = None, spectral_subtract= False): 35 | M = _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract) 36 | spect_S = _normalize(_amp_to_db(M) - spect_ref_level_db) 37 | mel_S = _amp_to_db(_linear_to_mel(M, num_freq, num_mels, sample_rate)) 38 | mel_S = _normalize(mel_S) if max_abs_mels is None else _symmetric_normalize(mel_S, max_abs_value= max_abs_mels) 39 | 40 | return spect_S, mel_S 41 | 42 | def mfcc(y, num_freq, num_mfcc, hop_length, win_length, sample_rate, use_energy= False): 43 | n_fft = (num_freq - 1) * 2 44 | mfcc_Array = librosa.feature.mfcc(y, sr= sample_rate, n_mfcc= num_mfcc + 1, n_fft= n_fft, hop_length= hop_length, win_length= win_length) 45 | mfcc_Array = mfcc_Array[:-1] if use_energy else mfcc_Array[1:] 46 | 47 | return mfcc_Array 48 | 49 | def _magnitude(y, num_freq, hop_length, win_length, sample_rate, spectral_subtract= False): 50 | D = _stft(preemphasis(y), num_freq, hop_length, win_length, sample_rate) 51 | M = np.abs(D) 52 | if spectral_subtract: 53 | M = np.clip(M - np.mean(M, axis= 1, keepdims= True) / 10, a_min= 0.0, a_max= np.inf) 54 | 55 | return M 56 | 57 | def _griffin_lim(S, num_freq, hop_length, win_length, sample_rate, griffin_lim_iters = 60): 58 | '''librosa implementation of Griffin-Lim 59 | Based on https://github.com/librosa/librosa/issues/434 60 | ''' 61 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 62 | S_complex = np.abs(S).astype(np.complex) 63 | y = _istft(S_complex * angles, num_freq, hop_length, win_length, sample_rate) 64 | 65 | for _ in range(griffin_lim_iters): 66 | angles = np.exp(1j * np.angle(_stft(y, num_freq, hop_length, win_length, sample_rate))) 67 | y = _istft(S_complex * angles, num_freq, 
hop_length, win_length, sample_rate) 68 | return y 69 | 70 | def _stft(y, num_freq, hop_length, win_length, sample_rate): 71 | n_fft = (num_freq - 1) * 2 72 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 73 | 74 | def _istft(y, num_freq, hop_length, win_length, sample_rate): 75 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 76 | 77 | def _linear_to_mel(spectrogram, num_freq, num_mels, sample_rate): 78 | _mel_basis = _build_mel_basis(num_freq, num_mels, sample_rate) 79 | return np.dot(_mel_basis, spectrogram) 80 | 81 | def _build_mel_basis(num_freq, num_mels, sample_rate): 82 | n_fft = (num_freq - 1) * 2 83 | return librosa.filters.mel(sample_rate, n_fft, n_mels=num_mels) 84 | 85 | 86 | def _amp_to_db(x): 87 | return 20 * np.log10(np.maximum(1e-5, x)) 88 | 89 | def _db_to_amp(x): 90 | return np.power(10.0, x * 0.05) 91 | 92 | def _normalize(S, min_level_db = -100): 93 | return np.clip((S - min_level_db) / -min_level_db, 0, 1) 94 | 95 | def _symmetric_normalize(S, min_level_db = -100, max_abs_value = 4): 96 | return np.clip((2 * max_abs_value) * ((S - min_level_db) / (-min_level_db)) - max_abs_value, -max_abs_value, max_abs_value) 97 | 98 | def _denormalize(S, min_level_db = -100): 99 | return (np.clip(S, 0, 1) * -min_level_db) + min_level_db 100 | 101 | def _symmetric_denormalize(S, min_level_db = -100, max_abs_value = 4): 102 | return ((np.clip(S, -max_abs_value, max_abs_value) + max_abs_value) / (2 * max_abs_value) * -min_level_db) + min_level_db -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_0.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_0.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_1.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_2.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_3.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_4.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_5.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_5.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_6.PNG -------------------------------------------------------------------------------- /Example_Results/Figures/20200505.214958.IDX_7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Figures/20200505.214958.IDX_7.PNG -------------------------------------------------------------------------------- /Example_Results/GST/20200506.001527.GST.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/GST/20200506.001527.GST.PNG -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_0.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_0.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_1.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_1.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_2.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_2.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_3.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_3.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_4.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_4.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_5.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_5.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_6.WAV: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_6.WAV -------------------------------------------------------------------------------- /Example_Results/Wav/20200505.214958.IDX_7.WAV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Example_Results/Wav/20200505.214958.IDX_7.WAV -------------------------------------------------------------------------------- /Feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json, os, time, pickle, librosa 3 | from collections import deque 4 | from threading import Thread 5 | from random import shuffle 6 | 7 | from Pattern_Generator import Mel_Generate 8 | 9 | 10 | with open('Hyper_Parameters.json', 'r') as f: 11 | hp_Dict = json.load(f) 12 | 13 | class Feeder: 14 | def __init__(self, is_Training= False): 15 | self.is_Training = is_Training 16 | 17 | self.Metadata_Load() 18 | 19 | if self.is_Training: 20 | self.pattern_Queue = deque() 21 | pattern_Generate_Thread = Thread(target= self.Pattern_Generate) 22 | pattern_Generate_Thread.daemon = True 23 | pattern_Generate_Thread.start() 24 | 25 | def Metadata_Load(self): 26 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 27 | self.token_Index_Dict = json.load(f) 28 | 29 | if self.is_Training: 30 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], hp_Dict['Train']['Metadata_File']).replace('\\', '/'), 'rb') as f: 31 | self.metadata_Dict = pickle.load(f) 32 | 33 | if not all([ 34 | self.token_Index_Dict[key] == self.metadata_Dict['Token_Index_Dict'][key] 35 | for key in self.token_Index_Dict.keys() 36 | ]): 37 | raise ValueError('The token information of metadata information and hyper parameter is not consistent.') 38 | elif not all([ 39 | self.metadata_Dict['Spectrogram_Dim'] == hp_Dict['Sound']['Spectrogram_Dim'], 40 | self.metadata_Dict['Mel_Dim'] == hp_Dict['Sound']['Mel_Dim'], 41 | self.metadata_Dict['Frame_Shift'] == hp_Dict['Sound']['Frame_Shift'], 42 | self.metadata_Dict['Frame_Length'] == hp_Dict['Sound']['Frame_Length'], 43 | self.metadata_Dict['Sample_Rate'] == hp_Dict['Sound']['Sample_Rate'], 44 | self.metadata_Dict['Max_Abs_Mel'] == hp_Dict['Sound']['Max_Abs_Mel'], 45 | ]): 46 | raise ValueError('The metadata information and hyper parameter setting are not consistent.') 47 | 48 | def Pattern_Generate(self): 49 | min_Mel_Length = hp_Dict['Train']['Min_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000 50 | max_Mel_Length = hp_Dict['Train']['Max_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000 51 | 52 | path_List = [ 53 | (path, self.metadata_Dict['Mel_Length_Dict'][path]) 54 | for path in self.metadata_Dict['File_List'] 55 | if self.metadata_Dict['Mel_Length_Dict'][path] >= min_Mel_Length and self.metadata_Dict['Mel_Length_Dict'][path] <= max_Mel_Length 56 | ] 57 | 58 | print( 59 | 'Train pattern info', '\n', 60 | 'Total pattern count: {}'.format(len(self.metadata_Dict['Mel_Length_Dict'])), '\n', 61 | 'Use pattern count: {}'.format(len(path_List)), '\n', 62 | 'Excluded pattern count: {}'.format(len(self.metadata_Dict['Mel_Length_Dict']) - len(path_List)) 63 | ) 64 | 65 | if hp_Dict['Train']['Pattern_Sorting']: 66 | path_List = [file_Name for file_Name, _ in sorted(path_List, key=lambda x: x[1])] 67 | else: 68 | path_List = [file_Name 
for file_Name, _ in path_List] 69 | 70 | while True: 71 | if not hp_Dict['Train']['Pattern_Sorting']: 72 | shuffle(path_List) 73 | 74 | path_Batch_List = [ 75 | path_List[x:x + hp_Dict['Train']['Batch_Size']] 76 | for x in range(0, len(path_List), hp_Dict['Train']['Batch_Size']) 77 | ] 78 | if hp_Dict['Train']['Sequential_Pattern']: 79 | path_Batch_List = path_Batch_List[0:2] + list(reversed(path_Batch_List)) #Batch size의 적절성을 위한 코드. 10회 이상 되면 문제 없음 80 | else: 81 | shuffle(path_Batch_List) 82 | 83 | batch_Index = 0 84 | while batch_Index < len(path_Batch_List): 85 | if len(self.pattern_Queue) >= hp_Dict['Train']['Max_Pattern_Queue']: 86 | time.sleep(0.1) 87 | continue 88 | 89 | pattern_Count = len(path_Batch_List[batch_Index]) 90 | 91 | mel_List = [] 92 | token_List = [] 93 | spectrogram_List = [] 94 | 95 | for file_Path in path_Batch_List[batch_Index]: 96 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], file_Path).replace('\\', '/'), 'rb') as f: 97 | pattern_Dict = pickle.load(f) 98 | 99 | mel_List.append(pattern_Dict['Mel']) 100 | token_List.append(pattern_Dict['Token']) 101 | spectrogram_List.append(pattern_Dict['Spectrogram']) 102 | 103 | max_Mel_Length = max([mel.shape[0] for mel in mel_List]) 104 | max_Token_Length = max([token.shape[0] for token in token_List]) 105 | max_Spectrogram_Length = max([spect.shape[0] for spect in spectrogram_List]) 106 | 107 | new_Mel_Pattern = np.zeros( 108 | shape=(pattern_Count, max_Mel_Length, hp_Dict['Sound']['Mel_Dim']), 109 | dtype= np.float32 110 | ) 111 | new_Token_Pattern = np.zeros( 112 | shape=(pattern_Count, max_Token_Length), 113 | dtype= np.int32 114 | ) + self.token_Index_Dict[''] 115 | new_Spectrogram_Pattern = np.zeros( 116 | shape=(pattern_Count, max_Spectrogram_Length, hp_Dict['Sound']['Spectrogram_Dim']), 117 | dtype= np.float32 118 | ) 119 | 120 | for pattern_Index, (mel, token, spect) in enumerate(zip(mel_List, token_List, spectrogram_List)): 121 | new_Mel_Pattern[pattern_Index, :mel.shape[0]] = mel 122 | new_Token_Pattern[pattern_Index, :token.shape[0]] = token 123 | new_Spectrogram_Pattern[pattern_Index, :spect.shape[0]] = spect 124 | 125 | new_Mel_Pattern = np.hstack([ 126 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32), 127 | new_Mel_Pattern 128 | ]) #initial frame 129 | new_Spectrogram_Pattern = np.hstack([ 130 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Spectrogram_Dim']), dtype= np.float32), 131 | new_Spectrogram_Pattern 132 | ]) #initial frame 133 | 134 | padded_Length = np.maximum(new_Mel_Pattern.shape[1], new_Spectrogram_Pattern.shape[1]) 135 | padded_Length = int(np.ceil(padded_Length / hp_Dict['Step_Reduction']) * hp_Dict['Step_Reduction']) 136 | new_Mel_Pattern = np.hstack([ 137 | new_Mel_Pattern, 138 | np.zeros(shape=(pattern_Count, padded_Length - new_Mel_Pattern.shape[1] + 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32) 139 | ]) # +1 is initial frame. This frame is removed when loss calc. 140 | new_Spectrogram_Pattern = np.hstack([ 141 | new_Spectrogram_Pattern, 142 | np.zeros(shape=(pattern_Count, padded_Length - new_Spectrogram_Pattern.shape[1] + 1, hp_Dict['Sound']['Spectrogram_Dim']), dtype= np.float32), 143 | ]) # +1 is initial frame. This frame is removed when loss calc. 
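                # Worked example of the padding above (illustrative numbers, not from the
                # shipped config, where Step_Reduction is 1): with Step_Reduction == 2 and a
                # longest mel of 99 frames, the prepended initial frame makes
                # new_Mel_Pattern.shape[1] == 100, padded_Length == ceil(100 / 2) * 2 == 100,
                # and the final width is padded_Length + 1 == 101; the extra frame is the
                # initial frame that is dropped again when the loss is computed.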
144 | 145 | self.pattern_Queue.append({ 146 | 'mels': new_Mel_Pattern, 147 | 'mel_lengths': np.array([mel.shape[0] for mel in mel_List], dtype=np.int32), 148 | 'tokens': new_Token_Pattern, 149 | 'token_lengths': np.array([token.shape[0] for token in token_List], dtype=np.int32), 150 | 'spectrograms': new_Spectrogram_Pattern, 151 | 'spectrogram_lengths': np.array([spect.shape[0] for spect in spectrogram_List], dtype=np.int32), 152 | }) 153 | 154 | batch_Index += 1 155 | 156 | def Get_Pattern(self): 157 | while len(self.pattern_Queue) == 0: #When training speed is faster than making pattern, model should be wait. 158 | time.sleep(0.01) 159 | return self.pattern_Queue.popleft() 160 | 161 | def Get_Inference_Pattern(self, sentence_List, wav_List_for_GST= None): 162 | pattern_Count = len(sentence_List) 163 | 164 | sentence_List = [sentence.upper().strip() for sentence in sentence_List] 165 | 166 | token_List = [ 167 | np.array( 168 | [self.token_Index_Dict['']] + 169 | [self.token_Index_Dict[letter] for letter in sentence] + 170 | [self.token_Index_Dict['']], 171 | dtype= np.int32 172 | ) 173 | for sentence in sentence_List 174 | ] 175 | max_Token_Length = max([token.shape[0] for token in token_List]) 176 | 177 | new_Token_Pattern = np.zeros( 178 | shape=(pattern_Count, max_Token_Length), 179 | dtype= np.int32 180 | ) + self.token_Index_Dict[''] 181 | 182 | new_Initial_Mel_Pattern = np.zeros( 183 | shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), 184 | dtype= np.float32 185 | ) 186 | 187 | for pattern_Index, token in enumerate(token_List): 188 | new_Token_Pattern[pattern_Index, :token.shape[0]] = token 189 | 190 | pattern_Dict = { 191 | 'tokens': new_Token_Pattern, 192 | 'token_lengths': np.array([token.shape[0] for token in token_List], dtype=np.int32), 193 | 'initial_mels': new_Initial_Mel_Pattern 194 | } 195 | 196 | if hp_Dict['GST']['Use']: 197 | if wav_List_for_GST is None: 198 | print('GST is enabled, but no wav information.') 199 | return 200 | if not len(wav_List_for_GST) in [1, pattern_Count]: 201 | print('The length of wav_List_for_GST must be 1 or same to the length of sentence_List and wav_List_for_GST must be same.') 202 | return 203 | 204 | if len(wav_List_for_GST) == 1: 205 | mel = Mel_Generate(wav_List_for_GST[0], top_db= 60, range_Ignore= True) 206 | new_Mel_Pattern_for_GST = np.stack([mel] * pattern_Count, axis= 0) 207 | new_Mel_Length_for_GST = np.array([mel.shape[0]] * pattern_Count, dtype= np.int32) 208 | else: 209 | mel_List = [Mel_Generate(path, top_db= 15, range_Ignore= True) for path in wav_List_for_GST] 210 | max_Mel_Length = max([mel.shape[0] for mel in mel_List]) 211 | new_Mel_Pattern_for_GST = np.zeros( 212 | shape=(pattern_Count, max_Mel_Length, hp_Dict['Sound']['Mel_Dim']), 213 | dtype= np.float32 214 | ) 215 | for pattern_Index, mel in enumerate(mel_List): 216 | new_Mel_Pattern_for_GST[pattern_Index, :mel.shape[0]] = mel 217 | 218 | new_Mel_Length_for_GST = np.array([mel.shape[0] for mel in mel_List], dtype=np.int32) 219 | 220 | # GST does not need an initial frame. 
But for the same pattern input as the training, I add an initial frame 221 | pattern_Dict['mels_for_gst'] = np.hstack([ 222 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32), 223 | new_Mel_Pattern_for_GST 224 | ]) 225 | pattern_Dict['mel_lengths_for_gst'] = new_Mel_Length_for_GST 226 | 227 | return pattern_Dict 228 | 229 | def Get_Inference_GST_Pattern(self, wav_List): 230 | pattern_Count = len(wav_List) 231 | 232 | mel_List = [Mel_Generate(path, top_db= 60, range_Ignore= True) for path in wav_List] 233 | max_Mel_Length = max([mel.shape[0] for mel in mel_List]) 234 | new_Mel_Pattern = np.zeros( 235 | shape=(pattern_Count, max_Mel_Length, hp_Dict['Sound']['Mel_Dim']), 236 | dtype= np.float32 237 | ) 238 | for pattern_Index, mel in enumerate(mel_List): 239 | new_Mel_Pattern[pattern_Index, :mel.shape[0]] = mel 240 | 241 | new_Mel_Length = np.array([mel.shape[0] for mel in mel_List], dtype=np.int32) 242 | 243 | # GST does not need an initial frame. But for the same pattern input as the training, I add an initial frame 244 | pattern_Dict = { 245 | 'mels_for_gst': np.hstack([ 246 | np.zeros(shape=(pattern_Count, 1, hp_Dict['Sound']['Mel_Dim']), dtype= np.float32), 247 | new_Mel_Pattern 248 | ]), 249 | 'mel_lengths_for_gst': new_Mel_Length 250 | } 251 | 252 | return pattern_Dict 253 | 254 | 255 | if __name__ == "__main__": 256 | new_Feeder = Feeder(is_Training= True) 257 | x = new_Feeder.Get_Pattern() 258 | 259 | print(x['mels'].shape) 260 | print(x['spectrograms'].shape) 261 | print(x['tokens'].shape) 262 | print(x['mel_lengths'].shape) 263 | print(x['spectrogram_lengths'].shape) 264 | print(x['token_lengths'].shape) 265 | print(x['tokens']) 266 | 267 | print('######################################################') 268 | 269 | x = new_Feeder.Get_Inference_Pattern(sentence_List= [ 270 | 'The grass is always greener on the other side of the fence.', 271 | 'Strike while the iron is hot.' 
272 | ]) 273 | print(x['initial_mels'].shape) 274 | print(x['tokens'].shape) 275 | print(x['token_lengths'].shape) 276 | print(x['tokens']) 277 | 278 | # while True: 279 | # time.sleep(1) 280 | # print(new_Feeder.Get_Pattern()) 281 | -------------------------------------------------------------------------------- /Figures/Structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Figures/Structure.png -------------------------------------------------------------------------------- /Get_Path.py: -------------------------------------------------------------------------------- 1 | import os 2 | from random import sample 3 | 4 | def Get_Path(sample_count= 50): 5 | path_List = [ 6 | ('LJ(F)', 'D:/Pattern/ENG/LJSpeech/wavs'), 7 | ('CLB(F)', 'D:/Pattern/ENG/FastVox/cmu_us_clb_arctic/wav'), 8 | ('SLT(F)', 'D:/Pattern/ENG/FastVox/cmu_us_slt_arctic/wav'), 9 | ('AWB(M)', 'D:/Pattern/ENG/FastVox/cmu_us_awb_arctic/wav'), 10 | ('BDL(M)', 'D:/Pattern/ENG/FastVox/cmu_us_bdl_arctic/wav'), 11 | ('JMK(M)', 'D:/Pattern/ENG/FastVox/cmu_us_jmk_arctic/wav'), 12 | ('KSP(M)', 'D:/Pattern/ENG/FastVox/cmu_us_ksp_arctic/wav'), 13 | ('RMS(M)', 'D:/Pattern/ENG/FastVox/cmu_us_rms_arctic/wav'), 14 | ] 15 | 16 | wav_List = [] 17 | tag_List = [] 18 | for tag, path in path_List: 19 | for root, _, files in os.walk(path): 20 | for file in sample(files, sample_count): 21 | wav_List.append(os.path.join(root, file).replace('\\', '/')) 22 | tag_List.append(tag) 23 | 24 | return wav_List, tag_List 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /Hyper_Parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "Sound": { 3 | "Spectrogram_Dim": 513, 4 | "Mel_Dim": 80, 5 | "Frame_Length": 1024, 6 | "Frame_Shift": 256, 7 | "Sample_Rate": 16000, 8 | "Max_Abs_Mel": 4 9 | }, 10 | 11 | "Token_JSON_Path": "Token_Index_Dict.ENG.json", 12 | 13 | "GST": { 14 | "Use": true, 15 | "Reference_Encoder": { 16 | "Conv": { 17 | "Filters": [32, 32, 64, 64, 128, 128], 18 | "Kernel_Size": [3, 3, 3, 3, 3, 3], 19 | "Strides": [2, 2, 2, 2, 2, 2] 20 | }, 21 | "RNN": { 22 | "Size": 128 23 | }, 24 | "Dense": { 25 | "Size": 128 26 | } 27 | }, 28 | "Style_Token": { 29 | "Size": 16, 30 | "Embedding": { 31 | "Size": 256 32 | }, 33 | "Attention": { 34 | "Head": 4, 35 | "Size": 128 36 | } 37 | } 38 | }, 39 | 40 | "Tacotron1": { 41 | "Encoder": { 42 | "Embedding": { 43 | "Size": 256 44 | }, 45 | "Prenet": { 46 | "Size": [256, 128], 47 | "Dropout_Rate": 0.5 48 | }, 49 | "CBHG": { 50 | "Conv_Bank": { 51 | "Stack_Count": 16, 52 | "Filters": 128 53 | }, 54 | "Pool": { 55 | "Pool_Size": 2, 56 | "Strides": 1 57 | }, 58 | "Conv1D": { 59 | "Filters": [128, 128], 60 | "Kernel_Size": [3, 3] 61 | }, 62 | "Highwaynet": { 63 | "Count": 4, 64 | "Size": 128 65 | }, 66 | "RNN": { 67 | "Size": 128, 68 | "Zoneout": 0.0 69 | } 70 | } 71 | }, 72 | "Decoder": { 73 | "Prenet": { 74 | "Size": [256, 128], 75 | "Dropout_Rate": 0.5 76 | }, 77 | "Pre_RNN": { 78 | "Size": [256], 79 | "Zoneout": 0.0 80 | }, 81 | "Attention": { 82 | "Type": ["SMA"], 83 | "Size": [128] 84 | }, 85 | "Post_RNN": { 86 | "Count": 2, 87 | "Size": 256, 88 | "Zoneout": 0.0 89 | } 90 | } 91 | }, 92 | 93 | "Tacotron2": { 94 | "Encoder": { 95 | "Embedding": { 96 | "Size": 512 97 | }, 98 | "Conv": { 99 | "Filters": [512, 512, 512], 100 | "Kernel_Size": [5, 5, 5], 101 | "Strides": [1, 1, 
1], 102 | "Dropout_Rate": 0.5 103 | }, 104 | "RNN": { 105 | "Size": 256, 106 | "Zoneout": 0.0 107 | } 108 | }, 109 | "Decoder": { 110 | "Prenet": { 111 | "Size": [256, 256], 112 | "Dropout_Rate": 0.5 113 | }, 114 | "RNN": { 115 | "Size": [1024, 1024], 116 | "Zoneout": 0.0 117 | }, 118 | "Attention": { 119 | "Type": "SMA", 120 | "Size": 128 121 | }, 122 | "Conv": { 123 | "Filters": [512, 512, 512, 512], 124 | "Kernel_Size": [5, 5, 5, 5], 125 | "Strides": [1, 1, 1, 1], 126 | "Dropout_Rate": 0.5 127 | } 128 | } 129 | }, 130 | 131 | "Step_Reduction": 1, 132 | "Max_Step": 1000, 133 | 134 | "Vocoder_Taco1": { 135 | "CBHG": { 136 | "Conv_Bank": { 137 | "Stack_Count": 8, 138 | "Filters": 256 139 | }, 140 | "Pool": { 141 | "Pool_Size": 2, 142 | "Strides": 1 143 | }, 144 | "Conv1D": { 145 | "Filters": [128, 128], 146 | "Kernel_Size": [3, 3] 147 | }, 148 | "Highwaynet": { 149 | "Count": 4, 150 | "Size": 128 151 | }, 152 | "RNN": { 153 | "Size": 256, 154 | "Zoneout": 0.0 155 | } 156 | }, 157 | "Griffin-Lim_Iter": 60 158 | }, 159 | 160 | "Train": { 161 | "Pattern_Path": "C:/Pattern/GST.Pattern.LJFV", 162 | "Metadata_File": "METADATA.PICKLE", 163 | "Batch_Size": 24, 164 | "Pattern_Sorting": true, 165 | "Min_Wav_Length": 500, 166 | "Max_Wav_Length": 10000, 167 | "Max_Pattern_Queue": 50, 168 | "Initial_Learning_Rate": 1e-3, 169 | "Min_Learning_Rate": 1e-5, 170 | "ADAM": { 171 | "Beta1": 0.9, 172 | "Beta2": 0.999, 173 | "Epsilon": 1e-7 174 | }, 175 | "Use_L2_Loss": true, 176 | "Inference_Timing": 1000, 177 | "Checkpoint_Save_Timing": 1000, 178 | 179 | "Sequential_Pattern": false, 180 | "Initial_Inference": true 181 | }, 182 | 183 | 184 | "Taco_Version": 2, 185 | "Use_Mixed_Precision": false, 186 | "Inference_Cut": true, 187 | "Inference_Path": "D:/GST.Results/Inference", 188 | "Checkpoint_Path": "D:/GST.Results/Checkpoint", 189 | "Device": "0" 190 | } -------------------------------------------------------------------------------- /Inference_Sentence_for_Training.txt: -------------------------------------------------------------------------------- 1 | The grass is always greener on the other side of the fence. 2 | Strike while the iron is hot. 3 | A creative artist works on his next composition because he was not satisfied with his previous one. 4 | You cannot make an omelet without breaking a few eggs. 5 | Death is like a fisherman who catches fish in his net and leaves them for a while in the water. The fish is still swimming but the net is around him, and the fisherman will draw him up. 6 | A man who marries a woman to educate her falls a victim to the same fallacy as the woman who marries a man to reform him. 7 | Birds of a feather flock together. 8 | Too many cooks in the kitchen spoil the broth. 
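The 'Sound' and 'Train' blocks of Hyper_Parameters.json above determine the framing quantities that Audio.py and Feeder.py derive at run time; a small sketch of those derivations, with the values implied by the configuration shown:

import json

with open('Hyper_Parameters.json', 'r') as f:
    hp_Dict = json.load(f)

# STFT size used in Audio.py: (513 - 1) * 2 == 1024
n_fft = (hp_Dict['Sound']['Spectrogram_Dim'] - 1) * 2

# Mel-frame bounds used by Feeder.Pattern_Generate (wav lengths are in milliseconds):
# 500 * 16000 / 256 / 1000 == 31.25 frames, 10000 * 16000 / 256 / 1000 == 625.0 frames
min_Mel_Length = hp_Dict['Train']['Min_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000
max_Mel_Length = hp_Dict['Train']['Max_Wav_Length'] * hp_Dict['Sound']['Sample_Rate'] / hp_Dict['Sound']['Frame_Shift'] / 1000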
-------------------------------------------------------------------------------- /Inference_Wav_for_Training.txt: -------------------------------------------------------------------------------- 1 | ./Wav_for_Inference/FV.AWB.arctic_a0001.wav 2 | ./Wav_for_Inference/FV.BDL.arctic_a0002.wav 3 | ./Wav_for_Inference/FV.CLB.arctic_a0003.wav 4 | ./Wav_for_Inference/FV.JMK.arctic_a0004.wav 5 | ./Wav_for_Inference/FV.KSP.arctic_a0005.wav 6 | ./Wav_for_Inference/FV.RMS.arctic_a0006.wav 7 | ./Wav_for_Inference/FV.SLT.arctic_a0007.wav 8 | ./Wav_for_Inference/LJ.LJ050-0278.wav -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Heejo You 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Heejo You 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.mixed_precision import experimental as mixed_precision 3 | import numpy as np 4 | import json, os, time, argparse 5 | from threading import Thread 6 | import matplotlib 7 | matplotlib.use('agg') 8 | import matplotlib.pyplot as plt 9 | from datetime import datetime 10 | 11 | from ProgressBar import progress 12 | from Feeder import Feeder 13 | from Modules.GST import Style_Token_Layer, GST_Concated_Encoder 14 | from Audio import inv_spectrogram 15 | from scipy.io import wavfile 16 | 17 | with open('Hyper_Parameters.json', 'r') as f: 18 | hp_Dict = json.load(f) 19 | 20 | # if hp_Dict['Taco_Version'] == 1: 21 | # import Modules_Taco1 as Modules 22 | # elif hp_Dict['Taco_Version'] == 2: 23 | # import Modules_Taco2 as Modules 24 | # else: 25 | # raise ValueError('Unexpected tactoron version hyperparameters: {}'.format(hp_Dict['Version'])) 26 | from Modules import Taco2 as Modules 27 | 28 | if not hp_Dict['Device'] is None: 29 | os.environ["CUDA_VISIBLE_DEVICES"]= hp_Dict['Device'] 30 | 31 | if hp_Dict['Use_Mixed_Precision']: 32 | policy = mixed_precision.Policy('mixed_float16') 33 | else: 34 | policy = mixed_precision.Policy('float32') 35 | mixed_precision.set_policy(policy) 36 | 37 | class GST_Tacotron: 38 | def __init__(self, is_Training= False): 39 | self.feeder = Feeder(is_Training= is_Training) 40 | self.Model_Generate() 41 | 42 | def Model_Generate(self): 43 | input_Dict = {} 44 | layer_Dict = {} 45 | tensor_Dict = {} 46 | 47 | input_Dict['Mel'] = tf.keras.layers.Input( 48 | shape=[None, hp_Dict['Sound']['Mel_Dim']], 49 | dtype= tf.as_dtype(policy.compute_dtype) 50 | ) 51 | input_Dict['Mel_Length'] = tf.keras.layers.Input( 52 | shape=[], 53 | dtype= tf.int32 54 | ) 55 | input_Dict['Token'] = tf.keras.layers.Input( 56 | shape=[None,], 57 | dtype= tf.int32 58 | ) 59 | input_Dict['Token_Length'] = tf.keras.layers.Input( 60 | shape=[], 61 | dtype= tf.int32 62 | ) 63 | input_Dict['Spectrogram'] = tf.keras.layers.Input( 64 | shape=[None, hp_Dict['Sound']['Spectrogram_Dim']], 65 | dtype= tf.as_dtype(policy.compute_dtype) 66 | ) 67 | input_Dict['Spectrogram_Length'] = tf.keras.layers.Input( 68 | shape=[], 69 | dtype= tf.int32 70 | ) 71 | if hp_Dict['GST']['Use']: 72 | input_Dict['GST_Mel'] = tf.keras.layers.Input( 73 | shape=[None, hp_Dict['Sound']['Mel_Dim']], 74 | dtype= tf.as_dtype(policy.compute_dtype) 75 | ) 76 | 77 | layer_Dict['Encoder'] = Modules.Encoder() 78 | layer_Dict['Decoder'] = Modules.Decoder() 79 | layer_Dict['Vocoder_Taco1'] = Modules.Vocoder_Taco1() 80 | if hp_Dict['GST']['Use']: 81 | layer_Dict['Style_Token_Layer'] = Style_Token_Layer() 82 | layer_Dict['GST_Concated_Encoder'] = GST_Concated_Encoder() 83 | 84 | 85 | tensor_Dict['Train', 'Encoder'] = layer_Dict['Encoder']( 86 | input_Dict['Token'], 87 | training= True 88 | ) 89 | if hp_Dict['GST']['Use']: 90 | tensor_Dict['Train', 'GST'] = layer_Dict['Style_Token_Layer']([ 91 | input_Dict['GST_Mel'], 92 | input_Dict['Mel_Length'] 93 | ]) 94 | tensor_Dict['Train', 'Encoder'] = layer_Dict['GST_Concated_Encoder']([ 95 | tensor_Dict['Train', 'Encoder'], 96 | tensor_Dict['Train', 'GST'] 97 | ]) 98 | 99 | tensor_Dict['Train', 'Export_Pre_Mel'], tensor_Dict['Train', 'Export_Mel'], tensor_Dict['Train', 'Stop_Token'], _ = layer_Dict['Decoder']( 100 | [tensor_Dict['Train', 'Encoder'], input_Dict['Mel']], 101 | 
training= True 102 | ) 103 | tensor_Dict['Train', 'Export_Spectrogram'] = layer_Dict['Vocoder_Taco1']( 104 | tensor_Dict['Train', 'Export_Mel'], 105 | training= True 106 | ) 107 | 108 | tensor_Dict['Inference', 'Encoder'] = layer_Dict['Encoder']( 109 | input_Dict['Token'], 110 | training= False 111 | ) 112 | if hp_Dict['GST']['Use']: 113 | tensor_Dict['Inference', 'GST'] = layer_Dict['Style_Token_Layer']([ 114 | input_Dict['GST_Mel'], 115 | input_Dict['Mel_Length'] 116 | ]) 117 | tensor_Dict['Inference', 'Encoder'] = layer_Dict['GST_Concated_Encoder']([ 118 | tensor_Dict['Inference', 'Encoder'], 119 | tensor_Dict['Inference', 'GST'] 120 | ]) 121 | 122 | _, tensor_Dict['Inference', 'Export_Mel'], tensor_Dict['Inference', 'Stop_Token'], tensor_Dict['Inference', 'Alignment'] = layer_Dict['Decoder']( 123 | [tensor_Dict['Inference', 'Encoder'], input_Dict['Mel']], 124 | training= False 125 | ) 126 | tensor_Dict['Inference', 'Export_Spectrogram'] = layer_Dict['Vocoder_Taco1']( 127 | tensor_Dict['Inference', 'Export_Mel'], 128 | training= False 129 | ) 130 | 131 | self.model_Dict = {} 132 | self.model_Dict['Train'] = tf.keras.Model( 133 | inputs=[ 134 | input_Dict['Mel'], 135 | input_Dict['Token'], 136 | input_Dict['Spectrogram'] 137 | ] + ([input_Dict['GST_Mel'], input_Dict['Mel_Length']] if hp_Dict['GST']['Use'] else []), 138 | outputs= [ 139 | tensor_Dict['Train', 'Export_Pre_Mel'], 140 | tensor_Dict['Train', 'Export_Mel'], 141 | tensor_Dict['Train', 'Stop_Token'], 142 | tensor_Dict['Train', 'Export_Spectrogram'] 143 | ] 144 | ) 145 | self.model_Dict['Inference'] = tf.keras.Model( 146 | inputs=[ 147 | input_Dict['Mel'], 148 | input_Dict['Token'] 149 | ] + ([input_Dict['GST_Mel'], input_Dict['Mel_Length']] if hp_Dict['GST']['Use'] else []), 150 | outputs= [ 151 | tensor_Dict['Inference', 'Export_Mel'], 152 | tensor_Dict['Inference', 'Stop_Token'], 153 | tensor_Dict['Inference', 'Export_Spectrogram'], 154 | tensor_Dict['Inference', 'Alignment'] 155 | ] 156 | ) 157 | 158 | self.model_Dict['Train'].summary() 159 | self.model_Dict['Inference'].summary() 160 | 161 | if hp_Dict['GST']['Use']: 162 | self.model_Dict['GST'] = tf.keras.Model( 163 | inputs= [ 164 | input_Dict['GST_Mel'], 165 | input_Dict['Mel_Length'] 166 | ], 167 | outputs= tensor_Dict['Inference', 'GST'] 168 | ) 169 | self.model_Dict['GST'].summary() 170 | 171 | learning_Rate = Modules.ExponentialDecay( 172 | initial_learning_rate= hp_Dict['Train']['Initial_Learning_Rate'], 173 | decay_steps= 50000, 174 | decay_rate= 0.1, 175 | min_learning_rate= hp_Dict['Train']['Min_Learning_Rate'], 176 | staircase= False 177 | ) 178 | 179 | self.optimizer = tf.keras.optimizers.Adam( 180 | learning_rate= learning_Rate, 181 | beta_1= hp_Dict['Train']['ADAM']['Beta1'], 182 | beta_2= hp_Dict['Train']['ADAM']['Beta2'], 183 | epsilon= hp_Dict['Train']['ADAM']['Epsilon'], 184 | ) 185 | 186 | self.checkpoint = tf.train.Checkpoint( 187 | optimizer= self.optimizer, 188 | model= self.model_Dict['Train'] 189 | ) 190 | 191 | # @tf.function( 192 | # input_signature=[ 193 | # tf.TensorSpec(shape=[None, None, hp_Dict['Sound']['Mel_Dim']], dtype= tf.as_dtype(policy.compute_dtype)), 194 | # tf.TensorSpec(shape=[None,], dtype=tf.int32), 195 | # tf.TensorSpec(shape=[None, None], dtype=tf.int32), 196 | # tf.TensorSpec(shape=[None,], dtype=tf.int32), 197 | # tf.TensorSpec(shape=[None, None, hp_Dict['Sound']['Spectrogram_Dim']], dtype= tf.as_dtype(policy.compute_dtype)), 198 | # tf.TensorSpec(shape=[None,], dtype=tf.int32) 199 | # ], 200 | # autograph= False, 201 | # 
experimental_relax_shapes= False 202 | # ) 203 | def Train_Step(self, mels, mel_lengths, tokens, token_lengths, spectrograms, spectrogram_lengths): 204 | with tf.GradientTape() as tape: 205 | pre_Mel_Logits, mel_Logits, stop_Logits, spectrogram_Logits = self.model_Dict['Train']( 206 | inputs= [mels, tokens, spectrograms] + ([mels, mel_lengths] if hp_Dict['GST']['Use'] else []), 207 | training= True 208 | ) 209 | 210 | pre_Mel_Loss = tf.reduce_mean(tf.abs(mels[:, 1:] - pre_Mel_Logits), axis= -1) 211 | mel_Loss = tf.reduce_mean(tf.abs(mels[:, 1:] - mel_Logits), axis= -1) 212 | spectrogram_Loss = tf.reduce_mean(tf.abs(spectrograms[:, 1:] - spectrogram_Logits), axis= -1) 213 | if hp_Dict['Train']['Use_L2_Loss']: 214 | mel_Loss += tf.reduce_mean(tf.pow(mels[:, 1:] - mel_Logits, 2), axis= -1) 215 | spectrogram_Loss += tf.reduce_mean(tf.pow(spectrograms[:, 1:] - spectrogram_Logits, 2), axis= -1) 216 | 217 | pre_Mel_Loss *= tf.sequence_mask( 218 | lengths= mel_lengths, 219 | maxlen= tf.shape(mel_Loss)[-1], 220 | dtype= tf.as_dtype(policy.compute_dtype) 221 | ) 222 | mel_Loss *= tf.sequence_mask( 223 | lengths= mel_lengths, 224 | maxlen= tf.shape(mel_Loss)[-1], 225 | dtype= tf.as_dtype(policy.compute_dtype) 226 | ) 227 | stop_Loss = tf.nn.sigmoid_cross_entropy_with_logits( 228 | labels= tf.sequence_mask( 229 | lengths= tf.math.ceil(mel_lengths / hp_Dict['Step_Reduction']), # stop > 0.5: Going, stop < 0.5: Done 230 | maxlen= tf.math.ceil(tf.shape(mel_Loss)[-1] / hp_Dict['Step_Reduction']), 231 | dtype= tf.as_dtype(policy.compute_dtype) 232 | ), 233 | logits= stop_Logits 234 | ) 235 | spectrogram_Loss *= tf.sequence_mask( 236 | lengths= spectrogram_lengths, 237 | maxlen= tf.shape(spectrogram_Loss)[-1], 238 | dtype= tf.as_dtype(policy.compute_dtype) 239 | ) 240 | 241 | loss = tf.reduce_mean(pre_Mel_Loss) + tf.reduce_mean(mel_Loss) + tf.reduce_mean(stop_Loss) + tf.reduce_mean(spectrogram_Loss) 242 | 243 | gradients = tape.gradient(loss, self.model_Dict['Train'].trainable_variables) 244 | self.optimizer.apply_gradients(zip(gradients, self.model_Dict['Train'].trainable_variables)) 245 | 246 | return loss 247 | 248 | # @tf.function 249 | def Inference_Step(self, tokens, token_lengths, initial_mels, mels_for_gst= None, mel_lengths_for_gst= None): 250 | mel_Logits, stop_Logits, spectrogram_Logits, alignments = self.model_Dict['Inference']( 251 | inputs= [initial_mels, tokens] + ([mels_for_gst, mel_lengths_for_gst] if hp_Dict['GST']['Use'] else []), 252 | training= False 253 | ) 254 | 255 | return mel_Logits, stop_Logits, spectrogram_Logits, alignments 256 | 257 | def Inference_GST_Step(self, mels_for_gst, mel_lengths_for_gst): 258 | if not hp_Dict['GST']['Use']: 259 | raise NotImplementedError('GST is not used') 260 | gst = self.model_Dict['GST']( 261 | inputs= [mels_for_gst, mel_lengths_for_gst], 262 | training= False 263 | ) 264 | 265 | return gst 266 | 267 | def Restore(self, checkpoint_File_Path= None): 268 | if checkpoint_File_Path is None: 269 | checkpoint_File_Path = tf.train.latest_checkpoint(hp_Dict['Checkpoint_Path']) 270 | 271 | if not os.path.exists('{}.index'.format(checkpoint_File_Path)): 272 | print('There is no checkpoint.') 273 | return 274 | 275 | self.checkpoint.restore(checkpoint_File_Path) 276 | print('Checkpoint \'{}\' is loaded.'.format(checkpoint_File_Path)) 277 | 278 | def Train(self): 279 | if not os.path.exists(os.path.join(hp_Dict['Inference_Path'], 'Hyper_Parameters.json')): 280 | os.makedirs(hp_Dict['Inference_Path'], exist_ok= True) 281 | with 
open(os.path.join(hp_Dict['Inference_Path'], 'Hyper_Parameters.json').replace("\\", "/"), "w") as f: 282 | json.dump(hp_Dict, f, indent= 4) 283 | 284 | def Save_Checkpoint(): 285 | os.makedirs(os.path.join(hp_Dict['Checkpoint_Path']).replace("\\", "/"), exist_ok= True) 286 | self.checkpoint.save( 287 | os.path.join( 288 | hp_Dict['Checkpoint_Path'], 289 | 'S_{}.CHECKPOINT.H5'.format(self.optimizer.iterations.numpy()) 290 | ).replace('\\', '/') 291 | ) 292 | 293 | def Run_Inference(): 294 | sentence_List = [] 295 | with open('Inference_Sentence_for_Training.txt', 'r') as f: 296 | for line in f.readlines(): 297 | sentence_List.append(line.strip()) 298 | 299 | if hp_Dict['GST']['Use']: 300 | wav_List_for_GST = [] 301 | with open('Inference_Wav_for_Training.txt', 'r') as f: 302 | for line in f.readlines(): 303 | wav_List_for_GST.append(line.strip()) 304 | else: 305 | wav_List_for_GST = None 306 | 307 | self.Inference(sentence_List, wav_List_for_GST) 308 | 309 | def Run_GST_Inference(): 310 | from Get_Path import Get_Path 311 | wav_List, tag_List = Get_Path(100) 312 | self.Inference_GST(wav_List, tag_List) 313 | 314 | # Save_Checkpoint() 315 | if hp_Dict['Train']['Initial_Inference']: 316 | Run_Inference() 317 | Run_GST_Inference() 318 | 319 | while True: 320 | start_Time = time.time() 321 | 322 | loss = self.Train_Step(**self.feeder.Get_Pattern()) 323 | if np.isnan(loss): 324 | raise ValueError('NaN loss') 325 | display_List = [ 326 | 'Time: {:0.3f}'.format(time.time() - start_Time), 327 | 'Step: {}'.format(self.optimizer.iterations.numpy()), 328 | 'LR: {:0.5f}'.format(self.optimizer.lr(self.optimizer.iterations.numpy() - 1)), 329 | 'Loss: {:0.5f}'.format(loss), 330 | ] 331 | print('\t\t'.join(display_List)) 332 | 333 | if self.optimizer.iterations.numpy() % hp_Dict['Train']['Checkpoint_Save_Timing'] == 0: 334 | Save_Checkpoint() 335 | 336 | if self.optimizer.iterations.numpy() % hp_Dict['Train']['Inference_Timing'] == 0: 337 | Run_Inference() 338 | 339 | if self.optimizer.iterations.numpy() % (hp_Dict['Train']['Inference_Timing'] * 10) == 0: 340 | Run_GST_Inference() 341 | 342 | def Inference(self, sentence_List, wav_List_for_GST= None, label= None): 343 | print('Inference running...') 344 | 345 | pattern_Dict = self.feeder.Get_Inference_Pattern(sentence_List, wav_List_for_GST) 346 | if pattern_Dict is None: 347 | print('Inference fail.') 348 | return 349 | mels, stops, spectrograms, alignments = self.Inference_Step( 350 | **pattern_Dict 351 | ) 352 | 353 | export_Inference_Thread = Thread( 354 | target= self.Export_Inference, 355 | args= [ 356 | sentence_List, 357 | mels.numpy(), 358 | stops.numpy(), 359 | spectrograms.numpy(), 360 | alignments.numpy(), 361 | label or datetime.now().strftime("%Y%m%d.%H%M%S") 362 | ] 363 | ) 364 | export_Inference_Thread.daemon = True 365 | export_Inference_Thread.start() 366 | 367 | return mels, stops, spectrograms, alignments 368 | 369 | def Export_Inference(self, sentence_List, mel_List, stop_List, spectrogram_List, alignment_List, label): 370 | os.makedirs(os.path.join(hp_Dict['Inference_Path'], 'Plot').replace("\\", "/"), exist_ok= True) 371 | os.makedirs(os.path.join(hp_Dict['Inference_Path'], 'Wav').replace("\\", "/"), exist_ok= True) 372 | 373 | for index, (sentence, mel, stop, spect, alignment) in enumerate(zip(sentence_List, mel_List, stop_List, spectrogram_List, alignment_List)): 374 | #matplotlib does not supprt float16 375 | mel = mel.astype(np.float32) 376 | stop = stop.astype(np.float32) 377 | spect = spect.astype(np.float32) 378 | alignment = 
alignment.astype(np.float32) 379 | 380 | slice_Index = np.argmax(stop < 0) if any(stop < 0) else stop.shape[0] # Check stop tokens 381 | 382 | new_Figure = plt.figure(figsize=(24, 6 * 5), dpi=100) 383 | plt.subplot2grid((5, 1), (0, 0)) 384 | plt.imshow(np.transpose(mel), aspect='auto', origin='lower') 385 | plt.title('Mel Sentence: {}'.format(sentence)) 386 | plt.colorbar() 387 | plt.subplot2grid((5, 1), (1, 0)) 388 | plt.imshow(np.transpose(spect), aspect='auto', origin='lower') 389 | plt.title('Spectrogram Sentence: {}'.format(sentence)) 390 | plt.colorbar() 391 | plt.subplot2grid((5, 1), (2, 0), rowspan=2) 392 | plt.imshow(np.transpose(alignment), aspect='auto', origin='lower') 393 | plt.title('Alignment Sentence: {}'.format(sentence)) 394 | plt.yticks( 395 | range(alignment.shape[1]), 396 | [''] + list(sentence) + [''], 397 | fontsize = 10 398 | ) 399 | plt.colorbar() 400 | plt.subplot2grid((5, 1), (4, 0)) 401 | plt.plot(stop) 402 | plt.axvline(x= slice_Index, linestyle='--', linewidth=1) 403 | plt.title('Stop token Sentence: {}'.format(sentence)) 404 | plt.colorbar() 405 | 406 | plt.tight_layout() 407 | plt.savefig( 408 | os.path.join(hp_Dict['Inference_Path'], 'Plot', '{}.IDX_{}.PNG'.format(label, index)).replace("\\", "/") 409 | ) 410 | plt.close(new_Figure) 411 | 412 | new_Sig = inv_spectrogram( 413 | spectrogram= np.transpose(spect[:np.maximum(1, slice_Index) * hp_Dict['Step_Reduction']]), 414 | num_freq= hp_Dict['Sound']['Spectrogram_Dim'], 415 | hop_length= hp_Dict['Sound']['Frame_Shift'], 416 | win_length= hp_Dict['Sound']['Frame_Length'], 417 | sample_rate= hp_Dict['Sound']['Sample_Rate'], 418 | max_abs_value= hp_Dict['Sound']['Max_Abs_Mel'], 419 | griffin_lim_iters= hp_Dict['Vocoder_Taco1']['Griffin-Lim_Iter'] 420 | ) 421 | wavfile.write( 422 | filename= os.path.join(hp_Dict['Inference_Path'], 'Wav', '{}.IDX_{}.WAV'.format(label, index)).replace("\\", "/"), 423 | data= (new_Sig * 32768).astype(np.int16), 424 | rate= hp_Dict['Sound']['Sample_Rate'] 425 | ) 426 | 427 | def Inference_GST(self, wav_List, tag_List, label= None): 428 | if not hp_Dict['GST']['Use']: 429 | raise NotImplementedError('GST is not used') 430 | 431 | print('GST Inference running...') 432 | gsts = self.Inference_GST_Step( 433 | **self.feeder.Get_Inference_GST_Pattern(wav_List) 434 | ) 435 | 436 | export_Inference_Thread = Thread( 437 | target= self.Export_GST, 438 | args= [ 439 | wav_List, 440 | tag_List, 441 | gsts, 442 | label or datetime.now().strftime("%Y%m%d.%H%M%S") 443 | ] 444 | ) 445 | export_Inference_Thread.daemon = True 446 | export_Inference_Thread.start() 447 | 448 | def Export_GST(self, wav_List, tag_List, gst_List, label): 449 | os.makedirs(os.path.join(hp_Dict['Inference_Path'], 'GST').replace("\\", "/"), exist_ok= True) 450 | 451 | title_Column_List = ['Wav', 'Tag'] + ['Unit_{}'.format(x) for x in range(gst_List[0].shape[0])] 452 | export_List = ['\t'.join(title_Column_List)] 453 | for wav_Path, tag, gst in zip(wav_List, tag_List, gst_List): 454 | new_Line_List = [wav_Path, tag] + [x for x in gst] 455 | new_Line_List = ['{}'.format(x) for x in new_Line_List] 456 | export_List.append('\t'.join(new_Line_List)) 457 | 458 | with open(os.path.join(hp_Dict['Inference_Path'], 'GST', '{}.GST.TXT'.format(label)).replace("\\", "/"), 'w') as f: 459 | f.write('\n'.join(export_List)) 460 | 461 | if __name__ == '__main__': 462 | new_Model = GST_Tacotron(is_Training= True) 463 | new_Model.Restore() 464 | new_Model.Train() -------------------------------------------------------------------------------- 
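Model.py's __main__ block above only covers training; a minimal sketch of standalone inference with a restored checkpoint, using the repository's own inference lists ('Manual_Test' is just an illustrative label, which otherwise defaults to a timestamp):

from Model import GST_Tacotron

new_Model = GST_Tacotron(is_Training= False)    # is_Training=False skips the training pattern queue
new_Model.Restore()                             # latest checkpoint under hp_Dict['Checkpoint_Path']
new_Model.Inference(
    sentence_List= ['Strike while the iron is hot.'],
    wav_List_for_GST= ['./Wav_for_Inference/LJ.LJ050-0278.wav'],    # required because GST.Use is true
    label= 'Manual_Test'
    )

Note that Export_Inference runs on a daemon thread, so a standalone script should stay alive (for example, with a short sleep loop) until the plots and wavs appear under Inference_Path.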
/Modules/Attention/Layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from scipy.special import comb, beta 4 | 5 | class DotProductAttention(tf.keras.layers.Attention): 6 | ''' 7 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L182-L303 8 | Changes 9 | 1. Attention size managing 10 | 2. Getting the attention history(scores). 11 | ''' 12 | def __init__(self, size, use_scale=False, **kwargs): 13 | super(DotProductAttention, self).__init__(use_scale= use_scale, **kwargs) 14 | self.size = size 15 | self.layer_Dict = { 16 | 'Query': tf.keras.layers.Dense(size), 17 | 'Value': tf.keras.layers.Dense(size), 18 | 'Key': tf.keras.layers.Dense(size) 19 | } 20 | 21 | def call(self, inputs, mask=None): 22 | self._validate_call_args(inputs=inputs, mask=mask) 23 | q = self.layer_Dict['Query'](inputs[0]) 24 | v = self.layer_Dict['Value'](inputs[1]) 25 | k = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else v 26 | q_mask = mask[0] if mask else None 27 | v_mask = mask[1] if mask else None 28 | scores = self._calculate_scores(query=q, key=k) 29 | if v_mask is not None: 30 | # Mask of shape [batch_size, 1, Tv]. 31 | v_mask = tf.expand_dims(v_mask, axis=-2) 32 | if self.causal: 33 | # Creates a lower triangular mask, so position i cannot attend to 34 | # positions j>i. This prevents the flow of information from the future 35 | # into the past. 36 | scores_shape = tf.shape(scores) 37 | # causal_mask_shape = [1, Tq, Tv]. 38 | causal_mask_shape = tf.concat( 39 | [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], 40 | axis=0) 41 | causal_mask = _lower_triangular_mask(causal_mask_shape) 42 | else: 43 | causal_mask = None 44 | scores_mask = _merge_masks(v_mask, causal_mask) 45 | result, attention_distribution = _apply_scores(scores=scores, value=v, scores_mask=scores_mask) 46 | if q_mask is not None: 47 | # Mask of shape [batch_size, Tq, 1]. 48 | q_mask = tf.expand_dims(q_mask, axis=-1) 49 | result *= tf.cast(q_mask, dtype=result.dtype) 50 | 51 | return result, attention_distribution 52 | 53 | def _calculate_scores(self, query, key): 54 | """Calculates attention scores as a query-key dot product. 55 | Args: 56 | query: Query tensor of shape `[batch_size, Tq, dim]`. 57 | key: Key tensor of shape `[batch_size, Tv, dim]`. 58 | Returns: 59 | Tensor of shape `[batch_size, Tq, Tv]`. 60 | """ 61 | scores = tf.matmul(query, key, transpose_b=True) 62 | 63 | if self.scale is not None: 64 | scores *= self.scale 65 | return scores 66 | 67 | class BahdanauAttention(tf.keras.layers.AdditiveAttention): 68 | ''' 69 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L307-L440 70 | This is for attention size managing and getting the attention history(scores). 
71 | ''' 72 | def __init__(self, size, use_scale=False, **kwargs): 73 | super(BahdanauAttention, self).__init__(use_scale= use_scale, **kwargs) 74 | self.size = size 75 | self.layer_Dict = { 76 | 'Query': tf.keras.layers.Dense(size), 77 | 'Value': tf.keras.layers.Dense(size), 78 | 'Key': tf.keras.layers.Dense(size) 79 | } 80 | 81 | def build(self, input_shape): 82 | if self.use_scale: 83 | self.scale = self.add_weight( 84 | name='scale', 85 | shape=[self.size], 86 | initializer= tf.initializers.glorot_uniform(), 87 | dtype=self.dtype, 88 | trainable=True) 89 | else: 90 | self.scale = None 91 | 92 | self.built = True 93 | 94 | def call(self, inputs, mask=None): 95 | self._validate_call_args(inputs=inputs, mask=mask) 96 | q = self.layer_Dict['Query'](inputs[0]) 97 | v = self.layer_Dict['Value'](inputs[1]) 98 | k = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else v 99 | q_mask = mask[0] if mask else None 100 | v_mask = mask[1] if mask else None 101 | scores = self._calculate_scores(query=q, key=k) #[Batch, T_q, T_k] 102 | if v_mask is not None: 103 | # Mask of shape [batch_size, 1, Tv]. 104 | v_mask = tf.expand_dims(v_mask, axis=-2) 105 | if self.causal: 106 | # Creates a lower triangular mask, so position i cannot attend to 107 | # positions j>i. This prevents the flow of information from the future 108 | # into the past. 109 | scores_shape = tf.shape(scores) 110 | # causal_mask_shape = [1, Tq, Tv]. 111 | causal_mask_shape = tf.concat( 112 | [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], 113 | axis=0) 114 | causal_mask = _lower_triangular_mask(causal_mask_shape) 115 | else: 116 | causal_mask = None 117 | scores_mask = _merge_masks(v_mask, causal_mask) 118 | 119 | result, attention_distribution = _apply_scores(scores=scores, value=v, scores_mask=scores_mask) 120 | if q_mask is not None: 121 | # Mask of shape [batch_size, Tq, 1]. 122 | q_mask = tf.expand_dims(q_mask, axis=-1) 123 | result *= tf.cast(q_mask, dtype=result.dtype) 124 | 125 | return result, attention_distribution 126 | 127 | def _calculate_scores(self, query, key): 128 | """Calculates attention scores as a nonlinear sum of query and key. 129 | Args: 130 | query: Query tensor of shape `[batch_size, Tq, dim]`. 131 | key: Key tensor of shape `[batch_size, Tv, dim]`. 132 | Returns: 133 | Tensor of shape `[batch_size, Tq, Tv]`. 134 | """ 135 | # Reshape tensors to enable broadcasting. 136 | # Reshape into [batch_size, Tq, 1, dim]. 137 | q_reshaped = tf.expand_dims(query, axis=-2) 138 | # Reshape into [batch_size, 1, Tv, dim]. 139 | k_reshaped = tf.expand_dims(key, axis=-3) 140 | if self.use_scale: 141 | scale = self.scale 142 | else: 143 | scale = 1. 144 | return tf.reduce_sum( 145 | scale * tf.tanh(q_reshaped + k_reshaped), axis=-1) 146 | 147 | class MultiHeadAttention(tf.keras.layers.Attention): 148 | ''' 149 | Refer1: DotProductAttention 150 | Refer2: https://github.com/Kyubyong/transformer/blob/master/modules.py 151 | ''' 152 | def __init__(self, num_heads, size, use_scale=False, **kwargs): 153 | super(MultiHeadAttention, self).__init__(use_scale= use_scale, **kwargs) 154 | 155 | if size % num_heads != 0: 156 | raise ValueError('size must be divisible by num_heads. 
(\'{}\' % \'{}\' != 0)'.format(size, num_heads)) 157 | 158 | self.num_heads = num_heads 159 | self.size = size 160 | self.use_scale = use_scale 161 | 162 | def build(self, input_shape): 163 | self.layer_Dict = { 164 | 'Query': tf.keras.layers.Dense(self.size), 165 | 'Value': tf.keras.layers.Dense(self.size), 166 | 'Key': tf.keras.layers.Dense(self.size), 167 | 'Layer_Normalization': Layer_Norm() 168 | } 169 | 170 | super(MultiHeadAttention, self).build(input_shape= input_shape) 171 | 172 | def call(self, inputs, mask=None): 173 | self._validate_call_args(inputs=inputs, mask=mask) 174 | q = self.layer_Dict['Query'](inputs[0]) # [batch_size, Tq, Att_Dim] 175 | v = self.layer_Dict['Value'](inputs[1]) # [batch_size, Tv, Att_Dim] 176 | k = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else v # [batch_size, Tv, Att_Dim] 177 | 178 | #Multihead 179 | q_split = tf.concat(tf.split(q, self.num_heads, axis= -1), axis= 0) # [batch_size * Heads, Tq, Att_Dim / Heads] 180 | v_split = tf.concat(tf.split(v, self.num_heads, axis= -1), axis= 0) # [batch_size * Heads, Tv, Att_Dim / Heads] 181 | k_split = tf.concat(tf.split(k, self.num_heads, axis= -1), axis= 0) # [batch_size * Heads, Tv, Att_Dim / Heads] 182 | 183 | q_mask = mask[0] if mask else None 184 | v_mask = mask[1] if mask else None 185 | 186 | scores = self._calculate_scores(query= q_split, key= k_split) 187 | if v_mask is not None: 188 | # Mask of shape [batch_size, 1, Tv]. 189 | v_mask = tf.expand_dims(v_mask, axis= -2) 190 | if self.causal: 191 | # Creates a lower triangular mask, so position i cannot attend to 192 | # positions j>i. This prevents the flow of information from the future 193 | # into the past. 194 | scores_shape = tf.shape(scores) 195 | # causal_mask_shape = [1, Tq, Tv]. 196 | causal_mask_shape = tf.concat( 197 | [tf.ones_like(scores_shape[:-2]), scores_shape[-2:]], 198 | axis=0) 199 | causal_mask = _lower_triangular_mask(causal_mask_shape) 200 | else: 201 | causal_mask = None 202 | scores_mask = _merge_masks(v_mask, causal_mask) 203 | result, attention_distribution = _apply_scores(scores=scores, value= v_split, scores_mask=scores_mask) #reslut: [batch_size * Heads, Tq, Att_Dim / Heads], attention_distribution: [batch_size * Heads, Tq, Tv] 204 | if q_mask is not None: 205 | # Mask of shape [batch_size, Tq, 1]. 206 | q_mask = tf.expand_dims(q_mask, axis=-1) 207 | result *= tf.cast(q_mask, dtype=result.dtype) 208 | 209 | result = tf.concat(tf.split(result, self.num_heads, axis= 0), axis= -1) # [batch_size, Tq, Att_Dim] 210 | 211 | result = self.layer_Dict['Layer_Normalization'](result + q) #Residual, layer normalization 212 | attention_distribution = tf.reduce_mean(tf.stack(tf.split(attention_distribution, self.num_heads, axis= 0), axis= 1), axis= 1) # [batch_size * Heads, Tq, Tv] -> [batch_size, Heads, Tq, Tv] -> [batch_size, Tq, Tv] 213 | 214 | return result, attention_distribution 215 | 216 | def _calculate_scores(self, query, key): 217 | """Calculates attention scores as a query-key dot product. 218 | Args: 219 | query: Query tensor of shape `[batch_size, Tq, dim]`. 220 | key: Key tensor of shape `[batch_size, Tv, dim]`. 221 | Returns: 222 | Tensor of shape `[batch_size, Tq, Tv]`. 
223 | """ 224 | scores = tf.matmul(query, key, transpose_b=True) 225 | 226 | if self.scale is not None: 227 | scores *= self.scale 228 | return scores 229 | 230 | def _apply_scores(scores, value, scores_mask=None): 231 | if scores_mask is not None: 232 | padding_mask = tf.logical_not(scores_mask) 233 | # Bias so padding positions do not contribute to attention distribution. 234 | scores -= 1.e9 * tf.cast(padding_mask, dtype= scores.dtype) 235 | attention_distribution = tf.nn.softmax(scores) 236 | 237 | return tf.matmul(attention_distribution, value), attention_distribution 238 | 239 | def _lower_triangular_mask(shape): 240 | """Creates a lower-triangular boolean mask over the last 2 dimensions.""" 241 | row_index = tf.cumsum( 242 | tf.ones(shape=shape, dtype=tf.int32), axis=-2) 243 | col_index = tf.cumsum( 244 | tf.ones(shape=shape, dtype=tf.int32), axis=-1) 245 | return tf.greater_equal(row_index, col_index) 246 | 247 | def _merge_masks(x, y): 248 | if x is None: 249 | return y 250 | if y is None: 251 | return x 252 | return tf.logical_and(x, y) 253 | 254 | class Layer_Norm(tf.keras.layers.Layer): 255 | ''' 256 | There are several restriction in 'tf.keras.layers.LayerNormalization'. 257 | ''' 258 | def __init__(self, epsilon= 1e-8): 259 | super(Layer_Norm, self).__init__() 260 | self.epsilon = epsilon 261 | 262 | def build(self, input_shape): 263 | self.beta = self.add_weight( 264 | name= 'beta', 265 | shape= input_shape[-1:], 266 | initializer= tf.zeros_initializer(), 267 | dtype= self.dtype, 268 | trainable= True 269 | ) 270 | self.gamma = self.add_weight( 271 | name= 'gamma', 272 | shape= input_shape[-1:], 273 | initializer= tf.ones_initializer(), 274 | dtype= self.dtype, 275 | trainable= True 276 | ) 277 | 278 | self.built = True 279 | 280 | def call(self, inputs): 281 | mean, variance = tf.nn.moments(inputs, [-1], keepdims= True) 282 | normalized = (inputs - mean) / ((variance + self.epsilon) ** .5) 283 | outputs = self.gamma * normalized + self.beta 284 | 285 | return outputs 286 | 287 | 288 | # Refer: https://github.com/begeekmyfriend/tacotron/blob/60d6932f510bf591acb25620290868900b5c0a41/models/attention.py 289 | class LocationSensitiveAttention(tf.keras.layers.AdditiveAttention): 290 | ''' 291 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L307-L440 292 | This is for attention size managing and getting the attention history(scores). 
293 | ''' 294 | def __init__( 295 | self, 296 | size, 297 | conv_filters, 298 | conv_kernel_size, 299 | conv_stride, 300 | smoothing= False, 301 | use_scale=False, 302 | cumulate_weights= True, 303 | **kwargs 304 | ): 305 | super(LocationSensitiveAttention, self).__init__(use_scale= use_scale, **kwargs) 306 | 307 | self.size = size 308 | self.smoothing = smoothing 309 | self.cumulate_weights = cumulate_weights 310 | self.layer_Dict = { 311 | 'Query': tf.keras.layers.Dense(size), 312 | 'Value': tf.keras.layers.Dense(size), 313 | 'Key': tf.keras.layers.Dense(size), 314 | 'Alignment_Conv': tf.keras.layers.Conv1D( 315 | filters= conv_filters, 316 | kernel_size= conv_kernel_size, 317 | strides= conv_stride, 318 | padding='same' 319 | ), 320 | 'Alignment_Dense': tf.keras.layers.Dense(size) 321 | } 322 | 323 | def build(self, input_shape): 324 | """Creates scale and bias variable if use_scale==True.""" 325 | if self.use_scale: 326 | self.scale = self.add_weight( 327 | name='scale', 328 | shape=[self.size], 329 | initializer= tf.initializers.glorot_uniform(), 330 | dtype=self.dtype, 331 | trainable=True) 332 | else: 333 | self.scale = None 334 | 335 | self.bias = self.add_weight( 336 | name='bias', 337 | shape=[self.size,], 338 | initializer=tf.zeros_initializer(), 339 | dtype=self.dtype, 340 | trainable=True 341 | ) 342 | 343 | self.bulit = True 344 | 345 | def call(self, inputs): 346 | ''' 347 | inputs: [query, value] or [query, value, key] 348 | I don't implement the mask function now. 349 | ''' 350 | self._validate_call_args(inputs=inputs, mask= None) 351 | query = self.layer_Dict['Query'](inputs[0]) 352 | value = self.layer_Dict['Value'](inputs[1]) 353 | key = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else value 354 | 355 | contexts = tf.zeros(shape= [tf.shape(query)[0], 1, self.size], dtype= query.dtype) #initial attention, [Batch, 1, Att_dim] 356 | alignments = tf.zeros(shape= (tf.shape(query)[0], 1, tf.shape(key)[1]), dtype= query.dtype) #initial alignment, [Batch, 1, T_k] 357 | 358 | initial_Step = tf.constant(0) 359 | def body(step, query, contexts, alignments): 360 | query_Step = tf.expand_dims(query[:, step], axis= 1) #[Batch, 1, Att_dim] 361 | previous_alignment = tf.reduce_sum(alignments, axis= 1) if self.cumulate_weights else alignments[:, -1] 362 | location_features = tf.expand_dims(previous_alignment, axis= -1) #[Batch, T_k, 1] 363 | location_features = self.layer_Dict['Alignment_Conv'](location_features) #[Batch, T_k, Filters] 364 | location_features = self.layer_Dict['Alignment_Dense'](location_features) #[Batch, T_k, Att_dim] 365 | 366 | score = self._calculate_scores(query= query_Step, key= key, location_features= location_features) #[Batch, T_k] 367 | context, alignment = self._apply_scores(score= score, value= value) #[Batch, Att_dim], [Batch, T_v] 368 | 369 | return step + 1, query, tf.concat([contexts, context], axis= 1), tf.concat([alignments, alignment], axis= 1) 370 | 371 | _, _, contexts, alignments = tf.while_loop( 372 | cond= lambda step, query, contexts, alignments: tf.less(step, tf.shape(query)[1]), 373 | body= body, 374 | loop_vars= [initial_Step, query, contexts, alignments], 375 | shape_invariants= [initial_Step.get_shape(), query.get_shape(), tf.TensorShape([None, None, self.size]), tf.TensorShape([None, None, None])] 376 | ) 377 | 378 | # # The following code cannot use now because normal for-loop does not support 'shape_invariants'. 
379 | # for step in tf.range(tf.shape(query)[1]): 380 | # query_Step = tf.expand_dims(query[:, step], axis= 1) #[Batch, 1, Att_dim] 381 | # location_features = tf.expand_dims(alignments[:, -1], axis= -1) #[Batch, T_k, 1] 382 | # location_features = self.layer_Dict['Alignment_Conv'](location_features) #[Batch, T_k, Filters] 383 | # location_features = self.layer_Dict['Alignment_Dense'](location_features) #[Batch, T_k, Att_dim] 384 | 385 | # score = self._calculate_scores(query= query_Step, key= key, location_features= location_features) #[Batch, T_k] 386 | # context, alignment = self._apply_scores(score= score, value= value) #[Batch, Att_dim], [Batch, T_v] 387 | 388 | # contexts = tf.concat([contexts, context], axis= 1) 389 | # alignments = tf.concat([alignments, alignment], axis= 1) 390 | 391 | return contexts[:, 1:], alignments[:, 1:] #Remove initial step 392 | 393 | def _calculate_scores(self, query, key, location_features): 394 | """Calculates attention scores as a nonlinear sum of query and key. 395 | Args: 396 | query: Query tensor of shape `[batch_size, 1, Att_dim]`. 397 | key: Key tensor of shape `[batch_size, T_k, Att_dim]`. 398 | location_features: Location_features of shape `[batch_size, T_k, Att_dim]`. 399 | Returns: 400 | Tensor of shape `[batch_size, T_k]`. 401 | """ 402 | if self.use_scale: 403 | scale = self.scale 404 | else: 405 | scale = 1. 406 | 407 | return tf.reduce_sum(scale * tf.tanh(query + key + location_features + self.bias), axis=-1) #[Batch, T_k, Att_dim] -> [Batch, T_k] 408 | 409 | #In TF1, 'context' is calculated in AttentionWrapper, not attention mechanism. 410 | def _apply_scores(self, score, value): 411 | ''' 412 | score shape: [batch_size, T_k]`. 413 | value shape: [batch_size, T_v, Att_dim]`. 414 | Must T_k == T_v 415 | 416 | Return: [batch_size, Att_dim] 417 | ''' 418 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 419 | probability_fn = self._smoothing_normalization if self.smoothing else tf.nn.softmax 420 | alignment = probability_fn(score) #[Batch_size, 1, T_v] 421 | context = tf.matmul(alignment, value) #[Batch_size, 1, Att_dim] 422 | 423 | #return tf.squeeze(context, axis= 1), tf.squeeze(alignment, axis= 1), #[Batch, Att_dim], [Batch, T_v] 424 | return context, alignment 425 | 426 | def _smoothing_normalization(self, e): 427 | """Applies a smoothing normalization function instead of softmax 428 | Introduced in: 429 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 430 | gio, “Attention-based models for speech recognition,” in Ad- 431 | vances in Neural Information Processing Systems, 2015, pp. 432 | 577–585. 433 | ############################################################################ 434 | Smoothing normalization function 435 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 436 | ############################################################################ 437 | Args: 438 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 439 | values of an attention mechanism 440 | Returns: 441 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 442 | attendance to multiple memory time steps. 
443 | """ 444 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 445 | 446 | class BahdanauMonotonicAttention(tf.keras.layers.AdditiveAttention): 447 | ''' 448 | Refer: https://github.com/tensorflow/tensorflow/blob/r2.0/tensorflow/python/keras/layers/dense_attention.py#L307-L440 449 | This is for attention size managing and getting the attention history(scores). 450 | ''' 451 | def __init__( 452 | self, 453 | size, 454 | sigmoid_noise= 0.0, 455 | normalize= False, 456 | **kwargs 457 | ): 458 | super(BahdanauMonotonicAttention, self).__init__(use_scale= False, **kwargs) 459 | 460 | self.size = size 461 | self.sigmoid_noise = sigmoid_noise 462 | self.normalize = normalize 463 | 464 | def build(self, input_shape): 465 | self.layer_Dict = { 466 | 'Query': tf.keras.layers.Dense(self.size), 467 | 'Value': tf.keras.layers.Dense(self.size), 468 | 'Key': tf.keras.layers.Dense(self.size) 469 | } 470 | 471 | self.attention_v = self.add_weight( 472 | name='attention_v', 473 | shape=[self.size,], 474 | initializer='glorot_uniform', 475 | dtype=self.dtype, 476 | trainable=True 477 | ) 478 | 479 | self.attention_score_bias = self.add_weight( 480 | name='attention_score_bias', 481 | shape=[], 482 | initializer=tf.zeros_initializer(), 483 | dtype=self.dtype, 484 | trainable=True 485 | ) 486 | 487 | if self.normalize: 488 | self.attention_g = self.add_weight( 489 | name='attention_g', 490 | shape=[], 491 | initializer= tf.initializers.constant([np.sqrt(1. / self.size),]), 492 | dtype=self.dtype, 493 | trainable=True 494 | ) 495 | 496 | self.attention_b = self.add_weight( 497 | name='attention_b', 498 | shape=[self.size,], 499 | initializer= tf.zeros_initializer(), 500 | dtype=self.dtype, 501 | trainable=True 502 | ) 503 | 504 | self.bulit = True 505 | 506 | def call(self, inputs): 507 | ''' 508 | inputs: [query, value] or [query, value, key] 509 | I don't implement the mask function now. 510 | ''' 511 | self._validate_call_args(inputs=inputs, mask= None) 512 | query = self.layer_Dict['Query'](inputs[0]) 513 | value = self.layer_Dict['Value'](inputs[1]) 514 | key = self.layer_Dict['Key'](inputs[2]) if len(inputs) > 2 else value 515 | 516 | contexts = tf.zeros(shape= [tf.shape(query)[0], 1, self.size], dtype= query.dtype) #initial attention, [Batch, 1, Att_dim] 517 | alignments = tf.expand_dims( 518 | tf.one_hot( 519 | indices= tf.zeros((tf.shape(query)[0]), dtype= tf.int32), 520 | depth= tf.shape(key)[1], 521 | dtype= query.dtype 522 | ), 523 | axis= 1 524 | ) #initial alignment, [Batch, 1, T_k]. This part is different by monotonic or not. 
525 | 526 | initial_Step = tf.constant(0) 527 | def body(step, query, contexts, alignments): 528 | query_Step = tf.expand_dims(query[:, step], axis= 1) #[Batch, 1, Att_dim] 529 | previous_alignment = tf.expand_dims(alignments[:, -1], axis= 1) #[Batch, 1, T_k] 530 | 531 | score = self._calculate_scores(query= query_Step, key= key) #[Batch, T_k] 532 | context, alignment = self._apply_scores(score= score, value= value, previous_alignment= previous_alignment) #[Batch, Att_dim], [Batch, T_v] 533 | 534 | return step + 1, query, tf.concat([contexts, context], axis= 1), tf.concat([alignments, alignment], axis= 1) 535 | 536 | _, _, contexts, alignments = tf.while_loop( 537 | cond= lambda step, query, contexts, alignments: tf.less(step, tf.shape(query)[1]), 538 | body= body, 539 | loop_vars= [initial_Step, query, contexts, alignments], 540 | shape_invariants= [initial_Step.get_shape(), query.get_shape(), tf.TensorShape([None, None, self.size]), tf.TensorShape([None, None, None])] 541 | ) 542 | 543 | return contexts[:, 1:], alignments[:, 1:] #Remove initial step 544 | 545 | def _calculate_scores(self, query, key): 546 | """Calculates attention scores as a nonlinear sum of query and key. 547 | Args: 548 | query: Query tensor of shape `[batch_size, 1, Att_dim]`. 549 | key: Key tensor of shape `[batch_size, T_k, Att_dim]`. 550 | 551 | Returns: 552 | Tensor of shape `[batch_size, T_k]`. 553 | """ 554 | if self.normalize: 555 | norm_v = self.attention_g * self.attention_v * tf.math.rsqrt(tf.reduce_sum(tf.square(self.attention_v))) 556 | return tf.reduce_sum(norm_v * tf.tanh(query + key + self.attention_b), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 557 | else: 558 | return tf.reduce_sum(self.attention_v * tf.tanh(query + key), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 559 | 560 | #In TF1, 'context' is calculated in AttentionWrapper, not attention mechanism. 561 | def _apply_scores(self, score, value, previous_alignment): 562 | ''' 563 | score shape: [batch_size, T_v]`. (Must T_k == T_v) 564 | value shape: [batch_size, T_v, Att_dim]`. 565 | previous_alignment shape: [batch_size, 1, T_v]`. 566 | 567 | 568 | Return: [batch_size, Att_dim] 569 | ''' 570 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 571 | alignment = self._monotonic_probability_fn(score, previous_alignment) #[Batch_size, 1, T_v] 572 | context = tf.matmul(alignment, value) #[Batch_size, 1, Att_dim] 573 | 574 | return context, alignment 575 | 576 | def _monotonic_probability_fn(self, score, previous_alignment): 577 | if self.sigmoid_noise > 0.0: 578 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 579 | p_choose_i = tf.sigmoid(score) 580 | 581 | cumprod_1mp_choose_i = self.safe_cumprod(1 - p_choose_i, axis= 2, exclusive= True) 582 | 583 | alignment = p_choose_i * cumprod_1mp_choose_i * tf.cumsum( 584 | previous_alignment / tf.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.), 585 | axis= 2 586 | ) 587 | 588 | return alignment 589 | 590 | # https://github.com/tensorflow/addons/blob/9e9031133c8362fedf40f2d05f00334b6f7a970b/tensorflow_addons/seq2seq/attention_wrapper.py#L810 591 | def safe_cumprod(self, x, *args, **kwargs): 592 | """Computes cumprod of x in logspace using cumsum to avoid underflow. 593 | The cumprod function and its gradient can result in numerical instabilities 594 | when its argument has very small and/or zero values. 
As long as the 595 | argument is all positive, we can instead compute the cumulative product as 596 | exp(cumsum(log(x))). This function can be called identically to 597 | tf.cumprod. 598 | Args: 599 | x: Tensor to take the cumulative product of. 600 | *args: Passed on to cumsum; these are identical to those in cumprod. 601 | **kwargs: Passed on to cumsum; these are identical to those in cumprod. 602 | Returns: 603 | Cumulative product of x. 604 | """ 605 | x = tf.convert_to_tensor(x, name='x') 606 | tiny = np.finfo(x.dtype.as_numpy_dtype).tiny 607 | return tf.exp(tf.cumsum(tf.math.log(tf.clip_by_value(x, tiny, 1)), *args, **kwargs)) 608 | 609 | class StepwiseMonotonicAttention(BahdanauMonotonicAttention): 610 | ''' 611 | Refer: https://gist.github.com/dy-octa/38a7638f75c21479582d7391490df37c 612 | ''' 613 | def __init__( 614 | self, 615 | size, 616 | sigmoid_noise= 2.0, 617 | normalize= False, 618 | **kwargs 619 | ): 620 | super(StepwiseMonotonicAttention, self).__init__( 621 | size= size, 622 | sigmoid_noise= sigmoid_noise, 623 | normalize= normalize, **kwargs 624 | ) 625 | 626 | def _monotonic_probability_fn(self, score, previous_alignment): 627 | ''' 628 | score: [Batch_size, 1, T_v] 629 | previous_alignment: [batch_size, 1, T_v] 630 | ''' 631 | if self.sigmoid_noise > 0.0: 632 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 633 | p_choose_i = tf.sigmoid(score) # [Batch_size, 1, T_v] 634 | 635 | pad = tf.zeros([tf.shape(p_choose_i)[0], 1, 1], dtype=p_choose_i.dtype) # [Batch_size, 1, 1] 636 | 637 | alignment = previous_alignment * p_choose_i + tf.concat( 638 | [pad, previous_alignment[:, :, :-1] * (1.0 - p_choose_i[:, :, :-1])], axis= -1) 639 | 640 | return alignment 641 | 642 | 643 | class DynamicConvolutionAttention(tf.keras.layers.AdditiveAttention): 644 | ''' 645 | Refer: https://gist.github.com/attitudechunfeng/c162a5ed9b034be8f3f5800652af7c83 646 | ''' 647 | def __init__( 648 | self, 649 | size, 650 | f_conv_filters= 8, 651 | f_conv_kernel_size= 21, 652 | f_conv_stride= 1, 653 | g_conv_filters= 8, 654 | g_conv_kernel_size= 21, 655 | g_conv_stride= [1, 1, 1, 1], 656 | p_conv_size = 11, 657 | p_alpha= 0.1, 658 | p_beta = 2.9, 659 | use_scale=False, 660 | cumulate_weights= False, 661 | **kwargs 662 | ): 663 | super(DynamicConvolutionAttention, self).__init__(use_scale= use_scale, **kwargs) 664 | 665 | self.size = size 666 | self.f_conv_filters= f_conv_filters 667 | self.f_conv_kernel_size= f_conv_kernel_size 668 | self.f_conv_stride= f_conv_stride 669 | self.g_conv_filters= g_conv_filters 670 | self.g_conv_kernel_size= g_conv_kernel_size 671 | self.g_conv_stride= g_conv_stride 672 | self.p_conv_size = p_conv_size 673 | self.p_alpha= p_alpha 674 | self.p_beta = p_beta 675 | self.cumulate_weights = cumulate_weights 676 | 677 | def build(self, input_shape): 678 | self.layer_Dict = {} 679 | self.layer_Dict['Key'] = tf.keras.layers.Dense(self.size) 680 | 681 | self.layer_Dict['F_Conv'] = tf.keras.layers.Conv1D( 682 | filters= self.f_conv_filters, 683 | kernel_size= self.f_conv_kernel_size, 684 | strides= self.f_conv_stride, 685 | padding='same' 686 | ) 687 | self.layer_Dict['F_Dense'] = tf.keras.layers.Dense( 688 | self.size, 689 | use_bias= False 690 | ) 691 | 692 | self.layer_Dict['G_Filter_Dense'] = tf.keras.Sequential() 693 | self.layer_Dict['G_Filter_Dense'].add(tf.keras.layers.Dense( 694 | units= self.g_conv_kernel_size * self.g_conv_filters, 695 | use_bias= True, 696 | activation= 'tanh' 697 | )) 698 | 
self.layer_Dict['G_Filter_Dense'].add(tf.keras.layers.Dense( 699 | units= self.g_conv_kernel_size * self.g_conv_filters, 700 | use_bias= False 701 | )) 702 | self.layer_Dict['G_Dense'] = tf.keras.layers.Dense( 703 | self.size, 704 | use_bias= False 705 | ) 706 | 707 | self.layer_Dict['P_Conv'] = DCA_P_Conv1D( 708 | p_conv_size = self.p_conv_size, 709 | p_alpha= self.p_alpha, 710 | p_beta = self.p_beta, 711 | ) 712 | 713 | """Creates scale and bias variable if use_scale==True.""" 714 | if self.use_scale: 715 | self.scale = self.add_weight( 716 | name='scale', 717 | shape=[self.size], 718 | initializer= tf.initializers.glorot_uniform(), 719 | dtype=self.dtype, 720 | trainable=True) 721 | else: 722 | self.scale = None 723 | 724 | self.bias = self.add_weight( 725 | name='bias', 726 | shape=[self.size,], 727 | initializer=tf.zeros_initializer(), 728 | dtype=self.dtype, 729 | trainable=True 730 | ) 731 | 732 | # self.g_scale = self.add_weight( 733 | # name='g_scale', 734 | # shape=[self.g_conv_kernel_size * self.g_conv_filters,], 735 | # initializer=tf.zeros_initializer(), 736 | # dtype=self.dtype, 737 | # trainable=True 738 | # ) 739 | 740 | self.bulit = True 741 | 742 | def call(self, inputs): 743 | ''' 744 | inputs: [query, key] 745 | I don't implement the mask function now. 746 | ''' 747 | self._validate_call_args(inputs=inputs, mask= None) 748 | query = inputs[0] #[Batch, Q_dim] 749 | key = self.layer_Dict['Key'](inputs[1]) #[Batch, T_k, Att_dim] 750 | 751 | batch_size = tf.shape(query)[0] 752 | contexts = tf.zeros(shape= [tf.shape(query)[0], 1, self.size], dtype= query.dtype) #initial attention, [Batch, 1, Att_dim] 753 | alignments = tf.one_hot( 754 | indices= tf.zeros((tf.shape(query)[0], 1), dtype= tf.int32), 755 | depth= tf.shape(key)[1], 756 | dtype= query.dtype 757 | ) #initial alignment, [Batch, 1, T_k]. This part is different by monotonic or not. 
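        # Dynamic convolution attention is purely location-based: there is no
        # query-key energy term. Each step's score is built from transformations of
        # the previous alignment, via (1) static features from F_Conv/F_Dense,
        # (2) query-conditioned dynamic filters from G_Filter_Dense applied as a
        # depthwise convolution, and (3) the log-domain beta-binomial prior bias
        # from P_Conv, which constrains how far the alignment can move per step.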
758 | 759 | initial_Step = tf.constant(0) 760 | def body(step, query, contexts, alignments): 761 | query_Step = query[:, step] #[Batch, Q_dim] 762 | previous_alignment = tf.reduce_sum(alignments, axis= 1) if self.cumulate_weights else alignments[:, -1] #[Batch, T_k] 763 | previous_alignment = tf.expand_dims(previous_alignment, axis= -1) #[Batch, T_k, 1] 764 | 765 | feature_previous_alignment = self.layer_Dict['F_Conv'](previous_alignment) #[Batch, T_k, Filters] 766 | feature_previous_alignment = self.layer_Dict['F_Dense'](feature_previous_alignment) #[Batch, T_k, Att_dim] 767 | 768 | #dynamic_filter = self.g_scale * self.layer_Dict['G_Filter_Dense'](query_Step) # [Batch, Conv_Size * Conv_Ch] 769 | dynamic_filter = self.layer_Dict['G_Filter_Dense'](query_Step) # [Batch, Conv_Size * Conv_Ch] 770 | dynamic_filter = tf.reshape( 771 | dynamic_filter, 772 | shape= [batch_size, 1, self.g_conv_kernel_size, self.g_conv_filters] 773 | ) # [Batch, 1, Conv_Size, Conv_Ch] 774 | dynamic_filter = tf.transpose( 775 | dynamic_filter, 776 | perm= [1, 2, 0, 3] 777 | ) # [1, Conv_Size, Batch, Conv_Ch] [H(1), W, C_in, C_out] 778 | dynamic_previous_alignment = tf.expand_dims( 779 | tf.transpose( 780 | previous_alignment, 781 | perm= [2, 1, 0] 782 | ), 783 | axis = 0 784 | ) #[N(Batch), W(K_t), C(1)] -> [C(1), W(K_t), N(Batch)] -> [1, C(1), W(K_t), N(Batch)] 785 | dynamic_previous_alignment = tf.nn.depthwise_conv2d( 786 | dynamic_previous_alignment, 787 | filter= dynamic_filter, 788 | strides= self.g_conv_stride, 789 | padding= 'SAME' 790 | ) # [1, 1, K_t, Batch * G_Filter] 791 | dynamic_previous_alignment = tf.squeeze(input= dynamic_previous_alignment, axis= [0, 1]) # [K_t, Batch * G_Filter] 792 | dynamic_previous_alignment = tf.reshape( 793 | dynamic_previous_alignment, 794 | shape= [tf.shape(dynamic_previous_alignment)[0], batch_size, self.g_conv_filters] 795 | ) # [K_t, Batch, G_Filter] 796 | dynamic_previous_alignment = tf.transpose( 797 | dynamic_previous_alignment, 798 | perm= [1, 0, 2] 799 | ) # [Batch, K_t, G_Filter] 800 | dynamic_previous_alignment = self.layer_Dict['G_Dense'](dynamic_previous_alignment) #[Batch, K_t, Att_Dim] 801 | 802 | prior_filter_bias = self.layer_Dict['P_Conv'](previous_alignment) #[Batch, K_t] 803 | 804 | score = self._calculate_scores( 805 | feature_previous_alignment= feature_previous_alignment, 806 | dynamic_previous_alignment= dynamic_previous_alignment, 807 | prior_filter_bias= prior_filter_bias 808 | ) #[Batch, T_k] 809 | context, alignment = self._apply_scores(score= score, key= key) #[Batch, 1, Att_dim], [Batch, 1, T_k] 810 | 811 | return step + 1, query, tf.concat([contexts, context], axis= 1), tf.concat([alignments, alignment], axis= 1) 812 | 813 | _, _, contexts, alignments = tf.while_loop( 814 | cond= lambda step, query, contexts, alignments: tf.less(step, tf.shape(query)[1]), 815 | body= body, 816 | loop_vars= [initial_Step, query, contexts, alignments], 817 | shape_invariants= [initial_Step.get_shape(), query.get_shape(), tf.TensorShape([None, None, self.size]), tf.TensorShape([None, None, None])] 818 | ) #[Batch, T_q + 1, Att_dim], [Batch, T_q + 1, T_k] 819 | 820 | return contexts[:, 1:], alignments[:, 1:] #Remove initial step 821 | 822 | def _calculate_scores(self, feature_previous_alignment, dynamic_previous_alignment, prior_filter_bias): 823 | """Calculates attention scores as a nonlinear sum of query and key. 824 | Args: 825 | feature_previous_alignment: Location_features of shape `[batch_size, T_k, Att_dim]`. 
826 | dynamic_previous_alignment: Dynamic features of shape `[batch_size, T_k, Att_dim]`. 827 | prior_filter_bias: Prior filter bias of shape `[batch_size, T_k]`. 828 | Returns: 829 | Tensor of shape `[batch_size, T_k]`. 830 | """ 831 | if self.use_scale: 832 | scale = self.scale 833 | else: 834 | scale = 1. 835 | score = tf.reduce_sum( 836 | scale * tf.tanh(feature_previous_alignment + dynamic_previous_alignment + self.bias), 837 | axis=-1 838 | ) #[Batch, T_k, Att_dim] -> [Batch, T_k] 839 | return score + prior_filter_bias 840 | 841 | #In TF1, 'context' is calculated in AttentionWrapper, not attention mechanism. 842 | def _apply_scores(self, score, key): 843 | ''' 844 | score shape: [batch_size, T_k]`. 845 | key shape: [batch_size, T_k, Att_dim]`. 846 | Must T_k == T_v 847 | 848 | Return: [batch_size, Att_dim] 849 | ''' 850 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 851 | alignment = tf.nn.softmax(score) #[Batch_size, 1, T_v] 852 | context = tf.matmul(alignment, key) #[Batch_size, 1, Att_dim] 853 | 854 | return context, alignment #[Batch, 1, Att_dim], [Batch, 1, T_v] 855 | 856 | class DCA_P_Conv1D(tf.keras.layers.Conv1D): 857 | def __init__(self, p_conv_size= 11, p_alpha= 0.1, p_beta= 0.9): 858 | self.p_conv_size= p_conv_size 859 | self.p_alpha= p_alpha 860 | self.p_beta= p_beta 861 | 862 | prior_filter = self.beta_binomial(self.p_conv_size, self.p_alpha, self.p_beta) 863 | prior_filter = np.flip(prior_filter, axis= 0) 864 | prior_filter = np.reshape(prior_filter, [self.p_conv_size, 1, 1]) 865 | 866 | super(DCA_P_Conv1D, self).__init__( 867 | filters= 1, 868 | kernel_size= self.p_conv_size, 869 | padding='valid', 870 | use_bias= False, 871 | kernel_initializer= tf.initializers.constant(prior_filter) 872 | ) 873 | 874 | def call(self, inputs): 875 | ''' 876 | inputs: 3D tensor with shape: `(batch_size, steps, input_dim)` 877 | After front padding, call a superior class(Conv1D) 878 | ''' 879 | inputs = tf.pad(inputs, paddings= [[0,0], [self.p_conv_size - 1, 0], [0, 0]]) 880 | new_Tensor = super(DCA_P_Conv1D, self).call(inputs) 881 | new_Tensor = tf.squeeze(new_Tensor, axis= -1) 882 | 883 | return tf.math.log(tf.maximum(new_Tensor, np.finfo(inputs.dtype.as_numpy_dtype).tiny)) 884 | # return tf.maximum(tf.math.log(new_Tensor), -1e+6) # NaN problem. 885 | 886 | def beta_binomial(self, _n, _alpha, _beta): 887 | return [comb(_n,i) * beta(i+_alpha, _n-i+_beta) / beta(_alpha, _beta) for i in range(_n)] -------------------------------------------------------------------------------- /Modules/Attention/Steps.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | ''' 5 | TF 2.0's basic attention layers(Attention and AdditiveAttention) calculate parallelly. 6 | TO USE MONOTONIC FUNCTION, ATTENTION MUST KNOW 'n-1 ALIGNMENT'. 7 | Thus, this parallel versions do not support the monotonic function. 
8 | ''' 9 | 10 | class BahdanauAttention(tf.keras.layers.Layer): 11 | ''' 12 | Refer: https://www.tensorflow.org/tutorials/text/nmt_with_attention 13 | ''' 14 | def __init__(self, size): 15 | super(BahdanauAttention, self).__init__() 16 | self.size = size 17 | 18 | def build(self, input_shapes): 19 | self.layer_Dict = { 20 | 'Query': tf.keras.layers.Dense(self.size), 21 | 'Value': tf.keras.layers.Dense(self.size), 22 | 'V': tf.keras.layers.Dense(1) 23 | } 24 | 25 | self.built = True 26 | 27 | def call(self, inputs): 28 | ''' 29 | inputs: [queries, values] 30 | queries: [Batch, Query_dim] 31 | values: [Batch, T_v, Value_dim] 32 | ''' 33 | queries, values = inputs 34 | 35 | queries = self.layer_Dict['Query'](queries) #[Batch, Att_dim] 36 | values = self.layer_Dict['Value'](values) #[Batch, T_v, Att_dim] 37 | 38 | queries = tf.expand_dims(queries, 1) #[Batch, 1, Att_dim] 39 | 40 | score = self.layer_Dict['V'](tf.nn.tanh(values + queries)) #[Batch, T_v, 1] 41 | 42 | attention_weights = tf.nn.softmax(score - tf.reduce_max(score, axis= 1, keepdims= True), axis=1) #[Batch, T_v, 1] 43 | 44 | context_vector = tf.reduce_sum(attention_weights * values, axis=1) #[Batch, T_v, Att_dim] -> [Batch, Att_dim] 45 | 46 | return context_vector, tf.squeeze(attention_weights, axis= -1) 47 | 48 | def initial_alignment_fn(self, batch_size, key_time, dtype): 49 | return tf.zeros((batch_size, key_time), dtype= dtype) 50 | 51 | class BahdanauMonotonicAttention(tf.keras.layers.Layer): 52 | ''' 53 | Refer 54 | https://www.tensorflow.org/tutorials/text/nmt_with_attention 55 | https://github.com/tensorflow/addons/blob/v0.7.1/tensorflow_addons/seq2seq/attention_wrapper.py#L1004-L1175 56 | 57 | ''' 58 | def __init__(self, size, sigmoid_noise= 0.0, normalize= False, **kwargs): 59 | super(BahdanauMonotonicAttention, self).__init__() 60 | 61 | self.size = size 62 | self.sigmoid_noise = sigmoid_noise 63 | self.normalize = normalize 64 | 65 | def build(self, input_shapes): 66 | self.layer_Dict = { 67 | 'Query': tf.keras.layers.Dense(self.size), 68 | 'Value': tf.keras.layers.Dense(self.size), 69 | 'Key': tf.keras.layers.Dense(self.size) 70 | } 71 | 72 | self.attention_v = self.add_weight( 73 | name='attention_v', 74 | shape=[self.size,], 75 | initializer='glorot_uniform', 76 | dtype=self.dtype, 77 | trainable=True 78 | ) 79 | 80 | self.attention_score_bias = self.add_weight( 81 | name='attention_score_bias', 82 | shape=[], 83 | initializer=tf.zeros_initializer(), 84 | dtype=self.dtype, 85 | trainable=True 86 | ) 87 | 88 | if self.normalize: 89 | self.attention_g = self.add_weight( 90 | name='attention_g', 91 | shape=[], 92 | initializer= tf.initializers.constant([np.sqrt(1. 
/ self.size),]), 93 | dtype=self.dtype, 94 | trainable=True 95 | ) 96 | 97 | self.attention_b = self.add_weight( 98 | name='attention_b', 99 | shape=[self.size,], 100 | initializer= tf.zeros_initializer(), 101 | dtype=self.dtype, 102 | trainable=True 103 | ) 104 | 105 | self.bulit = True 106 | 107 | def call(self, inputs): 108 | ''' 109 | inputs: [queries, values, previous_alignments] or [queries, values, keys, previous_alignments] 110 | query: [Batch, Query_dim] 111 | value: [Batch, T_v, Value_dim] 112 | key: [Batch, T_v, Key_dim] 113 | previous_alignment: [Batch, T_v] 114 | ''' 115 | if len(inputs) == 3: 116 | query, value, previous_alignment = inputs 117 | elif len(inputs) == 4: 118 | query, value, key, previous_alignment = inputs 119 | else: 120 | raise ValueError('Unexpected input length') 121 | 122 | query = self.layer_Dict['Query'](query) # [Batch, Att_dim] 123 | value = self.layer_Dict['Value'](value) # [Batch, T_v, Att_dim] 124 | key = self.layer_Dict['Key'](key) if len(inputs) == 4 else value # [Batch, T_v, Att_dim] 125 | 126 | query = tf.expand_dims(query, 1) # [Batch, 1, Att_dim] 127 | previous_alignment = tf.expand_dims(previous_alignment, axis= 1) # [Batch, 1, T_v] 128 | 129 | score = self._calculate_scores(query= query, key= key) 130 | context, alignment = self._apply_scores( 131 | score= score, 132 | value= value, 133 | previous_alignment= previous_alignment 134 | ) # [Batch, Att_dim], [Batch, 1, T_v] 135 | 136 | return context, alignment 137 | 138 | def _calculate_scores(self, query, key): 139 | ''' 140 | Calculates attention scores as a nonlinear sum of query and key. 141 | Args: 142 | query: Query tensor of shape `[batch_size, 1, Att_dim]`. 143 | key: Key tensor of shape `[batch_size, T_k, Att_dim]`. 144 | 145 | Returns: 146 | Tensor of shape `[batch_size, T_k]`. 147 | ''' 148 | if self.normalize: 149 | norm_v = self.attention_g * self.attention_v * tf.math.rsqrt(tf.reduce_sum(tf.square(self.attention_v))) 150 | return tf.reduce_sum(norm_v * tf.tanh(query + key + self.attention_b), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 151 | else: 152 | return tf.reduce_sum(self.attention_v * tf.tanh(query + key), axis= -1) + self.attention_score_bias #[Batch, T_k, Att_dim] -> [Batch, T_k] 153 | 154 | def _apply_scores(self, score, value, previous_alignment): 155 | ''' 156 | score shape: [batch_size, T_v]`. (Must T_k == T_v) 157 | value shape: [batch_size, T_v, Att_dim]`. 158 | previous_alignment shape: [batch_size, 1, T_v]`. 
159 | 160 | Return: [batch_size, Att_dim], [batch_size, T_v] 161 | ''' 162 | score = tf.expand_dims(score, axis= 1) #[Batch_size, 1, T_v] 163 | alignment = self._monotonic_probability_fn(score, previous_alignment) #[Batch_size, 1, T_v] 164 | context = tf.matmul(alignment, value) #[Batch_size, 1, Att_dim] 165 | 166 | return tf.squeeze(context, axis= 1), tf.squeeze(alignment, axis= 1) 167 | 168 | def _monotonic_probability_fn(self, score, previous_alignment): 169 | if self.sigmoid_noise > 0.0: 170 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 171 | p_choose_i = tf.sigmoid(score) 172 | 173 | cumprod_1mp_choose_i = self.safe_cumprod(1 - p_choose_i, axis= 2, exclusive= True) 174 | 175 | alignment = p_choose_i * cumprod_1mp_choose_i * tf.cumsum( 176 | previous_alignment / tf.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.), 177 | axis= 2 178 | ) 179 | 180 | return alignment 181 | 182 | # https://github.com/tensorflow/addons/blob/9e9031133c8362fedf40f2d05f00334b6f7a970b/tensorflow_addons/seq2seq/attention_wrapper.py#L810 183 | def safe_cumprod(self, x, *args, **kwargs): 184 | """Computes cumprod of x in logspace using cumsum to avoid underflow. 185 | The cumprod function and its gradient can result in numerical instabilities 186 | when its argument has very small and/or zero values. As long as the 187 | argument is all positive, we can instead compute the cumulative product as 188 | exp(cumsum(log(x))). This function can be called identically to 189 | tf.cumprod. 190 | Args: 191 | x: Tensor to take the cumulative product of. 192 | *args: Passed on to cumsum; these are identical to those in cumprod. 193 | **kwargs: Passed on to cumsum; these are identical to those in cumprod. 194 | Returns: 195 | Cumulative product of x. 
196 | """ 197 | x = tf.convert_to_tensor(x, name='x') 198 | tiny = np.finfo(x.dtype.as_numpy_dtype).tiny 199 | return tf.exp(tf.cumsum(tf.math.log(tf.clip_by_value(x, tiny, 1)), *args, **kwargs)) 200 | 201 | def initial_alignment_fn(self, batch_size, key_time, dtype): 202 | return tf.one_hot( 203 | indices= tf.zeros((batch_size), dtype= tf.int32), 204 | depth= key_time, 205 | dtype= dtype 206 | ) 207 | 208 | class StepwiseMonotonicAttention(BahdanauMonotonicAttention): 209 | ''' 210 | Refer: https://gist.github.com/dy-octa/38a7638f75c21479582d7391490df37c 211 | ''' 212 | def __init__(self, size, sigmoid_noise= 2.0, normalize= False, **kwargs): 213 | super(StepwiseMonotonicAttention, self).__init__(size, sigmoid_noise, normalize, **kwargs) 214 | 215 | def _monotonic_probability_fn(self, score, previous_alignment): 216 | ''' 217 | score: [Batch_size, 1, T_v] 218 | previous_alignment: [batch_size, 1, T_v] 219 | ''' 220 | if self.sigmoid_noise > 0.0: 221 | score += self.sigmoid_noise * tf.random.normal(tf.shape(score), dtype= score.dtype) 222 | p_choose_i = tf.sigmoid(score) # [Batch_size, 1, T_v] 223 | 224 | pad = tf.zeros([tf.shape(p_choose_i)[0], 1, 1], dtype=p_choose_i.dtype) # [Batch_size, 1, 1] 225 | 226 | alignment = previous_alignment * p_choose_i + tf.concat( 227 | [pad, previous_alignment[:, :, :-1] * (1.0 - p_choose_i[:, :, :-1])], axis= -1) 228 | 229 | return alignment -------------------------------------------------------------------------------- /Modules/Attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Modules/Attention/__init__.py -------------------------------------------------------------------------------- /Modules/GST.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import json 3 | from .Attention.Layers import MultiHeadAttention 4 | 5 | 6 | with open('Hyper_Parameters.json', 'r') as f: 7 | hp_Dict = json.load(f) 8 | 9 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 10 | token_Index_Dict = json.load(f) 11 | 12 | class Reference_Encoder(tf.keras.Model): 13 | def __init__(self): 14 | super(Reference_Encoder, self).__init__() 15 | self.layer_Dict = {} 16 | 17 | for index, (filters, kernel_Size, strides) in enumerate(zip( 18 | hp_Dict['GST']['Reference_Encoder']['Conv']['Filters'], 19 | hp_Dict['GST']['Reference_Encoder']['Conv']['Kernel_Size'], 20 | hp_Dict['GST']['Reference_Encoder']['Conv']['Strides'] 21 | )): 22 | self.layer_Dict['Conv2D_{}'.format(index)] = tf.keras.Sequential() 23 | self.layer_Dict['Conv2D_{}'.format(index)].add(tf.keras.layers.Conv2D( 24 | filters= filters, 25 | kernel_size= kernel_Size, 26 | strides= strides, 27 | padding='same', 28 | use_bias= False 29 | )) 30 | self.layer_Dict['Conv2D_{}'.format(index)].add(tf.keras.layers.BatchNormalization()) 31 | self.layer_Dict['Conv2D_{}'.format(index)].add(tf.keras.layers.ReLU()) 32 | 33 | self.layer_Dict['RNN'] = tf.keras.layers.GRU( 34 | units= hp_Dict['GST']['Reference_Encoder']['RNN']['Size'], 35 | return_sequences= True 36 | ) 37 | 38 | self.layer_Dict['Compress_Length'] = tf.keras.layers.Lambda( 39 | lambda x: tf.cast(tf.math.ceil(x / tf.reduce_prod(hp_Dict['GST']['Reference_Encoder']['Conv']['Strides'])), tf.int32) 40 | ) 41 | 42 | self.layer_Dict['Dense'] = tf.keras.layers.Dense( 43 | units= hp_Dict['GST']['Reference_Encoder']['Dense']['Size'], 44 | activation= 'tanh' 45 | ) 46 | 47 | 
def call(self, inputs): 48 | ''' 49 | inputs: [mels, mel_lengths] 50 | mels: [Batch, Time, Mel_Dim] 51 | mel_lengths: [Batch] 52 | ''' 53 | mels, mel_lengths = inputs 54 | new_Tensor = tf.expand_dims(mels, axis= -1) #[Batch, Time, Mel_Dim, 1] 55 | for index in range(len(hp_Dict['GST']['Reference_Encoder']['Conv']['Filters'])): 56 | new_Tensor = self.layer_Dict['Conv2D_{}'.format(index)](new_Tensor) 57 | batch_Size, time_Step = tf.shape(new_Tensor)[0], tf.shape(new_Tensor)[1] 58 | height, width = new_Tensor.get_shape().as_list()[2:] 59 | new_Tensor = tf.reshape( 60 | new_Tensor, 61 | shape= [batch_Size, time_Step, height * width] 62 | ) 63 | new_Tensor = self.layer_Dict['RNN'](new_Tensor) 64 | 65 | new_Tensor = tf.gather_nd( 66 | params= new_Tensor, 67 | indices= tf.stack([tf.range(batch_Size), self.layer_Dict['Compress_Length'](mel_lengths) - 1], axis= 1) 68 | ) 69 | 70 | return self.layer_Dict['Dense'](new_Tensor) 71 | 72 | class Style_Token_Layer(tf.keras.layers.Layer): #Attention which is in layer must be able to access directly. 73 | def __init__(self): 74 | super(Style_Token_Layer, self).__init__() 75 | 76 | def build(self, input_shape): 77 | self.layer_Dict = {} 78 | self.layer_Dict['Reference_Encoder'] = Reference_Encoder() 79 | self.layer_Dict['Attention'] = MultiHeadAttention( 80 | num_heads= hp_Dict['GST']['Style_Token']['Attention']['Head'], 81 | size= hp_Dict['GST']['Style_Token']['Attention']['Size'] 82 | ) 83 | 84 | self.gst_tokens = self.add_weight( 85 | name= 'gst_tokens', 86 | shape= [hp_Dict['GST']['Style_Token']['Size'], hp_Dict['GST']['Style_Token']['Embedding']['Size']], 87 | initializer= tf.keras.initializers.TruncatedNormal(stddev= 0.5), 88 | trainable= True, 89 | ) 90 | 91 | def call(self, inputs): 92 | ''' 93 | inputs: [mels, mel_lengths] 94 | mels: [Batch, Time, Mel_Dim] 95 | mel_lengths: [Batch] 96 | ''' 97 | mels_for_gst, mel_lengths = inputs 98 | new_Tensor = self.layer_Dict['Reference_Encoder']([mels_for_gst[:, 1:], mel_lengths]) #Initial frame deletion 99 | 100 | tiled_GST_Tokens = tf.tile( 101 | tf.expand_dims(tf.tanh(self.gst_tokens), axis=0), 102 | [tf.shape(new_Tensor)[0], 1, 1] 103 | ) #[Token_Dim, Emedding_Dim] -> [Batch, Token_Dim, Emedding_Dim] 104 | new_Tensor = tf.expand_dims(new_Tensor, axis= 1) #[Batch, R_dim] -> [Batch, 1, R_dim] 105 | new_Tensor, _ = self.layer_Dict['Attention']( 106 | inputs= [new_Tensor, tiled_GST_Tokens] #[query, value] 107 | ) #[Batch, 1, Att_dim] 108 | 109 | return tf.squeeze(new_Tensor, axis= 1) 110 | 111 | class GST_Concated_Encoder(tf.keras.layers.Layer): 112 | def __init__(self): 113 | super(GST_Concated_Encoder, self).__init__() 114 | 115 | def call(self, inputs): 116 | ''' 117 | inputs: [encoder, gsts] 118 | ''' 119 | encoders, gsts = inputs 120 | 121 | return tf.concat([ 122 | tf.tile(tf.expand_dims(gsts, axis= 1), [1, tf.shape(encoders)[1], 1]), 123 | encoders 124 | ], axis= -1) -------------------------------------------------------------------------------- /Modules/Taco2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import json 3 | from .Attention.Steps import BahdanauMonotonicAttention, StepwiseMonotonicAttention 4 | 5 | 6 | with open('Hyper_Parameters.json', 'r') as f: 7 | hp_Dict = json.load(f) 8 | 9 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 10 | token_Index_Dict = json.load(f) 11 | 12 | class Encoder(tf.keras.Model): 13 | def __init__(self): 14 | super(Encoder, self).__init__() 15 | 16 | def build(self, input_shapes): 17 | 
self.layer = tf.keras.Sequential() 18 | self.layer.add(tf.keras.layers.Embedding( 19 | input_dim= len(token_Index_Dict), 20 | output_dim= hp_Dict['Tacotron2']['Encoder']['Embedding']['Size'], 21 | )) 22 | for filters, kernel_size, stride in zip( 23 | hp_Dict['Tacotron2']['Encoder']['Conv']['Filters'], 24 | hp_Dict['Tacotron2']['Encoder']['Conv']['Kernel_Size'], 25 | hp_Dict['Tacotron2']['Encoder']['Conv']['Strides'] 26 | ): 27 | self.layer.add(tf.keras.layers.Conv1D( 28 | filters= filters, 29 | kernel_size= kernel_size, 30 | strides= stride, 31 | padding= 'same', 32 | use_bias= False 33 | )) 34 | self.layer.add(tf.keras.layers.BatchNormalization()) 35 | self.layer.add(tf.keras.layers.ReLU()) 36 | self.layer.add(tf.keras.layers.Dropout( 37 | rate= hp_Dict['Tacotron2']['Encoder']['Conv']['Dropout_Rate'] 38 | )) 39 | self.layer.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM( 40 | units= hp_Dict['Tacotron2']['Encoder']['RNN']['Size'], 41 | recurrent_dropout= hp_Dict['Tacotron2']['Encoder']['RNN']['Zoneout'], #Paper is '0.1'. However, TF2.0 cuDNN implementation does not support that yet. 42 | return_sequences= True 43 | ))) 44 | 45 | self.bulit = True 46 | 47 | def call(self, inputs, training): 48 | ''' 49 | inputs: texts 50 | ''' 51 | return self.layer(inputs, training) 52 | 53 | class Decoder_Step(tf.keras.Model): 54 | def __init__(self): 55 | super(Decoder_Step, self).__init__() 56 | 57 | self.build(None) #I want to generate the initial state and alignment functions early. 58 | 59 | def build(self, input_shapes): 60 | self.layer_Dict = {} 61 | self.layer_Dict['Prenet'] = Prenet( 62 | sizes= hp_Dict['Tacotron2']['Decoder']['Prenet']['Size'], 63 | dropout_rate= hp_Dict['Tacotron2']['Decoder']['Prenet']['Dropout_Rate'] 64 | ) 65 | 66 | if hp_Dict['Tacotron2']['Decoder']['Attention']['Type'] == 'BMA': 67 | self.layer_Dict['Attention'] = BahdanauMonotonicAttention( 68 | size= hp_Dict['Tacotron2']['Decoder']['Attention']['Size'] 69 | ) 70 | elif hp_Dict['Tacotron2']['Decoder']['Attention']['Type'] == 'SMA': 71 | self.layer_Dict['Attention'] = StepwiseMonotonicAttention( 72 | size= hp_Dict['Tacotron2']['Decoder']['Attention']['Size'] 73 | ) 74 | else: 75 | raise ValueError('Unsupported attention type: {}'.format(hp_Dict['Tacotron2']['Decoder']['Attention']['Type'])) 76 | 77 | rnn_Cell_List = [] 78 | for size in hp_Dict['Tacotron2']['Decoder']['RNN']['Size']: 79 | rnn_Cell_List.append(tf.keras.layers.LSTMCell( 80 | units= size, 81 | recurrent_dropout= hp_Dict['Tacotron2']['Decoder']['RNN']['Zoneout'], #Paper is '0.1'. However, TF2.0 cuDNN implementation does not support that yet. 
82 | )) 83 | self.layer_Dict['RNN'] = tf.keras.layers.StackedRNNCells( 84 | cells= rnn_Cell_List 85 | ) 86 | 87 | self.layer_Dict['Projection'] = tf.keras.layers.Dense( 88 | units= hp_Dict['Sound']['Mel_Dim'] * hp_Dict['Step_Reduction'] + 1 89 | ) 90 | 91 | self.get_initial_state = self.layer_Dict['RNN'].get_initial_state 92 | self.get_initial_alignment = self.layer_Dict['Attention'].initial_alignment_fn 93 | 94 | self.built = True 95 | 96 | def call(self, inputs, training): 97 | ''' 98 | inputs: [encodings, current_mels, previous_alignments, previous_rnn_states] 99 | encodings: [Batch, T_v, V_dim] 100 | current_mels: [Batch, Mel_dim] 101 | previous_alignments: [Batch, T_v] 102 | previous_rnn_states: A tuple of states 103 | ''' 104 | encodings, mels, previous_alignments, previous_rnn_states = inputs 105 | 106 | new_Tensor = self.layer_Dict['Prenet'](mels) 107 | attentions, alignments = self.layer_Dict['Attention']( 108 | [new_Tensor, encodings, previous_alignments] 109 | ) # [Batch, Att_dim], [Batch, T_v] 110 | new_Tensor = tf.concat([new_Tensor, attentions], axis= -1) # [Batch, Prenet_dim + Att_dim] 111 | new_Tensor, states = self.layer_Dict['RNN'](new_Tensor, states= previous_rnn_states) 112 | new_Tensor = tf.concat([new_Tensor, attentions], axis= -1) # [Batch, RNN_dim + Att_dim] 113 | new_Tensor = self.layer_Dict['Projection'](new_Tensor) # [Batch, Mel_Dim * r + 1] 114 | new_Tensor, stops = tf.split( 115 | new_Tensor, 116 | num_or_size_splits= [new_Tensor.get_shape()[-1] - 1 ,1], 117 | axis= -1 118 | ) # [Batch, Mel_Dim * r], # [Batch, 1] 119 | 120 | return new_Tensor, stops, alignments, states 121 | 122 | class Decoder(tf.keras.Model): 123 | def __init__(self): 124 | super(Decoder, self).__init__() 125 | 126 | def build(self, input_shapes): 127 | self.layer_Dict = {} 128 | 129 | self.layer_Dict['Decoder_Step'] = Decoder_Step() 130 | 131 | self.layer_Dict['Postnet'] = tf.keras.Sequential() # Last filters must be Mel 132 | for index, (filters, kernel_size, stride) in enumerate(zip( 133 | hp_Dict['Tacotron2']['Decoder']['Conv']['Filters'] + [hp_Dict['Sound']['Mel_Dim']], 134 | hp_Dict['Tacotron2']['Decoder']['Conv']['Kernel_Size'] + [5], 135 | hp_Dict['Tacotron2']['Decoder']['Conv']['Strides'] + [1] 136 | )): 137 | self.layer_Dict['Postnet'].add(tf.keras.layers.Conv1D( 138 | filters= filters, 139 | kernel_size= kernel_size, 140 | strides= stride, 141 | padding= 'same', 142 | use_bias= False 143 | )) 144 | self.layer_Dict['Postnet'].add(tf.keras.layers.BatchNormalization()) 145 | if index < len(hp_Dict['Tacotron2']['Decoder']['Conv']['Filters']) - 1: 146 | self.layer_Dict['Postnet'].add(tf.keras.layers.Activation(activation= tf.nn.tanh)) 147 | self.layer_Dict['Postnet'].add(tf.keras.layers.Dropout( 148 | rate= hp_Dict['Tacotron2']['Encoder']['Conv']['Dropout_Rate'] 149 | )) 150 | 151 | self.built = True 152 | 153 | def call(self, inputs, training): 154 | ''' 155 | inputs: [encodings, mels] 156 | encoders: [Batch, T_v, V_dim] 157 | mels: [Batch, T_q, Mel_dim] 158 | ''' 159 | encodings, mels = inputs 160 | 161 | mels = mels[:, 0:-1:hp_Dict['Step_Reduction'], :] #Only use last slices of each reduction for training 162 | decodings = tf.zeros( 163 | shape=[tf.shape(encodings)[0], 1, hp_Dict['Sound']['Mel_Dim']], 164 | dtype= encodings.dtype 165 | ) # [Batch, 1, Mel * r] 166 | stops = tf.zeros( 167 | shape=[tf.shape(encodings)[0], 0], 168 | dtype= encodings.dtype 169 | ) # [Batch, 0] 170 | alignments = tf.expand_dims( # [Batch, 1, T_v] 171 | self.layer_Dict['Decoder_Step'] 
.get_initial_alignment( 172 | tf.shape(encodings)[0], 173 | tf.shape(encodings)[1], 174 | encodings.dtype 175 | ), 176 | axis= 1 177 | ) 178 | initial_state = self.layer_Dict['Decoder_Step'] .get_initial_state( 179 | batch_size= tf.shape(encodings)[0], 180 | dtype= encodings.dtype 181 | ) 182 | def body(step, decodings, stops, alignments, previous_state): 183 | mel_step = tf.cond( 184 | pred= tf.convert_to_tensor(training), 185 | true_fn= lambda: mels[:, step], 186 | false_fn= lambda: decodings[:, -1] 187 | ) 188 | 189 | decoding, stop, alignment, state = self.layer_Dict['Decoder_Step']( 190 | inputs= [encodings, mel_step, alignments[:, -1], previous_state], 191 | training= training 192 | ) 193 | 194 | decoding = tf.reshape( 195 | decoding, 196 | shape= [ 197 | -1, 198 | hp_Dict['Step_Reduction'], 199 | hp_Dict['Sound']['Mel_Dim'] 200 | ] 201 | ) #Reshape to r1 202 | 203 | decodings = tf.concat([decodings, decoding], axis= 1) 204 | stops = tf.concat([stops, stop], axis= -1) 205 | alignments = tf.concat([alignments, tf.expand_dims(alignment, axis=1)], axis= 1) 206 | 207 | return step + 1, decodings, stops, alignments, state 208 | 209 | 210 | max_Step = tf.cond( 211 | pred= tf.convert_to_tensor(training), 212 | true_fn= lambda: tf.shape(mels)[1], 213 | false_fn= lambda: hp_Dict['Max_Step'] // hp_Dict['Step_Reduction'] 214 | ) 215 | _, decodings, stops, alignments, _ = tf.while_loop( 216 | cond= lambda step, decodings, stops, alignments, previous_state: tf.less(step, max_Step), 217 | body= body, 218 | loop_vars= [0, decodings, stops, alignments, initial_state], 219 | shape_invariants= [ 220 | tf.TensorShape([]), 221 | tf.TensorShape([None, None, hp_Dict['Sound']['Mel_Dim']]), 222 | tf.TensorShape([None, None]), 223 | tf.TensorShape([None, None, None]), 224 | tf.nest.map_structure(lambda x: x.get_shape(), initial_state), 225 | ] 226 | ) 227 | decodings = decodings[:, 1:] 228 | alignments = alignments[:, 1:] 229 | 230 | post_decodings = self.layer_Dict['Postnet'](decodings) + decodings 231 | 232 | return decodings, post_decodings, stops, alignments 233 | 234 | class Vocoder_Taco1(tf.keras.Model): 235 | def __init__(self): 236 | super(Vocoder_Taco1, self).__init__() 237 | 238 | def build(self, input_shapes): 239 | self.layer_Dict = {} 240 | self.layer_Dict['CBHG'] = CBHG( 241 | convbank_stack_count= hp_Dict['Vocoder_Taco1']['CBHG']['Conv_Bank']['Stack_Count'], 242 | convbank_filters= hp_Dict['Vocoder_Taco1']['CBHG']['Conv_Bank']['Filters'], 243 | pool_size= hp_Dict['Vocoder_Taco1']['CBHG']['Pool']['Pool_Size'], 244 | pool_strides= hp_Dict['Vocoder_Taco1']['CBHG']['Pool']['Strides'], 245 | project_conv_filters= hp_Dict['Vocoder_Taco1']['CBHG']['Conv1D']['Filters'], 246 | project_conv_kernel_size= hp_Dict['Vocoder_Taco1']['CBHG']['Conv1D']['Kernel_Size'], 247 | highwaynet_count= hp_Dict['Vocoder_Taco1']['CBHG']['Highwaynet']['Count'], 248 | highwaynet_size= hp_Dict['Vocoder_Taco1']['CBHG']['Highwaynet']['Size'], 249 | rnn_size= hp_Dict['Vocoder_Taco1']['CBHG']['RNN']['Size'], 250 | rnn_zoneout_rate= hp_Dict['Vocoder_Taco1']['CBHG']['RNN']['Zoneout'], 251 | ) 252 | self.layer_Dict['Dense'] = tf.keras.layers.Dense( 253 | units= hp_Dict['Sound']['Spectrogram_Dim'] 254 | ) 255 | 256 | self.built = True 257 | 258 | def call(self, inputs, training= False): 259 | new_Tensor = self.layer_Dict['CBHG'](inputs= inputs, training= training) 260 | return self.layer_Dict['Dense'](inputs= new_Tensor) 261 | 262 | class Prenet(tf.keras.layers.Layer): 263 | def __init__(self, sizes, dropout_rate): 264 | 
super(Prenet, self).__init__() 265 | self.prenet_Count = len(sizes) 266 | self.sizes = sizes 267 | self.dropout_rate = dropout_rate 268 | 269 | def build(self, input_shapes): 270 | self.layer = tf.keras.Sequential() 271 | for size in self.sizes: 272 | self.layer.add(tf.keras.layers.Dense( 273 | units= size, 274 | activation='relu' 275 | )) 276 | self.layer.add(tf.keras.layers.Dropout( 277 | rate= self.dropout_rate 278 | )) 279 | 280 | self.built = True 281 | 282 | def call(self, inputs, training): 283 | return self.layer(inputs= inputs, training= True) #Always true 284 | 285 | class CBHG(tf.keras.layers.Layer): 286 | def __init__( 287 | self, 288 | convbank_stack_count, 289 | convbank_filters, 290 | pool_size, 291 | pool_strides, 292 | project_conv_filters, 293 | project_conv_kernel_size, 294 | highwaynet_count, 295 | highwaynet_size, 296 | rnn_size, 297 | rnn_zoneout_rate, 298 | ): 299 | self.convbank_stack_count = convbank_stack_count 300 | self.convbank_filters = convbank_filters 301 | self.pool_size = pool_size 302 | self.pool_strides = pool_strides 303 | self.project_conv_filters = project_conv_filters 304 | self.project_conv_kernel_size = project_conv_kernel_size 305 | self.highwaynet_count = highwaynet_count 306 | self.highwaynet_size = highwaynet_size 307 | self.rnn_size = rnn_size 308 | self.rnn_zoneout_rate = rnn_zoneout_rate 309 | 310 | super(CBHG, self).__init__() 311 | 312 | def build(self, input_shapes): 313 | self.layer_Dict = {} 314 | 315 | self.layer_Dict['ConvBank'] = ConvBank( 316 | stack_count= self.convbank_stack_count, 317 | filters= self.convbank_filters 318 | ) 319 | 320 | self.layer_Dict['Max_Pooling'] = tf.keras.layers.MaxPool1D( 321 | pool_size= self.pool_size, 322 | strides= self.pool_strides, 323 | padding='same' 324 | ) 325 | 326 | self.layer_Dict['Conv1D_Projection'] = tf.keras.Sequential() 327 | for index, (filters, kernel_Size) in enumerate(zip( 328 | self.project_conv_filters, 329 | self.project_conv_kernel_size 330 | )): 331 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.Conv1D( 332 | filters= filters, 333 | kernel_size= kernel_Size, 334 | padding= 'same', 335 | use_bias= False 336 | )) 337 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.BatchNormalization()) 338 | if index < len(self.project_conv_filters) - 1: 339 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.ReLU()) 340 | 341 | if input_shapes[-1] != self.project_conv_filters[-1]: 342 | self.layer_Dict['Conv1D_Projection'].add(tf.keras.layers.Dense( 343 | units= input_shapes[-1] 344 | )) 345 | 346 | self.layer_Dict['Highwaynet'] = tf.keras.Sequential() 347 | if input_shapes[-1] != self.highwaynet_size: 348 | self.layer_Dict['Highwaynet'].add(tf.keras.layers.Dense( 349 | units= self.highwaynet_size 350 | )) 351 | for index in range(self.highwaynet_count): 352 | self.layer_Dict['Highwaynet'].add(Highwaynet( 353 | size= self.highwaynet_size 354 | )) 355 | 356 | self.layer_Dict['RNN'] = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM( 357 | units= self.rnn_size, 358 | recurrent_dropout= self.rnn_zoneout_rate, #Paper is '0.1'. However, TF2.0 cuDNN implementation does not support that yet. 
359 | return_sequences= True 360 | )) 361 | 362 | self.built = True 363 | 364 | def call(self, inputs, training= False): 365 | new_Tensor = inputs 366 | 367 | new_Tensor = self.layer_Dict['ConvBank'](inputs= new_Tensor, training= training) 368 | 369 | new_Tensor = self.layer_Dict['Max_Pooling'](inputs= new_Tensor) 370 | 371 | new_Tensor = self.layer_Dict['Conv1D_Projection'](inputs= new_Tensor, training= training) 372 | new_Tensor = new_Tensor + inputs # Residual 373 | 374 | new_Tensor = self.layer_Dict['Highwaynet'](inputs= new_Tensor, training= training) 375 | 376 | return self.layer_Dict['RNN'](inputs= new_Tensor, training= training) 377 | 378 | 379 | class ConvBank(tf.keras.layers.Layer): 380 | def __init__(self, stack_count, filters): 381 | super(ConvBank, self).__init__() 382 | 383 | self.stack_count = stack_count 384 | self.filters = filters 385 | 386 | def build(self, input_shapes): 387 | self.layer_Dict = {} 388 | for index in range(self.stack_count): 389 | self.layer_Dict['ConvBank_{}'.format(index)] = tf.keras.Sequential() 390 | self.layer_Dict['ConvBank_{}'.format(index)].add(tf.keras.layers.Conv1D( 391 | filters= self.filters, 392 | kernel_size= index + 1, 393 | padding= 'same', 394 | use_bias= False 395 | )) 396 | self.layer_Dict['ConvBank_{}'.format(index)].add(tf.keras.layers.BatchNormalization()) 397 | self.layer_Dict['ConvBank_{}'.format(index)].add(tf.keras.layers.ReLU()) 398 | 399 | self.built = True 400 | 401 | def call(self, inputs): 402 | return tf.concat( 403 | [self.layer_Dict['ConvBank_{}'.format(index)](inputs) for index in range(self.stack_count)], 404 | axis= -1 405 | ) 406 | 407 | class Highwaynet(tf.keras.layers.Layer): 408 | def __init__(self, size): 409 | super(Highwaynet, self).__init__() 410 | self.layer_Dict = { 411 | 'Dense_Relu': tf.keras.layers.Dense( 412 | units= size, 413 | activation= 'relu' 414 | ), 415 | 'Dense_Sigmoid': tf.keras.layers.Dense( 416 | units= size, 417 | activation= 'sigmoid' 418 | ) 419 | } 420 | def call(self, inputs): 421 | h_Tensor = self.layer_Dict['Dense_Relu'](inputs) 422 | t_Tensor = self.layer_Dict['Dense_Sigmoid'](inputs) 423 | 424 | return h_Tensor * t_Tensor + inputs * (1.0 - t_Tensor) 425 | 426 | class ExponentialDecay(tf.keras.optimizers.schedules.ExponentialDecay): 427 | 428 | def __init__( 429 | self, 430 | initial_learning_rate, 431 | decay_steps, 432 | decay_rate, 433 | min_learning_rate= None, 434 | staircase=False, 435 | name=None 436 | ): 437 | super(ExponentialDecay, self).__init__( 438 | initial_learning_rate= initial_learning_rate, 439 | decay_steps= decay_steps, 440 | decay_rate= decay_rate, 441 | staircase= staircase, 442 | name= name 443 | ) 444 | 445 | self.min_learning_rate = min_learning_rate 446 | 447 | def __call__(self, step): 448 | learning_rate = super(ExponentialDecay, self).__call__(step) 449 | if self.min_learning_rate is None: 450 | return learning_rate 451 | 452 | return tf.maximum(learning_rate, self.min_learning_rate) 453 | 454 | def get_config(self): 455 | config_dict = super(ExponentialDecay, self).get_config() 456 | config_dict['min_learning_rate'] = self.min_learning_rate 457 | 458 | return config_dict 459 | 460 | # if __name__ == "__main__": 461 | # mels = tf.keras.layers.Input(shape=[None, 80], dtype= tf.float32) 462 | # tokens = tf.keras.layers.Input(shape=[None], dtype= tf.int32) 463 | # # ref_E = Reference_Encoder()(mels) 464 | # # st_L = Style_Token_Layer()(ref_E) 465 | 466 | # # print(mels) 467 | # # print(ref_E) 468 | # # print(st_L) 469 | 470 | # # enc = 
Tacotron_Encoder()(tokens) 471 | # # dec = Tacotron_Decoder()(inputs=[enc, mels]) 472 | 473 | # import numpy as np 474 | # tokens = np.random.randint(0, 33, size=(3, 52)).astype(np.int32) 475 | # mels = (np.random.rand(3, 50, 80).astype(np.float32) - 0.5) * 8 476 | # enc = Tacotron_Encoder()(inputs= tokens) 477 | # dec, _ = Tacotron_Decoder()(inputs=[enc, mels]) 478 | # spec = Vocoder_Taco1()(inputs= dec) 479 | # print(enc.get_shape()) 480 | # print(dec.get_shape()) 481 | # print(spec.get_shape()) -------------------------------------------------------------------------------- /Modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Modules/__init__.py -------------------------------------------------------------------------------- /Papers/He, Deng, He - 2019 - Robust sequence-to-sequence acoustic modeling with stepwise monotonic attention for neural TTS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/He, Deng, He - 2019 - Robust sequence-to-sequence acoustic modeling with stepwise monotonic attention for neural TTS.pdf -------------------------------------------------------------------------------- /Papers/Prenger, Valle, Catanzaro - 2019 - Waveglow A Flow-based Generative Network for Speech Synthesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Prenger, Valle, Catanzaro - 2019 - Waveglow A Flow-based Generative Network for Speech Synthesis.pdf -------------------------------------------------------------------------------- /Papers/Shen et al. - 2018 - Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Shen et al. - 2018 - Natural TTS Synthesis by Conditioning Wavenet on MEL Spectrogram Predictions.pdf -------------------------------------------------------------------------------- /Papers/Style Tokens Unsupervised Style Modeling Control and Transfer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Style Tokens Unsupervised Style Modeling Control and Transfer.pdf -------------------------------------------------------------------------------- /Papers/Wang et al. - 2017 - Tacotron Towards end-To-end speech synthesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Papers/Wang et al. 
- 2017 - Tacotron Towards end-To-end speech synthesis.pdf -------------------------------------------------------------------------------- /Pattern_Generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json, os, time, pickle, librosa, re, argparse 3 | from concurrent.futures import ThreadPoolExecutor as PE 4 | from collections import deque 5 | from threading import Thread 6 | from random import shuffle 7 | 8 | from Audio import melspectrogram, spectrogram, preemphasis, inv_preemphasis 9 | 10 | with open('Hyper_Parameters.json', 'r') as f: 11 | hp_Dict = json.load(f) 12 | 13 | with open(hp_Dict['Token_JSON_Path'], 'r') as f: 14 | token_Index_Dict = json.load(f) 15 | 16 | using_Extension = [x.upper() for x in ['.wav', '.m4a', '.flac']] 17 | regex_Checker = re.compile('[A-Z,.?!\-\s]+') 18 | max_Worker= 10 19 | 20 | def Text_Filtering(text): 21 | remove_Letter_List = ['(', ')', '?', '!', '\'', '\"', '[', ']', ':', ';'] 22 | replace_List = [(' ', ' '), (' ,', ',')] 23 | 24 | text = text.upper().strip() 25 | for filter in remove_Letter_List: 26 | text= text.replace(filter, '') 27 | for filter, replace_STR in replace_List: 28 | text= text.replace(filter, replace_STR) 29 | 30 | text= text.strip() 31 | 32 | if len(regex_Checker.findall(text)) > 1: 33 | return None 34 | elif text.startswith('\''): 35 | return None 36 | else: 37 | return regex_Checker.findall(text)[0] 38 | 39 | def Mel_Generate(path, top_db= 60, range_Ignore = False): 40 | sig = librosa.core.load( 41 | path, 42 | sr = hp_Dict['Sound']['Sample_Rate'] 43 | )[0] 44 | sig = preemphasis(sig) 45 | sig = librosa.effects.trim(sig, top_db= top_db, frame_length= 32, hop_length= 16)[0] * 0.99 46 | sig = inv_preemphasis(sig) 47 | 48 | sig_Length = sig.shape[0] / hp_Dict['Sound']['Sample_Rate'] * 1000 #ms 49 | if not range_Ignore and (sig_Length < hp_Dict['Train']['Min_Wav_Length'] or sig_Length > hp_Dict['Train']['Max_Wav_Length']): 50 | return None 51 | 52 | return np.transpose(melspectrogram( 53 | y= sig, 54 | num_freq= hp_Dict['Sound']['Spectrogram_Dim'], 55 | hop_length= hp_Dict['Sound']['Frame_Shift'], 56 | win_length= hp_Dict['Sound']['Frame_Length'], 57 | num_mels= hp_Dict['Sound']['Mel_Dim'], 58 | sample_rate= hp_Dict['Sound']['Sample_Rate'], 59 | max_abs_value= hp_Dict['Sound']['Max_Abs_Mel'] 60 | ).astype(np.float32)) 61 | 62 | def Spectrogram_Generate(path, top_db= 60, range_Ignore = False): 63 | sig = librosa.core.load( 64 | path, 65 | sr = hp_Dict['Sound']['Sample_Rate'] 66 | )[0] 67 | sig = preemphasis(sig) 68 | sig = librosa.effects.trim(sig, top_db= top_db, frame_length= 32, hop_length= 16)[0] * 0.99 69 | sig = inv_preemphasis(sig) 70 | 71 | sig_Length = sig.shape[0] / hp_Dict['Sound']['Sample_Rate'] * 1000 #ms 72 | if not range_Ignore and (sig_Length < hp_Dict['Train']['Min_Wav_Length'] or sig_Length > hp_Dict['Train']['Max_Wav_Length']): 73 | return None 74 | 75 | return np.transpose(spectrogram( 76 | y= sig, 77 | num_freq= hp_Dict['Sound']['Spectrogram_Dim'], 78 | hop_length= hp_Dict['Sound']['Frame_Shift'], 79 | win_length= hp_Dict['Sound']['Frame_Length'], 80 | sample_rate= hp_Dict['Sound']['Sample_Rate'], 81 | max_abs_value= hp_Dict['Sound']['Max_Abs_Mel'] 82 | ).astype(np.float32)) 83 | 84 | def Pattern_File_Generate(path, text, token_Index_Dict, dataset, file_Prefix='', display_Prefix = '', top_db= 60, range_Ignore = False): 85 | mel = Mel_Generate(path, top_db, range_Ignore) 86 | 87 | if mel is None: 88 | print('[{}]'.format(display_Prefix), 
'{}'.format(path), '->', 'Ignored because of length.') 89 | return 90 | 91 | spect = Spectrogram_Generate(path, top_db, range_Ignore) 92 | 93 | token = np.array( 94 | [token_Index_Dict['']] + [token_Index_Dict[letter] for letter in text] + [token_Index_Dict['']], 95 | dtype= np.int32 96 | ) 97 | 98 | new_Pattern_Dict = { 99 | 'Token': token, 100 | 'Mel': mel, 101 | 'Spectrogram': spect, 102 | 'Text': text, 103 | 'Dataset': dataset, 104 | } 105 | 106 | pickle_File_Name = '{}.{}{}.PICKLE'.format(dataset, file_Prefix, os.path.splitext(os.path.basename(path))[0]).upper() 107 | 108 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], pickle_File_Name).replace("\\", "/"), 'wb') as f: 109 | pickle.dump(new_Pattern_Dict, f, protocol=2) 110 | 111 | print('[{}]'.format(display_Prefix), '{}'.format(path), '->', '{}'.format(pickle_File_Name)) 112 | 113 | 114 | def VCTK_Info_Load(vctk_Path, max_Count= None): 115 | vctk_Wav_Path = os.path.join(vctk_Path, 'wav48').replace('\\', '/') 116 | vctk_Txt_Path = os.path.join(vctk_Path, 'txt').replace('\\', '/') 117 | with open(os.path.join(vctk_Path, 'VCTK.NonOutlier.txt').replace('\\', '/'), 'r') as f: 118 | vctk_Non_Outlier_List = [x.strip() for x in f.readlines()] 119 | # try: 120 | # with open(os.path.join(vctk_Path, 'VCTK.NonOutlier.txt').replace('\\', '/'), 'r') as f: 121 | # vctk_Non_Outlier_List = [x.strip() for x in f.readlines()] 122 | # except: 123 | # vctk_Non_Outlier_List = None 124 | 125 | vctk_File_Path_List = [] 126 | vctk_Text_Dict = {} 127 | for root, _, file_Name_List in os.walk(vctk_Wav_Path): 128 | for file_Name in file_Name_List: 129 | if not vctk_Non_Outlier_List is None and not file_Name in vctk_Non_Outlier_List: 130 | continue 131 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 132 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 133 | continue 134 | txt_File_Path = wav_File_Path.replace(vctk_Wav_Path, vctk_Txt_Path).replace('wav', 'txt') 135 | if not os.path.exists(txt_File_Path): 136 | continue 137 | with open(txt_File_Path, 'r') as f: 138 | text = Text_Filtering(f.read().strip()) 139 | if text is None: 140 | continue 141 | vctk_File_Path_List.append(wav_File_Path) 142 | vctk_Text_Dict[wav_File_Path] = text 143 | 144 | if not max_Count is None: 145 | vctk_File_Path_List = vctk_File_Path_List[:max_Count] 146 | 147 | print('VCTK info generated: {}'.format(len(vctk_File_Path_List))) 148 | return vctk_File_Path_List, vctk_Text_Dict 149 | 150 | def LS_Info_Load(ls_Path, max_Count= None): 151 | ls_File_Path_List = [] 152 | ls_Text_Dict = {} 153 | for root, _, file_Name_List in os.walk(ls_Path): 154 | speaker, text_ID = root.replace('\\', '/').split('/')[-2:] 155 | 156 | txt_File_Path = os.path.join(ls_Path, speaker, text_ID, '{}-{}.trans.txt'.format(speaker, text_ID)).replace('\\', '/') 157 | if not os.path.exists(txt_File_Path): 158 | continue 159 | 160 | with open(txt_File_Path, 'r') as f: 161 | text_Data = f.readlines() 162 | 163 | text_Dict = {} 164 | for text_Line in text_Data: 165 | text_Line = text_Line.strip().split(' ') 166 | text_Dict[text_Line[0]] = ' '.join(text_Line[1:]) 167 | 168 | for file_Name in file_Name_List: 169 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 170 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 171 | continue 172 | text = Text_Filtering(text_Dict[os.path.splitext(os.path.basename(wav_File_Path))[0]]) 173 | if text is None: 174 | continue 175 | ls_File_Path_List.append(wav_File_Path) 176 | ls_Text_Dict[wav_File_Path] 
= text 177 | 178 | if not max_Count is None: 179 | ls_File_Path_List = ls_File_Path_List[:max_Count] 180 | 181 | print('LS info generated: {}'.format(len(ls_File_Path_List))) 182 | return ls_File_Path_List, ls_Text_Dict 183 | 184 | def TIMIT_Info_Load(timit_Path, max_Count= None): 185 | timit_File_Path_List = [] 186 | timit_Text_List_Dict = {} 187 | for root, _, file_Name_List in os.walk(timit_Path): 188 | for file_Name in file_Name_List: 189 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 190 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 191 | continue 192 | txt_File_Path = wav_File_Path.replace('WAV', 'TXT') 193 | if not os.path.exists(txt_File_Path): 194 | continue 195 | with open(txt_File_Path, 'r') as f: 196 | text = Text_Filtering(' '.join(f.read().strip().split(' ')[2:]).strip()) 197 | if text is None: 198 | continue 199 | timit_File_Path_List.append(wav_File_Path) 200 | timit_Text_List_Dict[wav_File_Path] = text 201 | 202 | if not max_Count is None: 203 | timit_File_Path_List = timit_File_Path_List[:max_Count] 204 | 205 | print('TIMIT info generated: {}'.format(len(timit_File_Path_List))) 206 | return timit_File_Path_List, timit_Text_List_Dict 207 | 208 | def LJ_Info_Load(lj_Path, max_Count= None): 209 | lj_File_Path_List = [] 210 | lj_Text_Dict = {} 211 | 212 | text_Dict = {} 213 | with open(os.path.join(lj_Path, 'metadata.csv').replace('\\', '/'), 'r', encoding= 'utf-8') as f: 214 | readlines = f.readlines() 215 | 216 | for line in readlines: 217 | key, _, text = line.strip().split('|') 218 | text = Text_Filtering(text) 219 | if text is None: 220 | continue 221 | text_Dict[key.upper()] = text 222 | 223 | for root, _, file_Name_List in os.walk(lj_Path): 224 | for file_Name in file_Name_List: 225 | wav_File_Path = os.path.join(root, file_Name).replace('\\', '/') 226 | if not os.path.splitext(wav_File_Path)[1].upper() in using_Extension: 227 | continue 228 | if not os.path.splitext(file_Name)[0].upper() in text_Dict.keys(): 229 | continue 230 | lj_File_Path_List.append(wav_File_Path) 231 | lj_Text_Dict[wav_File_Path] = text_Dict[os.path.splitext(file_Name)[0].upper()] 232 | 233 | if not max_Count is None: 234 | lj_File_Path_List = lj_File_Path_List[:max_Count] 235 | 236 | print('LJ info generated: {}'.format(len(lj_File_Path_List))) 237 | return lj_File_Path_List, lj_Text_Dict 238 | 239 | def BC2013_Info_Load(bc2013_Path, max_Count= None): 240 | text_Path_List = [] 241 | for root, _, files in os.walk(bc2013_Path): 242 | for filename in files: 243 | if os.path.splitext(filename)[1].upper() != '.txt'.upper(): 244 | continue 245 | text_Path_List.append(os.path.join(root, filename).replace('\\', '/')) 246 | 247 | bc2013_File_Path_List = [] 248 | bc2013_Text_Dict = {} 249 | 250 | for text_Path in text_Path_List: 251 | wav_Path = text_Path.replace('txt', 'wav') 252 | if not os.path.exists(wav_Path): 253 | continue 254 | with open(text_Path, 'r') as f: 255 | text = Text_Filtering(f.read().strip()) 256 | if text is None: 257 | continue 258 | 259 | bc2013_File_Path_List.append(wav_Path) 260 | bc2013_Text_Dict[wav_Path] = text 261 | 262 | if not max_Count is None: 263 | bc2013_File_Path_List = bc2013_File_Path_List[:max_Count] 264 | 265 | print('BC2013 info generated: {}'.format(len(bc2013_File_Path_List))) 266 | return bc2013_File_Path_List, bc2013_Text_Dict 267 | 268 | def FV_Info_Load(fv_Path, max_Count= None): 269 | text_Path_List = [] 270 | for root, _, file_Name_List in os.walk(fv_Path): 271 | for file in file_Name_List: 272 | if 
os.path.splitext(file)[1] == '.data': 273 | text_Path_List.append(os.path.join(root, file).replace('\\', '/')) 274 | 275 | fv_File_Path_List = [] 276 | fv_Text_Dict = {} 277 | fv_Speaker_Dict = {} 278 | for text_Path in text_Path_List: 279 | speaker = text_Path.split('/')[-3].split('_')[2].upper() 280 | with open(text_Path, 'r') as f: 281 | lines = f.readlines() 282 | for line in lines: 283 | file_Path, text, _ = line.strip().split('"') 284 | 285 | file_Path = file_Path.strip().split(' ')[1] 286 | wav_File_Path = os.path.join( 287 | os.path.split(text_Path)[0].replace('etc', 'wav'), 288 | '{}.wav'.format(file_Path) 289 | ).replace('\\', '/') 290 | 291 | text = Text_Filtering(text) 292 | if text is None: 293 | continue 294 | fv_File_Path_List.append(wav_File_Path) 295 | fv_Text_Dict[wav_File_Path] = text 296 | fv_Speaker_Dict[wav_File_Path] = speaker 297 | 298 | if not max_Count is None: 299 | fv_File_Path_List = fv_File_Path_List[:max_Count] 300 | 301 | print('FV info generated: {}'.format(len(fv_File_Path_List))) 302 | return fv_File_Path_List, fv_Text_Dict, fv_Speaker_Dict 303 | 304 | 305 | 306 | def Metadata_Generate(token_Index_Dict): 307 | new_Metadata_Dict = { 308 | 'Token_Index_Dict': token_Index_Dict, 309 | 'Spectrogram_Dim': hp_Dict['Sound']['Spectrogram_Dim'], 310 | 'Mel_Dim': hp_Dict['Sound']['Mel_Dim'], 311 | 'Frame_Shift': hp_Dict['Sound']['Frame_Shift'], 312 | 'Frame_Length': hp_Dict['Sound']['Frame_Length'], 313 | 'Sample_Rate': hp_Dict['Sound']['Sample_Rate'], 314 | 'Max_Abs_Mel': hp_Dict['Sound']['Max_Abs_Mel'], 315 | 'File_List': [], 316 | 'Token_Length_Dict': {}, 317 | 'Mel_Length_Dict': {}, 318 | 'Dataset_Dict': {}, 319 | } 320 | 321 | for root, _, files in os.walk(hp_Dict['Train']['Pattern_Path']): 322 | for file in files: 323 | with open(os.path.join(root, file).replace("\\", "/"), "rb") as f: 324 | pattern_Dict = pickle.load(f) 325 | try: 326 | new_Metadata_Dict['Token_Length_Dict'][file] = pattern_Dict['Token'].shape[0] 327 | new_Metadata_Dict['Mel_Length_Dict'][file] = pattern_Dict['Mel'].shape[0] 328 | new_Metadata_Dict['Dataset_Dict'][file] = pattern_Dict['Dataset'] 329 | new_Metadata_Dict['File_List'].append(file) 330 | except: 331 | print('File \'{}\' is not correct pattern file. This file is ignored.'.format(file)) 332 | 333 | with open(os.path.join(hp_Dict['Train']['Pattern_Path'], hp_Dict['Train']['Metadata_File'].upper()).replace("\\", "/"), 'wb') as f: 334 | pickle.dump(new_Metadata_Dict, f, protocol=2) 335 | 336 | print('Metadata generate done.') 337 | 338 | if __name__ == '__main__': 339 | argParser = argparse.ArgumentParser() 340 | argParser.add_argument("-lj", "--lj_path", required=False) 341 | argParser.add_argument("-vctk", "--vctk_path", required=False) 342 | argParser.add_argument("-ls", "--ls_path", required=False) 343 | argParser.add_argument("-timit", "--timit_path", required=False) 344 | argParser.add_argument("-bc2013", "--bc2013_path", required=False) 345 | argParser.add_argument("-fv", "--fv_path", required=False) 346 | argParser.add_argument("-all", "--all_save", action='store_true') #When this parameter is False, only correct time range patterns are generated. 
347 | argParser.set_defaults(all_save = False) 348 | argParser.add_argument("-mc", "--max_count", required=False) 349 | argParser.add_argument("-mw", "--max_worker", required=False) 350 | argParser.set_defaults(max_worker = 10) 351 | argument_Dict = vars(argParser.parse_args()) 352 | 353 | if not argument_Dict['max_count'] is None: 354 | argument_Dict['max_count'] = int(argument_Dict['max_count']) 355 | 356 | total_Pattern_Count = 0 357 | 358 | if not argument_Dict['lj_path'] is None: 359 | lj_File_Path_List, lj_Text_Dict = LJ_Info_Load(lj_Path= argument_Dict['lj_path'], max_Count= argument_Dict['max_count']) 360 | total_Pattern_Count += len(lj_File_Path_List) 361 | if not argument_Dict['vctk_path'] is None: 362 | vctk_File_Path_List, vctk_Text_Dict = VCTK_Info_Load(vctk_Path= argument_Dict['vctk_path'], max_Count= argument_Dict['max_count']) 363 | total_Pattern_Count += len(vctk_File_Path_List) 364 | if not argument_Dict['ls_path'] is None: 365 | ls_File_Path_List, ls_Text_Dict = LS_Info_Load(ls_Path= argument_Dict['ls_path'], max_Count= argument_Dict['max_count']) 366 | total_Pattern_Count += len(ls_File_Path_List) 367 | if not argument_Dict['timit_path'] is None: 368 | timit_File_Path_List, timit_Text_List_Dict = TIMIT_Info_Load(timit_Path= argument_Dict['timit_path'], max_Count= argument_Dict['max_count']) 369 | total_Pattern_Count += len(timit_File_Path_List) 370 | if not argument_Dict['bc2013_path'] is None: 371 | bc2013_File_Path_List, bc2013_Text_List_Dict = BC2013_Info_Load(bc2013_Path= argument_Dict['bc2013_path'], max_Count= argument_Dict['max_count']) 372 | total_Pattern_Count += len(bc2013_File_Path_List) 373 | if not argument_Dict['fv_path'] is None: 374 | fv_File_Path_List, fv_Text_List_Dict, fv_Speaker_Dict = FV_Info_Load(fv_Path= argument_Dict['fv_path'], max_Count= argument_Dict['max_count']) 375 | total_Pattern_Count += len(fv_File_Path_List) 376 | 377 | if total_Pattern_Count == 0: 378 | raise ValueError('Total pattern count is zero.') 379 | 380 | os.makedirs(hp_Dict['Train']['Pattern_Path'], exist_ok= True) 381 | total_Generated_Pattern_Count = 0 382 | with PE(max_workers = int(argument_Dict['max_worker'])) as pe: 383 | if not argument_Dict['lj_path'] is None: 384 | for index, file_Path in enumerate(lj_File_Path_List): 385 | pe.submit( 386 | Pattern_File_Generate, 387 | file_Path, 388 | lj_Text_Dict[file_Path], 389 | token_Index_Dict, 390 | 'LJ', 391 | '', 392 | 'LJ {:05d}/{:05d} Total {:05d}/{:05d}'.format( 393 | index, 394 | len(lj_File_Path_List), 395 | total_Generated_Pattern_Count, 396 | total_Pattern_Count 397 | ), 398 | 60, 399 | argument_Dict['all_save'] 400 | ) 401 | total_Generated_Pattern_Count += 1 402 | 403 | if not argument_Dict['vctk_path'] is None: 404 | for index, file_Path in enumerate(vctk_File_Path_List): 405 | pe.submit( 406 | Pattern_File_Generate, 407 | file_Path, 408 | vctk_Text_Dict[file_Path], 409 | token_Index_Dict, 410 | 'VCTK', 411 | '', 412 | 'VCTK {:05d}/{:05d} Total {:05d}/{:05d}'.format( 413 | index, 414 | len(vctk_File_Path_List), 415 | total_Generated_Pattern_Count, 416 | total_Pattern_Count 417 | ), 418 | 15, 419 | argument_Dict['all_save'] 420 | ) 421 | total_Generated_Pattern_Count += 1 422 | 423 | if not argument_Dict['ls_path'] is None: 424 | for index, file_Path in enumerate(ls_File_Path_List): 425 | pe.submit( 426 | Pattern_File_Generate, 427 | file_Path, 428 | ls_Text_Dict[file_Path], 429 | token_Index_Dict, 430 | 'LS', 431 | '', 432 | 'LS {:05d}/{:05d} Total {:05d}/{:05d}'.format( 433 | index, 434 | len(ls_File_Path_List), 
435 | total_Generated_Pattern_Count, 436 | total_Pattern_Count 437 | ), 438 | 60, 439 | argument_Dict['all_save'] 440 | ) 441 | total_Generated_Pattern_Count += 1 442 | 443 | if not argument_Dict['timit_path'] is None: 444 | for index, file_Path in enumerate(timit_File_Path_List): 445 | pe.submit( 446 | Pattern_File_Generate, 447 | file_Path, 448 | timit_Text_List_Dict[file_Path], 449 | token_Index_Dict, 450 | 'TIMIT', 451 | '{}.'.format(file_Path.split('/')[-2]), 452 | 'TIMIT {:05d}/{:05d} Total {:05d}/{:05d}'.format( 453 | index, 454 | len(timit_File_Path_List), 455 | total_Generated_Pattern_Count, 456 | total_Pattern_Count 457 | ), 458 | 60, 459 | argument_Dict['all_save'] 460 | ) 461 | total_Generated_Pattern_Count += 1 462 | 463 | if not argument_Dict['bc2013_path'] is None: 464 | for index, file_Path in enumerate(bc2013_File_Path_List): 465 | pe.submit( 466 | Pattern_File_Generate, 467 | file_Path, 468 | bc2013_Text_List_Dict[file_Path], 469 | token_Index_Dict, 470 | 'BC2013', 471 | '{}.'.format(file_Path.split('/')[-2]), 472 | 'BC2013 {:05d}/{:05d} Total {:05d}/{:05d}'.format( 473 | index, 474 | len(bc2013_File_Path_List), 475 | total_Generated_Pattern_Count, 476 | total_Pattern_Count 477 | ), 478 | 60, 479 | argument_Dict['all_save'] 480 | ) 481 | total_Generated_Pattern_Count += 1 482 | 483 | if not argument_Dict['fv_path'] is None: 484 | for index, file_Path in enumerate(fv_File_Path_List): 485 | pe.submit( 486 | Pattern_File_Generate, 487 | file_Path, 488 | fv_Text_List_Dict[file_Path], 489 | token_Index_Dict, 490 | 'FV', 491 | '{}.'.format(fv_Speaker_Dict[file_Path]), 492 | 'FV {:05d}/{:05d} Total {:05d}/{:05d}'.format( 493 | index, 494 | len(fv_File_Path_List), 495 | total_Generated_Pattern_Count, 496 | total_Pattern_Count 497 | ), 498 | 60, 499 | argument_Dict['all_save'] 500 | ) 501 | total_Generated_Pattern_Count += 1 502 | 503 | Metadata_Generate(token_Index_Dict) -------------------------------------------------------------------------------- /ProgressBar.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright (c) 2016 Vladimir Ignatev 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining 5 | # a copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the Software 9 | # is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included 12 | # in all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 18 | # OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 19 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | 21 | # Script from 'https://gist.github.com/vladignatyev/06860ec2040cb497f0f3' 22 | 23 | import sys 24 | 25 | 26 | def progress(count, total, status=''): 27 | bar_len = 60 28 | filled_len = int(round(bar_len * count / float(total))) 29 | 30 | percents = round(100.0 * count / float(total), 1) 31 | bar = '=' * filled_len + '-' * (bar_len - filled_len) 32 | 33 | sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status)) 34 | sys.stdout.flush() # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GST Tacotron in TF2 2 | 3 | This code is an implementation of the paper 'Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis'. The algorithm is based on the following papers: 4 | 5 | ``` 6 | Wang, Y., Stanton, D., Zhang, Y., Skerry-Ryan, R. J., Battenberg, E., Shor, J., ... & Saurous, R. A. (2018). Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis. arXiv preprint arXiv:1803.09017. 7 | He, M., Deng, Y., & He, L. (2019). Robust Sequence-to-Sequence Acoustic Modeling with Stepwise Monotonic Attention for Neural TTS. arXiv preprint arXiv:1906.00672. 8 | Shen, J., Pang, R., Weiss, R. J., Schuster, M., Jaitly, N., Yang, Z., ... & Saurous, R. A. (2018, April). Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4779-4783). IEEE. 9 | Wang, Y., Skerry-Ryan, R. J., Stanton, D., Wu, Y., Weiss, R. J., Jaitly, N., ... & Le, Q. (2017). Tacotron: Towards end-to-end speech synthesis. arXiv preprint arXiv:1703.10135. 10 | ``` 11 | 12 | # Update 13 | * 2020-05-02 14 | * The BN and ReLU order has been fixed (now 'BN -> ReLU', with no bias). 15 | * Frame shift and frame window are based on samples for compatibility with vocoders. 16 | * `tf.train.Checkpoint` is used to save the optimizer parameters. Thus, the step information is also saved. 17 | 18 | # Requirements 19 | Please see 'Requirements.txt'. 20 | 21 | # Structure 22 | ![Structure](./Figures/Structure.png) 23 | 24 | Currently, the model supports only the Griffin-Lim vocoder. Supporting other vocoders is one of the future works. 25 | 26 | 27 | # Used dataset 28 | The currently uploaded code is compatible with the following datasets. Datasets marked with [O] were actually used for the uploaded results. 29 | 30 | ``` 31 | [O] LJSpeech: https://keithito.com/LJ-Speech-Dataset/ 32 | [X] VCTK: https://datashare.is.ed.ac.uk/handle/10283/2651 33 | [X] LibriSpeech: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ 34 | [X] TIMIT: http://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3 35 | [X] Blizzard Challenge 2013: http://www.cstr.ed.ac.uk/projects/blizzard/ 36 | [O] FastVox: http://www.festvox.org/cmu_arctic/index.html 37 | ``` 38 | 39 | # Hyper parameters 40 | Before proceeding, please set the pattern, inference, and checkpoint paths in 'Hyper_Parameters.json' according to your environment. 41 | 42 | 43 | 44 | * Sound 45 | * Setting basic sound parameters. 46 | 47 | * Token_JSON_Path 48 | * Setting the path of the text token dictionary. 49 | 50 | * GST 51 | * Setting the global style token modules. 52 | * If 'Use' is false, the model does not use GST and becomes plain Tacotron 2.
53 | * In 'Reference_Encoder/Conv', 'Filters', 'Kernel_Size', and 'Strides' must be lists of the same size. 54 | * In 'Style_Token/Attention', 'Size' must be divisible by 'Head'. 55 | 56 | * Tacotron1 57 | * Setting the parameters of Tacotron 1. 58 | * If 'Taco_Version' is 2, the parameters of this part will be ignored. 59 | * I recommend setting all 'Zoneout' parameters to 0.0 because cuDNN does not support recurrent_dropout yet. See the following reference. 60 | * https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM 61 | * Currently, this hyper parameter is ignored because Tacotron 1 is not implemented. 62 | 63 | * Tacotron2 64 | * Setting the parameters of Tacotron 2. 65 | * If 'Taco_Version' is 1, the parameters of this part will be ignored. 66 | * I recommend setting all 'Zoneout' parameters to 0.0 because cuDNN does not support recurrent_dropout yet. 67 | * See the following reference for details. 68 | * https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM 69 | 70 | * Step_Reduction 71 | * Setting how many output frames are generated at a single decoder step. 72 | 73 | * Max_Step 74 | * Setting the maximum number of mel or spectrogram steps during inference. 75 | 76 | * Vocoder_Taco1 77 | * Setting the parameters of the Griffin-Lim vocoder. 78 | 79 | * Train 80 | * Setting the parameters of training. 81 | 82 | * Taco_Version 83 | * Setting the Tacotron version. 84 | * Currently, this hyper parameter is ignored because Tacotron 1 is not implemented. 85 | * Use_Mixed_Precision 86 | * Setting the usage of mixed precision. 87 | * If used, the tensors are stored in 16 bits instead of 32 bits. 88 | * The weights are stored in 32 bits, so the model is compatible with checkpoints trained under a different mixed-precision setting if the rest of the parameters are the same. 89 | * Usually, this parameter makes it possible to use a larger batch size. 90 | * On unsupported machines, the speed is extremely slow. 91 | * When using it, I recommend increasing the epsilon of ADAM to 1e-4 to prevent underflow problems. 92 | * See the following reference for details. 93 | * https://www.tensorflow.org/api_docs/python/tf/keras/mixed_precision/experimental/Policy 94 | * Inference_Path 95 | * Setting the inference path. 96 | * Checkpoint_Path 97 | * Setting the checkpoint path. 98 | * Inference_Cut 99 | * The figure and wav files will be cut at the stop token when this parameter is true. 100 | * Device 101 | * Setting which GPU device is used in a multi-GPU environment. 102 | * Or, if using only the CPU, please set '-1'. 103 | 104 | # Generate pattern 105 | 106 | ## Command 107 | ``` 108 | python Pattern_Generator.py [parameters] 109 | ``` 110 | 111 | ## Parameters 112 | 113 | At least one dataset must be specified (an example invocation is shown after the parameter list). 114 | 115 | * -lj 116 | * Set the path of LJSpeech. LJSpeech's patterns are generated. 117 | * -vctk 118 | * Set the path of VCTK. VCTK's patterns are generated. 119 | * -ls 120 | * Set the path of LibriSpeech. LibriSpeech's patterns are generated. 121 | * -timit 122 | * Set the path of TIMIT. TIMIT's patterns are generated. 123 | * -bc2013 124 | * Set the path of Blizzard Challenge 2013. Blizzard Challenge 2013's patterns are generated. 125 | * -fv 126 | * Set the path of FastVox. FastVox's patterns are generated. 127 | * -all 128 | * All save option. 129 | * The generator ignores the 'Train/Min_Wav_Length' and 'Train/Max_Wav_Length' hyper parameters. 130 | * If this option is not set, only patterns matching 'Train/Min_Wav_Length' and 'Train/Max_Wav_Length' are generated. 131 | * -mc 132 | * Limit the number of generated patterns of each dataset to the given value. 133 | * -mw 134 | * The number of threads used to create the patterns.
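For example, a hypothetical invocation that generates LJSpeech and FastVox patterns with ten worker threads could look like the following (the dataset paths are placeholders for your local copies):

```
python Pattern_Generator.py -lj /path/to/LJSpeech-1.1 -fv /path/to/FastVox -mw 10
```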
135 | 136 | # Inference file paths for verification during training 137 | 138 | * Inference_Sentence_for_Training.txt 139 | * The sentence list used for inference during training. 140 | * Inference_Wav_for_Training.txt 141 | * The wav paths used for inference during training. 142 | * If 'GST/Use' is false, this will be ignored. 143 | * The number of paths must be 1 or equal to the number of sentences. 144 | 145 | # Run 146 | 147 | ## Command 148 | ``` 149 | python Model.py [parameters] 150 | ``` 151 | 152 | ## Parameters 153 | 154 | * -s 155 | * Set the start step. 156 | * In TF2, there is no global step. However, to decay the learning rate, the model requires the step value. 157 | * Default is 0. 158 | 159 | 160 | # Inference 161 | 162 | 1. Run 'ipython' in the model's directory. 163 | 2. Run the following commands: 164 | ``` 165 | from Model import GST_Tacotron 166 | new_GST_Tacotron = GST_Tacotron(is_Training= False) 167 | new_GST_Tacotron.Restore() 168 | ``` 169 | 3. Set the wav path list and sentence list as in the following example: 170 | 171 | ``` 172 | sentence_List = [ 173 | 'The grass is always greener on the other side of the fence.', 174 | 'Strike while the iron is hot.', 175 | 'A creative artist works on his next composition because he was not satisfied with his previous one.', 176 | 'You cannot make an omelet without breaking a few eggs.', 177 | ] 178 | wav_List_for_GST = [ 179 | './Wav_for_Inference/FV.AWB.arctic_a0001.wav', 180 | './Wav_for_Inference/FV.JMK.arctic_a0004.wav', 181 | './Wav_for_Inference/FV.SLT.arctic_a0007.wav', 182 | './Wav_for_Inference/LJ.LJ050-0278.wav', 183 | ] 184 | ``` 185 | __※The number of wav paths must be 1 or equal to the number of sentences.__ 186 | 187 | 188 | 4. Run the following command: 189 | ``` 190 | new_GST_Tacotron.Inference( 191 | sentence_List = sentence_List, 192 | wav_List_for_GST = wav_List_for_GST, 193 | label = 'Result' 194 | ) 195 | ``` 196 | 197 | # GST embedding inference 198 | 1. Follow steps 1 and 2 of [Inference](#Inference). 199 | 200 | 2. Set the wav path list and tag list as in the following example: 201 | ``` 202 | wav_List = [ 203 | './Wav_for_Inference/FV.AWB.arctic_a0001.wav', 204 | './Wav_for_Inference/FV.JMK.arctic_a0004.wav', 205 | './Wav_for_Inference/FV.SLT.arctic_a0007.wav', 206 | './Wav_for_Inference/LJ.LJ050-0278.wav', 207 | ] 208 | tag_List = [ 209 | 'AWB', 210 | 'JMK', 211 | 'SLT', 212 | 'LJ', 213 | ] 214 | ``` 215 | __※The two lists must have the same length.__ 216 | 217 | 3. Run the following command: 218 | 219 | * You can take the outputs as numpy arrays. 220 | 221 | ``` 222 | mels, stops, spectrograms, alignments = new_GST_Tacotron.Inference_GST(wav_List, tag_List) 223 | ``` 224 | 225 | 226 | 4. The result is saved as a text file in the inference directory. You can generate the t-SNE analysis graph with the [R script](./R_Script/TSNE.R). 227 | 228 | 229 | # Result 230 | * The following results are based on the checkpoint at 38000 steps with a batch size of 40 (43.77 epochs). 231 | * In the figures, the vertical line marks stop detection. 232 | * All speakers are distinguishable. 233 | * Voice quality is not perfect, but I attribute this to insufficient training steps and the use of Griffin-Lim rather than a neural vocoder. 234 | * I stopped training this model. I will focus on generating and attaching a vocoder. 235 | 236 | ## Mel for GST: FastVox AWB A0001 237 | * Sentence: The grass is always greener on the other side of the fence.
238 | 239 | [Wav_IDX_0](./Example_Results/Wav/20200505.214958.IDX_0.WAV) 240 | ![Figure_IDX_0](./Example_Results/Figures/20200505.214958.IDX_0.PNG) 241 | 242 | ## Mel for GST: FastVox BDL A0002 243 | * Sentence: Strike while the iron is hot. 244 | 245 | [Wav_IDX_1](./Example_Results/Wav/20200505.214958.IDX_1.WAV) 246 | ![Figure_IDX_1](./Example_Results/Figures/20200505.214958.IDX_1.PNG) 247 | 248 | ## Mel for GST: FastVox CLB A0003 249 | * Sentence: A creative artist works on his next composition because he was not satisfied with his previous one. 250 | 251 | [Wav_IDX_2](./Example_Results/Wav/20200505.214958.IDX_2.WAV) 252 | ![Figure_IDX_2](./Example_Results/Figures/20200505.214958.IDX_2.PNG) 253 | 254 | ## Mel for GST: FastVox JMK A0004 255 | * Sentence: You cannot make an omelet without breaking a few eggs. 256 | 257 | [Wav_IDX_3](./Example_Results/Wav/20200505.214958.IDX_3.WAV) 258 | ![Figure_IDX_3](./Example_Results/Figures/20200505.214958.IDX_3.PNG) 259 | 260 | ## Mel for GST: FastVox KSP A0005 261 | * Sentence: Death is like a fisherman who catches fish in his net and leaves them for a while in the water. The fish is still swimming but the net is around him, and the fisherman will draw him up. 262 | 263 | [Wav_IDX_4](./Example_Results/Wav/20200505.214958.IDX_4.WAV) 264 | ![Figure_IDX_4](./Example_Results/Figures/20200505.214958.IDX_4.PNG) 265 | 266 | ## Mel for GST: FastVox RMS A0006 267 | * Sentence: A man who marries a woman to educate her falls a victim to the same fallacy as the woman who marries a man to reform him. 268 | 269 | [Wav_IDX_5](./Example_Results/Wav/20200505.214958.IDX_5.WAV) 270 | ![Figure_IDX_5](./Example_Results/Figures/20200505.214958.IDX_5.PNG) 271 | 272 | ## Mel for GST: FastVox SLT A0007 273 | * Sentence: Birds of a feather flock together. 274 | 275 | [Wav_IDX_6](./Example_Results/Wav/20200505.214958.IDX_6.WAV) 276 | ![Figure_IDX_6](./Example_Results/Figures/20200505.214958.IDX_6.PNG) 277 | 278 | ## Mel for GST: LJSpeech LJ050-0278 279 | * Sentence: Too many cooks in the kitchen spoil the broth. 280 | 281 | [Wav_IDX_7](./Example_Results/Wav/20200505.214958.IDX_7.WAV) 282 | ![Figure_IDX_7](./Example_Results/Figures/20200505.214958.IDX_7.PNG) 283 | 284 | ## GST embedding t-SNE 285 | ![GST_Embedding](./Example_Results/GST/20200506.001527.GST.PNG) 286 | 287 | # Trained checkpoint 288 | 289 | [Checkpoint here](https://drive.google.com/open?id=1qcm_eUS7R2Xa7N5quD1r0Iy2qQl1r6wd) 290 | 291 | * This is the checkpoint at 38000 steps with a batch size of 24 (43.77 epochs). 292 | * The hyper-parameter file for this checkpoint is included in the zip file. 293 | 294 | 295 | # Future works 296 | 1. Vocoder attachment. (I am looking into several vocoders....) 297 | ``` 298 | Prenger, R., Valle, R., & Catanzaro, B. (2019, May). Waveglow: A flow-based generative network for speech synthesis. In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 3617-3621). IEEE. 299 | Oord, A. V. D., Dieleman, S., Zen, H., Simonyan, K., Vinyals, O., Graves, A., ... & Kavukcuoglu, K. (2016). Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499. 300 | Kalchbrenner, N., Elsen, E., Simonyan, K., Noury, S., Casagrande, N., Lockhart, E., ... & Kavukcuoglu, K. (2018). Efficient neural audio synthesis. arXiv preprint arXiv:1802.08435. 301 | Yamamoto, R., Song, E., & Kim, J. M. (2020, May). Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram.
In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 6199-6203). IEEE. 302 | Kumar, K., Kumar, R., de Boissiere, T., Gestin, L., Teoh, W. Z., Sotelo, J., ... & Courville, A. C. (2019). Melgan: Generative adversarial networks for conditional waveform synthesis. In Advances in Neural Information Processing Systems (pp. 14881-14892). 303 | ``` 304 | 305 | 2. Tacotron 1 module update 306 | * Original paper used the tacotron 1, not tacotron 2. 307 | * I hope to add tacotron 1 for performance comparison and more. 308 | -------------------------------------------------------------------------------- /R_Script/TSNE.R: -------------------------------------------------------------------------------- 1 | # Refer: https://statkclee.github.io/model/model-tsne.html 2 | 3 | library(tidyverse) 4 | library(Rtsne) 5 | library(tools) 6 | 7 | base_Dir <- 'D:/GST.Results/Inference/GST/' 8 | 9 | list.files(base_Dir) 10 | for(file in list.files(base_Dir)) 11 | { 12 | if (toupper(file_ext(file)) != 'TXT') 13 | { 14 | next 15 | } 16 | else if (file.exists(sprintf('%s%s', base_Dir, str_replace(file, '.TXT', '.PNG')))) 17 | { 18 | next 19 | } 20 | 21 | gst.Data <- read_delim( 22 | sprintf('%s%s', base_Dir, file), 23 | "\t", 24 | escape_double = FALSE, 25 | locale = locale(encoding = "UTF-8"), 26 | trim_ws = TRUE 27 | ) 28 | gst.TSNE <- Rtsne( 29 | gst.Data[,c(-1)], 30 | PCA = TRUE, 31 | check_duplicates = FALSE, 32 | dims = 2, 33 | max_iter = 1000, 34 | perplexity= 5 35 | ) 36 | gst.TSNE.DF <- data.frame( 37 | TSNE_x = gst.TSNE$Y[, 1], 38 | TSNE_y = gst.TSNE$Y[, 2], 39 | Data_Tag = gst.Data$Tag 40 | ) 41 | 42 | 43 | plot <- ggplot(data= gst.TSNE.DF, aes(x= TSNE_x, y= TSNE_y, color= Data_Tag)) + 44 | geom_point() + 45 | #geom_text(aes(label= Data_Tag)) + 46 | labs(title= 'GST t-SNE', x= '', y= '') + 47 | theme_bw() + 48 | theme( 49 | axis.title.x = element_blank(), 50 | axis.title.y = element_blank(), 51 | # axis.text = element_blank(), 52 | strip.text = element_text(size = 20), 53 | panel.grid=element_blank(), 54 | legend.position = 'right', 55 | plot.title = element_text(hjust = 0.5) 56 | ) 57 | 58 | 59 | ggsave( 60 | filename = sprintf('%s%s', base_Dir, str_replace(file, '.TXT', '.PNG')), 61 | plot = plot, 62 | device = "png", width = 12, height = 10, units = "cm", dpi = 300 63 | ) 64 | } -------------------------------------------------------------------------------- /R_Script/VCTK_Outlier_Checker.R: -------------------------------------------------------------------------------- 1 | library(readr) 2 | library(ggplot2) 3 | library(car) 4 | 5 | repeat_Count <- 1000 6 | base_Dir = 'D:/Python_Programming/GST_Tacotron/' 7 | 8 | vctk_Length.Data <- read_delim( 9 | sprintf('%sVCTK_Length.txt', base_Dir), 10 | "\t", 11 | escape_double = FALSE, 12 | locale = locale(encoding = "UTF-8"), 13 | trim_ws = TRUE 14 | ) 15 | 16 | vctk_Length.Sig <- vctk_Length.Data[c(-4,-5)] 17 | vctk_Length.Trim <- vctk_Length.Data[c(-3,-5)] 18 | vctk_Length.Split <- vctk_Length.Data[c(-3,-4)] 19 | 20 | vctk_Length.Sig.Plot <- ggplot(vctk_Length.Sig, aes(x= Sig_Length, y= Text_Length)) + 21 | geom_point() + 22 | labs(title=sprintf('Original Sig count: %s', nrow(vctk_Length.Sig))) + 23 | geom_smooth(method = "lm") 24 | vctk_Length.Trim.Plot <- ggplot(vctk_Length.Trim, aes(x= Trim_Length, y= Text_Length)) + 25 | geom_point() + 26 | labs(title=sprintf('Original Trim count: %s', nrow(vctk_Length.Trim))) + 27 | geom_smooth(method = "lm") 28 | vctk_Length.Split.Plot <- ggplot(vctk_Length.Split, 
aes(x= Split_Length, y= Text_Length)) + 29 | geom_point() + 30 | labs(title=sprintf('Original Split count: %s', nrow(vctk_Length.Split))) + 31 | geom_smooth(method = "lm") 32 | 33 | for (index in seq(repeat_Count)) 34 | { 35 | vctk_Length.Sig$Num <- row.names(vctk_Length.Sig) 36 | vctk_Length.Trim$Num <- row.names(vctk_Length.Trim) 37 | vctk_Length.Split$Num <- row.names(vctk_Length.Split) 38 | 39 | vctk_Length.Sig.LM <- lm( 40 | Sig_Length ~ Text_Length + I(Text_Length^2), 41 | data=vctk_Length.Sig 42 | ) 43 | vctk_Length.Trim.LM <- lm( 44 | Trim_Length ~ Text_Length + I(Text_Length^2), 45 | data=vctk_Length.Trim 46 | ) 47 | vctk_Length.Split.LM <- lm( 48 | Split_Length ~ Text_Length + I(Text_Length^2), 49 | data=vctk_Length.Split 50 | ) 51 | 52 | vctk_Length.Sig.Outlier <- outlierTest(vctk_Length.Sig.LM) 53 | vctk_Length.Trim.Outlier <- outlierTest(vctk_Length.Trim.LM) 54 | vctk_Length.Split.Outlier <- outlierTest(vctk_Length.Split.LM) 55 | 56 | vctk_Length.Sig$Outlier <- vctk_Length.Sig$Num %in% as.numeric(names(vctk_Length.Sig.Outlier$p)) 57 | vctk_Length.Trim$Outlier <- vctk_Length.Trim$Num %in% as.numeric(names(vctk_Length.Trim.Outlier$p)) 58 | vctk_Length.Split$Outlier <- vctk_Length.Split$Num %in% as.numeric(names(vctk_Length.Split.Outlier$p)) 59 | 60 | vctk_Length.Sig <- subset(vctk_Length.Sig, !Outlier) 61 | vctk_Length.Trim <- subset(vctk_Length.Trim, !Outlier) 62 | vctk_Length.Split <- subset(vctk_Length.Split, !Outlier) 63 | } 64 | 65 | vctk_Length.Sig.Plot.Remove_Outlier <- ggplot(vctk_Length.Sig, aes(x= Sig_Length, y= Text_Length)) + 66 | geom_point() + 67 | labs(title=sprintf('Outlier removed Sig count: %s', nrow(vctk_Length.Sig))) + 68 | geom_smooth(method = "lm") 69 | vctk_Length.Trim.Plot.Remove_Outlier <- ggplot(vctk_Length.Trim, aes(x= Trim_Length, y= Text_Length)) + 70 | geom_point() + 71 | labs(title=sprintf('Outlier removed Trim count: %s', nrow(vctk_Length.Trim))) + 72 | geom_smooth(method = "lm") 73 | vctk_Length.Split.Plot.Remove_Outlier <- ggplot(vctk_Length.Split, aes(x= Split_Length, y= Text_Length)) + 74 | geom_point() + 75 | labs(title=sprintf('Outlier removed Split count: %s', nrow(vctk_Length.Split))) + 76 | geom_smooth(method = "lm") 77 | 78 | 79 | 80 | ggsave( 81 | filename = sprintf('%sSig.Original.png', base_Dir), 82 | plot = vctk_Length.Sig.Plot, 83 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 84 | ) 85 | ggsave( 86 | filename = sprintf('%sTrim.Original.png', base_Dir), 87 | plot = vctk_Length.Trim.Plot, 88 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 89 | ) 90 | ggsave( 91 | filename = sprintf('%sSplit.Original.png', base_Dir), 92 | plot = vctk_Length.Split.Plot, 93 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 94 | ) 95 | ggsave( 96 | filename = sprintf('%sSig.RemoveOutlier.png', base_Dir), 97 | plot = vctk_Length.Sig.Plot.Remove_Outlier, 98 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 99 | ) 100 | ggsave( 101 | filename = sprintf('%sTrim.RemoveOutlier.png', base_Dir), 102 | plot = vctk_Length.Trim.Plot.Remove_Outlier, 103 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 104 | ) 105 | ggsave( 106 | filename = sprintf('%sSplit.RemoveOutlier.png', base_Dir), 107 | plot = vctk_Length.Split.Plot.Remove_Outlier, 108 | device = "png", width = 12, height = 12, units = "cm", dpi = 300 109 | ) 110 | 111 | write.table(vctk_Length.Trim[c(1)], sprintf('%svctk_nonoutlier.txt', base_Dir),sep='\t', row.names=FALSE, quote= FALSE) 112 | 
-------------------------------------------------------------------------------- /Requirements.txt: -------------------------------------------------------------------------------- 1 | librosa>=0.7.2 2 | matplotlib>=3.1.1 3 | tensorflow>=2.1.2 4 | -------------------------------------------------------------------------------- /Token_Index_Dict.ENG.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "": 1, 4 | " ": 2, 5 | "!": 3, 6 | ",": 4, 7 | "-": 5, 8 | ".": 6, 9 | "?": 7, 10 | "A": 8, 11 | "B": 9, 12 | "C": 10, 13 | "D": 11, 14 | "E": 12, 15 | "F": 13, 16 | "G": 14, 17 | "H": 15, 18 | "I": 16, 19 | "J": 17, 20 | "K": 18, 21 | "L": 19, 22 | "M": 20, 23 | "N": 21, 24 | "O": 22, 25 | "P": 23, 26 | "Q": 24, 27 | "R": 25, 28 | "S": 26, 29 | "T": 27, 30 | "U": 28, 31 | "V": 29, 32 | "W": 30, 33 | "X": 31, 34 | "Y": 32, 35 | "Z": 33 36 | } -------------------------------------------------------------------------------- /Wav_for_Inference/FV.AWB.arctic_a0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.AWB.arctic_a0001.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.BDL.arctic_a0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.BDL.arctic_a0002.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.CLB.arctic_a0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.CLB.arctic_a0003.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.JMK.arctic_a0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.JMK.arctic_a0004.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.KSP.arctic_a0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.KSP.arctic_a0005.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.RMS.arctic_a0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.RMS.arctic_a0006.wav -------------------------------------------------------------------------------- /Wav_for_Inference/FV.SLT.arctic_a0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/FV.SLT.arctic_a0007.wav -------------------------------------------------------------------------------- /Wav_for_Inference/LJ.LJ050-0278.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CODEJIN/GST_Tacotron/d341feff9ce811defff00576dcae8dc96069b8f7/Wav_for_Inference/LJ.LJ050-0278.wav --------------------------------------------------------------------------------
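A minimal sketch for inspecting the generated patterns, assuming pattern generation has already been run with the default 'Hyper_Parameters.json' referenced by the code above. It only relies on the keys that Pattern_Generator.py actually writes ('Token', 'Mel', 'Spectrogram', 'Text', 'Dataset', and the metadata's 'File_List'); it is not part of the repository itself.

```
import json, os, pickle

# Read the same hyper parameter file that Pattern_Generator.py uses.
with open('Hyper_Parameters.json', 'r') as f:
    hp_Dict = json.load(f)

pattern_Path = hp_Dict['Train']['Pattern_Path']

# Metadata pickle written by Metadata_Generate().
with open(os.path.join(pattern_Path, hp_Dict['Train']['Metadata_File'].upper()).replace('\\', '/'), 'rb') as f:
    metadata_Dict = pickle.load(f)

print('Pattern count:', len(metadata_Dict['File_List']))

# Each pattern pickle stores 'Token', 'Mel', 'Spectrogram', 'Text', and 'Dataset'.
with open(os.path.join(pattern_Path, metadata_Dict['File_List'][0]).replace('\\', '/'), 'rb') as f:
    pattern_Dict = pickle.load(f)

print('Text:', pattern_Dict['Text'])
print('Dataset:', pattern_Dict['Dataset'])
print('Token shape:', pattern_Dict['Token'].shape)              # [Token_Length]
print('Mel shape:', pattern_Dict['Mel'].shape)                  # [Mel_Length, Mel_Dim]
print('Spectrogram shape:', pattern_Dict['Spectrogram'].shape)  # [Spec_Length, Spectrogram_Dim]
```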