├── dllogger
│   ├── __init__.py
│   ├── autologging.py
│   ├── tags.py
│   └── logger.py
├── scripts
│   ├── docker
│   │   ├── build.sh
│   │   └── interactive.sh
│   ├── griffin_lim_synth.sh
│   ├── gta_synth.sh
│   ├── train_tacotron2.sh
│   ├── prepare_dataset.sh
│   └── prepare_mels.sh
├── requirements.txt
├── Dockerfile
├── tacotron2
│   ├── text
│   │   ├── symbols.py
│   │   ├── LICENCE
│   │   ├── cmudict.py
│   │   ├── numbers.py
│   │   ├── __init__.py
│   │   └── cleaners.py
│   ├── loss_function.py
│   ├── loader.py
│   ├── data_function.py
│   └── model.py
├── README.md
├── LICENCE
├── .gitignore
├── plot.py
├── multiproc.py
├── common
│   ├── preprocessor.py
│   ├── utils.py
│   ├── audio_processing.py
│   ├── layers.py
│   ├── audio.py
│   └── stft.py
├── preprocess.py
├── inference.py
├── gta.py
├── hparams.py
├── filelists
│   ├── ljs_mel_text_val_filelist.txt
│   └── ljs_audio_text_val_filelist.txt
└── train.py

--------------------------------------------------------------------------------
/dllogger/__init__.py:
--------------------------------------------------------------------------------
1 |

--------------------------------------------------------------------------------
/scripts/docker/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker build . --rm -t tacotron2
4 |

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | numpy
3 | inflect
4 | scipy
5 | Unidecode
6 | pillow
7 | # NVIDIA apex (needed for --amp-run) cannot be installed from PyPI; the PyPI
8 | # package named "apex" is unrelated. Install it from https://github.com/NVIDIA/apex

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:19.08-py3
2 |
3 | ADD . /workspace/tacotron2
4 | WORKDIR /workspace/tacotron2
5 | RUN pip install -r requirements.txt
6 |

--------------------------------------------------------------------------------
/scripts/griffin_lim_synth.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python inference.py -i text.txt -o outputs --amp-run --speaker-num 4 --speaker-id 0 --log-file nvlog.json
2 |

--------------------------------------------------------------------------------
/scripts/docker/interactive.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | nvidia-docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -it --rm --ipc=host -v $PWD:/workspace/tacotron2/ tacotron2 bash
4 |

--------------------------------------------------------------------------------
/scripts/gta_synth.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python gta.py --amp-run -o gta --dataset-path training_data --training-anchor-dirs tts_fanfanli_22050 tts_xiaoya_22050 tts_yangluzhuo_22050 tts_yuanzhonglu_22050
2 |

--------------------------------------------------------------------------------
/scripts/train_tacotron2.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python train.py --amp-run -o logs --init-lr 1e-3 --final-lr 1e-5 --epochs 200 -bs 32 --weight-decay 1e-6 --log-file nvlog.json --dataset-path training_data --load-mel-from-disk --training-anchor-dirs tts_fanfanli_22050 tts_xiaoya_22050 tts_yangluzhuo_22050 tts_yuanzhonglu_22050
2 |

--------------------------------------------------------------------------------
/scripts/prepare_dataset.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | DATADIR="LJSpeech-1.1"
6 | BZ2ARCHIVE="${DATADIR}.tar.bz2"
7 | ENDPOINT="http://data.keithito.com/data/speech/$BZ2ARCHIVE"
8 |
9 | if [ ! -d "$DATADIR" ]; then
10 |   echo "dataset is missing, unpacking ..."
11 |   if [ ! -f "$BZ2ARCHIVE" ]; then
12 |     echo "dataset archive is missing, downloading ..."
13 |     wget "$ENDPOINT"
14 |   fi
15 |   tar jxvf "$BZ2ARCHIVE"
16 | fi
17 |

--------------------------------------------------------------------------------
/tacotron2/text/symbols.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Defines the set of symbols used in text input to the model.
5 |
6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
7 | from tacotron2.text import cmudict
8 |
9 | _pad = '_'
10 | _eos = '~'
11 | _punctuation = '!\'(),.:;? '
12 | _special = '-'
13 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890'
14 |
15 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
16 | _arpabet = ['@' + s for s in cmudict.valid_symbols]
17 |
18 | # Export all symbols:
19 | symbols = [_pad, _eos] + list(_special) + list(_punctuation) + list(_letters)  # + _arpabet

--------------------------------------------------------------------------------
/scripts/prepare_mels.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | DATADIR="ljs"
6 | FILELISTSDIR="filelists"
7 |
8 | TESTLIST="$FILELISTSDIR/ljs_audio_text_test_filelist.txt"
9 | TRAINLIST="$FILELISTSDIR/ljs_audio_text_train_filelist.txt"
10 | VALLIST="$FILELISTSDIR/ljs_audio_text_val_filelist.txt"
11 |
12 | TESTLIST_MEL="$FILELISTSDIR/ljs_mel_text_test_filelist.txt"
13 | TRAINLIST_MEL="$FILELISTSDIR/ljs_mel_text_train_filelist.txt"
14 | VALLIST_MEL="$FILELISTSDIR/ljs_mel_text_val_filelist.txt"
15 |
16 | mkdir -p "$DATADIR/mels"
17 | if [ $(ls $DATADIR/mels | wc -l) -ne 13100 ]; then
18 |   python preprocess_audio2mel.py --wav-files "$TRAINLIST" --mel-files "$TRAINLIST_MEL"
19 |   python preprocess_audio2mel.py --wav-files "$TESTLIST" --mel-files "$TESTLIST_MEL"
20 |   python preprocess_audio2mel.py --wav-files "$VALLIST" --mel-files "$VALLIST_MEL"
21 | fi
22 |

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tacotron 2 for PyTorch
2 |
3 | This repository provides a script and recipe to train Tacotron 2. The source is forked from [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2) and combined with [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2). It supports multi-speaker TTS and GTA (ground-truth-aligned) synthesis, as well as a configurable reduction factor.
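For readers unfamiliar with the reduction factor: with a factor r, the decoder predicts r mel frames per step, which shortens the decoded sequence and makes attention alignment easier. The sketch below is purely illustrative of how targets can be grouped under an assumed factor r and a `(T, n_mels)` target layout; it is not this repository's exact implementation.

```python
import numpy as np

# Illustrative only: group mel targets so one decoder step covers r frames.
# The names r and mel, and the (T, n_mels) layout, are assumptions for this sketch.
r = 2                                                 # hypothetical reduction factor
mel = np.random.randn(100, 80)                        # (T, n_mels) target spectrogram
T = (mel.shape[0] // r) * r                           # truncate so T is divisible by r
grouped = mel[:T].reshape(T // r, r * mel.shape[1])   # (T/r, r*n_mels): r frames per step
```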
4 |
5 | ## Run the scripts
6 | ```shell
7 | # Preprocessing
8 | python preprocess.py
9 | # Training
10 | nohup bash scripts/train_tacotron2.sh &
11 | # Evaluation
12 | bash scripts/griffin_lim_synth.sh
13 | # GTA synthesis
14 | bash scripts/gta_synth.sh
15 | ```
16 |
17 | ## Recommended vocoders
18 | [WaveRNN](https://github.com/begeekmyfriend/WaveRNN)
19 |
20 | [WaveGlow](https://github.com/begeekmyfriend/WaveGlow)
21 |
22 | [SqueezeWave](https://github.com/begeekmyfriend/SqueezeWave)
23 |
24 | ## Audio samples
25 | [Two males and two females in Chinese](https://github.com/begeekmyfriend/tacotron2/issues/1)

--------------------------------------------------------------------------------
/tacotron2/text/LICENCE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Keith Ito
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.

--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018, NVIDIA Corporation
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | logs/
28 | runs/
29 | *.out
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 |

--------------------------------------------------------------------------------
/tacotron2/text/cmudict.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import re
4 |
5 |
6 | valid_symbols = [
7 |     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
8 |     'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
9 |     'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
10 |     'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
11 |     'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
12 |     'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
13 |     'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
14 | ]
15 |
16 | _valid_symbol_set = set(valid_symbols)
17 |
18 |
19 | class CMUDict:
20 |     '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
21 |     def __init__(self, file_or_path, keep_ambiguous=True):
22 |         if isinstance(file_or_path, str):
23 |             with open(file_or_path, encoding='latin-1') as f:
24 |                 entries = _parse_cmudict(f)
25 |         else:
26 |             entries = _parse_cmudict(file_or_path)
27 |         if not keep_ambiguous:
28 |             entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
29 |         self._entries = entries
30 |
31 |
32 |     def __len__(self):
33 |         return len(self._entries)
34 |
35 |
36 |     def lookup(self, word):
37 |         '''Returns list of ARPAbet pronunciations of the given word.'''
38 |         return self._entries.get(word.upper())
39 |
40 |
41 |
42 | _alt_re = re.compile(r'\([0-9]+\)')
43 |
44 |
45 | def _parse_cmudict(file):
46 |     cmudict = {}
47 |     for line in file:
48 |         if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
49 |             parts = line.split('  ')  # CMUDict separates word and pronunciation with two spaces
50 |             word = re.sub(_alt_re, '', parts[0])
51 |             pronunciation = _get_pronunciation(parts[1])
52 |             if pronunciation:
53 |                 if word in cmudict:
54 |                     cmudict[word].append(pronunciation)
55 |                 else:
56 |                     cmudict[word] = [pronunciation]
57 |     return cmudict
58 |
59 |
60 | def _get_pronunciation(s):
61 |     parts = s.strip().split(' ')
62 |     for part in parts:
63 |         if part not in _valid_symbol_set:
64 |             return None
65 |     return ' '.join(parts)
66 |

--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 |
7 | def split_title_line(title_text, max_words=5):
8 |     """
9 |     Splits a title string into multiple lines,
10 |     with at most max_words words per line.
11 |     """
12 |     seq = title_text.split()
13 |     return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])
14 |
15 |
16 | def plot_alignment(alignment, path, info=None, split_title=False):
17 |     fig = plt.figure(figsize=(8, 6))
18 |     ax = fig.add_subplot(111)
19 |
20 |     im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
21 |     fig.colorbar(im, ax=ax)
22 |     xlabel = 'Decoder timestep'
23 |     title = split_title_line(info) if split_title else info
24 |     plt.xlabel(xlabel)
25 |     plt.title(title)
26 |     plt.ylabel('Encoder timestep')
27 |     plt.tight_layout()
28 |     plt.savefig(path, format='png')
29 |     plt.close()
30 |
31 |
32 | def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
33 |     if max_len is not None:
34 |         target_spectrogram = target_spectrogram[:max_len]
35 |         pred_spectrogram = pred_spectrogram[:max_len]
36 |
37 |     title = split_title_line(info) if split_title else info
38 |     fig = plt.figure(figsize=(10, 8))
39 |     # Set common labels
40 |     fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
41 |
42 |     # target spectrogram subplot
43 |     if target_spectrogram is not None:
44 |         ax1 = fig.add_subplot(311)
45 |         ax2 = fig.add_subplot(312)
46 |
47 |         if auto_aspect:
48 |             im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
49 |         else:
50 |             im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
51 |         ax1.set_title('Target Mel-Spectrogram')
52 |         fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
53 |         ax2.set_title('Predicted Mel-Spectrogram')
54 |     else:
55 |         ax2 = fig.add_subplot(211)
56 |
57 |     if auto_aspect:
58 |         im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
59 |     else:
60 |         im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
61 |     fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
62 |
63 |     plt.tight_layout()
64 |     plt.savefig(path, format='png')
65 |     plt.close()
66 |

--------------------------------------------------------------------------------
/tacotron2/loss_function.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | from torch import nn
29 |
30 |
31 | class Tacotron2Loss(nn.Module):
32 |     def __init__(self):
33 |         super(Tacotron2Loss, self).__init__()
34 |
35 |     def forward(self, model_output, targets):
36 |         mel_target, gate_target = targets[0], targets[1]
37 |         mel_out_before, mel_out_after, gate_out, _ = model_output
38 |
39 |         mel_loss = nn.MSELoss()(mel_out_before, mel_target) + nn.MSELoss()(mel_out_after, mel_target)
40 |         gate_loss = nn.BCEWithLogitsLoss()(gate_out.view(-1, 1), gate_target.view(-1, 1))
41 |         return mel_loss + gate_loss
42 |

--------------------------------------------------------------------------------
/tacotron2/text/numbers.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import inflect
4 | import re
5 |
6 |
7 | _inflect = inflect.engine()
8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
13 | _number_re = re.compile(r'[0-9]+')
14 |
15 |
16 | def _remove_commas(m):
17 |     return m.group(1).replace(',', '')
18 |
19 |
20 | def _expand_decimal_point(m):
21 |     return m.group(1).replace('.', ' point ')
22 |
23 |
24 | def _expand_dollars(m):
25 |     match = m.group(1)
26 |     parts = match.split('.')
27 |     if len(parts) > 2:
28 |         return match + ' dollars'  # Unexpected format
29 |     dollars = int(parts[0]) if parts[0] else 0
30 |     cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
31 |     if dollars and cents:
32 |         dollar_unit = 'dollar' if dollars == 1 else 'dollars'
33 |         cent_unit = 'cent' if cents == 1 else 'cents'
34 |         return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
35 |     elif dollars:
36 |         dollar_unit = 'dollar' if dollars == 1 else 'dollars'
37 |         return '%s %s' % (dollars, dollar_unit)
38 |     elif cents:
39 |         cent_unit = 'cent' if cents == 1 else 'cents'
40 |         return '%s %s' % (cents, cent_unit)
41 |     else:
42 |         return 'zero dollars'
43 |
44 |
45 | def _expand_ordinal(m):
46 |     return _inflect.number_to_words(m.group(0))
47 |
48 |
49 | def _expand_number(m):
50 |     num = int(m.group(0))
51 |     if num > 1000 and num < 3000:
52 |         if num == 2000:
53 |             return 'two thousand'
54 |         elif num > 2000 and num < 2010:
55 |             return 'two thousand ' + _inflect.number_to_words(num % 100)
56 |         elif num % 100 == 0:
57 |             return _inflect.number_to_words(num // 100) + ' hundred'
58 |         else:
59 |             return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
60 |     else:
61 |         return _inflect.number_to_words(num, andword='')
62 |
63 |
64 | def normalize_numbers(text):
65 |     text = re.sub(_comma_number_re, _remove_commas, text)
66 |     text = re.sub(_pounds_re, r'\1 pounds', text)
67 |     text = re.sub(_dollars_re, _expand_dollars, text)
68 |     text = re.sub(_decimal_number_re, _expand_decimal_point, text)
69 |     text = re.sub(_ordinal_re, _expand_ordinal, text)
70 |     text = re.sub(_number_re, _expand_number, text)
71 |     return text
72 |

--------------------------------------------------------------------------------
/dllogger/autologging.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import subprocess
17 | import xml.etree.ElementTree as ET
18 |
19 | from dllogger.logger import LOGGER
20 |
21 | # TODO: print CUDA version, container version etc
22 |
23 | def log_hardware():
24 |     # TODO: asserts - what if you cannot launch those commands?
25 |     # number of CPU threads
26 |     cpu_info_command = 'cat /proc/cpuinfo'
27 |     cpu_info = subprocess.run(cpu_info_command.split(), stdout=subprocess.PIPE).stdout.split()
28 |     cpu_num_index = len(cpu_info) - cpu_info[::-1].index(b'processor') + 1
29 |     cpu_num = int(cpu_info[cpu_num_index]) + 1
30 |
31 |     # CPU name
32 |     cpu_name_begin_index = cpu_info.index(b'name')
33 |     cpu_name_end_index = cpu_info.index(b'stepping')
34 |     cpu_name = b' '.join(cpu_info[cpu_name_begin_index + 2:cpu_name_end_index]).decode('utf-8')
35 |
36 |     LOGGER.log(key='cpu_info', value={"num": cpu_num, "name": cpu_name})
37 |
38 |     # RAM memory
39 |     ram_info_command = 'free -m -h'
40 |     ram_info = subprocess.run(ram_info_command.split(), stdout=subprocess.PIPE).stdout.split()
41 |     ram_index = ram_info.index(b'Mem:') + 1
42 |     ram = ram_info[ram_index].decode('utf-8')
43 |
44 |     LOGGER.log(key='mem_info', value={"ram": ram})
45 |
46 |     # GPU
47 |     nvidia_smi_command = 'nvidia-smi -q -x'
48 |     nvidia_smi_output = subprocess.run(nvidia_smi_command.split(), stdout=subprocess.PIPE).stdout
49 |     nvidia_smi = ET.fromstring(nvidia_smi_output)
50 |     gpus = nvidia_smi.findall('gpu')
51 |     ver = nvidia_smi.findall('driver_version')
52 |
53 |     LOGGER.log(key="gpu_info",
54 |                value={
55 |                    "driver_version": ver[0].text,
56 |                    "num": len(gpus),
57 |                    "name": [g.find('product_name').text for g in gpus],
58 |                    "mem": [g.find('fb_memory_usage').find('total').text for g in gpus]})
59 |
60 | def log_args(args):
61 |     LOGGER.log(key='args', value=vars(args))
62 |

--------------------------------------------------------------------------------
/tacotron2/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | import re
3 | from tacotron2.text import cleaners
4 | from tacotron2.text.symbols import symbols
5 |
6 |
7 | # Mappings from symbol to numeric ID and vice versa:
8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
10 |
11 | # Regular expression matching text enclosed in curly braces:
12 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
13 |
14 |
15 | def text_to_sequence(text, speaker_id, cleaner_names):
16 |     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
17 |
18 |     The text can optionally have ARPAbet sequences enclosed in curly braces embedded
19 |     in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
20 |
21 |     Args:
22 |         text: string to convert to a sequence
23 |         speaker_id: integer speaker index; every symbol ID is offset by speaker_id * len(symbols)
24 |         cleaner_names: names of the cleaner functions to run the text through
25 |
26 |     Returns:
27 |         List of integers corresponding to the symbols in the text
28 |     '''
29 |     sequence = []
30 |
31 |     # Check for curly braces and treat their contents as ARPAbet:
32 |     while len(text):
33 |         m = _curly_re.match(text)
34 |         if not m:
35 |             sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
36 |             break
37 |         sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
38 |         sequence += _arpabet_to_sequence(m.group(2))
39 |         text = m.group(3)
40 |
41 |     # Append EOS token
42 |     sequence.append(_symbol_to_id['~'])
43 |     return [s + speaker_id * len(symbols) for s in sequence]
44 |
45 |
46 | def sequence_to_text(sequence, speaker_id):
47 |     '''Converts a sequence of IDs back to a string'''
48 |     result = ''
49 |     sequence = [s - speaker_id * len(symbols) for s in sequence]
50 |     for symbol_id in sequence:
51 |         if symbol_id in _id_to_symbol:
52 |             s = _id_to_symbol[symbol_id]
53 |             # Enclose ARPAbet back in curly braces:
54 |             if len(s) > 1 and s[0] == '@':
55 |                 s = '{%s}' % s[1:]
56 |             result += s
57 |     return result.replace('}{', ' ')
58 |
59 |
60 | def _clean_text(text, cleaner_names):
61 |     for name in cleaner_names:
62 |         cleaner = getattr(cleaners, name)
63 |         if not cleaner:
64 |             raise Exception('Unknown cleaner: %s' % name)
65 |         text = cleaner(text)
66 |     return text
67 |
68 |
69 | def _symbols_to_sequence(symbols):
70 |     return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
71 |
72 |
73 | def _arpabet_to_sequence(text):
74 |     return _symbols_to_sequence(['@' + s for s in text.split()])
75 |
76 |
77 | def _should_keep_symbol(s):
78 |     return s in _symbol_to_id and s != '_' and s != '~'
79 |
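A quick sanity-check sketch of the per-speaker offset used in `text_to_sequence` above: each speaker gets its own copy of the symbol table, so the IDs for speaker k live in the range [k * len(symbols), (k + 1) * len(symbols)). This assumes the repository root is on `PYTHONPATH` and uses the `basic_cleaners` pipeline defined later in this module's package.

```python
from tacotron2.text import text_to_sequence, sequence_to_text
from tacotron2.text.symbols import symbols

seq0 = text_to_sequence('hello', 0, ['basic_cleaners'])
seq1 = text_to_sequence('hello', 1, ['basic_cleaners'])
# Same symbols, but speaker 1's IDs are shifted by one full symbol table:
assert [b - a for a, b in zip(seq0, seq1)] == [len(symbols)] * len(seq0)
assert sequence_to_text(seq1, 1) == 'hello~'   # the EOS '~' is appended automatically
```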
--------------------------------------------------------------------------------
/tacotron2/text/cleaners.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 |   1. "english_cleaners" for English text
9 |   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 |      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 |   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 |      the symbols in symbols.py to match your data).
13 | '''
14 |
15 | import re
16 | from unidecode import unidecode
17 | from .numbers import normalize_numbers
18 |
19 |
20 | # Regular expression matching whitespace:
21 | _whitespace_re = re.compile(r'\s+')
22 |
23 | # List of (regular expression, replacement) pairs for abbreviations:
24 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
25 |     ('mrs', 'misess'),
26 |     ('mr', 'mister'),
27 |     ('dr', 'doctor'),
28 |     ('st', 'saint'),
29 |     ('co', 'company'),
30 |     ('jr', 'junior'),
31 |     ('maj', 'major'),
32 |     ('gen', 'general'),
33 |     ('drs', 'doctors'),
34 |     ('rev', 'reverend'),
35 |     ('lt', 'lieutenant'),
36 |     ('hon', 'honorable'),
37 |     ('sgt', 'sergeant'),
38 |     ('capt', 'captain'),
39 |     ('esq', 'esquire'),
40 |     ('ltd', 'limited'),
41 |     ('col', 'colonel'),
42 |     ('ft', 'fort'),
43 | ]]
44 |
45 |
46 | def expand_abbreviations(text):
47 |     for regex, replacement in _abbreviations:
48 |         text = re.sub(regex, replacement, text)
49 |     return text
50 |
51 |
52 | def expand_numbers(text):
53 |     return normalize_numbers(text)
54 |
55 |
56 | def lowercase(text):
57 |     return text.lower()
58 |
59 |
60 | def collapse_whitespace(text):
61 |     return re.sub(_whitespace_re, ' ', text)
62 |
63 |
64 | def convert_to_ascii(text):
65 |     return unidecode(text)
66 |
67 |
68 | def basic_cleaners(text):
69 |     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
70 |     text = lowercase(text)
71 |     text = collapse_whitespace(text)
72 |     return text
73 |
74 |
75 | def transliteration_cleaners(text):
76 |     '''Pipeline for non-English text that transliterates to ASCII.'''
77 |     text = convert_to_ascii(text)
78 |     text = lowercase(text)
79 |     text = collapse_whitespace(text)
80 |     return text
81 |
82 |
83 | def english_cleaners(text):
84 |     '''Pipeline for English text, including number and abbreviation expansion.'''
85 |     text = convert_to_ascii(text)
86 |     text = lowercase(text)
87 |     text = expand_numbers(text)
88 |     text = expand_abbreviations(text)
89 |     text = collapse_whitespace(text)
90 |     return text
91 |

--------------------------------------------------------------------------------
/multiproc.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import sys
29 | import subprocess
30 |
31 | import torch
32 |
33 |
34 | def main():
35 |     argslist = list(sys.argv)[1:]
36 |     world_size = torch.cuda.device_count()
37 |
38 |     if '--world-size' in argslist:
39 |         argslist[argslist.index('--world-size') + 1] = str(world_size)
40 |     else:
41 |         argslist.append('--world-size')
42 |         argslist.append(str(world_size))
43 |
44 |     workers = []
45 |
46 |     for i in range(world_size):
47 |         if '--rank' in argslist:
48 |             argslist[argslist.index('--rank') + 1] = str(i)
49 |         else:
50 |             argslist.append('--rank')
51 |             argslist.append(str(i))
52 |         stdout = None if i == 0 else subprocess.DEVNULL
53 |         worker = subprocess.Popen(
54 |             [str(sys.executable)] + argslist, stdout=stdout)
55 |         workers.append(worker)
56 |
57 |     returncode = 0
58 |     try:
59 |         pending = list(workers)
60 |         while pending:
61 |             for worker in list(pending):
62 |                 try:
63 |                     worker_returncode = worker.wait(1)
64 |                 except subprocess.TimeoutExpired:
65 |                     continue
66 |                 pending.remove(worker)  # a finished worker must not be waited on again
67 |                 if worker_returncode != 0:
68 |                     if returncode != 1:
69 |                         for w in workers:
70 |                             w.terminate()
71 |                     returncode = 1
72 |
73 |     except KeyboardInterrupt:
74 |         print('Pressed CTRL-C, TERMINATING')
75 |         for worker in workers:
76 |             worker.terminate()
77 |         for worker in workers:
78 |             worker.wait()
79 |         raise
80 |
81 |     sys.exit(returncode)
82 |
83 |
84 | if __name__ == "__main__":
85 |     main()
86 |

--------------------------------------------------------------------------------
/common/preprocessor.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | from functools import partial
3 | from common import audio
4 | import glob
5 | import librosa
6 | import numpy as np
7 | import os
8 |
9 |
10 | def build_from_path(hparams, input_dir, wav_dir, mel_dir, n_jobs=12, tqdm=lambda x: x):
11 |     """
12 |     Preprocesses the speech dataset from a given input path into the given output directories
13 |
14 |     Args:
15 |         - hparams: hyper parameters
16 |         - input_dir: input directory that contains the files to preprocess
17 |         - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
18 |         - wav_dir: output directory of the preprocessed speech audio dataset
19 |         - n_jobs: Optional, number of worker processes to parallelize across
20 |         - tqdm: Optional, provides a nice progress bar
21 |
22 |     Returns:
23 |         - A list of tuples describing the train examples; this should be written to train.txt
24 |     """
25 |
26 |     # We use ProcessPoolExecutor to parallelize across processes, this is just for
27 |     # optimization purposes and it can be omitted
28 |     futures = []
29 |     executor = ProcessPoolExecutor(max_workers=n_jobs)
30 |     for root, _, files in os.walk(input_dir):
31 |         for f in files:
32 |             if f.endswith('.trn'):
33 |                 trn_file = os.path.join(root, f)
34 |                 with open(trn_file) as trn:
35 |                     basename = trn_file[:-4]
36 |                     wav_file = basename + '.wav'
37 |                     basename = basename.split('/')[-1]
38 |                     text = trn.readline().strip()
39 |                     futures.append(executor.submit(partial(_process_utterance, wav_dir, mel_dir, basename, wav_file, text, hparams)))
40 |
41 |     return [future.result() for future in tqdm(futures) if future.result() is not None]
42 |
43 |
44 | def _process_utterance(wav_dir, mel_dir, basename, wav_file, text, hparams):
45 |     """
46 |     Preprocesses a single utterance wav/text pair
47 |
48 |     This writes the mel scale spectrogram to disk and returns a tuple to write
49 |     to the train.txt file
50 |
51 |     Args:
52 |         - wav_dir: the directory to write the preprocessed wav into
53 |         - mel_dir: the directory to write the mel spectrograms into
54 |         - basename: the basename of each file
55 |         - wav_file: path to the audio file containing the speech input
56 |         - text: text spoken in the input audio file
57 |         - hparams: hyper parameters
58 |
59 |     Returns:
60 |         - A tuple: (filename, time_steps, mel_frames, text)
61 |     """
62 |     try:
63 |         # Load the audio as numpy array
64 |         wav, sr = librosa.core.load(wav_file, sr=hparams.sample_rate)
65 |     except FileNotFoundError:  # catch missing wav exception
66 |         print(f'file {wav_file} present in csv metadata is not present in wav folder. skipping!')
67 |         return None
68 |
69 |     # rescale wav
70 |     if hparams.rescale:
71 |         wav = wav / np.abs(wav).max() * hparams.rescaling_max
72 |
73 |     # M-AILABS extra silence specific
74 |     if hparams.trim_silence:
75 |         wav = audio.trim_silence(wav)
76 |
77 |     # Compute the mel scale spectrogram from the wav
78 |     mel = audio.melspectrogram(wav).astype(np.float32)
79 |     mel_frames = mel.shape[1]
80 |
81 |     if mel_frames > hparams.max_mel_frames or len(text) > hparams.max_text_length:
82 |         return None
83 |
84 |     # Zero pad for quantized signal
85 |     # time resolution adjustment
86 |     # ensure length of raw audio is multiple of hop size so that we can use
87 |     # transposed convolution to upsample
88 |     r = mel_frames * audio.get_hop_size() - len(wav)
89 |     wav = np.pad(wav, (0, r), mode='constant', constant_values=0.)
90 |     assert len(wav) == mel_frames * audio.get_hop_size()
91 |     time_steps = len(wav)
92 |
93 |     # Write the spectrogram and audio to disk
94 |     filename = f'{basename}.npy'
95 |     np.save(os.path.join(wav_dir, filename), wav, allow_pickle=False)
96 |     np.save(os.path.join(mel_dir, filename), mel, allow_pickle=False)
97 |
98 |     # Return a tuple describing this training example
99 |     return (filename, time_steps, mel_frames, text)
100 |

--------------------------------------------------------------------------------
/common/utils.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import numpy as np
29 | from scipy.io.wavfile import read, write
30 | from scipy import signal
31 | import math
32 | import torch
33 | import os
34 |
35 |
36 | def cosine_decay(init_val, final_val, step, decay_steps):
37 |     alpha = final_val / init_val
38 |     cosine_decay = 0.5 * (1 + math.cos(math.pi * step / decay_steps))
39 |     decayed = (1 - alpha) * cosine_decay + alpha
40 |     return init_val * decayed
41 |
42 |
43 | def get_mask_from_lengths(lengths):
44 |     max_len = torch.max(lengths).item()
45 |     ids = torch.arange(0, max_len, out=torch.cuda.IntTensor(max_len))
46 |     mask = ids < lengths.unsqueeze(1)
47 |     return mask
48 |
49 |
50 | def preemphasize(wav, k=0.97):
51 |     return signal.lfilter([1, -k], [1], wav)
52 |
53 |
54 | def de_emphasize(wav, k=0.97):
55 |     return signal.lfilter([1], [1, -k], wav)
56 |
57 |
58 | def load_wav_to_torch(path, max_value=32768):
59 |     wav = np.load(path)
60 |     wav = preemphasize(wav)
61 |     return torch.FloatTensor(wav.astype(np.float32))
62 |
63 |
64 | def dc_notch_filter(wav):
65 |     # code from speex
66 |     notch_radius = 0.982
67 |     den = notch_radius ** 2 + 0.7 * (1 - notch_radius) ** 2
68 |     b = np.array([1, -2, 1]) * notch_radius
69 |     a = np.array([1, -2 * notch_radius, den])
70 |     return signal.lfilter(b, a, wav)
71 |
72 |
73 | def save_wav(wav, path, sr=22050):
74 |     wav = dc_notch_filter(wav)
75 |     f1 = 0.8 * 32768 / max(0.01, np.max(np.abs(wav)))
76 |     f2 = np.sign(wav) * np.power(np.abs(wav), 0.95)
77 |     wav = f1 * f2
78 |     write(path, sr, wav.astype(np.int16))
79 |
80 |
81 | def load_metadata(dirname, filename='train.txt', split="|"):
82 |     with open(os.path.join(dirname, filename)) as f:
83 |         def split_line(line):
84 |             parts = line.strip().split(split)
85 |             wav_path = os.path.join(dirname, 'audio', parts[0])
86 |             text = parts[-1]
87 |             return wav_path, text
88 |         return [split_line(line) for line in f.readlines()]
89 |
90 |
91 | def to_gpu(x):
92 |     x = x.contiguous()
93 |
94 |     if torch.cuda.is_available():
95 |         x = x.cuda(non_blocking=True)
96 |     return torch.autograd.Variable(x)
97 |

--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from multiprocessing import cpu_count
4 |
5 | from common import preprocessor
6 | from hparams import hparams
7 | from tqdm import tqdm
8 |
9 |
10 | def preprocess(args, input_folders, output_dir, hparams):
11 |     num_utterances, mel_frames, timesteps = 0, 0, 0
12 |     max_text_lens, max_mel_lens, max_timestep_lens = [], [], []
13 |
14 |     for input_dir in input_folders:
15 |         wav_dir = os.path.join(output_dir, input_dir.split('/')[-1], 'audio')
16 |         mel_dir = os.path.join(output_dir, input_dir.split('/')[-1], 'mels')
17 |         os.makedirs(wav_dir, exist_ok=True)
18 |         os.makedirs(mel_dir, exist_ok=True)
19 |
20 |         metadata = preprocessor.build_from_path(hparams, input_dir, wav_dir, mel_dir, args.n_jobs, tqdm=tqdm)
21 |         with open(os.path.join(output_dir, input_dir.split('/')[-1], 'train.txt'), 'w') as f:
22 |             for m in metadata:
23 |                 f.write('|'.join([str(x) for x in m]) + '\n')
24 |         max_text_lens.append(max(len(m[3]) for m in metadata))
25 |         max_mel_lens.append(max(int(m[2]) for m in metadata))
26 |         max_timestep_lens.append(max(m[1] for m in metadata))
27 |         num_utterances += len(metadata)
28 |         mel_frames += sum([int(m[2]) for m in metadata])
29 |         timesteps += sum([int(m[1]) for m in metadata])
30 |
31 |     hours = timesteps / hparams.sample_rate / 3600
32 |     print(f'Wrote {num_utterances} utterances, {mel_frames} mel frames, {timesteps} audio timesteps ({hours:.2f} hours)')
33 |     print(f'Max input length (text chars): {max(max_text_lens)}')
34 |     print(f'Max mel frames length: {max(max_mel_lens)}')
35 |     print(f'Max audio timesteps length: {max(max_timestep_lens)}')
36 |
37 |
38 | def norm_data(args):
39 |
40 |     merge_books = (args.merge_books == 'True')
41 |
42 |     print('Selecting data folders...')
43 |     supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS', 'MANDARIN']
44 |     if args.dataset not in supported_datasets:
45 |         raise ValueError(f'dataset value entered {args.dataset} does not belong to supported datasets: {supported_datasets}')
46 |
47 |     if args.dataset.startswith('LJSpeech'):
48 |         return [os.path.join(args.base_dir, args.dataset)]
49 |
50 |     if args.dataset.startswith('MANDARIN'):
51 |         return [os.path.join(args.base_dir, 'data_mandarin', anchor) for anchor in hparams.anchor_dirs]
52 |
53 |     if args.dataset == 'M-AILABS':
54 |         supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU',
55 |                                'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA']
56 |         if args.language not in supported_languages:
57 |             raise ValueError(f'Please enter a supported language to use from M-AILABS dataset! \n{supported_languages}')
58 |
59 |         supported_voices = ['female', 'male', 'mix']
60 |         if args.voice not in supported_voices:
61 |             raise ValueError(f'Please enter a supported voice option to use from M-AILABS dataset! \n{supported_voices}')
62 |
63 |         path = os.path.join(args.base_dir, args.language, 'by_book', args.voice)
64 |         supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path, e))]
65 |         if args.reader not in supported_readers:
66 |             raise ValueError(f'Please enter a valid reader for your language and voice settings! \n{supported_readers}')
67 |
68 |         path = os.path.join(path, args.reader)
69 |         supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path, e))]
70 |         if merge_books:
71 |             return [os.path.join(path, book) for book in supported_books]
72 |
73 |         else:
74 |             if args.book not in supported_books:
75 |                 raise ValueError(f'Please enter a valid book for your reader settings! \n{supported_books}')
76 |             return [os.path.join(path, args.book)]
77 |
78 |
79 | def run_preprocess(args, hparams):
80 |     input_folders = norm_data(args)
81 |     output_folder = os.path.join(args.base_dir, args.output)
82 |     preprocess(args, input_folders, output_folder, hparams)
83 |
84 |
85 | def main():
86 |     print('initializing preprocessing...')
87 |     parser = argparse.ArgumentParser()
88 |     parser.add_argument('--base_dir', default='')
89 |     parser.add_argument('--hparams', default='', help='Hyperparameter overrides as a comma-separated list of name=value pairs')
90 |     parser.add_argument('--dataset', default='MANDARIN')
91 |     parser.add_argument('--language', default='en_US')
92 |     parser.add_argument('--voice', default='female')
93 |     parser.add_argument('--reader', default='mary_ann')
94 |     parser.add_argument('--merge_books', default='False')
95 |     parser.add_argument('--book', default='northandsouth')
96 |     parser.add_argument('--output', default='training_data')
97 |     parser.add_argument('--n_jobs', type=int, default=cpu_count())
98 |     args = parser.parse_args()
99 |
100 |     modified_hp = hparams.parse(args.hparams)
101 |
102 |     assert args.merge_books in ('False', 'True')
103 |
104 |     run_preprocess(args, modified_hp)
105 |
106 |
107 | if __name__ == '__main__':
108 |     main()
109 |

--------------------------------------------------------------------------------
/common/audio_processing.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import torch
29 | import numpy as np
30 | import librosa.util as librosa_util
31 | from scipy.signal import get_window
32 | from .utils import de_emphasize
33 |
34 |
35 | def window_sumsquare(window, n_frames, hop_length=256, win_length=1024,
36 |                      n_fft=1024, dtype=np.float32, norm=None):
37 |     """
38 |     # from librosa 0.6
39 |     Compute the sum-square envelope of a window function at a given hop length.
40 |
41 |     This is used to estimate modulation effects induced by windowing
42 |     observations in short-time Fourier transforms.
43 |
44 |     Parameters
45 |     ----------
46 |     window : string, tuple, number, callable, or list-like
47 |         Window specification, as in `get_window`
48 |
49 |     n_frames : int > 0
50 |         The number of analysis frames
51 |
52 |     hop_length : int > 0
53 |         The number of samples to advance between frames
54 |
55 |     win_length : [optional]
56 |         The length of the window function. By default, this matches `n_fft`.
57 |
58 |     n_fft : int > 0
59 |         The length of each analysis frame.
60 |
61 |     dtype : np.dtype
62 |         The data type of the output
63 |
64 |     Returns
65 |     -------
66 |     wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
67 |         The sum-squared envelope of the window function
68 |     """
69 |     if win_length is None:
70 |         win_length = n_fft
71 |
72 |     n = n_fft + hop_length * (n_frames - 1)
73 |     x = np.zeros(n, dtype=dtype)
74 |
75 |     # Compute the squared window at the desired length
76 |     win_sq = get_window(window, win_length, fftbins=True)
77 |     win_sq = librosa_util.normalize(win_sq, norm=norm)**2
78 |     win_sq = librosa_util.pad_center(win_sq, n_fft)
79 |
80 |     # Fill the envelope
81 |     for i in range(n_frames):
82 |         sample = i * hop_length
83 |         x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
84 |     return x
85 |
86 |
87 | def griffin_lim(magnitudes, stft_fn, n_iters=50, power=1.5):
88 |     """
89 |     PARAMS
90 |     ------
91 |     magnitudes: spectrogram magnitudes
92 |     stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
93 |     """
94 |     magnitudes = magnitudes.unsqueeze(0) ** power
95 |     angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
96 |     angles = angles.astype(np.float32)
97 |     angles = torch.autograd.Variable(torch.from_numpy(angles))
98 |     signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
99 |
100 |     for i in range(n_iters):
101 |         _, angles = stft_fn.transform(signal)
102 |         signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
103 |     return de_emphasize(signal.squeeze())
104 |
105 |
106 | def dynamic_range_compression(x, C=1, clip_val=1e-5):
107 |     """
108 |     PARAMS
109 |     ------
110 |     C: compression factor
111 |     """
112 |     return torch.log(torch.clamp(x, min=clip_val) * C)
113 |
114 |
115 | def dynamic_range_decompression(x, C=1):
116 |     """
117 |     PARAMS
118 |     ------
119 |     C: compression factor used to compress
120 |     """
121 |     return torch.exp(x) / C
122 |
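As a sanity check, `dynamic_range_compression` and `dynamic_range_decompression` above are exact inverses whenever the input magnitudes stay above `clip_val`. A small standalone sketch, assuming the repository root is on `PYTHONPATH` so `common` is importable:

```python
import torch
from common.audio_processing import dynamic_range_compression, dynamic_range_decompression

mags = torch.rand(80, 100) + 1e-3   # magnitudes safely above the 1e-5 clip_val
roundtrip = dynamic_range_decompression(dynamic_range_compression(mags))
assert torch.allclose(roundtrip, mags, atol=1e-6)   # log then exp recovers the input
```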
--------------------------------------------------------------------------------
/common/layers.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import librosa
29 | import torch
30 | from common.audio_processing import dynamic_range_compression, dynamic_range_decompression
31 | from common.stft import STFT
32 |
33 |
34 | class LinearNorm(torch.nn.Module):
35 |     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
36 |         super(LinearNorm, self).__init__()
37 |         self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
38 |
39 |         # torch.nn.init.xavier_uniform_(self.linear_layer.weight,
40 |         #                               gain=torch.nn.init.calculate_gain(w_init_gain))
41 |
42 |     def forward(self, x):
43 |         return self.linear_layer(x)
44 |
45 |
46 | class ConvNorm(torch.nn.Module):
47 |     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
48 |                  padding=None, dilation=1, bias=True, w_init_gain='linear'):
49 |         super(ConvNorm, self).__init__()
50 |         if padding is None:
51 |             assert(kernel_size % 2 == 1)
52 |             padding = int(dilation * (kernel_size - 1) / 2)
53 |
54 |         self.conv = torch.nn.Conv1d(in_channels, out_channels,
55 |                                     kernel_size=kernel_size, stride=stride,
56 |                                     padding=padding, dilation=dilation,
57 |                                     bias=bias)
58 |
59 |         # torch.nn.init.xavier_uniform_(self.conv.weight,
60 |         #                               gain=torch.nn.init.calculate_gain(w_init_gain))
61 |
62 |     def forward(self, signal):
63 |         return self.conv(signal)
64 |
65 |
66 | class TacotronSTFT(torch.nn.Module):
67 |     def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
68 |                  n_mel_channels=80, sampling_rate=22050, mel_fmin=50.0, mel_fmax=7600.0):
69 |         super(TacotronSTFT, self).__init__()
70 |         self.n_mel_channels = n_mel_channels
71 |         self.sampling_rate = sampling_rate
72 |         self.stft_fn = STFT(filter_length, hop_length, win_length)
73 |         mel_basis = librosa.filters.mel(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
74 |         import numpy as np
75 |         inv_mel_basis = np.linalg.pinv(mel_basis)
76 |         mel_basis = torch.from_numpy(mel_basis).float()
77 |         inv_mel_basis = torch.from_numpy(inv_mel_basis).float()
78 |         self.register_buffer('mel_basis', mel_basis)
79 |         self.register_buffer('inv_mel_basis', inv_mel_basis)
80 |
81 |
82 |     def spectral_normalize(self, magnitudes):
83 |         return dynamic_range_compression(magnitudes)
84 |
85 |     def spectral_de_normalize(self, magnitudes):
86 |         return dynamic_range_decompression(magnitudes)
87 |
88 |     def mel_spectrogram(self, y):
89 |         """Computes mel-spectrograms from a batch of waves
90 |         PARAMS
91 |         ------
92 |         y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
93 |
94 |         RETURNS
95 |         -------
96 |         mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
97 |         """
98 |         #assert(torch.min(y.data) >= -1)
99 |         #assert(torch.max(y.data) <= 1)
100 |
101 |         magnitudes, phases = self.stft_fn.transform(y)
102 |         magnitudes = magnitudes.data
103 |         mel_output = torch.matmul(self.mel_basis, magnitudes)
104 |         mel_output = self.spectral_normalize(mel_output)
105 |         return mel_output
106 |
107 |     def inv_mel_spectrogram(self, mel):
108 |         """Converts a mel-spectrogram back to approximate linear-scale magnitudes
109 |         PARAMS
110 |         ------
111 |         mel: torch.FloatTensor of shape (n_mel_channels, T)
112 |
113 |         RETURNS
114 |         -------
115 |         magnitudes: torch.FloatTensor of linear-frequency magnitudes
116 |         """
117 |         mel = self.spectral_de_normalize(mel.float())
118 |         magnitudes = torch.matmul(self.inv_mel_basis, mel.data)
119 |         magnitudes = torch.max(magnitudes.clone().detach().fill_(1e-10), magnitudes)
120 |         return magnitudes.data
121 |
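A short usage sketch for `TacotronSTFT`, assuming the repository root is on `PYTHONPATH` and that the `STFT` helper in `common/stft.py` (not shown in this section) accepts a `(B, T)` batch; the random waveform is just a stand-in for real audio. Note that the positional `librosa.filters.mel(...)` call in the constructor assumes an older librosa (roughly < 0.10), matching the pinned 19.08 container.

```python
import torch
from common.layers import TacotronSTFT

taco_stft = TacotronSTFT()                      # defaults: 1024-point FFT, 256 hop, 80 mels, 22050 Hz
wav = torch.rand(1, 22050) * 2 - 1              # (B, T) waveform in [-1, 1]
mel = taco_stft.mel_spectrogram(wav)            # (B, 80, T_frames), log-compressed
linear = taco_stft.inv_mel_spectrogram(mel[0])  # approximate linear magnitudes, e.g. for Griffin-Lim
```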
-------------------------------------------------------------------------------- /common/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from hparams import hparams 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def dc_notch_filter(wav): 10 | # code from speex 11 | notch_radius = 0.982 12 | den = notch_radius ** 2 + 0.7 * (1 - notch_radius) ** 2 13 | b = np.array([1, -2, 1]) * notch_radius 14 | a = np.array([1, -2 * notch_radius, den]) 15 | return signal.lfilter(b, a, wav) 16 | 17 | def load_wav(path, sr): 18 | return librosa.core.load(path, sr=sr)[0] 19 | 20 | def save_wav(wav, path): 21 | wav = dc_notch_filter(wav) 22 | wav = wav / np.abs(wav).max() * 0.999 23 | f1 = 0.5 * 32767 / max(0.01, np.max(np.abs(wav))) 24 | f2 = np.sign(wav) * np.power(np.abs(wav), 0.95) 25 | wav = f1 * f2 26 | #proposed by @dsmiller 27 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 28 | 29 | def preemphasis(wav, k): 30 | return signal.lfilter([1, -k], [1], wav) 31 | 32 | def inv_preemphasis(wav, k): 33 | return signal.lfilter([1], [1, -k], wav) 34 | 35 | def trim_silence(wav): 36 | '''Trim leading and trailing silence 37 | 38 | Useful for the M-AILABS dataset if we choose to trim the extra 0.5 s of silence at the beginning and end. 39 | ''' 40 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset. 41 | return librosa.effects.trim(wav, top_db=hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] 42 | 43 | def get_hop_size(): 44 | hop_size = hparams.hop_size 45 | if hop_size is None: 46 | assert hparams.frame_shift_ms is not None 47 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 48 | return hop_size 49 | 50 | def linearspectrogram(wav): 51 | D = _stft(preemphasis(wav, hparams.preemphasis)) 52 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 53 | 54 | if hparams.signal_normalization: 55 | return _normalize(S) 56 | return S 57 | 58 | def melspectrogram(wav): 59 | D = _stft(preemphasis(wav, hparams.preemphasis)) 60 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 61 | 62 | if hparams.signal_normalization: 63 | return _normalize(S) 64 | return S 65 | 66 | def inv_linear_spectrogram(linear_spectrogram): 67 | '''Converts a linear spectrogram to a waveform using librosa''' 68 | if hparams.signal_normalization: 69 | D = _denormalize(linear_spectrogram) 70 | else: 71 | D = linear_spectrogram 72 | 73 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 74 | return inv_preemphasis(_griffin_lim(S ** hparams.power), hparams.preemphasis) 75 | 76 | def inv_mel_spectrogram(mel_spectrogram): 77 | '''Converts a mel spectrogram to a waveform using librosa''' 78 | if hparams.signal_normalization: 79 | D = _denormalize(mel_spectrogram) 80 | else: 81 | D = mel_spectrogram 82 | 83 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert back to linear 84 | return inv_preemphasis(_griffin_lim(S ** hparams.power), hparams.preemphasis) 85 | 86 | def _griffin_lim(S): 87 | '''librosa implementation of Griffin-Lim 88 | Based on https://github.com/librosa/librosa/issues/434 89 | ''' 90 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 91 | S_complex = np.abs(S).astype(np.complex128) # np.complex is a deprecated alias; use the explicit dtype 92 | y = _istft(S_complex * angles) 93 | for i in range(hparams.griffin_lim_iters): 94 | angles = np.exp(1j * np.angle(_stft(y))) 95 | y
= _istft(S_complex * angles) 96 | return y 97 | 98 | def _stft(y): 99 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(), win_length=hparams.win_size) 100 | 101 | def _istft(y): 102 | return librosa.istft(y, hop_length=get_hop_size(), win_length=hparams.win_size) 103 | 104 | # Conversions 105 | _mel_basis = None 106 | _inv_mel_basis = None 107 | 108 | def _linear_to_mel(spectrogram): 109 | global _mel_basis 110 | if _mel_basis is None: 111 | _mel_basis = _build_mel_basis() 112 | return np.dot(_mel_basis, spectrogram) 113 | 114 | def _mel_to_linear(mel_spectrogram): 115 | global _inv_mel_basis 116 | if _inv_mel_basis is None: 117 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 118 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 119 | 120 | def _build_mel_basis(): 121 | assert hparams.fmax <= hparams.sample_rate // 2 122 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 123 | fmin=hparams.fmin, fmax=hparams.fmax) 124 | 125 | def _amp_to_db(x): 126 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 127 | return 20 * np.log10(np.maximum(min_level, x)) 128 | 129 | def _db_to_amp(x): 130 | return np.power(10.0, x * 0.05) 131 | 132 | def _normalize(S): 133 | if hparams.allow_clipping_in_normalization: 134 | if hparams.symmetric_mels: 135 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 136 | -hparams.max_abs_value, hparams.max_abs_value) 137 | else: 138 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 139 | 140 | if hparams.symmetric_mels: 141 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 142 | else: 143 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 144 | 145 | def _denormalize(D): 146 | if hparams.allow_clipping_in_normalization: 147 | if hparams.symmetric_mels: 148 | return (((np.clip(D, -hparams.max_abs_value, 149 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 150 | + hparams.min_level_db) 151 | else: 152 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 153 | 154 | if hparams.symmetric_mels: 155 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 156 | else: 157 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 158 | -------------------------------------------------------------------------------- /common/stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution.
16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window, lfilter 38 | from librosa.util import pad_center, tiny 39 | from common.audio_processing import window_sumsquare 40 | 41 | 42 | class STFT(torch.nn.Module): 43 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 44 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, window='hann'): 45 | super(STFT, self).__init__() 46 | self.filter_length = filter_length 47 | self.hop_length = hop_length 48 | self.win_length = win_length 49 | self.window = window 50 | self.forward_transform = None 51 | scale = self.filter_length / self.hop_length 52 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 53 | 54 | cutoff = int((self.filter_length / 2 + 1)) 55 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 56 | np.imag(fourier_basis[:cutoff, :])]) 57 | 58 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 59 | inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 60 | 61 | if window is not None: 62 | assert(filter_length >= win_length) 63 | # get window and zero center pad it to filter_length 64 | fft_window = get_window(window, win_length, fftbins=True) 65 | fft_window = pad_center(fft_window, filter_length) 66 | fft_window = torch.from_numpy(fft_window).float() 67 | 68 | # window the bases 69 | forward_basis *= fft_window 70 | inverse_basis *= fft_window 71 | 72 | self.register_buffer('forward_basis', forward_basis.float()) 73 | self.register_buffer('inverse_basis', inverse_basis.float()) 74 | 75 | def transform(self, input_data): 76 | num_batches = input_data.size(0) 77 | num_samples = input_data.size(1) 78 | 79 | self.num_samples = num_samples 80 | 81 | # similar to librosa, reflect-pad the input 82 | input_data = input_data.view(num_batches, 1, num_samples) 83 | input_data = F.pad( 84 | input_data.unsqueeze(1), 85 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 86 | mode='reflect') 87 | input_data = input_data.squeeze(1) 88 | 89 | forward_transform = F.conv1d( 90 | input_data, 91 | Variable(self.forward_basis, requires_grad=False), 92 | stride=self.hop_length, 93 | padding=0) 94 | 95 | cutoff = int((self.filter_length / 2) + 1) 96 | real_part = forward_transform[:, :cutoff, :] 97 | imag_part = forward_transform[:, cutoff:, :] 98 | 99 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 100 | phase 
= torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data)) 101 | 102 | return magnitude, phase 103 | 104 | def inverse(self, magnitude, phase): 105 | recombine_magnitude_phase = torch.cat( 106 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 107 | 108 | inverse_transform = F.conv_transpose1d( 109 | recombine_magnitude_phase, 110 | Variable(self.inverse_basis, requires_grad=False), 111 | stride=self.hop_length, 112 | padding=0) 113 | 114 | if self.window is not None: 115 | window_sum = window_sumsquare( 116 | self.window, magnitude.size(-1), hop_length=self.hop_length, 117 | win_length=self.win_length, n_fft=self.filter_length, 118 | dtype=np.float32) 119 | # remove modulation effects 120 | approx_nonzero_indices = torch.from_numpy( 121 | np.where(window_sum > tiny(window_sum))[0]) 122 | window_sum = torch.autograd.Variable(torch.from_numpy(window_sum), requires_grad=False) 123 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 124 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 125 | 126 | # scale by hop ratio 127 | inverse_transform *= float(self.filter_length) / self.hop_length 128 | 129 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 130 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 131 | 132 | return inverse_transform 133 | 134 | def forward(self, input_data): 135 | self.magnitude, self.phase = self.transform(input_data) 136 | reconstruction = self.inverse(self.magnitude, self.phase) 137 | return reconstruction 138 | -------------------------------------------------------------------------------- /tacotron2/loader.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | 28 | import argparse 29 | import torch 30 | from tacotron2.text import symbols 31 | from tacotron2.model import Tacotron2 32 | 33 | 34 | def parse_tacotron2_args(parent, add_help=False): 35 | """ 36 | Parse commandline arguments. 37 | """ 38 | parser = argparse.ArgumentParser(parents=[parent], add_help=add_help) 39 | 40 | # misc parameters 41 | parser.add_argument('--mask-padding', action='store_true', help='Use mask padding') # note: type=bool would treat any non-empty string, including "False", as True 42 | parser.add_argument('--n-mel-channels', default=80, type=int, help='Number of bins in mel-spectrograms') 43 | parser.add_argument('--mel-pad-val', default=-4, type=float, help='Mel-spectrogram padding value, corresponding to silence') 44 | 45 | # symbols parameters 46 | len_symbols = len(symbols) 47 | symbols_group = parser.add_argument_group('symbols parameters') 48 | symbols_group.add_argument('--n-symbols', default=len_symbols, type=int, help='Number of symbols in dictionary') 49 | symbols_group.add_argument('--symbols-embedding-dim', default=512, type=int, help='Input embedding dimension') 50 | 51 | 52 | # encoder parameters 53 | encoder = parser.add_argument_group('encoder parameters') 54 | encoder.add_argument('--encoder-kernel-size', default=5, type=int, help='Encoder kernel size') 55 | encoder.add_argument('--encoder-n-convolutions', default=3, type=int, help='Number of encoder convolutions') 56 | encoder.add_argument('--encoder-embedding-dim', default=512, type=int, help='Encoder embedding dimension') 57 | 58 | # decoder parameters 59 | decoder = parser.add_argument_group('decoder parameters') 60 | decoder.add_argument('--n-frames-per-step', default=3, type=int, help='Number of frames processed per step') 61 | decoder.add_argument('--decoder-rnn-dim', default=1024, type=int, help='Number of units in decoder LSTM') 62 | decoder.add_argument('--decoder-n-lstms', default=2, type=int, help='Number of decoder LSTM layers') 63 | decoder.add_argument('--prenet-dim', default=256, type=int, help='Number of ReLU units in prenet layers') 64 | decoder.add_argument('--max-decoder-steps', default=1000, type=int, help='Maximum number of output mel-spectrogram frames') 65 | decoder.add_argument('--gate-threshold', default=0.5, type=float, help='Probability threshold for stop token') 66 | decoder.add_argument('--p-decoder-dropout', default=0.1, type=float, help='Dropout probability for decoder LSTM') 67 | 68 | # attention parameters 69 | attention = parser.add_argument_group('attention parameters') 70 | attention.add_argument('--attention-dim', default=128, type=int, help='Dimension of attention hidden representation') 71 | 72 | # location layer parameters 73 | location = parser.add_argument_group('location parameters') 74 | location.add_argument('--attention-location-n-filters', default=32, type=int, help='Number of filters for location-sensitive attention') 75 | location.add_argument('--attention-location-kernel-size', default=31, type=int, help='Kernel size for location-sensitive attention') 76 | 77 | # Mel-post processing network parameters 78 | postnet = parser.add_argument_group('postnet parameters') 79 | postnet.add_argument('--postnet-embedding-dim', default=512, type=int, help='Postnet embedding dimension') 80 | postnet.add_argument('--postnet-kernel-size', default=5, type=int, help='Postnet kernel size') 81 | postnet.add_argument('--postnet-n-convolutions', default=5, type=int, help='Number of postnet convolutions') 82 | 83 | return parser 84 | 85 | 86 | def _batchnorm_to_float(module): 87 | 
"""Converts batch norm to FP32""" 88 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): 89 | module.float() 90 | for child in module.children(): 91 | _batchnorm_to_float(child) 92 | return module 93 | 94 | 95 | def _init_bn(module): 96 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): 97 | if module.affine: 98 | module.weight.data.uniform_() 99 | for child in module.children(): 100 | _init_bn(child) 101 | 102 | 103 | def get_tacotron2_model(args, speaker_num, is_training=True): 104 | config = dict( 105 | # optimization 106 | mask_padding=args.mask_padding, 107 | # audio 108 | n_mel_channels=args.n_mel_channels, 109 | # symbols 110 | n_symbols=args.n_symbols * speaker_num, 111 | symbols_embedding_dim=args.symbols_embedding_dim, 112 | # encoder 113 | encoder_kernel_size=args.encoder_kernel_size, 114 | encoder_n_convolutions=args.encoder_n_convolutions, 115 | encoder_embedding_dim=args.encoder_embedding_dim, 116 | # attention 117 | attention_dim=args.attention_dim, 118 | # attention location 119 | attention_location_n_filters=args.attention_location_n_filters, 120 | attention_location_kernel_size=args.attention_location_kernel_size, 121 | # decoder 122 | n_frames_per_step=args.n_frames_per_step, 123 | decoder_rnn_dim=args.decoder_rnn_dim, 124 | prenet_dim=args.prenet_dim, 125 | max_decoder_steps=args.max_decoder_steps, 126 | gate_threshold=args.gate_threshold, 127 | decoder_n_lstms=args.decoder_n_lstms, 128 | p_decoder_dropout=args.p_decoder_dropout, 129 | # postnet 130 | postnet_embedding_dim=args.postnet_embedding_dim, 131 | postnet_kernel_size=args.postnet_kernel_size, 132 | postnet_n_convolutions=args.postnet_n_convolutions, 133 | ) 134 | 135 | model = Tacotron2(**config) 136 | 137 | if is_training: 138 | _init_bn(model) 139 | 140 | return model.cuda() 141 | -------------------------------------------------------------------------------- /tacotron2/data_function.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | 28 | import random 29 | import numpy as np 30 | import os 31 | import torch 32 | import torch.utils.data 33 | from common.layers import TacotronSTFT 34 | from common.utils import load_wav_to_torch, load_metadata 35 | from tacotron2.text import text_to_sequence 36 | 37 | 38 | class TextMelDataset(torch.utils.data.Dataset): 39 | """ 40 | 1) loads audio, text pairs 41 | 2) normalizes text and converts it to sequences of symbol IDs 42 | 3) computes mel-spectrograms from audio files. 43 | """ 44 | def __init__(self, args, anchor_dirs): 45 | self.speaker_num = len(anchor_dirs) 46 | self.meta_dirs = [os.path.join(args.dataset_path, anchor_dirs[i]) for i in range(self.speaker_num)] 47 | self.metadatas = [load_metadata(meta_dir) for meta_dir in self.meta_dirs] 48 | self.offsets = [0] * self.speaker_num 49 | self.text_cleaners = args.text_cleaners 50 | self.sampling_rate = args.sampling_rate 51 | self.load_mel_from_disk = args.load_mel_from_disk 52 | self.stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length, 53 | args.n_mel_channels, args.sampling_rate, args.mel_fmin, 54 | args.mel_fmax) 55 | random.seed(1234) 56 | for i in range(self.speaker_num): 57 | random.shuffle(self.metadatas[i]) 58 | 59 | def get_mel_text_pair(self, speaker_id, metadata): 60 | mel_path, text = metadata 61 | seq_len = len(text) 62 | seq = self.get_sequence(text, speaker_id) 63 | mel = self.get_mel(mel_path) 64 | return (seq, mel, seq_len) 65 | 66 | def get_mel(self, filename): 67 | if not self.load_mel_from_disk: 68 | audio = load_wav_to_torch(filename) 69 | melspec = self.stft.mel_spectrogram(audio.unsqueeze(0)) 70 | melspec = torch.squeeze(melspec, 0) 71 | else: 72 | melspec = torch.from_numpy(np.load(filename)) 73 | assert melspec.size(0) == self.stft.n_mel_channels, ( 74 | 'Mel dimension mismatch: given {}, expected {}'.format( 75 | melspec.size(0), self.stft.n_mel_channels)) 76 | 77 | return melspec 78 | 79 | def get_sequence(self, text, speaker_id): 80 | return text_to_sequence(text, speaker_id, self.text_cleaners) 81 | 82 | def __getitem__(self, index): 83 | group = [self.get_mel_text_pair(i, self.metadatas[i][self.offsets[i]]) for i in range(self.speaker_num)] 84 | self.offsets = [(self.offsets[i] + 1) % len(self.metadatas[i]) for i in range(self.speaker_num)] 85 | return group 86 | 87 | def __len__(self): 88 | return sum([len(m) for m in self.metadatas]) // self.speaker_num 89 | 90 | 91 | class TextMelCollate(): 92 | """ Zero-pads model inputs and targets based on number of frames per step 93 | """ 94 | def __init__(self, args): 95 | self.n_frames_per_step = args.n_frames_per_step 96 | self.mel_pad_val = args.mel_pad_val 97 | 98 | def __call__(self, batch): 99 | """Collates a training batch from normalized text and mel-spectrograms 100 | PARAMS 101 | ------ 102 | batch: [text_normalized, mel_normalized] 103 | """ 104 | # Flatten the batch 105 | batch = [sample for group
in batch for sample in group] 106 | 107 | # Right zero-pad all one-hot text sequences to max input length 108 | seq_lens, ids_sorted_decreasing = torch.sort( 109 | torch.IntTensor([len(x[0]) for x in batch]), 110 | dim=0, descending=True) 111 | max_seq_len = seq_lens[0] 112 | 113 | seqs = [] 114 | for i in range(len(ids_sorted_decreasing)): 115 | seq = batch[ids_sorted_decreasing[i]][0] 116 | seqs.append(np.pad(seq, [0, max_seq_len - len(seq)], mode='constant')) 117 | 118 | # Right zero-pad mel-spec 119 | num_mels = batch[0][1].size(0) 120 | max_target_len = max([x[1].size(1) for x in batch]) 121 | if max_target_len % self.n_frames_per_step != 0: 122 | max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step 123 | assert max_target_len % self.n_frames_per_step == 0 124 | 125 | # include mel padded and gate padded 126 | targets, reduced_targets = [], [] 127 | gates = np.zeros([len(batch), max_target_len], dtype=np.float32) 128 | target_lengths = torch.IntTensor(len(batch)) 129 | for i in range(len(ids_sorted_decreasing)): 130 | mel = batch[ids_sorted_decreasing[i]][1] 131 | target_lengths[i] = mel.shape[1] 132 | gates[i, mel.shape[1] - 1:] = 1 133 | padded_mel = np.pad(mel, [(0, 0), (0, max_target_len - mel.size(1))], mode='constant', constant_values=self.mel_pad_val) 134 | targets.append(padded_mel) 135 | reduced_mel = padded_mel[:, ::self.n_frames_per_step] 136 | reduced_targets.append(reduced_mel) 137 | 138 | seqs = torch.from_numpy(np.stack(seqs)) 139 | targets = torch.from_numpy(np.stack(targets)) 140 | reduced_targets = torch.from_numpy(np.stack(reduced_targets)) 141 | gates = torch.from_numpy(gates) 142 | return seqs, seq_lens, targets, reduced_targets, gates, target_lengths 143 | 144 | 145 | def to_gpu(x): 146 | x = x.contiguous() 147 | if torch.cuda.is_available(): 148 | x = x.cuda(non_blocking=True) 149 | return x 150 | 151 | 152 | def batch_to_gpu(batch): 153 | texts, text_lengths, targets, reduced_targets, gates, target_lengths = batch 154 | x = (to_gpu(texts).long(), to_gpu(text_lengths).int(), to_gpu(reduced_targets).float(), to_gpu(target_lengths).int()) 155 | y = (targets, gates) 156 | num_frames = torch.sum(target_lengths) 157 | return (x, y, num_frames) 158 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | 28 | import argparse 29 | import numpy as np 30 | import os 31 | import sys 32 | import time 33 | import torch 34 | from apex import amp 35 | from common.audio_processing import griffin_lim 36 | from common.layers import TacotronSTFT 37 | from common.utils import save_wav 38 | from scipy.io.wavfile import write 39 | from tacotron2.loader import parse_tacotron2_args 40 | from tacotron2.loader import get_tacotron2_model 41 | from tacotron2.text import text_to_sequence 42 | from train import parse_training_args 43 | from dllogger.logger import LOGGER 44 | import dllogger.logger as dllg 45 | from dllogger.autologging import log_hardware, log_args 46 | 47 | 48 | def parse_args(parser): 49 | """ 50 | Parse commandline arguments. 51 | """ 52 | parser.add_argument('-i', '--input-file', type=str, default="text.txt", help='full path to the input text (phrases separated by new lines)') 53 | parser.add_argument('--checkpoint', type=str, default="logs/checkpoint_latest.pt", help='full path to the Tacotron2 model checkpoint file') 54 | parser.add_argument('-id', '--speaker-id', default=0, type=int, help='Speaker identity') 55 | parser.add_argument('-sn', '--speaker-num', default=1, type=int, help='Number of speakers') 56 | parser.add_argument('--include-warmup', action='store_true', help='Include warmup') 57 | 58 | return parser 59 | 60 | 61 | def load_checkpoint(checkpoint_path, model): 62 | assert os.path.isfile(checkpoint_path) 63 | model.load_state_dict(torch.load(checkpoint_path)) 64 | print(f"Loaded checkpoint: {checkpoint_path}") 65 | return model 66 | 67 | 68 | def load_and_setup_model(parser, args): 69 | checkpoint_path = args.checkpoint 70 | parser = parse_tacotron2_args(parser, add_help=False) 71 | args, _ = parser.parse_known_args() 72 | model = get_tacotron2_model(args, args.speaker_num, is_training=False) 73 | model.restore_checkpoint(checkpoint_path) 74 | model.eval() 75 | 76 | if args.amp_run: 77 | model, _ = amp.initialize(model, [], opt_level='O1') 78 | 79 | return model, args 80 | 81 | 82 | # taken from tacotron2/data_function.py:TextMelCollate.__call__ 83 | def pad_sequences(sequences): 84 | # Right zero-pad all text sequences to max input length 85 | text_lengths, ids_sorted_decreasing = torch.sort( 86 | torch.IntTensor([len(x) for x in sequences]), 87 | dim=0, descending=True) 88 | max_text_len = text_lengths[0] 89 | 90 | texts = [] 91 | for i in range(len(ids_sorted_decreasing)): 92 | text = sequences[ids_sorted_decreasing[i]] 93 | texts.append(np.pad(text, [0, max_text_len - len(text)], mode='constant')) 94 | 95 | texts = torch.from_numpy(np.stack(texts)) 96 | return texts,
text_lengths, ids_sorted_decreasing 97 | 98 | 99 | def prepare_input_sequence(texts, speaker_id): 100 | sequences = [text_to_sequence(text, speaker_id, ['basic_cleaners'])[:] for text in texts] 101 | texts, text_lengths, ids_sorted_decreasing = pad_sequences(sequences) 102 | 103 | if torch.cuda.is_available(): 104 | texts = texts.cuda().long() 105 | text_lengths = text_lengths.cuda().int() 106 | else: 107 | texts = texts.long() 108 | text_lengths = text_lengths.int() 109 | 110 | return texts, text_lengths, ids_sorted_decreasing 111 | 112 | 113 | class MeasureTime(): 114 | def __init__(self, measurements, key): 115 | self.measurements = measurements 116 | self.key = key 117 | 118 | def __enter__(self): 119 | torch.cuda.synchronize() 120 | self.t0 = time.perf_counter() 121 | 122 | def __exit__(self, exc_type, exc_value, exc_traceback): 123 | torch.cuda.synchronize() 124 | self.measurements[self.key] = time.perf_counter() - self.t0 125 | 126 | 127 | def main(): 128 | """ 129 | Launches text to speech (inference). 130 | Inference is executed on a single GPU. 131 | """ 132 | parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference') 133 | parser = parse_training_args(parser) 134 | parser = parse_args(parser) 135 | args, _ = parser.parse_known_args() 136 | 137 | LOGGER.set_model_name("Tacotron2_PyT") 138 | LOGGER.set_backends([ 139 | dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), 140 | dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) 141 | ]) 142 | LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) 143 | LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 144 | LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 145 | 146 | model, args = load_and_setup_model(parser, args) 147 | 148 | log_hardware() 149 | log_args(args) 150 | 151 | try: 152 | f = open(args.input_file) 153 | sentences = list(map(lambda s : s.strip(), f.readlines())) 154 | except UnicodeDecodeError: 155 | f = open(args.input_file, encoding='gbk') 156 | sentences = list(map(lambda s : s.strip(), f.readlines())) 157 | 158 | os.makedirs(args.output_dir, exist_ok=True) 159 | 160 | LOGGER.iteration_start() 161 | 162 | measurements = {} 163 | 164 | sequences, text_lengths, ids_sorted_decreasing = prepare_input_sequence(sentences, args.speaker_id) 165 | 166 | with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): 167 | outputs = model.infer(sequences, text_lengths) 168 | _, mels, _, _, mel_lengths = [output.cpu() for output in outputs] 169 | 170 | tacotron2_infer_perf = mels.size(0)*mels.size(2)/measurements['tacotron2_time'] 171 | 172 | LOGGER.log(key="tacotron2_frames_per_sec", value=tacotron2_infer_perf) 173 | LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time']) 174 | LOGGER.log(key="latency", value=(measurements['tacotron2_time'])) 175 | LOGGER.iteration_stop() 176 | LOGGER.finish() 177 | 178 | # recover to the original order and concatenate 179 | stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length, 180 | args.n_mel_channels, args.sampling_rate, args.mel_fmin, args.mel_fmax) 181 | ids_sorted_decreasing = ids_sorted_decreasing.numpy().tolist() 182 | mels = [mel[:, :length] for mel, length in zip(mels, mel_lengths)] 183 | mels = [mels[ids_sorted_decreasing.index(i)] for i in range(len(ids_sorted_decreasing))] 184 | magnitudes = stft.inv_mel_spectrogram(torch.cat(mels, axis=-1)) 185 | wav = 
griffin_lim(magnitudes, stft.stft_fn) 186 | save_wav(wav, os.path.join(args.output_dir, 'eval.wav')) 187 | np.save(os.path.join(args.output_dir, 'eval.npy'), np.concatenate(mels, axis=-1), allow_pickle=False) 188 | 189 | 190 | if __name__ == '__main__': 191 | main() 192 | -------------------------------------------------------------------------------- /dllogger/tags.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 MLBenchmark Group. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | # 16 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 17 | # 18 | # Licensed under the Apache License, Version 2.0 (the "License"); 19 | # you may not use this file except in compliance with the License. 20 | # You may obtain a copy of the License at 21 | # 22 | # http://www.apache.org/licenses/LICENSE-2.0 23 | # 24 | # Unless required by applicable law or agreed to in writing, software 25 | # distributed under the License is distributed on an "AS IS" BASIS, 26 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | # See the License for the specific language governing permissions and 28 | # limitations under the License. 29 | 30 | # Common values reported 31 | 32 | VALUE_EPOCH = "epoch" 33 | VALUE_ITERATION = "iteration" 34 | VALUE_ACCURACY = "accuracy" 35 | VALUE_BLEU = "bleu" 36 | VALUE_TOP1 = "top1" 37 | VALUE_TOP5 = "top5" 38 | VALUE_BBOX_MAP = "bbox_map" 39 | VALUE_MASK_MAP = "mask_map" 40 | VALUE_BCE = "binary_cross_entropy" 41 | 42 | 43 | # Timed blocks (used with timed_function & timed_block) 44 | # For each there should be *_start and *_stop tags defined 45 | 46 | RUN_BLOCK = "run" 47 | SETUP_BLOCK = "setup" 48 | PREPROC_BLOCK = "preproc" 49 | 50 | TRAIN_BLOCK = "train" 51 | TRAIN_PREPROC_BLOCK = "train_preproc" 52 | TRAIN_EPOCH_BLOCK = "train_epoch" 53 | TRAIN_EPOCH_PREPROC_BLOCK = "train_epoch_preproc" 54 | TRAIN_CHECKPOINT_BLOCK = "train_checkpoint" 55 | TRAIN_ITER_BLOCK = "train_iteration" 56 | 57 | EVAL_BLOCK = "eval" 58 | EVAL_ITER_BLOCK = "eval_iteration" 59 | 60 | #TODO: to remove?
61 | TIMED_BLOCKS = { 62 | RUN_BLOCK, 63 | SETUP_BLOCK, 64 | PREPROC_BLOCK, 65 | TRAIN_BLOCK, 66 | TRAIN_PREPROC_BLOCK, 67 | TRAIN_EPOCH_BLOCK, 68 | TRAIN_EPOCH_PREPROC_BLOCK, 69 | TRAIN_CHECKPOINT_BLOCK, 70 | TRAIN_ITER_BLOCK, 71 | EVAL_BLOCK, 72 | EVAL_ITER_BLOCK, 73 | } 74 | 75 | 76 | # Events 77 | 78 | RUN_INIT = "run_init" 79 | 80 | SETUP_START = "setup_start" 81 | SETUP_STOP = "setup_stop" 82 | 83 | PREPROC_START = "preproc_start" 84 | PREPROC_STOP = "preproc_stop" 85 | 86 | RUN_START = "run_start" 87 | RUN_STOP = "run_stop" 88 | RUN_FINAL = "run_final" 89 | 90 | TRAIN_CHECKPOINT_START = "train_checkpoint_start" 91 | TRAIN_CHECKPOINT_STOP = "train_checkpoint_stop" 92 | 93 | TRAIN_PREPROC_START = "train_preproc_start" 94 | TRAIN_PREPROC_STOP = "train_preproc_stop" 95 | 96 | TRAIN_EPOCH_PREPROC_START = "train_epoch_preproc_start" 97 | TRAIN_EPOCH_PREPROC_STOP = "train_epoch_preproc_stop" 98 | 99 | TRAIN_ITER_START = "train_iter_start" 100 | TRAIN_ITER_STOP = "train_iter_stop" 101 | 102 | TRAIN_EPOCH_START = "train_epoch_start" 103 | TRAIN_EPOCH_STOP = "train_epoch_stop" 104 | 105 | 106 | # MLPerf specific tags 107 | 108 | RUN_CLEAR_CACHES = "run_clear_caches" 109 | 110 | PREPROC_NUM_TRAIN_EXAMPLES = "preproc_num_train_examples" 111 | PREPROC_NUM_EVAL_EXAMPLES = "preproc_num_eval_examples" 112 | PREPROC_TOKENIZE_TRAINING = "preproc_tokenize_training" 113 | PREPROC_TOKENIZE_EVAL = "preproc_tokenize_eval" 114 | PREPROC_VOCAB_SIZE = "preproc_vocab_size" 115 | 116 | RUN_SET_RANDOM_SEED = "run_set_random_seed" 117 | 118 | INPUT_SIZE = "input_size" 119 | INPUT_BATCH_SIZE = "input_batch_size" 120 | INPUT_ORDER = "input_order" 121 | INPUT_SHARD = "input_shard" 122 | INPUT_BN_SPAN = "input_bn_span" 123 | 124 | INPUT_CENTRAL_CROP = "input_central_crop" 125 | INPUT_CROP_USES_BBOXES = "input_crop_uses_bboxes" 126 | INPUT_DISTORTED_CROP_MIN_OBJ_COV = "input_distorted_crop_min_object_covered" 127 | INPUT_DISTORTED_CROP_RATIO_RANGE = "input_distorted_crop_aspect_ratio_range" 128 | INPUT_DISTORTED_CROP_AREA_RANGE = "input_distorted_crop_area_range" 129 | INPUT_DISTORTED_CROP_MAX_ATTEMPTS = "input_distorted_crop_max_attempts" 130 | INPUT_MEAN_SUBTRACTION = "input_mean_subtraction" 131 | INPUT_RANDOM_FLIP = "input_random_flip" 132 | 133 | INPUT_RESIZE = "input_resize" 134 | INPUT_RESIZE_ASPECT_PRESERVING = "input_resize_aspect_preserving" 135 | 136 | 137 | # Opt 138 | 139 | OPT_NAME = "opt_name" 140 | 141 | OPT_LR = "opt_learning_rate" 142 | OPT_MOMENTUM = "opt_momentum" 143 | 144 | OPT_WEIGHT_DECAY = "opt_weight_decay" 145 | 146 | OPT_HP_ADAM_BETA1 = "opt_hp_Adam_beta1" 147 | OPT_HP_ADAM_BETA2 = "opt_hp_Adam_beta2" 148 | OPT_HP_ADAM_EPSILON = "opt_hp_Adam_epsilon" 149 | 150 | OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps" 151 | 152 | 153 | # Train 154 | 155 | TRAIN_LOOP = "train_loop" 156 | TRAIN_EPOCH = "train_epoch" 157 | TRAIN_CHECKPOINT = "train_checkpoint" 158 | TRAIN_LOSS = "train_loss" 159 | TRAIN_ITERATION_LOSS = "train_iteration_loss" 160 | 161 | 162 | # Eval 163 | 164 | EVAL_START = "eval_start" 165 | EVAL_SIZE = "eval_size" 166 | EVAL_TARGET = "eval_target" 167 | EVAL_ACCURACY = "eval_accuracy" 168 | EVAL_STOP = "eval_stop" 169 | 170 | 171 | # Perf 172 | 173 | PERF_IT_PER_SEC = "perf_it_per_sec" 174 | PERF_TIME_TO_TRAIN = "time_to_train" 175 | 176 | EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy" 177 | 178 | 179 | # Model 180 | 181 | MODEL_HP_LOSS_FN = "model_hp_loss_fn" 182 | 183 | MODEL_HP_INITIAL_SHAPE = "model_hp_initial_shape" 184 | MODEL_HP_FINAL_SHAPE = 
"model_hp_final_shape" 185 | 186 | MODEL_L2_REGULARIZATION = "model_l2_regularization" 187 | MODEL_EXCLUDE_BN_FROM_L2 = "model_exclude_bn_from_l2" 188 | 189 | MODEL_HP_RELU = "model_hp_relu" 190 | MODEL_HP_CONV2D_FIXED_PADDING = "model_hp_conv2d_fixed_padding" 191 | MODEL_HP_BATCH_NORM = "model_hp_batch_norm" 192 | MODEL_HP_DENSE = "model_hp_dense" 193 | 194 | 195 | # GNMT specific 196 | 197 | MODEL_HP_LOSS_SMOOTHING = "model_hp_loss_smoothing" 198 | MODEL_HP_NUM_LAYERS = "model_hp_num_layers" 199 | MODEL_HP_HIDDEN_SIZE = "model_hp_hidden_size" 200 | MODEL_HP_DROPOUT = "model_hp_dropout" 201 | 202 | EVAL_HP_BEAM_SIZE = "eval_hp_beam_size" 203 | TRAIN_HP_MAX_SEQ_LEN = "train_hp_max_sequence_length" 204 | EVAL_HP_MAX_SEQ_LEN = "eval_hp_max_sequence_length" 205 | EVAL_HP_LEN_NORM_CONST = "eval_hp_length_normalization_constant" 206 | EVAL_HP_LEN_NORM_FACTOR = "eval_hp_length_normalization_factor" 207 | EVAL_HP_COV_PENALTY_FACTOR = "eval_hp_coverage_penalty_factor" 208 | 209 | 210 | # NCF specific 211 | 212 | PREPROC_HP_MIN_RATINGS = "preproc_hp_min_ratings" 213 | PREPROC_HP_NUM_EVAL = "preproc_hp_num_eval" 214 | PREPROC_HP_SAMPLE_EVAL_REPLACEMENT = "preproc_hp_sample_eval_replacement" 215 | 216 | INPUT_HP_NUM_NEG = "input_hp_num_neg" 217 | INPUT_HP_SAMPLE_TRAIN_REPLACEMENT = "input_hp_sample_train_replacement" 218 | INPUT_STEP_TRAIN_NEG_GEN = "input_step_train_neg_gen" 219 | INPUT_STEP_EVAL_NEG_GEN = "input_step_eval_neg_gen" 220 | 221 | EVAL_HP_NUM_USERS = "eval_hp_num_users" 222 | EVAL_HP_NUM_NEG = "eval_hp_num_neg" 223 | 224 | MODEL_HP_MF_DIM = "model_hp_mf_dim" 225 | MODEL_HP_MLP_LAYER_SIZES = "model_hp_mlp_layer_sizes" 226 | 227 | 228 | # RESNET specific 229 | 230 | EVAL_EPOCH_OFFSET = "eval_offset" 231 | 232 | MODEL_HP_INITIAL_MAX_POOL = "model_hp_initial_max_pool" 233 | MODEL_HP_BEGIN_BLOCK = "model_hp_begin_block" 234 | MODEL_HP_END_BLOCK = "model_hp_end_block" 235 | MODEL_HP_BLOCK_TYPE = "model_hp_block_type" 236 | MODEL_HP_PROJECTION_SHORTCUT = "model_hp_projection_shortcut" 237 | MODEL_HP_SHORTCUT_ADD = "model_hp_shorcut_add" 238 | MODEL_HP_RESNET_TOPOLOGY = "model_hp_resnet_topology" 239 | 240 | 241 | # Transformer specific 242 | 243 | INPUT_MAX_LENGTH = "input_max_length" 244 | 245 | MODEL_HP_INITIALIZER_GAIN = "model_hp_initializer_gain" 246 | MODEL_HP_VOCAB_SIZE = "model_hp_vocab_size" 247 | MODEL_HP_NUM_HIDDEN_LAYERS = "model_hp_hidden_layers" 248 | MODEL_HP_EMBEDDING_SHARED_WEIGHTS = "model_hp_embedding_shared_weights" 249 | MODEL_HP_ATTENTION_DENSE = "model_hp_attention_dense" 250 | MODEL_HP_ATTENTION_DROPOUT = "model_hp_attention_dropout" 251 | MODEL_HP_FFN_OUTPUT_DENSE = "model_hp_ffn_output_dense" 252 | MODEL_HP_FFN_FILTER_DENSE = "model_hp_ffn_filter_dense" 253 | MODEL_HP_RELU_DROPOUT = "model_hp_relu_dropout" 254 | MODEL_HP_LAYER_POSTPROCESS_DROPOUT = "model_hp_layer_postprocess_dropout" 255 | MODEL_HP_NORM = "model_hp_norm" 256 | MODEL_HP_SEQ_BEAM_SEARCH = "model_hp_sequence_beam_search" 257 | 258 | -------------------------------------------------------------------------------- /gta.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | 28 | import argparse 29 | import numpy as np 30 | import os 31 | import sys 32 | import time 33 | import torch 34 | from apex import amp 35 | from scipy.io.wavfile import write 36 | from tacotron2.data_function import to_gpu 37 | from tacotron2.loader import parse_tacotron2_args 38 | from tacotron2.loader import get_tacotron2_model 39 | from tacotron2.text import text_to_sequence 40 | from train import parse_training_args 41 | from common.audio_processing import griffin_lim 42 | from common.layers import TacotronSTFT 43 | from common.utils import load_metadata, load_wav_to_torch, save_wav 44 | from dllogger.logger import LOGGER 45 | import dllogger.logger as dllg 46 | from dllogger.autologging import log_hardware, log_args 47 | from tqdm import tqdm 48 | 49 | 50 | def load_checkpoint(checkpoint_path, model): 51 | assert os.path.isfile(checkpoint_path) 52 | model.load_state_dict(torch.load(checkpoint_path)) 53 | print(f"Loaded checkpoint: {checkpoint_path}") 54 | return model 55 | 56 | 57 | def load_and_setup_model(parser, args): 58 | checkpoint_path = os.path.join('logs', args.latest_checkpoint_file) 59 | parser = parse_tacotron2_args(parser, add_help=False) 60 | args, _ = parser.parse_known_args() 61 | model = get_tacotron2_model(args, len(args.training_anchor_dirs), is_training=False) 62 | model.restore_checkpoint(checkpoint_path) 63 | model.eval() 64 | 65 | if args.amp_run: 66 | model, _ = amp.initialize(model, [], opt_level='O1') 67 | 68 | return model, args 69 | 70 | 71 | # taken from tacotron2/data_function.py:TextMelCollate.__call__ 72 | def pad_sequences(sequences): 73 | # Right zero-pad all text sequences to max input length 74 | text_lengths, ids_sorted_decreasing = torch.sort( 75 | torch.IntTensor([len(x) for x in sequences]), 76 | dim=0, descending=True) 77 | max_text_len = text_lengths[0] 78 | 79 | texts = [] 80 | for i in 
range(len(ids_sorted_decreasing)): 81 | text = sequences[ids_sorted_decreasing[i]] 82 | texts.append(np.pad(text, [0, max_text_len - len(text)], mode='constant')) 83 | 84 | texts = torch.from_numpy(np.stack(texts)) 85 | return texts, text_lengths, ids_sorted_decreasing 86 | 87 | 88 | def prepare_input_sequence(texts, speaker_id): 89 | sequences = [text_to_sequence(text, speaker_id, ['basic_cleaners'])[:] for text in texts] 90 | texts, text_lengths, ids_sorted_decreasing = pad_sequences(sequences) 91 | 92 | if torch.cuda.is_available(): 93 | texts = texts.cuda().long() 94 | text_lengths = text_lengths.cuda().int() 95 | else: 96 | texts = texts.long() 97 | text_lengths = text_lengths.int() 98 | 99 | return texts, text_lengths, ids_sorted_decreasing 100 | 101 | 102 | class MeasureTime(): 103 | def __init__(self, measurements, key): 104 | self.measurements = measurements 105 | self.key = key 106 | 107 | def __enter__(self): 108 | torch.cuda.synchronize() 109 | self.t0 = time.perf_counter() 110 | 111 | def __exit__(self, exc_type, exc_value, exc_traceback): 112 | torch.cuda.synchronize() 113 | self.measurements[self.key] = time.perf_counter() - self.t0 114 | 115 | 116 | def main(): 117 | """ 118 | Launches ground-truth-aligned (GTA) mel-spectrogram generation. 119 | Generation is executed on a single GPU. 120 | """ 121 | parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 GTA Mel Generation') 122 | parser = parse_training_args(parser) 123 | args, _ = parser.parse_known_args() 124 | 125 | LOGGER.set_model_name("Tacotron2_PyT") 126 | LOGGER.set_backends([ 127 | dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), 128 | dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) 129 | ]) 130 | LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) 131 | LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 132 | LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 133 | 134 | model, args = load_and_setup_model(parser, args) 135 | 136 | log_hardware() 137 | log_args(args) 138 | 139 | os.makedirs(args.output_dir, exist_ok=True) 140 | 141 | LOGGER.iteration_start() 142 | 143 | measurements = {} 144 | 145 | anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.training_anchor_dirs] 146 | metadatas = [load_metadata(anchor) for anchor in anchor_dirs] 147 | stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length, 148 | args.n_mel_channels, args.sampling_rate, args.mel_fmin, args.mel_fmax) 149 | with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): 150 | for speaker_id in range(len(anchor_dirs)): 151 | metadata = metadatas[speaker_id] 152 | for npy_path, text in tqdm(metadata): 153 | seq = text_to_sequence(text, speaker_id, ['basic_cleaners']) 154 | seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0) 155 | seq_lens = torch.IntTensor([len(seq)]) 156 | wav = load_wav_to_torch(npy_path) 157 | mel = stft.mel_spectrogram(wav.unsqueeze(0)) 158 | mel = mel.squeeze() 159 | max_target_len = mel.size(1) - 1 160 | max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step 161 | padded_mel = np.pad(mel, [(0, 0), (0, max_target_len - mel.size(1))], mode='constant', constant_values=args.mel_pad_val) 162 | target = padded_mel[:, ::args.n_frames_per_step] 163 | targets = torch.from_numpy(np.stack(target)).unsqueeze(0) 164 | target_lengths = torch.IntTensor([target.shape[1]]) 165 | outputs = 
model.infer(to_gpu(seqs).long(), to_gpu(seq_lens).int(), to_gpu(targets).half(), to_gpu(target_lengths).int()) 166 | _, mel_out, _, _ = [output.cpu() for output in outputs if output is not None] 167 | mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1] 168 | # clamp the range according to the reference level decibel bias to eliminate background noise (20 dB) 169 | mel_out = np.clip(mel_out, args.mel_pad_val, -args.mel_pad_val) 170 | assert(mel_out.shape[-1] == wav.shape[-1] // args.hop_length) 171 | fname = os.path.basename(npy_path) 172 | np.save(os.path.join(args.output_dir, fname), mel_out, allow_pickle=False) 173 | # GTA synthesis 174 | # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze()) 175 | # wav = griffin_lim(magnitudes, stft.stft_fn, 60) 176 | # save_wav(wav, os.path.join(args.output_dir, 'eval.wav')) 177 | 178 | LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time']) 179 | LOGGER.log(key="latency", value=measurements['tacotron2_time']) 180 | LOGGER.iteration_stop() 181 | LOGGER.finish() 182 | 183 | 184 | if __name__ == '__main__': 185 | main() 186 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import pprint 3 | 4 | class HParams(object): 5 | def __init__(self, **kwargs): self.__dict__.update(kwargs) 6 | def __setitem__(self, key, value): setattr(self, key, value) 7 | def __getitem__(self, key): return getattr(self, key) 8 | def __repr__(self): return pprint.pformat(self.__dict__) 9 | 10 | def parse(self, string): 11 | # Overrides hparams from a comma-separated string of name=value pairs 12 | if len(string) > 0: 13 | overrides = [s.split("=") for s in string.split(",")] 14 | keys, values = zip(*overrides) 15 | keys = list(map(str.strip, keys)) 16 | values = list(map(str.strip, values)) 17 | for k, v in zip(keys, values): 18 | self.__dict__[k] = ast.literal_eval(v) 19 | return self 20 | 21 | 22 | # Default hyperparameters 23 | hparams = HParams( 24 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 25 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners". 26 | cleaners='basic_cleaners', 27 | 28 | #Hardware setup (TODO: multi-GPU parallel tacotron training) 29 | use_all_gpus = False, #Whether to use all GPU resources. If True, the total number of available gpus will override num_gpus.
30 | num_gpus = 1, #Determines the number of gpus in use 31 | ########################################################################################################################################### 32 | 33 | #Audio 34 | num_mels = 80, #Number of mel-spectrogram channels and local conditioning dimensionality 35 | rescale = False, #Whether to rescale audio prior to preprocessing 36 | rescaling_max = 0.999, #Rescaling value 37 | trim_silence = True, #Whether to clip silence in audio (at the beginning and end of the audio only, not the middle) 38 | clip_mels_length = True, #For cases of OOM (Not really recommended, working on a workaround) 39 | max_mel_frames = 900, #Only relevant when clip_mels_length = True 40 | max_text_length = 300, #Only relevant when clip_mels_length = True 41 | sentence_span = 20, # Number of mel hops for each sentence interval 42 | 43 | #Mel spectrogram 44 | n_fft = 1024, #Extra window size is filled with 0 paddings to match this parameter 45 | hop_size = 256, #For 22050Hz, 256 ~= 11.5 ms 46 | win_size = 1024, #For 22050Hz, 1024 ~= 46 ms (If None, win_size = n_fft) 47 | sample_rate = 22050, #22050 Hz (corresponding to ljspeech dataset) 48 | frame_shift_ms = None, 49 | preemphasis = 0.97, # preemphasis coefficient 50 | 51 | #Multi-speaker: batch_size should be an integer multiple of the number of speakers. 52 | anchor_dirs = ['tts_fanfanli_22050', 'tts_xiaoya_22050', 'tts_yangluzhuo_22050', 'tts_yuanzhonglu_22050'], 53 | 54 | #M-AILABS (and other datasets) trim params 55 | trim_fft_size = 512, 56 | trim_hop_size = 128, 57 | trim_top_db = 60, 58 | 59 | #Mel and Linear spectrograms normalization/scaling and clipping 60 | signal_normalization = True, 61 | allow_clipping_in_normalization = True, #Only relevant if signal_normalization = True 62 | symmetric_mels = True, #Whether to scale the data to be symmetric around 0 63 | max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] 64 | 65 | #Limits 66 | min_level_db = -100, 67 | ref_level_db = 20, 68 | fmin = 50, #Set this to 75 if your speaker is male; if female, 125 should help take off noise. (To be tuned per dataset) 69 | fmax = 7600, 70 | 71 | #Griffin Lim 72 | power = 1.2, 73 | griffin_lim_iters = 60, 74 | ########################################################################################################################################### 75 | 76 | #Tacotron 77 | outputs_per_step = 2, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size) 78 | stop_at_any = False, #Determines whether the decoder should stop when predicting to any frame or to all of them 79 | batch_norm_position = 'after', #Can be in ('before', 'after'). Determines whether we use batch norm before or after the activation function (relu). A matter for debate.
80 | 81 | embedding_dim = 512, #dimension of embedding space 82 | 83 | enc_conv_num_layers = 3, #number of encoder convolutional layers 84 | enc_conv_kernel_size = (5, ), #size of encoder convolution filters for each layer 85 | enc_conv_channels = 512, #number of encoder convolution filters for each layer 86 | encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward) 87 | 88 | smoothing = False, #Whether to smooth the attention normalization function 89 | attention_dim = 128, #dimension of attention space 90 | attention_filters = 32, #number of attention convolution filters 91 | attention_kernel = (31, ), #kernel size of attention convolution 92 | cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 93 | 94 | #Attention synthesis constraints 95 | #"Monotonic" constraint forces the model to only look at the forward attention_win_size steps. 96 | #"Window" allows the model to look at attention_win_size neighbors, both forward and backward steps. 97 | synthesis_constraint = False, #Whether to use attention window constraints in synthesis only (Useful for long utterance synthesis) 98 | synthesis_constraint_type = 'window', #can be in ('window', 'monotonic'). 99 | attention_win_size = 7, #Size of each side of the window. The current step does not count. If mode is 'window' and attention_win_size is not even, the extra 1 is given to the backward part of the window. 100 | 101 | prenet_layers = [256, 256], #number of layers and number of units of prenet 102 | decoder_layers = 2, #number of decoder lstm layers 103 | decoder_lstm_units = 1024, #number of decoder lstm units on each layer 104 | max_iters = 1000, #Max decoder steps during inference (Just for safety from infinite loop cases) 105 | 106 | postnet_num_layers = 5, #number of postnet convolutional layers 107 | postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer 108 | postnet_channels = 512, #number of postnet convolution filters for each layer 109 | 110 | #Loss params 111 | mask_encoder = False, #whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence. 112 | mask_decoder = False, #Whether to use loss mask for padded sequences (if False, loss function will not be weighted, else recommended pos_weight = 20) 113 | cross_entropy_pos_weight = 1, #Use class weights to reduce the stop token class imbalance (by adding more penalty on False Negatives (FN)) (1 = disabled) 114 | ########################################################################################################################################### 115 | 116 | #Tacotron Training 117 | #Reproduction seeds 118 | tacotron_random_seed = 5339, #Determines initial graph and operations (i.e. model) random state for reproducibility 119 | tacotron_data_random_state = 1234, #random state for train/test split repeatability 120 | 121 | #performance parameters 122 | tacotron_swap_with_cpu = False, #Whether to use the cpu as support for the gpu for decoder computation (Not recommended: may cause major slowdowns! Only use when critical!) 123 | 124 | #train/test split ratios, mini-batch sizes 125 | tacotron_batch_size = 36, #number of training samples on each training step 126 | #Tacotron batch synthesis supports ~16x the training batch size (no gradients during testing). 127 | #Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times different from training.
We thus recommend masking the encoder. 128 | tacotron_synthesis_batch_size = 48, #DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!! 129 | tacotron_test_size = 0.05, #% of data to keep as test data; if None, tacotron_test_batches must not be None. (5% is enough to have a good idea about overfit) 130 | tacotron_test_batches = None, #number of test batches. 131 | 132 | #Learning rate schedule 133 | tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay 134 | tacotron_start_decay = 40000, #Step at which learning decay starts 135 | tacotron_decay_steps = 40000, #Determines the learning rate decay slope (UNDER TEST) 136 | tacotron_decay_rate = 0.4, #learning rate decay rate (UNDER TEST) 137 | tacotron_initial_learning_rate = 1e-3, #starting learning rate 138 | tacotron_final_learning_rate = 1e-5, #minimal learning rate 139 | 140 | #Optimization parameters 141 | tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter 142 | tacotron_adam_beta2 = 0.999, #AdamOptimizer beta2 parameter 143 | tacotron_adam_epsilon = 1e-6, #AdamOptimizer Epsilon parameter 144 | 145 | #Regularization parameters 146 | tacotron_reg_weight = 1e-6, #regularization weight (for L2 regularization) 147 | tacotron_scale_regularization = False, #Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 148 | tacotron_zoneout_rate = 0.1, #zoneout rate for all LSTM cells in the network 149 | tacotron_dropout_rate = 0.5, #dropout rate for all convolutional layers + prenet 150 | tacotron_clip_gradients = True, #whether to clip gradients 151 | 152 | #Evaluation parameters 153 | tacotron_natural_eval = False, #Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or the same teacher-forcing ratio as in training (just for overfit checks) 154 | 155 | #Decoder RNN learning can be done in one of two ways: 156 | # Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode='constant' 157 | # Scheduled Sampling Scheme: From Teacher-Forcing to sampling from previous outputs as a function of the global step. (teacher forcing ratio decay) mode='scheduled' 158 | #The second approach is inspired by: 159 | #Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. 160 | #Can be found under: https://arxiv.org/pdf/1506.03099.pdf 161 | tacotron_teacher_forcing_mode = 'constant', #Can be ('constant' or 'scheduled'). 'scheduled' mode applies a cosine teacher forcing ratio decay. (Preference: scheduled) 162 | tacotron_teacher_forcing_ratio = 1., #Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force the next decoder inputs. Only relevant if mode='constant' 163 | tacotron_teacher_forcing_init_ratio = 1., #initial teacher forcing ratio. Relevant if mode='scheduled' 164 | tacotron_teacher_forcing_final_ratio = 0., #final teacher forcing ratio. (Set None to use alpha instead) Relevant if mode='scheduled' 165 | tacotron_teacher_forcing_start_decay = 10000, #starting point of teacher forcing ratio decay. Relevant if mode='scheduled' 166 | tacotron_teacher_forcing_decay_steps = 40000, #Determines the teacher forcing ratio decay slope. Relevant if mode='scheduled' 167 | tacotron_teacher_forcing_decay_alpha = None, #teacher forcing ratio decay rate. Defines the final tfr as a ratio of the initial tfr. Relevant if mode='scheduled'
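(Editor's aside: the comments above fix the endpoints and slope of the scheduled teacher-forcing decay but not its exact curve. One plausible cosine shape, written as a standalone comment sketch; the exact formula is an assumption, since the implementing code is not shown in this dump:)

# tfr(step) for mode='scheduled', with init_ratio=1.0, final_ratio=0.0,
# start_decay=10000, decay_steps=40000 as set above (assumed shape):
#   step <= start_decay:                        tfr = init_ratio
#   start_decay < step <= start_decay + decay_steps:
#       p   = (step - start_decay) / decay_steps
#       tfr = final_ratio + (init_ratio - final_ratio) * 0.5 * (1 + cos(pi * p))
#   step > start_decay + decay_steps:           tfr = final_ratio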
168 | ) 169 | 170 | def hparams_debug_string(): 171 | return str(hparams) 172 | -------------------------------------------------------------------------------- /filelists/ljs_mel_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | LJSpeech-1.1/mels/LJ022-0023.pt|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. 2 | LJSpeech-1.1/mels/LJ043-0030.pt|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too. 3 | LJSpeech-1.1/mels/LJ005-0201.pt|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five. 4 | LJSpeech-1.1/mels/LJ001-0110.pt|Even the Caslon type when enlarged shows great shortcomings in this respect: 5 | LJSpeech-1.1/mels/LJ003-0345.pt|All the committee could do in this respect was to throw the responsibility on others. 6 | LJSpeech-1.1/mels/LJ007-0154.pt|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated, 7 | LJSpeech-1.1/mels/LJ018-0098.pt|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others. 8 | LJSpeech-1.1/mels/LJ047-0044.pt|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies 9 | LJSpeech-1.1/mels/LJ031-0038.pt|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery. 10 | LJSpeech-1.1/mels/LJ048-0194.pt|during the morning of November twenty-two prior to the motorcade. 11 | LJSpeech-1.1/mels/LJ049-0026.pt|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President. 12 | LJSpeech-1.1/mels/LJ004-0152.pt|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four. 13 | LJSpeech-1.1/mels/LJ008-0278.pt|or theirs might be one of many, and it might be considered necessary to "make an example." 14 | LJSpeech-1.1/mels/LJ043-0002.pt|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald: 15 | LJSpeech-1.1/mels/LJ009-0114.pt|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here. 16 | LJSpeech-1.1/mels/LJ028-0506.pt|A modern artist would have difficulty in doing such accurate work. 17 | LJSpeech-1.1/mels/LJ050-0168.pt|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area 18 | LJSpeech-1.1/mels/LJ039-0223.pt|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon 19 | LJSpeech-1.1/mels/LJ029-0032.pt|According to O'Donnell, quote, we had a motorcade wherever we went, end quote. 20 | LJSpeech-1.1/mels/LJ031-0070.pt|Dr. Clark, who most closely observed the head wound, 21 | LJSpeech-1.1/mels/LJ034-0198.pt|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window.
22 | LJSpeech-1.1/mels/LJ026-0068.pt|Energy enters the plant, to a small extent, 23 | LJSpeech-1.1/mels/LJ039-0075.pt|once you know that you must put the crosshairs on the target and that is all that is necessary. 24 | LJSpeech-1.1/mels/LJ004-0096.pt|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized 25 | LJSpeech-1.1/mels/LJ005-0014.pt|Speaking on a debate on prison matters, he declared that 26 | LJSpeech-1.1/mels/LJ012-0161.pt|he was reported to have fallen away to a shadow. 27 | LJSpeech-1.1/mels/LJ018-0239.pt|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to 28 | LJSpeech-1.1/mels/LJ019-0257.pt|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines. 29 | LJSpeech-1.1/mels/LJ028-0008.pt|you tap gently with your heel upon the shoulder of the dromedary to urge her on. 30 | LJSpeech-1.1/mels/LJ024-0083.pt|This plan of mine is no attack on the Court; 31 | LJSpeech-1.1/mels/LJ042-0129.pt|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough. 32 | LJSpeech-1.1/mels/LJ036-0103.pt|The police asked him whether he could pick out his passenger from the lineup. 33 | LJSpeech-1.1/mels/LJ046-0058.pt|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles. 34 | LJSpeech-1.1/mels/LJ014-0076.pt|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive. 35 | LJSpeech-1.1/mels/LJ002-0043.pt|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen, 36 | LJSpeech-1.1/mels/LJ009-0076.pt|We come to the sermon. 37 | LJSpeech-1.1/mels/LJ017-0131.pt|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution. 38 | LJSpeech-1.1/mels/LJ046-0184.pt|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. 39 | LJSpeech-1.1/mels/LJ014-0263.pt|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 40 | LJSpeech-1.1/mels/LJ042-0096.pt|(old exchange rate) in addition to his factory salary of approximately equal amount 41 | LJSpeech-1.1/mels/LJ049-0050.pt|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. 42 | LJSpeech-1.1/mels/LJ019-0186.pt|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties, 43 | LJSpeech-1.1/mels/LJ028-0307.pt|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand. 44 | LJSpeech-1.1/mels/LJ012-0235.pt|While they were in a state of insensibility the murder was committed. 45 | LJSpeech-1.1/mels/LJ034-0053.pt|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. 46 | LJSpeech-1.1/mels/LJ014-0030.pt|These were damnatory facts which well supported the prosecution. 47 | LJSpeech-1.1/mels/LJ015-0203.pt|but were the precautions too minute, the vigilance too close to be eluded or overcome? 48 | LJSpeech-1.1/mels/LJ028-0093.pt|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters. 
49 | LJSpeech-1.1/mels/LJ002-0018.pt|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London, 50 | LJSpeech-1.1/mels/LJ028-0275.pt|At last, in the twentieth month, 51 | LJSpeech-1.1/mels/LJ012-0042.pt|which he kept concealed in a hiding-place with a trap-door just under his bed. 52 | LJSpeech-1.1/mels/LJ011-0096.pt|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm, 53 | LJSpeech-1.1/mels/LJ036-0077.pt|Roger D. Craig, a deputy sheriff of Dallas County, 54 | LJSpeech-1.1/mels/LJ016-0318.pt|Other officials, great lawyers, governors of prisons, and chaplains supported this view. 55 | LJSpeech-1.1/mels/LJ013-0164.pt|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning. 56 | LJSpeech-1.1/mels/LJ027-0141.pt|is closely reproduced in the life-history of existing deer. Or, in other words, 57 | LJSpeech-1.1/mels/LJ028-0335.pt|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands. 58 | LJSpeech-1.1/mels/LJ031-0202.pt|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy. 59 | LJSpeech-1.1/mels/LJ021-0145.pt|From those willing to join in establishing this hoped-for period of peace, 60 | LJSpeech-1.1/mels/LJ016-0288.pt|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. 61 | LJSpeech-1.1/mels/LJ028-0081.pt|Years later, when the archaeologists could readily distinguish the false from the true, 62 | LJSpeech-1.1/mels/LJ018-0081.pt|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him, 63 | LJSpeech-1.1/mels/LJ021-0066.pt|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits 64 | LJSpeech-1.1/mels/LJ009-0238.pt|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail. 65 | LJSpeech-1.1/mels/LJ005-0079.pt|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders. 66 | LJSpeech-1.1/mels/LJ035-0019.pt|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal. 67 | LJSpeech-1.1/mels/LJ036-0174.pt|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there. 68 | LJSpeech-1.1/mels/LJ046-0146.pt|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files 69 | LJSpeech-1.1/mels/LJ017-0044.pt|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator. 70 | LJSpeech-1.1/mels/LJ017-0070.pt|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash. 71 | LJSpeech-1.1/mels/LJ014-0020.pt|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 72 | LJSpeech-1.1/mels/LJ016-0020.pt|He never reached the cistern, but fell back into the yard, injuring his legs severely. 73 | LJSpeech-1.1/mels/LJ045-0230.pt|when he was finally apprehended in the Texas Theatre. 
Although it is not fully corroborated by others who were present, 74 | LJSpeech-1.1/mels/LJ035-0129.pt|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. 75 | LJSpeech-1.1/mels/LJ008-0307.pt|afterwards express a wish to murder the Recorder for having kept them so long in suspense. 76 | LJSpeech-1.1/mels/LJ008-0294.pt|nearly indefinitely deferred. 77 | LJSpeech-1.1/mels/LJ047-0148.pt|On October twenty-five, 78 | LJSpeech-1.1/mels/LJ008-0111.pt|They entered a "stone cold room," and were presently joined by the prisoner. 79 | LJSpeech-1.1/mels/LJ034-0042.pt|that he could only testify with certainty that the print was less than three days old. 80 | LJSpeech-1.1/mels/LJ037-0234.pt|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, 81 | LJSpeech-1.1/mels/LJ040-0002.pt|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one. 82 | LJSpeech-1.1/mels/LJ045-0140.pt|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved 83 | LJSpeech-1.1/mels/LJ012-0035.pt|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands. 84 | LJSpeech-1.1/mels/LJ012-0250.pt|On the seventh July, eighteen thirty-seven, 85 | LJSpeech-1.1/mels/LJ016-0179.pt|contracted with sheriffs and conveners to work by the job. 86 | LJSpeech-1.1/mels/LJ016-0138.pt|at a distance from the prison. 87 | LJSpeech-1.1/mels/LJ027-0052.pt|These principles of homology are essential to a correct interpretation of the facts of morphology. 88 | LJSpeech-1.1/mels/LJ031-0134.pt|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally. 89 | LJSpeech-1.1/mels/LJ019-0273.pt|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. 90 | LJSpeech-1.1/mels/LJ014-0110.pt|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects. 91 | LJSpeech-1.1/mels/LJ034-0160.pt|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle. 92 | LJSpeech-1.1/mels/LJ038-0199.pt|eleven. If I am alive and taken prisoner, 93 | LJSpeech-1.1/mels/LJ014-0010.pt|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came. 94 | LJSpeech-1.1/mels/LJ033-0047.pt|I noticed when I went out that the light was on, end quote, 95 | LJSpeech-1.1/mels/LJ040-0027.pt|He was never satisfied with anything. 96 | LJSpeech-1.1/mels/LJ048-0228.pt|and others who were present say that no agent was inebriated or acted improperly. 97 | LJSpeech-1.1/mels/LJ003-0111.pt|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. 98 | LJSpeech-1.1/mels/LJ008-0258.pt|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days, 99 | LJSpeech-1.1/mels/LJ029-0022.pt|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston. 100 | LJSpeech-1.1/mels/LJ004-0045.pt|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce. 
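Each line in these validation filelists is a pipe-delimited `path|transcript` pair; this file points at precomputed mel tensors (`mels/*.pt`), while the next one points at raw waveforms (`wavs/*.wav`). As a rough illustration of how such a list can be consumed — a minimal sketch only, since the project's actual loader (tacotron2/data_function.py) is not shown in this dump and `read_filelist` is a hypothetical name:

def read_filelist(path):
    """Parse 'file|transcript' pairs, splitting on the first pipe only so
    transcripts that happen to contain '|' stay intact."""
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n').split('|', 1) for line in f if line.strip()]

# e.g. pairs = read_filelist('filelists/ljs_mel_text_val_filelist.txt')
# pairs[0] -> ['LJSpeech-1.1/mels/LJ022-0023.pt', 'The overwhelming majority ...']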
101 | -------------------------------------------------------------------------------- /filelists/ljs_audio_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | LJSpeech-1.1/wavs/LJ022-0023.wav|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. 2 | LJSpeech-1.1/wavs/LJ043-0030.wav|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too. 3 | LJSpeech-1.1/wavs/LJ005-0201.wav|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five. 4 | LJSpeech-1.1/wavs/LJ001-0110.wav|Even the Caslon type when enlarged shows great shortcomings in this respect: 5 | LJSpeech-1.1/wavs/LJ003-0345.wav|All the committee could do in this respect was to throw the responsibility on others. 6 | LJSpeech-1.1/wavs/LJ007-0154.wav|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated, 7 | LJSpeech-1.1/wavs/LJ018-0098.wav|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others. 8 | LJSpeech-1.1/wavs/LJ047-0044.wav|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies 9 | LJSpeech-1.1/wavs/LJ031-0038.wav|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery. 10 | LJSpeech-1.1/wavs/LJ048-0194.wav|during the morning of November twenty-two prior to the motorcade. 11 | LJSpeech-1.1/wavs/LJ049-0026.wav|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President. 12 | LJSpeech-1.1/wavs/LJ004-0152.wav|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four. 13 | LJSpeech-1.1/wavs/LJ008-0278.wav|or theirs might be one of many, and it might be considered necessary to "make an example." 14 | LJSpeech-1.1/wavs/LJ043-0002.wav|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald: 15 | LJSpeech-1.1/wavs/LJ009-0114.wav|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here. 16 | LJSpeech-1.1/wavs/LJ028-0506.wav|A modern artist would have difficulty in doing such accurate work. 17 | LJSpeech-1.1/wavs/LJ050-0168.wav|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area 18 | LJSpeech-1.1/wavs/LJ039-0223.wav|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon 19 | LJSpeech-1.1/wavs/LJ029-0032.wav|According to O'Donnell, quote, we had a motorcade wherever we went, end quote. 20 | LJSpeech-1.1/wavs/LJ031-0070.wav|Dr. Clark, who most closely observed the head wound, 21 | LJSpeech-1.1/wavs/LJ034-0198.wav|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window. 
22 | LJSpeech-1.1/wavs/LJ026-0068.wav|Energy enters the plant, to a small extent, 23 | LJSpeech-1.1/wavs/LJ039-0075.wav|once you know that you must put the crosshairs on the target and that is all that is necessary. 24 | LJSpeech-1.1/wavs/LJ004-0096.wav|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized 25 | LJSpeech-1.1/wavs/LJ005-0014.wav|Speaking on a debate on prison matters, he declared that 26 | LJSpeech-1.1/wavs/LJ012-0161.wav|he was reported to have fallen away to a shadow. 27 | LJSpeech-1.1/wavs/LJ018-0239.wav|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to 28 | LJSpeech-1.1/wavs/LJ019-0257.wav|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines. 29 | LJSpeech-1.1/wavs/LJ028-0008.wav|you tap gently with your heel upon the shoulder of the dromedary to urge her on. 30 | LJSpeech-1.1/wavs/LJ024-0083.wav|This plan of mine is no attack on the Court; 31 | LJSpeech-1.1/wavs/LJ042-0129.wav|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough. 32 | LJSpeech-1.1/wavs/LJ036-0103.wav|The police asked him whether he could pick out his passenger from the lineup. 33 | LJSpeech-1.1/wavs/LJ046-0058.wav|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles. 34 | LJSpeech-1.1/wavs/LJ014-0076.wav|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive. 35 | LJSpeech-1.1/wavs/LJ002-0043.wav|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen, 36 | LJSpeech-1.1/wavs/LJ009-0076.wav|We come to the sermon. 37 | LJSpeech-1.1/wavs/LJ017-0131.wav|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution. 38 | LJSpeech-1.1/wavs/LJ046-0184.wav|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. 39 | LJSpeech-1.1/wavs/LJ014-0263.wav|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 40 | LJSpeech-1.1/wavs/LJ042-0096.wav|(old exchange rate) in addition to his factory salary of approximately equal amount 41 | LJSpeech-1.1/wavs/LJ049-0050.wav|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. 42 | LJSpeech-1.1/wavs/LJ019-0186.wav|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties, 43 | LJSpeech-1.1/wavs/LJ028-0307.wav|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand. 44 | LJSpeech-1.1/wavs/LJ012-0235.wav|While they were in a state of insensibility the murder was committed. 45 | LJSpeech-1.1/wavs/LJ034-0053.wav|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. 46 | LJSpeech-1.1/wavs/LJ014-0030.wav|These were damnatory facts which well supported the prosecution. 47 | LJSpeech-1.1/wavs/LJ015-0203.wav|but were the precautions too minute, the vigilance too close to be eluded or overcome? 48 | LJSpeech-1.1/wavs/LJ028-0093.wav|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters. 
49 | LJSpeech-1.1/wavs/LJ002-0018.wav|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London, 50 | LJSpeech-1.1/wavs/LJ028-0275.wav|At last, in the twentieth month, 51 | LJSpeech-1.1/wavs/LJ012-0042.wav|which he kept concealed in a hiding-place with a trap-door just under his bed. 52 | LJSpeech-1.1/wavs/LJ011-0096.wav|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm, 53 | LJSpeech-1.1/wavs/LJ036-0077.wav|Roger D. Craig, a deputy sheriff of Dallas County, 54 | LJSpeech-1.1/wavs/LJ016-0318.wav|Other officials, great lawyers, governors of prisons, and chaplains supported this view. 55 | LJSpeech-1.1/wavs/LJ013-0164.wav|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning. 56 | LJSpeech-1.1/wavs/LJ027-0141.wav|is closely reproduced in the life-history of existing deer. Or, in other words, 57 | LJSpeech-1.1/wavs/LJ028-0335.wav|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands. 58 | LJSpeech-1.1/wavs/LJ031-0202.wav|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy. 59 | LJSpeech-1.1/wavs/LJ021-0145.wav|From those willing to join in establishing this hoped-for period of peace, 60 | LJSpeech-1.1/wavs/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. 61 | LJSpeech-1.1/wavs/LJ028-0081.wav|Years later, when the archaeologists could readily distinguish the false from the true, 62 | LJSpeech-1.1/wavs/LJ018-0081.wav|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him, 63 | LJSpeech-1.1/wavs/LJ021-0066.wav|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits 64 | LJSpeech-1.1/wavs/LJ009-0238.wav|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail. 65 | LJSpeech-1.1/wavs/LJ005-0079.wav|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders. 66 | LJSpeech-1.1/wavs/LJ035-0019.wav|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal. 67 | LJSpeech-1.1/wavs/LJ036-0174.wav|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there. 68 | LJSpeech-1.1/wavs/LJ046-0146.wav|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files 69 | LJSpeech-1.1/wavs/LJ017-0044.wav|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator. 70 | LJSpeech-1.1/wavs/LJ017-0070.wav|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash. 71 | LJSpeech-1.1/wavs/LJ014-0020.wav|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 72 | LJSpeech-1.1/wavs/LJ016-0020.wav|He never reached the cistern, but fell back into the yard, injuring his legs severely. 73 | LJSpeech-1.1/wavs/LJ045-0230.wav|when he was finally apprehended in the Texas Theatre. 
Although it is not fully corroborated by others who were present, 74 | LJSpeech-1.1/wavs/LJ035-0129.wav|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. 75 | LJSpeech-1.1/wavs/LJ008-0307.wav|afterwards express a wish to murder the Recorder for having kept them so long in suspense. 76 | LJSpeech-1.1/wavs/LJ008-0294.wav|nearly indefinitely deferred. 77 | LJSpeech-1.1/wavs/LJ047-0148.wav|On October twenty-five, 78 | LJSpeech-1.1/wavs/LJ008-0111.wav|They entered a "stone cold room," and were presently joined by the prisoner. 79 | LJSpeech-1.1/wavs/LJ034-0042.wav|that he could only testify with certainty that the print was less than three days old. 80 | LJSpeech-1.1/wavs/LJ037-0234.wav|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, 81 | LJSpeech-1.1/wavs/LJ040-0002.wav|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one. 82 | LJSpeech-1.1/wavs/LJ045-0140.wav|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved 83 | LJSpeech-1.1/wavs/LJ012-0035.wav|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands. 84 | LJSpeech-1.1/wavs/LJ012-0250.wav|On the seventh July, eighteen thirty-seven, 85 | LJSpeech-1.1/wavs/LJ016-0179.wav|contracted with sheriffs and conveners to work by the job. 86 | LJSpeech-1.1/wavs/LJ016-0138.wav|at a distance from the prison. 87 | LJSpeech-1.1/wavs/LJ027-0052.wav|These principles of homology are essential to a correct interpretation of the facts of morphology. 88 | LJSpeech-1.1/wavs/LJ031-0134.wav|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally. 89 | LJSpeech-1.1/wavs/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. 90 | LJSpeech-1.1/wavs/LJ014-0110.wav|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects. 91 | LJSpeech-1.1/wavs/LJ034-0160.wav|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle. 92 | LJSpeech-1.1/wavs/LJ038-0199.wav|eleven. If I am alive and taken prisoner, 93 | LJSpeech-1.1/wavs/LJ014-0010.wav|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came. 94 | LJSpeech-1.1/wavs/LJ033-0047.wav|I noticed when I went out that the light was on, end quote, 95 | LJSpeech-1.1/wavs/LJ040-0027.wav|He was never satisfied with anything. 96 | LJSpeech-1.1/wavs/LJ048-0228.wav|and others who were present say that no agent was inebriated or acted improperly. 97 | LJSpeech-1.1/wavs/LJ003-0111.wav|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. 98 | LJSpeech-1.1/wavs/LJ008-0258.wav|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days, 99 | LJSpeech-1.1/wavs/LJ029-0022.wav|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston. 100 | LJSpeech-1.1/wavs/LJ004-0045.wav|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce. 
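The two filelists carry identical transcripts and differ only in whether the first field names a precomputed mel tensor or a source wav; train.py's `--load-mel-from-disk` flag (further below) selects between the two representations. A hedged sketch of the switch this implies — the real logic lives in tacotron2/data_function.py (not included in this dump), and `get_mel`/`compute_mel_from_wav` are hypothetical stand-ins:

import torch

def get_mel(path, load_mel_from_disk, compute_mel_from_wav=None):
    if load_mel_from_disk:
        # Precomputed features, e.g. LJSpeech-1.1/mels/LJ022-0023.pt
        return torch.load(path)
    # Otherwise derive the mel spectrogram from the waveform on the fly
    return compute_mel_from_wav(path)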
101 | -------------------------------------------------------------------------------- /dllogger/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 MLBenchmark Group. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | # 16 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 17 | # 18 | # Licensed under the Apache License, Version 2.0 (the "License"); 19 | # you may not use this file except in compliance with the License. 20 | # You may obtain a copy of the License at 21 | # 22 | # http://www.apache.org/licenses/LICENSE-2.0 23 | # 24 | # Unless required by applicable law or agreed to in writing, software 25 | # distributed under the License is distributed on an "AS IS" BASIS, 26 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | # See the License for the specific language governing permissions and 28 | # limitations under the License. 29 | 30 | import time 31 | import json 32 | import logging 33 | import os 34 | import inspect 35 | import sys 36 | import re 37 | from contextlib import contextmanager 38 | import functools 39 | from collections import OrderedDict 40 | 41 | NVLOGGER_NAME = 'nv_logger' 42 | NVLOGGER_VERSION = '0.2.2' 43 | NVLOGGER_TOKEN = ':::NVLOG' 44 | 45 | MLPERF_NAME = 'mlperf_logger' 46 | MLPERF_VERSION = '0.5.0' 47 | MLPERF_TOKEN = ':::MLP' 48 | 49 | DEFAULT_JSON_FILENAME = 'nvlog.json' 50 | 51 | RUN_SCOPE = 0 52 | EPOCH_SCOPE = 1 53 | TRAIN_ITER_SCOPE = 2 54 | 55 | _data = OrderedDict([ 56 | ('model', None), 57 | ('epoch', -1), 58 | ('iteration', -1), 59 | ('total_iteration', -1), 60 | ('metrics', OrderedDict()), 61 | ('timed_blocks', OrderedDict()), 62 | ('current_scope', RUN_SCOPE) 63 | ]) 64 | 65 | def get_caller(stack_index=2, root_dir=None): 66 | caller = inspect.getframeinfo(inspect.stack()[stack_index][0]) 67 | 68 | # Trim the file names for readability. 
69 | filename = caller.filename 70 | if root_dir is not None: 71 | filename = re.sub("^" + root_dir + "/", "", filename) 72 | return "%s:%d" % (filename, caller.lineno) 73 | 74 | class StandardMeter(object): 75 | 76 | def __init__(self): 77 | self.reset() 78 | 79 | def reset(self): 80 | self.value = 0 81 | 82 | def record(self, value): 83 | self.value = value 84 | 85 | def get_value(self): 86 | return self.value 87 | 88 | def get_last(self): 89 | return self.value 90 | 91 | class AverageMeter(object): 92 | 93 | def __init__(self): 94 | self.reset() 95 | 96 | def reset(self): 97 | self.n = 0 98 | self.value = 0 99 | self.last = 0 100 | 101 | def record(self, value, n=1): 102 | self.last = value 103 | self.n += n 104 | self.value += value * n 105 | 106 | def get_value(self): 107 | return self.value / self.n 108 | 109 | def get_last(self): 110 | return self.last 111 | 112 | class JsonBackend(object): 113 | 114 | def __init__(self, log_file=DEFAULT_JSON_FILENAME, logging_scope=TRAIN_ITER_SCOPE, 115 | iteration_interval=1): 116 | self.log_file = log_file 117 | self.logging_scope = logging_scope 118 | self.iteration_interval = iteration_interval 119 | 120 | self.json_log = OrderedDict([ 121 | ('run', OrderedDict()), 122 | ('epoch', OrderedDict()), 123 | ('iter', OrderedDict()), 124 | ('event', OrderedDict()), 125 | ]) 126 | 127 | self.json_log['epoch']['x'] = [] 128 | if self.logging_scope == TRAIN_ITER_SCOPE: 129 | self.json_log['iter']['x'] = [[]] 130 | 131 | def register_metric(self, key, metric_scope): 132 | if (metric_scope == TRAIN_ITER_SCOPE and 133 | self.logging_scope == TRAIN_ITER_SCOPE): 134 | if key not in self.json_log['iter']: 135 | self.json_log['iter'][key] = [[]] 136 | if metric_scope == EPOCH_SCOPE: 137 | if key not in self.json_log['epoch']: 138 | self.json_log['epoch'][key] = [] 139 | 140 | def log(self, key, value): 141 | if _data['current_scope'] == RUN_SCOPE: 142 | self.json_log['run'][key] = value 143 | elif _data['current_scope'] == EPOCH_SCOPE: 144 | pass 145 | elif _data['current_scope'] == TRAIN_ITER_SCOPE: 146 | pass 147 | else: 148 | raise ValueError('log function for scope "%s" not implemented' 149 | % _data['current_scope']) 150 | 151 | def log_event(self, key, value): 152 | if key not in self.json_log['event']: 153 | self.json_log['event'][key] = [] 154 | entry = OrderedDict() 155 | entry['epoch'] = _data['epoch'] 156 | entry['iter'] = _data['iteration'] 157 | entry['timestamp'] = time.time() 158 | if value: 159 | entry['value'] = value 160 | self.json_log['event'][key].append(entry) 161 | 162 | def log_iteration_summary(self): 163 | if (self.logging_scope == TRAIN_ITER_SCOPE and 164 | _data['total_iteration'] % self.iteration_interval == 0): 165 | for key, m in _data['metrics'].items(): 166 | if m.metric_scope == TRAIN_ITER_SCOPE: 167 | self.json_log['iter'][key][-1].append(m.get_last()) 168 | 169 | # log x for iteration number 170 | self.json_log['iter']['x'][-1].append(_data['iteration']) 171 | 172 | 173 | def dump_json(self): 174 | if self.log_file is None: 175 | print(json.dumps(self.json_log, indent=4)) 176 | else: 177 | with open(self.log_file, 'w') as f: 178 | json.dump(self.json_log, fp=f, indent=4) 179 | 180 | def log_epoch_summary(self): 181 | for key, m in _data['metrics'].items(): 182 | if m.metric_scope == EPOCH_SCOPE: 183 | self.json_log['epoch'][key].append(m.get_value()) 184 | elif (m.metric_scope == TRAIN_ITER_SCOPE and 185 | self.logging_scope == TRAIN_ITER_SCOPE): 186 | # create new sublists for each iter metric in the
next epoch 187 | self.json_log['iter'][key].append([]) 188 | 189 | # log x for epoch number 190 | self.json_log['epoch']['x'].append(_data['epoch']) 191 | 192 | # create new sublist for iter's x in the next epoch 193 | if self.logging_scope == TRAIN_ITER_SCOPE: 194 | self.json_log['iter']['x'].append([]) 195 | 196 | self.dump_json() 197 | 198 | def timed_block_start(self, name): 199 | pass 200 | 201 | def timed_block_stop(self, name): 202 | pass 203 | 204 | def finish(self): 205 | self.dump_json() 206 | 207 | class _ParentStdOutBackend(object): 208 | 209 | def __init__(self, name, token, version, log_file, logging_scope, iteration_interval): 210 | 211 | self.root_dir = None 212 | self.worker = [0] 213 | self.prefix = '' 214 | 215 | self.name = name 216 | self.token = token 217 | self.version = version 218 | self.log_file = log_file 219 | self.logging_scope = logging_scope 220 | self.iteration_interval = iteration_interval 221 | 222 | self.logger = logging.getLogger(self.name) 223 | self.logger.setLevel(logging.DEBUG) 224 | self.logger.handlers = [] 225 | 226 | if (self.log_file == None): 227 | self.stream_handler = logging.StreamHandler(stream=sys.stdout) 228 | self.stream_handler.setLevel(logging.DEBUG) 229 | self.logger.addHandler(self.stream_handler) 230 | else: 231 | self.file_handler = logging.FileHandler(self.log_file, mode='w') 232 | self.file_handler.setLevel(logging.DEBUG) 233 | self.logger.addHandler(self.file_handler) 234 | 235 | def register_metric(self, key, meter=None, metric_scope=EPOCH_SCOPE): 236 | pass 237 | 238 | def log_epoch_summary(self): 239 | pass 240 | 241 | def log_iteration_summary(self): 242 | pass 243 | 244 | def log(self, key, value): 245 | if _data['current_scope'] > self.logging_scope: 246 | pass 247 | elif (_data['current_scope'] == TRAIN_ITER_SCOPE and 248 | _data['total_iteration'] % self.iteration_interval != 0): 249 | pass 250 | else: 251 | self.log_stdout(key, value) 252 | 253 | def log_event(self, key, value): 254 | self.log_stdout(key, value) 255 | 256 | def log_stdout(self, key, value=None, forced=False): 257 | # TODO: worker 0 258 | # only the 0-worker will log 259 | #if not forced and self.worker != 0: 260 | # pass 261 | 262 | if value is None: 263 | msg = key 264 | else: 265 | str_json = json.dumps(value) 266 | msg = f'{key}: {str_json}' 267 | 268 | call_site = get_caller(root_dir=self.root_dir) 269 | now = time.time() 270 | 271 | model=_data['model'] 272 | message = f'{self.prefix}{self.token}v{self.version} {model} {now:.5f} ({call_site}) {msg}' 273 | self.logger.debug(message) 274 | 275 | def timed_block_start(self, name): 276 | self.log_stdout(key=name + "_start") 277 | 278 | def timed_block_stop(self, name): 279 | self.log_stdout(key=name + "_stop") 280 | 281 | def finish(self): 282 | pass 283 | 284 | class StdOutBackend(_ParentStdOutBackend): 285 | 286 | def __init__(self, log_file=None, logging_scope=EPOCH_SCOPE, iteration_interval=1): 287 | _ParentStdOutBackend.__init__(self, name=NVLOGGER_NAME, token=NVLOGGER_TOKEN, 288 | version=NVLOGGER_VERSION, log_file=log_file, logging_scope=logging_scope, 289 | iteration_interval=iteration_interval) 290 | 291 | class MLPerfBackend(_ParentStdOutBackend): 292 | 293 | def __init__(self, log_file=None, logging_scope=TRAIN_ITER_SCOPE, iteration_interval=1): 294 | _ParentStdOutBackend.__init__(self, name=MLPERF_NAME, token=MLPERF_TOKEN, 295 | version=MLPERF_VERSION, log_file=log_file, logging_scope=logging_scope, 296 | iteration_interval=iteration_interval) 297 | 298 | class _Logger(object): 299 | def 
__init__(self): 300 | 301 | self.backends = [ 302 | StdOutBackend(), 303 | JsonBackend() 304 | ] 305 | 306 | def set_model_name(self, name): 307 | _data['model'] = name 308 | 309 | 310 | def set_backends(self, backends): 311 | self.backends = backends 312 | 313 | 314 | def register_metric(self, key, meter=None, metric_scope=EPOCH_SCOPE): 315 | if meter is None: 316 | meter = StandardMeter() 317 | #TODO: move to argument of Meter? 318 | meter.metric_scope = metric_scope 319 | _data['metrics'][key] = meter 320 | for b in self.backends: 321 | b.register_metric(key, metric_scope) 322 | 323 | def log(self, key, value=None, forced=False): 324 | if _data['current_scope'] == TRAIN_ITER_SCOPE or _data['current_scope'] == EPOCH_SCOPE: 325 | if key in _data['metrics'].keys(): 326 | if _data['metrics'][key].metric_scope == _data['current_scope']: 327 | _data['metrics'][key].record(value) 328 | for b in self.backends: 329 | b.log(key, value) 330 | 331 | def log_event(self, key, value=None): 332 | for b in self.backends: 333 | b.log_event(key, value) 334 | 335 | def timed_block_start(self, name): 336 | if name not in _data['timed_blocks']: 337 | _data['timed_blocks'][name] = OrderedDict() 338 | _data['timed_blocks'][name]['start'] = time.time() 339 | for b in self.backends: 340 | b.timed_block_start(name) 341 | 342 | def timed_block_stop(self, name): 343 | if name not in _data['timed_blocks']: 344 | raise ValueError('timed_block_stop called before timed_block_start for ' + name) 345 | _data['timed_blocks'][name]['stop'] = time.time() 346 | delta = _data['timed_blocks'][name]['stop'] - _data['timed_blocks'][name]['start'] 347 | self.log(name + '_time', delta) 348 | for b in self.backends: 349 | b.timed_block_stop(name) 350 | 351 | def iteration_start(self): 352 | _data['current_scope'] = TRAIN_ITER_SCOPE 353 | _data['iteration'] += 1 354 | _data['total_iteration'] += 1 355 | 356 | 357 | def iteration_stop(self): 358 | for b in self.backends: 359 | b.log_iteration_summary() 360 | _data['current_scope'] = EPOCH_SCOPE 361 | 362 | def epoch_start(self): 363 | _data['current_scope'] = EPOCH_SCOPE 364 | _data['epoch'] += 1 365 | _data['iteration'] = -1 366 | 367 | for n, m in _data['metrics'].items(): 368 | if m.metric_scope == TRAIN_ITER_SCOPE: 369 | m.reset() 370 | 371 | def epoch_stop(self): 372 | for b in self.backends: 373 | b.log_epoch_summary() 374 | _data['current_scope'] = RUN_SCOPE 375 | 376 | def finish(self): 377 | for b in self.backends: 378 | b.finish() 379 | 380 | def iteration_generator_wrapper(self, gen): 381 | for g in gen: 382 | self.iteration_start() 383 | yield g 384 | self.iteration_stop() 385 | 386 | def epoch_generator_wrapper(self, gen): 387 | for g in gen: 388 | self.epoch_start() 389 | yield g 390 | self.epoch_stop() 391 | 392 | LOGGER = _Logger() 393 | 394 | @contextmanager 395 | def timed_block(prefix, value=None, logger=LOGGER, forced=False): 396 | """ This function helps with timed blocks 397 | ---- 398 | Parameters: 399 | prefix - name of the action/block to be timed 400 | logger - NVLogger object 401 | forced - if True then the events are always logged (even if they would otherwise be skipped) 402 | """ 403 | if logger is None: 404 | yield None 405 | return 406 | logger.timed_block_start(prefix) 407 | yield logger 408 | logger.timed_block_stop(prefix) 409 | def timed_function(prefix, variable=None, forced=False): 410 | """ This decorator helps with timed functions 411 | ---- 412 | Parameters: 413 | prefix - name of the action to be timed 414 | logger - NVLogger
object, read from the decorated function's 'logger' kwarg (defaults to LOGGER) 415 | forced - if True then the events are always logged (even if they would otherwise be skipped) 416 | """ 417 | def timed_function_decorator(func): 418 | @functools.wraps(func) 419 | def wrapper(*args, **kwargs): 420 | logger = kwargs.get('logger', LOGGER) 421 | value = kwargs.get(variable, next(iter(args), None)) 422 | with timed_block(prefix=prefix, logger=logger, value=value, forced=forced): 423 | return func(*args, **kwargs) 424 | return wrapper 425 | return timed_function_decorator 426 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | # 26 | # ***************************************************************************** 27 | 28 | import os 29 | import time 30 | import argparse 31 | import numpy as np 32 | from common.utils import cosine_decay 33 | from contextlib import contextmanager 34 | from datetime import datetime 35 | from plot import plot_alignment 36 | import torch 37 | from torch.utils.data import DataLoader 38 | from torch.utils.tensorboard import SummaryWriter 39 | 40 | import torch.distributed as dist 41 | 42 | from apex import amp 43 | from apex.parallel import DistributedDataParallel as DDP 44 | 45 | from tacotron2.loader import parse_tacotron2_args 46 | from tacotron2.loader import get_tacotron2_model 47 | from tacotron2.loss_function import Tacotron2Loss 48 | from tacotron2.data_function import TextMelCollate 49 | from tacotron2.data_function import TextMelDataset 50 | from tacotron2.data_function import batch_to_gpu 51 | from dllogger.logger import LOGGER 52 | import dllogger.logger as dllg 53 | from dllogger import tags 54 | from dllogger.autologging import log_hardware, log_args 55 | from scipy.io.wavfile import write as write_wav 56 | 57 | 58 | def parse_training_args(parser): 59 | """ 60 | Parse commandline arguments. 61 | """ 62 | 63 | parser.add_argument('-o', '--output_dir', type=str, default='logs', required=True, help='Directory to save checkpoints') 64 | parser.add_argument('-d', '--dataset-path', type=str, default='filelists', help='Path to dataset') 65 | parser.add_argument('--log-file', type=str, default='nvlog.json', help='Filename for logging') 66 | parser.add_argument('--latest-checkpoint-file', type=str, default='checkpoint_latest.pt', help='Store the latest checkpoint in each epoch') 67 | parser.add_argument('--phrase-path', type=str, default=None, help='Path to phrase sequence file used for sample generation') 68 | parser.add_argument('--tacotron2-checkpoint', type=str, default=None, help='Path to pre-trained Tacotron2 checkpoint for sample generation') 69 | 70 | # training 71 | training = parser.add_argument_group('training setup') 72 | training.add_argument('--epochs', type=int, default=500, help='Number of total epochs to run') 73 | training.add_argument('--epochs-per-alignment', type=int, default=1, help='Number of epochs per alignment') 74 | training.add_argument('--epochs-per-checkpoint', type=int, default=50, help='Number of epochs per checkpoint') 75 | training.add_argument('--seed', type=int, default=1234, help='Seed for PyTorch random number generators') 76 | training.add_argument('--dynamic-loss-scaling', type=bool, default=True, help='Enable dynamic loss scaling') 77 | training.add_argument('--amp-run', action='store_true', help='Enable AMP') 78 | training.add_argument('--cudnn-enabled', default=True, help='Enable cudnn') 79 | training.add_argument('--cudnn-benchmark', default=True, help='Run cudnn benchmark') 80 | training.add_argument('--disable-uniform-initialize-bn-weight', action='store_true', help='disable uniform initialization of batchnorm layer weight') 81 | 82 | optimization = parser.add_argument_group('optimization setup') 83 | optimization.add_argument('--use-saved-learning-rate', default=False, type=bool) 84 | optimization.add_argument('--init-lr', '--initial-learning-rate', default=1e-3, type=float, help='Initial learning rate') 85 | optimization.add_argument('--final-lr', '--final-learning-rate', default=1e-5, type=float, help='Final learning rate') 86 | optimization.add_argument('--weight-decay', default=1e-6, type=float, help='Weight decay') 87 |
optimization.add_argument('--grad-clip-thresh', default=1.0, type=float, help='Clip threshold for gradients') 88 | optimization.add_argument('-bs', '--batch-size', default=32, type=int, help='Batch size per GPU') 89 | 90 | # dataset parameters 91 | dataset = parser.add_argument_group('dataset parameters') 92 | dataset.add_argument('--load-mel-from-disk', action='store_true', help='Loads mel spectrograms from disk instead of computing them on the fly') 93 | dataset.add_argument('--training-anchor-dirs', default=['ljs_mel_text_train_filelist.txt'], type=str, nargs='*', help='Path to training filelist') 94 | dataset.add_argument('--validation-anchor-dirs', default=['ljs_mel_text_val_filelist.txt'], type=str, nargs='*', help='Path to validation filelist') 95 | dataset.add_argument('--text-cleaners', nargs='*', default=['basic_cleaners'], type=str, help='Type of text cleaners for input text') 96 | 97 | # audio parameters 98 | audio = parser.add_argument_group('audio parameters') 99 | audio.add_argument('--max-wav-value', default=32768.0, type=float, help='Maximum audiowave value') 100 | audio.add_argument('--sampling-rate', default=22050, type=int, help='Sampling rate') 101 | audio.add_argument('--filter-length', default=1024, type=int, help='Filter length') 102 | audio.add_argument('--hop-length', default=256, type=int, help='Hop (stride) length') 103 | audio.add_argument('--win-length', default=1024, type=int, help='Window length') 104 | audio.add_argument('--mel-fmin', default=50.0, type=float, help='Minimum mel frequency') 105 | audio.add_argument('--mel-fmax', default=7600.0, type=float, help='Maximum mel frequency') 106 | 107 | distributed = parser.add_argument_group('distributed setup') 108 | distributed.add_argument('--distributed-run', default=False, type=bool, help='enable distributed run') 109 | distributed.add_argument('--rank', default=0, type=int, help='Rank of the process, do not set! Done by multiproc module') 110 | distributed.add_argument('--world-size', default=1, type=int, help='Number of processes, do not set! Done by multiproc module') 111 | distributed.add_argument('--dist-url', type=str, default='tcp://localhost:23456', help='Url used to set up distributed training') 112 | distributed.add_argument('--group-name', type=str, default='group_name', help='Distributed group name') 113 | distributed.add_argument('--dist-backend', default='nccl', type=str, choices={'nccl'}, help='Distributed run backend') 114 | 115 | return parser 116 | 117 | 118 | def reduce_tensor(tensor, num_gpus): 119 | rt = tensor.clone() 120 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 121 | rt /= num_gpus 122 | return rt 123 | 124 | 125 | def init_distributed(args, world_size, rank, group_name): 126 | assert torch.cuda.is_available(), "Distributed mode requires CUDA." 127 | print("Initializing Distributed") 128 | 129 | # Set cuda device so everything is done on the right GPU. 
130 | torch.cuda.set_device(rank % torch.cuda.device_count()) 131 | 132 | # Initialize distributed communication 133 | dist.init_process_group( 134 | backend=args.dist_backend, init_method=args.dist_url, 135 | world_size=world_size, rank=rank, group_name=group_name) 136 | 137 | print("Done initializing distributed") 138 | 139 | 140 | def save_eval(model, filepath, args): 141 | if args.phrase_path: 142 | phrase = torch.load(args.phrase_path, map_location='cpu') 143 | with torch.no_grad(): 144 | model.eval() 145 | mel = model.infer(phrase.cuda())[0].cpu() 146 | model.train() 147 | 148 | # audio = audio[0].numpy() 149 | # audio = audio.astype('int16') 150 | # write_wav(filepath, sampling_rate, audio) 151 | 152 | # adapted from: https://discuss.pytorch.org/t/opinion-eval-should-be-a-context-manager/18998/3 153 | # Following snippet is licensed under the MIT license 154 | 155 | 156 | @contextmanager 157 | def evaluating(model): 158 | '''Temporarily switch to evaluation mode, restoring the previous mode on exit.''' 159 | was_training = model.training 160 | try: 161 | model.eval() 162 | yield model 163 | finally: 164 | if was_training: 165 | model.train() 166 | 167 | def validate(model, criterion, valate_dataset, iteration, collate_fn, distributed_run, args): 168 | """Handles all the validation scoring and printing""" 169 | with evaluating(model), torch.no_grad(): 170 | val_loader = DataLoader(valate_dataset, num_workers=1, shuffle=False, 171 | batch_size=args.batch_size//len(args.validation_anchor_dirs), 172 | pin_memory=False, collate_fn=collate_fn) 173 | 174 | val_loss = 0.0 175 | for i, batch in enumerate(val_loader): 176 | x, y, num_frames = batch_to_gpu(batch) 177 | y_pred = model(x) 178 | loss = criterion(y_pred, y) 179 | if distributed_run: 180 | reduced_val_loss = reduce_tensor(loss.data, args.world_size).item() 181 | else: 182 | reduced_val_loss = loss.item() 183 | val_loss += reduced_val_loss 184 | val_loss = val_loss / (i + 1) 185 | 186 | LOGGER.log(key="val_iter_loss", value=val_loss) 187 | 188 | 189 | def adjust_learning_rate(optimizer, epoch, args): 190 | lr = cosine_decay(args.init_lr, args.final_lr, epoch, args.epochs) 191 | 192 | if optimizer.param_groups[0]['lr'] != lr: 193 | LOGGER.log_event("learning_rate changed", 194 | value=str(optimizer.param_groups[0]['lr']) + " -> " + str(lr)) 195 | 196 | for param_group in optimizer.param_groups: 197 | param_group['lr'] = lr 198 | 199 | 200 | def main(): 201 | 202 | parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training') 203 | parser = parse_training_args(parser) 204 | args, _ = parser.parse_known_args() 205 | 206 | LOGGER.set_model_name("Tacotron2_PyT") 207 | LOGGER.set_backends([ 208 | dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), 209 | dllg.JsonBackend(log_file=os.path.join(args.output_dir, args.log_file) if args.rank == 0 else None, 210 | logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) 211 | ]) 212 | 213 | LOGGER.timed_block_start("run") 214 | LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS, metric_scope=dllg.TRAIN_ITER_SCOPE) 215 | LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE) 216 | LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE) 217 | LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE) 218 | LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE) 219 | LOGGER.register_metric("train_epoch_frames/sec", metric_scope=dllg.EPOCH_SCOPE) 220 | LOGGER.register_metric("train_epoch_avg_frames/sec", metric_scope=dllg.EPOCH_SCOPE) 221 |
LOGGER.register_metric("train_epoch_avg_loss", metric_scope=dllg.EPOCH_SCOPE) 222 | 223 | log_hardware() 224 | 225 | parser = parse_tacotron2_args(parser) 226 | args = parser.parse_args() 227 | 228 | log_args(args) 229 | 230 | torch.backends.cudnn.enabled = args.cudnn_enabled 231 | torch.backends.cudnn.benchmark = args.cudnn_benchmark 232 | 233 | distributed_run = args.world_size > 1 234 | if distributed_run: 235 | init_distributed(args, args.world_size, args.rank, args.group_name) 236 | 237 | os.makedirs(args.output_dir, exist_ok=True) 238 | 239 | LOGGER.log(key=tags.RUN_START) 240 | run_start_time = time.time() 241 | 242 | model = get_tacotron2_model(args, len(args.training_anchor_dirs), is_training=True) 243 | 244 | if not args.amp_run and distributed_run: 245 | model = DDP(model) 246 | 247 | model.restore_checkpoint(os.path.join(args.output_dir, args.latest_checkpoint_file)) 248 | 249 | optimizer = torch.optim.Adam(model.parameters(), lr=args.init_lr, weight_decay=args.weight_decay) 250 | 251 | writer = SummaryWriter(args.output_dir) 252 | 253 | if args.amp_run: 254 | model, optimizer = amp.initialize(model, optimizer, opt_level='O1') 255 | if distributed_run: 256 | model = DDP(model) 257 | 258 | criterion = Tacotron2Loss() 259 | 260 | collate_fn = TextMelCollate(args) 261 | train_dataset = TextMelDataset(args, args.training_anchor_dirs) 262 | train_loader = DataLoader(train_dataset, num_workers=2, shuffle=False, 263 | batch_size=args.batch_size//len(args.training_anchor_dirs), 264 | pin_memory=False, drop_last=True, collate_fn=collate_fn) 265 | # valate_dataset = TextMelDataset(args, args.validation_anchor_dirs) 266 | 267 | model.train() 268 | 269 | elapsed_epochs = model.get_elapsed_epochs() 270 | epochs = args.epochs - elapsed_epochs 271 | iteration = elapsed_epochs * len(train_loader) 272 | 273 | LOGGER.log(key=tags.TRAIN_LOOP) 274 | 275 | for epoch in range(1, epochs + 1): 276 | LOGGER.epoch_start() 277 | epoch_start_time = time.time() 278 | epoch += elapsed_epochs 279 | LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch) 280 | 281 | # used to calculate avg frames/sec over epoch 282 | reduced_num_frames_epoch = 0 283 | 284 | # used to calculate avg loss over epoch 285 | train_epoch_avg_loss = 0.0 286 | train_epoch_avg_frames_per_sec = 0.0 287 | num_iters = 0 288 | 289 | adjust_learning_rate(optimizer, epoch, args) 290 | 291 | for i, batch in enumerate(train_loader): 292 | print(f"Batch: {i}/{len(train_loader)} epoch {epoch}") 293 | LOGGER.iteration_start() 294 | iter_start_time = time.time() 295 | LOGGER.log(key=tags.TRAIN_ITER_START, value=i) 296 | 297 | # start = time.perf_counter() 298 | 299 | optimizer.zero_grad() 300 | x, y, num_frames = batch_to_gpu(batch) 301 | 302 | outputs = model(x) 303 | y_pred = [output.cpu() for output in outputs] 304 | 305 | loss = criterion(y_pred, y) 306 | 307 | if distributed_run: 308 | reduced_loss = reduce_tensor(loss.data, args.world_size).item() 309 | reduced_num_frames = reduce_tensor(num_frames.data, 1).item() 310 | else: 311 | reduced_loss = loss.item() 312 | reduced_num_frames = num_frames.item() 313 | 314 | if np.isnan(reduced_loss): 315 | raise Exception("loss is NaN") 316 | 317 | LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss) 318 | 319 | train_epoch_avg_loss += reduced_loss 320 | num_iters += 1 321 | 322 | # accumulate number of frames processed in this epoch 323 | reduced_num_frames_epoch += reduced_num_frames 324 | 325 | if args.amp_run: 326 | with amp.scale_loss(loss, optimizer) as scaled_loss: 327 | 
314 |             if np.isnan(reduced_loss):
315 |                 raise ValueError("loss is NaN")
316 | 
317 |             LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)
318 | 
319 |             train_epoch_avg_loss += reduced_loss
320 |             num_iters += 1
321 | 
322 |             # accumulate number of frames processed in this epoch
323 |             reduced_num_frames_epoch += reduced_num_frames
324 | 
325 |             if args.amp_run:
326 |                 with amp.scale_loss(loss, optimizer) as scaled_loss:  # scale so fp16 grads survive backward()
327 |                     scaled_loss.backward()
328 |                 grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_thresh)  # clip fp32 master grads
329 |             else:
330 |                 loss.backward()
331 |                 grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_thresh)
332 | 
333 |             optimizer.step()
334 | 
335 |             iteration += 1
336 | 
337 |             writer.add_scalar('Training/Loss', reduced_loss, iteration)
338 | 
339 |             LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)
340 | 
341 |             iter_stop_time = time.time()
342 |             iter_time = iter_stop_time - iter_start_time
343 |             frames_per_sec = reduced_num_frames/iter_time
344 |             train_epoch_avg_frames_per_sec += frames_per_sec
345 | 
346 |             LOGGER.log(key="train_iter_frames/sec", value=frames_per_sec)
347 |             LOGGER.log(key="iter_time", value=iter_time)
348 |             LOGGER.iteration_stop()
349 | 
350 |         LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
351 |         epoch_stop_time = time.time()
352 |         epoch_time = epoch_stop_time - epoch_start_time
353 | 
354 |         LOGGER.log(key="train_epoch_frames/sec", value=(reduced_num_frames_epoch/epoch_time))
355 |         LOGGER.log(key="train_epoch_avg_frames/sec", value=(train_epoch_avg_frames_per_sec/num_iters if num_iters > 0 else 0.0))
356 |         LOGGER.log(key="train_epoch_avg_loss", value=(train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0))
357 |         LOGGER.log(key="epoch_time", value=epoch_time)
358 | 
359 |         LOGGER.log(key=tags.EVAL_START, value=epoch)
360 | 
361 |         # validate(model, criterion, val_dataset, iteration, collate_fn, distributed_run, args)
362 | 
363 |         LOGGER.log(key=tags.EVAL_STOP, value=epoch)
364 | 
365 |         # Store latest checkpoint in each epoch
366 |         model.elapse_epoch()
367 |         checkpoint_path = os.path.join(args.output_dir, args.latest_checkpoint_file)
368 |         model.save_checkpoint(checkpoint_path)
369 | 
370 |         # Plot alignment
371 |         if epoch % args.epochs_per_alignment == 0 and args.rank == 0:
372 |             alignments = y_pred[3].data.numpy()
373 |             index = np.random.randint(len(alignments))
374 |             plot_alignment(alignments[index],  # [enc_step, dec_step]
375 |                            os.path.join(args.output_dir, f"align_{epoch:04d}_{iteration}.png"),
376 |                            info=f"{datetime.now().strftime('%Y-%m-%d %H:%M')} Epoch={epoch:04d} Iteration={iteration} Average loss={train_epoch_avg_loss/num_iters:.5f}")
377 | 
378 |         # Save checkpoint
379 |         if epoch % args.epochs_per_checkpoint == 0 and args.rank == 0:
380 |             checkpoint_path = os.path.join(args.output_dir, f"checkpoint_{epoch:04d}.pt")
381 |             print(f"Saving model and optimizer state at epoch {epoch:04d} to {checkpoint_path}")
382 |             model.save_checkpoint(checkpoint_path)
383 | 
384 |         # Save evaluation
385 |         # save_sample(model, args.tacotron2_checkpoint, args.phrase_path,
386 |         #             os.path.join(args.output_dir, f"sample_{epoch:04d}_{iteration}.wav"), args.sampling_rate)
387 | 
388 |         LOGGER.epoch_stop()
389 | 
390 |     run_stop_time = time.time()
391 |     run_time = run_stop_time - run_start_time
392 |     LOGGER.log(key="run_time", value=run_time)
393 |     LOGGER.log(key=tags.RUN_FINAL)
394 | 
395 |     print("training time", run_time)
396 |     writer.close()
397 | 
398 |     LOGGER.timed_block_stop("run")
399 | 
400 |     if args.rank == 0:
401 |         LOGGER.finish()
402 | 
403 | 
404 | if __name__ == '__main__':
405 |     main()
406 | 
--------------------------------------------------------------------------------
/tacotron2/model.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | #     * Redistributions of source code must retain the above copyright
7 | #       notice, this list of conditions and the following disclaimer.
8 | #     * Redistributions in binary form must reproduce the above copyright
9 | #       notice, this list of conditions and the following disclaimer in the
10 | #       documentation and/or other materials provided with the distribution.
11 | #     * Neither the name of the NVIDIA CORPORATION nor the
12 | #       names of its contributors may be used to endorse or promote products
13 | #       derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 | 
28 | import math
29 | import os
30 | import torch
31 | from torch import nn
32 | from torch.nn import functional as F
33 | import sys
34 | from os.path import abspath, dirname
35 | # enable module discovery from the global entrypoint
36 | sys.path.append(abspath(dirname(__file__)+'/../'))
37 | from common.layers import ConvNorm, LinearNorm
38 | from common.utils import to_gpu, get_mask_from_lengths
39 | 
40 | 
41 | class LocationLayer(nn.Module):
42 |     def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
43 |         super(LocationLayer, self).__init__()
44 |         self.location_conv = ConvNorm(1, attention_n_filters,
45 |                                       kernel_size=attention_kernel_size,
46 |                                       padding=int((attention_kernel_size - 1) / 2),
47 |                                       stride=1, dilation=1)
48 |         self.location_dense = LinearNorm(attention_n_filters, attention_dim,
49 |                                          bias=False, w_init_gain='tanh')
50 | 
51 |     def forward(self, attention_weights_cum):
52 |         processed_attention_weights = self.location_conv(attention_weights_cum)
53 |         processed_attention_weights = processed_attention_weights.transpose(1, 2)
54 |         processed_attention_weights = self.location_dense(processed_attention_weights)
55 |         return processed_attention_weights
56 | 
57 | 
58 | class Attention(nn.Module):
59 |     def __init__(self, query_dim, memory_dim, attention_dim,
60 |                  attention_location_n_filters, attention_location_kernel_size):
61 |         super(Attention, self).__init__()
62 |         self.query_layer = LinearNorm(query_dim, attention_dim, w_init_gain='tanh')
63 |         self.memory_layer = LinearNorm(memory_dim, attention_dim, w_init_gain='tanh')
64 |         self.v = LinearNorm(attention_dim, 1)
65 |         self.location_layer = LocationLayer(attention_location_n_filters,
66 |                                             attention_location_kernel_size,
67 |                                             attention_dim)
68 |         self.score_mask_value = -float("inf")
69 | 
70 |     def get_alignment_energies(self, query, memory, attention_weights_cum):
71 |         """
72 |         PARAMS
73 |         ------
74 |         query: decoder output (B, decoder_dim)
75 |         memory: encoder outputs (B, T_in, embed_dim)
76 |         attention_weights_cum: cumulative attention weights (B, 1, max_time)
77 | 
78 |         RETURNS
79 |         -------
80 |         alignment (batch, max_time)
81 |         """
82 | 
83 |         # [B, T_in, attn_dim]
84 |         key = self.memory_layer(memory)
85 |         # [B, 1, attn_dim]
86 |         query = self.query_layer(query.unsqueeze(1))
87 |         # [B, T_in, attn_dim]
88 |         location_sensitive_weights = self.location_layer(attention_weights_cum)
89 |         # additive (location-sensitive) score function
90 |         energies = self.v(torch.tanh(query + location_sensitive_weights + key))
91 |         # [B, T_in]
92 |         energies = energies.squeeze(-1)
93 | 
94 |         return energies
95 | 
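    # The score above is additive (Bahdanau-style) attention extended with a
    # location term, as in Chorowski et al., "Attention-Based Models for
    # Speech Recognition"; schematically:
    #
    #     e_{t,i} = v^T tanh(W q_t + V h_i + U f_{t,i})
    #
    # where q_t is the decoder query, h_i the encoder memory, and f the
    # features convolved from the cumulative attention weights, which nudge
    # the alignment to keep moving forward over the input.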
96 |     def forward(self, query, memory, attention_weights_cum, mask=None):
97 |         """
98 |         PARAMS
99 |         ------
100 |         query: attention rnn last output [B, decoder_dim]
101 |         memory: encoder outputs [B, T_in, embed_dim]
102 |         attention_weights_cum: cumulative attention weights
103 |         mask: binary mask for padded data
104 |         """
105 |         alignment = self.get_alignment_energies(query, memory, attention_weights_cum)
106 | 
107 |         if mask is not None:
108 |             alignment.masked_fill_(mask, self.score_mask_value)
109 | 
110 |         # [B, T_in]
111 |         attention_weights = F.softmax(alignment, dim=1)
112 |         # [B, 1, T_in] * [B, T_in, embed_dim]
113 |         attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
114 |         # [B, embed_dim]
115 |         attention_context = attention_context.squeeze(1)
116 | 
117 |         return attention_context, attention_weights
118 | 
119 | 
120 | class Prenet(nn.Module):
121 |     def __init__(self, in_dim, sizes):
122 |         super(Prenet, self).__init__()
123 |         in_sizes = [in_dim] + sizes[:-1]
124 |         self.layers = nn.ModuleList(
125 |             [LinearNorm(in_size, out_size) for (in_size, out_size) in zip(in_sizes, sizes)])
126 | 
127 |     def forward(self, x, inference=False):
128 |         if inference:
129 |             for linear in self.layers:
130 |                 x = F.relu(linear(x), inplace=True)
131 |                 x0 = x[0].unsqueeze(0)
132 |                 mask = torch.bernoulli(x0.new(x0.size()).fill_(0.5))
133 |                 mask = mask.expand(x.size())
134 |                 x = x * mask * 2
135 |         else:
136 |             for linear in self.layers:
137 |                 x = F.dropout(F.relu(linear(x), inplace=True), p=0.5, training=True)
138 |         return x
139 | 
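# Note that dropout stays active in the prenet even outside training: the
# else-branch forces training=True, and the inference branch applies an
# explicit Bernoulli mask. This follows the Tacotron 2 paper, which keeps
# prenet dropout at inference to add variation to the outputs. The inference
# branch samples a single Bernoulli(0.5) mask from the first slice, expands
# it over the whole tensor, and multiplies by 2 -- the usual inverted-dropout
# rescaling for p=0.5 -- so the dropout noise is shared across the leading
# dimension.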
140 | 
141 | class Postnet(nn.Module):
142 |     """Postnet
143 |     - Five 1-d convolutions with 512 channels and kernel size 5
144 |     """
145 | 
146 |     def __init__(self, n_mel_channels, postnet_embedding_dim,
147 |                  postnet_kernel_size, postnet_n_convolutions):
148 |         super(Postnet, self).__init__()
149 |         self.convolutions = nn.ModuleList()
150 | 
151 |         self.convolutions.append(
152 |             nn.Sequential(
153 |                 ConvNorm(n_mel_channels, postnet_embedding_dim,
154 |                          kernel_size=postnet_kernel_size, stride=1,
155 |                          padding=int((postnet_kernel_size - 1) / 2),
156 |                          dilation=1, w_init_gain='tanh'),
157 |                 nn.BatchNorm1d(postnet_embedding_dim))
158 |         )
159 | 
160 |         for i in range(1, postnet_n_convolutions - 1):
161 |             self.convolutions.append(
162 |                 nn.Sequential(
163 |                     ConvNorm(postnet_embedding_dim,
164 |                              postnet_embedding_dim,
165 |                              kernel_size=postnet_kernel_size, stride=1,
166 |                              padding=int((postnet_kernel_size - 1) / 2),
167 |                              dilation=1, w_init_gain='tanh'),
168 |                     nn.BatchNorm1d(postnet_embedding_dim))
169 |             )
170 | 
171 |         self.convolutions.append(
172 |             nn.Sequential(
173 |                 ConvNorm(postnet_embedding_dim, n_mel_channels,
174 |                          kernel_size=postnet_kernel_size, stride=1,
175 |                          padding=int((postnet_kernel_size - 1) / 2),
176 |                          dilation=1, w_init_gain='linear'),
177 |                 nn.BatchNorm1d(n_mel_channels))
178 |         )
179 | 
180 |     def forward(self, x):
181 |         for i in range(len(self.convolutions) - 1):
182 |             x = torch.tanh(self.convolutions[i](x))
183 |         return self.convolutions[-1](x)
184 | 
185 | 
186 | class Encoder(nn.Module):
187 |     """Encoder module:
188 |     - Three 1-d convolution banks
189 |     - Bidirectional LSTM
190 |     """
191 |     def __init__(self, encoder_n_convolutions, encoder_embedding_dim, encoder_kernel_size):
192 |         super(Encoder, self).__init__()
193 | 
194 |         convolutions = []
195 |         for _ in range(encoder_n_convolutions):
196 |             conv_layer = nn.Sequential(
197 |                 ConvNorm(encoder_embedding_dim,
198 |                          encoder_embedding_dim,
199 |                          kernel_size=encoder_kernel_size, stride=1,
200 |                          padding=int((encoder_kernel_size - 1) / 2),
201 |                          dilation=1, w_init_gain='relu'),
202 |                 nn.BatchNorm1d(encoder_embedding_dim))
203 |             convolutions.append(conv_layer)
204 |         self.convolutions = nn.ModuleList(convolutions)
205 | 
206 |         self.encoder_lstm = nn.LSTM(encoder_embedding_dim,
207 |                                     int(encoder_embedding_dim / 2), 1,
208 |                                     batch_first=True, bidirectional=True)
209 | 
210 |     def forward(self, x, text_lengths):
211 |         for conv in self.convolutions:
212 |             x = F.relu(conv(x), inplace=True)
213 | 
214 |         # [B, encoder_dim, T_in] -> [B, T_in, encoder_dim]
215 |         x = x.transpose(1, 2)
216 | 
217 |         # PyTorch tensors are not reversible, hence the conversion to numpy
218 |         text_lengths = text_lengths.cpu().numpy()
219 |         x = nn.utils.rnn.pack_padded_sequence(x, text_lengths, batch_first=True)
220 |         # [B, T_in, encoder_dim]
221 |         outputs, _ = self.encoder_lstm(x)
222 |         outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
223 | 
224 |         return outputs
225 | 
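# The pack/pad pair above lets the bidirectional LSTM skip padded time steps.
# pack_padded_sequence expects batches sorted by decreasing length by default,
# which the data pipeline is assumed to guarantee here. The pattern in
# isolation (lstm being any batch_first LSTM):
#
#     lengths = torch.tensor([5, 3, 2])  # descending
#     packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
#     out, _ = lstm(packed)
#     out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)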
226 | 
227 | class Decoder(nn.Module):
228 |     def __init__(self, n_mel_channels, n_frames_per_step,
229 |                  encoder_embedding_dim, attention_dim,
230 |                  attention_location_n_filters,
231 |                  attention_location_kernel_size,
232 |                  prenet_dim, decoder_rnn_dim,
233 |                  max_decoder_steps, gate_threshold,
234 |                  decoder_n_lstms, p_decoder_dropout):
235 |         super(Decoder, self).__init__()
236 |         self.n_mel_channels = n_mel_channels
237 |         self.n_frames_per_step = n_frames_per_step
238 |         self.encoder_embedding_dim = encoder_embedding_dim
239 |         self.decoder_rnn_dim = decoder_rnn_dim
240 |         self.prenet_dim = prenet_dim
241 |         self.max_decoder_steps = max_decoder_steps
242 |         self.gate_threshold = gate_threshold
243 |         self.decoder_n_lstms = decoder_n_lstms
244 |         self.p_decoder_dropout = p_decoder_dropout
245 | 
246 |         self.prenet = Prenet(n_mel_channels, [prenet_dim, prenet_dim])
247 | 
248 |         self.lstm0 = nn.LSTMCell(prenet_dim + encoder_embedding_dim, decoder_rnn_dim)
249 |         self.lstm1 = nn.LSTMCell(decoder_rnn_dim + encoder_embedding_dim, decoder_rnn_dim)
250 | 
251 |         self.attention_layer = Attention(decoder_rnn_dim, encoder_embedding_dim,
252 |                                          attention_dim, attention_location_n_filters,
253 |                                          attention_location_kernel_size)
254 | 
255 |         self.linear_projection = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, n_mel_channels * n_frames_per_step)
256 | 
257 |         self.gate_layer = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, n_frames_per_step, w_init_gain='sigmoid')
258 | 
259 |     def initialize_decoder_states(self, memory, mask=None, inference=False):
260 |         """ Initializes attention rnn states, decoder rnn states, attention
261 |         weights, attention cumulative weights, attention context, stores memory
262 |         PARAMS
263 |         ------
264 |         memory: Encoder outputs
265 |         mask: Mask for padded data if training, expects None for inference
266 |         """
267 |         B = memory.size(0)
268 |         MAX_TIME = memory.size(1)
269 | 
270 |         self.h0 = torch.zeros(B, self.decoder_rnn_dim).cuda()
271 |         self.c0 = torch.zeros(B, self.decoder_rnn_dim).cuda()
272 |         self.h1 = torch.zeros(B, self.decoder_rnn_dim).cuda()
273 |         self.c1 = torch.zeros(B, self.decoder_rnn_dim).cuda()
274 | 
275 |         # if inference:
276 |         #     self.h0 = self.h0.half()
277 |         #     self.c0 = self.c0.half()
278 |         #     self.h1 = self.h1.half()
279 |         #     self.c1 = self.c1.half()
280 | 
281 |         self.attention_weights = memory.new(B, MAX_TIME).zero_()
282 |         self.attention_weights_cum = memory.new(B, MAX_TIME).zero_()
283 |         self.attention_context = memory.new(B, self.encoder_embedding_dim).zero_()
284 | 
285 |         self.memory = memory
286 |         self.mask = mask
287 | 
288 |     def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments, mel_lengths=None):
289 |         """ Prepares the per-step decoder outputs for downstream use
290 |         PARAMS
291 |         ------
292 |         mel_outputs: list of per-step mel frames
293 |         gate_outputs: gate output energies
294 |         alignments: list of per-step attention weights
295 | 
296 |         RETURNS
297 |         -------
298 |         mel_outputs: mel spectrogram [B, n_mel_channels, T_out]
299 |         gate_outputs: gate output energies [B, T_out]
300 |         alignments: attention weights [B, T_in, T_out]
301 |         """
302 |         # (T_out, B, T_in) -> (B, T_in, T_out)
303 |         alignments = torch.stack(alignments).transpose(0, 1).transpose(1, 2).contiguous()
304 |         # (T_out, B, n_frames_per_step) -> (B, T_out, n_frames_per_step)
305 |         gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
306 |         # (B, T_out, n_frames_per_step) -> (B, T_out)
307 |         gate_outputs = gate_outputs.contiguous().view(gate_outputs.size(0), -1)
308 |         # (T_out, B, n_mel_channels * n_frames_per_step) -> (B, T_out, n_mel_channels * n_frames_per_step)
309 |         mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
310 |         # decouple frames per step
311 |         mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, self.n_mel_channels)
312 |         # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
313 |         mel_outputs = mel_outputs.transpose(1, 2)
314 |         # scale mel lengths by the number of frames per step
315 |         if mel_lengths is not None:
316 |             mel_lengths *= self.n_frames_per_step
317 | 
318 |         return mel_outputs, gate_outputs, alignments, mel_lengths
319 | 
320 |     def decode(self, prenet_output):
321 |         """ Decoder step using stored states, attention and memory
322 |         PARAMS
323 |         ------
324 |         prenet_output: prenet-processed previous mel output
325 | 
326 |         RETURNS
327 |         -------
328 |         mel_output: predicted frame(s) [B, n_mel_channels * n_frames_per_step]
329 |         gate_output: gate output energies [B, n_frames_per_step]
330 |         attention_weights: [B, T_in]
331 |         """
332 |         x = torch.cat((prenet_output, self.attention_context), dim=-1)
333 |         self.h0, self.c0 = self.lstm0(x, (self.h0, self.c0))
334 |         # [B, 1, decoder_dim]
335 |         x = F.dropout(self.h0, self.p_decoder_dropout, self.training)
336 | 
337 |         x = torch.cat((x, self.attention_context), dim=-1)
338 |         self.h1, self.c1 = self.lstm1(x, (self.h1, self.c1))
339 |         # [B, 1, decoder_dim]
340 |         self.query = F.dropout(self.h1, self.p_decoder_dropout, self.training)
341 | 
342 |         attention_weights_cumulative = self.attention_weights_cum.unsqueeze(1)
343 |         self.attention_context, self.attention_weights = self.attention_layer(
344 |             self.query, self.memory, attention_weights_cumulative, self.mask)
345 | 
346 |         # [B, MAX_TIME]
347 |         # Avoid '+=' as in-place operation in case of gradient computation
348 |         self.attention_weights_cum = self.attention_weights_cum + self.attention_weights
349 | 
350 |         x = torch.cat((self.query, self.attention_context), dim=-1)
351 |         # [B, n_mel_channels * n_frames_per_step]
352 |         mel_output = self.linear_projection(x)
353 |         # [B, n_frames_per_step]
354 |         gate_output = self.gate_layer(x)
355 |         return mel_output, gate_output, self.attention_weights
356 | 
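    # gate_output holds the stop-token logits, one per frame in the step; at
    # inference (see infer below) they are passed through a sigmoid and
    # compared against gate_threshold to decide when decoding should stop.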
| """ Decoder forward pass for training 359 | PARAMS 360 | ------ 361 | memory: Encoder outputs 362 | targets: Decoder inputs for teacher forcing. i.e. mel-specs 363 | memory_lengths: Encoder output lengths for attention masking. 364 | 365 | RETURNS 366 | ------- 367 | mel_outputs: mel outputs from the decoder 368 | gate_outputs: gate outputs from the decoder 369 | alignments: sequence of attention weights from the decoder 370 | """ 371 | go_frame = memory.new(memory.size(0), self.n_mel_channels).zero_().unsqueeze(0) 372 | # (B, n_mel_channels, T_out) -> (T_out, B, n_mel_channels) 373 | targets = targets.permute(2, 0, 1) 374 | decoder_inputs = torch.cat((go_frame, targets), dim=0) 375 | prenet_outputs = self.prenet(decoder_inputs) 376 | 377 | mask =~ get_mask_from_lengths(memory_lengths) if memory.size(0) > 1 else None 378 | self.initialize_decoder_states(memory, mask) 379 | 380 | mel_outputs, gate_outputs, alignments = [], [], [] 381 | # size - 1 for ignoring EOS symbol 382 | while len(mel_outputs) < decoder_inputs.size(0) - 1: 383 | prenet_output = prenet_outputs[len(mel_outputs)] 384 | mel_output, gate_output, attention_weights = self.decode(prenet_output) 385 | 386 | mel_outputs += [mel_output] 387 | gate_outputs += [gate_output] 388 | alignments += [attention_weights] 389 | 390 | return self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments) 391 | 392 | def infer(self, memory, memory_lengths): 393 | """ Decoder inference 394 | PARAMS 395 | ------ 396 | memory: Encoder outputs 397 | 398 | RETURNS 399 | ------- 400 | mel_outputs: mel outputs from the decoder 401 | gate_outputs: gate outputs from the decoder 402 | alignments: sequence of attention weights from the decoder 403 | """ 404 | mask =~ get_mask_from_lengths(memory_lengths) if memory.size(0) > 1 else None 405 | self.initialize_decoder_states(memory, mask, inference=True) 406 | 407 | mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32) 408 | if torch.cuda.is_available(): 409 | mel_lengths = mel_lengths.cuda() 410 | 411 | mel_outputs, gate_outputs, alignments = [], [], [] 412 | frame = memory.new(memory.size(0), self.n_mel_channels).zero_() 413 | while True: 414 | prenet_output = self.prenet(frame, inference=True) 415 | 416 | mel_output, gate_output, alignment = self.decode(prenet_output) 417 | gate_output = torch.sigmoid(gate_output) 418 | 419 | finished = torch.gt(gate_output, self.gate_threshold).all(-1) 420 | mel_lengths += (~finished).to(torch.int32) 421 | 422 | if finished.all(): 423 | break 424 | 425 | mel_outputs += [mel_output] 426 | gate_outputs += [gate_output] 427 | alignments += [alignment] 428 | 429 | if len(mel_outputs) == self.max_decoder_steps: 430 | print("Warning! Reached max decoder steps") 431 | break 432 | 433 | frame = mel_output[:, :self.n_mel_channels] 434 | 435 | return self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments, mel_lengths) 436 | 437 | def gta(self, memory, memory_lengths, targets): 438 | """ Decoder forward pass for training 439 | PARAMS 440 | ------ 441 | memory: Encoder outputs 442 | memory_lengths: Encoder output lengths for attention masking. 443 | targets: Decoder inputs for teacher forcing. i.e. 
437 |     def gta(self, memory, memory_lengths, targets):
438 |         """ Decoder forward pass for ground-truth-aligned (GTA) synthesis
439 |         PARAMS
440 |         ------
441 |         memory: Encoder outputs
442 |         memory_lengths: Encoder output lengths for attention masking
443 |         targets: Decoder inputs for teacher forcing, i.e. mel spectrograms
444 | 
445 |         RETURNS
446 |         -------
447 |         mel_outputs: mel outputs from the decoder
448 |         gate_outputs: gate outputs from the decoder
449 |         alignments: sequence of attention weights from the decoder
450 |         """
451 |         go_frame = memory.new(memory.size(0), self.n_mel_channels).zero_().unsqueeze(0)
452 |         # (B, n_mel_channels, T_out) -> (T_out, B, n_mel_channels)
453 |         targets = targets.permute(2, 0, 1)
454 |         decoder_inputs = torch.cat((go_frame, targets), dim=0)
455 |         prenet_outputs = self.prenet(decoder_inputs, inference=True)
456 | 
457 |         mask = ~get_mask_from_lengths(memory_lengths) if memory.size(0) > 1 else None
458 |         self.initialize_decoder_states(memory, mask, inference=True)
459 | 
460 |         mel_outputs, gate_outputs, alignments = [], [], []
461 |         # size - 1 for ignoring EOS symbol
462 |         while len(mel_outputs) < decoder_inputs.size(0) - 1:
463 |             prenet_output = prenet_outputs[len(mel_outputs)]
464 |             mel_output, gate_output, attention_weights = self.decode(prenet_output)
465 | 
466 |             mel_outputs += [mel_output]
467 |             gate_outputs += [gate_output]
468 |             alignments += [attention_weights]
469 | 
470 |         return self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)
471 | 
472 | 
473 | class Tacotron2(nn.Module):
474 |     def __init__(self, mask_padding, n_mel_channels,
475 |                  n_symbols, symbols_embedding_dim, encoder_kernel_size,
476 |                  encoder_n_convolutions, encoder_embedding_dim,
477 |                  attention_dim, attention_location_n_filters,
478 |                  attention_location_kernel_size, n_frames_per_step,
479 |                  prenet_dim, decoder_rnn_dim, max_decoder_steps, gate_threshold,
480 |                  decoder_n_lstms, p_decoder_dropout,
481 |                  postnet_embedding_dim, postnet_kernel_size,
482 |                  postnet_n_convolutions):
483 |         super(Tacotron2, self).__init__()
484 |         self.elapsed_epochs = 0
485 |         self.mask_padding = mask_padding
486 |         self.n_mel_channels = n_mel_channels
487 |         self.n_frames_per_step = n_frames_per_step
488 |         self.embedding = nn.Embedding(n_symbols, symbols_embedding_dim)
489 |         std = math.sqrt(2.0 / (n_symbols + symbols_embedding_dim))
490 |         val = math.sqrt(3.0) * std  # uniform bounds for std
491 |         self.embedding.weight.data.uniform_(-val, val)
492 |         self.encoder = Encoder(encoder_n_convolutions, encoder_embedding_dim, encoder_kernel_size)
493 |         self.decoder = Decoder(n_mel_channels, n_frames_per_step,
494 |                                encoder_embedding_dim, attention_dim,
495 |                                attention_location_n_filters,
496 |                                attention_location_kernel_size,
497 |                                prenet_dim, decoder_rnn_dim,
498 |                                max_decoder_steps,
499 |                                gate_threshold, decoder_n_lstms,
500 |                                p_decoder_dropout)
501 |         self.postnet = Postnet(n_mel_channels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolutions)
502 | 
503 |     def parse_outputs(self, outputs, target_lengths=None):
504 |         if self.mask_padding and target_lengths is not None:
505 |             mask = ~get_mask_from_lengths(target_lengths)
506 |             mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
507 |             mask = mask.permute(1, 0, 2)
508 | 
509 |             outputs[0].masked_fill_(mask, 0.0)
510 |             outputs[1].masked_fill_(mask, 0.0)
511 |             outputs[2].masked_fill_(mask[:, 0, :], 1e3)  # gate energies
512 | 
513 |         return outputs
514 | 
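    # When padding is masked above, mel outputs are zeroed in the padded
    # region and the gate energies are filled with 1e3 (sigmoid ~ 1), so the
    # padded steps read as a clean "stop" signal instead of contributing
    # noise to the loss.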
515 |     def forward(self, inputs):
516 |         texts, text_lengths, targets, target_lengths = inputs
517 | 
518 |         # [B, T_in] -> [B, embed_dim, T_in]
519 |         embedded_inputs = self.embedding(texts).transpose(1, 2)
520 |         # [B, T_in, encoder_dim]
521 |         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
522 | 
523 |         mel_outputs_before, gate_outputs, alignments, _ = self.decoder(encoder_outputs, text_lengths, targets)
524 |         mel_outputs_after = mel_outputs_before + self.postnet(mel_outputs_before)
525 | 
526 |         return self.parse_outputs([mel_outputs_before, mel_outputs_after, gate_outputs, alignments])
527 | 
528 |     def infer(self, texts, text_lengths, targets=None, target_lengths=None):
529 |         # [B, T_in] -> [B, embed_dim, T_in]
530 |         embedded_inputs = self.embedding(texts).transpose(1, 2)
531 |         # [B, T_in, encoder_dim]
532 |         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
533 | 
534 |         if targets is None:
535 |             mel_outputs_before, gate_outputs, alignments, mel_lengths = self.decoder.infer(encoder_outputs, text_lengths)
536 |         else:
537 |             mel_outputs_before, gate_outputs, alignments, mel_lengths = self.decoder.gta(encoder_outputs, text_lengths, targets)
538 | 
539 |         mel_outputs_after = mel_outputs_before + self.postnet(mel_outputs_before)
540 | 
541 |         return self.parse_outputs([mel_outputs_before, mel_outputs_after, gate_outputs, alignments, mel_lengths])
542 | 
543 |     def elapse_epoch(self):
544 |         self.elapsed_epochs += 1
545 | 
546 |     def get_elapsed_epochs(self):
547 |         return self.elapsed_epochs
548 | 
549 |     def save_checkpoint(self, filepath):
550 |         torch.save({'epoch': self.elapsed_epochs, 'model': self.state_dict()}, filepath)
551 | 
552 |     def restore_checkpoint(self, filepath):
553 |         if os.path.exists(filepath):
554 |             def _checkpoint_from_distributed(state_dict):
555 |                 """
556 |                 Checks whether the checkpoint was generated by DistributedDataParallel.
557 |                 DDP wraps the model in an additional "module." prefix, which must be
558 |                 removed for single-GPU inference.
559 |                 :param state_dict: model's state dict
560 |                 """
561 |                 for key, _ in state_dict.items():
562 |                     if key.find('module.') != -1:
563 |                         return True
564 |                 return False
565 | 
566 |             def _unwrap_distributed(state_dict):
567 |                 """
568 |                 Unwraps the model from DistributedDataParallel.
569 |                 DDP wraps the model in an additional "module." prefix, which must be
570 |                 removed for single-GPU inference.
571 |                 :param state_dict: model's state dict
572 |                 """
573 |                 new_state_dict = {}
574 |                 for key, value in state_dict.items():
575 |                     new_key = key.replace('module.', '')
576 |                     new_state_dict[new_key] = value
577 |                 return new_state_dict
578 | 
579 |             print(f'Loading Weights: "{filepath}"')
580 |             checkpoint = torch.load(filepath)
581 |             self.elapsed_epochs = checkpoint['epoch']
582 |             if _checkpoint_from_distributed(checkpoint['model']):
583 |                 checkpoint['model'] = _unwrap_distributed(checkpoint['model'])
584 |             self.load_state_dict(checkpoint['model'])
585 | 
--------------------------------------------------------------------------------