├── dllogger
│   ├── __init__.py
│   ├── autologging.py
│   ├── tags.py
│   └── logger.py
├── scripts
│   ├── docker
│   │   ├── build.sh
│   │   └── interactive.sh
│   ├── griffin_lim_synth.sh
│   ├── gta_synth.sh
│   ├── train_tacotron2.sh
│   ├── prepare_dataset.sh
│   └── prepare_mels.sh
├── requirements.txt
├── Dockerfile
├── tacotron2
│   ├── text
│   │   ├── symbols.py
│   │   ├── LICENCE
│   │   ├── cmudict.py
│   │   ├── numbers.py
│   │   ├── __init__.py
│   │   └── cleaners.py
│   ├── loss_function.py
│   ├── loader.py
│   ├── data_function.py
│   └── model.py
├── README.md
├── LICENCE
├── .gitignore
├── plot.py
├── multiproc.py
├── common
│   ├── preprocessor.py
│   ├── utils.py
│   ├── audio_processing.py
│   ├── layers.py
│   ├── audio.py
│   └── stft.py
├── preprocess.py
├── inference.py
├── gta.py
├── hparams.py
├── filelists
│   ├── ljs_mel_text_val_filelist.txt
│   └── ljs_audio_text_val_filelist.txt
└── train.py

--------------------------------------------------------------------------------
/dllogger/__init__.py:
--------------------------------------------------------------------------------
1 |

--------------------------------------------------------------------------------
/scripts/docker/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker build . --rm -t tacotron2
4 |

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | numpy
3 | inflect
4 | scipy
5 | Unidecode
6 | pillow
7 | # NVIDIA apex (needed for --amp-run) cannot be installed from PyPI; the PyPI
8 | # package named "apex" is unrelated. Install it from https://github.com/NVIDIA/apex

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvcr.io/nvidia/pytorch:19.08-py3
2 |
3 | ADD . /workspace/tacotron2
4 | WORKDIR /workspace/tacotron2
5 | RUN pip install -r requirements.txt
6 |

--------------------------------------------------------------------------------
/scripts/griffin_lim_synth.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python inference.py -i text.txt -o outputs --amp-run --speaker-num 4 --speaker-id 0 --log-file nvlog.json
2 |

--------------------------------------------------------------------------------
/scripts/docker/interactive.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | nvidia-docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -it --rm --ipc=host -v $PWD:/workspace/tacotron2/ tacotron2 bash
4 |

--------------------------------------------------------------------------------
/scripts/gta_synth.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python gta.py --amp-run -o gta --dataset-path training_data --training-anchor-dirs tts_fanfanli_22050 tts_xiaoya_22050 tts_yangluzhuo_22050 tts_yuanzhonglu_22050
2 |

--------------------------------------------------------------------------------
/scripts/train_tacotron2.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python train.py --amp-run -o logs --init-lr 1e-3 --final-lr 1e-5 --epochs 200 -bs 32 --weight-decay 1e-6 --log-file nvlog.json --dataset-path training_data --load-mel-from-disk --training-anchor-dirs tts_fanfanli_22050 tts_xiaoya_22050 tts_yangluzhuo_22050 tts_yuanzhonglu_22050
2 |

--------------------------------------------------------------------------------
/scripts/prepare_dataset.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | DATADIR="LJSpeech-1.1"
6 | BZ2ARCHIVE="${DATADIR}.tar.bz2"
7 | ENDPOINT="http://data.keithito.com/data/speech/$BZ2ARCHIVE"
8 |
9 | if [ ! -d "$DATADIR" ]; then
10 |   echo "dataset is missing, unpacking ..."
11 |   if [ ! -f "$BZ2ARCHIVE" ]; then
12 |     echo "dataset archive is missing, downloading ..."
13 |     wget "$ENDPOINT"
14 |   fi
15 |   tar jxvf "$BZ2ARCHIVE"
16 | fi
17 |

--------------------------------------------------------------------------------
/tacotron2/text/symbols.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Defines the set of symbols used in text input to the model.
5 |
6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
7 | from tacotron2.text import cmudict
8 |
9 | _pad = '_'
10 | _eos = '~'
11 | _punctuation = '!\'(),.:;? '
12 | _special = '-'
13 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890'
14 |
15 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
16 | _arpabet = ['@' + s for s in cmudict.valid_symbols]
17 |
18 | # Export all symbols:
19 | symbols = [_pad, _eos] + list(_special) + list(_punctuation) + list(_letters)  # + _arpabet

--------------------------------------------------------------------------------
/scripts/prepare_mels.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | DATADIR="ljs"
6 | FILELISTSDIR="filelists"
7 |
8 | TESTLIST="$FILELISTSDIR/ljs_audio_text_test_filelist.txt"
9 | TRAINLIST="$FILELISTSDIR/ljs_audio_text_train_filelist.txt"
10 | VALLIST="$FILELISTSDIR/ljs_audio_text_val_filelist.txt"
11 |
12 | TESTLIST_MEL="$FILELISTSDIR/ljs_mel_text_test_filelist.txt"
13 | TRAINLIST_MEL="$FILELISTSDIR/ljs_mel_text_train_filelist.txt"
14 | VALLIST_MEL="$FILELISTSDIR/ljs_mel_text_val_filelist.txt"
15 |
16 | mkdir -p "$DATADIR/mels"
17 | if [ $(ls $DATADIR/mels | wc -l) -ne 13100 ]; then
18 |   python preprocess_audio2mel.py --wav-files "$TRAINLIST" --mel-files "$TRAINLIST_MEL"
19 |   python preprocess_audio2mel.py --wav-files "$TESTLIST" --mel-files "$TESTLIST_MEL"
20 |   python preprocess_audio2mel.py --wav-files "$VALLIST" --mel-files "$VALLIST_MEL"
21 | fi
22 |

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tacotron 2 for PyTorch
2 |
3 | This repository provides a script and recipe to train Tacotron 2. The source is forked from [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2) and combined with [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2). It supports multi-speaker TTS and GTA (ground-truth-aligned) synthesis, as well as a configurable reduction factor.
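For readers unfamiliar with the reduction factor: with a factor r, the decoder predicts r mel frames per step, which shortens the decoded sequence and makes attention alignment easier. The sketch below is purely illustrative of how targets can be grouped under an assumed factor r and a `(T, n_mels)` target layout; it is not this repository's exact implementation.

```python
import numpy as np

# Illustrative only: group mel targets so one decoder step covers r frames.
# The names r and mel, and the (T, n_mels) layout, are assumptions for this sketch.
r = 2                                                 # hypothetical reduction factor
mel = np.random.randn(100, 80)                        # (T, n_mels) target spectrogram
T = (mel.shape[0] // r) * r                           # truncate so T is divisible by r
grouped = mel[:T].reshape(T // r, r * mel.shape[1])   # (T/r, r*n_mels): r frames per step
```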
4 |
5 | ## Run the scripts
6 | ```shell
7 | # Preprocessing
8 | python preprocess.py
9 | # Training
10 | nohup bash scripts/train_tacotron2.sh &
11 | # Evaluation
12 | bash scripts/griffin_lim_synth.sh
13 | # GTA synthesis
14 | bash scripts/gta_synth.sh
15 | ```
16 |
17 | ## Recommended vocoders
18 | [WaveRNN](https://github.com/begeekmyfriend/WaveRNN)
19 |
20 | [WaveGlow](https://github.com/begeekmyfriend/WaveGlow)
21 |
22 | [SqueezeWave](https://github.com/begeekmyfriend/SqueezeWave)
23 |
24 | ## Audio samples
25 | [Two males and two females in Chinese](https://github.com/begeekmyfriend/tacotron2/issues/1)

--------------------------------------------------------------------------------
/tacotron2/text/LICENCE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Keith Ito
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.

--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018, NVIDIA Corporation
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | logs/
28 | runs/
29 | *.out
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 |

--------------------------------------------------------------------------------
/tacotron2/text/cmudict.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import re
4 |
5 |
6 | valid_symbols = [
7 |     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
8 |     'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
9 |     'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
10 |     'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
11 |     'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
12 |     'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
13 |     'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
14 | ]
15 |
16 | _valid_symbol_set = set(valid_symbols)
17 |
18 |
19 | class CMUDict:
20 |     '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
21 |     def __init__(self, file_or_path, keep_ambiguous=True):
22 |         if isinstance(file_or_path, str):
23 |             with open(file_or_path, encoding='latin-1') as f:
24 |                 entries = _parse_cmudict(f)
25 |         else:
26 |             entries = _parse_cmudict(file_or_path)
27 |         if not keep_ambiguous:
28 |             entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
29 |         self._entries = entries
30 |
31 |
32 |     def __len__(self):
33 |         return len(self._entries)
34 |
35 |
36 |     def lookup(self, word):
37 |         '''Returns list of ARPAbet pronunciations of the given word.'''
38 |         return self._entries.get(word.upper())
39 |
40 |
41 |
42 | _alt_re = re.compile(r'\([0-9]+\)')
43 |
44 |
45 | def _parse_cmudict(file):
46 |     cmudict = {}
47 |     for line in file:
48 |         if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
49 |             parts = line.split('  ')  # CMUDict separates word and pronunciation with two spaces
50 |             word = re.sub(_alt_re, '', parts[0])
51 |             pronunciation = _get_pronunciation(parts[1])
52 |             if pronunciation:
53 |                 if word in cmudict:
54 |                     cmudict[word].append(pronunciation)
55 |                 else:
56 |                     cmudict[word] = [pronunciation]
57 |     return cmudict
58 |
59 |
60 | def _get_pronunciation(s):
61 |     parts = s.strip().split(' ')
62 |     for part in parts:
63 |         if part not in _valid_symbol_set:
64 |             return None
65 |     return ' '.join(parts)
66 |

--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 |
7 | def split_title_line(title_text, max_words=5):
8 |     """
9 |     Splits a title string into multiple lines,
10 |     with at most max_words words per line.
11 |     """
12 |     seq = title_text.split()
13 |     return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])
14 |
15 |
16 | def plot_alignment(alignment, path, info=None, split_title=False):
17 |     fig = plt.figure(figsize=(8, 6))
18 |     ax = fig.add_subplot(111)
19 |
20 |     im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
21 |     fig.colorbar(im, ax=ax)
22 |     xlabel = 'Decoder timestep'
23 |     title = split_title_line(info) if split_title else info
24 |     plt.xlabel(xlabel)
25 |     plt.title(title)
26 |     plt.ylabel('Encoder timestep')
27 |     plt.tight_layout()
28 |     plt.savefig(path, format='png')
29 |     plt.close()
30 |
31 |
32 | def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
33 |     if max_len is not None:
34 |         target_spectrogram = target_spectrogram[:max_len]
35 |         pred_spectrogram = pred_spectrogram[:max_len]
36 |
37 |     title = split_title_line(info) if split_title else info
38 |     fig = plt.figure(figsize=(10, 8))
39 |     # Set common labels
40 |     fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
41 |
42 |     # target spectrogram subplot
43 |     if target_spectrogram is not None:
44 |         ax1 = fig.add_subplot(311)
45 |         ax2 = fig.add_subplot(312)
46 |
47 |         if auto_aspect:
48 |             im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
49 |         else:
50 |             im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
51 |         ax1.set_title('Target Mel-Spectrogram')
52 |         fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
53 |         ax2.set_title('Predicted Mel-Spectrogram')
54 |     else:
55 |         ax2 = fig.add_subplot(211)
56 |
57 |     if auto_aspect:
58 |         im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
59 |     else:
60 |         im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
61 |     fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
62 |
63 |     plt.tight_layout()
64 |     plt.savefig(path, format='png')
65 |     plt.close()
66 |

--------------------------------------------------------------------------------
/tacotron2/loss_function.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | from torch import nn
29 |
30 |
31 | class Tacotron2Loss(nn.Module):
32 |     def __init__(self):
33 |         super(Tacotron2Loss, self).__init__()
34 |
35 |     def forward(self, model_output, targets):
36 |         mel_target, gate_target = targets[0], targets[1]
37 |         mel_out_before, mel_out_after, gate_out, _ = model_output
38 |
39 |         mel_loss = nn.MSELoss()(mel_out_before, mel_target) + nn.MSELoss()(mel_out_after, mel_target)
40 |         gate_loss = nn.BCEWithLogitsLoss()(gate_out.view(-1, 1), gate_target.view(-1, 1))
41 |         return mel_loss + gate_loss
42 |

--------------------------------------------------------------------------------
/tacotron2/text/numbers.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import inflect
4 | import re
5 |
6 |
7 | _inflect = inflect.engine()
8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
13 | _number_re = re.compile(r'[0-9]+')
14 |
15 |
16 | def _remove_commas(m):
17 |     return m.group(1).replace(',', '')
18 |
19 |
20 | def _expand_decimal_point(m):
21 |     return m.group(1).replace('.', ' point ')
22 |
23 |
24 | def _expand_dollars(m):
25 |     match = m.group(1)
26 |     parts = match.split('.')
27 |     if len(parts) > 2:
28 |         return match + ' dollars'  # Unexpected format
29 |     dollars = int(parts[0]) if parts[0] else 0
30 |     cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
31 |     if dollars and cents:
32 |         dollar_unit = 'dollar' if dollars == 1 else 'dollars'
33 |         cent_unit = 'cent' if cents == 1 else 'cents'
34 |         return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
35 |     elif dollars:
36 |         dollar_unit = 'dollar' if dollars == 1 else 'dollars'
37 |         return '%s %s' % (dollars, dollar_unit)
38 |     elif cents:
39 |         cent_unit = 'cent' if cents == 1 else 'cents'
40 |         return '%s %s' % (cents, cent_unit)
41 |     else:
42 |         return 'zero dollars'
43 |
44 |
45 | def _expand_ordinal(m):
46 |     return _inflect.number_to_words(m.group(0))
47 |
48 |
49 | def _expand_number(m):
50 |     num = int(m.group(0))
51 |     if num > 1000 and num < 3000:
52 |         if num == 2000:
53 |             return 'two thousand'
54 |         elif num > 2000 and num < 2010:
55 |             return 'two thousand ' + _inflect.number_to_words(num % 100)
56 |         elif num % 100 == 0:
57 |             return _inflect.number_to_words(num // 100) + ' hundred'
58 |         else:
59 |             return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
60 |     else:
61 |         return _inflect.number_to_words(num, andword='')
62 |
63 |
64 | def normalize_numbers(text):
65 |     text = re.sub(_comma_number_re, _remove_commas, text)
66 |     text = re.sub(_pounds_re, r'\1 pounds', text)
67 |     text = re.sub(_dollars_re, _expand_dollars, text)
68 |     text = re.sub(_decimal_number_re, _expand_decimal_point, text)
69 |     text = re.sub(_ordinal_re, _expand_ordinal, text)
70 |     text = re.sub(_number_re, _expand_number, text)
71 |     return text
72 |

--------------------------------------------------------------------------------
/dllogger/autologging.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import subprocess
17 | import xml.etree.ElementTree as ET
18 |
19 | from dllogger.logger import LOGGER
20 |
21 | # TODO: print CUDA version, container version etc
22 |
23 | def log_hardware():
24 |     # TODO: asserts - what if you cannot launch those commands?
25 |     # number of CPU threads
26 |     cpu_info_command = 'cat /proc/cpuinfo'
27 |     cpu_info = subprocess.run(cpu_info_command.split(), stdout=subprocess.PIPE).stdout.split()
28 |     cpu_num_index = len(cpu_info) - cpu_info[::-1].index(b'processor') + 1
29 |     cpu_num = int(cpu_info[cpu_num_index]) + 1
30 |
31 |     # CPU name
32 |     cpu_name_begin_index = cpu_info.index(b'name')
33 |     cpu_name_end_index = cpu_info.index(b'stepping')
34 |     cpu_name = b' '.join(cpu_info[cpu_name_begin_index + 2:cpu_name_end_index]).decode('utf-8')
35 |
36 |     LOGGER.log(key='cpu_info', value={"num": cpu_num, "name": cpu_name})
37 |
38 |     # RAM memory
39 |     ram_info_command = 'free -m -h'
40 |     ram_info = subprocess.run(ram_info_command.split(), stdout=subprocess.PIPE).stdout.split()
41 |     ram_index = ram_info.index(b'Mem:') + 1
42 |     ram = ram_info[ram_index].decode('utf-8')
43 |
44 |     LOGGER.log(key='mem_info', value={"ram": ram})
45 |
46 |     # GPU
47 |     nvidia_smi_command = 'nvidia-smi -q -x'
48 |     nvidia_smi_output = subprocess.run(nvidia_smi_command.split(), stdout=subprocess.PIPE).stdout
49 |     nvidia_smi = ET.fromstring(nvidia_smi_output)
50 |     gpus = nvidia_smi.findall('gpu')
51 |     ver = nvidia_smi.findall('driver_version')
52 |
53 |     LOGGER.log(key="gpu_info",
54 |                value={
55 |                    "driver_version": ver[0].text,
56 |                    "num": len(gpus),
57 |                    "name": [g.find('product_name').text for g in gpus],
58 |                    "mem": [g.find('fb_memory_usage').find('total').text for g in gpus]})
59 |
60 | def log_args(args):
61 |     LOGGER.log(key='args', value=vars(args))
62 |

--------------------------------------------------------------------------------
/tacotron2/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | import re
3 | from tacotron2.text import cleaners
4 | from tacotron2.text.symbols import symbols
5 |
6 |
7 | # Mappings from symbol to numeric ID and vice versa:
8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
10 |
11 | # Regular expression matching text enclosed in curly braces:
12 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
13 |
14 |
15 | def text_to_sequence(text, speaker_id, cleaner_names):
16 |     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
17 |
18 |     The text can optionally have ARPAbet sequences enclosed in curly braces embedded
19 |     in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
20 |
21 |     Args:
22 |         text: string to convert to a sequence
23 |         speaker_id: integer speaker index; every symbol ID is offset by speaker_id * len(symbols)
24 |         cleaner_names: names of the cleaner functions to run the text through
25 |
26 |     Returns:
27 |         List of integers corresponding to the symbols in the text
28 |     '''
29 |     sequence = []
30 |
31 |     # Check for curly braces and treat their contents as ARPAbet:
32 |     while len(text):
33 |         m = _curly_re.match(text)
34 |         if not m:
35 |             sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
36 |             break
37 |         sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
38 |         sequence += _arpabet_to_sequence(m.group(2))
39 |         text = m.group(3)
40 |
41 |     # Append EOS token
42 |     sequence.append(_symbol_to_id['~'])
43 |     return [s + speaker_id * len(symbols) for s in sequence]
44 |
45 |
46 | def sequence_to_text(sequence, speaker_id):
47 |     '''Converts a sequence of IDs back to a string'''
48 |     result = ''
49 |     sequence = [s - speaker_id * len(symbols) for s in sequence]
50 |     for symbol_id in sequence:
51 |         if symbol_id in _id_to_symbol:
52 |             s = _id_to_symbol[symbol_id]
53 |             # Enclose ARPAbet back in curly braces:
54 |             if len(s) > 1 and s[0] == '@':
55 |                 s = '{%s}' % s[1:]
56 |             result += s
57 |     return result.replace('}{', ' ')
58 |
59 |
60 | def _clean_text(text, cleaner_names):
61 |     for name in cleaner_names:
62 |         cleaner = getattr(cleaners, name)
63 |         if not cleaner:
64 |             raise Exception('Unknown cleaner: %s' % name)
65 |         text = cleaner(text)
66 |     return text
67 |
68 |
69 | def _symbols_to_sequence(symbols):
70 |     return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
71 |
72 |
73 | def _arpabet_to_sequence(text):
74 |     return _symbols_to_sequence(['@' + s for s in text.split()])
75 |
76 |
77 | def _should_keep_symbol(s):
78 |     return s in _symbol_to_id and s != '_' and s != '~'
79 |
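A quick sanity-check sketch of the per-speaker offset used in `text_to_sequence` above: each speaker gets its own copy of the symbol table, so the IDs for speaker k live in the range [k * len(symbols), (k + 1) * len(symbols)). This assumes the repository root is on `PYTHONPATH` and uses the `basic_cleaners` pipeline defined later in this module's package.

```python
from tacotron2.text import text_to_sequence, sequence_to_text
from tacotron2.text.symbols import symbols

seq0 = text_to_sequence('hello', 0, ['basic_cleaners'])
seq1 = text_to_sequence('hello', 1, ['basic_cleaners'])
# Same symbols, but speaker 1's IDs are shifted by one full symbol table:
assert [b - a for a, b in zip(seq0, seq1)] == [len(symbols)] * len(seq0)
assert sequence_to_text(seq1, 1) == 'hello~'   # the EOS '~' is appended automatically
```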
--------------------------------------------------------------------------------
/tacotron2/text/cleaners.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 |   1. "english_cleaners" for English text
9 |   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 |      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 |   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 |      the symbols in symbols.py to match your data).
13 | '''
14 |
15 | import re
16 | from unidecode import unidecode
17 | from .numbers import normalize_numbers
18 |
19 |
20 | # Regular expression matching whitespace:
21 | _whitespace_re = re.compile(r'\s+')
22 |
23 | # List of (regular expression, replacement) pairs for abbreviations:
24 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
25 |     ('mrs', 'misess'),
26 |     ('mr', 'mister'),
27 |     ('dr', 'doctor'),
28 |     ('st', 'saint'),
29 |     ('co', 'company'),
30 |     ('jr', 'junior'),
31 |     ('maj', 'major'),
32 |     ('gen', 'general'),
33 |     ('drs', 'doctors'),
34 |     ('rev', 'reverend'),
35 |     ('lt', 'lieutenant'),
36 |     ('hon', 'honorable'),
37 |     ('sgt', 'sergeant'),
38 |     ('capt', 'captain'),
39 |     ('esq', 'esquire'),
40 |     ('ltd', 'limited'),
41 |     ('col', 'colonel'),
42 |     ('ft', 'fort'),
43 | ]]
44 |
45 |
46 | def expand_abbreviations(text):
47 |     for regex, replacement in _abbreviations:
48 |         text = re.sub(regex, replacement, text)
49 |     return text
50 |
51 |
52 | def expand_numbers(text):
53 |     return normalize_numbers(text)
54 |
55 |
56 | def lowercase(text):
57 |     return text.lower()
58 |
59 |
60 | def collapse_whitespace(text):
61 |     return re.sub(_whitespace_re, ' ', text)
62 |
63 |
64 | def convert_to_ascii(text):
65 |     return unidecode(text)
66 |
67 |
68 | def basic_cleaners(text):
69 |     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
70 |     text = lowercase(text)
71 |     text = collapse_whitespace(text)
72 |     return text
73 |
74 |
75 | def transliteration_cleaners(text):
76 |     '''Pipeline for non-English text that transliterates to ASCII.'''
77 |     text = convert_to_ascii(text)
78 |     text = lowercase(text)
79 |     text = collapse_whitespace(text)
80 |     return text
81 |
82 |
83 | def english_cleaners(text):
84 |     '''Pipeline for English text, including number and abbreviation expansion.'''
85 |     text = convert_to_ascii(text)
86 |     text = lowercase(text)
87 |     text = expand_numbers(text)
88 |     text = expand_abbreviations(text)
89 |     text = collapse_whitespace(text)
90 |     return text
91 |

--------------------------------------------------------------------------------
/multiproc.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import sys
29 | import subprocess
30 |
31 | import torch
32 |
33 |
34 | def main():
35 |     argslist = list(sys.argv)[1:]
36 |     world_size = torch.cuda.device_count()
37 |
38 |     if '--world-size' in argslist:
39 |         argslist[argslist.index('--world-size') + 1] = str(world_size)
40 |     else:
41 |         argslist.append('--world-size')
42 |         argslist.append(str(world_size))
43 |
44 |     workers = []
45 |
46 |     for i in range(world_size):
47 |         if '--rank' in argslist:
48 |             argslist[argslist.index('--rank') + 1] = str(i)
49 |         else:
50 |             argslist.append('--rank')
51 |             argslist.append(str(i))
52 |         stdout = None if i == 0 else subprocess.DEVNULL
53 |         worker = subprocess.Popen(
54 |             [str(sys.executable)] + argslist, stdout=stdout)
55 |         workers.append(worker)
56 |
57 |     returncode = 0
58 |     try:
59 |         pending = list(workers)
60 |         while pending:
61 |             for worker in list(pending):
62 |                 try:
63 |                     worker_returncode = worker.wait(1)
64 |                 except subprocess.TimeoutExpired:
65 |                     continue
66 |                 pending.remove(worker)  # a finished worker must not be waited on again
67 |                 if worker_returncode != 0:
68 |                     if returncode != 1:
69 |                         for w in workers:
70 |                             w.terminate()
71 |                     returncode = 1
72 |
73 |     except KeyboardInterrupt:
74 |         print('Pressed CTRL-C, TERMINATING')
75 |         for worker in workers:
76 |             worker.terminate()
77 |         for worker in workers:
78 |             worker.wait()
79 |         raise
80 |
81 |     sys.exit(returncode)
82 |
83 |
84 | if __name__ == "__main__":
85 |     main()
86 |

--------------------------------------------------------------------------------
/common/preprocessor.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | from functools import partial
3 | from common import audio
4 | import glob
5 | import librosa
6 | import numpy as np
7 | import os
8 |
9 |
10 | def build_from_path(hparams, input_dir, wav_dir, mel_dir, n_jobs=12, tqdm=lambda x: x):
11 |     """
12 |     Preprocesses the speech dataset from a given input path into the given output directories
13 |
14 |     Args:
15 |         - hparams: hyper parameters
16 |         - input_dir: input directory that contains the files to preprocess
17 |         - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
18 |         - wav_dir: output directory of the preprocessed speech audio dataset
19 |         - n_jobs: Optional, number of worker processes to parallelize across
20 |         - tqdm: Optional, provides a nice progress bar
21 |
22 |     Returns:
23 |         - A list of tuples describing the train examples; this should be written to train.txt
24 |     """
25 |
26 |     # We use ProcessPoolExecutor to parallelize across processes, this is just for
27 |     # optimization purposes and it can be omitted
28 |     futures = []
29 |     executor = ProcessPoolExecutor(max_workers=n_jobs)
30 |     for root, _, files in os.walk(input_dir):
31 |         for f in files:
32 |             if f.endswith('.trn'):
33 |                 trn_file = os.path.join(root, f)
34 |                 with open(trn_file) as trn:
35 |                     basename = trn_file[:-4]
36 |                     wav_file = basename + '.wav'
37 |                     basename = basename.split('/')[-1]
38 |                     text = trn.readline().strip()
39 |                     futures.append(executor.submit(partial(_process_utterance, wav_dir, mel_dir, basename, wav_file, text, hparams)))
40 |
41 |     return [future.result() for future in tqdm(futures) if future.result() is not None]
42 |
43 |
44 | def _process_utterance(wav_dir, mel_dir, basename, wav_file, text, hparams):
45 |     """
46 |     Preprocesses a single utterance wav/text pair
47 |
48 |     This writes the mel scale spectrogram to disk and returns a tuple to write
49 |     to the train.txt file
50 |
51 |     Args:
52 |         - wav_dir: the directory to write the preprocessed wav into
53 |         - mel_dir: the directory to write the mel spectrograms into
54 |         - basename: the basename of each file
55 |         - wav_file: path to the audio file containing the speech input
56 |         - text: text spoken in the input audio file
57 |         - hparams: hyper parameters
58 |
59 |     Returns:
60 |         - A tuple: (filename, time_steps, mel_frames, text)
61 |     """
62 |     try:
63 |         # Load the audio as numpy array
64 |         wav, sr = librosa.core.load(wav_file, sr=hparams.sample_rate)
65 |     except FileNotFoundError:  # catch missing wav exception
66 |         print(f'file {wav_file} present in csv metadata is not present in wav folder. skipping!')
67 |         return None
68 |
69 |     # rescale wav
70 |     if hparams.rescale:
71 |         wav = wav / np.abs(wav).max() * hparams.rescaling_max
72 |
73 |     # M-AILABS extra silence specific
74 |     if hparams.trim_silence:
75 |         wav = audio.trim_silence(wav)
76 |
77 |     # Compute the mel scale spectrogram from the wav
78 |     mel = audio.melspectrogram(wav).astype(np.float32)
79 |     mel_frames = mel.shape[1]
80 |
81 |     if mel_frames > hparams.max_mel_frames or len(text) > hparams.max_text_length:
82 |         return None
83 |
84 |     # Zero pad for quantized signal
85 |     # time resolution adjustment
86 |     # ensure length of raw audio is multiple of hop size so that we can use
87 |     # transposed convolution to upsample
88 |     r = mel_frames * audio.get_hop_size() - len(wav)
89 |     wav = np.pad(wav, (0, r), mode='constant', constant_values=0.)
90 |     assert len(wav) == mel_frames * audio.get_hop_size()
91 |     time_steps = len(wav)
92 |
93 |     # Write the spectrogram and audio to disk
94 |     filename = f'{basename}.npy'
95 |     np.save(os.path.join(wav_dir, filename), wav, allow_pickle=False)
96 |     np.save(os.path.join(mel_dir, filename), mel, allow_pickle=False)
97 |
98 |     # Return a tuple describing this training example
99 |     return (filename, time_steps, mel_frames, text)
100 |

--------------------------------------------------------------------------------
/common/utils.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import numpy as np
29 | from scipy.io.wavfile import read, write
30 | from scipy import signal
31 | import math
32 | import torch
33 | import os
34 |
35 |
36 | def cosine_decay(init_val, final_val, step, decay_steps):
37 |     alpha = final_val / init_val
38 |     cosine_decay = 0.5 * (1 + math.cos(math.pi * step / decay_steps))
39 |     decayed = (1 - alpha) * cosine_decay + alpha
40 |     return init_val * decayed
41 |
42 |
43 | def get_mask_from_lengths(lengths):
44 |     max_len = torch.max(lengths).item()
45 |     ids = torch.arange(0, max_len, out=torch.cuda.IntTensor(max_len))
46 |     mask = ids < lengths.unsqueeze(1)
47 |     return mask
48 |
49 |
50 | def preemphasize(wav, k=0.97):
51 |     return signal.lfilter([1, -k], [1], wav)
52 |
53 |
54 | def de_emphasize(wav, k=0.97):
55 |     return signal.lfilter([1], [1, -k], wav)
56 |
57 |
58 | def load_wav_to_torch(path, max_value=32768):
59 |     wav = np.load(path)
60 |     wav = preemphasize(wav)
61 |     return torch.FloatTensor(wav.astype(np.float32))
62 |
63 |
64 | def dc_notch_filter(wav):
65 |     # code from speex
66 |     notch_radius = 0.982
67 |     den = notch_radius ** 2 + 0.7 * (1 - notch_radius) ** 2
68 |     b = np.array([1, -2, 1]) * notch_radius
69 |     a = np.array([1, -2 * notch_radius, den])
70 |     return signal.lfilter(b, a, wav)
71 |
72 |
73 | def save_wav(wav, path, sr=22050):
74 |     wav = dc_notch_filter(wav)
75 |     f1 = 0.8 * 32768 / max(0.01, np.max(np.abs(wav)))
76 |     f2 = np.sign(wav) * np.power(np.abs(wav), 0.95)
77 |     wav = f1 * f2
78 |     write(path, sr, wav.astype(np.int16))
79 |
80 |
81 | def load_metadata(dirname, filename='train.txt', split="|"):
82 |     with open(os.path.join(dirname, filename)) as f:
83 |         def split_line(line):
84 |             parts = line.strip().split(split)
85 |             wav_path = os.path.join(dirname, 'audio', parts[0])
86 |             text = parts[-1]
87 |             return wav_path, text
88 |         return [split_line(line) for line in f.readlines()]
89 |
90 |
91 | def to_gpu(x):
92 |     x = x.contiguous()
93 |
94 |     if torch.cuda.is_available():
95 |         x = x.cuda(non_blocking=True)
96 |     return torch.autograd.Variable(x)
97 |

--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from multiprocessing import cpu_count
4 |
5 | from common import preprocessor
6 | from hparams import hparams
7 | from tqdm import tqdm
8 |
9 |
10 | def preprocess(args, input_folders, output_dir, hparams):
11 |     num_utterances, mel_frames, timesteps = 0, 0, 0
12 |     max_text_lens, max_mel_lens, max_timestep_lens = [], [], []
13 |
14 |     for input_dir in input_folders:
15 |         wav_dir = os.path.join(output_dir, input_dir.split('/')[-1], 'audio')
16 |         mel_dir = os.path.join(output_dir, input_dir.split('/')[-1], 'mels')
17 |         os.makedirs(wav_dir, exist_ok=True)
18 |         os.makedirs(mel_dir, exist_ok=True)
19 |
20 |         metadata = preprocessor.build_from_path(hparams, input_dir, wav_dir, mel_dir, args.n_jobs, tqdm=tqdm)
21 |         with open(os.path.join(output_dir, input_dir.split('/')[-1], 'train.txt'), 'w') as f:
22 |             for m in metadata:
23 |                 f.write('|'.join([str(x) for x in m]) + '\n')
24 |         max_text_lens.append(max(len(m[3]) for m in metadata))
25 |         max_mel_lens.append(max(int(m[2]) for m in metadata))
26 |         max_timestep_lens.append(max(m[1] for m in metadata))
27 |         num_utterances += len(metadata)
28 |         mel_frames += sum([int(m[2]) for m in metadata])
29 |         timesteps += sum([int(m[1]) for m in metadata])
30 |
31 |     hours = timesteps / hparams.sample_rate / 3600
32 |     print(f'Wrote {num_utterances} utterances, {mel_frames} mel frames, {timesteps} audio timesteps ({hours:.2f} hours)')
33 |     print(f'Max input length (text chars): {max(max_text_lens)}')
34 |     print(f'Max mel frames length: {max(max_mel_lens)}')
35 |     print(f'Max audio timesteps length: {max(max_timestep_lens)}')
36 |
37 |
38 | def norm_data(args):
39 |
40 |     merge_books = (args.merge_books == 'True')
41 |
42 |     print('Selecting data folders...')
43 |     supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS', 'MANDARIN']
44 |     if args.dataset not in supported_datasets:
45 |         raise ValueError(f'dataset value entered {args.dataset} does not belong to supported datasets: {supported_datasets}')
46 |
47 |     if args.dataset.startswith('LJSpeech'):
48 |         return [os.path.join(args.base_dir, args.dataset)]
49 |
50 |     if args.dataset.startswith('MANDARIN'):
51 |         return [os.path.join(args.base_dir, 'data_mandarin', anchor) for anchor in hparams.anchor_dirs]
52 |
53 |     if args.dataset == 'M-AILABS':
54 |         supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU',
55 |                                'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA']
56 |         if args.language not in supported_languages:
57 |             raise ValueError(f'Please enter a supported language to use from M-AILABS dataset! \n{supported_languages}')
58 |
59 |         supported_voices = ['female', 'male', 'mix']
60 |         if args.voice not in supported_voices:
61 |             raise ValueError(f'Please enter a supported voice option to use from M-AILABS dataset! \n{supported_voices}')
62 |
63 |         path = os.path.join(args.base_dir, args.language, 'by_book', args.voice)
64 |         supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path, e))]
65 |         if args.reader not in supported_readers:
66 |             raise ValueError(f'Please enter a valid reader for your language and voice settings! \n{supported_readers}')
67 |
68 |         path = os.path.join(path, args.reader)
69 |         supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path, e))]
70 |         if merge_books:
71 |             return [os.path.join(path, book) for book in supported_books]
72 |
73 |         else:
74 |             if args.book not in supported_books:
75 |                 raise ValueError(f'Please enter a valid book for your reader settings! \n{supported_books}')
76 |             return [os.path.join(path, args.book)]
77 |
78 |
79 | def run_preprocess(args, hparams):
80 |     input_folders = norm_data(args)
81 |     output_folder = os.path.join(args.base_dir, args.output)
82 |     preprocess(args, input_folders, output_folder, hparams)
83 |
84 |
85 | def main():
86 |     print('initializing preprocessing...')
87 |     parser = argparse.ArgumentParser()
88 |     parser.add_argument('--base_dir', default='')
89 |     parser.add_argument('--hparams', default='', help='Hyperparameter overrides as a comma-separated list of name=value pairs')
90 |     parser.add_argument('--dataset', default='MANDARIN')
91 |     parser.add_argument('--language', default='en_US')
92 |     parser.add_argument('--voice', default='female')
93 |     parser.add_argument('--reader', default='mary_ann')
94 |     parser.add_argument('--merge_books', default='False')
95 |     parser.add_argument('--book', default='northandsouth')
96 |     parser.add_argument('--output', default='training_data')
97 |     parser.add_argument('--n_jobs', type=int, default=cpu_count())
98 |     args = parser.parse_args()
99 |
100 |     modified_hp = hparams.parse(args.hparams)
101 |
102 |     assert args.merge_books in ('False', 'True')
103 |
104 |     run_preprocess(args, modified_hp)
105 |
106 |
107 | if __name__ == '__main__':
108 |     main()
109 |

--------------------------------------------------------------------------------
/common/audio_processing.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import torch
29 | import numpy as np
30 | import librosa.util as librosa_util
31 | from scipy.signal import get_window
32 | from .utils import de_emphasize
33 |
34 |
35 | def window_sumsquare(window, n_frames, hop_length=256, win_length=1024,
36 |                      n_fft=1024, dtype=np.float32, norm=None):
37 |     """
38 |     # from librosa 0.6
39 |     Compute the sum-square envelope of a window function at a given hop length.
40 |
41 |     This is used to estimate modulation effects induced by windowing
42 |     observations in short-time Fourier transforms.
43 |
44 |     Parameters
45 |     ----------
46 |     window : string, tuple, number, callable, or list-like
47 |         Window specification, as in `get_window`
48 |
49 |     n_frames : int > 0
50 |         The number of analysis frames
51 |
52 |     hop_length : int > 0
53 |         The number of samples to advance between frames
54 |
55 |     win_length : [optional]
56 |         The length of the window function. By default, this matches `n_fft`.
57 |
58 |     n_fft : int > 0
59 |         The length of each analysis frame.
60 |
61 |     dtype : np.dtype
62 |         The data type of the output
63 |
64 |     Returns
65 |     -------
66 |     wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
67 |         The sum-squared envelope of the window function
68 |     """
69 |     if win_length is None:
70 |         win_length = n_fft
71 |
72 |     n = n_fft + hop_length * (n_frames - 1)
73 |     x = np.zeros(n, dtype=dtype)
74 |
75 |     # Compute the squared window at the desired length
76 |     win_sq = get_window(window, win_length, fftbins=True)
77 |     win_sq = librosa_util.normalize(win_sq, norm=norm)**2
78 |     win_sq = librosa_util.pad_center(win_sq, n_fft)
79 |
80 |     # Fill the envelope
81 |     for i in range(n_frames):
82 |         sample = i * hop_length
83 |         x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
84 |     return x
85 |
86 |
87 | def griffin_lim(magnitudes, stft_fn, n_iters=50, power=1.5):
88 |     """
89 |     PARAMS
90 |     ------
91 |     magnitudes: spectrogram magnitudes
92 |     stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
93 |     """
94 |     magnitudes = magnitudes.unsqueeze(0) ** power
95 |     angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
96 |     angles = angles.astype(np.float32)
97 |     angles = torch.autograd.Variable(torch.from_numpy(angles))
98 |     signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
99 |
100 |     for i in range(n_iters):
101 |         _, angles = stft_fn.transform(signal)
102 |         signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
103 |     return de_emphasize(signal.squeeze())
104 |
105 |
106 | def dynamic_range_compression(x, C=1, clip_val=1e-5):
107 |     """
108 |     PARAMS
109 |     ------
110 |     C: compression factor
111 |     """
112 |     return torch.log(torch.clamp(x, min=clip_val) * C)
113 |
114 |
115 | def dynamic_range_decompression(x, C=1):
116 |     """
117 |     PARAMS
118 |     ------
119 |     C: compression factor used to compress
120 |     """
121 |     return torch.exp(x) / C
122 |
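As a sanity check, `dynamic_range_compression` and `dynamic_range_decompression` above are exact inverses whenever the input magnitudes stay above `clip_val`. A small standalone sketch, assuming the repository root is on `PYTHONPATH` so `common` is importable:

```python
import torch
from common.audio_processing import dynamic_range_compression, dynamic_range_decompression

mags = torch.rand(80, 100) + 1e-3   # magnitudes safely above the 1e-5 clip_val
roundtrip = dynamic_range_decompression(dynamic_range_compression(mags))
assert torch.allclose(roundtrip, mags, atol=1e-6)   # log then exp recovers the input
```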
--------------------------------------------------------------------------------
/common/layers.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | # * Redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution.
11 | # * Neither the name of the NVIDIA CORPORATION nor the
12 | # names of its contributors may be used to endorse or promote products
13 | # derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 |
28 | import librosa
29 | import torch
30 | from common.audio_processing import dynamic_range_compression, dynamic_range_decompression
31 | from common.stft import STFT
32 |
33 |
34 | class LinearNorm(torch.nn.Module):
35 |     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
36 |         super(LinearNorm, self).__init__()
37 |         self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
38 |
39 |         # torch.nn.init.xavier_uniform_(self.linear_layer.weight,
40 |         #                               gain=torch.nn.init.calculate_gain(w_init_gain))
41 |
42 |     def forward(self, x):
43 |         return self.linear_layer(x)
44 |
45 |
46 | class ConvNorm(torch.nn.Module):
47 |     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
48 |                  padding=None, dilation=1, bias=True, w_init_gain='linear'):
49 |         super(ConvNorm, self).__init__()
50 |         if padding is None:
51 |             assert(kernel_size % 2 == 1)
52 |             padding = int(dilation * (kernel_size - 1) / 2)
53 |
54 |         self.conv = torch.nn.Conv1d(in_channels, out_channels,
55 |                                     kernel_size=kernel_size, stride=stride,
56 |                                     padding=padding, dilation=dilation,
57 |                                     bias=bias)
58 |
59 |         # torch.nn.init.xavier_uniform_(self.conv.weight,
60 |         #                               gain=torch.nn.init.calculate_gain(w_init_gain))
61 |
62 |     def forward(self, signal):
63 |         return self.conv(signal)
64 |
65 |
66 | class TacotronSTFT(torch.nn.Module):
67 |     def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
68 |                  n_mel_channels=80, sampling_rate=22050, mel_fmin=50.0, mel_fmax=7600.0):
69 |         super(TacotronSTFT, self).__init__()
70 |         self.n_mel_channels = n_mel_channels
71 |         self.sampling_rate = sampling_rate
72 |         self.stft_fn = STFT(filter_length, hop_length, win_length)
73 |         mel_basis = librosa.filters.mel(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
74 |         import numpy as np
75 |         inv_mel_basis = np.linalg.pinv(mel_basis)
76 |         mel_basis = torch.from_numpy(mel_basis).float()
77 |         inv_mel_basis = torch.from_numpy(inv_mel_basis).float()
78 |         self.register_buffer('mel_basis', mel_basis)
79 |         self.register_buffer('inv_mel_basis', inv_mel_basis)
80 |
81 |
82 |     def spectral_normalize(self, magnitudes):
83 |         return dynamic_range_compression(magnitudes)
84 |
85 |     def spectral_de_normalize(self, magnitudes):
86 |         return dynamic_range_decompression(magnitudes)
87 |
88 |     def mel_spectrogram(self, y):
89 |         """Computes mel-spectrograms from a batch of waves
90 |         PARAMS
91 |         ------
92 |         y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
93 |
94 |         RETURNS
95 |         -------
96 |         mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
97 |         """
98 |         #assert(torch.min(y.data) >= -1)
99 |         #assert(torch.max(y.data) <= 1)
100 |
101 |         magnitudes, phases = self.stft_fn.transform(y)
102 |         magnitudes = magnitudes.data
103 |         mel_output = torch.matmul(self.mel_basis, magnitudes)
104 |         mel_output = self.spectral_normalize(mel_output)
105 |         return mel_output
106 |
107 |     def inv_mel_spectrogram(self, mel):
108 |         """Converts a mel-spectrogram back to approximate linear-scale magnitudes
109 |         PARAMS
110 |         ------
111 |         mel: torch.FloatTensor of shape (n_mel_channels, T)
112 |
113 |         RETURNS
114 |         -------
115 |         magnitudes: torch.FloatTensor of linear-frequency magnitudes
116 |         """
117 |         mel = self.spectral_de_normalize(mel.float())
118 |         magnitudes = torch.matmul(self.inv_mel_basis, mel.data)
119 |         magnitudes = torch.max(magnitudes.clone().detach().fill_(1e-10), magnitudes)
120 |         return magnitudes.data
121 |
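A short usage sketch for `TacotronSTFT`, assuming the repository root is on `PYTHONPATH` and that the `STFT` helper in `common/stft.py` (not shown in this section) accepts a `(B, T)` batch; the random waveform is just a stand-in for real audio. Note that the positional `librosa.filters.mel(...)` call in the constructor assumes an older librosa (roughly < 0.10), matching the pinned 19.08 container.

```python
import torch
from common.layers import TacotronSTFT

taco_stft = TacotronSTFT()                      # defaults: 1024-point FFT, 256 hop, 80 mels, 22050 Hz
wav = torch.rand(1, 22050) * 2 - 1              # (B, T) waveform in [-1, 1]
mel = taco_stft.mel_spectrogram(wav)            # (B, 80, T_frames), log-compressed
linear = taco_stft.inv_mel_spectrogram(mel[0])  # approximate linear magnitudes, e.g. for Griffin-Lim
```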
-------------------------------------------------------------------------------- /common/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from hparams import hparams 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def dc_notch_filter(wav): 10 | # code from speex 11 | notch_radius = 0.982 12 | den = notch_radius ** 2 + 0.7 * (1 - notch_radius) ** 2 13 | b = np.array([1, -2, 1]) * notch_radius 14 | a = np.array([1, -2 * notch_radius, den]) 15 | return signal.lfilter(b, a, wav) 16 | 17 | def load_wav(path, sr): 18 | return librosa.core.load(path, sr=sr)[0] 19 | 20 | def save_wav(wav, path): 21 | wav = dc_notch_filter(wav) 22 | wav = wav / np.abs(wav).max() * 0.999 23 | f1 = 0.5 * 32767 / max(0.01, np.max(np.abs(wav))) 24 | f2 = np.sign(wav) * np.power(np.abs(wav), 0.95) 25 | wav = f1 * f2 26 | #proposed by @dsmiller 27 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 28 | 29 | def preemphasis(wav, k): 30 | return signal.lfilter([1, -k], [1], wav) 31 | 32 | def inv_preemphasis(wav, k): 33 | return signal.lfilter([1], [1, -k], wav) 34 | 35 | def trim_silence(wav): 36 | '''Trim leading and trailing silence 37 | 38 | Useful for the M-AILABS dataset if we choose to trim the extra 0.5 s of silence at the beginning and end. 39 | ''' 40 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset. 41 | return librosa.effects.trim(wav, top_db=hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] 42 | 43 | def get_hop_size(): 44 | hop_size = hparams.hop_size 45 | if hop_size is None: 46 | assert hparams.frame_shift_ms is not None 47 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 48 | return hop_size 49 | 50 | def linearspectrogram(wav): 51 | D = _stft(preemphasis(wav, hparams.preemphasis)) 52 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 53 | 54 | if hparams.signal_normalization: 55 | return _normalize(S) 56 | return S 57 | 58 | def melspectrogram(wav): 59 | D = _stft(preemphasis(wav, hparams.preemphasis)) 60 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 61 | 62 | if hparams.signal_normalization: 63 | return _normalize(S) 64 | return S 65 | 66 | def inv_linear_spectrogram(linear_spectrogram): 67 | '''Converts a linear spectrogram to a waveform using librosa''' 68 | if hparams.signal_normalization: 69 | D = _denormalize(linear_spectrogram) 70 | else: 71 | D = linear_spectrogram 72 | 73 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 74 | return inv_preemphasis(_griffin_lim(S ** hparams.power), hparams.preemphasis) 75 | 76 | def inv_mel_spectrogram(mel_spectrogram): 77 | '''Converts a mel spectrogram to a waveform using librosa''' 78 | if hparams.signal_normalization: 79 | D = _denormalize(mel_spectrogram) 80 | else: 81 | D = mel_spectrogram 82 | 83 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert back to linear 84 | return inv_preemphasis(_griffin_lim(S ** hparams.power), hparams.preemphasis) 85 | 86 | def _griffin_lim(S): 87 | '''librosa implementation of Griffin-Lim 88 | Based on https://github.com/librosa/librosa/issues/434 89 | ''' 90 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 91 | S_complex = np.abs(S).astype(np.complex128) # np.complex is a deprecated alias; use the explicit dtype 92 | y = _istft(S_complex * angles) 93 | for i in range(hparams.griffin_lim_iters): 94 | angles = np.exp(1j * np.angle(_stft(y))) 95 | y
= _istft(S_complex * angles) 96 | return y 97 | 98 | def _stft(y): 99 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(), win_length=hparams.win_size) 100 | 101 | def _istft(y): 102 | return librosa.istft(y, hop_length=get_hop_size(), win_length=hparams.win_size) 103 | 104 | # Conversions 105 | _mel_basis = None 106 | _inv_mel_basis = None 107 | 108 | def _linear_to_mel(spectrogram): 109 | global _mel_basis 110 | if _mel_basis is None: 111 | _mel_basis = _build_mel_basis() 112 | return np.dot(_mel_basis, spectrogram) 113 | 114 | def _mel_to_linear(mel_spectrogram): 115 | global _inv_mel_basis 116 | if _inv_mel_basis is None: 117 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 118 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 119 | 120 | def _build_mel_basis(): 121 | assert hparams.fmax <= hparams.sample_rate // 2 122 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 123 | fmin=hparams.fmin, fmax=hparams.fmax) 124 | 125 | def _amp_to_db(x): 126 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 127 | return 20 * np.log10(np.maximum(min_level, x)) 128 | 129 | def _db_to_amp(x): 130 | return np.power(10.0, x * 0.05) 131 | 132 | def _normalize(S): 133 | if hparams.allow_clipping_in_normalization: 134 | if hparams.symmetric_mels: 135 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 136 | -hparams.max_abs_value, hparams.max_abs_value) 137 | else: 138 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 139 | 140 | if hparams.symmetric_mels: 141 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 142 | else: 143 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 144 | 145 | def _denormalize(D): 146 | if hparams.allow_clipping_in_normalization: 147 | if hparams.symmetric_mels: 148 | return (((np.clip(D, -hparams.max_abs_value, 149 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 150 | + hparams.min_level_db) 151 | else: 152 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 153 | 154 | if hparams.symmetric_mels: 155 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 156 | else: 157 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 158 | -------------------------------------------------------------------------------- /common/stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | Copyright (c) 2017, Prem Seetharaman 5 | All rights reserved. 6 | 7 | * Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution.
16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import torch 34 | import numpy as np 35 | import torch.nn.functional as F 36 | from torch.autograd import Variable 37 | from scipy.signal import get_window, lfilter 38 | from librosa.util import pad_center, tiny 39 | from common.audio_processing import window_sumsquare 40 | 41 | 42 | class STFT(torch.nn.Module): 43 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 44 | def __init__(self, filter_length=1024, hop_length=256, win_length=1024, window='hann'): 45 | super(STFT, self).__init__() 46 | self.filter_length = filter_length 47 | self.hop_length = hop_length 48 | self.win_length = win_length 49 | self.window = window 50 | self.forward_transform = None 51 | scale = self.filter_length / self.hop_length 52 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 53 | 54 | cutoff = int((self.filter_length / 2 + 1)) 55 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 56 | np.imag(fourier_basis[:cutoff, :])]) 57 | 58 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 59 | inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 60 | 61 | if window is not None: 62 | assert(filter_length >= win_length) 63 | # get window and zero center pad it to filter_length 64 | fft_window = get_window(window, win_length, fftbins=True) 65 | fft_window = pad_center(fft_window, filter_length) 66 | fft_window = torch.from_numpy(fft_window).float() 67 | 68 | # window the bases 69 | forward_basis *= fft_window 70 | inverse_basis *= fft_window 71 | 72 | self.register_buffer('forward_basis', forward_basis.float()) 73 | self.register_buffer('inverse_basis', inverse_basis.float()) 74 | 75 | def transform(self, input_data): 76 | num_batches = input_data.size(0) 77 | num_samples = input_data.size(1) 78 | 79 | self.num_samples = num_samples 80 | 81 | # similar to librosa, reflect-pad the input 82 | input_data = input_data.view(num_batches, 1, num_samples) 83 | input_data = F.pad( 84 | input_data.unsqueeze(1), 85 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 86 | mode='reflect') 87 | input_data = input_data.squeeze(1) 88 | 89 | forward_transform = F.conv1d( 90 | input_data, 91 | Variable(self.forward_basis, requires_grad=False), 92 | stride=self.hop_length, 93 | padding=0) 94 | 95 | cutoff = int((self.filter_length / 2) + 1) 96 | real_part = forward_transform[:, :cutoff, :] 97 | imag_part = forward_transform[:, cutoff:, :] 98 | 99 | magnitude = torch.sqrt(real_part**2 + imag_part**2) 100 | phase 
= torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data)) 101 | 102 | return magnitude, phase 103 | 104 | def inverse(self, magnitude, phase): 105 | recombine_magnitude_phase = torch.cat( 106 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) 107 | 108 | inverse_transform = F.conv_transpose1d( 109 | recombine_magnitude_phase, 110 | Variable(self.inverse_basis, requires_grad=False), 111 | stride=self.hop_length, 112 | padding=0) 113 | 114 | if self.window is not None: 115 | window_sum = window_sumsquare( 116 | self.window, magnitude.size(-1), hop_length=self.hop_length, 117 | win_length=self.win_length, n_fft=self.filter_length, 118 | dtype=np.float32) 119 | # remove modulation effects 120 | approx_nonzero_indices = torch.from_numpy( 121 | np.where(window_sum > tiny(window_sum))[0]) 122 | window_sum = torch.autograd.Variable(torch.from_numpy(window_sum), requires_grad=False) 123 | window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum 124 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] 125 | 126 | # scale by hop ratio 127 | inverse_transform *= float(self.filter_length) / self.hop_length 128 | 129 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] 130 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] 131 | 132 | return inverse_transform 133 | 134 | def forward(self, input_data): 135 | self.magnitude, self.phase = self.transform(input_data) 136 | reconstruction = self.inverse(self.magnitude, self.phase) 137 | return reconstruction 138 | -------------------------------------------------------------------------------- /tacotron2/loader.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | # 26 | # ***************************************************************************** 27 | 28 | import argparse 29 | import torch 30 | from tacotron2.text import symbols 31 | from tacotron2.model import Tacotron2 32 | 33 | 34 | def parse_tacotron2_args(parent, add_help=False): 35 | """ 36 | Parse commandline arguments. 37 | """ 38 | parser = argparse.ArgumentParser(parents=[parent], add_help=add_help) 39 | 40 | # misc parameters 41 | parser.add_argument('--mask-padding', action='store_true', help='Use mask padding') # note: type=bool would treat any non-empty string, including "False", as True 42 | parser.add_argument('--n-mel-channels', default=80, type=int, help='Number of bins in mel-spectrograms') 43 | parser.add_argument('--mel-pad-val', default=-4, type=float, help='Mel-spectrogram padding value, corresponding to silence') 44 | 45 | # symbols parameters 46 | len_symbols = len(symbols) 47 | symbols_group = parser.add_argument_group('symbols parameters') 48 | symbols_group.add_argument('--n-symbols', default=len_symbols, type=int, help='Number of symbols in dictionary') 49 | symbols_group.add_argument('--symbols-embedding-dim', default=512, type=int, help='Input embedding dimension') 50 | 51 | 52 | # encoder parameters 53 | encoder = parser.add_argument_group('encoder parameters') 54 | encoder.add_argument('--encoder-kernel-size', default=5, type=int, help='Encoder kernel size') 55 | encoder.add_argument('--encoder-n-convolutions', default=3, type=int, help='Number of encoder convolutions') 56 | encoder.add_argument('--encoder-embedding-dim', default=512, type=int, help='Encoder embedding dimension') 57 | 58 | # decoder parameters 59 | decoder = parser.add_argument_group('decoder parameters') 60 | decoder.add_argument('--n-frames-per-step', default=3, type=int, help='Number of frames processed per step') 61 | decoder.add_argument('--decoder-rnn-dim', default=1024, type=int, help='Number of units in decoder LSTM') 62 | decoder.add_argument('--decoder-n-lstms', default=2, type=int, help='Number of decoder LSTM layers') 63 | decoder.add_argument('--prenet-dim', default=256, type=int, help='Number of ReLU units in prenet layers') 64 | decoder.add_argument('--max-decoder-steps', default=1000, type=int, help='Maximum number of output mel-spectrogram frames') 65 | decoder.add_argument('--gate-threshold', default=0.5, type=float, help='Probability threshold for stop token') 66 | decoder.add_argument('--p-decoder-dropout', default=0.1, type=float, help='Dropout probability for decoder LSTM') 67 | 68 | # attention parameters 69 | attention = parser.add_argument_group('attention parameters') 70 | attention.add_argument('--attention-dim', default=128, type=int, help='Dimension of attention hidden representation') 71 | 72 | # location layer parameters 73 | location = parser.add_argument_group('location parameters') 74 | location.add_argument('--attention-location-n-filters', default=32, type=int, help='Number of filters for location-sensitive attention') 75 | location.add_argument('--attention-location-kernel-size', default=31, type=int, help='Kernel size for location-sensitive attention') 76 | 77 | # Mel-post processing network parameters 78 | postnet = parser.add_argument_group('postnet parameters') 79 | postnet.add_argument('--postnet-embedding-dim', default=512, type=int, help='Postnet embedding dimension') 80 | postnet.add_argument('--postnet-kernel-size', default=5, type=int, help='Postnet kernel size') 81 | postnet.add_argument('--postnet-n-convolutions', default=5, type=int, help='Number of postnet convolutions') 82 | 83 | return parser 84 | 85 | 86 | def _batchnorm_to_float(module): 87 | 
"""Converts batch norm to FP32""" 88 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): 89 | module.float() 90 | for child in module.children(): 91 | _batchnorm_to_float(child) 92 | return module 93 | 94 | 95 | def _init_bn(module): 96 | if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): 97 | if module.affine: 98 | module.weight.data.uniform_() 99 | for child in module.children(): 100 | _init_bn(child) 101 | 102 | 103 | def get_tacotron2_model(args, speaker_num, is_training=True): 104 | config = dict( 105 | # optimization 106 | mask_padding=args.mask_padding, 107 | # audio 108 | n_mel_channels=args.n_mel_channels, 109 | # symbols 110 | n_symbols=args.n_symbols * speaker_num, 111 | symbols_embedding_dim=args.symbols_embedding_dim, 112 | # encoder 113 | encoder_kernel_size=args.encoder_kernel_size, 114 | encoder_n_convolutions=args.encoder_n_convolutions, 115 | encoder_embedding_dim=args.encoder_embedding_dim, 116 | # attention 117 | attention_dim=args.attention_dim, 118 | # attention location 119 | attention_location_n_filters=args.attention_location_n_filters, 120 | attention_location_kernel_size=args.attention_location_kernel_size, 121 | # decoder 122 | n_frames_per_step=args.n_frames_per_step, 123 | decoder_rnn_dim=args.decoder_rnn_dim, 124 | prenet_dim=args.prenet_dim, 125 | max_decoder_steps=args.max_decoder_steps, 126 | gate_threshold=args.gate_threshold, 127 | decoder_n_lstms=args.decoder_n_lstms, 128 | p_decoder_dropout=args.p_decoder_dropout, 129 | # postnet 130 | postnet_embedding_dim=args.postnet_embedding_dim, 131 | postnet_kernel_size=args.postnet_kernel_size, 132 | postnet_n_convolutions=args.postnet_n_convolutions, 133 | ) 134 | 135 | model = Tacotron2(**config) 136 | 137 | if is_training: 138 | _init_bn(model) 139 | 140 | return model.cuda() 141 | -------------------------------------------------------------------------------- /tacotron2/data_function.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | 28 | import random 29 | import numpy as np 30 | import os 31 | import torch 32 | import torch.utils.data 33 | from common.layers import TacotronSTFT 34 | from common.utils import load_wav_to_torch, load_metadata 35 | from tacotron2.text import text_to_sequence 36 | 37 | 38 | class TextMelDataset(torch.utils.data.Dataset): 39 | """ 40 | 1) loads audio, text pairs 41 | 2) normalizes text and converts it to sequences of symbol IDs 42 | 3) computes mel-spectrograms from audio files. 43 | """ 44 | def __init__(self, args, anchor_dirs): 45 | self.speaker_num = len(anchor_dirs) 46 | self.meta_dirs = [os.path.join(args.dataset_path, anchor_dirs[i]) for i in range(self.speaker_num)] 47 | self.metadatas = [load_metadata(meta_dir) for meta_dir in self.meta_dirs] 48 | self.offsets = [0] * self.speaker_num 49 | self.text_cleaners = args.text_cleaners 50 | self.sampling_rate = args.sampling_rate 51 | self.load_mel_from_disk = args.load_mel_from_disk 52 | self.stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length, 53 | args.n_mel_channels, args.sampling_rate, args.mel_fmin, 54 | args.mel_fmax) 55 | random.seed(1234) 56 | for i in range(self.speaker_num): 57 | random.shuffle(self.metadatas[i]) 58 | 59 | def get_mel_text_pair(self, speaker_id, metadata): 60 | mel_path, text = metadata 61 | seq_len = len(text) 62 | seq = self.get_sequence(text, speaker_id) 63 | mel = self.get_mel(mel_path) 64 | return (seq, mel, seq_len) 65 | 66 | def get_mel(self, filename): 67 | if not self.load_mel_from_disk: 68 | audio = load_wav_to_torch(filename) 69 | melspec = self.stft.mel_spectrogram(audio.unsqueeze(0)) 70 | melspec = torch.squeeze(melspec, 0) 71 | else: 72 | melspec = torch.from_numpy(np.load(filename)) 73 | assert melspec.size(0) == self.stft.n_mel_channels, ( 74 | 'Mel dimension mismatch: given {}, expected {}'.format( 75 | melspec.size(0), self.stft.n_mel_channels)) 76 | 77 | return melspec 78 | 79 | def get_sequence(self, text, speaker_id): 80 | return text_to_sequence(text, speaker_id, self.text_cleaners) 81 | 82 | def __getitem__(self, index): 83 | group = [self.get_mel_text_pair(i, self.metadatas[i][self.offsets[i]]) for i in range(self.speaker_num)] 84 | self.offsets = [(self.offsets[i] + 1) % len(self.metadatas[i]) for i in range(self.speaker_num)] 85 | return group 86 | 87 | def __len__(self): 88 | return sum([len(m) for m in self.metadatas]) // self.speaker_num 89 | 90 | 91 | class TextMelCollate(): 92 | """ Zero-pads model inputs and targets based on number of frames per step 93 | """ 94 | def __init__(self, args): 95 | self.n_frames_per_step = args.n_frames_per_step 96 | self.mel_pad_val = args.mel_pad_val 97 | 98 | def __call__(self, batch): 99 | """Collates a training batch from normalized text and mel-spectrograms 100 | PARAMS 101 | ------ 102 | batch: [text_normalized, mel_normalized] 103 | """ 104 | # Flatten the batch 105 | batch = [sample for group
in batch for sample in group] 106 | 107 | # Right zero-pad all one-hot text sequences to max input length 108 | seq_lens, ids_sorted_decreasing = torch.sort( 109 | torch.IntTensor([len(x[0]) for x in batch]), 110 | dim=0, descending=True) 111 | max_seq_len = seq_lens[0] 112 | 113 | seqs = [] 114 | for i in range(len(ids_sorted_decreasing)): 115 | seq = batch[ids_sorted_decreasing[i]][0] 116 | seqs.append(np.pad(seq, [0, max_seq_len - len(seq)], mode='constant')) 117 | 118 | # Right zero-pad mel-spec 119 | num_mels = batch[0][1].size(0) 120 | max_target_len = max([x[1].size(1) for x in batch]) 121 | if max_target_len % self.n_frames_per_step != 0: 122 | max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step 123 | assert max_target_len % self.n_frames_per_step == 0 124 | 125 | # include mel padded and gate padded 126 | targets, reduced_targets = [], [] 127 | gates = np.zeros([len(batch), max_target_len], dtype=np.float32) 128 | target_lengths = torch.IntTensor(len(batch)) 129 | for i in range(len(ids_sorted_decreasing)): 130 | mel = batch[ids_sorted_decreasing[i]][1] 131 | target_lengths[i] = mel.shape[1] 132 | gates[i, mel.shape[1] - 1:] = 1 133 | padded_mel = np.pad(mel, [(0, 0), (0, max_target_len - mel.size(1))], mode='constant', constant_values=self.mel_pad_val) 134 | targets.append(padded_mel) 135 | reduced_mel = padded_mel[:, ::self.n_frames_per_step] 136 | reduced_targets.append(reduced_mel) 137 | 138 | seqs = torch.from_numpy(np.stack(seqs)) 139 | targets = torch.from_numpy(np.stack(targets)) 140 | reduced_targets = torch.from_numpy(np.stack(reduced_targets)) 141 | gates = torch.from_numpy(gates) 142 | return seqs, seq_lens, targets, reduced_targets, gates, target_lengths 143 | 144 | 145 | def to_gpu(x): 146 | x = x.contiguous() 147 | if torch.cuda.is_available(): 148 | x = x.cuda(non_blocking=True) 149 | return x 150 | 151 | 152 | def batch_to_gpu(batch): 153 | texts, text_lengths, targets, reduced_targets, gates, target_lengths = batch 154 | x = (to_gpu(texts).long(), to_gpu(text_lengths).int(), to_gpu(reduced_targets).float(), to_gpu(target_lengths).int()) 155 | y = (targets, gates) 156 | num_frames = torch.sum(target_lengths) 157 | return (x, y, num_frames) 158 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | 28 | import argparse 29 | import numpy as np 30 | import os 31 | import sys 32 | import time 33 | import torch 34 | from apex import amp 35 | from common.audio_processing import griffin_lim 36 | from common.layers import TacotronSTFT 37 | from common.utils import save_wav 38 | from scipy.io.wavfile import write 39 | from tacotron2.loader import parse_tacotron2_args 40 | from tacotron2.loader import get_tacotron2_model 41 | from tacotron2.text import text_to_sequence 42 | from train import parse_training_args 43 | from dllogger.logger import LOGGER 44 | import dllogger.logger as dllg 45 | from dllogger.autologging import log_hardware, log_args 46 | 47 | 48 | def parse_args(parser): 49 | """ 50 | Parse commandline arguments. 51 | """ 52 | parser.add_argument('-i', '--input-file', type=str, default="text.txt", help='full path to the input text (phrases separated by new lines)') 53 | parser.add_argument('--checkpoint', type=str, default="logs/checkpoint_latest.pt", help='full path to the Tacotron2 model checkpoint file') 54 | parser.add_argument('-id', '--speaker-id', default=0, type=int, help='Speaker identity') 55 | parser.add_argument('-sn', '--speaker-num', default=1, type=int, help='Number of speakers') 56 | parser.add_argument('--include-warmup', action='store_true', help='Include warmup') 57 | 58 | return parser 59 | 60 | 61 | def load_checkpoint(checkpoint_path, model): 62 | assert os.path.isfile(checkpoint_path) 63 | model.load_state_dict(torch.load(checkpoint_path)) 64 | print(f"Loaded checkpoint: {checkpoint_path}") 65 | return model 66 | 67 | 68 | def load_and_setup_model(parser, args): 69 | checkpoint_path = args.checkpoint 70 | parser = parse_tacotron2_args(parser, add_help=False) 71 | args, _ = parser.parse_known_args() 72 | model = get_tacotron2_model(args, args.speaker_num, is_training=False) 73 | model.restore_checkpoint(checkpoint_path) 74 | model.eval() 75 | 76 | if args.amp_run: 77 | model, _ = amp.initialize(model, [], opt_level='O1') 78 | 79 | return model, args 80 | 81 | 82 | # taken from tacotron2/data_function.py:TextMelCollate.__call__ 83 | def pad_sequences(sequences): 84 | # Right zero-pad all text sequences to max input length 85 | text_lengths, ids_sorted_decreasing = torch.sort( 86 | torch.IntTensor([len(x) for x in sequences]), 87 | dim=0, descending=True) 88 | max_text_len = text_lengths[0] 89 | 90 | texts = [] 91 | for i in range(len(ids_sorted_decreasing)): 92 | text = sequences[ids_sorted_decreasing[i]] 93 | texts.append(np.pad(text, [0, max_text_len - len(text)], mode='constant')) 94 | 95 | texts = torch.from_numpy(np.stack(texts)) 96 | return texts,
text_lengths, ids_sorted_decreasing 97 | 98 | 99 | def prepare_input_sequence(texts, speaker_id): 100 | sequences = [text_to_sequence(text, speaker_id, ['basic_cleaners'])[:] for text in texts] 101 | texts, text_lengths, ids_sorted_decreasing = pad_sequences(sequences) 102 | 103 | if torch.cuda.is_available(): 104 | texts = texts.cuda().long() 105 | text_lengths = text_lengths.cuda().int() 106 | else: 107 | texts = texts.long() 108 | text_lengths = text_lengths.int() 109 | 110 | return texts, text_lengths, ids_sorted_decreasing 111 | 112 | 113 | class MeasureTime(): 114 | def __init__(self, measurements, key): 115 | self.measurements = measurements 116 | self.key = key 117 | 118 | def __enter__(self): 119 | torch.cuda.synchronize() 120 | self.t0 = time.perf_counter() 121 | 122 | def __exit__(self, exc_type, exc_value, exc_traceback): 123 | torch.cuda.synchronize() 124 | self.measurements[self.key] = time.perf_counter() - self.t0 125 | 126 | 127 | def main(): 128 | """ 129 | Launches text to speech (inference). 130 | Inference is executed on a single GPU. 131 | """ 132 | parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference') 133 | parser = parse_training_args(parser) 134 | parser = parse_args(parser) 135 | args, _ = parser.parse_known_args() 136 | 137 | LOGGER.set_model_name("Tacotron2_PyT") 138 | LOGGER.set_backends([ 139 | dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), 140 | dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) 141 | ]) 142 | LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) 143 | LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 144 | LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 145 | 146 | model, args = load_and_setup_model(parser, args) 147 | 148 | log_hardware() 149 | log_args(args) 150 | 151 | try: 152 | f = open(args.input_file) 153 | sentences = list(map(lambda s : s.strip(), f.readlines())) 154 | except UnicodeDecodeError: 155 | f = open(args.input_file, encoding='gbk') 156 | sentences = list(map(lambda s : s.strip(), f.readlines())) 157 | 158 | os.makedirs(args.output_dir, exist_ok=True) 159 | 160 | LOGGER.iteration_start() 161 | 162 | measurements = {} 163 | 164 | sequences, text_lengths, ids_sorted_decreasing = prepare_input_sequence(sentences, args.speaker_id) 165 | 166 | with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): 167 | outputs = model.infer(sequences, text_lengths) 168 | _, mels, _, _, mel_lengths = [output.cpu() for output in outputs] 169 | 170 | tacotron2_infer_perf = mels.size(0)*mels.size(2)/measurements['tacotron2_time'] 171 | 172 | LOGGER.log(key="tacotron2_frames_per_sec", value=tacotron2_infer_perf) 173 | LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time']) 174 | LOGGER.log(key="latency", value=(measurements['tacotron2_time'])) 175 | LOGGER.iteration_stop() 176 | LOGGER.finish() 177 | 178 | # recover to the original order and concatenate 179 | stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length, 180 | args.n_mel_channels, args.sampling_rate, args.mel_fmin, args.mel_fmax) 181 | ids_sorted_decreasing = ids_sorted_decreasing.numpy().tolist() 182 | mels = [mel[:, :length] for mel, length in zip(mels, mel_lengths)] 183 | mels = [mels[ids_sorted_decreasing.index(i)] for i in range(len(ids_sorted_decreasing))] 184 | magnitudes = stft.inv_mel_spectrogram(torch.cat(mels, axis=-1)) 185 | wav = 
griffin_lim(magnitudes, stft.stft_fn) 186 | save_wav(wav, os.path.join(args.output_dir, 'eval.wav')) 187 | np.save(os.path.join(args.output_dir, 'eval.npy'), np.concatenate(mels, axis=-1), allow_pickle=False) 188 | 189 | 190 | if __name__ == '__main__': 191 | main() 192 | -------------------------------------------------------------------------------- /dllogger/tags.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 MLBenchmark Group. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | # 16 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 17 | # 18 | # Licensed under the Apache License, Version 2.0 (the "License"); 19 | # you may not use this file except in compliance with the License. 20 | # You may obtain a copy of the License at 21 | # 22 | # http://www.apache.org/licenses/LICENSE-2.0 23 | # 24 | # Unless required by applicable law or agreed to in writing, software 25 | # distributed under the License is distributed on an "AS IS" BASIS, 26 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | # See the License for the specific language governing permissions and 28 | # limitations under the License. 29 | 30 | # Common values reported 31 | 32 | VALUE_EPOCH = "epoch" 33 | VALUE_ITERATION = "iteration" 34 | VALUE_ACCURACY = "accuracy" 35 | VALUE_BLEU = "bleu" 36 | VALUE_TOP1 = "top1" 37 | VALUE_TOP5 = "top5" 38 | VALUE_BBOX_MAP = "bbox_map" 39 | VALUE_MASK_MAP = "mask_map" 40 | VALUE_BCE = "binary_cross_entropy" 41 | 42 | 43 | # Timed blocks (used with timed_function & timed_block) 44 | # For each there should be *_start and *_stop tags defined 45 | 46 | RUN_BLOCK = "run" 47 | SETUP_BLOCK = "setup" 48 | PREPROC_BLOCK = "preproc" 49 | 50 | TRAIN_BLOCK = "train" 51 | TRAIN_PREPROC_BLOCK = "train_preproc" 52 | TRAIN_EPOCH_BLOCK = "train_epoch" 53 | TRAIN_EPOCH_PREPROC_BLOCK = "train_epoch_preproc" 54 | TRAIN_CHECKPOINT_BLOCK = "train_checkpoint" 55 | TRAIN_ITER_BLOCK = "train_iteration" 56 | 57 | EVAL_BLOCK = "eval" 58 | EVAL_ITER_BLOCK = "eval_iteration" 59 | 60 | #TODO: to remove?
61 | TIMED_BLOCKS = { 62 | RUN_BLOCK, 63 | SETUP_BLOCK, 64 | PREPROC_BLOCK, 65 | TRAIN_BLOCK, 66 | TRAIN_PREPROC_BLOCK, 67 | TRAIN_EPOCH_BLOCK, 68 | TRAIN_EPOCH_PREPROC_BLOCK, 69 | TRAIN_CHECKPOINT_BLOCK, 70 | TRAIN_ITER_BLOCK, 71 | EVAL_BLOCK, 72 | EVAL_ITER_BLOCK, 73 | } 74 | 75 | 76 | # Events 77 | 78 | RUN_INIT = "run_init" 79 | 80 | SETUP_START = "setup_start" 81 | SETUP_STOP = "setup_stop" 82 | 83 | PREPROC_START = "preproc_start" 84 | PREPROC_STOP = "preproc_stop" 85 | 86 | RUN_START = "run_start" 87 | RUN_STOP = "run_stop" 88 | RUN_FINAL = "run_final" 89 | 90 | TRAIN_CHECKPOINT_START = "train_checkpoint_start" 91 | TRAIN_CHECKPOINT_STOP = "train_checkpoint_stop" 92 | 93 | TRAIN_PREPROC_START = "train_preproc_start" 94 | TRAIN_PREPROC_STOP = "train_preproc_stop" 95 | 96 | TRAIN_EPOCH_PREPROC_START = "train_epoch_preproc_start" 97 | TRAIN_EPOCH_PREPROC_STOP = "train_epoch_preproc_stop" 98 | 99 | TRAIN_ITER_START = "train_iter_start" 100 | TRAIN_ITER_STOP = "train_iter_stop" 101 | 102 | TRAIN_EPOCH_START = "train_epoch_start" 103 | TRAIN_EPOCH_STOP = "train_epoch_stop" 104 | 105 | 106 | # MLPerf specific tags 107 | 108 | RUN_CLEAR_CACHES = "run_clear_caches" 109 | 110 | PREPROC_NUM_TRAIN_EXAMPLES = "preproc_num_train_examples" 111 | PREPROC_NUM_EVAL_EXAMPLES = "preproc_num_eval_examples" 112 | PREPROC_TOKENIZE_TRAINING = "preproc_tokenize_training" 113 | PREPROC_TOKENIZE_EVAL = "preproc_tokenize_eval" 114 | PREPROC_VOCAB_SIZE = "preproc_vocab_size" 115 | 116 | RUN_SET_RANDOM_SEED = "run_set_random_seed" 117 | 118 | INPUT_SIZE = "input_size" 119 | INPUT_BATCH_SIZE = "input_batch_size" 120 | INPUT_ORDER = "input_order" 121 | INPUT_SHARD = "input_shard" 122 | INPUT_BN_SPAN = "input_bn_span" 123 | 124 | INPUT_CENTRAL_CROP = "input_central_crop" 125 | INPUT_CROP_USES_BBOXES = "input_crop_uses_bboxes" 126 | INPUT_DISTORTED_CROP_MIN_OBJ_COV = "input_distorted_crop_min_object_covered" 127 | INPUT_DISTORTED_CROP_RATIO_RANGE = "input_distorted_crop_aspect_ratio_range" 128 | INPUT_DISTORTED_CROP_AREA_RANGE = "input_distorted_crop_area_range" 129 | INPUT_DISTORTED_CROP_MAX_ATTEMPTS = "input_distorted_crop_max_attempts" 130 | INPUT_MEAN_SUBTRACTION = "input_mean_subtraction" 131 | INPUT_RANDOM_FLIP = "input_random_flip" 132 | 133 | INPUT_RESIZE = "input_resize" 134 | INPUT_RESIZE_ASPECT_PRESERVING = "input_resize_aspect_preserving" 135 | 136 | 137 | # Opt 138 | 139 | OPT_NAME = "opt_name" 140 | 141 | OPT_LR = "opt_learning_rate" 142 | OPT_MOMENTUM = "opt_momentum" 143 | 144 | OPT_WEIGHT_DECAY = "opt_weight_decay" 145 | 146 | OPT_HP_ADAM_BETA1 = "opt_hp_Adam_beta1" 147 | OPT_HP_ADAM_BETA2 = "opt_hp_Adam_beta2" 148 | OPT_HP_ADAM_EPSILON = "opt_hp_Adam_epsilon" 149 | 150 | OPT_LR_WARMUP_STEPS = "opt_learning_rate_warmup_steps" 151 | 152 | 153 | # Train 154 | 155 | TRAIN_LOOP = "train_loop" 156 | TRAIN_EPOCH = "train_epoch" 157 | TRAIN_CHECKPOINT = "train_checkpoint" 158 | TRAIN_LOSS = "train_loss" 159 | TRAIN_ITERATION_LOSS = "train_iteration_loss" 160 | 161 | 162 | # Eval 163 | 164 | EVAL_START = "eval_start" 165 | EVAL_SIZE = "eval_size" 166 | EVAL_TARGET = "eval_target" 167 | EVAL_ACCURACY = "eval_accuracy" 168 | EVAL_STOP = "eval_stop" 169 | 170 | 171 | # Perf 172 | 173 | PERF_IT_PER_SEC = "perf_it_per_sec" 174 | PERF_TIME_TO_TRAIN = "time_to_train" 175 | 176 | EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy" 177 | 178 | 179 | # Model 180 | 181 | MODEL_HP_LOSS_FN = "model_hp_loss_fn" 182 | 183 | MODEL_HP_INITIAL_SHAPE = "model_hp_initial_shape" 184 | MODEL_HP_FINAL_SHAPE = 
"model_hp_final_shape" 185 | 186 | MODEL_L2_REGULARIZATION = "model_l2_regularization" 187 | MODEL_EXCLUDE_BN_FROM_L2 = "model_exclude_bn_from_l2" 188 | 189 | MODEL_HP_RELU = "model_hp_relu" 190 | MODEL_HP_CONV2D_FIXED_PADDING = "model_hp_conv2d_fixed_padding" 191 | MODEL_HP_BATCH_NORM = "model_hp_batch_norm" 192 | MODEL_HP_DENSE = "model_hp_dense" 193 | 194 | 195 | # GNMT specific 196 | 197 | MODEL_HP_LOSS_SMOOTHING = "model_hp_loss_smoothing" 198 | MODEL_HP_NUM_LAYERS = "model_hp_num_layers" 199 | MODEL_HP_HIDDEN_SIZE = "model_hp_hidden_size" 200 | MODEL_HP_DROPOUT = "model_hp_dropout" 201 | 202 | EVAL_HP_BEAM_SIZE = "eval_hp_beam_size" 203 | TRAIN_HP_MAX_SEQ_LEN = "train_hp_max_sequence_length" 204 | EVAL_HP_MAX_SEQ_LEN = "eval_hp_max_sequence_length" 205 | EVAL_HP_LEN_NORM_CONST = "eval_hp_length_normalization_constant" 206 | EVAL_HP_LEN_NORM_FACTOR = "eval_hp_length_normalization_factor" 207 | EVAL_HP_COV_PENALTY_FACTOR = "eval_hp_coverage_penalty_factor" 208 | 209 | 210 | # NCF specific 211 | 212 | PREPROC_HP_MIN_RATINGS = "preproc_hp_min_ratings" 213 | PREPROC_HP_NUM_EVAL = "preproc_hp_num_eval" 214 | PREPROC_HP_SAMPLE_EVAL_REPLACEMENT = "preproc_hp_sample_eval_replacement" 215 | 216 | INPUT_HP_NUM_NEG = "input_hp_num_neg" 217 | INPUT_HP_SAMPLE_TRAIN_REPLACEMENT = "input_hp_sample_train_replacement" 218 | INPUT_STEP_TRAIN_NEG_GEN = "input_step_train_neg_gen" 219 | INPUT_STEP_EVAL_NEG_GEN = "input_step_eval_neg_gen" 220 | 221 | EVAL_HP_NUM_USERS = "eval_hp_num_users" 222 | EVAL_HP_NUM_NEG = "eval_hp_num_neg" 223 | 224 | MODEL_HP_MF_DIM = "model_hp_mf_dim" 225 | MODEL_HP_MLP_LAYER_SIZES = "model_hp_mlp_layer_sizes" 226 | 227 | 228 | # RESNET specific 229 | 230 | EVAL_EPOCH_OFFSET = "eval_offset" 231 | 232 | MODEL_HP_INITIAL_MAX_POOL = "model_hp_initial_max_pool" 233 | MODEL_HP_BEGIN_BLOCK = "model_hp_begin_block" 234 | MODEL_HP_END_BLOCK = "model_hp_end_block" 235 | MODEL_HP_BLOCK_TYPE = "model_hp_block_type" 236 | MODEL_HP_PROJECTION_SHORTCUT = "model_hp_projection_shortcut" 237 | MODEL_HP_SHORTCUT_ADD = "model_hp_shorcut_add" 238 | MODEL_HP_RESNET_TOPOLOGY = "model_hp_resnet_topology" 239 | 240 | 241 | # Transformer specific 242 | 243 | INPUT_MAX_LENGTH = "input_max_length" 244 | 245 | MODEL_HP_INITIALIZER_GAIN = "model_hp_initializer_gain" 246 | MODEL_HP_VOCAB_SIZE = "model_hp_vocab_size" 247 | MODEL_HP_NUM_HIDDEN_LAYERS = "model_hp_hidden_layers" 248 | MODEL_HP_EMBEDDING_SHARED_WEIGHTS = "model_hp_embedding_shared_weights" 249 | MODEL_HP_ATTENTION_DENSE = "model_hp_attention_dense" 250 | MODEL_HP_ATTENTION_DROPOUT = "model_hp_attention_dropout" 251 | MODEL_HP_FFN_OUTPUT_DENSE = "model_hp_ffn_output_dense" 252 | MODEL_HP_FFN_FILTER_DENSE = "model_hp_ffn_filter_dense" 253 | MODEL_HP_RELU_DROPOUT = "model_hp_relu_dropout" 254 | MODEL_HP_LAYER_POSTPROCESS_DROPOUT = "model_hp_layer_postprocess_dropout" 255 | MODEL_HP_NORM = "model_hp_norm" 256 | MODEL_HP_SEQ_BEAM_SEARCH = "model_hp_sequence_beam_search" 257 | 258 | -------------------------------------------------------------------------------- /gta.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | 28 | import argparse 29 | import numpy as np 30 | import os 31 | import sys 32 | import time 33 | import torch 34 | from apex import amp 35 | from scipy.io.wavfile import write 36 | from tacotron2.data_function import to_gpu 37 | from tacotron2.loader import parse_tacotron2_args 38 | from tacotron2.loader import get_tacotron2_model 39 | from tacotron2.text import text_to_sequence 40 | from train import parse_training_args 41 | from common.audio_processing import griffin_lim 42 | from common.layers import TacotronSTFT 43 | from common.utils import load_metadata, load_wav_to_torch, save_wav 44 | from dllogger.logger import LOGGER 45 | import dllogger.logger as dllg 46 | from dllogger.autologging import log_hardware, log_args 47 | from tqdm import tqdm 48 | 49 | 50 | def load_checkpoint(checkpoint_path, model): 51 | assert os.path.isfile(checkpoint_path) 52 | model.load_state_dict(torch.load(checkpoint_path)) 53 | print(f"Loaded checkpoint: {checkpoint_path}") 54 | return model 55 | 56 | 57 | def load_and_setup_model(parser, args): 58 | checkpoint_path = os.path.join('logs', args.latest_checkpoint_file) 59 | parser = parse_tacotron2_args(parser, add_help=False) 60 | args, _ = parser.parse_known_args() 61 | model = get_tacotron2_model(args, len(args.training_anchor_dirs), is_training=False) 62 | model.restore_checkpoint(checkpoint_path) 63 | model.eval() 64 | 65 | if args.amp_run: 66 | model, _ = amp.initialize(model, [], opt_level='O1') 67 | 68 | return model, args 69 | 70 | 71 | # taken from tacotron2/data_function.py:TextMelCollate.__call__ 72 | def pad_sequences(sequences): 73 | # Right zero-pad all text sequences to max input length 74 | text_lengths, ids_sorted_decreasing = torch.sort( 75 | torch.IntTensor([len(x) for x in sequences]), 76 | dim=0, descending=True) 77 | max_text_len = text_lengths[0] 78 | 79 | texts = [] 80 | for i in 
range(len(ids_sorted_decreasing)): 81 | text = sequences[ids_sorted_decreasing[i]] 82 | texts.append(np.pad(text, [0, max_text_len - len(text)], mode='constant')) 83 | 84 | texts = torch.from_numpy(np.stack(texts)) 85 | return texts, text_lengths, ids_sorted_decreasing 86 | 87 | 88 | def prepare_input_sequence(texts, speaker_id): 89 | sequences = [text_to_sequence(text, speaker_id, ['basic_cleaners'])[:] for text in texts] 90 | texts, text_lengths, ids_sorted_decreasing = pad_sequences(sequences) 91 | 92 | if torch.cuda.is_available(): 93 | texts = texts.cuda().long() 94 | text_lengths = text_lengths.cuda().int() 95 | else: 96 | texts = texts.long() 97 | text_lengths = text_lengths.int() 98 | 99 | return texts, text_lengths, ids_sorted_decreasing 100 | 101 | 102 | class MeasureTime(): 103 | def __init__(self, measurements, key): 104 | self.measurements = measurements 105 | self.key = key 106 | 107 | def __enter__(self): 108 | torch.cuda.synchronize() 109 | self.t0 = time.perf_counter() 110 | 111 | def __exit__(self, exc_type, exc_value, exc_traceback): 112 | torch.cuda.synchronize() 113 | self.measurements[self.key] = time.perf_counter() - self.t0 114 | 115 | 116 | def main(): 117 | """ 118 | Launches ground-truth-aligned (GTA) mel-spectrogram generation. 119 | Generation is executed on a single GPU. 120 | """ 121 | parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 GTA Mel Generation') 122 | parser = parse_training_args(parser) 123 | args, _ = parser.parse_known_args() 124 | 125 | LOGGER.set_model_name("Tacotron2_PyT") 126 | LOGGER.set_backends([ 127 | dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), 128 | dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) 129 | ]) 130 | LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) 131 | LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 132 | LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE) 133 | 134 | model, args = load_and_setup_model(parser, args) 135 | 136 | log_hardware() 137 | log_args(args) 138 | 139 | os.makedirs(args.output_dir, exist_ok=True) 140 | 141 | LOGGER.iteration_start() 142 | 143 | measurements = {} 144 | 145 | anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.training_anchor_dirs] 146 | metadatas = [load_metadata(anchor) for anchor in anchor_dirs] 147 | stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length, 148 | args.n_mel_channels, args.sampling_rate, args.mel_fmin, args.mel_fmax) 149 | with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): 150 | for speaker_id in range(len(anchor_dirs)): 151 | metadata = metadatas[speaker_id] 152 | for npy_path, text in tqdm(metadata): 153 | seq = text_to_sequence(text, speaker_id, ['basic_cleaners']) 154 | seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0) 155 | seq_lens = torch.IntTensor([len(seq)]) 156 | wav = load_wav_to_torch(npy_path) 157 | mel = stft.mel_spectrogram(wav.unsqueeze(0)) 158 | mel = mel.squeeze() 159 | max_target_len = mel.size(1) - 1 160 | max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step 161 | padded_mel = np.pad(mel, [(0, 0), (0, max_target_len - mel.size(1))], mode='constant', constant_values=args.mel_pad_val) 162 | target = padded_mel[:, ::args.n_frames_per_step] 163 | targets = torch.from_numpy(np.stack(target)).unsqueeze(0) 164 | target_lengths = torch.IntTensor([target.shape[1]]) 165 | outputs = 
model.infer(to_gpu(seqs).long(), to_gpu(seq_lens).int(), to_gpu(targets).half(), to_gpu(target_lengths).int()) 166 | _, mel_out, _, _ = [output.cpu() for output in outputs if output is not None] 167 | mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1] 168 | # clamp the range according to the reference level decibel bias to eliminate background noise (20 dB) 169 | mel_out = np.clip(mel_out, args.mel_pad_val, -args.mel_pad_val) 170 | assert(mel_out.shape[-1] == wav.shape[-1] // args.hop_length) 171 | fname = os.path.basename(npy_path) 172 | np.save(os.path.join(args.output_dir, fname), mel_out, allow_pickle=False) 173 | # GTA synthesis 174 | # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze()) 175 | # wav = griffin_lim(magnitudes, stft.stft_fn, 60) 176 | # save_wav(wav, os.path.join(args.output_dir, 'eval.wav')) 177 | 178 | LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time']) 179 | LOGGER.log(key="latency", value=measurements['tacotron2_time']) 180 | LOGGER.iteration_stop() 181 | LOGGER.finish() 182 | 183 | 184 | if __name__ == '__main__': 185 | main() 186 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import pprint 3 | 4 | class HParams(object): 5 | def __init__(self, **kwargs): self.__dict__.update(kwargs) 6 | def __setitem__(self, key, value): setattr(self, key, value) 7 | def __getitem__(self, key): return getattr(self, key) 8 | def __repr__(self): return pprint.pformat(self.__dict__) 9 | 10 | def parse(self, string): 11 | # Overrides hparams from a comma-separated string of name=value pairs 12 | if len(string) > 0: 13 | overrides = [s.split("=") for s in string.split(",")] 14 | keys, values = zip(*overrides) 15 | keys = list(map(str.strip, keys)) 16 | values = list(map(str.strip, values)) 17 | for k, v in zip(keys, values): 18 | self.__dict__[k] = ast.literal_eval(v) 19 | return self 20 | 21 | 22 | # Default hyperparameters 23 | hparams = HParams( 24 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 25 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners". 26 | cleaners='basic_cleaners', 27 | 28 | #Hardware setup (TODO: multi-GPU parallel tacotron training) 29 | use_all_gpus = False, #Whether to use all GPU resources. If True, the total number of available gpus will override num_gpus.
30 | num_gpus = 1, #Determines the number of gpus in use 31 | ########################################################################################################################################### 32 | 33 | #Audio 34 | num_mels = 80, #Number of mel-spectrogram channels and local conditioning dimensionality 35 | rescale = False, #Whether to rescale audio prior to preprocessing 36 | rescaling_max = 0.999, #Rescaling value 37 | trim_silence = True, #Whether to clip silence in audio (at the beginning and end of the audio only, not the middle) 38 | clip_mels_length = True, #For cases of OOM (Not really recommended, working on a workaround) 39 | max_mel_frames = 900, #Only relevant when clip_mels_length = True 40 | max_text_length = 300, #Only relevant when clip_mels_length = True 41 | sentence_span = 20, # Number of mel hops for each sentence interval 42 | 43 | #Mel spectrogram 44 | n_fft = 1024, #Extra window size is filled with 0 paddings to match this parameter 45 | hop_size = 256, #For 22050Hz, 256 ~= 11.5 ms 46 | win_size = 1024, #For 22050Hz, 1024 ~= 46 ms (If None, win_size = n_fft) 47 | sample_rate = 22050, #22050 Hz (corresponding to ljspeech dataset) 48 | frame_shift_ms = None, 49 | preemphasis = 0.97, # preemphasis coefficient 50 | 51 | #Multi-speaker: batch_size should be an integer multiple of the number of speakers. 52 | anchor_dirs = ['tts_fanfanli_22050', 'tts_xiaoya_22050', 'tts_yangluzhuo_22050', 'tts_yuanzhonglu_22050'], 53 | 54 | #M-AILABS (and other datasets) trim params 55 | trim_fft_size = 512, 56 | trim_hop_size = 128, 57 | trim_top_db = 60, 58 | 59 | #Mel and Linear spectrograms normalization/scaling and clipping 60 | signal_normalization = True, 61 | allow_clipping_in_normalization = True, #Only relevant if signal_normalization = True 62 | symmetric_mels = True, #Whether to scale the data to be symmetric around 0 63 | max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] 64 | 65 | #Limits 66 | min_level_db = -100, 67 | ref_level_db = 20, 68 | fmin = 50, #Set this to 75 if your speaker is male; if female, 125 should help take off noise. (To be tuned per dataset) 69 | fmax = 7600, 70 | 71 | #Griffin Lim 72 | power = 1.2, 73 | griffin_lim_iters = 60, 74 | ########################################################################################################################################### 75 | 76 | #Tacotron 77 | outputs_per_step = 2, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size) 78 | stop_at_any = False, #Determines whether the decoder should stop when predicting to any frame or to all of them 79 | batch_norm_position = 'after', #Can be in ('before', 'after'). Determines whether we use batch norm before or after the activation function (relu). A matter for debate.
80 | 81 | embedding_dim = 512, #dimension of embedding space 82 | 83 | enc_conv_num_layers = 3, #number of encoder convolutional layers 84 | enc_conv_kernel_size = (5, ), #size of encoder convolution filters for each layer 85 | enc_conv_channels = 512, #number of encoder convolution filters for each layer 86 | encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward) 87 | 88 | smoothing = False, #Whether to smooth the attention normalization function 89 | attention_dim = 128, #dimension of attention space 90 | attention_filters = 32, #number of attention convolution filters 91 | attention_kernel = (31, ), #kernel size of attention convolution 92 | cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 93 | 94 | #Attention synthesis constraints 95 | #"Monotonic" constraint forces the model to only look at the forward attention_win_size steps. 96 | #"Window" allows the model to look at attention_win_size neighbors, both forward and backward steps. 97 | synthesis_constraint = False, #Whether to use attention window constraints in synthesis only (Useful for long utterance synthesis) 98 | synthesis_constraint_type = 'window', #can be in ('window', 'monotonic'). 99 | attention_win_size = 7, #Size of each side of the window. The current step does not count. If mode is 'window' and attention_win_size is not even, the extra 1 is given to the backward part of the window. 100 | 101 | prenet_layers = [256, 256], #number of layers and number of units of prenet 102 | decoder_layers = 2, #number of decoder lstm layers 103 | decoder_lstm_units = 1024, #number of decoder lstm units on each layer 104 | max_iters = 1000, #Max decoder steps during inference (Just for safety from infinite loop cases) 105 | 106 | postnet_num_layers = 5, #number of postnet convolutional layers 107 | postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer 108 | postnet_channels = 512, #number of postnet convolution filters for each layer 109 | 110 | #Loss params 111 | mask_encoder = False, #whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence. 112 | mask_decoder = False, #Whether to use loss mask for padded sequences (if False, loss function will not be weighted, else recommended pos_weight = 20) 113 | cross_entropy_pos_weight = 1, #Use class weights to reduce the stop token class imbalance (by adding more penalty on False Negatives (FN)) (1 = disabled) 114 | ########################################################################################################################################### 115 | 116 | #Tacotron Training 117 | #Reproduction seeds 118 | tacotron_random_seed = 5339, #Determines initial graph and operations (i.e. model) random state for reproducibility 119 | tacotron_data_random_state = 1234, #random state for train/test split repeatability 120 | 121 | #performance parameters 122 | tacotron_swap_with_cpu = False, #Whether to use the cpu as support for the gpu for decoder computation (Not recommended: may cause major slowdowns! Only use when critical!) 123 | 124 | #train/test split ratios, mini-batch sizes 125 | tacotron_batch_size = 36, #number of training samples on each training step 126 | #Tacotron batch synthesis supports ~16x the training batch size (no gradients during testing). 127 | #Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times different from training.
We thus recommend masking the encoder. 128 | tacotron_synthesis_batch_size = 48, #DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!! 129 | tacotron_test_size = 0.05, #% of data to keep as test data; if None, tacotron_test_batches must not be None. (5% is enough to have a good idea about overfit) 130 | tacotron_test_batches = None, #number of test batches. 131 | 132 | #Learning rate schedule 133 | tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay 134 | tacotron_start_decay = 40000, #Step at which learning decay starts 135 | tacotron_decay_steps = 40000, #Determines the learning rate decay slope (UNDER TEST) 136 | tacotron_decay_rate = 0.4, #learning rate decay rate (UNDER TEST) 137 | tacotron_initial_learning_rate = 1e-3, #starting learning rate 138 | tacotron_final_learning_rate = 1e-5, #minimal learning rate 139 | 140 | #Optimization parameters 141 | tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter 142 | tacotron_adam_beta2 = 0.999, #AdamOptimizer beta2 parameter 143 | tacotron_adam_epsilon = 1e-6, #AdamOptimizer Epsilon parameter 144 | 145 | #Regularization parameters 146 | tacotron_reg_weight = 1e-6, #regularization weight (for L2 regularization) 147 | tacotron_scale_regularization = False, #Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 148 | tacotron_zoneout_rate = 0.1, #zoneout rate for all LSTM cells in the network 149 | tacotron_dropout_rate = 0.5, #dropout rate for all convolutional layers + prenet 150 | tacotron_clip_gradients = True, #whether to clip gradients 151 | 152 | #Evaluation parameters 153 | tacotron_natural_eval = False, #Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or the same teacher-forcing ratio as in training (just for overfit checks) 154 | 155 | #Decoder RNN learning can be done in one of two ways: 156 | # Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode='constant' 157 | # Scheduled Sampling Scheme: From Teacher-Forcing to sampling from previous outputs as a function of the global step. (teacher forcing ratio decay) mode='scheduled' 158 | #The second approach is inspired by: 159 | #Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. 160 | #Can be found under: https://arxiv.org/pdf/1506.03099.pdf 161 | tacotron_teacher_forcing_mode = 'constant', #Can be ('constant' or 'scheduled'). 'scheduled' mode applies a cosine teacher forcing ratio decay. (Preference: scheduled) 162 | tacotron_teacher_forcing_ratio = 1., #Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force the next decoder inputs. Only relevant if mode='constant' 163 | tacotron_teacher_forcing_init_ratio = 1., #initial teacher forcing ratio. Relevant if mode='scheduled' 164 | tacotron_teacher_forcing_final_ratio = 0., #final teacher forcing ratio. (Set None to use alpha instead) Relevant if mode='scheduled' 165 | tacotron_teacher_forcing_start_decay = 10000, #starting point of teacher forcing ratio decay. Relevant if mode='scheduled' 166 | tacotron_teacher_forcing_decay_steps = 40000, #Determines the teacher forcing ratio decay slope. Relevant if mode='scheduled' 167 | tacotron_teacher_forcing_decay_alpha = None, #teacher forcing ratio decay rate. Defines the final tfr as a ratio of the initial tfr. Relevant if mode='scheduled'
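(Editor's aside: the comments above fix the endpoints and slope of the scheduled teacher-forcing decay but not its exact curve. One plausible cosine shape, written as a standalone comment sketch; the exact formula is an assumption, since the implementing code is not shown in this dump:)

# tfr(step) for mode='scheduled', with init_ratio=1.0, final_ratio=0.0,
# start_decay=10000, decay_steps=40000 as set above (assumed shape):
#   step <= start_decay:                        tfr = init_ratio
#   start_decay < step <= start_decay + decay_steps:
#       p   = (step - start_decay) / decay_steps
#       tfr = final_ratio + (init_ratio - final_ratio) * 0.5 * (1 + cos(pi * p))
#   step > start_decay + decay_steps:           tfr = final_ratio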
168 | ) 169 | 170 | def hparams_debug_string(): 171 | return str(hparams) 172 | -------------------------------------------------------------------------------- /filelists/ljs_mel_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | LJSpeech-1.1/mels/LJ022-0023.pt|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. 2 | LJSpeech-1.1/mels/LJ043-0030.pt|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too. 3 | LJSpeech-1.1/mels/LJ005-0201.pt|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five. 4 | LJSpeech-1.1/mels/LJ001-0110.pt|Even the Caslon type when enlarged shows great shortcomings in this respect: 5 | LJSpeech-1.1/mels/LJ003-0345.pt|All the committee could do in this respect was to throw the responsibility on others. 6 | LJSpeech-1.1/mels/LJ007-0154.pt|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated, 7 | LJSpeech-1.1/mels/LJ018-0098.pt|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others. 8 | LJSpeech-1.1/mels/LJ047-0044.pt|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies 9 | LJSpeech-1.1/mels/LJ031-0038.pt|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery. 10 | LJSpeech-1.1/mels/LJ048-0194.pt|during the morning of November twenty-two prior to the motorcade. 11 | LJSpeech-1.1/mels/LJ049-0026.pt|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President. 12 | LJSpeech-1.1/mels/LJ004-0152.pt|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four. 13 | LJSpeech-1.1/mels/LJ008-0278.pt|or theirs might be one of many, and it might be considered necessary to "make an example." 14 | LJSpeech-1.1/mels/LJ043-0002.pt|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald: 15 | LJSpeech-1.1/mels/LJ009-0114.pt|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here. 16 | LJSpeech-1.1/mels/LJ028-0506.pt|A modern artist would have difficulty in doing such accurate work. 17 | LJSpeech-1.1/mels/LJ050-0168.pt|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area 18 | LJSpeech-1.1/mels/LJ039-0223.pt|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon 19 | LJSpeech-1.1/mels/LJ029-0032.pt|According to O'Donnell, quote, we had a motorcade wherever we went, end quote. 20 | LJSpeech-1.1/mels/LJ031-0070.pt|Dr. Clark, who most closely observed the head wound, 21 | LJSpeech-1.1/mels/LJ034-0198.pt|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window.
22 | LJSpeech-1.1/mels/LJ026-0068.pt|Energy enters the plant, to a small extent, 23 | LJSpeech-1.1/mels/LJ039-0075.pt|once you know that you must put the crosshairs on the target and that is all that is necessary. 24 | LJSpeech-1.1/mels/LJ004-0096.pt|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized 25 | LJSpeech-1.1/mels/LJ005-0014.pt|Speaking on a debate on prison matters, he declared that 26 | LJSpeech-1.1/mels/LJ012-0161.pt|he was reported to have fallen away to a shadow. 27 | LJSpeech-1.1/mels/LJ018-0239.pt|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to 28 | LJSpeech-1.1/mels/LJ019-0257.pt|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines. 29 | LJSpeech-1.1/mels/LJ028-0008.pt|you tap gently with your heel upon the shoulder of the dromedary to urge her on. 30 | LJSpeech-1.1/mels/LJ024-0083.pt|This plan of mine is no attack on the Court; 31 | LJSpeech-1.1/mels/LJ042-0129.pt|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough. 32 | LJSpeech-1.1/mels/LJ036-0103.pt|The police asked him whether he could pick out his passenger from the lineup. 33 | LJSpeech-1.1/mels/LJ046-0058.pt|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles. 34 | LJSpeech-1.1/mels/LJ014-0076.pt|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive. 35 | LJSpeech-1.1/mels/LJ002-0043.pt|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen, 36 | LJSpeech-1.1/mels/LJ009-0076.pt|We come to the sermon. 37 | LJSpeech-1.1/mels/LJ017-0131.pt|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution. 38 | LJSpeech-1.1/mels/LJ046-0184.pt|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. 39 | LJSpeech-1.1/mels/LJ014-0263.pt|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 40 | LJSpeech-1.1/mels/LJ042-0096.pt|(old exchange rate) in addition to his factory salary of approximately equal amount 41 | LJSpeech-1.1/mels/LJ049-0050.pt|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. 42 | LJSpeech-1.1/mels/LJ019-0186.pt|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties, 43 | LJSpeech-1.1/mels/LJ028-0307.pt|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand. 44 | LJSpeech-1.1/mels/LJ012-0235.pt|While they were in a state of insensibility the murder was committed. 45 | LJSpeech-1.1/mels/LJ034-0053.pt|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. 46 | LJSpeech-1.1/mels/LJ014-0030.pt|These were damnatory facts which well supported the prosecution. 47 | LJSpeech-1.1/mels/LJ015-0203.pt|but were the precautions too minute, the vigilance too close to be eluded or overcome? 48 | LJSpeech-1.1/mels/LJ028-0093.pt|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters. 
49 | LJSpeech-1.1/mels/LJ002-0018.pt|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London, 50 | LJSpeech-1.1/mels/LJ028-0275.pt|At last, in the twentieth month, 51 | LJSpeech-1.1/mels/LJ012-0042.pt|which he kept concealed in a hiding-place with a trap-door just under his bed. 52 | LJSpeech-1.1/mels/LJ011-0096.pt|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm, 53 | LJSpeech-1.1/mels/LJ036-0077.pt|Roger D. Craig, a deputy sheriff of Dallas County, 54 | LJSpeech-1.1/mels/LJ016-0318.pt|Other officials, great lawyers, governors of prisons, and chaplains supported this view. 55 | LJSpeech-1.1/mels/LJ013-0164.pt|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning. 56 | LJSpeech-1.1/mels/LJ027-0141.pt|is closely reproduced in the life-history of existing deer. Or, in other words, 57 | LJSpeech-1.1/mels/LJ028-0335.pt|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands. 58 | LJSpeech-1.1/mels/LJ031-0202.pt|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy. 59 | LJSpeech-1.1/mels/LJ021-0145.pt|From those willing to join in establishing this hoped-for period of peace, 60 | LJSpeech-1.1/mels/LJ016-0288.pt|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. 61 | LJSpeech-1.1/mels/LJ028-0081.pt|Years later, when the archaeologists could readily distinguish the false from the true, 62 | LJSpeech-1.1/mels/LJ018-0081.pt|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him, 63 | LJSpeech-1.1/mels/LJ021-0066.pt|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits 64 | LJSpeech-1.1/mels/LJ009-0238.pt|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail. 65 | LJSpeech-1.1/mels/LJ005-0079.pt|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders. 66 | LJSpeech-1.1/mels/LJ035-0019.pt|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal. 67 | LJSpeech-1.1/mels/LJ036-0174.pt|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there. 68 | LJSpeech-1.1/mels/LJ046-0146.pt|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files 69 | LJSpeech-1.1/mels/LJ017-0044.pt|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator. 70 | LJSpeech-1.1/mels/LJ017-0070.pt|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash. 71 | LJSpeech-1.1/mels/LJ014-0020.pt|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 72 | LJSpeech-1.1/mels/LJ016-0020.pt|He never reached the cistern, but fell back into the yard, injuring his legs severely. 73 | LJSpeech-1.1/mels/LJ045-0230.pt|when he was finally apprehended in the Texas Theatre. 
Although it is not fully corroborated by others who were present, 74 | LJSpeech-1.1/mels/LJ035-0129.pt|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. 75 | LJSpeech-1.1/mels/LJ008-0307.pt|afterwards express a wish to murder the Recorder for having kept them so long in suspense. 76 | LJSpeech-1.1/mels/LJ008-0294.pt|nearly indefinitely deferred. 77 | LJSpeech-1.1/mels/LJ047-0148.pt|On October twenty-five, 78 | LJSpeech-1.1/mels/LJ008-0111.pt|They entered a "stone cold room," and were presently joined by the prisoner. 79 | LJSpeech-1.1/mels/LJ034-0042.pt|that he could only testify with certainty that the print was less than three days old. 80 | LJSpeech-1.1/mels/LJ037-0234.pt|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, 81 | LJSpeech-1.1/mels/LJ040-0002.pt|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one. 82 | LJSpeech-1.1/mels/LJ045-0140.pt|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved 83 | LJSpeech-1.1/mels/LJ012-0035.pt|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands. 84 | LJSpeech-1.1/mels/LJ012-0250.pt|On the seventh July, eighteen thirty-seven, 85 | LJSpeech-1.1/mels/LJ016-0179.pt|contracted with sheriffs and conveners to work by the job. 86 | LJSpeech-1.1/mels/LJ016-0138.pt|at a distance from the prison. 87 | LJSpeech-1.1/mels/LJ027-0052.pt|These principles of homology are essential to a correct interpretation of the facts of morphology. 88 | LJSpeech-1.1/mels/LJ031-0134.pt|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally. 89 | LJSpeech-1.1/mels/LJ019-0273.pt|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. 90 | LJSpeech-1.1/mels/LJ014-0110.pt|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects. 91 | LJSpeech-1.1/mels/LJ034-0160.pt|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle. 92 | LJSpeech-1.1/mels/LJ038-0199.pt|eleven. If I am alive and taken prisoner, 93 | LJSpeech-1.1/mels/LJ014-0010.pt|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came. 94 | LJSpeech-1.1/mels/LJ033-0047.pt|I noticed when I went out that the light was on, end quote, 95 | LJSpeech-1.1/mels/LJ040-0027.pt|He was never satisfied with anything. 96 | LJSpeech-1.1/mels/LJ048-0228.pt|and others who were present say that no agent was inebriated or acted improperly. 97 | LJSpeech-1.1/mels/LJ003-0111.pt|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. 98 | LJSpeech-1.1/mels/LJ008-0258.pt|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days, 99 | LJSpeech-1.1/mels/LJ029-0022.pt|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston. 100 | LJSpeech-1.1/mels/LJ004-0045.pt|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce. 
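Each line in these validation filelists is a pipe-delimited `path|transcript` pair; this file points at precomputed mel tensors (`mels/*.pt`), while the next one points at raw waveforms (`wavs/*.wav`). As a rough illustration of how such a list can be consumed — a minimal sketch only, since the project's actual loader (tacotron2/data_function.py) is not shown in this dump and `read_filelist` is a hypothetical name:

def read_filelist(path):
    """Parse 'file|transcript' pairs, splitting on the first pipe only so
    transcripts that happen to contain '|' stay intact."""
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n').split('|', 1) for line in f if line.strip()]

# e.g. pairs = read_filelist('filelists/ljs_mel_text_val_filelist.txt')
# pairs[0] -> ['LJSpeech-1.1/mels/LJ022-0023.pt', 'The overwhelming majority ...']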
101 | -------------------------------------------------------------------------------- /filelists/ljs_audio_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | LJSpeech-1.1/wavs/LJ022-0023.wav|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. 2 | LJSpeech-1.1/wavs/LJ043-0030.wav|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too. 3 | LJSpeech-1.1/wavs/LJ005-0201.wav|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five. 4 | LJSpeech-1.1/wavs/LJ001-0110.wav|Even the Caslon type when enlarged shows great shortcomings in this respect: 5 | LJSpeech-1.1/wavs/LJ003-0345.wav|All the committee could do in this respect was to throw the responsibility on others. 6 | LJSpeech-1.1/wavs/LJ007-0154.wav|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated, 7 | LJSpeech-1.1/wavs/LJ018-0098.wav|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others. 8 | LJSpeech-1.1/wavs/LJ047-0044.wav|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies 9 | LJSpeech-1.1/wavs/LJ031-0038.wav|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery. 10 | LJSpeech-1.1/wavs/LJ048-0194.wav|during the morning of November twenty-two prior to the motorcade. 11 | LJSpeech-1.1/wavs/LJ049-0026.wav|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President. 12 | LJSpeech-1.1/wavs/LJ004-0152.wav|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four. 13 | LJSpeech-1.1/wavs/LJ008-0278.wav|or theirs might be one of many, and it might be considered necessary to "make an example." 14 | LJSpeech-1.1/wavs/LJ043-0002.wav|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald: 15 | LJSpeech-1.1/wavs/LJ009-0114.wav|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here. 16 | LJSpeech-1.1/wavs/LJ028-0506.wav|A modern artist would have difficulty in doing such accurate work. 17 | LJSpeech-1.1/wavs/LJ050-0168.wav|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area 18 | LJSpeech-1.1/wavs/LJ039-0223.wav|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon 19 | LJSpeech-1.1/wavs/LJ029-0032.wav|According to O'Donnell, quote, we had a motorcade wherever we went, end quote. 20 | LJSpeech-1.1/wavs/LJ031-0070.wav|Dr. Clark, who most closely observed the head wound, 21 | LJSpeech-1.1/wavs/LJ034-0198.wav|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window. 
22 | LJSpeech-1.1/wavs/LJ026-0068.wav|Energy enters the plant, to a small extent, 23 | LJSpeech-1.1/wavs/LJ039-0075.wav|once you know that you must put the crosshairs on the target and that is all that is necessary. 24 | LJSpeech-1.1/wavs/LJ004-0096.wav|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized 25 | LJSpeech-1.1/wavs/LJ005-0014.wav|Speaking on a debate on prison matters, he declared that 26 | LJSpeech-1.1/wavs/LJ012-0161.wav|he was reported to have fallen away to a shadow. 27 | LJSpeech-1.1/wavs/LJ018-0239.wav|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to 28 | LJSpeech-1.1/wavs/LJ019-0257.wav|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines. 29 | LJSpeech-1.1/wavs/LJ028-0008.wav|you tap gently with your heel upon the shoulder of the dromedary to urge her on. 30 | LJSpeech-1.1/wavs/LJ024-0083.wav|This plan of mine is no attack on the Court; 31 | LJSpeech-1.1/wavs/LJ042-0129.wav|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough. 32 | LJSpeech-1.1/wavs/LJ036-0103.wav|The police asked him whether he could pick out his passenger from the lineup. 33 | LJSpeech-1.1/wavs/LJ046-0058.wav|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles. 34 | LJSpeech-1.1/wavs/LJ014-0076.wav|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive. 35 | LJSpeech-1.1/wavs/LJ002-0043.wav|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen, 36 | LJSpeech-1.1/wavs/LJ009-0076.wav|We come to the sermon. 37 | LJSpeech-1.1/wavs/LJ017-0131.wav|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution. 38 | LJSpeech-1.1/wavs/LJ046-0184.wav|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. 39 | LJSpeech-1.1/wavs/LJ014-0263.wav|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 40 | LJSpeech-1.1/wavs/LJ042-0096.wav|(old exchange rate) in addition to his factory salary of approximately equal amount 41 | LJSpeech-1.1/wavs/LJ049-0050.wav|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. 42 | LJSpeech-1.1/wavs/LJ019-0186.wav|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties, 43 | LJSpeech-1.1/wavs/LJ028-0307.wav|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand. 44 | LJSpeech-1.1/wavs/LJ012-0235.wav|While they were in a state of insensibility the murder was committed. 45 | LJSpeech-1.1/wavs/LJ034-0053.wav|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. 46 | LJSpeech-1.1/wavs/LJ014-0030.wav|These were damnatory facts which well supported the prosecution. 47 | LJSpeech-1.1/wavs/LJ015-0203.wav|but were the precautions too minute, the vigilance too close to be eluded or overcome? 48 | LJSpeech-1.1/wavs/LJ028-0093.wav|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters. 
49 | LJSpeech-1.1/wavs/LJ002-0018.wav|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London, 50 | LJSpeech-1.1/wavs/LJ028-0275.wav|At last, in the twentieth month, 51 | LJSpeech-1.1/wavs/LJ012-0042.wav|which he kept concealed in a hiding-place with a trap-door just under his bed. 52 | LJSpeech-1.1/wavs/LJ011-0096.wav|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm, 53 | LJSpeech-1.1/wavs/LJ036-0077.wav|Roger D. Craig, a deputy sheriff of Dallas County, 54 | LJSpeech-1.1/wavs/LJ016-0318.wav|Other officials, great lawyers, governors of prisons, and chaplains supported this view. 55 | LJSpeech-1.1/wavs/LJ013-0164.wav|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning. 56 | LJSpeech-1.1/wavs/LJ027-0141.wav|is closely reproduced in the life-history of existing deer. Or, in other words, 57 | LJSpeech-1.1/wavs/LJ028-0335.wav|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands. 58 | LJSpeech-1.1/wavs/LJ031-0202.wav|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy. 59 | LJSpeech-1.1/wavs/LJ021-0145.wav|From those willing to join in establishing this hoped-for period of peace, 60 | LJSpeech-1.1/wavs/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. 61 | LJSpeech-1.1/wavs/LJ028-0081.wav|Years later, when the archaeologists could readily distinguish the false from the true, 62 | LJSpeech-1.1/wavs/LJ018-0081.wav|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him, 63 | LJSpeech-1.1/wavs/LJ021-0066.wav|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits 64 | LJSpeech-1.1/wavs/LJ009-0238.wav|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail. 65 | LJSpeech-1.1/wavs/LJ005-0079.wav|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders. 66 | LJSpeech-1.1/wavs/LJ035-0019.wav|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal. 67 | LJSpeech-1.1/wavs/LJ036-0174.wav|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there. 68 | LJSpeech-1.1/wavs/LJ046-0146.wav|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files 69 | LJSpeech-1.1/wavs/LJ017-0044.wav|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator. 70 | LJSpeech-1.1/wavs/LJ017-0070.wav|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash. 71 | LJSpeech-1.1/wavs/LJ014-0020.wav|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 72 | LJSpeech-1.1/wavs/LJ016-0020.wav|He never reached the cistern, but fell back into the yard, injuring his legs severely. 73 | LJSpeech-1.1/wavs/LJ045-0230.wav|when he was finally apprehended in the Texas Theatre. 
Although it is not fully corroborated by others who were present, 74 | LJSpeech-1.1/wavs/LJ035-0129.wav|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. 75 | LJSpeech-1.1/wavs/LJ008-0307.wav|afterwards express a wish to murder the Recorder for having kept them so long in suspense. 76 | LJSpeech-1.1/wavs/LJ008-0294.wav|nearly indefinitely deferred. 77 | LJSpeech-1.1/wavs/LJ047-0148.wav|On October twenty-five, 78 | LJSpeech-1.1/wavs/LJ008-0111.wav|They entered a "stone cold room," and were presently joined by the prisoner. 79 | LJSpeech-1.1/wavs/LJ034-0042.wav|that he could only testify with certainty that the print was less than three days old. 80 | LJSpeech-1.1/wavs/LJ037-0234.wav|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, 81 | LJSpeech-1.1/wavs/LJ040-0002.wav|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one. 82 | LJSpeech-1.1/wavs/LJ045-0140.wav|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved 83 | LJSpeech-1.1/wavs/LJ012-0035.wav|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands. 84 | LJSpeech-1.1/wavs/LJ012-0250.wav|On the seventh July, eighteen thirty-seven, 85 | LJSpeech-1.1/wavs/LJ016-0179.wav|contracted with sheriffs and conveners to work by the job. 86 | LJSpeech-1.1/wavs/LJ016-0138.wav|at a distance from the prison. 87 | LJSpeech-1.1/wavs/LJ027-0052.wav|These principles of homology are essential to a correct interpretation of the facts of morphology. 88 | LJSpeech-1.1/wavs/LJ031-0134.wav|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally. 89 | LJSpeech-1.1/wavs/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. 90 | LJSpeech-1.1/wavs/LJ014-0110.wav|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects. 91 | LJSpeech-1.1/wavs/LJ034-0160.wav|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle. 92 | LJSpeech-1.1/wavs/LJ038-0199.wav|eleven. If I am alive and taken prisoner, 93 | LJSpeech-1.1/wavs/LJ014-0010.wav|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came. 94 | LJSpeech-1.1/wavs/LJ033-0047.wav|I noticed when I went out that the light was on, end quote, 95 | LJSpeech-1.1/wavs/LJ040-0027.wav|He was never satisfied with anything. 96 | LJSpeech-1.1/wavs/LJ048-0228.wav|and others who were present say that no agent was inebriated or acted improperly. 97 | LJSpeech-1.1/wavs/LJ003-0111.wav|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. 98 | LJSpeech-1.1/wavs/LJ008-0258.wav|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days, 99 | LJSpeech-1.1/wavs/LJ029-0022.wav|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston. 100 | LJSpeech-1.1/wavs/LJ004-0045.wav|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce. 
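The two filelists carry identical transcripts and differ only in whether the first field names a precomputed mel tensor or a source wav; train.py's `--load-mel-from-disk` flag (further below) selects between the two representations. A hedged sketch of the switch this implies — the real logic lives in tacotron2/data_function.py (not included in this dump), and `get_mel`/`compute_mel_from_wav` are hypothetical stand-ins:

import torch

def get_mel(path, load_mel_from_disk, compute_mel_from_wav=None):
    if load_mel_from_disk:
        # Precomputed features, e.g. LJSpeech-1.1/mels/LJ022-0023.pt
        return torch.load(path)
    # Otherwise derive the mel spectrogram from the waveform on the fly
    return compute_mel_from_wav(path)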
101 | -------------------------------------------------------------------------------- /dllogger/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 MLBenchmark Group. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | # 16 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 17 | # 18 | # Licensed under the Apache License, Version 2.0 (the "License"); 19 | # you may not use this file except in compliance with the License. 20 | # You may obtain a copy of the License at 21 | # 22 | # http://www.apache.org/licenses/LICENSE-2.0 23 | # 24 | # Unless required by applicable law or agreed to in writing, software 25 | # distributed under the License is distributed on an "AS IS" BASIS, 26 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 27 | # See the License for the specific language governing permissions and 28 | # limitations under the License. 29 | 30 | import time 31 | import json 32 | import logging 33 | import os 34 | import inspect 35 | import sys 36 | import re 37 | from contextlib import contextmanager 38 | import functools 39 | from collections import OrderedDict 40 | 41 | NVLOGGER_NAME = 'nv_logger' 42 | NVLOGGER_VERSION = '0.2.2' 43 | NVLOGGER_TOKEN = ':::NVLOG' 44 | 45 | MLPERF_NAME = 'mlperf_logger' 46 | MLPERF_VERSION = '0.5.0' 47 | MLPERF_TOKEN = ':::MLP' 48 | 49 | DEFAULT_JSON_FILENAME = 'nvlog.json' 50 | 51 | RUN_SCOPE = 0 52 | EPOCH_SCOPE = 1 53 | TRAIN_ITER_SCOPE = 2 54 | 55 | _data = OrderedDict([ 56 | ('model', None), 57 | ('epoch', -1), 58 | ('iteration', -1), 59 | ('total_iteration', -1), 60 | ('metrics', OrderedDict()), 61 | ('timed_blocks', OrderedDict()), 62 | ('current_scope', RUN_SCOPE) 63 | ]) 64 | 65 | def get_caller(stack_index=2, root_dir=None): 66 | caller = inspect.getframeinfo(inspect.stack()[stack_index][0]) 67 | 68 | # Trim the file names for readability. 
69 | filename = caller.filename 70 | if root_dir is not None: 71 | filename = re.sub("^" + root_dir + "/", "", filename) 72 | return "%s:%d" % (filename, caller.lineno) 73 | 74 | class StandardMeter(object): 75 | 76 | def __init__(self): 77 | self.reset() 78 | 79 | def reset(self): 80 | self.value = 0 81 | 82 | def record(self, value): 83 | self.value = value 84 | 85 | def get_value(self): 86 | return self.value 87 | 88 | def get_last(self): 89 | return self.value 90 | 91 | class AverageMeter(object): 92 | 93 | def __init__(self): 94 | self.reset() 95 | 96 | def reset(self): 97 | self.n = 0 98 | self.value = 0 99 | self.last = 0 100 | 101 | def record(self, value, n=1): 102 | self.last = value 103 | self.n += n 104 | self.value += value * n 105 | 106 | def get_value(self): 107 | return self.value / self.n 108 | 109 | def get_last(self): 110 | return self.last 111 | 112 | class JsonBackend(object): 113 | 114 | def __init__(self, log_file=DEFAULT_JSON_FILENAME, logging_scope=TRAIN_ITER_SCOPE, 115 | iteration_interval=1): 116 | self.log_file = log_file 117 | self.logging_scope = logging_scope 118 | self.iteration_interval = iteration_interval 119 | 120 | self.json_log = OrderedDict([ 121 | ('run', OrderedDict()), 122 | ('epoch', OrderedDict()), 123 | ('iter', OrderedDict()), 124 | ('event', OrderedDict()), 125 | ]) 126 | 127 | self.json_log['epoch']['x'] = [] 128 | if self.logging_scope == TRAIN_ITER_SCOPE: 129 | self.json_log['iter']['x'] = [[]] 130 | 131 | def register_metric(self, key, metric_scope): 132 | if (metric_scope == TRAIN_ITER_SCOPE and 133 | self.logging_scope == TRAIN_ITER_SCOPE): 134 | if key not in self.json_log['iter']: 135 | self.json_log['iter'][key] = [[]] 136 | if metric_scope == EPOCH_SCOPE: 137 | if key not in self.json_log['epoch']: 138 | self.json_log['epoch'][key] = [] 139 | 140 | def log(self, key, value): 141 | if _data['current_scope'] == RUN_SCOPE: 142 | self.json_log['run'][key] = value 143 | elif _data['current_scope'] == EPOCH_SCOPE: 144 | pass 145 | elif _data['current_scope'] == TRAIN_ITER_SCOPE: 146 | pass 147 | else: 148 | raise ValueError('log function for scope "%s" not implemented' 149 | % _data['current_scope']) 150 | 151 | def log_event(self, key, value): 152 | if key not in self.json_log['event']: 153 | self.json_log['event'][key] = [] 154 | entry = OrderedDict() 155 | entry['epoch'] = _data['epoch'] 156 | entry['iter'] = _data['iteration'] 157 | entry['timestamp'] = time.time() 158 | if value: 159 | entry['value'] = value 160 | self.json_log['event'][key].append(entry) 161 | 162 | def log_iteration_summary(self): 163 | if (self.logging_scope == TRAIN_ITER_SCOPE and 164 | _data['total_iteration'] % self.iteration_interval == 0): 165 | for key, m in _data['metrics'].items(): 166 | if m.metric_scope == TRAIN_ITER_SCOPE: 167 | self.json_log['iter'][key][-1].append(m.get_last()) 168 | 169 | # log x for iteration number 170 | self.json_log['iter']['x'][-1].append(_data['iteration']) 171 | 172 | 173 | def dump_json(self): 174 | if self.log_file is None: 175 | print(json.dumps(self.json_log, indent=4)) 176 | else: 177 | with open(self.log_file, 'w') as f: 178 | json.dump(self.json_log, fp=f, indent=4) 179 | 180 | def log_epoch_summary(self): 181 | for key, m in _data['metrics'].items(): 182 | if m.metric_scope == EPOCH_SCOPE: 183 | self.json_log['epoch'][key].append(m.get_value()) 184 | elif (m.metric_scope == TRAIN_ITER_SCOPE and 185 | self.logging_scope == TRAIN_ITER_SCOPE): 186 | # create new sublists for each iter metric in the
next epoch 187 | self.json_log['iter'][key].append([]) 188 | 189 | # log x for epoch number 190 | self.json_log['epoch']['x'].append(_data['epoch']) 191 | 192 | # create new sublist for iter's x in the next epoch 193 | if self.logging_scope == TRAIN_ITER_SCOPE: 194 | self.json_log['iter']['x'].append([]) 195 | 196 | self.dump_json() 197 | 198 | def timed_block_start(self, name): 199 | pass 200 | 201 | def timed_block_stop(self, name): 202 | pass 203 | 204 | def finish(self): 205 | self.dump_json() 206 | 207 | class _ParentStdOutBackend(object): 208 | 209 | def __init__(self, name, token, version, log_file, logging_scope, iteration_interval): 210 | 211 | self.root_dir = None 212 | self.worker = [0] 213 | self.prefix = '' 214 | 215 | self.name = name 216 | self.token = token 217 | self.version = version 218 | self.log_file = log_file 219 | self.logging_scope = logging_scope 220 | self.iteration_interval = iteration_interval 221 | 222 | self.logger = logging.getLogger(self.name) 223 | self.logger.setLevel(logging.DEBUG) 224 | self.logger.handlers = [] 225 | 226 | if (self.log_file == None): 227 | self.stream_handler = logging.StreamHandler(stream=sys.stdout) 228 | self.stream_handler.setLevel(logging.DEBUG) 229 | self.logger.addHandler(self.stream_handler) 230 | else: 231 | self.file_handler = logging.FileHandler(self.log_file, mode='w') 232 | self.file_handler.setLevel(logging.DEBUG) 233 | self.logger.addHandler(self.file_handler) 234 | 235 | def register_metric(self, key, meter=None, metric_scope=EPOCH_SCOPE): 236 | pass 237 | 238 | def log_epoch_summary(self): 239 | pass 240 | 241 | def log_iteration_summary(self): 242 | pass 243 | 244 | def log(self, key, value): 245 | if _data['current_scope'] > self.logging_scope: 246 | pass 247 | elif (_data['current_scope'] == TRAIN_ITER_SCOPE and 248 | _data['total_iteration'] % self.iteration_interval != 0): 249 | pass 250 | else: 251 | self.log_stdout(key, value) 252 | 253 | def log_event(self, key, value): 254 | self.log_stdout(key, value) 255 | 256 | def log_stdout(self, key, value=None, forced=False): 257 | # TODO: worker 0 258 | # only the 0-worker will log 259 | #if not forced and self.worker != 0: 260 | # pass 261 | 262 | if value is None: 263 | msg = key 264 | else: 265 | str_json = json.dumps(value) 266 | msg = f'{key}: {str_json}' 267 | 268 | call_site = get_caller(root_dir=self.root_dir) 269 | now = time.time() 270 | 271 | model=_data['model'] 272 | message = f'{self.prefix}{self.token}v{self.version} {model} {now:.5f} ({call_site}) {msg}' 273 | self.logger.debug(message) 274 | 275 | def timed_block_start(self, name): 276 | self.log_stdout(key=name + "_start") 277 | 278 | def timed_block_stop(self, name): 279 | self.log_stdout(key=name + "_stop") 280 | 281 | def finish(self): 282 | pass 283 | 284 | class StdOutBackend(_ParentStdOutBackend): 285 | 286 | def __init__(self, log_file=None, logging_scope=EPOCH_SCOPE, iteration_interval=1): 287 | _ParentStdOutBackend.__init__(self, name=NVLOGGER_NAME, token=NVLOGGER_TOKEN, 288 | version=NVLOGGER_VERSION, log_file=log_file, logging_scope=logging_scope, 289 | iteration_interval=iteration_interval) 290 | 291 | class MLPerfBackend(_ParentStdOutBackend): 292 | 293 | def __init__(self, log_file=None, logging_scope=TRAIN_ITER_SCOPE, iteration_interval=1): 294 | _ParentStdOutBackend.__init__(self, name=MLPERF_NAME, token=MLPERF_TOKEN, 295 | version=MLPERF_VERSION, log_file=log_file, logging_scope=logging_scope, 296 | iteration_interval=iteration_interval) 297 | 298 | class _Logger(object): 299 | def 
__init__(self): 300 | 301 | self.backends = [ 302 | StdOutBackend(), 303 | JsonBackend() 304 | ] 305 | 306 | def set_model_name(self, name): 307 | _data['model'] = name 308 | 309 | 310 | def set_backends(self, backends): 311 | self.backends = backends 312 | 313 | 314 | def register_metric(self, key, meter=None, metric_scope=EPOCH_SCOPE): 315 | if meter is None: 316 | meter = StandardMeter() 317 | #TODO: move to argument of Meter? 318 | meter.metric_scope = metric_scope 319 | _data['metrics'][key] = meter 320 | for b in self.backends: 321 | b.register_metric(key, metric_scope) 322 | 323 | def log(self, key, value=None, forced=False): 324 | if _data['current_scope'] == TRAIN_ITER_SCOPE or _data['current_scope'] == EPOCH_SCOPE: 325 | if key in _data['metrics'].keys(): 326 | if _data['metrics'][key].metric_scope == _data['current_scope']: 327 | _data['metrics'][key].record(value) 328 | for b in self.backends: 329 | b.log(key, value) 330 | 331 | def log_event(self, key, value=None): 332 | for b in self.backends: 333 | b.log_event(key, value) 334 | 335 | def timed_block_start(self, name): 336 | if name not in _data['timed_blocks']: 337 | _data['timed_blocks'][name] = OrderedDict() 338 | _data['timed_blocks'][name]['start'] = time.time() 339 | for b in self.backends: 340 | b.timed_block_start(name) 341 | 342 | def timed_block_stop(self, name): 343 | if name not in _data['timed_blocks']: 344 | raise ValueError('timed_block_stop called before timed_block_start for ' + name) 345 | _data['timed_blocks'][name]['stop'] = time.time() 346 | delta = _data['timed_blocks'][name]['stop'] - _data['timed_blocks'][name]['start'] 347 | self.log(name + '_time', delta) 348 | for b in self.backends: 349 | b.timed_block_stop(name) 350 | 351 | def iteration_start(self): 352 | _data['current_scope'] = TRAIN_ITER_SCOPE 353 | _data['iteration'] += 1 354 | _data['total_iteration'] += 1 355 | 356 | 357 | def iteration_stop(self): 358 | for b in self.backends: 359 | b.log_iteration_summary() 360 | _data['current_scope'] = EPOCH_SCOPE 361 | 362 | def epoch_start(self): 363 | _data['current_scope'] = EPOCH_SCOPE 364 | _data['epoch'] += 1 365 | _data['iteration'] = -1 366 | 367 | for n, m in _data['metrics'].items(): 368 | if m.metric_scope == TRAIN_ITER_SCOPE: 369 | m.reset() 370 | 371 | def epoch_stop(self): 372 | for b in self.backends: 373 | b.log_epoch_summary() 374 | _data['current_scope'] = RUN_SCOPE 375 | 376 | def finish(self): 377 | for b in self.backends: 378 | b.finish() 379 | 380 | def iteration_generator_wrapper(self, gen): 381 | for g in gen: 382 | self.iteration_start() 383 | yield g 384 | self.iteration_stop() 385 | 386 | def epoch_generator_wrapper(self, gen): 387 | for g in gen: 388 | self.epoch_start() 389 | yield g 390 | self.epoch_stop() 391 | 392 | LOGGER = _Logger() 393 | 394 | @contextmanager 395 | def timed_block(prefix, value=None, logger=LOGGER, forced=False): 396 | """ This function helps with timed blocks 397 | ---- 398 | Parameters: 399 | prefix - name of the action/block to be timed 400 | logger - NVLogger object 401 | forced - if True then the events are always logged (even if they would otherwise be skipped) 402 | """ 403 | if logger is None: 404 | yield None 405 | return 406 | logger.timed_block_start(prefix) 407 | yield logger 408 | logger.timed_block_stop(prefix) 409 | def timed_function(prefix, variable=None, forced=False): 410 | """ This decorator helps with timed functions 411 | ---- 412 | Parameters: 413 | prefix - name of the action to be timed 414 | logger - NVLogger
object, read from the decorated function's 'logger' kwarg (defaults to LOGGER) 415 | forced - if True then the events are always logged (even if they would otherwise be skipped) 416 | """ 417 | def timed_function_decorator(func): 418 | @functools.wraps(func) 419 | def wrapper(*args, **kwargs): 420 | logger = kwargs.get('logger', LOGGER) 421 | value = kwargs.get(variable, next(iter(args), None)) 422 | with timed_block(prefix=prefix, logger=logger, value=value, forced=forced): 423 | return func(*args, **kwargs) 424 | return wrapper 425 | return timed_function_decorator 426 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | # 26 | # ***************************************************************************** 27 | 28 | import os 29 | import time 30 | import argparse 31 | import numpy as np 32 | from common.utils import cosine_decay 33 | from contextlib import contextmanager 34 | from datetime import datetime 35 | from plot import plot_alignment 36 | import torch 37 | from torch.utils.data import DataLoader 38 | from torch.utils.tensorboard import SummaryWriter 39 | 40 | import torch.distributed as dist 41 | 42 | from apex import amp 43 | from apex.parallel import DistributedDataParallel as DDP 44 | 45 | from tacotron2.loader import parse_tacotron2_args 46 | from tacotron2.loader import get_tacotron2_model 47 | from tacotron2.loss_function import Tacotron2Loss 48 | from tacotron2.data_function import TextMelCollate 49 | from tacotron2.data_function import TextMelDataset 50 | from tacotron2.data_function import batch_to_gpu 51 | from dllogger.logger import LOGGER 52 | import dllogger.logger as dllg 53 | from dllogger import tags 54 | from dllogger.autologging import log_hardware, log_args 55 | from scipy.io.wavfile import write as write_wav 56 | 57 | 58 | def parse_training_args(parser): 59 | """ 60 | Parse commandline arguments. 61 | """ 62 | 63 | parser.add_argument('-o', '--output_dir', type=str, default='logs', required=True, help='Directory to save checkpoints') 64 | parser.add_argument('-d', '--dataset-path', type=str, default='filelists', help='Path to dataset') 65 | parser.add_argument('--log-file', type=str, default='nvlog.json', help='Filename for logging') 66 | parser.add_argument('--latest-checkpoint-file', type=str, default='checkpoint_latest.pt', help='Store the latest checkpoint in each epoch') 67 | parser.add_argument('--phrase-path', type=str, default=None, help='Path to phrase sequence file used for sample generation') 68 | parser.add_argument('--tacotron2-checkpoint', type=str, default=None, help='Path to pre-trained Tacotron2 checkpoint for sample generation') 69 | 70 | # training 71 | training = parser.add_argument_group('training setup') 72 | training.add_argument('--epochs', type=int, default=500, help='Number of total epochs to run') 73 | training.add_argument('--epochs-per-alignment', type=int, default=1, help='Number of epochs per alignment') 74 | training.add_argument('--epochs-per-checkpoint', type=int, default=50, help='Number of epochs per checkpoint') 75 | training.add_argument('--seed', type=int, default=1234, help='Seed for PyTorch random number generators') 76 | training.add_argument('--dynamic-loss-scaling', type=bool, default=True, help='Enable dynamic loss scaling') 77 | training.add_argument('--amp-run', action='store_true', help='Enable AMP') 78 | training.add_argument('--cudnn-enabled', default=True, help='Enable cudnn') 79 | training.add_argument('--cudnn-benchmark', default=True, help='Run cudnn benchmark') 80 | training.add_argument('--disable-uniform-initialize-bn-weight', action='store_true', help='disable uniform initialization of batchnorm layer weight') 81 | 82 | optimization = parser.add_argument_group('optimization setup') 83 | optimization.add_argument('--use-saved-learning-rate', default=False, type=bool) 84 | optimization.add_argument('--init-lr', '--initial-learning-rate', default=1e-3, type=float, help='Initial learning rate') 85 | optimization.add_argument('--final-lr', '--final-learning-rate', default=1e-5, type=float, help='Final learning rate') 86 | optimization.add_argument('--weight-decay', default=1e-6, type=float, help='Weight decay') 87 |
optimization.add_argument('--grad-clip-thresh', default=1.0, type=float, help='Clip threshold for gradients') 88 | optimization.add_argument('-bs', '--batch-size', default=32, type=int, help='Batch size per GPU') 89 | 90 | # dataset parameters 91 | dataset = parser.add_argument_group('dataset parameters') 92 | dataset.add_argument('--load-mel-from-disk', action='store_true', help='Loads mel spectrograms from disk instead of computing them on the fly') 93 | dataset.add_argument('--training-anchor-dirs', default=['ljs_mel_text_train_filelist.txt'], type=str, nargs='*', help='Path to training filelist') 94 | dataset.add_argument('--validation-anchor-dirs', default=['ljs_mel_text_val_filelist.txt'], type=str, nargs='*', help='Path to validation filelist') 95 | dataset.add_argument('--text-cleaners', nargs='*', default=['basic_cleaners'], type=str, help='Type of text cleaners for input text') 96 | 97 | # audio parameters 98 | audio = parser.add_argument_group('audio parameters') 99 | audio.add_argument('--max-wav-value', default=32768.0, type=float, help='Maximum audiowave value') 100 | audio.add_argument('--sampling-rate', default=22050, type=int, help='Sampling rate') 101 | audio.add_argument('--filter-length', default=1024, type=int, help='Filter length') 102 | audio.add_argument('--hop-length', default=256, type=int, help='Hop (stride) length') 103 | audio.add_argument('--win-length', default=1024, type=int, help='Window length') 104 | audio.add_argument('--mel-fmin', default=50.0, type=float, help='Minimum mel frequency') 105 | audio.add_argument('--mel-fmax', default=7600.0, type=float, help='Maximum mel frequency') 106 | 107 | distributed = parser.add_argument_group('distributed setup') 108 | distributed.add_argument('--distributed-run', default=False, type=bool, help='enable distributed run') 109 | distributed.add_argument('--rank', default=0, type=int, help='Rank of the process, do not set! Done by multiproc module') 110 | distributed.add_argument('--world-size', default=1, type=int, help='Number of processes, do not set! Done by multiproc module') 111 | distributed.add_argument('--dist-url', type=str, default='tcp://localhost:23456', help='Url used to set up distributed training') 112 | distributed.add_argument('--group-name', type=str, default='group_name', help='Distributed group name') 113 | distributed.add_argument('--dist-backend', default='nccl', type=str, choices={'nccl'}, help='Distributed run backend') 114 | 115 | return parser 116 | 117 | 118 | def reduce_tensor(tensor, num_gpus): 119 | rt = tensor.clone() 120 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 121 | rt /= num_gpus 122 | return rt 123 | 124 | 125 | def init_distributed(args, world_size, rank, group_name): 126 | assert torch.cuda.is_available(), "Distributed mode requires CUDA." 127 | print("Initializing Distributed") 128 | 129 | # Set cuda device so everything is done on the right GPU. 
130 | torch.cuda.set_device(rank % torch.cuda.device_count()) 131 | 132 | # Initialize distributed communication 133 | dist.init_process_group( 134 | backend=args.dist_backend, init_method=args.dist_url, 135 | world_size=world_size, rank=rank, group_name=group_name) 136 | 137 | print("Done initializing distributed") 138 | 139 | 140 | def save_eval(model, filepath, args): 141 | if args.phrase_path: 142 | phrase = torch.load(args.phrase_path, map_location='cpu') 143 | with torch.no_grad(): 144 | model.eval() 145 | mel = model.infer(phrase.cuda())[0].cpu() 146 | model.train() 147 | 148 | # audio = audio[0].numpy() 149 | # audio = audio.astype('int16') 150 | # write_wav(filepath, sampling_rate, audio) 151 | 152 | # adapted from: https://discuss.pytorch.org/t/opinion-eval-should-be-a-context-manager/18998/3 153 | # Following snippet is licensed under the MIT license 154 | 155 | 156 | @contextmanager 157 | def evaluating(model): 158 | '''Temporarily switch to evaluation mode, restoring the previous mode on exit.''' 159 | was_training = model.training 160 | try: 161 | model.eval() 162 | yield model 163 | finally: 164 | if was_training: 165 | model.train() 166 | 167 | def validate(model, criterion, valate_dataset, iteration, collate_fn, distributed_run, args): 168 | """Handles all the validation scoring and printing""" 169 | with evaluating(model), torch.no_grad(): 170 | val_loader = DataLoader(valate_dataset, num_workers=1, shuffle=False, 171 | batch_size=args.batch_size//len(args.validation_anchor_dirs), 172 | pin_memory=False, collate_fn=collate_fn) 173 | 174 | val_loss = 0.0 175 | for i, batch in enumerate(val_loader): 176 | x, y, num_frames = batch_to_gpu(batch) 177 | y_pred = model(x) 178 | loss = criterion(y_pred, y) 179 | if distributed_run: 180 | reduced_val_loss = reduce_tensor(loss.data, args.world_size).item() 181 | else: 182 | reduced_val_loss = loss.item() 183 | val_loss += reduced_val_loss 184 | val_loss = val_loss / (i + 1) 185 | 186 | LOGGER.log(key="val_iter_loss", value=val_loss) 187 | 188 | 189 | def adjust_learning_rate(optimizer, epoch, args): 190 | lr = cosine_decay(args.init_lr, args.final_lr, epoch, args.epochs) 191 | 192 | if optimizer.param_groups[0]['lr'] != lr: 193 | LOGGER.log_event("learning_rate changed", 194 | value=str(optimizer.param_groups[0]['lr']) + " -> " + str(lr)) 195 | 196 | for param_group in optimizer.param_groups: 197 | param_group['lr'] = lr 198 | 199 | 200 | def main(): 201 | 202 | parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training') 203 | parser = parse_training_args(parser) 204 | args, _ = parser.parse_known_args() 205 | 206 | LOGGER.set_model_name("Tacotron2_PyT") 207 | LOGGER.set_backends([ 208 | dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), 209 | dllg.JsonBackend(log_file=os.path.join(args.output_dir, args.log_file) if args.rank == 0 else None, 210 | logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) 211 | ]) 212 | 213 | LOGGER.timed_block_start("run") 214 | LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS, metric_scope=dllg.TRAIN_ITER_SCOPE) 215 | LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE) 216 | LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE) 217 | LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE) 218 | LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE) 219 | LOGGER.register_metric("train_epoch_frames/sec", metric_scope=dllg.EPOCH_SCOPE) 220 | LOGGER.register_metric("train_epoch_avg_frames/sec", metric_scope=dllg.EPOCH_SCOPE) 221 |
LOGGER.register_metric("train_epoch_avg_loss", metric_scope=dllg.EPOCH_SCOPE) 222 | 223 | log_hardware() 224 | 225 | parser = parse_tacotron2_args(parser) 226 | args = parser.parse_args() 227 | 228 | log_args(args) 229 | 230 | torch.backends.cudnn.enabled = args.cudnn_enabled 231 | torch.backends.cudnn.benchmark = args.cudnn_benchmark 232 | 233 | distributed_run = args.world_size > 1 234 | if distributed_run: 235 | init_distributed(args, args.world_size, args.rank, args.group_name) 236 | 237 | os.makedirs(args.output_dir, exist_ok=True) 238 | 239 | LOGGER.log(key=tags.RUN_START) 240 | run_start_time = time.time() 241 | 242 | model = get_tacotron2_model(args, len(args.training_anchor_dirs), is_training=True) 243 | 244 | if not args.amp_run and distributed_run: 245 | model = DDP(model) 246 | 247 | model.restore_checkpoint(os.path.join(args.output_dir, args.latest_checkpoint_file)) 248 | 249 | optimizer = torch.optim.Adam(model.parameters(), lr=args.init_lr, weight_decay=args.weight_decay) 250 | 251 | writer = SummaryWriter(args.output_dir) 252 | 253 | if args.amp_run: 254 | model, optimizer = amp.initialize(model, optimizer, opt_level='O1') 255 | if distributed_run: 256 | model = DDP(model) 257 | 258 | criterion = Tacotron2Loss() 259 | 260 | collate_fn = TextMelCollate(args) 261 | train_dataset = TextMelDataset(args, args.training_anchor_dirs) 262 | train_loader = DataLoader(train_dataset, num_workers=2, shuffle=False, 263 | batch_size=args.batch_size//len(args.training_anchor_dirs), 264 | pin_memory=False, drop_last=True, collate_fn=collate_fn) 265 | # valate_dataset = TextMelDataset(args, args.validation_anchor_dirs) 266 | 267 | model.train() 268 | 269 | elapsed_epochs = model.get_elapsed_epochs() 270 | epochs = args.epochs - elapsed_epochs 271 | iteration = elapsed_epochs * len(train_loader) 272 | 273 | LOGGER.log(key=tags.TRAIN_LOOP) 274 | 275 | for epoch in range(1, epochs + 1): 276 | LOGGER.epoch_start() 277 | epoch_start_time = time.time() 278 | epoch += elapsed_epochs 279 | LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch) 280 | 281 | # used to calculate avg frames/sec over epoch 282 | reduced_num_frames_epoch = 0 283 | 284 | # used to calculate avg loss over epoch 285 | train_epoch_avg_loss = 0.0 286 | train_epoch_avg_frames_per_sec = 0.0 287 | num_iters = 0 288 | 289 | adjust_learning_rate(optimizer, epoch, args) 290 | 291 | for i, batch in enumerate(train_loader): 292 | print(f"Batch: {i}/{len(train_loader)} epoch {epoch}") 293 | LOGGER.iteration_start() 294 | iter_start_time = time.time() 295 | LOGGER.log(key=tags.TRAIN_ITER_START, value=i) 296 | 297 | # start = time.perf_counter() 298 | 299 | optimizer.zero_grad() 300 | x, y, num_frames = batch_to_gpu(batch) 301 | 302 | outputs = model(x) 303 | y_pred = [output.cpu() for output in outputs] 304 | 305 | loss = criterion(y_pred, y) 306 | 307 | if distributed_run: 308 | reduced_loss = reduce_tensor(loss.data, args.world_size).item() 309 | reduced_num_frames = reduce_tensor(num_frames.data, 1).item() 310 | else: 311 | reduced_loss = loss.item() 312 | reduced_num_frames = num_frames.item() 313 | 314 | if np.isnan(reduced_loss): 315 | raise Exception("loss is NaN") 316 | 317 | LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss) 318 | 319 | train_epoch_avg_loss += reduced_loss 320 | num_iters += 1 321 | 322 | # accumulate number of frames processed in this epoch 323 | reduced_num_frames_epoch += reduced_num_frames 324 | 325 | if args.amp_run: 326 | with amp.scale_loss(loss, optimizer) as scaled_loss: 327 | 
314 |             if np.isnan(reduced_loss):
315 |                 raise ValueError("loss is NaN")
316 | 
317 |             LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)
318 | 
319 |             train_epoch_avg_loss += reduced_loss
320 |             num_iters += 1
321 | 
322 |             # accumulate number of frames processed in this epoch
323 |             reduced_num_frames_epoch += reduced_num_frames
324 | 
325 |             if args.amp_run:
326 |                 with amp.scale_loss(loss, optimizer) as scaled_loss:  # scale so fp16 grads survive backward()
327 |                     scaled_loss.backward()
328 |                 grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_thresh)  # clip fp32 master grads
329 |             else:
330 |                 loss.backward()
331 |                 grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_thresh)
332 | 
333 |             optimizer.step()
334 | 
335 |             iteration += 1
336 | 
337 |             writer.add_scalar('Training/Loss', reduced_loss, iteration)
338 | 
339 |             LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)
340 | 
341 |             iter_stop_time = time.time()
342 |             iter_time = iter_stop_time - iter_start_time
343 |             frames_per_sec = reduced_num_frames/iter_time
344 |             train_epoch_avg_frames_per_sec += frames_per_sec
345 | 
346 |             LOGGER.log(key="train_iter_frames/sec", value=frames_per_sec)
347 |             LOGGER.log(key="iter_time", value=iter_time)
348 |             LOGGER.iteration_stop()
349 | 
350 |         LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
351 |         epoch_stop_time = time.time()
352 |         epoch_time = epoch_stop_time - epoch_start_time
353 | 
354 |         LOGGER.log(key="train_epoch_frames/sec", value=(reduced_num_frames_epoch/epoch_time))
355 |         LOGGER.log(key="train_epoch_avg_frames/sec", value=(train_epoch_avg_frames_per_sec/num_iters if num_iters > 0 else 0.0))
356 |         LOGGER.log(key="train_epoch_avg_loss", value=(train_epoch_avg_loss/num_iters if num_iters > 0 else 0.0))
357 |         LOGGER.log(key="epoch_time", value=epoch_time)
358 | 
359 |         LOGGER.log(key=tags.EVAL_START, value=epoch)
360 | 
361 |         # validate(model, criterion, val_dataset, iteration, collate_fn, distributed_run, args)
362 | 
363 |         LOGGER.log(key=tags.EVAL_STOP, value=epoch)
364 | 
365 |         # Store latest checkpoint in each epoch
366 |         model.elapse_epoch()
367 |         checkpoint_path = os.path.join(args.output_dir, args.latest_checkpoint_file)
368 |         model.save_checkpoint(checkpoint_path)
369 | 
370 |         # Plot alignment
371 |         if epoch % args.epochs_per_alignment == 0 and args.rank == 0:
372 |             alignments = y_pred[3].data.numpy()
373 |             index = np.random.randint(len(alignments))
374 |             plot_alignment(alignments[index],  # [enc_step, dec_step]
375 |                            os.path.join(args.output_dir, f"align_{epoch:04d}_{iteration}.png"),
376 |                            info=f"{datetime.now().strftime('%Y-%m-%d %H:%M')} Epoch={epoch:04d} Iteration={iteration} Average loss={train_epoch_avg_loss/num_iters:.5f}")
377 | 
378 |         # Save checkpoint
379 |         if epoch % args.epochs_per_checkpoint == 0 and args.rank == 0:
380 |             checkpoint_path = os.path.join(args.output_dir, f"checkpoint_{epoch:04d}.pt")
381 |             print(f"Saving model and optimizer state at epoch {epoch:04d} to {checkpoint_path}")
382 |             model.save_checkpoint(checkpoint_path)
383 | 
384 |         # Save evaluation
385 |         # save_sample(model, args.tacotron2_checkpoint, args.phrase_path,
386 |         #             os.path.join(args.output_dir, f"sample_{epoch:04d}_{iteration}.wav"), args.sampling_rate)
387 | 
388 |         LOGGER.epoch_stop()
389 | 
390 |     run_stop_time = time.time()
391 |     run_time = run_stop_time - run_start_time
392 |     LOGGER.log(key="run_time", value=run_time)
393 |     LOGGER.log(key=tags.RUN_FINAL)
394 | 
395 |     print("training time", run_time)
396 |     writer.close()
397 | 
398 |     LOGGER.timed_block_stop("run")
399 | 
400 |     if args.rank == 0:
401 |         LOGGER.finish()
402 | 
403 | 
404 | if __name__ == '__main__':
405 |     main()
406 | 
--------------------------------------------------------------------------------
/tacotron2/model.py:
--------------------------------------------------------------------------------
1 | # *****************************************************************************
2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | #     * Redistributions of source code must retain the above copyright
7 | #       notice, this list of conditions and the following disclaimer.
8 | #     * Redistributions in binary form must reproduce the above copyright
9 | #       notice, this list of conditions and the following disclaimer in the
10 | #       documentation and/or other materials provided with the distribution.
11 | #     * Neither the name of the NVIDIA CORPORATION nor the
12 | #       names of its contributors may be used to endorse or promote products
13 | #       derived from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | #
26 | # *****************************************************************************
27 | 
28 | import math
29 | import os
30 | import torch
31 | from torch import nn
32 | from torch.nn import functional as F
33 | import sys
34 | from os.path import abspath, dirname
35 | # enable module discovery from the global entrypoint
36 | sys.path.append(abspath(dirname(__file__)+'/../'))
37 | from common.layers import ConvNorm, LinearNorm
38 | from common.utils import to_gpu, get_mask_from_lengths
39 | 
40 | 
41 | class LocationLayer(nn.Module):
42 |     def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
43 |         super(LocationLayer, self).__init__()
44 |         self.location_conv = ConvNorm(1, attention_n_filters,
45 |                                       kernel_size=attention_kernel_size,
46 |                                       padding=int((attention_kernel_size - 1) / 2),
47 |                                       stride=1, dilation=1)
48 |         self.location_dense = LinearNorm(attention_n_filters, attention_dim,
49 |                                          bias=False, w_init_gain='tanh')
50 | 
51 |     def forward(self, attention_weights_cum):
52 |         processed_attention_weights = self.location_conv(attention_weights_cum)
53 |         processed_attention_weights = processed_attention_weights.transpose(1, 2)
54 |         processed_attention_weights = self.location_dense(processed_attention_weights)
55 |         return processed_attention_weights
56 | 
57 | 
58 | class Attention(nn.Module):
59 |     def __init__(self, query_dim, memory_dim, attention_dim,
60 |                  attention_location_n_filters, attention_location_kernel_size):
61 |         super(Attention, self).__init__()
62 |         self.query_layer = LinearNorm(query_dim, attention_dim, w_init_gain='tanh')
63 |         self.memory_layer = LinearNorm(memory_dim, attention_dim, w_init_gain='tanh')
64 |         self.v = LinearNorm(attention_dim, 1)
65 |         self.location_layer = LocationLayer(attention_location_n_filters,
66 |                                             attention_location_kernel_size,
67 |                                             attention_dim)
68 |         self.score_mask_value = -float("inf")
69 | 
70 |     def get_alignment_energies(self, query, memory, attention_weights_cum):
71 |         """
72 |         PARAMS
73 |         ------
74 |         query: decoder output (B, decoder_dim)
75 |         memory: encoder outputs (B, T_in, embed_dim)
76 |         attention_weights_cum: cumulative attention weights (B, 1, max_time)
77 | 
78 |         RETURNS
79 |         -------
80 |         alignment (batch, max_time)
81 |         """
82 | 
83 |         # [B, T_in, attn_dim]
84 |         key = self.memory_layer(memory)
85 |         # [B, 1, attn_dim]
86 |         query = self.query_layer(query.unsqueeze(1))
87 |         # [B, T_in, attn_dim]
88 |         location_sensitive_weights = self.location_layer(attention_weights_cum)
89 |         # additive (location-sensitive) score function
90 |         energies = self.v(torch.tanh(query + location_sensitive_weights + key))
91 |         # [B, T_in]
92 |         energies = energies.squeeze(-1)
93 | 
94 |         return energies
95 | 
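    # The score above is additive (Bahdanau-style) attention extended with a
    # location term, as in Chorowski et al., "Attention-Based Models for
    # Speech Recognition"; schematically:
    #
    #     e_{t,i} = v^T tanh(W q_t + V h_i + U f_{t,i})
    #
    # where q_t is the decoder query, h_i the encoder memory, and f the
    # features convolved from the cumulative attention weights, which nudge
    # the alignment to keep moving forward over the input.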
96 |     def forward(self, query, memory, attention_weights_cum, mask=None):
97 |         """
98 |         PARAMS
99 |         ------
100 |         query: attention rnn last output [B, decoder_dim]
101 |         memory: encoder outputs [B, T_in, embed_dim]
102 |         attention_weights_cum: cumulative attention weights
103 |         mask: binary mask for padded data
104 |         """
105 |         alignment = self.get_alignment_energies(query, memory, attention_weights_cum)
106 | 
107 |         if mask is not None:
108 |             alignment.masked_fill_(mask, self.score_mask_value)
109 | 
110 |         # [B, T_in]
111 |         attention_weights = F.softmax(alignment, dim=1)
112 |         # [B, 1, T_in] * [B, T_in, embed_dim]
113 |         attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
114 |         # [B, embed_dim]
115 |         attention_context = attention_context.squeeze(1)
116 | 
117 |         return attention_context, attention_weights
118 | 
119 | 
120 | class Prenet(nn.Module):
121 |     def __init__(self, in_dim, sizes):
122 |         super(Prenet, self).__init__()
123 |         in_sizes = [in_dim] + sizes[:-1]
124 |         self.layers = nn.ModuleList(
125 |             [LinearNorm(in_size, out_size) for (in_size, out_size) in zip(in_sizes, sizes)])
126 | 
127 |     def forward(self, x, inference=False):
128 |         if inference:
129 |             for linear in self.layers:
130 |                 x = F.relu(linear(x), inplace=True)
131 |                 x0 = x[0].unsqueeze(0)
132 |                 mask = torch.bernoulli(x0.new(x0.size()).fill_(0.5))
133 |                 mask = mask.expand(x.size())
134 |                 x = x * mask * 2
135 |         else:
136 |             for linear in self.layers:
137 |                 x = F.dropout(F.relu(linear(x), inplace=True), p=0.5, training=True)
138 |         return x
139 | 
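# Note that dropout stays active in the prenet even outside training: the
# else-branch forces training=True, and the inference branch applies an
# explicit Bernoulli mask. This follows the Tacotron 2 paper, which keeps
# prenet dropout at inference to add variation to the outputs. The inference
# branch samples a single Bernoulli(0.5) mask from the first slice, expands
# it over the whole tensor, and multiplies by 2 -- the usual inverted-dropout
# rescaling for p=0.5 -- so the dropout noise is shared across the leading
# dimension.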
140 | 
141 | class Postnet(nn.Module):
142 |     """Postnet
143 |     - Five 1-d convolutions with 512 channels and kernel size 5
144 |     """
145 | 
146 |     def __init__(self, n_mel_channels, postnet_embedding_dim,
147 |                  postnet_kernel_size, postnet_n_convolutions):
148 |         super(Postnet, self).__init__()
149 |         self.convolutions = nn.ModuleList()
150 | 
151 |         self.convolutions.append(
152 |             nn.Sequential(
153 |                 ConvNorm(n_mel_channels, postnet_embedding_dim,
154 |                          kernel_size=postnet_kernel_size, stride=1,
155 |                          padding=int((postnet_kernel_size - 1) / 2),
156 |                          dilation=1, w_init_gain='tanh'),
157 |                 nn.BatchNorm1d(postnet_embedding_dim))
158 |         )
159 | 
160 |         for i in range(1, postnet_n_convolutions - 1):
161 |             self.convolutions.append(
162 |                 nn.Sequential(
163 |                     ConvNorm(postnet_embedding_dim,
164 |                              postnet_embedding_dim,
165 |                              kernel_size=postnet_kernel_size, stride=1,
166 |                              padding=int((postnet_kernel_size - 1) / 2),
167 |                              dilation=1, w_init_gain='tanh'),
168 |                     nn.BatchNorm1d(postnet_embedding_dim))
169 |             )
170 | 
171 |         self.convolutions.append(
172 |             nn.Sequential(
173 |                 ConvNorm(postnet_embedding_dim, n_mel_channels,
174 |                          kernel_size=postnet_kernel_size, stride=1,
175 |                          padding=int((postnet_kernel_size - 1) / 2),
176 |                          dilation=1, w_init_gain='linear'),
177 |                 nn.BatchNorm1d(n_mel_channels))
178 |         )
179 | 
180 |     def forward(self, x):
181 |         for i in range(len(self.convolutions) - 1):
182 |             x = torch.tanh(self.convolutions[i](x))
183 |         return self.convolutions[-1](x)
184 | 
185 | 
186 | class Encoder(nn.Module):
187 |     """Encoder module:
188 |     - Three 1-d convolution banks
189 |     - Bidirectional LSTM
190 |     """
191 |     def __init__(self, encoder_n_convolutions, encoder_embedding_dim, encoder_kernel_size):
192 |         super(Encoder, self).__init__()
193 | 
194 |         convolutions = []
195 |         for _ in range(encoder_n_convolutions):
196 |             conv_layer = nn.Sequential(
197 |                 ConvNorm(encoder_embedding_dim,
198 |                          encoder_embedding_dim,
199 |                          kernel_size=encoder_kernel_size, stride=1,
200 |                          padding=int((encoder_kernel_size - 1) / 2),
201 |                          dilation=1, w_init_gain='relu'),
202 |                 nn.BatchNorm1d(encoder_embedding_dim))
203 |             convolutions.append(conv_layer)
204 |         self.convolutions = nn.ModuleList(convolutions)
205 | 
206 |         self.encoder_lstm = nn.LSTM(encoder_embedding_dim,
207 |                                     int(encoder_embedding_dim / 2), 1,
208 |                                     batch_first=True, bidirectional=True)
209 | 
210 |     def forward(self, x, text_lengths):
211 |         for conv in self.convolutions:
212 |             x = F.relu(conv(x), inplace=True)
213 | 
214 |         # [B, encoder_dim, T_in] -> [B, T_in, encoder_dim]
215 |         x = x.transpose(1, 2)
216 | 
217 |         # PyTorch tensors are not reversible, hence the conversion to numpy
218 |         text_lengths = text_lengths.cpu().numpy()
219 |         x = nn.utils.rnn.pack_padded_sequence(x, text_lengths, batch_first=True)
220 |         # [B, T_in, encoder_dim]
221 |         outputs, _ = self.encoder_lstm(x)
222 |         outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
223 | 
224 |         return outputs
225 | 
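# The pack/pad pair above lets the bidirectional LSTM skip padded time steps.
# pack_padded_sequence expects batches sorted by decreasing length by default,
# which the data pipeline is assumed to guarantee here. The pattern in
# isolation (lstm being any batch_first LSTM):
#
#     lengths = torch.tensor([5, 3, 2])  # descending
#     packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
#     out, _ = lstm(packed)
#     out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)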
226 | 
227 | class Decoder(nn.Module):
228 |     def __init__(self, n_mel_channels, n_frames_per_step,
229 |                  encoder_embedding_dim, attention_dim,
230 |                  attention_location_n_filters,
231 |                  attention_location_kernel_size,
232 |                  prenet_dim, decoder_rnn_dim,
233 |                  max_decoder_steps, gate_threshold,
234 |                  decoder_n_lstms, p_decoder_dropout):
235 |         super(Decoder, self).__init__()
236 |         self.n_mel_channels = n_mel_channels
237 |         self.n_frames_per_step = n_frames_per_step
238 |         self.encoder_embedding_dim = encoder_embedding_dim
239 |         self.decoder_rnn_dim = decoder_rnn_dim
240 |         self.prenet_dim = prenet_dim
241 |         self.max_decoder_steps = max_decoder_steps
242 |         self.gate_threshold = gate_threshold
243 |         self.decoder_n_lstms = decoder_n_lstms
244 |         self.p_decoder_dropout = p_decoder_dropout
245 | 
246 |         self.prenet = Prenet(n_mel_channels, [prenet_dim, prenet_dim])
247 | 
248 |         self.lstm0 = nn.LSTMCell(prenet_dim + encoder_embedding_dim, decoder_rnn_dim)
249 |         self.lstm1 = nn.LSTMCell(decoder_rnn_dim + encoder_embedding_dim, decoder_rnn_dim)
250 | 
251 |         self.attention_layer = Attention(decoder_rnn_dim, encoder_embedding_dim,
252 |                                          attention_dim, attention_location_n_filters,
253 |                                          attention_location_kernel_size)
254 | 
255 |         self.linear_projection = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, n_mel_channels * n_frames_per_step)
256 | 
257 |         self.gate_layer = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, n_frames_per_step, w_init_gain='sigmoid')
258 | 
259 |     def initialize_decoder_states(self, memory, mask=None, inference=False):
260 |         """ Initializes attention rnn states, decoder rnn states, attention
261 |         weights, attention cumulative weights, attention context, stores memory
262 |         PARAMS
263 |         ------
264 |         memory: Encoder outputs
265 |         mask: Mask for padded data if training, expects None for inference
266 |         """
267 |         B = memory.size(0)
268 |         MAX_TIME = memory.size(1)
269 | 
270 |         self.h0 = torch.zeros(B, self.decoder_rnn_dim).cuda()
271 |         self.c0 = torch.zeros(B, self.decoder_rnn_dim).cuda()
272 |         self.h1 = torch.zeros(B, self.decoder_rnn_dim).cuda()
273 |         self.c1 = torch.zeros(B, self.decoder_rnn_dim).cuda()
274 | 
275 |         # if inference:
276 |         #     self.h0 = self.h0.half()
277 |         #     self.c0 = self.c0.half()
278 |         #     self.h1 = self.h1.half()
279 |         #     self.c1 = self.c1.half()
280 | 
281 |         self.attention_weights = memory.new(B, MAX_TIME).zero_()
282 |         self.attention_weights_cum = memory.new(B, MAX_TIME).zero_()
283 |         self.attention_context = memory.new(B, self.encoder_embedding_dim).zero_()
284 | 
285 |         self.memory = memory
286 |         self.mask = mask
287 | 
288 |     def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments, mel_lengths=None):
289 |         """ Prepares the per-step decoder outputs for downstream use
290 |         PARAMS
291 |         ------
292 |         mel_outputs: list of per-step mel frames
293 |         gate_outputs: gate output energies
294 |         alignments: list of per-step attention weights
295 | 
296 |         RETURNS
297 |         -------
298 |         mel_outputs: mel spectrogram [B, n_mel_channels, T_out]
299 |         gate_outputs: gate output energies [B, T_out]
300 |         alignments: attention weights [B, T_in, T_out]
301 |         """
302 |         # (T_out, B, T_in) -> (B, T_in, T_out)
303 |         alignments = torch.stack(alignments).transpose(0, 1).transpose(1, 2).contiguous()
304 |         # (T_out, B, n_frames_per_step) -> (B, T_out, n_frames_per_step)
305 |         gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
306 |         # (B, T_out, n_frames_per_step) -> (B, T_out)
307 |         gate_outputs = gate_outputs.contiguous().view(gate_outputs.size(0), -1)
308 |         # (T_out, B, n_mel_channels * n_frames_per_step) -> (B, T_out, n_mel_channels * n_frames_per_step)
309 |         mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
310 |         # decouple frames per step
311 |         mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, self.n_mel_channels)
312 |         # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
313 |         mel_outputs = mel_outputs.transpose(1, 2)
314 |         # scale mel lengths by the number of frames per step
315 |         if mel_lengths is not None:
316 |             mel_lengths *= self.n_frames_per_step
317 | 
318 |         return mel_outputs, gate_outputs, alignments, mel_lengths
319 | 
320 |     def decode(self, prenet_output):
321 |         """ Decoder step using stored states, attention and memory
322 |         PARAMS
323 |         ------
324 |         prenet_output: prenet-processed previous mel output
325 | 
326 |         RETURNS
327 |         -------
328 |         mel_output: predicted frame(s) [B, n_mel_channels * n_frames_per_step]
329 |         gate_output: gate output energies [B, n_frames_per_step]
330 |         attention_weights: [B, T_in]
331 |         """
332 |         x = torch.cat((prenet_output, self.attention_context), dim=-1)
333 |         self.h0, self.c0 = self.lstm0(x, (self.h0, self.c0))
334 |         # [B, 1, decoder_dim]
335 |         x = F.dropout(self.h0, self.p_decoder_dropout, self.training)
336 | 
337 |         x = torch.cat((x, self.attention_context), dim=-1)
338 |         self.h1, self.c1 = self.lstm1(x, (self.h1, self.c1))
339 |         # [B, 1, decoder_dim]
340 |         self.query = F.dropout(self.h1, self.p_decoder_dropout, self.training)
341 | 
342 |         attention_weights_cumulative = self.attention_weights_cum.unsqueeze(1)
343 |         self.attention_context, self.attention_weights = self.attention_layer(
344 |             self.query, self.memory, attention_weights_cumulative, self.mask)
345 | 
346 |         # [B, MAX_TIME]
347 |         # Avoid '+=' as in-place operation in case of gradient computation
348 |         self.attention_weights_cum = self.attention_weights_cum + self.attention_weights
349 | 
350 |         x = torch.cat((self.query, self.attention_context), dim=-1)
351 |         # [B, n_mel_channels * n_frames_per_step]
352 |         mel_output = self.linear_projection(x)
353 |         # [B, n_frames_per_step]
354 |         gate_output = self.gate_layer(x)
355 |         return mel_output, gate_output, self.attention_weights
356 | 
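    # gate_output holds the stop-token logits, one per frame in the step; at
    # inference (see infer below) they are passed through a sigmoid and
    # compared against gate_threshold to decide when decoding should stop.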
| """ Decoder forward pass for training 359 | PARAMS 360 | ------ 361 | memory: Encoder outputs 362 | targets: Decoder inputs for teacher forcing. i.e. mel-specs 363 | memory_lengths: Encoder output lengths for attention masking. 364 | 365 | RETURNS 366 | ------- 367 | mel_outputs: mel outputs from the decoder 368 | gate_outputs: gate outputs from the decoder 369 | alignments: sequence of attention weights from the decoder 370 | """ 371 | go_frame = memory.new(memory.size(0), self.n_mel_channels).zero_().unsqueeze(0) 372 | # (B, n_mel_channels, T_out) -> (T_out, B, n_mel_channels) 373 | targets = targets.permute(2, 0, 1) 374 | decoder_inputs = torch.cat((go_frame, targets), dim=0) 375 | prenet_outputs = self.prenet(decoder_inputs) 376 | 377 | mask =~ get_mask_from_lengths(memory_lengths) if memory.size(0) > 1 else None 378 | self.initialize_decoder_states(memory, mask) 379 | 380 | mel_outputs, gate_outputs, alignments = [], [], [] 381 | # size - 1 for ignoring EOS symbol 382 | while len(mel_outputs) < decoder_inputs.size(0) - 1: 383 | prenet_output = prenet_outputs[len(mel_outputs)] 384 | mel_output, gate_output, attention_weights = self.decode(prenet_output) 385 | 386 | mel_outputs += [mel_output] 387 | gate_outputs += [gate_output] 388 | alignments += [attention_weights] 389 | 390 | return self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments) 391 | 392 | def infer(self, memory, memory_lengths): 393 | """ Decoder inference 394 | PARAMS 395 | ------ 396 | memory: Encoder outputs 397 | 398 | RETURNS 399 | ------- 400 | mel_outputs: mel outputs from the decoder 401 | gate_outputs: gate outputs from the decoder 402 | alignments: sequence of attention weights from the decoder 403 | """ 404 | mask =~ get_mask_from_lengths(memory_lengths) if memory.size(0) > 1 else None 405 | self.initialize_decoder_states(memory, mask, inference=True) 406 | 407 | mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32) 408 | if torch.cuda.is_available(): 409 | mel_lengths = mel_lengths.cuda() 410 | 411 | mel_outputs, gate_outputs, alignments = [], [], [] 412 | frame = memory.new(memory.size(0), self.n_mel_channels).zero_() 413 | while True: 414 | prenet_output = self.prenet(frame, inference=True) 415 | 416 | mel_output, gate_output, alignment = self.decode(prenet_output) 417 | gate_output = torch.sigmoid(gate_output) 418 | 419 | finished = torch.gt(gate_output, self.gate_threshold).all(-1) 420 | mel_lengths += (~finished).to(torch.int32) 421 | 422 | if finished.all(): 423 | break 424 | 425 | mel_outputs += [mel_output] 426 | gate_outputs += [gate_output] 427 | alignments += [alignment] 428 | 429 | if len(mel_outputs) == self.max_decoder_steps: 430 | print("Warning! Reached max decoder steps") 431 | break 432 | 433 | frame = mel_output[:, :self.n_mel_channels] 434 | 435 | return self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments, mel_lengths) 436 | 437 | def gta(self, memory, memory_lengths, targets): 438 | """ Decoder forward pass for training 439 | PARAMS 440 | ------ 441 | memory: Encoder outputs 442 | memory_lengths: Encoder output lengths for attention masking. 443 | targets: Decoder inputs for teacher forcing. i.e. 
437 |     def gta(self, memory, memory_lengths, targets):
438 |         """ Decoder forward pass for ground-truth-aligned (GTA) synthesis
439 |         PARAMS
440 |         ------
441 |         memory: Encoder outputs
442 |         memory_lengths: Encoder output lengths for attention masking
443 |         targets: Decoder inputs for teacher forcing, i.e. mel spectrograms
444 | 
445 |         RETURNS
446 |         -------
447 |         mel_outputs: mel outputs from the decoder
448 |         gate_outputs: gate outputs from the decoder
449 |         alignments: sequence of attention weights from the decoder
450 |         """
451 |         go_frame = memory.new(memory.size(0), self.n_mel_channels).zero_().unsqueeze(0)
452 |         # (B, n_mel_channels, T_out) -> (T_out, B, n_mel_channels)
453 |         targets = targets.permute(2, 0, 1)
454 |         decoder_inputs = torch.cat((go_frame, targets), dim=0)
455 |         prenet_outputs = self.prenet(decoder_inputs, inference=True)
456 | 
457 |         mask = ~get_mask_from_lengths(memory_lengths) if memory.size(0) > 1 else None
458 |         self.initialize_decoder_states(memory, mask, inference=True)
459 | 
460 |         mel_outputs, gate_outputs, alignments = [], [], []
461 |         # size - 1 for ignoring EOS symbol
462 |         while len(mel_outputs) < decoder_inputs.size(0) - 1:
463 |             prenet_output = prenet_outputs[len(mel_outputs)]
464 |             mel_output, gate_output, attention_weights = self.decode(prenet_output)
465 | 
466 |             mel_outputs += [mel_output]
467 |             gate_outputs += [gate_output]
468 |             alignments += [attention_weights]
469 | 
470 |         return self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)
471 | 
472 | 
473 | class Tacotron2(nn.Module):
474 |     def __init__(self, mask_padding, n_mel_channels,
475 |                  n_symbols, symbols_embedding_dim, encoder_kernel_size,
476 |                  encoder_n_convolutions, encoder_embedding_dim,
477 |                  attention_dim, attention_location_n_filters,
478 |                  attention_location_kernel_size, n_frames_per_step,
479 |                  prenet_dim, decoder_rnn_dim, max_decoder_steps, gate_threshold,
480 |                  decoder_n_lstms, p_decoder_dropout,
481 |                  postnet_embedding_dim, postnet_kernel_size,
482 |                  postnet_n_convolutions):
483 |         super(Tacotron2, self).__init__()
484 |         self.elapsed_epochs = 0
485 |         self.mask_padding = mask_padding
486 |         self.n_mel_channels = n_mel_channels
487 |         self.n_frames_per_step = n_frames_per_step
488 |         self.embedding = nn.Embedding(n_symbols, symbols_embedding_dim)
489 |         std = math.sqrt(2.0 / (n_symbols + symbols_embedding_dim))
490 |         val = math.sqrt(3.0) * std  # uniform bounds for std
491 |         self.embedding.weight.data.uniform_(-val, val)
492 |         self.encoder = Encoder(encoder_n_convolutions, encoder_embedding_dim, encoder_kernel_size)
493 |         self.decoder = Decoder(n_mel_channels, n_frames_per_step,
494 |                                encoder_embedding_dim, attention_dim,
495 |                                attention_location_n_filters,
496 |                                attention_location_kernel_size,
497 |                                prenet_dim, decoder_rnn_dim,
498 |                                max_decoder_steps,
499 |                                gate_threshold, decoder_n_lstms,
500 |                                p_decoder_dropout)
501 |         self.postnet = Postnet(n_mel_channels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolutions)
502 | 
503 |     def parse_outputs(self, outputs, target_lengths=None):
504 |         if self.mask_padding and target_lengths is not None:
505 |             mask = ~get_mask_from_lengths(target_lengths)
506 |             mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
507 |             mask = mask.permute(1, 0, 2)
508 | 
509 |             outputs[0].masked_fill_(mask, 0.0)
510 |             outputs[1].masked_fill_(mask, 0.0)
511 |             outputs[2].masked_fill_(mask[:, 0, :], 1e3)  # gate energies
512 | 
513 |         return outputs
514 | 
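    # When padding is masked above, mel outputs are zeroed in the padded
    # region and the gate energies are filled with 1e3 (sigmoid ~ 1), so the
    # padded steps read as a clean "stop" signal instead of contributing
    # noise to the loss.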
515 |     def forward(self, inputs):
516 |         texts, text_lengths, targets, target_lengths = inputs
517 | 
518 |         # [B, T_in] -> [B, embed_dim, T_in]
519 |         embedded_inputs = self.embedding(texts).transpose(1, 2)
520 |         # [B, T_in, encoder_dim]
521 |         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
522 | 
523 |         mel_outputs_before, gate_outputs, alignments, _ = self.decoder(encoder_outputs, text_lengths, targets)
524 |         mel_outputs_after = mel_outputs_before + self.postnet(mel_outputs_before)
525 | 
526 |         return self.parse_outputs([mel_outputs_before, mel_outputs_after, gate_outputs, alignments])
527 | 
528 |     def infer(self, texts, text_lengths, targets=None, target_lengths=None):
529 |         # [B, T_in] -> [B, embed_dim, T_in]
530 |         embedded_inputs = self.embedding(texts).transpose(1, 2)
531 |         # [B, T_in, encoder_dim]
532 |         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
533 | 
534 |         if targets is None:
535 |             mel_outputs_before, gate_outputs, alignments, mel_lengths = self.decoder.infer(encoder_outputs, text_lengths)
536 |         else:
537 |             mel_outputs_before, gate_outputs, alignments, mel_lengths = self.decoder.gta(encoder_outputs, text_lengths, targets)
538 | 
539 |         mel_outputs_after = mel_outputs_before + self.postnet(mel_outputs_before)
540 | 
541 |         return self.parse_outputs([mel_outputs_before, mel_outputs_after, gate_outputs, alignments, mel_lengths])
542 | 
543 |     def elapse_epoch(self):
544 |         self.elapsed_epochs += 1
545 | 
546 |     def get_elapsed_epochs(self):
547 |         return self.elapsed_epochs
548 | 
549 |     def save_checkpoint(self, filepath):
550 |         torch.save({'epoch': self.elapsed_epochs, 'model': self.state_dict()}, filepath)
551 | 
552 |     def restore_checkpoint(self, filepath):
553 |         if os.path.exists(filepath):
554 |             def _checkpoint_from_distributed(state_dict):
555 |                 """
556 |                 Checks whether the checkpoint was generated by DistributedDataParallel.
557 |                 DDP wraps the model in an additional "module." prefix, which must be
558 |                 removed for single-GPU inference.
559 |                 :param state_dict: model's state dict
560 |                 """
561 |                 for key, _ in state_dict.items():
562 |                     if key.find('module.') != -1:
563 |                         return True
564 |                 return False
565 | 
566 |             def _unwrap_distributed(state_dict):
567 |                 """
568 |                 Unwraps the model from DistributedDataParallel.
569 |                 DDP wraps the model in an additional "module." prefix, which must be
570 |                 removed for single-GPU inference.
571 |                 :param state_dict: model's state dict
572 |                 """
573 |                 new_state_dict = {}
574 |                 for key, value in state_dict.items():
575 |                     new_key = key.replace('module.', '')
576 |                     new_state_dict[new_key] = value
577 |                 return new_state_dict
578 | 
579 |             print(f'Loading Weights: "{filepath}"')
580 |             checkpoint = torch.load(filepath)
581 |             self.elapsed_epochs = checkpoint['epoch']
582 |             if _checkpoint_from_distributed(checkpoint['model']):
583 |                 checkpoint['model'] = _unwrap_distributed(checkpoint['model'])
584 |             self.load_state_dict(checkpoint['model'])
585 | 
--------------------------------------------------------------------------------