├── .gitignore ├── Readme.md └── deepaudio ├── __init__.py └── tts ├── __init__.py ├── audio ├── __init__.py ├── audio.py ├── codec.py └── spec_normalizer.py ├── cli ├── __init__.py ├── configs │ ├── __init__.py │ ├── callbacks │ │ ├── default.yaml │ │ ├── early_stopping.yaml │ │ ├── model_checkpoint.yaml │ │ ├── model_summary.yaml │ │ ├── none.yaml │ │ └── rich_progress_bar.yaml │ ├── datamodule │ │ ├── fastspeech2.yaml │ │ ├── gan.yaml │ │ ├── tacotron2.yaml │ │ ├── transformer_tts.yaml │ │ ├── vits.yaml │ │ └── wavernn.yaml │ ├── experiment │ │ ├── fastspeech2.yaml │ │ ├── hifigan.yaml │ │ ├── parallel_wavegan.yaml │ │ ├── tacotron2.yaml │ │ ├── transformer_tts.yaml │ │ └── vits.yaml │ ├── extras │ │ └── default.yaml │ ├── hydra │ │ └── default.yaml │ ├── logger │ │ ├── comet.yaml │ │ ├── csv.yaml │ │ ├── many_loggers.yaml │ │ ├── mlflow.yaml │ │ ├── neptune.yaml │ │ ├── tensorboard.yaml │ │ └── wandb.yaml │ ├── model │ │ ├── fastspeech2.yaml │ │ ├── hifigan.yaml │ │ ├── parallel_wavegan.yaml │ │ ├── tacotron2.yaml │ │ ├── transformer_tts.yaml │ │ └── vits.yaml │ ├── paths │ │ └── default.yaml │ ├── train.yaml │ └── trainer │ │ ├── cpu.yaml │ │ ├── ddp.yaml │ │ ├── ddp_sim.yaml │ │ ├── default.yaml │ │ ├── gpu.yaml │ │ └── mps.yaml ├── preprocess │ ├── __init__.py │ ├── fastspeech2 │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ ├── gan_vocoder │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ ├── tacotron2 │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ ├── transformer_tts │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ └── vits │ │ ├── normalize.py │ │ └── preprocess.py ├── train.py └── utils │ ├── __init__.py │ ├── pylogger.py │ ├── rich_utils.py │ └── utils.py ├── datamodules ├── __init__.py ├── fastspeech2_datamodule.py ├── gan_datamodule.py ├── tacotron2_datamodule.py ├── transformer_tts_datamodule.py ├── vits_datamodule.py └── wavernn_datamodule.py ├── datasets ├── __init__.py ├── am_batch_fn.py ├── batch.py ├── data_table.py ├── dataset.py ├── get_feats.py ├── ljspeech.py ├── preprocess_utils.py └── vocoder_batch_fn.py ├── feats_extract_from_torch ├── __init__.py ├── abs_feats_extract.py ├── complex_utils.py ├── dio.py ├── energy.py ├── linear_spectrogram.py ├── log_mel.py ├── log_mel_fbank.py ├── log_spectrogram.py └── stft.py ├── frontend ├── __init__.py ├── arpabet.py ├── generate_lexicon.py ├── normalizer │ ├── __init__.py │ ├── abbrrviation.py │ ├── acronyms.py │ ├── normalizer.py │ ├── numbers.py │ └── width.py ├── phonectic.py ├── punctuation.py ├── tone_sandhi.py ├── vocab.py ├── zh_frontend.py └── zh_normalization │ ├── README.md │ ├── __init__.py │ ├── char_convert.py │ ├── chronology.py │ ├── constants.py │ ├── num.py │ ├── phonecode.py │ ├── quantifier.py │ └── text_normlization.py ├── models ├── __init__.py ├── fastspeech2 │ ├── __init__.py │ ├── fastspeech2.py │ ├── loss.py │ └── model.py ├── hifigan │ ├── __init__.py │ ├── hifigan.py │ ├── loss.py │ ├── model.py │ └── residual_block.py ├── melgan │ ├── __init__.py │ ├── melgan.py │ ├── model.py │ ├── pqmf.py │ ├── residual_stack.py │ ├── style_melgan.py │ └── tade_res_block.py ├── parallel_wavegan │ ├── __init__.py │ ├── model.py │ ├── parallel_wavegan.py │ └── upsample.py ├── tacotron2 │ ├── __init__.py │ ├── loss.py │ ├── model.py │ └── tacotron2.py ├── transformer_tts │ ├── __init__.py │ ├── loss.py │ ├── model.py │ └── transformer.py ├── vits │ ├── __init__.py │ ├── duration_predictor.py │ ├── flow.py │ ├── generator.py │ ├── loss.py │ ├── 
model.py │ ├── monotonic_align │ │ ├── __init__.py │ │ ├── core.pyx │ │ └── setup.py │ ├── posterior_encoder.py │ ├── residual_coupling.py │ ├── text_encoder.py │ ├── transform.py │ ├── vits.py │ └── wavenet │ │ ├── __init__.py │ │ ├── residual_block.py │ │ └── wavenet.py └── wavernn │ ├── __init__.py │ ├── model.py │ └── wavernn.py ├── modules ├── __init__.py ├── activation.py ├── causal_conv.py ├── conformer │ ├── __init__.py │ ├── convolution.py │ ├── encoder.py │ ├── encoder_layer.py │ └── swish.py ├── conv.py ├── geometry.py ├── layer_norm.py ├── losses.py ├── masked_fill.py ├── nets_utils.py ├── normalizer.py ├── positional_encoding.py ├── pqmf.py ├── predictor │ ├── __init__.py │ ├── duration_calculator.py │ ├── duration_predictor.py │ ├── length_regulator.py │ └── variance_predictor.py ├── residual_block.py ├── residual_stack.py ├── style_encoder.py ├── tacotron2 │ ├── __init__.py │ ├── attentions.py │ ├── cbhg.py │ ├── decoder.py │ └── encoder.py ├── tade_res_block.py ├── transformer │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── dynamic_conv.py │ ├── dynamic_conv2d.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_layer.py │ ├── layer_norm.py │ ├── lightconv.py │ ├── lightconv2d.py │ ├── mask.py │ ├── multi_layer_conv.py │ ├── positionwise_feed_forward.py │ ├── repeat.py │ └── subsampling.py └── upsample.py └── utils ├── __init__.py ├── display.py ├── error_rate.py └── h5_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | outputs/ 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | ## What is deepaudio-tts? 2 | Deepaudio-tts is a framework for training neural-network-based Text-to-Speech (TTS) models. It includes, or will include, popular neural network architectures for TTS and vocoder models. 3 | 4 | To make it easy to use features such as mixed-precision training, multi-node training, and TPU training, I introduced PyTorch Lightning and Hydra in this framework. *It is still in development.* 5 | 6 | 7 | ## Training examples 8 | 1. Preprocess your data. (Scripts coming soon; for now you can follow the PaddleSpeech tutorial for this step.) 9 | 2. Train the model. Choose one experiment in deepaudio/tts/cli/configs/experiment, then train the model with the following commands (see also "More training examples" below): 10 | ``` 11 | $ export PYTHONPATH="${PYTHONPATH}:/dir/of/this/project/" 12 | $ python -m deepaudio.tts.cli.train experiment=tacotron2 datamodule.train_metadata=/your/path/to/train_metadata datamodule.dev_metadata=/your/path/to/dev_metadata 13 | ``` 14 | 15 | ## Supported Models 16 | 1. Tacotron2 17 | 2. FastSpeech2 18 | 3. Transformer TTS 19 | 4. Parallel WaveGAN 20 | 5. HiFiGAN 21 | 6. VITS 22 | 23 | ## Future plans 24 | ### Clean code 25 | 1. Remove redundant code. 26 | 2. Make deepaudio.tts.models cleaner. 27 | ### Models 28 | 1. Other models. 29 | 2. Pretrained models. 30 | ### Deployment 31 | 1. ONNX 32 | 2. TorchScript (JIT) 33 | ## How to contribute to deepaudio-tts 34 | 35 | This is a personal project, so I don't have enough GPU resources to run a lot of experiments. 36 | This project is still in development. 37 | I appreciate any kind of feedback or contributions. Please feel free to make a pull request for small issues such as bug fixes or experiment results. If you have any questions, please [open an issue](https://github.com/deepaudio/deepaudio-tts/issues).
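
## More training examples

The same entry point works for any experiment config under `deepaudio/tts/cli/configs/experiment`, and any value in the composed config can be overridden on the command line. The sketch below is a hedged example rather than a tested recipe: the metadata paths are placeholders, and the `batch_max_steps`/`n_shift` values must match whatever your preprocessing produced.

```
$ export PYTHONPATH="${PYTHONPATH}:/dir/of/this/project/"
# train the HiFiGAN vocoder on a single GPU (gan datamodule needs the extra STFT settings)
$ python -m deepaudio.tts.cli.train experiment=hifigan trainer=gpu \
    datamodule.train_metadata=/your/path/to/train_metadata \
    datamodule.dev_metadata=/your/path/to/dev_metadata \
    datamodule.batch_max_steps=8192 \
    datamodule.n_shift=256
```
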
38 | 39 | ## Acknowledgements 40 | I borrowed a lot of code from [espnet](https://github.com/espnet/espnet) and [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech). -------------------------------------------------------------------------------- /deepaudio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .audio import AudioProcessor 15 | from .codec import * 16 | from .spec_normalizer import LogMagnitude 17 | from .spec_normalizer import NormalizerBase 18 | -------------------------------------------------------------------------------- /deepaudio/tts/audio/audio.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | import librosa 15 | import numpy as np 16 | import soundfile as sf 17 | 18 | __all__ = ["AudioProcessor"] 19 | 20 | 21 | class AudioProcessor(object): 22 | def __init__(self, 23 | sample_rate: int, 24 | n_fft: int, 25 | win_length: int, 26 | hop_length: int, 27 | n_mels: int=80, 28 | fmin: int=0, 29 | fmax: int=None, 30 | window="hann", 31 | center=True, 32 | pad_mode="reflect", 33 | normalize=True): 34 | # read & write 35 | self.sample_rate = sample_rate 36 | self.normalize = normalize 37 | 38 | # stft 39 | self.n_fft = n_fft 40 | self.win_length = win_length 41 | self.hop_length = hop_length 42 | self.window = window 43 | self.center = center 44 | self.pad_mode = pad_mode 45 | 46 | # mel 47 | self.n_mels = n_mels 48 | self.fmin = fmin 49 | self.fmax = fmax 50 | 51 | self.mel_filter = self._create_mel_filter() 52 | self.inv_mel_filter = np.linalg.pinv(self.mel_filter) 53 | 54 | def _create_mel_filter(self): 55 | mel_filter = librosa.filters.mel( 56 | sr=self.sample_rate, 57 | n_fft=self.n_fft, 58 | n_mels=self.n_mels, 59 | fmin=self.fmin, 60 | fmax=self.fmax) 61 | return mel_filter 62 | 63 | def read_wav(self, filename): 64 | # resampling may occur 65 | wav, _ = librosa.load(filename, sr=self.sample_rate) 66 | 67 | # normalize the volume 68 | if self.normalize: 69 | wav = wav / np.max(np.abs(wav)) * 0.999 70 | return wav 71 | 72 | def write_wav(self, path, wav): 73 | sf.write(path, wav, samplerate=self.sample_rate) 74 | 75 | def stft(self, wav): 76 | D = librosa.core.stft( 77 | wav, 78 | n_fft=self.n_fft, 79 | hop_length=self.hop_length, 80 | win_length=self.win_length, 81 | window=self.window, 82 | center=self.center, 83 | pad_mode=self.pad_mode) 84 | return D 85 | 86 | def istft(self, D): 87 | wav = librosa.core.istft( 88 | D, 89 | hop_length=self.hop_length, 90 | win_length=self.win_length, 91 | window=self.window, 92 | center=self.center) 93 | return wav 94 | 95 | def spectrogram(self, wav): 96 | D = self.stft(wav) 97 | return np.abs(D) 98 | 99 | def mel_spectrogram(self, wav): 100 | S = self.spectrogram(wav) 101 | mel = np.dot(self.mel_filter, S) 102 | return mel 103 | -------------------------------------------------------------------------------- /deepaudio/tts/audio/codec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import math 15 | 16 | import numpy as np 17 | import paddle 18 | 19 | 20 | # x: [0: 2**bit-1], return: [-1, 1] 21 | def label_2_float(x, bits): 22 | return 2 * x / (2**bits - 1.) - 1. 23 | 24 | 25 | #x: [-1, 1], return: [0, 2**bits-1] 26 | def float_2_label(x, bits): 27 | assert abs(x).max() <= 1.0 28 | x = (x + 1.) 
* (2**bits - 1) / 2 29 | return x.clip(0, 2**bits - 1) 30 | 31 | 32 | # y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1] 33 | # see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm 34 | # be careful the input `mu` here, which is +1 than that of the link above 35 | def encode_mu_law(x, mu): 36 | mu = mu - 1 37 | fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) 38 | return np.floor((fx + 1) / 2 * mu + 0.5) 39 | 40 | 41 | # from_labels = True: 42 | # y: [0: 2**bit-1], mu: 2**bits, return: [-1,1] 43 | # from_labels = False: 44 | # y: [-1, 1], return: [-1, 1] 45 | def decode_mu_law(y, mu, from_labels=True): 46 | # TODO: get rid of log2 - makes no sense 47 | if from_labels: 48 | y = label_2_float(y, math.log2(mu)) 49 | mu = mu - 1 50 | x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1) 51 | return x 52 | -------------------------------------------------------------------------------- /deepaudio/tts/audio/spec_normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This modules contains normalizers for spectrogram magnitude. 16 | Normalizers are invertible transformations. They can be used to process 17 | magnitude of spectrogram before training and can also be used to recover from 18 | the generated spectrogram so as to be used with vocoders like griffin lim. 19 | 20 | The base class describe the interface. `transform` is used to perform 21 | transformation and `inverse` is used to perform the inverse transformation. 22 | 23 | check issues: 24 | https://github.com/mozilla/TTS/issues/377 25 | """ 26 | import numpy as np 27 | 28 | __all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"] 29 | 30 | 31 | class NormalizerBase(object): 32 | def transform(self, spec): 33 | raise NotImplementedError("transform must be implemented") 34 | 35 | def inverse(self, normalized): 36 | raise NotImplementedError("inverse must be implemented") 37 | 38 | 39 | class LogMagnitude(NormalizerBase): 40 | """ 41 | This is a simple normalizer used in Waveglow, Waveflow, tacotron2... 
42 | """ 43 | 44 | def __init__(self, min=1e-5): 45 | self.min = min 46 | 47 | def transform(self, x): 48 | x = np.maximum(x, self.min) 49 | x = np.log(x) 50 | return x 51 | 52 | def inverse(self, x): 53 | return np.exp(x) 54 | 55 | 56 | class UnitMagnitude(NormalizerBase): 57 | # dbscale and (0, 1) normalization 58 | """ 59 | This is the normalizer used in the 60 | """ 61 | 62 | def __init__(self, min=1e-5): 63 | self.min = min 64 | 65 | def transform(self, x): 66 | db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20 67 | normalized = (db_scale + 100) / 100 68 | clipped = np.clip(normalized, 0, 1) 69 | return clipped 70 | 71 | def inverse(self, x): 72 | denormalized = np.clip(x, 0, 1) * 100 - 100 73 | out = np.exp((denormalized + 20) / 20 * np.log(10)) 74 | return out 75 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/cli/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/cli/configs/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - early_stopping.yaml 4 | - model_summary.yaml 5 | - rich_progress_bar.yaml 6 | - _self_ 7 | 8 | model_checkpoint: 9 | dirpath: ${paths.output_dir}/checkpoints 10 | filename: "epoch_{epoch:03d}" 11 | monitor: "val/loss" 12 | mode: "min" 13 | save_last: True 14 | auto_insert_metric_name: False 15 | 16 | early_stopping: 17 | monitor: "val/loss" 18 | patience: 100 19 | mode: "min" 20 | 21 | model_summary: 22 | max_depth: -1 23 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/early_stopping.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.EarlyStopping.html 2 | 3 | # Monitor a metric and stop training when it stops improving. 4 | # Look at the above link for more detailed information. 5 | early_stopping: 6 | _target_: pytorch_lightning.callbacks.EarlyStopping 7 | monitor: ??? # quantity to be monitored, must be specified !!! 8 | min_delta: 0. 
# minimum change in the monitored quantity to qualify as an improvement 9 | patience: 3 # number of checks with no improvement after which training will be stopped 10 | verbose: False # verbosity mode 11 | mode: "min" # "max" means higher metric value is better, can be also "min" 12 | strict: True # whether to crash the training if monitor is not found in the validation metrics 13 | check_finite: True # when set True, stops training when the monitor becomes NaN or infinite 14 | stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold 15 | divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold 16 | check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch 17 | # log_rank_zero_only: False # this keyword argument isn't available in stable version 18 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.ModelCheckpoint.html 2 | 3 | # Save the model periodically by monitoring a quantity. 4 | # Look at the above link for more detailed information. 5 | model_checkpoint: 6 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 7 | dirpath: null # directory to save the model file 8 | filename: null # checkpoint filename 9 | monitor: null # name of the logged metric which determines when model is improving 10 | verbose: False # verbosity mode 11 | save_last: null # additionally always save an exact copy of the last checkpoint to a file last.ckpt 12 | save_top_k: 1 # save k best models (determined by above metric) 13 | mode: "min" # "max" means higher metric value is better, can be also "min" 14 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 15 | save_weights_only: False # if True, then only the model’s weights will be saved 16 | every_n_train_steps: null # number of training steps between checkpoints 17 | train_time_interval: null # checkpoints are monitored at the specified time interval 18 | every_n_epochs: null # number of epochs between checkpoints 19 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 20 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichModelSummary.html 2 | 3 | # Generates a summary of all layers in a LightningModule with rich text formatting. 4 | # Look at the above link for more detailed information. 
5 | model_summary: 6 | _target_: pytorch_lightning.callbacks.RichModelSummary 7 | max_depth: 1 # the maximum depth of layer nesting that the summary will include 8 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/cli/configs/callbacks/none.yaml -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichProgressBar.html 2 | 3 | # Create a progress bar with rich text formatting. 4 | # Look at the above link for more detailed information. 5 | rich_progress_bar: 6 | _target_: pytorch_lightning.callbacks.RichProgressBar 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/fastspeech2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.fastspeech2_datamodule.Fastspeech2DataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/gan.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.gan_datamodule.GanDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_max_steps: ??? 5 | n_shift: ??? 6 | batch_size: 128 7 | num_workers: 0 8 | pin_memory: False 9 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/tacotron2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.tacotron2_datamodule.Tacaotron2DataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/transformer_tts.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.transformer_tts_datamodule.TransformerTTSDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/vits.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.vits_datamodule.VitsDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/wavernn.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.wavernn_datamodule.WaveRNNDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_max_steps: ??? 5 | n_shift: ??? 6 | mode: ??? 7 | bits: ??? 
8 | batch_size: 128 9 | num_workers: 0 10 | pin_memory: False 11 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/fastspeech2.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=fastspeech2 5 | 6 | defaults: 7 | - override /datamodule: fastspeech2.yaml 8 | - override /model: fastspeech2.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["fastspeech2", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/hifigan.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=hifigan 5 | 6 | defaults: 7 | - override /datamodule: gan.yaml 8 | - override /model: hifigan.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["hifigan", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/parallel_wavegan.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=parallel_wavegan 5 | 6 | defaults: 7 | - override /datamodule: gan.yaml 8 | - override /model: parallel_wavegan.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["parallel_wavegan", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/tacotron2.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=tacotron2 5 | 6 | defaults: 7 | - override /datamodule: tacotron2.yaml 8 | - override /model: tacotron2.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["tacotron2", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | 
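
The experiment configs in this directory only pin a handful of values (tags, seed, trainer epochs, batch size); everything else in the composed config can still be overridden from the command line, and Hydra's multirun mode can sweep several values in one call. A hedged sketch, assuming a preprocessed LJSpeech setup (the metadata paths are placeholders and the sweep values are arbitrary):

```
# sweep two batch sizes for the tacotron2 experiment on GPU;
# each run goes to the multiruns directory configured in hydra/default.yaml
$ python -m deepaudio.tts.cli.train -m experiment=tacotron2 trainer=gpu \
    datamodule.batch_size=32,64 \
    datamodule.train_metadata=/your/path/to/train_metadata \
    datamodule.dev_metadata=/your/path/to/dev_metadata
```
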
-------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/transformer_tts.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=transformer_tts 5 | 6 | defaults: 7 | - override /datamodule: transformer_tts.yaml 8 | - override /model: transformer_tts.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["transformer_tts", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/vits.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=vits 5 | 6 | defaults: 7 | - override /datamodule: vits.yaml 8 | - override /model: vits.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["vits", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: default 6 | - override job_logging: default 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: pytorch_lightning.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | 
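
Any of the logger configs in this directory can be activated per run; `train.yaml` leaves `logger: null`, so logging is opt-in from the command line (or `logger=many_loggers` for several backends at once). A minimal sketch, assuming TensorBoard and the default `paths.log_dir` of `logs/` (metadata paths are placeholders):

```
$ python -m deepaudio.tts.cli.train experiment=tacotron2 logger=tensorboard \
    datamodule.train_metadata=/your/path/to/train_metadata \
    datamodule.dev_metadata=/your/path/to/dev_metadata
# inspect the curves afterwards
$ tensorboard --logdir logs/
```
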
-------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: pytorch_lightning.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet.yaml 5 | - csv.yaml 6 | # - mlflow.yaml 7 | # - neptune.yaml 8 | - tensorboard.yaml 9 | - wandb.yaml 10 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: pytorch_lightning.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: pytorch_lightning.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: pytorch_lightning.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 
9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/fastspeech2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.fastspeech2.model.Fastspeech2Model 2 | 3 | optimizer: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.0 8 | 9 | scheduler: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | model: 17 | _target_: deepaudio.tts.models.fastspeech2.fastspeech2.FastSpeech2 18 | idim: 80 # Dimension of the inputs 19 | odim: 80 # Dimension of the outputs. 20 | adim: 384 # attention dimension 21 | aheads: 2 # number of attention heads 22 | elayers: 4 # number of encoder layers 23 | eunits: 1536 # number of encoder ff units 24 | dlayers: 4 # number of decoder layers 25 | dunits: 1536 # number of decoder ff units 26 | positionwise_layer_type: conv1d # type of position-wise layer 27 | positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer 28 | duration_predictor_layers: 2 # number of layers of duration predictor 29 | duration_predictor_chans: 256 # number of channels of duration predictor 30 | duration_predictor_kernel_size: 3 # filter size of duration predictor 31 | postnet_layers: 5 # number of layers of postnset 32 | postnet_filts: 5 # filter size of conv layers in postnet 33 | postnet_chans: 256 # number of channels of conv layers in postnet 34 | use_scaled_pos_enc: True # whether to use scaled positional encoding 35 | encoder_normalize_before: True # whether to perform layer normalization before the input 36 | decoder_normalize_before: True # whether to perform layer normalization before the input 37 | reduction_factor: 1 # reduction factor 38 | init_type: xavier_uniform # initialization type 39 | init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding 40 | init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding 41 | transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer 42 | transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding 43 | transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer 44 | transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer 45 | transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding 46 | transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer 47 | pitch_predictor_layers: 5 # number of conv layers in pitch predictor 48 | pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor 49 | pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor 50 | pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor 51 | pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch 52 | pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch 53 | stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder 54 | 
energy_predictor_layers: 2 # number of conv layers in energy predictor 55 | energy_predictor_chans: 256 # number of channels of conv layers in energy predictor 56 | energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor 57 | energy_predictor_dropout: 0.5 # dropout rate in energy predictor 58 | energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy 59 | energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy 60 | stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder 61 | 62 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/parallel_wavegan.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.parallel_wavegan.model.ParallelWaveGANModel 2 | 3 | optimizer_d: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.00001 8 | 9 | scheduler_d: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | optimizer_g: 17 | _target_: torch.optim.Adam 18 | _partial_: true 19 | lr: 0.001 20 | weight_decay: 0.00001 21 | 22 | scheduler_g: 23 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 24 | _partial_: true 25 | mode: min 26 | factor: 0.1 27 | patience: 10 28 | 29 | lambda_aux: 1.0 30 | lambda_adv: 4.0 31 | 32 | generator: 33 | _target_: deepaudio.tts.models.parallel_wavegan.parallel_wavegan.ParallelWaveGANGenerator 34 | in_channels: 1 # Number of input channels. 35 | out_channels: 1 # Number of output channels. 36 | kernel_size: 3 # Kernel size of dilated convolution. 37 | layers: 30 # Number of residual block layers. 38 | stacks: 3 # Number of stacks i.e., dilation cycles. 39 | residual_channels: 64 # Number of channels in residual conv. 40 | gate_channels: 128 # Number of channels in gated conv. 41 | skip_channels: 64 # Number of channels in skip conv. 42 | aux_channels: 80 # Number of channels for auxiliary feature conv. 43 | # Must be the same as num_mels. 44 | aux_context_window: 2 # Context window size for auxiliary feature. 45 | # If set to 2, previous 2 and future 2 frames will be considered. 46 | dropout_rate: 0.0 # Dropout rate. 0.0 means no dropout applied. 47 | use_weight_norm: True # Whether to use weight norm. 48 | # If set to true, it will be applied to all of the conv layers. 49 | #upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift 50 | 51 | discriminator: 52 | _target_: deepaudio.tts.models.parallel_wavegan.parallel_wavegan.ParallelWaveGANDiscriminator 53 | in_channels: 1 # Number of input channels. 54 | out_channels: 1 # Number of output channels. 55 | kernel_size: 3 # Kernel size of conv layers. 56 | layers: 10 # Number of conv layers. 57 | conv_channels: 64 # Number of conv channels. 58 | bias: True # Whether to use bias parameter in conv. 59 | use_weight_norm: True # Whether to use weight norm. 60 | # If set to true, it will be applied to all of the conv layers. 61 | nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 62 | nonlinear_activation_params: # Nonlinear function parameters 63 | negative_slope: 0.2 # Alpha in leakyrelu. 64 | 65 | criterion_stft: 66 | _target_: deepaudio.tts.modules.losses.MultiResolutionSTFTLoss 67 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
68 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 69 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 70 | window: "hann" # Window function for STFT-based loss 71 | 72 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/tacotron2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.tacotron2.model.Tacotron2Model 2 | 3 | optimizer: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.0 8 | 9 | scheduler: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | loss_type: L1+L2 17 | use_guided_attn_loss: True 18 | 19 | taco2_loss: 20 | _target_: deepaudio.tts.models.tacotron2.loss.Tacotron2Loss 21 | use_masking: True 22 | use_weighted_masking: False 23 | bce_pos_weight: 5.0 24 | 25 | attn_loss: 26 | _target_: deepaudio.tts.models.tacotron2.loss.GuidedAttentionLoss 27 | sigma: 0.4 28 | alpha: 1.0 29 | reset_always: True 30 | 31 | model: 32 | _target_: deepaudio.tts.models.tacotron2.tacotron2.Tacotron2 33 | idim: 80 34 | odim: 80 35 | embed_dim: 512 # char or phn embedding dimension 36 | elayers: 1 # number of blstm layers in encoder 37 | eunits: 512 # number of blstm units 38 | econv_layers: 3 # number of convolutional layers in encoder 39 | econv_chans: 512 # number of channels in convolutional layer 40 | econv_filts: 5 # filter size of convolutional layer 41 | atype: location # attention function type 42 | adim: 512 # attention dimension 43 | aconv_chans: 32 # number of channels in convolutional layer of attention 44 | aconv_filts: 15 # filter size of convolutional layer of attention 45 | cumulate_att_w: True # whether to cumulate attention weight 46 | dlayers: 2 # number of lstm layers in decoder 47 | dunits: 1024 # number of lstm units in decoder 48 | prenet_layers: 2 # number of layers in prenet 49 | prenet_units: 256 # number of units in prenet 50 | postnet_layers: 5 # number of layers in postnet 51 | postnet_chans: 512 # number of channels in postnet 52 | postnet_filts: 5 # filter size of postnet layer 53 | output_activation: null # activation function for the final output 54 | use_batch_norm: True # whether to use batch normalization in encoder 55 | use_concate: True # whether to concatenate encoder embedding with decoder outputs 56 | use_residual: False # whether to use residual connection in encoder 57 | dropout_rate: 0.5 # dropout rate 58 | zoneout_rate: 0.1 # zoneout rate 59 | reduction_factor: 1 # reduction factor 60 | spk_embed_dim: null # speaker embedding dimension 61 | 62 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/transformer_tts.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.transformer_tts.model.TransformerTTSModel 2 | 3 | optimizer: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.00001 8 | 9 | scheduler: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | loss_type: L1+L2 17 | modules_applied_guided_attn: encoder-decoder 18 | use_guided_attn_loss: True 19 | 20 | transformer_loss: 21 | _target_: deepaudio.tts.models.transformer_tts.loss.TransformerLoss 22 | use_masking: True 23 | use_weighted_masking: False 24 |
bce_pos_weight: 20.0 25 | 26 | atten_criterion: 27 | _target_: deepaudio.tts.models.transformer_tts.loss.GuidedMultiHeadAttentionLoss 28 | sigma: 0.4 29 | alpha: 1.0 30 | reset_always: True 31 | 32 | 33 | model: 34 | _target_: deepaudio.tts.models.transformer_tts.transformer.Transformer 35 | idim: 80 36 | odim: 80 37 | embed_dim: 0 # embedding dimension in encoder prenet 38 | eprenet_conv_layers: 0 # number of conv layers in encoder prenet 39 | # if set to 0, no encoder prenet will be used 40 | eprenet_conv_filts: 0 # filter size of conv layers in encoder prenet 41 | eprenet_conv_chans: 0 # number of channels of conv layers in encoder prenet 42 | dprenet_layers: 2 # number of layers in decoder prenet 43 | dprenet_units: 256 # number of units in decoder prenet 44 | adim: 512 # attention dimension 45 | aheads: 8 # number of attention heads 46 | elayers: 6 # number of encoder layers 47 | eunits: 1024 # number of encoder ff units 48 | dlayers: 6 # number of decoder layers 49 | dunits: 1024 # number of decoder ff units 50 | positionwise_layer_type: conv1d # type of position-wise layer 51 | positionwise_conv_kernel_size: 1 # kernel size of position wise conv layer 52 | postnet_layers: 5 # number of layers of postnset 53 | postnet_filts: 5 # filter size of conv layers in postnet 54 | postnet_chans: 256 # number of channels of conv layers in postnet 55 | use_scaled_pos_enc: True # whether to use scaled positional encoding 56 | encoder_normalize_before: True # whether to perform layer normalization before the input 57 | decoder_normalize_before: True # whether to perform layer normalization before the input 58 | reduction_factor: 1 # reduction factor 59 | init_type: xavier_uniform # initialization type 60 | init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding 61 | init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding 62 | eprenet_dropout_rate: 0.0 # dropout rate for encoder prenet 63 | dprenet_dropout_rate: 0.5 # dropout rate for decoder prenet 64 | postnet_dropout_rate: 0.5 # dropout rate for postnet 65 | transformer_enc_dropout_rate: 0.1 # dropout rate for transformer encoder layer 66 | transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding 67 | transformer_enc_attn_dropout_rate: 0.1 # dropout rate for transformer encoder attention layer 68 | transformer_dec_dropout_rate: 0.1 # dropout rate for transformer decoder layer 69 | transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding 70 | transformer_dec_attn_dropout_rate: 0.1 # dropout rate for transformer decoder attention layer 71 | transformer_enc_dec_attn_dropout_rate: 0.1 # dropout rate for transformer encoder-decoder attention layer 72 | num_heads_applied_guided_attn: 2 # number of heads to apply guided attention loss 73 | num_layers_applied_guided_attn: 2 # number of layers to apply guided attention loss 74 | 75 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # PROJECT_ROOT is inferred and set by pyrootutils package in `train.py` and `eval.py` 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to logging directory 7 | log_dir: ${paths.root_dir}/logs/ 8 | 9 | # path to output directory, created dynamically by hydra 10 | # path generation 
pattern is specified in `configs/hydra/default.yaml` 11 | # use it to store all files generated during the run, like ckpts and metrics 12 | output_dir: ${hydra:runtime.output_dir} 13 | 14 | # path to working directory 15 | work_dir: ${hydra:runtime.cwd} 16 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - datamodule: tacotron2.yaml 8 | - model: tacotron2.yaml 9 | - callbacks: default.yaml 10 | - logger: null # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default.yaml 12 | - paths: default.yaml 13 | - extras: default.yaml 14 | - hydra: default.yaml 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default.yaml 26 | 27 | # debugging config (enable through command line, e.g. `python train.py debug=default) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | # tags to help you identify your experiments 34 | # you can overwrite this in experiment configs 35 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 36 | # appending lists from command line is currently not supported :( 37 | # https://github.com/facebookresearch/hydra/issues/1547 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: null 52 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | # use "ddp_spawn" instead of "ddp", 5 | # it's slower but normal "ddp" currently doesn't work ideally with hydra 6 | # https://github.com/facebookresearch/hydra/issues/2070 7 | # https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu_intermediate.html#distributed-data-parallel-spawn 8 | strategy: ddp_spawn 9 | 10 | accelerator: gpu 11 | devices: 4 12 | num_nodes: 1 13 | sync_batchnorm: True 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - 
default.yaml 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | min_epochs: 1 # prevents early stopping 6 | max_epochs: 10 7 | 8 | accelerator: cpu 9 | devices: 1 10 | 11 | # mixed precision for extra speed-up 12 | # precision: 16 13 | 14 | # perform a validation loop every N training epochs 15 | check_val_every_n_epoch: 1 16 | 17 | # set True to to ensure deterministic results 18 | # makes training slower but gives more reproducibility than just setting seeds 19 | deterministic: False 20 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/fastspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/gan_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/transformer_tts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/train.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import hydra 4 | import pytorch_lightning as pl 5 | from omegaconf import DictConfig 6 | from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer 7 | from pytorch_lightning.loggers import LightningLoggerBase 8 | 9 | from deepaudio.tts.cli import utils 10 | 11 | log = utils.get_pylogger(__name__) 12 | 13 | 14 | @utils.task_wrapper 15 | def train(cfg: DictConfig) -> Tuple[dict, dict]: 16 | """Trains the model. Can additionally evaluate on a testset, using best weights obtained during 17 | training. 18 | This method is wrapped in optional @task_wrapper decorator which applies extra utilities 19 | before and after the call. 20 | Args: 21 | cfg (DictConfig): Configuration composed by Hydra. 22 | Returns: 23 | Tuple[dict, dict]: Dict with metrics and dict with all instantiated objects. 
24 | """ 25 | 26 | # set seed for random number generators in pytorch, numpy and python.random 27 | if cfg.get("seed"): 28 | pl.seed_everything(cfg.seed, workers=True) 29 | 30 | log.info(f"Instantiating datamodule <{cfg.datamodule._target_}>") 31 | datamodule: LightningDataModule = hydra.utils.instantiate(cfg.datamodule) 32 | 33 | log.info(f"Instantiating model <{cfg.model._target_}>") 34 | model: LightningModule = hydra.utils.instantiate(cfg.model) 35 | 36 | log.info("Instantiating callbacks...") 37 | callbacks: List[Callback] = utils.instantiate_callbacks(cfg.get("callbacks")) 38 | 39 | log.info("Instantiating loggers...") 40 | logger: List[LightningLoggerBase] = utils.instantiate_loggers(cfg.get("logger")) 41 | 42 | log.info(f"Instantiating trainer <{cfg.trainer._target_}>") 43 | trainer: Trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=logger) 44 | 45 | object_dict = { 46 | "cfg": cfg, 47 | "datamodule": datamodule, 48 | "model": model, 49 | "callbacks": callbacks, 50 | "logger": logger, 51 | "trainer": trainer, 52 | } 53 | 54 | if logger: 55 | log.info("Logging hyperparameters!") 56 | utils.log_hyperparameters(object_dict) 57 | 58 | if cfg.get("train"): 59 | log.info("Starting training!") 60 | trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path")) 61 | 62 | train_metrics = trainer.callback_metrics 63 | 64 | if cfg.get("test"): 65 | log.info("Starting testing!") 66 | ckpt_path = trainer.checkpoint_callback.best_model_path 67 | if ckpt_path == "": 68 | log.warning("Best ckpt not found! Using current weights for testing...") 69 | ckpt_path = None 70 | trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path) 71 | log.info(f"Best ckpt path: {ckpt_path}") 72 | 73 | test_metrics = trainer.callback_metrics 74 | 75 | # merge train and test metrics 76 | metric_dict = {**train_metrics, **test_metrics} 77 | 78 | return metric_dict, object_dict 79 | 80 | 81 | @hydra.main(version_base="1.2", config_path="configs", config_name="train.yaml") 82 | def main(cfg: DictConfig) -> Optional[float]: 83 | 84 | # train the model 85 | metric_dict, _ = train(cfg) 86 | 87 | # safely retrieve metric value for hydra-based hyperparameter optimization 88 | metric_value = utils.get_metric_value( 89 | metric_dict=metric_dict, metric_name=cfg.get("optimized_metric") 90 | ) 91 | 92 | # return optimized metric 93 | return metric_value 94 | 95 | 96 | if __name__ == "__main__": 97 | main() -------------------------------------------------------------------------------- /deepaudio/tts/cli/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from deepaudio.tts.cli.utils.pylogger import get_pylogger 2 | from deepaudio.tts.cli.utils.rich_utils import enforce_tags, print_config_tree 3 | from deepaudio.tts.cli.utils.utils import ( 4 | close_loggers, 5 | extras, 6 | get_metric_value, 7 | instantiate_callbacks, 8 | instantiate_loggers, 9 | log_hyperparameters, 10 | save_file, 11 | task_wrapper, 12 | ) 13 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pytorch_lightning.utilities import rank_zero_only 4 | 5 | 6 | def get_pylogger(name=__name__) -> logging.Logger: 7 | """Initializes multi-GPU-friendly python command line logger.""" 8 | 9 | logger = logging.getLogger(name) 10 | 11 | # this ensures all logging levels get marked with the 
rank zero decorator 12 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 13 | logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") 14 | for level in logging_levels: 15 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 16 | 17 | return logger 18 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/utils/rich_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | import rich 5 | import rich.syntax 6 | import rich.tree 7 | from hydra.core.hydra_config import HydraConfig 8 | from omegaconf import DictConfig, OmegaConf, open_dict 9 | from pytorch_lightning.utilities import rank_zero_only 10 | from rich.prompt import Prompt 11 | 12 | from deepaudio.tts.cli.utils import pylogger 13 | 14 | log = pylogger.get_pylogger(__name__) 15 | 16 | 17 | @rank_zero_only 18 | def print_config_tree( 19 | cfg: DictConfig, 20 | print_order: Sequence[str] = ( 21 | "datamodule", 22 | "model", 23 | "callbacks", 24 | "logger", 25 | "trainer", 26 | "paths", 27 | "extras", 28 | ), 29 | resolve: bool = False, 30 | save_to_file: bool = False, 31 | ) -> None: 32 | """Prints content of DictConfig using Rich library and its tree structure. 33 | 34 | Args: 35 | cfg (DictConfig): Configuration composed by Hydra. 36 | print_order (Sequence[str], optional): Determines in what order config components are printed. 37 | resolve (bool, optional): Whether to resolve reference fields of DictConfig. 38 | save_to_file (bool, optional): Whether to export config to the hydra output folder. 39 | """ 40 | 41 | style = "dim" 42 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 43 | 44 | queue = [] 45 | 46 | # add fields from `print_order` to queue 47 | for field in print_order: 48 | queue.append(field) if field in cfg else log.warning( 49 | f"Field '{field}' not found in config. Skipping '{field}' config printing..." 50 | ) 51 | 52 | # add all the other fields to queue (not specified in `print_order`) 53 | for field in cfg: 54 | if field not in queue: 55 | queue.append(field) 56 | 57 | # generate config tree from queue 58 | for field in queue: 59 | branch = tree.add(field, style=style, guide_style=style) 60 | 61 | config_group = cfg[field] 62 | if isinstance(config_group, DictConfig): 63 | branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) 64 | else: 65 | branch_content = str(config_group) 66 | 67 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 68 | 69 | # print config tree 70 | rich.print(tree) 71 | 72 | # save config tree to file 73 | if save_to_file: 74 | with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file: 75 | rich.print(tree, file=file) 76 | 77 | 78 | @rank_zero_only 79 | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None: 80 | """Prompts user to input tags from command line if no tags are provided in config.""" 81 | 82 | if not cfg.get("tags"): 83 | if "id" in HydraConfig().cfg.hydra.job: 84 | raise ValueError("Specify tags before launching a multirun!") 85 | 86 | log.warning("No tags provided in config. 
Prompting user to input tags...") 87 | tags = Prompt.ask("Enter a list of comma separated tags", default="dev") 88 | tags = [t.strip() for t in tags.split(",") if t != ""] 89 | 90 | with open_dict(cfg): 91 | cfg.tags = tags 92 | 93 | log.info(f"Tags: {cfg.tags}") 94 | 95 | if save_to_file: 96 | with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file: 97 | rich.print(cfg.tags, file=file) 98 | 99 | 100 | if __name__ == "__main__": 101 | from hydra import compose, initialize 102 | 103 | with initialize(version_base="1.2", config_path="../../configs"): 104 | cfg = compose(config_name="train.yaml", return_hydra_config=False, overrides=[]) 105 | print_config_tree(cfg, resolve=False, save_to_file=False) 106 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/datamodules/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/fastspeech2_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn 9 | from deepaudio.tts.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn 10 | from deepaudio.tts.datasets.data_table import DataTable 11 | 12 | 13 | class Fastspeech2DataModule(LightningDataModule): 14 | def __init__(self, 15 | train_metadata: str, 16 | dev_metadata: str, 17 | batch_size: int = 64, 18 | num_workers: int = 0, 19 | pin_memory: bool = False, 20 | speaker_dict: Optional[str] = None, 21 | voice_cloning: Optional[bool] = False, 22 | ): 23 | super().__init__() 24 | self.save_hyperparameters(logger=False) 25 | self.train_dataset: Optional[Dataset] = None 26 | self.dev_dataset: Optional[Dataset] = None 27 | 28 | def setup(self, stage: Optional[str] = None) -> None: 29 | fields = [ 30 | "text", "text_lengths", "speech", "speech_lengths", "durations", 31 | "pitch", "energy" 32 | ] 33 | converters = {"speech": np.load, "pitch": np.load, "energy": np.load} 34 | spk_num = None 35 | if self.hparams.speaker_dict is not None: 36 | print("multiple speaker fastspeech2!") 37 | self.collate_fn = fastspeech2_multi_spk_batch_fn 38 | with open(self.hparams.speaker_dict, 'rt') as f: 39 | spk_id = [line.strip().split() for line in f.readlines()] 40 | spk_num = len(spk_id) 41 | fields += ["spk_id"] 42 | elif self.hparams.voice_cloning: 43 | print("Training voice cloning!") 44 | self.collate_fn = fastspeech2_multi_spk_batch_fn 45 | fields += ["spk_emb"] 46 | converters["spk_emb"] = np.load 47 | else: 48 | print("single speaker fastspeech2!") 49 | self.collate_fn = fastspeech2_single_spk_batch_fn 50 | print("spk_num:", spk_num) 51 | 52 | # construct dataset for training and validation 53 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 54 | train_metadata = list(reader) 55 | self.train_dataset = DataTable( 56 | data=train_metadata, 57 | fields=fields, 58 | converters=converters, ) 59 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 60 | dev_metadata = list(reader) 61 | self.dev_dataset = DataTable( 62 | data=dev_metadata, 
63 | fields=fields, 64 | converters=converters, ) 65 | 66 | def train_dataloader(self): 67 | return DataLoader( 68 | dataset=self.train_dataset, 69 | batch_size=self.hparams.batch_size, 70 | num_workers=self.hparams.num_workers, 71 | pin_memory=self.hparams.pin_memory, 72 | shuffle=True, 73 | collate_fn=self.collate_fn, 74 | ) 75 | 76 | def val_dataloader(self): 77 | return DataLoader( 78 | dataset=self.dev_dataset, 79 | batch_size=self.hparams.batch_size, 80 | num_workers=self.hparams.num_workers, 81 | pin_memory=self.hparams.pin_memory, 82 | shuffle=False, 83 | collate_fn=self.collate_fn, 84 | ) 85 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/gan_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.vocoder_batch_fn import Clip 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class GanDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_max_steps: int, 17 | n_shift: int, 18 | aux_context_window: Optional[int] = 0, 19 | batch_size: int = 64, 20 | num_workers: int = 0, 21 | pin_memory: bool = False, 22 | ): 23 | super().__init__() 24 | self.save_hyperparameters(logger=False) 25 | self.train_dataset: Optional[Dataset] = None 26 | self.dev_dataset: Optional[Dataset] = None 27 | 28 | def setup(self, stage: Optional[str] = None) -> None: 29 | # construct dataset for training and validation 30 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 31 | train_metadata = list(reader) 32 | self.train_dataset = DataTable( 33 | data=train_metadata, 34 | fields=["wave", "feats"], 35 | converters={ 36 | "wave": np.load, 37 | "feats": np.load, 38 | }, ) 39 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 40 | dev_metadata = list(reader) 41 | self.dev_dataset = DataTable( 42 | data=dev_metadata, 43 | fields=["wave", "feats"], 44 | converters={ 45 | "wave": np.load, 46 | "feats": np.load, 47 | }, ) 48 | 49 | self.collate_fn = Clip( 50 | batch_max_steps=self.hparams.batch_max_steps, 51 | hop_size=self.hparams.n_shift, 52 | aux_context_window=self.hparams.aux_context_window) 53 | 54 | def train_dataloader(self): 55 | return DataLoader( 56 | dataset=self.train_dataset, 57 | batch_size=self.hparams.batch_size, 58 | num_workers=self.hparams.num_workers, 59 | pin_memory=self.hparams.pin_memory, 60 | shuffle=True, 61 | collate_fn=self.collate_fn, 62 | ) 63 | 64 | def val_dataloader(self): 65 | return DataLoader( 66 | dataset=self.dev_dataset, 67 | batch_size=self.hparams.batch_size, 68 | num_workers=self.hparams.num_workers, 69 | pin_memory=self.hparams.pin_memory, 70 | shuffle=False, 71 | collate_fn=self.collate_fn, 72 | ) 73 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/tacotron2_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn 9 | from 
deepaudio.tts.datasets.am_batch_fn import tacotron2_single_spk_batch_fn 10 | from deepaudio.tts.datasets.data_table import DataTable 11 | 12 | class Tacaotron2DataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_size: int = 64, 17 | num_workers: int = 0, 18 | pin_memory: bool = False, 19 | voice_cloning: Optional[bool] = False, 20 | ): 21 | super().__init__() 22 | self.save_hyperparameters(logger=False) 23 | self.train_dataset: Optional[Dataset] = None 24 | self.dev_dataset: Optional[Dataset] = None 25 | 26 | def setup(self, stage: Optional[str] = None) -> None: 27 | fields = [ 28 | "text", 29 | "text_lengths", 30 | "speech", 31 | "speech_lengths", 32 | ] 33 | 34 | converters = { 35 | "speech": np.load, 36 | } 37 | if self.hparams.voice_cloning: 38 | print("Training voice cloning!") 39 | self.collate_fn = tacotron2_multi_spk_batch_fn 40 | fields += ["spk_emb"] 41 | converters["spk_emb"] = np.load 42 | else: 43 | print("single speaker tacotron2!") 44 | self.collate_fn = tacotron2_single_spk_batch_fn 45 | 46 | # construct dataset for training and validation 47 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 48 | train_metadata = list(reader) 49 | self.train_dataset = DataTable( 50 | data=train_metadata, 51 | fields=fields, 52 | converters=converters, ) 53 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 54 | dev_metadata = list(reader) 55 | self.dev_dataset = DataTable( 56 | data=dev_metadata, 57 | fields=fields, 58 | converters=converters, ) 59 | 60 | def train_dataloader(self): 61 | return DataLoader( 62 | dataset=self.train_dataset, 63 | batch_size=self.hparams.batch_size, 64 | num_workers=self.hparams.num_workers, 65 | pin_memory=self.hparams.pin_memory, 66 | shuffle=True, 67 | collate_fn=self.collate_fn, 68 | ) 69 | 70 | def val_dataloader(self): 71 | return DataLoader( 72 | dataset=self.dev_dataset, 73 | batch_size=self.hparams.batch_size, 74 | num_workers=self.hparams.num_workers, 75 | pin_memory=self.hparams.pin_memory, 76 | shuffle=False, 77 | collate_fn=self.collate_fn, 78 | ) 79 | 80 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/transformer_tts_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import transformer_single_spk_batch_fn 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class TransformerTTSDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_size: int = 64, 17 | num_workers: int = 0, 18 | pin_memory: bool = False, 19 | ): 20 | super().__init__() 21 | self.save_hyperparameters(logger=False) 22 | self.train_dataset: Optional[Dataset] = None 23 | self.dev_dataset: Optional[Dataset] = None 24 | 25 | def setup(self, stage: Optional[str] = None) -> None: 26 | # construct dataset for training and validation 27 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 28 | train_metadata = list(reader) 29 | self.train_dataset = DataTable( 30 | data=train_metadata, 31 | fields=[ 32 | "text", 33 | "text_lengths", 34 | "speech", 35 | "speech_lengths", 36 | ], 37 | converters={ 38 | "speech": np.load, 39 | }, ) 40 | with 
jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 41 | dev_metadata = list(reader) 42 | self.dev_dataset = DataTable( 43 | data=dev_metadata, 44 | fields=[ 45 | "text", 46 | "text_lengths", 47 | "speech", 48 | "speech_lengths", 49 | ], 50 | converters={ 51 | "speech": np.load, 52 | }, ) 53 | 54 | def train_dataloader(self): 55 | return DataLoader( 56 | dataset=self.train_dataset, 57 | batch_size=self.hparams.batch_size, 58 | num_workers=self.hparams.num_workers, 59 | pin_memory=self.hparams.pin_memory, 60 | shuffle=True, 61 | collate_fn=transformer_single_spk_batch_fn, 62 | ) 63 | 64 | def val_dataloader(self): 65 | return DataLoader( 66 | dataset=self.dev_dataset, 67 | batch_size=self.hparams.batch_size, 68 | num_workers=self.hparams.num_workers, 69 | pin_memory=self.hparams.pin_memory, 70 | shuffle=False, 71 | collate_fn=transformer_single_spk_batch_fn, 72 | ) 73 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/vits_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import vits_single_spk_batch_fn 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class VitsDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_size: int = 64, 17 | num_workers: int = 0, 18 | pin_memory: bool = False, 19 | ): 20 | super().__init__() 21 | self.save_hyperparameters(logger=False) 22 | self.train_dataset: Optional[Dataset] = None 23 | self.dev_dataset: Optional[Dataset] = None 24 | 25 | def setup(self, stage: Optional[str] = None) -> None: 26 | # construct dataset for training and validation 27 | fields = ["text", "text_lengths", "feats", "feats_lengths", "wave"] 28 | 29 | converters = { 30 | "wave": np.load, 31 | "feats": np.load, 32 | } 33 | 34 | # construct dataset for training and validation 35 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 36 | train_metadata = list(reader) 37 | self.train_dataset = DataTable( 38 | data=train_metadata, 39 | fields=fields, 40 | converters=converters, ) 41 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 42 | dev_metadata = list(reader) 43 | self.dev_dataset = DataTable( 44 | data=dev_metadata, 45 | fields=fields, 46 | converters=converters, ) 47 | 48 | 49 | def train_dataloader(self): 50 | return DataLoader( 51 | dataset=self.train_dataset, 52 | batch_size=self.hparams.batch_size, 53 | num_workers=self.hparams.num_workers, 54 | pin_memory=self.hparams.pin_memory, 55 | shuffle=True, 56 | collate_fn=vits_single_spk_batch_fn, 57 | ) 58 | 59 | def val_dataloader(self): 60 | return DataLoader( 61 | dataset=self.dev_dataset, 62 | batch_size=self.hparams.batch_size, 63 | num_workers=self.hparams.num_workers, 64 | pin_memory=self.hparams.pin_memory, 65 | shuffle=False, 66 | collate_fn=vits_single_spk_batch_fn, 67 | ) 68 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/wavernn_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from 
pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.vocoder_batch_fn import WaveRNNClip 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class WaveRNNDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_max_steps: int, 17 | n_shift: int, 18 | mode: str, 19 | bits: int, 20 | aux_context_window: Optional[int] = 0, 21 | batch_size: int = 64, 22 | num_workers: int = 0, 23 | pin_memory: bool = False, 24 | ): 25 | super().__init__() 26 | self.save_hyperparameters(logger=False) 27 | self.train_dataset: Optional[Dataset] = None 28 | self.dev_dataset: Optional[Dataset] = None 29 | 30 | def setup(self, stage: Optional[str] = None) -> None: 31 | # construct dataset for training and validation 32 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 33 | train_metadata = list(reader) 34 | self.train_dataset = DataTable( 35 | data=train_metadata, 36 | fields=["wave", "feats"], 37 | converters={ 38 | "wave": np.load, 39 | "feats": np.load, 40 | }, ) 41 | 42 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 43 | dev_metadata = list(reader) 44 | self.dev_dataset = DataTable( 45 | data=dev_metadata, 46 | fields=["wave", "feats"], 47 | converters={ 48 | "wave": np.load, 49 | "feats": np.load, 50 | }, ) 51 | 52 | self.collate_fn = WaveRNNClip( 53 | mode=self.hparams.mode, 54 | aux_context_window=self.hparams.aux_context_window, 55 | hop_size=self.hparams.n_shift, 56 | batch_max_steps=self.hparams.batch_max_steps, 57 | bits=self.hparams.bits) 58 | 59 | def train_dataloader(self): 60 | return DataLoader( 61 | dataset=self.train_dataset, 62 | batch_size=self.hparams.batch_size, 63 | num_workers=self.hparams.num_workers, 64 | pin_memory=self.hparams.pin_memory, 65 | shuffle=True, 66 | collate_fn=self.collate_fn, 67 | ) 68 | 69 | def val_dataloader(self): 70 | return DataLoader( 71 | dataset=self.dev_dataset, 72 | batch_size=self.hparams.batch_size, 73 | num_workers=self.hparams.num_workers, 74 | pin_memory=self.hparams.pin_memory, 75 | shuffle=False, 76 | collate_fn=self.collate_fn, 77 | ) 78 | -------------------------------------------------------------------------------- /deepaudio/tts/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .ljspeech import * 15 | -------------------------------------------------------------------------------- /deepaudio/tts/datasets/ljspeech.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from pathlib import Path 15 | 16 | from torch.utils.data import Dataset 17 | 18 | __all__ = ["LJSpeechMetaData"] 19 | 20 | 21 | class LJSpeechMetaData(Dataset): 22 | def __init__(self, root): 23 | self.root = Path(root).expanduser() 24 | wav_dir = self.root / "wavs" 25 | csv_path = self.root / "metadata.csv" 26 | records = [] 27 | speaker_name = "ljspeech" 28 | with open(str(csv_path), 'rt', encoding='utf-8') as f: 29 | for line in f: 30 | filename, _, normalized_text = line.strip().split("|") 31 | filename = str(wav_dir / (filename + ".wav")) 32 | records.append([filename, normalized_text, speaker_name]) 33 | self.records = records 34 | 35 | def __getitem__(self, i): 36 | return self.records[i] 37 | 38 | def __len__(self): 39 | return len(self.records) 40 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/feats_extract_from_torch/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/abs_feats_extract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsFeatsExtract(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def get_parameters(self) -> Dict[str, Any]: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, input: torch.Tensor, input_lengths: torch.Tensor 19 | ) -> Tuple[torch.Tensor, torch.Tensor]: 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/linear_spectrogram.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from deepaudio.tts.feats_extract_from_torch.stft import Stft 7 | from deepaudio.tts.feats_extract_from_torch.abs_feats_extract import AbsFeatsExtract 8 | 9 | 10 | class LinearSpectrogram(AbsFeatsExtract): 11 | """Linear amplitude spectrogram. 
12 | 13 | Stft -> amplitude-spec 14 | """ 15 | 16 | def __init__( 17 | self, 18 | n_fft: int = 1024, 19 | win_length: int = None, 20 | hop_length: int = 256, 21 | window: Optional[str] = "hann", 22 | center: bool = True, 23 | normalized: bool = False, 24 | onesided: bool = True, 25 | ): 26 | assert check_argument_types() 27 | super().__init__() 28 | self.n_fft = n_fft 29 | self.hop_length = hop_length 30 | self.win_length = win_length 31 | self.window = window 32 | self.stft = Stft( 33 | n_fft=n_fft, 34 | win_length=win_length, 35 | hop_length=hop_length, 36 | window=window, 37 | center=center, 38 | normalized=normalized, 39 | onesided=onesided, 40 | ) 41 | self.n_fft = n_fft 42 | 43 | def output_size(self) -> int: 44 | return self.n_fft // 2 + 1 45 | 46 | def get_parameters(self) -> Dict[str, Any]: 47 | """Return the parameters required by Vocoder.""" 48 | return dict( 49 | n_fft=self.n_fft, 50 | n_shift=self.hop_length, 51 | win_length=self.win_length, 52 | window=self.window, 53 | ) 54 | 55 | def forward( 56 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 57 | ) -> Tuple[torch.Tensor, torch.Tensor]: 58 | # 1. Stft: time -> time-freq 59 | input_stft, feats_lens = self.stft(input, input_lengths) 60 | 61 | assert input_stft.dim() >= 4, input_stft.shape 62 | # "2" refers to the real/imag parts of Complex 63 | assert input_stft.shape[-1] == 2, input_stft.shape 64 | 65 | # STFT -> Power spectrum -> Amp spectrum 66 | # input_stft: (..., F, 2) -> (..., F) 67 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 68 | input_amp = torch.sqrt(torch.clamp(input_power, min=1.0e-10)) 69 | return input_amp, feats_lens 70 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/log_mel.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import librosa 4 | import torch 5 | 6 | from deepaudio.tts.modules.nets_utils import make_pad_mask 7 | 8 | 9 | class LogMel(torch.nn.Module): 10 | """Convert STFT to fbank feats 11 | 12 | The arguments is same as librosa.filters.mel 13 | 14 | Args: 15 | fs: number > 0 [scalar] sampling rate of the incoming signal 16 | n_fft: int > 0 [scalar] number of FFT components 17 | n_mels: int > 0 [scalar] number of Mel bands to generate 18 | fmin: float >= 0 [scalar] lowest frequency (in Hz) 19 | fmax: float >= 0 [scalar] highest frequency (in Hz). 20 | If `None`, use `fmax = fs / 2.0` 21 | htk: use HTK formula instead of Slaney 22 | """ 23 | 24 | def __init__( 25 | self, 26 | fs: int = 16000, 27 | n_fft: int = 512, 28 | n_mels: int = 80, 29 | fmin: float = None, 30 | fmax: float = None, 31 | htk: bool = False, 32 | log_base: float = None, 33 | ): 34 | super().__init__() 35 | 36 | fmin = 0 if fmin is None else fmin 37 | fmax = fs / 2 if fmax is None else fmax 38 | _mel_options = dict( 39 | sr=fs, 40 | n_fft=n_fft, 41 | n_mels=n_mels, 42 | fmin=fmin, 43 | fmax=fmax, 44 | htk=htk, 45 | ) 46 | self.mel_options = _mel_options 47 | self.log_base = log_base 48 | 49 | # Note(kamo): The mel matrix of librosa is different from kaldi. 
50 | melmat = librosa.filters.mel(**_mel_options) 51 | # melmat: (D2, D1) -> (D1, D2) 52 | self.register_buffer("melmat", torch.from_numpy(melmat.T).float()) 53 | 54 | def extra_repr(self): 55 | return ", ".join(f"{k}={v}" for k, v in self.mel_options.items()) 56 | 57 | def forward( 58 | self, 59 | feat: torch.Tensor, 60 | ilens: torch.Tensor = None, 61 | ) -> Tuple[torch.Tensor, torch.Tensor]: 62 | # feat: (B, T, D1) x melmat: (D1, D2) -> mel_feat: (B, T, D2) 63 | mel_feat = torch.matmul(feat, self.melmat) 64 | mel_feat = torch.clamp(mel_feat, min=1e-10) 65 | 66 | if self.log_base is None: 67 | logmel_feat = mel_feat.log() 68 | elif self.log_base == 2.0: 69 | logmel_feat = mel_feat.log2() 70 | elif self.log_base == 10.0: 71 | logmel_feat = mel_feat.log10() 72 | else: 73 | logmel_feat = mel_feat.log() / torch.log(self.log_base) 74 | 75 | # Zero padding 76 | if ilens is not None: 77 | logmel_feat = logmel_feat.masked_fill( 78 | make_pad_mask(ilens, logmel_feat, 1), 0.0 79 | ) 80 | else: 81 | ilens = feat.new_full( 82 | [feat.size(0)], fill_value=feat.size(1), dtype=torch.long 83 | ) 84 | return logmel_feat, ilens 85 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/log_mel_fbank.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple, Union 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from deepaudio.tts.feats_extract_from_torch.log_mel import LogMel 7 | from deepaudio.tts.feats_extract_from_torch.stft import Stft 8 | from deepaudio.tts.feats_extract_from_torch.abs_feats_extract import AbsFeatsExtract 9 | 10 | 11 | 12 | class LogMelFbank(AbsFeatsExtract): 13 | """Conventional frontend structure for TTS. 14 | 15 | Stft -> amplitude-spec -> Log-Mel-Fbank 16 | """ 17 | 18 | def __init__( 19 | self, 20 | fs: int = 16000, 21 | n_fft: int = 1024, 22 | win_length: int = None, 23 | hop_length: int = 256, 24 | window: Optional[str] = "hann", 25 | center: bool = True, 26 | normalized: bool = False, 27 | onesided: bool = True, 28 | n_mels: int = 80, 29 | fmin: Optional[int] = 80, 30 | fmax: Optional[int] = 7600, 31 | htk: bool = False, 32 | log_base: Optional[float] = 10.0, 33 | ): 34 | assert check_argument_types() 35 | super().__init__() 36 | 37 | self.fs = fs 38 | self.n_mels = n_mels 39 | self.n_fft = n_fft 40 | self.hop_length = hop_length 41 | self.win_length = win_length 42 | self.window = window 43 | self.fmin = fmin 44 | self.fmax = fmax 45 | 46 | self.stft = Stft( 47 | n_fft=n_fft, 48 | win_length=win_length, 49 | hop_length=hop_length, 50 | window=window, 51 | center=center, 52 | normalized=normalized, 53 | onesided=onesided, 54 | ) 55 | 56 | self.logmel = LogMel( 57 | fs=fs, 58 | n_fft=n_fft, 59 | n_mels=n_mels, 60 | fmin=fmin, 61 | fmax=fmax, 62 | htk=htk, 63 | log_base=log_base, 64 | ) 65 | 66 | def output_size(self) -> int: 67 | return self.n_mels 68 | 69 | def get_parameters(self) -> Dict[str, Any]: 70 | """Return the parameters required by Vocoder""" 71 | return dict( 72 | fs=self.fs, 73 | n_fft=self.n_fft, 74 | n_shift=self.hop_length, 75 | window=self.window, 76 | n_mels=self.n_mels, 77 | win_length=self.win_length, 78 | fmin=self.fmin, 79 | fmax=self.fmax, 80 | ) 81 | 82 | def forward( 83 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 84 | ) -> Tuple[torch.Tensor, torch.Tensor]: 85 | # 1. Domain-conversion: e.g. 
Stft: time -> time-freq 86 | input_stft, feats_lens = self.stft(input, input_lengths) 87 | 88 | assert input_stft.dim() >= 4, input_stft.shape 89 | # "2" refers to the real/imag parts of Complex 90 | assert input_stft.shape[-1] == 2, input_stft.shape 91 | 92 | # NOTE(kamo): We use different definition for log-spec between TTS and ASR 93 | # TTS: log_10(abs(stft)) 94 | # ASR: log_e(power(stft)) 95 | 96 | # input_stft: (..., F, 2) -> (..., F) 97 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 98 | input_amp = torch.sqrt(torch.clamp(input_power, min=1.0e-10)) 99 | input_feats, _ = self.logmel(input_amp, feats_lens) 100 | return input_feats, feats_lens 101 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/log_spectrogram.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from deepaudio.tts.feats_extract_from_torch.stft import Stft 7 | from deepaudio.tts.feats_extract_from_torch.abs_feats_extract import AbsFeatsExtract 8 | 9 | 10 | class LogSpectrogram(AbsFeatsExtract): 11 | """Conventional frontend structure for ASR 12 | 13 | Stft -> log-amplitude-spec 14 | """ 15 | 16 | def __init__( 17 | self, 18 | n_fft: int = 1024, 19 | win_length: int = None, 20 | hop_length: int = 256, 21 | window: Optional[str] = "hann", 22 | center: bool = True, 23 | normalized: bool = False, 24 | onesided: bool = True, 25 | ): 26 | assert check_argument_types() 27 | super().__init__() 28 | self.n_fft = n_fft 29 | self.hop_length = hop_length 30 | self.win_length = win_length 31 | self.window = window 32 | self.stft = Stft( 33 | n_fft=n_fft, 34 | win_length=win_length, 35 | hop_length=hop_length, 36 | window=window, 37 | center=center, 38 | normalized=normalized, 39 | onesided=onesided, 40 | ) 41 | self.n_fft = n_fft 42 | 43 | def output_size(self) -> int: 44 | return self.n_fft // 2 + 1 45 | 46 | def get_parameters(self) -> Dict[str, Any]: 47 | """Return the parameters required by Vocoder""" 48 | return dict( 49 | n_fft=self.n_fft, 50 | n_shift=self.hop_length, 51 | win_length=self.win_length, 52 | window=self.window, 53 | ) 54 | 55 | def forward( 56 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 57 | ) -> Tuple[torch.Tensor, torch.Tensor]: 58 | # 1. Stft: time -> time-freq 59 | input_stft, feats_lens = self.stft(input, input_lengths) 60 | 61 | assert input_stft.dim() >= 4, input_stft.shape 62 | # "2" refers to the real/imag parts of Complex 63 | assert input_stft.shape[-1] == 2, input_stft.shape 64 | 65 | # NOTE(kamo): We use different definition for log-spec between TTS and ASR 66 | # TTS: log_10(abs(stft)) 67 | # ASR: log_e(power(stft)) 68 | 69 | # STFT -> Power spectrum 70 | # input_stft: (..., F, 2) -> (..., F) 71 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 72 | log_amp = 0.5 * torch.log10(torch.clamp(input_power, min=1.0e-10)) 73 | return log_amp, feats_lens 74 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .generate_lexicon import * 15 | from .normalizer import * 16 | from .phonectic import * 17 | from .punctuation import * 18 | from .tone_sandhi import * 19 | from .vocab import * 20 | from .zh_normalization import * 21 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from paddlespeech.t2s.frontend.normalizer.normalizer import * 15 | from paddlespeech.t2s.frontend.normalizer.numbers import * 16 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/abbrrviation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/acronyms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | import unicodedata 16 | from builtins import str as unicode 17 | 18 | from paddlespeech.t2s.frontend.normalizer.numbers import normalize_numbers 19 | 20 | 21 | def normalize(sentence): 22 | """ Normalize English text. 23 | """ 24 | # preprocessing 25 | sentence = unicode(sentence) 26 | sentence = normalize_numbers(sentence) 27 | sentence = ''.join( 28 | char for char in unicodedata.normalize('NFD', sentence) 29 | if unicodedata.category(char) != 'Mn') # Strip accents 30 | sentence = sentence.lower() 31 | sentence = re.sub(r"[^ a-z'.,?!\-]", "", sentence) 32 | sentence = sentence.replace("i.e.", "that is") 33 | sentence = sentence.replace("e.g.", "for example") 34 | return sentence 35 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/numbers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # number expansion is not that easy 15 | import re 16 | 17 | import inflect 18 | 19 | _inflect = inflect.engine() 20 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 21 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 22 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 23 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 24 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 25 | _number_re = re.compile(r'[0-9]+') 26 | 27 | 28 | def _remove_commas(m): 29 | return m.group(1).replace(',', '') 30 | 31 | 32 | def _expand_decimal_point(m): 33 | return m.group(1).replace('.', ' point ') 34 | 35 | 36 | def _expand_dollars(m): 37 | match = m.group(1) 38 | parts = match.split('.') 39 | if len(parts) > 2: 40 | return match + ' dollars' # Unexpected format 41 | dollars = int(parts[0]) if parts[0] else 0 42 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 43 | if dollars and cents: 44 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 45 | cent_unit = 'cent' if cents == 1 else 'cents' 46 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 47 | elif dollars: 48 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 49 | return '%s %s' % (dollars, dollar_unit) 50 | elif cents: 51 | cent_unit = 'cent' if cents == 1 else 'cents' 52 | return '%s %s' % (cents, cent_unit) 53 | else: 54 | return 'zero dollars' 55 | 56 | 57 | def _expand_ordinal(m): 58 | return _inflect.number_to_words(m.group(0)) 59 | 60 | 61 | def _expand_number(m): 62 | num = int(m.group(0)) 63 | if num > 1000 and num < 3000: 64 | if num == 2000: 65 | return 'two thousand' 66 | elif num > 2000 and num < 2010: 67 | return 'two thousand ' + _inflect.number_to_words(num % 100) 68 | elif num % 100 == 0: 69 | return _inflect.number_to_words(num // 100) + ' hundred' 70 | else: 71 | return _inflect.number_to_words( 72 | num, andword='', zero='oh', group=2).replace(', ', ' ') 73 | else: 74 | return _inflect.number_to_words(num, andword='') 75 | 76 | 77 | def normalize_numbers(text): 78 | """ Normalize numbers in English text. 79 | """ 80 | text = re.sub(_comma_number_re, _remove_commas, text) 81 | text = re.sub(_pounds_re, r'\1 pounds', text) 82 | text = re.sub(_dollars_re, _expand_dollars, text) 83 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 84 | text = re.sub(_ordinal_re, _expand_ordinal, text) 85 | text = re.sub(_number_re, _expand_number, text) 86 | return text 87 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/width.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | def full2half_width(ustr): 17 | half = [] 18 | for u in ustr: 19 | num = ord(u) 20 | if num == 0x3000: # 全角空格变半角 21 | num = 32 22 | elif 0xFF01 <= num <= 0xFF5E: 23 | num -= 0xfee0 24 | u = chr(num) 25 | half.append(u) 26 | return ''.join(half) 27 | 28 | 29 | def half2full_width(ustr): 30 | full = [] 31 | for u in ustr: 32 | num = ord(u) 33 | if num == 32: # 半角空格变全角 34 | num = 0x3000 35 | elif 0x21 <= num <= 0x7E: 36 | num += 0xfee0 37 | u = chr(num) # to unicode 38 | full.append(u) 39 | 40 | return ''.join(full) 41 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/punctuation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | __all__ = ["get_punctuations"] 16 | 17 | EN_PUNCT = [ 18 | " ", 19 | "-", 20 | "...", 21 | ",", 22 | ".", 23 | "?", 24 | "!", 25 | ] 26 | 27 | CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"] 28 | 29 | 30 | def get_punctuations(lang): 31 | if lang == "en": 32 | return EN_PUNCT 33 | elif lang == "cn": 34 | return CN_PUNCT 35 | else: 36 | raise ValueError(f"language {lang} Not supported") 37 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/vocab.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from collections import OrderedDict 15 | from typing import Iterable 16 | 17 | __all__ = ["Vocab"] 18 | 19 | 20 | class Vocab(object): 21 | """ Vocabulary. 22 | 23 | Args: 24 | symbols (Iterable[str]): Common symbols. 25 | padding_symbol (str, optional): Symbol for pad. Defaults to "". 26 | unk_symbol (str, optional): Symbol for unknow. Defaults to "" 27 | start_symbol (str, optional): Symbol for start. Defaults to "" 28 | end_symbol (str, optional): Symbol for end. 
Defaults to "" 29 | """ 30 | 31 | def __init__(self, 32 | symbols: Iterable[str], 33 | padding_symbol="", 34 | unk_symbol="", 35 | start_symbol="", 36 | end_symbol=""): 37 | self.special_symbols = OrderedDict() 38 | for i, item in enumerate( 39 | [padding_symbol, unk_symbol, start_symbol, end_symbol]): 40 | if item: 41 | self.special_symbols[item] = len(self.special_symbols) 42 | 43 | self.padding_symbol = padding_symbol 44 | self.unk_symbol = unk_symbol 45 | self.start_symbol = start_symbol 46 | self.end_symbol = end_symbol 47 | 48 | self.stoi = OrderedDict() 49 | self.stoi.update(self.special_symbols) 50 | 51 | for i, s in enumerate(symbols): 52 | if s not in self.stoi: 53 | self.stoi[s] = len(self.stoi) 54 | self.itos = {v: k for k, v in self.stoi.items()} 55 | 56 | def __len__(self): 57 | return len(self.stoi) 58 | 59 | @property 60 | def num_specials(self): 61 | """ The number of special symbols. 62 | """ 63 | return len(self.special_symbols) 64 | 65 | # special tokens 66 | @property 67 | def padding_index(self): 68 | """ The index of padding symbol 69 | """ 70 | return self.stoi.get(self.padding_symbol, -1) 71 | 72 | @property 73 | def unk_index(self): 74 | """The index of unknow symbol. 75 | """ 76 | return self.stoi.get(self.unk_symbol, -1) 77 | 78 | @property 79 | def start_index(self): 80 | """The index of start symbol. 81 | """ 82 | return self.stoi.get(self.start_symbol, -1) 83 | 84 | @property 85 | def end_index(self): 86 | """ The index of end symbol. 87 | """ 88 | return self.stoi.get(self.end_symbol, -1) 89 | 90 | def __repr__(self): 91 | fmt = "Vocab(size: {},\nstoi:\n{})" 92 | return fmt.format(len(self), self.stoi) 93 | 94 | def __str__(self): 95 | return self.__repr__() 96 | 97 | def lookup(self, symbol): 98 | """ The index that symbol correspond. 99 | """ 100 | return self.stoi[symbol] 101 | 102 | def reverse(self, index): 103 | """ The symbol thar index cottespond. 104 | """ 105 | return self.itos[index] 106 | 107 | def add_symbol(self, symbol): 108 | """ Add a new symbol in vocab. 109 | """ 110 | if symbol in self.stoi: 111 | return 112 | N = len(self.stoi) 113 | self.stoi[symbol] = N 114 | self.itos[N] = symbol 115 | 116 | def add_symbols(self, symbols): 117 | """ Add multiple symbols in vocab. 118 | """ 119 | for symbol in symbols: 120 | self.add_symbol(symbol) 121 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我| 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度| 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from paddlespeech.t2s.frontend.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 
39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | chr(ord(char) + 65248): char 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | 22 | 23 | def replace_temperature(match) -> str: 24 | """ 25 | Args: 26 | match (re.Match) 27 | Returns: 28 | str 29 | """ 30 | sign = match.group(1) 31 | temperature = match.group(2) 32 | unit = match.group(3) 33 | sign: str = "零下" if sign else "" 34 | temperature: str = num2str(temperature) 35 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 36 | result = f"{sign}{temperature}{unit}" 37 | return result 38 | -------------------------------------------------------------------------------- /deepaudio/tts/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | from .base import BasePLModel 5 | 6 | 7 | MODEL_REGISTRY = dict() 8 | MODEL_DATACLASS_REGISTRY = dict() 9 | 10 | 11 | def register_model(name: str, dataclass=None): 12 | r""" 13 | New model types can be added to OpenSpeech with the :func:`register_model` function decorator. 14 | 15 | For example:: 16 | @register_model('conformer_lstm') 17 | class ConformerLSTMModel(OpenspeechModel): 18 | (...) 19 | 20 | .. note:: All models must implement the :class:`cls.__name__` interface. 
21 | 22 | Args: 23 | name (str): the name of the model 24 | """ 25 | 26 | def register_model_cls(cls): 27 | if name in MODEL_REGISTRY: 28 | raise ValueError(f"Cannot register duplicate model ({name})") 29 | if not issubclass(cls, BasePLModel): 30 | raise ValueError(f"Model ({name}: {cls.__name__}) must extend BaseModel") 31 | 32 | MODEL_REGISTRY[name] = cls 33 | 34 | cls.__dataclass = dataclass 35 | if dataclass is not None: 36 | if name in MODEL_DATACLASS_REGISTRY: 37 | raise ValueError(f"Cannot register duplicate model ({name})") 38 | MODEL_DATACLASS_REGISTRY[name] = dataclass 39 | 40 | return cls 41 | 42 | return register_model_cls 43 | 44 | 45 | # automatically import any Python files in the models/ directory 46 | models_dir = os.path.dirname(__file__) 47 | for file in os.listdir(models_dir): 48 | if os.path.isdir(os.path.join(models_dir, file)) and not file.startswith('__'): 49 | for subfile in os.listdir(os.path.join(models_dir, file)): 50 | path = os.path.join(models_dir, file, subfile) 51 | if subfile.endswith(".py"): 52 | python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 53 | module = importlib.import_module(f"deepaudio.tts.models.{file}.{python_file}") 54 | continue 55 | 56 | path = os.path.join(models_dir, file) 57 | if file.endswith(".py"): 58 | model_name = file[: file.find(".py")] if file.endswith(".py") else file 59 | module = importlib.import_module(f"deepaudio.tts.models.{model_name}") -------------------------------------------------------------------------------- /deepaudio/tts/models/fastspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | from .fastspeech2 import * -------------------------------------------------------------------------------- /deepaudio/tts/models/fastspeech2/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, nn 3 | from pytorch_lightning import LightningModule 4 | 5 | from deepaudio.tts.models.fastspeech2 import FastSpeech2 6 | from deepaudio.tts.models.fastspeech2.loss import FastSpeech2Loss 7 | 8 | 9 | class Fastspeech2Model(LightningModule): 10 | def __init__(self, 11 | model: FastSpeech2, 12 | optimizer: torch.optim.Optimizer, 13 | scheduler: torch.optim.lr_scheduler, ): 14 | super(Fastspeech2Model, self).__init__() 15 | 16 | self.save_hyperparameters(logger=False, ignore=["model"]) 17 | self.model = model 18 | self.criterion = FastSpeech2Loss() 19 | 20 | def step(self, batch): 21 | # spk_id!=None in multiple spk fastspeech2 22 | spk_id = batch["spk_id"] if "spk_id" in batch else None 23 | spk_emb = batch["spk_emb"] if "spk_emb" in batch else None 24 | lang_id = batch["lang_id"] if "lang_id" in batch else None 25 | # No explicit speaker identifier labels are used during voice cloning training. 
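# (when an utterance-level speaker embedding is supplied, it already determines the voice,
# so the integer speaker id is dropped to avoid conflicting conditioning)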
26 | if spk_emb is not None: 27 | spk_id = None 28 | 29 | outs = self.model( 30 | text=batch["text"], 31 | text_lengths=batch["text_lengths"], 32 | feats=batch["speech"], 33 | feats_lengths=batch["speech_lengths"], 34 | durations=batch["durations"], 35 | pitch=batch["pitch"], 36 | energy=batch["energy"], 37 | sids=spk_id, 38 | spembs=spk_emb, 39 | lids=lang_id, 40 | ) 41 | return outs 42 | 43 | def training_step(self, batch: dict, batch_idx: int): 44 | before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.step(batch) 45 | l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( 46 | after_outs=after_outs, 47 | before_outs=before_outs, 48 | d_outs=d_outs, 49 | p_outs=p_outs, 50 | e_outs=e_outs, 51 | ys=ys, 52 | ds=batch["durations"], 53 | ps=batch["pitch"], 54 | es=batch["energy"], 55 | ilens=batch["text_lengths"], 56 | olens=olens) 57 | 58 | loss = l1_loss + duration_loss + pitch_loss + energy_loss 59 | return {'loss': loss} 60 | 61 | def validation_step(self, batch: dict, batch_idx: int): 62 | before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.step(batch) 63 | l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( 64 | after_outs=after_outs, 65 | before_outs=before_outs, 66 | d_outs=d_outs, 67 | p_outs=p_outs, 68 | e_outs=e_outs, 69 | ys=ys, 70 | ds=batch["durations"], 71 | ps=batch["pitch"], 72 | es=batch["energy"], 73 | ilens=batch["text_lengths"], 74 | olens=olens) 75 | 76 | loss = l1_loss + duration_loss + pitch_loss + energy_loss 77 | self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=True) 78 | return {'val_loss': loss} 79 | 80 | def configure_optimizers(self): 81 | optimizer = self.hparams.optimizer(params=self.parameters()) 82 | scheduler = self.hparams.scheduler(optimizer=optimizer) 83 | 84 | return { 85 | "optimizer": optimizer, 86 | "lr_scheduler": { 87 | "scheduler": scheduler, 88 | "monitor": "val/loss", 89 | "interval": "epoch", 90 | "frequency": 1, 91 | }, 92 | } 93 | -------------------------------------------------------------------------------- /deepaudio/tts/models/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .hifigan import * -------------------------------------------------------------------------------- /deepaudio/tts/models/hifigan/residual_block.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """HiFiGAN Residual block modules. 5 | 6 | This code is modified from https://github.com/kan-bayashi/ParallelWaveGAN. 7 | 8 | """ 9 | 10 | from typing import Any, Dict, List 11 | 12 | import torch 13 | 14 | 15 | class ResidualBlock(torch.nn.Module): 16 | """Residual block module in HiFiGAN.""" 17 | 18 | def __init__( 19 | self, 20 | kernel_size: int = 3, 21 | channels: int = 512, 22 | dilations: List[int] = [1, 3, 5], 23 | bias: bool = True, 24 | use_additional_convs: bool = True, 25 | nonlinear_activation: str = "LeakyReLU", 26 | nonlinear_activation_params: Dict[str, Any] = {"negative_slope": 0.1}, 27 | ): 28 | """Initialize ResidualBlock module. 29 | 30 | Args: 31 | kernel_size (int): Kernel size of dilation convolution layer. 32 | channels (int): Number of channels for convolution layer. 33 | dilations (List[int]): List of dilation factors. 34 | use_additional_convs (bool): Whether to use additional convolution layers. 35 | bias (bool): Whether to add bias parameter in convolution layers. 
36 | nonlinear_activation (str): Activation function module name. 37 | nonlinear_activation_params (Dict[str, Any]): Hyperparameters for activation 38 | function. 39 | 40 | """ 41 | super().__init__() 42 | self.use_additional_convs = use_additional_convs 43 | self.convs1 = torch.nn.ModuleList() 44 | if use_additional_convs: 45 | self.convs2 = torch.nn.ModuleList() 46 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 47 | for dilation in dilations: 48 | self.convs1 += [ 49 | torch.nn.Sequential( 50 | getattr(torch.nn, nonlinear_activation)( 51 | **nonlinear_activation_params 52 | ), 53 | torch.nn.Conv1d( 54 | channels, 55 | channels, 56 | kernel_size, 57 | 1, 58 | dilation=dilation, 59 | bias=bias, 60 | padding=(kernel_size - 1) // 2 * dilation, 61 | ), 62 | ) 63 | ] 64 | if use_additional_convs: 65 | self.convs2 += [ 66 | torch.nn.Sequential( 67 | getattr(torch.nn, nonlinear_activation)( 68 | **nonlinear_activation_params 69 | ), 70 | torch.nn.Conv1d( 71 | channels, 72 | channels, 73 | kernel_size, 74 | 1, 75 | dilation=1, 76 | bias=bias, 77 | padding=(kernel_size - 1) // 2, 78 | ), 79 | ) 80 | ] 81 | 82 | def forward(self, x: torch.Tensor) -> torch.Tensor: 83 | """Calculate forward propagation. 84 | 85 | Args: 86 | x (Tensor): Input tensor (B, channels, T). 87 | 88 | Returns: 89 | Tensor: Output tensor (B, channels, T). 90 | 91 | """ 92 | for idx in range(len(self.convs1)): 93 | xt = self.convs1[idx](x) 94 | if self.use_additional_convs: 95 | xt = self.convs2[idx](xt) 96 | x = xt + x 97 | return x 98 | -------------------------------------------------------------------------------- /deepaudio/tts/models/melgan/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import * -------------------------------------------------------------------------------- /deepaudio/tts/models/melgan/residual_stack.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Residual stack module in MelGAN. 5 | 6 | This code is modified from https://github.com/kan-bayashi/ParallelWaveGAN. 7 | 8 | """ 9 | 10 | from typing import Any, Dict 11 | 12 | import torch 13 | 14 | 15 | class ResidualStack(torch.nn.Module): 16 | """Residual stack module introduced in MelGAN.""" 17 | 18 | def __init__( 19 | self, 20 | kernel_size: int = 3, 21 | channels: int = 32, 22 | dilation: int = 1, 23 | bias: bool = True, 24 | nonlinear_activation: str = "LeakyReLU", 25 | nonlinear_activation_params: Dict[str, Any] = {"negative_slope": 0.2}, 26 | pad: str = "ReflectionPad1d", 27 | pad_params: Dict[str, Any] = {}, 28 | ): 29 | """Initialize ResidualStack module. 30 | 31 | Args: 32 | kernel_size (int): Kernel size of dilation convolution layer. 33 | channels (int): Number of channels of convolution layers. 34 | dilation (int): Dilation factor. 35 | bias (bool): Whether to add bias parameter in convolution layers. 36 | nonlinear_activation (str): Activation function module name. 37 | nonlinear_activation_params (Dict[str, Any]): Hyperparameters for 38 | activation function. 39 | pad (str): Padding function module name before dilated convolution layer. 40 | pad_params (Dict[str, Any]): Hyperparameters for padding function. 41 | 42 | """ 43 | super().__init__() 44 | 45 | # defile residual stack part 46 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 
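# the reflection padding of (kernel_size - 1) // 2 * dilation samples on each side below
# keeps the dilated convolution length-preserving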
47 | self.stack = torch.nn.Sequential( 48 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 49 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 50 | torch.nn.Conv1d( 51 | channels, channels, kernel_size, dilation=dilation, bias=bias 52 | ), 53 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 54 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 55 | ) 56 | 57 | # defile extra layer for skip connection 58 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 59 | 60 | def forward(self, c: torch.Tensor) -> torch.Tensor: 61 | """Calculate forward propagation. 62 | 63 | Args: 64 | c (Tensor): Input tensor (B, channels, T). 65 | 66 | Returns: 67 | Tensor: Output tensor (B, chennels, T). 68 | 69 | """ 70 | return self.stack(c) + self.skip_layer(c) 71 | -------------------------------------------------------------------------------- /deepaudio/tts/models/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- 1 | from .parallel_wavegan import * -------------------------------------------------------------------------------- /deepaudio/tts/models/parallel_wavegan/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | import torch 3 | from torch import Tensor, nn 4 | 5 | from pytorch_lightning import LightningModule 6 | from deepaudio.tts.models.parallel_wavegan import ParallelWaveGANDiscriminator 7 | from deepaudio.tts.models.parallel_wavegan import ParallelWaveGANGenerator 8 | 9 | 10 | from deepaudio.tts.modules.losses import MultiResolutionSTFTLoss 11 | 12 | 13 | class ParallelWaveGANModel(LightningModule): 14 | def __init__(self, 15 | generator: ParallelWaveGANGenerator, 16 | discriminator: ParallelWaveGANDiscriminator, 17 | criterion_stft: MultiResolutionSTFTLoss, 18 | lambda_aux: float, 19 | lambda_adv: float, 20 | optimizer_d: torch.optim.Optimizer, 21 | scheduler_d: torch.optim.lr_scheduler, 22 | optimizer_g: torch.optim.Optimizer, 23 | scheduler_g: torch.optim.lr_scheduler, 24 | discriminator_train_start_steps: int = 100000, 25 | ): 26 | super(ParallelWaveGANModel, self).__init__() 27 | 28 | self.generator = generator 29 | self.discriminator = discriminator 30 | self.criterion_stft = criterion_stft 31 | self.criterion_mse = torch.nn.MSELoss() 32 | self.save_hyperparameters(logger=False, ignore=["generator", 33 | "discriminator", 34 | "criterion_stft"]) 35 | 36 | def step_generator(self, wav, mel, batch_idx): 37 | losses_dict = {} 38 | noise = torch.randn(wav.shape).to(device=wav.device, dtype=wav.dtype) 39 | wav_ = self.generator(mel, noise) 40 | 41 | # initialize 42 | gen_loss = 0.0 43 | aux_loss = 0.0 44 | 45 | # multi-resolution stft loss 46 | sc_loss, mag_loss = self.criterion_stft(wav_, wav) 47 | aux_loss += sc_loss + mag_loss 48 | 49 | gen_loss += aux_loss * self.hparams.lambda_aux 50 | 51 | losses_dict["spectral_convergence_loss"] = sc_loss 52 | losses_dict["log_stft_magnitude_loss"] = mag_loss 53 | 54 | # adversarial loss 55 | if batch_idx > self.hparams.discriminator_train_start_steps: 56 | p_ = self.discriminator(wav_) 57 | adv_loss = self.criterion_mse(p_, torch.ones_like(p_)) 58 | losses_dict["adversarial_loss"] = adv_loss 59 | gen_loss += self.hparams.lambda_adv * adv_loss 60 | losses_dict["generator_loss"] = gen_loss 61 | self.log_dict(losses_dict) 62 | return gen_loss 63 | 64 | def step_disctiminator(self, wav, mel): 65 | losses_dict = {} 66 | with 
torch.no_grad(): 67 | noise = torch.randn(wav.shape) 68 | wav_ = self.generator(mel, noise) 69 | p = self.discriminator(wav) 70 | p_ = self.discriminator(wav_.detach()) 71 | real_loss = self.criterion_mse(p, torch.ones_like(p)) 72 | fake_loss = self.criterion_mse(p_, torch.zeros_like(p_)) 73 | dis_loss = real_loss + fake_loss 74 | 75 | losses_dict["real_loss"] = real_loss 76 | losses_dict["fake_loss"] = fake_loss 77 | losses_dict["discriminator_loss"] = dis_loss 78 | self.log_dict(losses_dict) 79 | return dis_loss 80 | 81 | def training_step(self, batch: tuple, batch_idx: int, optimizer_idx: int): 82 | opt_g, opt_d = self.optimizers() 83 | sch_g, sch_d = self.lr_schedulers() 84 | # parse batch 85 | wav, mel = batch 86 | 87 | # Generator 88 | gen_loss = self.step_generator(wav, mel, batch_idx) 89 | opt_g.zero_grad() 90 | self.manual_backward(gen_loss) 91 | opt_g.step() 92 | sch_g.step() 93 | 94 | # Disctiminator 95 | if batch_idx > self.hparams.discriminator_train_start_steps: 96 | # re-compute wav_ which leads better quality 97 | dis_loss = self.step_disctiminator(wav, mel) 98 | opt_d.zero_grad() 99 | self.manual_backward(dis_loss) 100 | opt_d.step() 101 | sch_d.step() 102 | 103 | 104 | def configure_optimizers(self): 105 | optimizer_g = self.hparams.optimizer_g(params=self.generator.parameters()) 106 | optimizer_d = self.hparams.optimizer_d(params=self.discriminator.parameters()) 107 | scheduler_g = self.hparams.scheduler_g(optimizer=optimizer_g) 108 | scheduler_d = self.hparams.scheduler_d(optimizer=optimizer_d) 109 | 110 | return [optimizer_g, optimizer_d], [scheduler_g, scheduler_d] 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /deepaudio/tts/models/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron2 import * -------------------------------------------------------------------------------- /deepaudio/tts/models/tacotron2/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, nn 3 | 4 | from pytorch_lightning import LightningModule 5 | from deepaudio.tts.models.tacotron2.tacotron2 import Tacotron2 6 | from deepaudio.tts.models.tacotron2.loss import Tacotron2Loss 7 | from deepaudio.tts.models.tacotron2.loss import GuidedAttentionLoss 8 | 9 | 10 | class Tacotron2Model(LightningModule): 11 | def __init__(self, 12 | model: Tacotron2, 13 | loss_type: str, 14 | taco2_loss: Tacotron2Loss, 15 | use_guided_attn_loss: bool, 16 | attn_loss: GuidedAttentionLoss, 17 | optimizer: torch.optim.Optimizer, 18 | scheduler: torch.optim.lr_scheduler 19 | ): 20 | super(Tacotron2Model, self).__init__() 21 | 22 | self.model = model 23 | self.taco2_loss = taco2_loss 24 | self.save_hyperparameters(logger=False, ignore=["model", 25 | "taco2_loss", 26 | "attn_loss"]) 27 | if self.hparams.use_guided_attn_loss: 28 | self.attn_loss = attn_loss 29 | 30 | def compute_loss(self, batch): 31 | losses_dict = {} 32 | # spk_id!=None in multiple spk fastspeech2 33 | spk_id = batch["spk_id"] if "spk_id" in batch else None 34 | spk_emb = batch["spk_emb"] if "spk_emb" in batch else None 35 | if spk_emb is not None: 36 | spk_id = None 37 | 38 | after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( 39 | text=batch["text"], 40 | text_lengths=batch["text_lengths"], 41 | feats=batch["speech"], 42 | feats_lengths=batch["speech_lengths"], 43 | spk_id=spk_id, 44 | spk_emb=spk_emb) 45 | 46 | # 
calculate taco2 loss 47 | l1_loss, mse_loss, bce_loss = self.taco2_loss( 48 | after_outs=after_outs, 49 | before_outs=before_outs, 50 | logits=logits, 51 | ys=ys, 52 | labels=labels, 53 | olens=olens) 54 | 55 | if self.hparams.loss_type == "L1+L2": 56 | loss = l1_loss + mse_loss + bce_loss 57 | elif self.hparams.loss_type == "L1": 58 | loss = l1_loss + bce_loss 59 | elif self.hparams.loss_type == "L2": 60 | loss = mse_loss + bce_loss 61 | else: 62 | raise ValueError(f"unknown --loss-type {self.loss_type}") 63 | 64 | # calculate attention loss 65 | if self.hparams.use_guided_attn_loss: 66 | # NOTE: length of output for auto-regressive 67 | # input will be changed when r > 1 68 | attn_loss = self.attn_loss( 69 | att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) 70 | losses_dict["attn_loss"] = attn_loss 71 | loss = loss + attn_loss 72 | 73 | losses_dict["l1_loss"] = l1_loss 74 | losses_dict["mse_loss"] = mse_loss 75 | losses_dict["bce_loss"] = bce_loss 76 | losses_dict["loss"] = loss 77 | return losses_dict 78 | 79 | def training_step(self, batch: dict, batch_idx: int): 80 | losses_dict = self.compute_loss(batch) 81 | self.log_dict(losses_dict) 82 | return losses_dict 83 | 84 | def validation_step(self, batch: dict, batch_idx: int): 85 | losses_dict = self.compute_loss(batch) 86 | loss = losses_dict.pop('loss') 87 | losses_dict['val_loss'] = loss 88 | self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=True) 89 | return loss 90 | 91 | def configure_optimizers(self): 92 | optimizer = self.hparams.optimizer(params=self.parameters()) 93 | scheduler = self.hparams.scheduler(optimizer=optimizer) 94 | 95 | return { 96 | "optimizer": optimizer, 97 | "lr_scheduler": { 98 | "scheduler": scheduler, 99 | "monitor": "val/loss", 100 | "interval": "epoch", 101 | "frequency": 1, 102 | }, 103 | } 104 | -------------------------------------------------------------------------------- /deepaudio/tts/models/transformer_tts/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import * -------------------------------------------------------------------------------- /deepaudio/tts/models/transformer_tts/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """TTS-Transformer related modules.""" 5 | 6 | import torch 7 | 8 | from deepaudio.tts.models.tacotron2.loss import GuidedAttentionLoss 9 | from deepaudio.tts.models.tacotron2.loss import ( 10 | Tacotron2Loss as TransformerLoss, 11 | ) 12 | 13 | 14 | class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): 15 | """Guided attention loss function module for multi head attention. 16 | 17 | Args: 18 | sigma (float, optional): Standard deviation to control 19 | how close attention to a diagonal. 20 | alpha (float, optional): Scaling coefficient (lambda). 21 | reset_always (bool, optional): Whether to always reset masks. 22 | 23 | """ 24 | 25 | def forward(self, att_ws, ilens, olens): 26 | """Calculate forward propagation. 27 | 28 | Args: 29 | att_ws (Tensor): 30 | Batch of multi head attention weights (B, H, T_max_out, T_max_in). 31 | ilens (LongTensor): Batch of input lengths (B,). 32 | olens (LongTensor): Batch of output lengths (B,). 33 | 34 | Returns: 35 | Tensor: Guided attention loss value. 
36 | 37 | """ 38 | if self.guided_attn_masks is None: 39 | self.guided_attn_masks = ( 40 | self._make_guided_attention_masks(ilens, olens) 41 | .to(att_ws.device) 42 | .unsqueeze(1) 43 | ) 44 | if self.masks is None: 45 | self.masks = self._make_masks(ilens, olens).to(att_ws.device).unsqueeze(1) 46 | losses = self.guided_attn_masks * att_ws 47 | loss = torch.mean(losses.masked_select(self.masks)) 48 | if self.reset_always: 49 | self._reset_masks() 50 | 51 | return self.alpha * loss 52 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/__init__.py: -------------------------------------------------------------------------------- 1 | from .vits import * 2 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """VITS-related loss modules. 5 | 6 | This code is based on https://github.com/jaywalnut310/vits. 7 | 8 | """ 9 | 10 | import torch 11 | 12 | 13 | class KLDivergenceLoss(torch.nn.Module): 14 | """KL divergence loss.""" 15 | 16 | def forward( 17 | self, 18 | z_p: torch.Tensor, 19 | logs_q: torch.Tensor, 20 | m_p: torch.Tensor, 21 | logs_p: torch.Tensor, 22 | z_mask: torch.Tensor, 23 | ) -> torch.Tensor: 24 | """Calculate KL divergence loss. 25 | 26 | Args: 27 | z_p (Tensor): Flow hidden representation (B, H, T_feats). 28 | logs_q (Tensor): Posterior encoder projected scale (B, H, T_feats). 29 | m_p (Tensor): Expanded text encoder projected mean (B, H, T_feats). 30 | logs_p (Tensor): Expanded text encoder projected scale (B, H, T_feats). 31 | z_mask (Tensor): Mask tensor (B, 1, T_feats). 32 | 33 | Returns: 34 | Tensor: KL divergence loss. 35 | 36 | """ 37 | z_p = z_p.float() 38 | logs_q = logs_q.float() 39 | m_p = m_p.float() 40 | logs_p = logs_p.float() 41 | z_mask = z_mask.float() 42 | kl = logs_p - logs_q - 0.5 43 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 44 | kl = torch.sum(kl * z_mask) 45 | loss = kl / torch.sum(z_mask) 46 | 47 | return loss 48 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Maximum path calculation module. 15 | 16 | This code is based on https://github.com/jaywalnut310/vits. 17 | 18 | """ 19 | import warnings 20 | 21 | import numpy as np 22 | import paddle 23 | from numba import njit 24 | from numba import prange 25 | 26 | try: 27 | from .core import maximum_path_c 28 | 29 | is_cython_avalable = True 30 | except ImportError: 31 | is_cython_avalable = False 32 | warnings.warn( 33 | "Cython version is not available. 
Fallback to 'EXPERIMETAL' numba version. " 34 | "If you want to use the cython version, please build it as follows: " 35 | "`cd paddlespeech/t2s/models/vits/monotonic_align; python setup.py build_ext --inplace`" 36 | ) 37 | 38 | 39 | def maximum_path(neg_x_ent: paddle.Tensor, 40 | attn_mask: paddle.Tensor) -> paddle.Tensor: 41 | """Calculate maximum path. 42 | 43 | Args: 44 | neg_x_ent (Tensor): Negative X entropy tensor (B, T_feats, T_text). 45 | attn_mask (Tensor): Attention mask (B, T_feats, T_text). 46 | 47 | Returns: 48 | Tensor: Maximum path tensor (B, T_feats, T_text). 49 | 50 | """ 51 | dtype = neg_x_ent.dtype 52 | neg_x_ent = neg_x_ent.numpy().astype(np.float32) 53 | path = np.zeros(neg_x_ent.shape, dtype=np.int32) 54 | t_t_max = attn_mask.sum(1)[:, 0].cpu().numpy().astype(np.int32) 55 | t_s_max = attn_mask.sum(2)[:, 0].cpu().numpy().astype(np.int32) 56 | if is_cython_avalable: 57 | maximum_path_c(path, neg_x_ent, t_t_max, t_s_max) 58 | else: 59 | maximum_path_numba(path, neg_x_ent, t_t_max, t_s_max) 60 | 61 | return paddle.cast(paddle.to_tensor(path), dtype=dtype) 62 | 63 | 64 | @njit 65 | def maximum_path_each_numba(path, value, t_y, t_x, max_neg_val=-np.inf): 66 | """Calculate a single maximum path with numba.""" 67 | index = t_x - 1 68 | for y in range(t_y): 69 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 70 | if x == y: 71 | v_cur = max_neg_val 72 | else: 73 | v_cur = value[y - 1, x] 74 | if x == 0: 75 | if y == 0: 76 | v_prev = 0.0 77 | else: 78 | v_prev = max_neg_val 79 | else: 80 | v_prev = value[y - 1, x - 1] 81 | value[y, x] += max(v_prev, v_cur) 82 | 83 | for y in range(t_y - 1, -1, -1): 84 | path[y, index] = 1 85 | if index != 0 and (index == y or 86 | value[y - 1, index] < value[y - 1, index - 1]): 87 | index = index - 1 88 | 89 | 90 | @njit(parallel=True) 91 | def maximum_path_numba(paths, values, t_ys, t_xs): 92 | """Calculate batch maximum path with numba.""" 93 | for i in prange(paths.shape[0]): 94 | maximum_path_each_numba(paths[i], values[i], t_ys[i], t_xs[i]) 95 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Maximum path calculation module with cython optimization. 15 | 16 | This code is copied from https://github.com/jaywalnut310/vits and modifed code format. 
17 | 18 | """ 19 | 20 | cimport cython 21 | 22 | from cython.parallel import prange 23 | 24 | 25 | @cython.boundscheck(False) 26 | @cython.wraparound(False) 27 | cdef void maximum_path_each(int[:, ::1] path, float[:, ::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: 28 | cdef int x 29 | cdef int y 30 | cdef float v_prev 31 | cdef float v_cur 32 | cdef float tmp 33 | cdef int index = t_x - 1 34 | 35 | for y in range(t_y): 36 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 37 | if x == y: 38 | v_cur = max_neg_val 39 | else: 40 | v_cur = value[y - 1, x] 41 | if x == 0: 42 | if y == 0: 43 | v_prev = 0.0 44 | else: 45 | v_prev = max_neg_val 46 | else: 47 | v_prev = value[y - 1, x - 1] 48 | value[y, x] += max(v_prev, v_cur) 49 | 50 | for y in range(t_y - 1, -1, -1): 51 | path[y, index] = 1 52 | if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): 53 | index = index - 1 54 | 55 | 56 | @cython.boundscheck(False) 57 | @cython.wraparound(False) 58 | cpdef void maximum_path_c(int[:, :, ::1] paths, float[:, :, ::1] values, int[::1] t_ys, int[::1] t_xs) nogil: 59 | cdef int b = paths.shape[0] 60 | cdef int i 61 | for i in prange(b, nogil=True): 62 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) 63 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Setup cython code.""" 15 | from Cython.Build import cythonize 16 | from setuptools import Extension 17 | from setuptools import setup 18 | from setuptools.command.build_ext import build_ext as _build_ext 19 | 20 | 21 | class build_ext(_build_ext): 22 | """Overwrite build_ext.""" 23 | 24 | def finalize_options(self): 25 | """Prevent numpy from thinking it is still in its setup process.""" 26 | _build_ext.finalize_options(self) 27 | __builtins__.__NUMPY_SETUP__ = False 28 | import numpy 29 | 30 | self.include_dirs.append(numpy.get_include()) 31 | 32 | 33 | exts = [Extension( 34 | name="core", 35 | sources=["core.pyx"], )] 36 | setup( 37 | name="monotonic_align", 38 | ext_modules=cythonize(exts, language_level=3), 39 | cmdclass={"build_ext": build_ext}, ) 40 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/posterior_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Posterior encoder module in VITS. 5 | 6 | This code is based on https://github.com/jaywalnut310/vits. 
7 | 8 | """ 9 | 10 | from typing import Optional, Tuple 11 | 12 | import torch 13 | 14 | from deepaudio.tts.models.vits.wavenet.wavenet import WaveNet 15 | from deepaudio.tts.models.vits.wavenet.residual_block import Conv1d 16 | from deepaudio.tts.modules.nets_utils import make_non_pad_mask 17 | 18 | 19 | class PosteriorEncoder(torch.nn.Module): 20 | """Posterior encoder module in VITS. 21 | 22 | This is a module of posterior encoder described in `Conditional Variational 23 | Autoencoder with Adversarial Learning for End-to-End Text-to-Speech`_. 24 | 25 | .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End 26 | Text-to-Speech`: https://arxiv.org/abs/2006.04558 27 | """ 28 | 29 | def __init__( 30 | self, 31 | in_channels: int = 513, 32 | out_channels: int = 192, 33 | hidden_channels: int = 192, 34 | kernel_size: int = 5, 35 | layers: int = 16, 36 | stacks: int = 1, 37 | base_dilation: int = 1, 38 | global_channels: int = -1, 39 | dropout_rate: float = 0.0, 40 | bias: bool = True, 41 | use_weight_norm: bool = True, 42 | ): 43 | """Initilialize PosteriorEncoder module. 44 | 45 | Args: 46 | in_channels (int): Number of input channels. 47 | out_channels (int): Number of output channels. 48 | hidden_channels (int): Number of hidden channels. 49 | kernel_size (int): Kernel size in WaveNet. 50 | layers (int): Number of layers of WaveNet. 51 | stacks (int): Number of repeat stacking of WaveNet. 52 | base_dilation (int): Base dilation factor. 53 | global_channels (int): Number of global conditioning channels. 54 | dropout_rate (float): Dropout rate. 55 | bias (bool): Whether to use bias parameters in conv. 56 | use_weight_norm (bool): Whether to apply weight norm. 57 | 58 | """ 59 | super().__init__() 60 | 61 | # define modules 62 | self.input_conv = Conv1d(in_channels, hidden_channels, 1) 63 | self.encoder = WaveNet( 64 | in_channels=-1, 65 | out_channels=-1, 66 | kernel_size=kernel_size, 67 | layers=layers, 68 | stacks=stacks, 69 | base_dilation=base_dilation, 70 | residual_channels=hidden_channels, 71 | aux_channels=-1, 72 | gate_channels=hidden_channels * 2, 73 | skip_channels=hidden_channels, 74 | global_channels=global_channels, 75 | dropout_rate=dropout_rate, 76 | bias=bias, 77 | use_weight_norm=use_weight_norm, 78 | use_first_conv=False, 79 | use_last_conv=False, 80 | scale_residual=False, 81 | scale_skip_connect=True, 82 | ) 83 | self.proj = Conv1d(hidden_channels, out_channels * 2, 1) 84 | 85 | def forward( 86 | self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None 87 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 88 | """Calculate forward propagation. 89 | 90 | Args: 91 | x (Tensor): Input tensor (B, in_channels, T_feats). 92 | x_lengths (Tensor): Length tensor (B,). 93 | g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). 94 | 95 | Returns: 96 | Tensor: Encoded hidden representation tensor (B, out_channels, T_feats). 97 | Tensor: Projected mean tensor (B, out_channels, T_feats). 98 | Tensor: Projected scale tensor (B, out_channels, T_feats). 99 | Tensor: Mask tensor for input tensor (B, 1, T_feats). 
100 | 101 | """ 102 | x_mask = ( 103 | make_non_pad_mask(x_lengths) 104 | .unsqueeze(1) 105 | .to( 106 | dtype=x.dtype, 107 | device=x.device, 108 | ) 109 | ) 110 | x = self.input_conv(x) * x_mask 111 | x = self.encoder(x, x_mask, g=g) 112 | stats = self.proj(x) * x_mask 113 | m, logs = stats.split(stats.size(1) // 2, dim=1) 114 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 115 | 116 | return z, m, logs, x_mask 117 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/wavenet/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/models/wavernn/__init__.py: -------------------------------------------------------------------------------- 1 | from .wavernn import * 2 | from .wavernn import * -------------------------------------------------------------------------------- /deepaudio/tts/models/wavernn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, nn 3 | 4 | from deepaudio.tts.models.wavernn import WaveRNN 5 | from deepaudio.tts.modules.losses import discretized_mix_logistic_loss 6 | 7 | 8 | 9 | class WaveRNNModel(BasePLModel): 10 | def __init__(self, 11 | model: WaveRNN, 12 | mode: str, 13 | optimizer: torch.optim.Optimizer, 14 | scheduler: torch.optim.lr_scheduler 15 | ): 16 | super(WaveRNNModel, self).__init__() 17 | 18 | self.model = model 19 | self.save_hyperparameters(logger=False, ignore=["model"]) 20 | if self.hparams.mode == 'RAW': 21 | self.criterion = nn.CrossEntropyLoss() 22 | elif self.hparams.mode == 'MOL': 23 | self.criterion = discretized_mix_logistic_loss() 24 | else: 25 | self.criterion = None 26 | RuntimeError('Unknown model mode value - ', self.configs.model.mode) 27 | 28 | def compute_loss(self, batch): 29 | wav, y, mel = batch 30 | y_hat = self.model(wav, mel) 31 | if self.hparams.mode == 'RAW': 32 | y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1) 33 | elif self.hparams.mode == 'MOL': 34 | y_hat = y_hat.type(torch.float32) 35 | 36 | y = y.unsqueeze(-1) 37 | loss = self.criterion(y_hat, y) 38 | return loss 39 | 40 | def training_step(self, batch: tuple, batch_idx: int): 41 | loss = self.compute_loss(batch) 42 | return { 43 | 'loss': loss 44 | } 45 | 46 | def validation_step(self, batch: tuple, batch_idx: int): 47 | loss = self.compute_loss(batch) 48 | return { 49 | 'val_loss': loss 50 | } 51 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .conv import * 15 | from .geometry import * 16 | from .losses import * 17 | from .positional_encoding import * 18 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | 17 | def get_activation(act, **kwargs): 18 | """Return activation function.""" 19 | 20 | activation_funcs = { 21 | "hardtanh": torch.nn.Hardtanh, 22 | "tanh": torch.nn.Tanh, 23 | "relu": torch.nn.ReLU, 24 | "selu": torch.nn.SELU, 25 | "leakyrelu": torch.nn.LeakyReLU, 26 | "swish": torch.nn.Swish, 27 | "glu": torch.nn.GLU 28 | } 29 | 30 | return activation_funcs[act](**kwargs) 31 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/causal_conv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Causal convolusion layer modules.""" 15 | import torch 16 | from torch import nn 17 | 18 | 19 | class CausalConv1D(nn.Module): 20 | """CausalConv1D module with customized initialization.""" 21 | 22 | def __init__( 23 | self, 24 | in_channels, 25 | out_channels, 26 | kernel_size, 27 | dilation=1, 28 | bias=True, 29 | pad="Pad1D", 30 | pad_params={"value": 0.0}, ): 31 | """Initialize CausalConv1d module.""" 32 | super().__init__() 33 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, 34 | **pad_params) 35 | self.conv = nn.Conv1d( 36 | in_channels, 37 | out_channels, 38 | kernel_size, 39 | dilation=dilation, 40 | bias=bias) 41 | 42 | def forward(self, x): 43 | """Calculate forward propagation. 44 | Args: 45 | x (Tensor): Input tensor (B, in_channels, T). 46 | Returns: 47 | Tensor: Output tensor (B, out_channels, T). 48 | """ 49 | return self.conv(self.pad(x))[:, :, :x.shape[2]] 50 | 51 | 52 | class CausalConv1DTranspose(nn.Module): 53 | """CausalConv1DTranspose module with customized initialization.""" 54 | 55 | def __init__(self, 56 | in_channels, 57 | out_channels, 58 | kernel_size, 59 | stride, 60 | bias=True): 61 | """Initialize CausalConvTranspose1d module.""" 62 | super().__init__() 63 | self.deconv = nn.ConvTranspose1d( 64 | in_channels, out_channels, kernel_size, stride, bias=bias) 65 | self.stride = stride 66 | 67 | def forward(self, x): 68 | """Calculate forward propagation. 69 | Args: 70 | x (Tensor): Input tensor (B, in_channels, T_in). 71 | Returns: 72 | Tensor: Output tensor (B, out_channels, T_out). 73 | """ 74 | return self.deconv(x)[:, :, :-self.stride] 75 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/conformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/conformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | 8 | """ConvolutionModule definition.""" 9 | 10 | from torch import nn 11 | 12 | 13 | class ConvolutionModule(nn.Module): 14 | """ConvolutionModule in Conformer model. 15 | 16 | Args: 17 | channels (int): The number of channels of conv layers. 18 | kernel_size (int): Kernerl size of conv layers. 
19 | 20 | """ 21 | 22 | def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): 23 | """Construct an ConvolutionModule object.""" 24 | super(ConvolutionModule, self).__init__() 25 | # kernerl_size should be a odd number for 'SAME' padding 26 | assert (kernel_size - 1) % 2 == 0 27 | 28 | self.pointwise_conv1 = nn.Conv1d( 29 | channels, 30 | 2 * channels, 31 | kernel_size=1, 32 | stride=1, 33 | padding=0, 34 | bias=bias, 35 | ) 36 | self.depthwise_conv = nn.Conv1d( 37 | channels, 38 | channels, 39 | kernel_size, 40 | stride=1, 41 | padding=(kernel_size - 1) // 2, 42 | groups=channels, 43 | bias=bias, 44 | ) 45 | self.norm = nn.BatchNorm1d(channels) 46 | self.pointwise_conv2 = nn.Conv1d( 47 | channels, 48 | channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0, 52 | bias=bias, 53 | ) 54 | self.activation = activation 55 | 56 | def forward(self, x): 57 | """Compute convolution module. 58 | 59 | Args: 60 | x (torch.Tensor): Input tensor (#batch, time, channels). 61 | 62 | Returns: 63 | torch.Tensor: Output tensor (#batch, time, channels). 64 | 65 | """ 66 | # exchange the temporal dimension and the feature dimension 67 | x = x.transpose(1, 2) 68 | 69 | # GLU mechanism 70 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 71 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 72 | 73 | # 1D Depthwise Conv 74 | x = self.depthwise_conv(x) 75 | x = self.activation(self.norm(x)) 76 | 77 | x = self.pointwise_conv2(x) 78 | 79 | return x.transpose(1, 2) 80 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/conformer/swish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | 8 | """Swish() activation function for Conformer.""" 9 | 10 | import torch 11 | 12 | 13 | class Swish(torch.nn.Module): 14 | """Construct an Swish object.""" 15 | 16 | def forward(self, x): 17 | """Return Swich activation function.""" 18 | return x * torch.sigmoid(x) -------------------------------------------------------------------------------- /deepaudio/tts/modules/geometry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import numpy as np 15 | import torch 16 | 17 | 18 | def shuffle_dim(x, axis, perm=None): 19 | """Permute input tensor along aixs given the permutation or randomly. 20 | 21 | Args: 22 | x (Tensor): The input tensor. 23 | axis (int): The axis to shuffle. 24 | perm (List[int], ndarray, optional): 25 | The order to reorder the tensor along the ``axis``-th dimension. 
26 | It is a permutation of ``[0, d)``, where d is the size of the 27 | ``axis``-th dimension of the input tensor. If not provided, 28 | a random permutation is used. Defaults to None. 29 | 30 | Returns: 31 | Tensor: The shuffled tensor, which has the same shape as x does. 32 | """ 33 | size = x.shape[axis] 34 | if perm is not None and len(perm) != size: 35 | raise ValueError("length of permutation should equal the input " 36 | "tensor's axis-th dimension's size") 37 | if perm is not None: 38 | perm = np.array(perm) 39 | else: 40 | perm = np.random.permutation(size) 41 | 42 | perm = torch.as_tensor(perm, device=x.device) 43 | out = torch.index_select(x, axis, perm) 44 | return out 45 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Layer normalization module.""" 15 | import torch 16 | 17 | 18 | class LayerNorm(torch.nn.LayerNorm): 19 | """Layer normalization module. 20 | 21 | Args: 22 | nout (int): Output dim size. 23 | dim (int): Dimension to be normalized. 24 | 25 | """ 26 | 27 | def __init__(self, nout, dim=-1): 28 | """Construct a LayerNorm object.""" 29 | super(LayerNorm, self).__init__(nout, eps=1e-12) 30 | self.dim = dim 31 | 32 | def forward(self, x): 33 | """Apply layer normalization. 34 | 35 | Args: 36 | x (torch.Tensor): Input tensor. 37 | 38 | Returns: 39 | torch.Tensor: Normalized tensor. 40 | 41 | """ 42 | if self.dim == -1: 43 | return super(LayerNorm, self).forward(x) 44 | return ( 45 | super(LayerNorm, self) 46 | .forward(x.transpose(self.dim, -1)) 47 | .transpose(self.dim, -1) 48 | ) 49 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/masked_fill.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | from typing import Union 15 | 16 | import torch 17 | 18 | 19 | def is_broadcastable(shp1, shp2): 20 | for a, b in zip(shp1[::-1], shp2[::-1]): 21 | if a == 1 or b == 1 or a == b: 22 | pass 23 | else: 24 | return False 25 | return True 26 | 27 | 28 | # assume that len(shp1) == len(shp2) 29 | def broadcast_shape(shp1, shp2): 30 | result = [] 31 | for a, b in zip(shp1[::-1], shp2[::-1]): 32 | result.append(max(a, b)) 33 | return result[::-1] 34 | 35 | 36 | def masked_fill(xs: torch.Tensor, 37 | mask: torch.Tensor, 38 | value: Union[float, int]): 39 | # comment following line for converting dygraph to static graph. 40 | # assert is_broadcastable(xs.shape, mask.shape) is True 41 | # bshape = paddle.broadcast_shape(xs.shape, mask.shape) 42 | bshape = broadcast_shape(xs.shape, mask.shape) 43 | mask.stop_gradient = True 44 | mask = mask.broadcast_to(bshape) 45 | 46 | trues = torch.ones_like(xs) * value 47 | mask = mask.type(dtype=torch.bool) 48 | xs = torch.where(mask, trues, xs) 49 | return xs 50 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | from torch import nn 16 | 17 | 18 | class ZScore(nn.Module): 19 | # feature last 20 | def __init__(self, mu, sigma): 21 | super().__init__() 22 | self.register_buffer("mu", mu) 23 | self.register_buffer("sigma", sigma) 24 | 25 | def forward(self, x): 26 | # NOTE: to be compatible with torch's to_static, we must explicitly 27 | # call multiply, or add, etc, instead of +-*/, etc. 28 | return torch.divide(torch.subtract(x, self.mu), self.sigma) 29 | 30 | def inverse(self, x): 31 | # NOTE: to be compatible with torch's to_static, we must explicitly 32 | # call multiply, or add, etc, instead of +-*/, etc. 33 | return torch.add(torch.multiply(x, self.sigma), self.mu) 34 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/positional_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
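An illustrative sketch of the two helpers above: masked_fill, the broadcasting helper that fills masked positions with a value (note it expects the mask to have the same number of dimensions as the input), and the ZScore normalizer. The shapes are made up for the example:

import torch

from deepaudio.tts.modules.masked_fill import masked_fill
from deepaudio.tts.modules.normalizer import ZScore

# mask out padded frames in a (B, 1, T) score tensor
scores = torch.randn(2, 1, 4)
padding = torch.tensor([[[False, False, True, True]],
                        [[False, True, True, True]]])
masked = masked_fill(scores, padding, float("-inf"))

# feature-last z-score normalization of mel spectrograms (B, T, n_mels)
mu, sigma = torch.zeros(80), torch.ones(80)
zscore = ZScore(mu, sigma)
mel = torch.randn(2, 100, 80)
assert torch.allclose(zscore.inverse(zscore(mel)), mel, atol=1e-6)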
14 | import numpy as np 15 | import torch 16 | from torch import Tensor 17 | 18 | __all__ = ["sinusoid_position_encoding", "scaled_position_encoding"] 19 | 20 | 21 | def sinusoid_position_encoding(num_positions: int, 22 | feature_size: int, 23 | omega: float=1.0, 24 | start_pos: int=0, 25 | dtype=None) -> torch.Tensor: 26 | # return tensor shape (num_positions, feature_size) 27 | # NOTE: to be compatible with paddle's to_static, we cannnot raise 28 | # an exception here, take care of it by yourself 29 | # if (feature_size % 2 != 0): 30 | # raise ValueError("size should be divisible by 2") 31 | dtype = dtype or torch.get_default_dtype() 32 | 33 | channel = torch.arange(0, feature_size, 2, dtype=dtype) 34 | index = torch.arange(start_pos, start_pos + num_positions, 1, dtype=dtype) 35 | denominator = channel / float(feature_size) 36 | denominator = torch.from_numpy(np.array([10000.0]).astype(np.float32))**denominator 37 | p = (torch.unsqueeze(index, -1) * omega) / denominator 38 | encodings = torch.zeros([num_positions, feature_size], dtype=dtype) 39 | encodings[:, 0::2] = torch.sin(p) 40 | encodings[:, 1::2] = torch.cos(p) 41 | return encodings 42 | 43 | 44 | def scaled_position_encoding(num_positions: int, 45 | feature_size: int, 46 | omega: Tensor, 47 | start_pos: int=0, 48 | dtype=None) -> Tensor: 49 | # omega: Tensor (batch_size, ) 50 | # return tensor shape (batch_size, num_positions, feature_size) 51 | # consider renaming this as batched positioning encoding 52 | if (feature_size % 2 != 0): 53 | raise ValueError("size should be divisible by 2") 54 | dtype = dtype or torch.get_default_dtype() 55 | 56 | channel = torch.arange(0, feature_size, 2, dtype=dtype) 57 | index = torch.arange( 58 | start_pos, start_pos + num_positions, 1, dtype=omega.dtype) 59 | batch_size = omega.shape[0] 60 | omega = torch.unsqueeze(omega, 1) 61 | omega = torch.unsqueeze(omega, 2) 62 | p = (torch.unsqueeze(index, -1) * 63 | omega) / (10000.0**(channel / float(feature_size))) 64 | encodings = torch.zeros( 65 | [batch_size, num_positions, feature_size], dtype=dtype) 66 | # it is nice to have fancy indexing and inplace operations 67 | encodings[:, :, 0::2] = torch.sin(p) 68 | encodings[:, :, 1::2] = torch.cos(p) 69 | return encodings 70 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
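For reference, a small sketch of how the two position encodings above can be called; feature_size must be even, and the sizes shown are illustrative:

import torch

from deepaudio.tts.modules.positional_encoding import (
    scaled_position_encoding, sinusoid_position_encoding)

pe = sinusoid_position_encoding(num_positions=50, feature_size=256)
print(pe.shape)             # torch.Size([50, 256])

# one omega per batch element gives a batch of scaled encodings
omega = torch.tensor([1.0, 0.8])
pe_batched = scaled_position_encoding(num_positions=50, feature_size=256, omega=omega)
print(pe_batched.shape)     # torch.Size([2, 50, 256])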
14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/duration_calculator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Tomoki Hayashi 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Duration calculator related modules.""" 8 | 9 | import torch 10 | 11 | from deepaudio.tts.models.tacotron2.tacotron2 import Tacotron2 12 | from deepaudio.tts.models.transformer_tts import Transformer 13 | from deepaudio.tts.modules.nets_utils import pad_list 14 | 15 | 16 | class DurationCalculator(torch.nn.Module): 17 | """Duration calculator module for FastSpeech. 18 | 19 | Todo: 20 | * Fix the duplicated calculation of diagonal head decision 21 | 22 | """ 23 | 24 | def __init__(self, teacher_model): 25 | """Initialize duration calculator module. 26 | 27 | Args: 28 | teacher_model (e2e_tts_transformer.Transformer): 29 | Pretrained auto-regressive Transformer. 30 | 31 | """ 32 | super(DurationCalculator, self).__init__() 33 | if isinstance(teacher_model, Transformer): 34 | self.register_buffer("diag_head_idx", torch.tensor(-1)) 35 | elif isinstance(teacher_model, Tacotron2): 36 | pass 37 | else: 38 | raise ValueError( 39 | "teacher model should be the instance of " 40 | "e2e_tts_transformer.Transformer or e2e_tts_tacotron2.Tacotron2." 41 | ) 42 | self.teacher_model = teacher_model 43 | 44 | def forward(self, xs, ilens, ys, olens, spembs=None): 45 | """Calculate forward propagation. 46 | 47 | Args: 48 | xs (Tensor): Batch of the padded sequences of character ids (B, Tmax). 49 | ilens (Tensor): Batch of lengths of each input sequence (B,). 50 | ys (Tensor): 51 | Batch of the padded sequence of target features (B, Lmax, odim). 52 | olens (Tensor): Batch of lengths of each output sequence (B,). 53 | spembs (Tensor, optional): 54 | Batch of speaker embedding vectors (B, spk_embed_dim). 55 | 56 | Returns: 57 | Tensor: Batch of durations (B, Tmax). 58 | 59 | """ 60 | if isinstance(self.teacher_model, Transformer): 61 | att_ws = self._calculate_encoder_decoder_attentions( 62 | xs, ilens, ys, olens, spembs=spembs 63 | ) 64 | # TODO(kan-bayashi): fix this issue 65 | # this does not work in multi-gpu case. registered buffer is not saved. 
66 | if int(self.diag_head_idx) == -1: 67 | self._init_diagonal_head(att_ws) 68 | att_ws = att_ws[:, self.diag_head_idx] 69 | else: 70 | # NOTE(kan-bayashi): Here we assume that the teacher is tacotron 2 71 | att_ws = self.teacher_model.calculate_all_attentions( 72 | xs, ilens, ys, spembs=spembs, keep_tensor=True 73 | ) 74 | durations = [ 75 | self._calculate_duration(att_w, ilen, olen) 76 | for att_w, ilen, olen in zip(att_ws, ilens, olens) 77 | ] 78 | 79 | return pad_list(durations, 0) 80 | 81 | @staticmethod 82 | def _calculate_duration(att_w, ilen, olen): 83 | return torch.stack( 84 | [att_w[:olen, :ilen].argmax(-1).eq(i).sum() for i in range(ilen)] 85 | ) 86 | 87 | def _init_diagonal_head(self, att_ws): 88 | diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1).mean(dim=0) # (H * L,) 89 | self.register_buffer("diag_head_idx", diagonal_scores.argmax()) 90 | 91 | def _calculate_encoder_decoder_attentions(self, xs, ilens, ys, olens, spembs=None): 92 | att_dict = self.teacher_model.calculate_all_attentions( 93 | xs, ilens, ys, olens, spembs=spembs, skip_output=True, keep_tensor=True 94 | ) 95 | return torch.cat( 96 | [att_dict[k] for k in att_dict.keys() if "src_attn" in k], dim=1 97 | ) # (B, H*L, Lmax, Tmax) 98 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/length_regulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Tomoki Hayashi 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Length regulator related modules.""" 8 | 9 | import logging 10 | 11 | import torch 12 | 13 | from deepaudio.tts.modules.nets_utils import pad_list 14 | 15 | 16 | class LengthRegulator(torch.nn.Module): 17 | """Length regulator module for feed-forward Transformer. 18 | 19 | This is a module of length regulator described in 20 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 21 | The length regulator expands char or 22 | phoneme-level embedding features to frame-level by repeating each 23 | feature based on the corresponding predicted durations. 24 | 25 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 26 | https://arxiv.org/pdf/1905.09263.pdf 27 | 28 | """ 29 | 30 | def __init__(self, pad_value=0.0): 31 | """Initilize length regulator module. 32 | 33 | Args: 34 | pad_value (float, optional): Value used for padding. 35 | 36 | """ 37 | super().__init__() 38 | self.pad_value = pad_value 39 | 40 | def forward(self, xs, ds, alpha=1.0): 41 | """Calculate forward propagation. 42 | 43 | Args: 44 | xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). 45 | ds (LongTensor): Batch of durations of each frame (B, T). 46 | alpha (float, optional): Alpha value to control speed of speech. 47 | 48 | Returns: 49 | Tensor: replicated input tensor based on durations (B, T*, D). 50 | 51 | """ 52 | if alpha != 1.0: 53 | assert alpha > 0 54 | ds = torch.round(ds.float() * alpha).long() 55 | 56 | if ds.sum() == 0: 57 | logging.warning( 58 | "predicted durations includes all 0 sequences. " 59 | "fill the first element with 1." 60 | ) 61 | # NOTE(kan-bayashi): This case must not be happened in teacher forcing. 62 | # It will be happened in inference with a bad duration predictor. 63 | # So we do not need to care the padded sequence case here. 
64 | ds[ds.sum(dim=1).eq(0)] = 1 65 | 66 | repeat = [torch.repeat_interleave(x, d, dim=0) for x, d in zip(xs, ds)] 67 | return pad_list(repeat, self.pad_value) 68 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/variance_predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | """Variance predictor related modules.""" 7 | 8 | import torch 9 | from typeguard import check_argument_types 10 | 11 | from deepaudio.tts.modules.layer_norm import LayerNorm 12 | 13 | 14 | class VariancePredictor(torch.nn.Module): 15 | """Variance predictor module. 16 | 17 | This is a module of variacne predictor described in `FastSpeech 2: 18 | Fast and High-Quality End-to-End Text to Speech`_. 19 | 20 | .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: 21 | https://arxiv.org/abs/2006.04558 22 | 23 | """ 24 | 25 | def __init__( 26 | self, 27 | idim: int, 28 | n_layers: int = 2, 29 | n_chans: int = 384, 30 | kernel_size: int = 3, 31 | bias: bool = True, 32 | dropout_rate: float = 0.5, 33 | ): 34 | """Initilize duration predictor module. 35 | 36 | Args: 37 | idim (int): Input dimension. 38 | n_layers (int): Number of convolutional layers. 39 | n_chans (int): Number of channels of convolutional layers. 40 | kernel_size (int): Kernel size of convolutional layers. 41 | dropout_rate (float): Dropout rate. 42 | 43 | """ 44 | assert check_argument_types() 45 | super().__init__() 46 | self.conv = torch.nn.ModuleList() 47 | for idx in range(n_layers): 48 | in_chans = idim if idx == 0 else n_chans 49 | self.conv += [ 50 | torch.nn.Sequential( 51 | torch.nn.Conv1d( 52 | in_chans, 53 | n_chans, 54 | kernel_size, 55 | stride=1, 56 | padding=(kernel_size - 1) // 2, 57 | bias=bias, 58 | ), 59 | torch.nn.ReLU(), 60 | LayerNorm(n_chans, dim=1), 61 | torch.nn.Dropout(dropout_rate), 62 | ) 63 | ] 64 | self.linear = torch.nn.Linear(n_chans, 1) 65 | 66 | def forward(self, xs: torch.Tensor, x_masks: torch.Tensor = None) -> torch.Tensor: 67 | """Calculate forward propagation. 68 | 69 | Args: 70 | xs (Tensor): Batch of input sequences (B, Tmax, idim). 71 | x_masks (ByteTensor): Batch of masks indicating padded part (B, Tmax). 72 | 73 | Returns: 74 | Tensor: Batch of predicted sequences (B, Tmax, 1). 75 | 76 | """ 77 | xs = xs.transpose(1, -1) # (B, idim, Tmax) 78 | for f in self.conv: 79 | xs = f(xs) # (B, C, Tmax) 80 | 81 | xs = self.linear(xs.transpose(1, 2)) # (B, Tmax, 1) 82 | 83 | if x_masks is not None: 84 | xs = xs.masked_fill(x_masks, 0.0) 85 | 86 | return xs 87 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/residual_stack.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Modified from espnet(https://github.com/espnet/espnet) 15 | """Residual stack module in MelGAN.""" 16 | from typing import Any 17 | from typing import Dict 18 | 19 | from torch import nn 20 | 21 | from deepaudio.tts.modules.activation import get_activation 22 | from deepaudio.tts.modules.causal_conv import CausalConv1D 23 | 24 | 25 | class ResidualStack(nn.Module): 26 | """Residual stack module introduced in MelGAN.""" 27 | 28 | def __init__( 29 | self, 30 | kernel_size: int=3, 31 | channels: int=32, 32 | dilation: int=1, 33 | bias: bool=True, 34 | nonlinear_activation: str="leakyrelu", 35 | nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, 36 | pad: str="ReflectionPad1d", 37 | pad_params: Dict[str, Any]={}, 38 | use_causal_conv: bool=False, ): 39 | """Initialize ResidualStack module. 40 | 41 | Args: 42 | kernel_size (int): Kernel size of dilation convolution layer. 43 | channels (int): Number of channels of convolution layers. 44 | dilation (int): Dilation factor. 45 | bias (bool): Whether to add bias parameter in convolution layers. 46 | nonlinear_activation (str): Activation function module name. 47 | nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function. 48 | pad (str): Name of the torch.nn padding layer used before the dilated convolution layer. 49 | pad_params (Dict[str, Any]): Hyperparameters for padding function. 50 | use_causal_conv (bool): Whether to use causal convolution. 51 | """ 52 | super().__init__() 53 | # for compatibility 54 | if nonlinear_activation: 55 | nonlinear_activation = nonlinear_activation.lower() 56 | 57 | # define residual stack part 58 | if not use_causal_conv: 59 | assert (kernel_size - 1 60 | ) % 2 == 0, "Even number kernel size is not supported." 61 | self.stack = nn.Sequential( 62 | get_activation(nonlinear_activation, 63 | **nonlinear_activation_params), 64 | getattr(nn, pad)((kernel_size - 1) // 2 * dilation, 65 | **pad_params), 66 | nn.Conv1d( 67 | channels, 68 | channels, 69 | kernel_size, 70 | dilation=dilation, 71 | bias=bias), 72 | get_activation(nonlinear_activation, 73 | **nonlinear_activation_params), 74 | nn.Conv1d(channels, channels, 1, bias=bias), ) 75 | else: 76 | self.stack = nn.Sequential( 77 | get_activation(nonlinear_activation, 78 | **nonlinear_activation_params), 79 | CausalConv1D( 80 | channels, 81 | channels, 82 | kernel_size, 83 | dilation=dilation, 84 | bias=bias, 85 | pad=pad, 86 | pad_params=pad_params, ), 87 | get_activation(nonlinear_activation, 88 | **nonlinear_activation_params), 89 | nn.Conv1d(channels, channels, 1, bias=bias), ) 90 | 91 | # define extra layer for skip connection 92 | self.skip_layer = nn.Conv1d(channels, channels, 1, bias=bias) 93 | 94 | def forward(self, c): 95 | """Calculate forward propagation. 96 | 97 | Args: 98 | c (Tensor): Input tensor (B, channels, T). 99 | Returns: 100 | Tensor: Output tensor (B, channels, T). 101 | """ 102 | return self.stack(c) + self.skip_layer(c) 103 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/encoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Encoder self-attention layer definition.""" 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .layer_norm import LayerNorm 13 | 14 | 15 | class EncoderLayer(nn.Module): 16 | """Encoder layer module. 17 | 18 | Args: 19 | size (int): Input dimension. 20 | self_attn (torch.nn.Module): Self-attention module instance. 21 | `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance 22 | can be used as the argument. 23 | feed_forward (torch.nn.Module): Feed-forward module instance. 24 | `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance 25 | can be used as the argument. 26 | dropout_rate (float): Dropout rate. 27 | normalize_before (bool): Whether to use layer_norm before the first block. 28 | concat_after (bool): Whether to concat attention layer's input and output. 29 | if True, additional linear will be applied. 30 | i.e. x -> x + linear(concat(x, att(x))) 31 | if False, no additional linear will be applied. i.e. x -> x + att(x) 32 | stochastic_depth_rate (float): Proability to skip this layer. 33 | During training, the layer may skip residual computation and return input 34 | as-is with given probability. 
35 | """ 36 | 37 | def __init__( 38 | self, 39 | size, 40 | self_attn, 41 | feed_forward, 42 | dropout_rate, 43 | normalize_before=True, 44 | concat_after=False, 45 | stochastic_depth_rate=0.0, 46 | ): 47 | """Construct an EncoderLayer object.""" 48 | super(EncoderLayer, self).__init__() 49 | self.self_attn = self_attn 50 | self.feed_forward = feed_forward 51 | self.norm1 = LayerNorm(size) 52 | self.norm2 = LayerNorm(size) 53 | self.dropout = nn.Dropout(dropout_rate) 54 | self.size = size 55 | self.normalize_before = normalize_before 56 | self.concat_after = concat_after 57 | if self.concat_after: 58 | self.concat_linear = nn.Linear(size + size, size) 59 | self.stochastic_depth_rate = stochastic_depth_rate 60 | 61 | def forward(self, x, mask, cache=None): 62 | """Compute encoded features. 63 | 64 | Args: 65 | x_input (torch.Tensor): Input tensor (#batch, time, size). 66 | mask (torch.Tensor): Mask tensor for the input (#batch, time). 67 | cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). 68 | 69 | Returns: 70 | torch.Tensor: Output tensor (#batch, time, size). 71 | torch.Tensor: Mask tensor (#batch, time). 72 | 73 | """ 74 | skip_layer = False 75 | # with stochastic depth, residual connection `x + f(x)` becomes 76 | # `x <- x + 1 / (1 - p) * f(x)` at training time. 77 | stoch_layer_coeff = 1.0 78 | if self.training and self.stochastic_depth_rate > 0: 79 | skip_layer = torch.rand(1).item() < self.stochastic_depth_rate 80 | stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) 81 | 82 | if skip_layer: 83 | if cache is not None: 84 | x = torch.cat([cache, x], dim=1) 85 | return x, mask 86 | 87 | residual = x 88 | if self.normalize_before: 89 | x = self.norm1(x) 90 | 91 | if cache is None: 92 | x_q = x 93 | else: 94 | assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) 95 | x_q = x[:, -1:, :] 96 | residual = residual[:, -1:, :] 97 | mask = None if mask is None else mask[:, -1:, :] 98 | 99 | if self.concat_after: 100 | x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1) 101 | x = residual + stoch_layer_coeff * self.concat_linear(x_concat) 102 | else: 103 | x = residual + stoch_layer_coeff * self.dropout( 104 | self.self_attn(x_q, x, x, mask) 105 | ) 106 | if not self.normalize_before: 107 | x = self.norm1(x) 108 | 109 | residual = x 110 | if self.normalize_before: 111 | x = self.norm2(x) 112 | x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) 113 | if not self.normalize_before: 114 | x = self.norm2(x) 115 | 116 | if cache is not None: 117 | x = torch.cat([cache, x], dim=1) 118 | 119 | return x, mask 120 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/layer_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Layer normalization module.""" 8 | 9 | import torch 10 | 11 | 12 | class LayerNorm(torch.nn.LayerNorm): 13 | """Layer normalization module. 14 | 15 | Args: 16 | nout (int): Output dim size. 17 | dim (int): Dimension to be normalized. 18 | 19 | """ 20 | 21 | def __init__(self, nout, dim=-1): 22 | """Construct an LayerNorm object.""" 23 | super(LayerNorm, self).__init__(nout, eps=1e-12) 24 | self.dim = dim 25 | 26 | def forward(self, x): 27 | """Apply layer normalization. 28 | 29 | Args: 30 | x (torch.Tensor): Input tensor. 
31 | 32 | Returns: 33 | torch.Tensor: Normalized tensor. 34 | 35 | """ 36 | if self.dim == -1: 37 | return super(LayerNorm, self).forward(x) 38 | return ( 39 | super(LayerNorm, self) 40 | .forward(x.transpose(self.dim, -1)) 41 | .transpose(self.dim, -1) 42 | ) 43 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/lightconv.py: -------------------------------------------------------------------------------- 1 | """Lightweight Convolution Module.""" 2 | 3 | import numpy 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | 8 | MIN_VALUE = float(numpy.finfo(numpy.float32).min) 9 | 10 | 11 | class LightweightConvolution(nn.Module): 12 | """Lightweight Convolution layer. 13 | 14 | This implementation is based on 15 | https://github.com/pytorch/fairseq/tree/master/fairseq 16 | 17 | Args: 18 | wshare (int): the number of kernel of convolution 19 | n_feat (int): the number of features 20 | dropout_rate (float): dropout_rate 21 | kernel_size (int): kernel size (length) 22 | use_kernel_mask (bool): Use causal mask or not for convolution kernel 23 | use_bias (bool): Use bias term or not. 24 | 25 | """ 26 | 27 | def __init__( 28 | self, 29 | wshare, 30 | n_feat, 31 | dropout_rate, 32 | kernel_size, 33 | use_kernel_mask=False, 34 | use_bias=False, 35 | ): 36 | """Construct Lightweight Convolution layer.""" 37 | super(LightweightConvolution, self).__init__() 38 | 39 | assert n_feat % wshare == 0 40 | self.wshare = wshare 41 | self.use_kernel_mask = use_kernel_mask 42 | self.dropout_rate = dropout_rate 43 | self.kernel_size = kernel_size 44 | self.padding_size = int(kernel_size / 2) 45 | 46 | # linear -> GLU -> lightconv -> linear 47 | self.linear1 = nn.Linear(n_feat, n_feat * 2) 48 | self.linear2 = nn.Linear(n_feat, n_feat) 49 | self.act = nn.GLU() 50 | 51 | # lightconv related 52 | self.weight = nn.Parameter( 53 | torch.Tensor(self.wshare, 1, kernel_size).uniform_(0, 1) 54 | ) 55 | self.use_bias = use_bias 56 | if self.use_bias: 57 | self.bias = nn.Parameter(torch.Tensor(n_feat)) 58 | 59 | # mask of kernel 60 | kernel_mask0 = torch.zeros(self.wshare, int(kernel_size / 2)) 61 | kernel_mask1 = torch.ones(self.wshare, int(kernel_size / 2 + 1)) 62 | self.kernel_mask = torch.cat((kernel_mask1, kernel_mask0), dim=-1).unsqueeze(1) 63 | 64 | def forward(self, query, key, value, mask): 65 | """Forward of 'Lightweight Convolution'. 66 | 67 | This function takes query, key and value but uses only query. 
68 | This is just for compatibility with self-attention layer (attention.py) 69 | 70 | Args: 71 | query (torch.Tensor): (batch, time1, d_model) input tensor 72 | key (torch.Tensor): (batch, time2, d_model) NOT USED 73 | value (torch.Tensor): (batch, time2, d_model) NOT USED 74 | mask (torch.Tensor): (batch, time1, time2) mask 75 | 76 | Return: 77 | x (torch.Tensor): (batch, time1, d_model) output 78 | 79 | """ 80 | # linear -> GLU -> lightconv -> linear 81 | x = query 82 | B, T, C = x.size() 83 | H = self.wshare 84 | 85 | # first liner layer 86 | x = self.linear1(x) 87 | 88 | # GLU activation 89 | x = self.act(x) 90 | 91 | # lightconv 92 | x = x.transpose(1, 2).contiguous().view(-1, H, T) # B x C x T 93 | weight = F.dropout(self.weight, self.dropout_rate, training=self.training) 94 | if self.use_kernel_mask: 95 | self.kernel_mask = self.kernel_mask.to(x.device) 96 | weight = weight.masked_fill(self.kernel_mask == 0.0, float("-inf")) 97 | weight = F.softmax(weight, dim=-1) 98 | x = F.conv1d(x, weight, padding=self.padding_size, groups=self.wshare).view( 99 | B, C, T 100 | ) 101 | if self.use_bias: 102 | x = x + self.bias.view(1, -1, 1) 103 | x = x.transpose(1, 2) # B x T x C 104 | 105 | if mask is not None and not self.use_kernel_mask: 106 | mask = mask.transpose(-1, -2) 107 | x = x.masked_fill(mask == 0, 0.0) 108 | 109 | # second linear layer 110 | x = self.linear2(x) 111 | return x 112 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/mask.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Mask module.""" 5 | 6 | import torch 7 | 8 | 9 | def subsequent_mask(size, device="cpu", dtype=torch.bool): 10 | """Create mask for subsequent steps (size, size). 11 | 12 | :param int size: size of mask 13 | :param str device: "cpu" or "cuda" or torch.Tensor.device 14 | :param torch.dtype dtype: result dtype 15 | :rtype: torch.Tensor 16 | >>> subsequent_mask(3) 17 | [[1, 0, 0], 18 | [1, 1, 0], 19 | [1, 1, 1]] 20 | """ 21 | ret = torch.ones(size, size, device=device, dtype=dtype) 22 | return torch.tril(ret, out=ret) 23 | 24 | 25 | def target_mask(ys_in_pad, ignore_id): 26 | """Create mask for decoder self-attention. 27 | 28 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 29 | :param int ignore_id: index of padding 30 | :param torch.dtype dtype: result dtype 31 | :rtype: torch.Tensor (B, Lmax, Lmax) 32 | """ 33 | ys_mask = ys_in_pad != ignore_id 34 | m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0) 35 | return ys_mask.unsqueeze(-2) & m 36 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/multi_layer_conv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Tomoki Hayashi 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" 8 | 9 | import torch 10 | 11 | 12 | class MultiLayeredConv1d(torch.nn.Module): 13 | """Multi-layered conv1d for Transformer block. 14 | 15 | This is a module of multi-leyered conv1d designed 16 | to replace positionwise feed-forward network 17 | in Transforner block, which is introduced in 18 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 
19 | 20 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 21 | https://arxiv.org/pdf/1905.09263.pdf 22 | 23 | """ 24 | 25 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 26 | """Initialize MultiLayeredConv1d module. 27 | 28 | Args: 29 | in_chans (int): Number of input channels. 30 | hidden_chans (int): Number of hidden channels. 31 | kernel_size (int): Kernel size of conv1d. 32 | dropout_rate (float): Dropout rate. 33 | 34 | """ 35 | super(MultiLayeredConv1d, self).__init__() 36 | self.w_1 = torch.nn.Conv1d( 37 | in_chans, 38 | hidden_chans, 39 | kernel_size, 40 | stride=1, 41 | padding=(kernel_size - 1) // 2, 42 | ) 43 | self.w_2 = torch.nn.Conv1d( 44 | hidden_chans, 45 | in_chans, 46 | kernel_size, 47 | stride=1, 48 | padding=(kernel_size - 1) // 2, 49 | ) 50 | self.dropout = torch.nn.Dropout(dropout_rate) 51 | 52 | def forward(self, x): 53 | """Calculate forward propagation. 54 | 55 | Args: 56 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 57 | 58 | Returns: 59 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 60 | 61 | """ 62 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 63 | return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1) 64 | 65 | 66 | class Conv1dLinear(torch.nn.Module): 67 | """Conv1D + Linear for Transformer block. 68 | 69 | A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. 70 | 71 | """ 72 | 73 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 74 | """Initialize Conv1dLinear module. 75 | 76 | Args: 77 | in_chans (int): Number of input channels. 78 | hidden_chans (int): Number of hidden channels. 79 | kernel_size (int): Kernel size of conv1d. 80 | dropout_rate (float): Dropout rate. 81 | 82 | """ 83 | super(Conv1dLinear, self).__init__() 84 | self.w_1 = torch.nn.Conv1d( 85 | in_chans, 86 | hidden_chans, 87 | kernel_size, 88 | stride=1, 89 | padding=(kernel_size - 1) // 2, 90 | ) 91 | self.w_2 = torch.nn.Linear(hidden_chans, in_chans) 92 | self.dropout = torch.nn.Dropout(dropout_rate) 93 | 94 | def forward(self, x): 95 | """Calculate forward propagation. 96 | 97 | Args: 98 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 99 | 100 | Returns: 101 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 102 | 103 | """ 104 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 105 | return self.w_2(self.dropout(x)) 106 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Positionwise feed forward layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class PositionwiseFeedForward(torch.nn.Module): 13 | """Positionwise feed forward layer. 14 | 15 | Args: 16 | idim (int): Input dimenstion. 17 | hidden_units (int): The number of hidden units. 18 | dropout_rate (float): Dropout rate. 
19 | 20 | """ 21 | 22 | def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()): 23 | """Construct an PositionwiseFeedForward object.""" 24 | super(PositionwiseFeedForward, self).__init__() 25 | self.w_1 = torch.nn.Linear(idim, hidden_units) 26 | self.w_2 = torch.nn.Linear(hidden_units, idim) 27 | self.dropout = torch.nn.Dropout(dropout_rate) 28 | self.activation = activation 29 | 30 | def forward(self, x): 31 | """Forward function.""" 32 | return self.w_2(self.dropout(self.activation(self.w_1(x)))) 33 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/repeat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Repeat the same layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class MultiSequential(torch.nn.Sequential): 13 | """Multi-input multi-output torch.nn.Sequential.""" 14 | 15 | def forward(self, *args): 16 | """Repeat.""" 17 | for m in self: 18 | args = m(*args) 19 | return args 20 | 21 | 22 | def repeat(N, fn): 23 | """Repeat module N times. 24 | 25 | Args: 26 | N (int): Number of repeat time. 27 | fn (Callable): Function to generate module. 28 | 29 | Returns: 30 | MultiSequential: Repeated model instance. 31 | 32 | """ 33 | return MultiSequential(*[fn(n) for n in range(N)]) 34 | -------------------------------------------------------------------------------- /deepaudio/tts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from . import display 15 | 16 | 17 | def str2bool(str): 18 | return True if str.lower() == 'true' else False 19 | -------------------------------------------------------------------------------- /deepaudio/tts/utils/display.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
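The repeat helper above is the piece that chains encoder/decoder layers into one module. A self-contained sketch (layer sizes chosen arbitrarily) combining it with the EncoderLayer, LightweightConvolution, and PositionwiseFeedForward modules shown earlier:

import torch

from deepaudio.tts.modules.transformer.encoder_layer import EncoderLayer
from deepaudio.tts.modules.transformer.lightconv import LightweightConvolution
from deepaudio.tts.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from deepaudio.tts.modules.transformer.repeat import repeat

adim = 256  # illustrative attention dimension
encoders = repeat(
    3,
    lambda n: EncoderLayer(
        size=adim,
        self_attn=LightweightConvolution(
            wshare=4, n_feat=adim, dropout_rate=0.1, kernel_size=31),
        feed_forward=PositionwiseFeedForward(
            idim=adim, hidden_units=1024, dropout_rate=0.1),
        dropout_rate=0.1,
    ),
)

x = torch.randn(2, 37, adim)                    # (batch, time, adim)
mask = torch.ones(2, 1, 37, dtype=torch.bool)   # all positions valid
y, out_mask = encoders(x, mask)
print(y.shape)                                  # torch.Size([2, 37, 256])

Each EncoderLayer returns an (output, mask) tuple, which is exactly what MultiSequential expects when it unpacks the arguments for the next layer in the stack.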
14 | import librosa.display 15 | import matplotlib.pylab as plt 16 | 17 | __all__ = [ 18 | "plot_alignment", 19 | "plot_spectrogram", 20 | "plot_waveform", 21 | "plot_multihead_alignments", 22 | "plot_multilayer_multihead_alignments", 23 | ] 24 | 25 | 26 | def plot_alignment(alignment, title=None): 27 | # alignment: [encoder_steps, decoder_steps) 28 | fig, ax = plt.subplots(figsize=(6, 4)) 29 | im = ax.imshow( 30 | alignment, aspect='auto', origin='lower', interpolation='none') 31 | fig.colorbar(im, ax=ax) 32 | xlabel = 'Decoder timestep' 33 | if title is not None: 34 | xlabel += '\n\n' + title 35 | plt.xlabel(xlabel) 36 | plt.ylabel('Encoder timestep') 37 | plt.tight_layout() 38 | return fig 39 | 40 | 41 | def plot_multihead_alignments(alignments, title=None): 42 | # alignments: [N, encoder_steps, decoder_steps) 43 | num_subplots = alignments.shape[0] 44 | 45 | fig, axes = plt.subplots( 46 | figsize=(6 * num_subplots, 4), 47 | ncols=num_subplots, 48 | sharey=True, 49 | squeeze=True) 50 | for i, ax in enumerate(axes): 51 | im = ax.imshow( 52 | alignments[i], aspect='auto', origin='lower', interpolation='none') 53 | fig.colorbar(im, ax=ax) 54 | xlabel = 'Decoder timestep' 55 | if title is not None: 56 | xlabel += '\n\n' + title 57 | ax.set_xlabel(xlabel) 58 | if i == 0: 59 | ax.set_ylabel('Encoder timestep') 60 | plt.tight_layout() 61 | return fig 62 | 63 | 64 | def plot_multilayer_multihead_alignments(alignments, title=None): 65 | # alignments: [num_layers, num_heads, encoder_steps, decoder_steps) 66 | num_layers, num_heads, *_ = alignments.shape 67 | 68 | fig, axes = plt.subplots( 69 | figsize=(6 * num_heads, 4 * num_layers), 70 | nrows=num_layers, 71 | ncols=num_heads, 72 | sharex=True, 73 | sharey=True, 74 | squeeze=True) 75 | for i, row in enumerate(axes): 76 | for j, ax in enumerate(row): 77 | im = ax.imshow( 78 | alignments[i, j], 79 | aspect='auto', 80 | origin='lower', 81 | interpolation='none') 82 | fig.colorbar(im, ax=ax) 83 | xlabel = 'Decoder timestep' 84 | if title is not None: 85 | xlabel += '\n\n' + title 86 | if i == num_layers - 1: 87 | ax.set_xlabel(xlabel) 88 | if j == 0: 89 | ax.set_ylabel('Encoder timestep') 90 | plt.tight_layout() 91 | return fig 92 | 93 | 94 | def plot_spectrogram(spec): 95 | # spec: [C, T] librosa convention 96 | fig, ax = plt.subplots(figsize=(12, 3)) 97 | im = ax.imshow(spec, aspect="auto", origin="lower", interpolation='none') 98 | plt.colorbar(im, ax=ax) 99 | plt.xlabel("Frames") 100 | plt.ylabel("Channels") 101 | plt.tight_layout() 102 | return fig 103 | 104 | 105 | def plot_waveform(wav, sr=22050): 106 | fig, ax = plt.subplots(figsize=(12, 3)) 107 | im = librosa.display.waveplot(wav, sr=22050) 108 | plt.colorbar(im, ax=ax) 109 | plt.tight_layout() 110 | return fig 111 | -------------------------------------------------------------------------------- /deepaudio/tts/utils/h5_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import logging 15 | import sys 16 | from pathlib import Path 17 | from typing import Any 18 | from typing import Union 19 | 20 | import h5py 21 | import numpy as np 22 | 23 | 24 | def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: 25 | """Read a dataset from a HDF5 file. 26 | Args: 27 | filename (Union[Path, str]): Path of the HDF5 file. 28 | dataset_name (str): Name of the dataset to read. 29 | 30 | Returns: 31 | Any: The retrieved dataset. 32 | """ 33 | filename = Path(filename) 34 | 35 | if not filename.exists(): 36 | logging.error(f"There is no such a hdf5 file ({filename}).") 37 | sys.exit(1) 38 | 39 | hdf5_file = h5py.File(filename, "r") 40 | 41 | if dataset_name not in hdf5_file: 42 | logging.error(f"There is no such a data in hdf5 file. ({dataset_name})") 43 | sys.exit(1) 44 | 45 | # [()]: a special syntax of h5py to get the dataset as-is 46 | hdf5_data = hdf5_file[dataset_name][()] 47 | hdf5_file.close() 48 | 49 | return hdf5_data 50 | 51 | 52 | def write_hdf5(filename: Union[Path, str], 53 | dataset_name: str, 54 | write_data: np.ndarray, 55 | is_overwrite: bool=True) -> None: 56 | """Write dataset to HDF5 file. 57 | Args: 58 | filename (Union[Path, str]): Path of the HDF5 file. 59 | dataset_name (str): Name of the dataset to write to. 60 | write_data (np.ndarrays): The data to write. 61 | is_overwrite (bool, optional): Whether to overwrite, by default True 62 | """ 63 | # convert to numpy array 64 | filename = Path(filename) 65 | write_data = np.array(write_data) 66 | 67 | # check folder existence 68 | filename.parent.mkdir(parents=True, exist_ok=True) 69 | 70 | # check hdf5 existence 71 | if filename.exists(): 72 | # if already exists, open with r+ mode 73 | hdf5_file = h5py.File(filename, "r+") 74 | # check dataset existence 75 | if dataset_name in hdf5_file: 76 | if is_overwrite: 77 | logging.warning("Dataset in hdf5 file already exists. " 78 | "recreate dataset in hdf5.") 79 | hdf5_file.__delitem__(dataset_name) 80 | else: 81 | logging.error( 82 | "Dataset in hdf5 file already exists. " 83 | "if you want to overwrite, please set is_overwrite = True.") 84 | hdf5_file.close() 85 | sys.exit(1) 86 | else: 87 | # if not exists, open with w mode 88 | hdf5_file = h5py.File(filename, "w") 89 | 90 | # write data to hdf5 91 | hdf5_file.create_dataset(dataset_name, data=write_data) 92 | hdf5_file.flush() 93 | hdf5_file.close() 94 | --------------------------------------------------------------------------------
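To close, a minimal round-trip sketch for the HDF5 helpers above; the file path and dataset name are only illustrative:

import numpy as np

from deepaudio.tts.utils.h5_utils import read_hdf5, write_hdf5

mel = np.random.randn(120, 80).astype(np.float32)   # illustrative feature matrix
write_hdf5("dump/sample-feats.h5", "mel", mel)       # parent directories are created if needed
restored = read_hdf5("dump/sample-feats.h5", "mel")
assert np.allclose(mel, restored)

# writing the same dataset again with is_overwrite=True (the default) replaces it in place
write_hdf5("dump/sample-feats.h5", "mel", mel * 0.5)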