├── .gitignore ├── Readme.md └── deepaudio ├── __init__.py └── tts ├── __init__.py ├── audio ├── __init__.py ├── audio.py ├── codec.py └── spec_normalizer.py ├── cli ├── __init__.py ├── configs │ ├── __init__.py │ ├── callbacks │ │ ├── default.yaml │ │ ├── early_stopping.yaml │ │ ├── model_checkpoint.yaml │ │ ├── model_summary.yaml │ │ ├── none.yaml │ │ └── rich_progress_bar.yaml │ ├── datamodule │ │ ├── fastspeech2.yaml │ │ ├── gan.yaml │ │ ├── tacotron2.yaml │ │ ├── transformer_tts.yaml │ │ ├── vits.yaml │ │ └── wavernn.yaml │ ├── experiment │ │ ├── fastspeech2.yaml │ │ ├── hifigan.yaml │ │ ├── parallel_wavegan.yaml │ │ ├── tacotron2.yaml │ │ ├── transformer_tts.yaml │ │ └── vits.yaml │ ├── extras │ │ └── default.yaml │ ├── hydra │ │ └── default.yaml │ ├── logger │ │ ├── comet.yaml │ │ ├── csv.yaml │ │ ├── many_loggers.yaml │ │ ├── mlflow.yaml │ │ ├── neptune.yaml │ │ ├── tensorboard.yaml │ │ └── wandb.yaml │ ├── model │ │ ├── fastspeech2.yaml │ │ ├── hifigan.yaml │ │ ├── parallel_wavegan.yaml │ │ ├── tacotron2.yaml │ │ ├── transformer_tts.yaml │ │ └── vits.yaml │ ├── paths │ │ └── default.yaml │ ├── train.yaml │ └── trainer │ │ ├── cpu.yaml │ │ ├── ddp.yaml │ │ ├── ddp_sim.yaml │ │ ├── default.yaml │ │ ├── gpu.yaml │ │ └── mps.yaml ├── preprocess │ ├── __init__.py │ ├── fastspeech2 │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ ├── gan_vocoder │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ ├── tacotron2 │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ ├── transformer_tts │ │ ├── __init__.py │ │ ├── normalize.py │ │ └── preprocess.py │ └── vits │ │ ├── normalize.py │ │ └── preprocess.py ├── train.py └── utils │ ├── __init__.py │ ├── pylogger.py │ ├── rich_utils.py │ └── utils.py ├── datamodules ├── __init__.py ├── fastspeech2_datamodule.py ├── gan_datamodule.py ├── tacotron2_datamodule.py ├── transformer_tts_datamodule.py ├── vits_datamodule.py └── wavernn_datamodule.py ├── datasets ├── __init__.py ├── am_batch_fn.py ├── batch.py ├── data_table.py ├── dataset.py ├── get_feats.py ├── ljspeech.py ├── preprocess_utils.py └── vocoder_batch_fn.py ├── feats_extract_from_torch ├── __init__.py ├── abs_feats_extract.py ├── complex_utils.py ├── dio.py ├── energy.py ├── linear_spectrogram.py ├── log_mel.py ├── log_mel_fbank.py ├── log_spectrogram.py └── stft.py ├── frontend ├── __init__.py ├── arpabet.py ├── generate_lexicon.py ├── normalizer │ ├── __init__.py │ ├── abbrrviation.py │ ├── acronyms.py │ ├── normalizer.py │ ├── numbers.py │ └── width.py ├── phonectic.py ├── punctuation.py ├── tone_sandhi.py ├── vocab.py ├── zh_frontend.py └── zh_normalization │ ├── README.md │ ├── __init__.py │ ├── char_convert.py │ ├── chronology.py │ ├── constants.py │ ├── num.py │ ├── phonecode.py │ ├── quantifier.py │ └── text_normlization.py ├── models ├── __init__.py ├── fastspeech2 │ ├── __init__.py │ ├── fastspeech2.py │ ├── loss.py │ └── model.py ├── hifigan │ ├── __init__.py │ ├── hifigan.py │ ├── loss.py │ ├── model.py │ └── residual_block.py ├── melgan │ ├── __init__.py │ ├── melgan.py │ ├── model.py │ ├── pqmf.py │ ├── residual_stack.py │ ├── style_melgan.py │ └── tade_res_block.py ├── parallel_wavegan │ ├── __init__.py │ ├── model.py │ ├── parallel_wavegan.py │ └── upsample.py ├── tacotron2 │ ├── __init__.py │ ├── loss.py │ ├── model.py │ └── tacotron2.py ├── transformer_tts │ ├── __init__.py │ ├── loss.py │ ├── model.py │ └── transformer.py ├── vits │ ├── __init__.py │ ├── duration_predictor.py │ ├── flow.py │ ├── generator.py │ ├── loss.py │ ├── 
model.py │ ├── monotonic_align │ │ ├── __init__.py │ │ ├── core.pyx │ │ └── setup.py │ ├── posterior_encoder.py │ ├── residual_coupling.py │ ├── text_encoder.py │ ├── transform.py │ ├── vits.py │ └── wavenet │ │ ├── __init__.py │ │ ├── residual_block.py │ │ └── wavenet.py └── wavernn │ ├── __init__.py │ ├── model.py │ └── wavernn.py ├── modules ├── __init__.py ├── activation.py ├── causal_conv.py ├── conformer │ ├── __init__.py │ ├── convolution.py │ ├── encoder.py │ ├── encoder_layer.py │ └── swish.py ├── conv.py ├── geometry.py ├── layer_norm.py ├── losses.py ├── masked_fill.py ├── nets_utils.py ├── normalizer.py ├── positional_encoding.py ├── pqmf.py ├── predictor │ ├── __init__.py │ ├── duration_calculator.py │ ├── duration_predictor.py │ ├── length_regulator.py │ └── variance_predictor.py ├── residual_block.py ├── residual_stack.py ├── style_encoder.py ├── tacotron2 │ ├── __init__.py │ ├── attentions.py │ ├── cbhg.py │ ├── decoder.py │ └── encoder.py ├── tade_res_block.py ├── transformer │ ├── __init__.py │ ├── attention.py │ ├── decoder.py │ ├── decoder_layer.py │ ├── dynamic_conv.py │ ├── dynamic_conv2d.py │ ├── embedding.py │ ├── encoder.py │ ├── encoder_layer.py │ ├── layer_norm.py │ ├── lightconv.py │ ├── lightconv2d.py │ ├── mask.py │ ├── multi_layer_conv.py │ ├── positionwise_feed_forward.py │ ├── repeat.py │ └── subsampling.py └── upsample.py └── utils ├── __init__.py ├── display.py ├── error_rate.py └── h5_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | outputs/ 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | ## What is deepaudio-tts? 2 | Deepaudio-tts is a framework for training neural-network-based Text-to-Speech (TTS) models. It includes, or will include, popular neural network architectures for TTS and vocoder models. 3 | 4 | To make it easy to use features such as mixed-precision training, multi-node training, and TPU training, I introduced PyTorch Lightning and Hydra in this framework. *It is still in development.* 5 | 6 | 7 | ## Training examples 8 | 1. Preprocess your data. (Scripts coming soon; for now you can follow the PaddleSpeech tutorial for this step.) 9 | 2. Train the model. Choose one experiment in deepaudio/tts/cli/configs/experiment, then train the model with the following commands (see also "More training examples" below): 10 | ``` 11 | $ export PYTHONPATH="${PYTHONPATH}:/dir/of/this/project/" 12 | $ python -m deepaudio.tts.cli.train experiment=tacotron2 datamodule.train_metadata=/your/path/to/train_metadata datamodule.dev_metadata=/your/path/to/dev_metadata 13 | ``` 14 | 15 | ## Supported Models 16 | 1. Tacotron2 17 | 2. FastSpeech2 18 | 3. Transformer TTS 19 | 4. Parallel WaveGAN 20 | 5. HiFiGAN 21 | 6. VITS 22 | 23 | ## Future plans 24 | ### Clean code 25 | 1. Remove redundant code. 26 | 2. Make deepaudio.tts.models cleaner. 27 | ### Models 28 | 1. Other models. 29 | 2. Pretrained models. 30 | ### Deployment 31 | 1. ONNX 32 | 2. TorchScript (JIT) 33 | ## How to contribute to deepaudio-tts 34 | 35 | This is a personal project, so I don't have enough GPU resources to run a lot of experiments. 36 | This project is still in development. 37 | I appreciate any kind of feedback or contributions. Please feel free to make a pull request for small issues such as bug fixes or experiment results. If you have any questions, please [open an issue](https://github.com/deepaudio/deepaudio-tts/issues).
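
## More training examples

The same entry point works for any experiment config under `deepaudio/tts/cli/configs/experiment`, and any value in the composed config can be overridden on the command line. The sketch below is a hedged example rather than a tested recipe: the metadata paths are placeholders, and the `batch_max_steps`/`n_shift` values must match whatever your preprocessing produced.

```
$ export PYTHONPATH="${PYTHONPATH}:/dir/of/this/project/"
# train the HiFiGAN vocoder on a single GPU (gan datamodule needs the extra STFT settings)
$ python -m deepaudio.tts.cli.train experiment=hifigan trainer=gpu \
    datamodule.train_metadata=/your/path/to/train_metadata \
    datamodule.dev_metadata=/your/path/to/dev_metadata \
    datamodule.batch_max_steps=8192 \
    datamodule.n_shift=256
```
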
38 | 39 | ## Acknowledgements 40 | I borrowed a lot of code from [espnet](https://github.com/espnet/espnet) and [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech). -------------------------------------------------------------------------------- /deepaudio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/audio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .audio import AudioProcessor 15 | from .codec import * 16 | from .spec_normalizer import LogMagnitude 17 | from .spec_normalizer import NormalizerBase 18 | -------------------------------------------------------------------------------- /deepaudio/tts/audio/audio.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | import librosa 15 | import numpy as np 16 | import soundfile as sf 17 | 18 | __all__ = ["AudioProcessor"] 19 | 20 | 21 | class AudioProcessor(object): 22 | def __init__(self, 23 | sample_rate: int, 24 | n_fft: int, 25 | win_length: int, 26 | hop_length: int, 27 | n_mels: int=80, 28 | fmin: int=0, 29 | fmax: int=None, 30 | window="hann", 31 | center=True, 32 | pad_mode="reflect", 33 | normalize=True): 34 | # read & write 35 | self.sample_rate = sample_rate 36 | self.normalize = normalize 37 | 38 | # stft 39 | self.n_fft = n_fft 40 | self.win_length = win_length 41 | self.hop_length = hop_length 42 | self.window = window 43 | self.center = center 44 | self.pad_mode = pad_mode 45 | 46 | # mel 47 | self.n_mels = n_mels 48 | self.fmin = fmin 49 | self.fmax = fmax 50 | 51 | self.mel_filter = self._create_mel_filter() 52 | self.inv_mel_filter = np.linalg.pinv(self.mel_filter) 53 | 54 | def _create_mel_filter(self): 55 | mel_filter = librosa.filters.mel( 56 | sr=self.sample_rate, 57 | n_fft=self.n_fft, 58 | n_mels=self.n_mels, 59 | fmin=self.fmin, 60 | fmax=self.fmax) 61 | return mel_filter 62 | 63 | def read_wav(self, filename): 64 | # resampling may occur 65 | wav, _ = librosa.load(filename, sr=self.sample_rate) 66 | 67 | # normalize the volume 68 | if self.normalize: 69 | wav = wav / np.max(np.abs(wav)) * 0.999 70 | return wav 71 | 72 | def write_wav(self, path, wav): 73 | sf.write(path, wav, samplerate=self.sample_rate) 74 | 75 | def stft(self, wav): 76 | D = librosa.core.stft( 77 | wav, 78 | n_fft=self.n_fft, 79 | hop_length=self.hop_length, 80 | win_length=self.win_length, 81 | window=self.window, 82 | center=self.center, 83 | pad_mode=self.pad_mode) 84 | return D 85 | 86 | def istft(self, D): 87 | wav = librosa.core.istft( 88 | D, 89 | hop_length=self.hop_length, 90 | win_length=self.win_length, 91 | window=self.window, 92 | center=self.center) 93 | return wav 94 | 95 | def spectrogram(self, wav): 96 | D = self.stft(wav) 97 | return np.abs(D) 98 | 99 | def mel_spectrogram(self, wav): 100 | S = self.spectrogram(wav) 101 | mel = np.dot(self.mel_filter, S) 102 | return mel 103 | -------------------------------------------------------------------------------- /deepaudio/tts/audio/codec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import math 15 | 16 | import numpy as np 17 | import paddle 18 | 19 | 20 | # x: [0: 2**bit-1], return: [-1, 1] 21 | def label_2_float(x, bits): 22 | return 2 * x / (2**bits - 1.) - 1. 23 | 24 | 25 | #x: [-1, 1], return: [0, 2**bits-1] 26 | def float_2_label(x, bits): 27 | assert abs(x).max() <= 1.0 28 | x = (x + 1.) 
* (2**bits - 1) / 2 29 | return x.clip(0, 2**bits - 1) 30 | 31 | 32 | # y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1] 33 | # see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm 34 | # be careful the input `mu` here, which is +1 than that of the link above 35 | def encode_mu_law(x, mu): 36 | mu = mu - 1 37 | fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) 38 | return np.floor((fx + 1) / 2 * mu + 0.5) 39 | 40 | 41 | # from_labels = True: 42 | # y: [0: 2**bit-1], mu: 2**bits, return: [-1,1] 43 | # from_labels = False: 44 | # y: [-1, 1], return: [-1, 1] 45 | def decode_mu_law(y, mu, from_labels=True): 46 | # TODO: get rid of log2 - makes no sense 47 | if from_labels: 48 | y = label_2_float(y, math.log2(mu)) 49 | mu = mu - 1 50 | x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1) 51 | return x 52 | -------------------------------------------------------------------------------- /deepaudio/tts/audio/spec_normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | This modules contains normalizers for spectrogram magnitude. 16 | Normalizers are invertible transformations. They can be used to process 17 | magnitude of spectrogram before training and can also be used to recover from 18 | the generated spectrogram so as to be used with vocoders like griffin lim. 19 | 20 | The base class describe the interface. `transform` is used to perform 21 | transformation and `inverse` is used to perform the inverse transformation. 22 | 23 | check issues: 24 | https://github.com/mozilla/TTS/issues/377 25 | """ 26 | import numpy as np 27 | 28 | __all__ = ["NormalizerBase", "LogMagnitude", "UnitMagnitude"] 29 | 30 | 31 | class NormalizerBase(object): 32 | def transform(self, spec): 33 | raise NotImplementedError("transform must be implemented") 34 | 35 | def inverse(self, normalized): 36 | raise NotImplementedError("inverse must be implemented") 37 | 38 | 39 | class LogMagnitude(NormalizerBase): 40 | """ 41 | This is a simple normalizer used in Waveglow, Waveflow, tacotron2... 
42 | """ 43 | 44 | def __init__(self, min=1e-5): 45 | self.min = min 46 | 47 | def transform(self, x): 48 | x = np.maximum(x, self.min) 49 | x = np.log(x) 50 | return x 51 | 52 | def inverse(self, x): 53 | return np.exp(x) 54 | 55 | 56 | class UnitMagnitude(NormalizerBase): 57 | # dbscale and (0, 1) normalization 58 | """ 59 | This is the normalizer used in the 60 | """ 61 | 62 | def __init__(self, min=1e-5): 63 | self.min = min 64 | 65 | def transform(self, x): 66 | db_scale = 20 * np.log10(np.maximum(self.min, x)) - 20 67 | normalized = (db_scale + 100) / 100 68 | clipped = np.clip(normalized, 0, 1) 69 | return clipped 70 | 71 | def inverse(self, x): 72 | denormalized = np.clip(x, 0, 1) * 100 - 100 73 | out = np.exp((denormalized + 20) / 20 * np.log(10)) 74 | return out 75 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/cli/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/cli/configs/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - early_stopping.yaml 4 | - model_summary.yaml 5 | - rich_progress_bar.yaml 6 | - _self_ 7 | 8 | model_checkpoint: 9 | dirpath: ${paths.output_dir}/checkpoints 10 | filename: "epoch_{epoch:03d}" 11 | monitor: "val/loss" 12 | mode: "min" 13 | save_last: True 14 | auto_insert_metric_name: False 15 | 16 | early_stopping: 17 | monitor: "val/loss" 18 | patience: 100 19 | mode: "min" 20 | 21 | model_summary: 22 | max_depth: -1 23 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/early_stopping.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.EarlyStopping.html 2 | 3 | # Monitor a metric and stop training when it stops improving. 4 | # Look at the above link for more detailed information. 5 | early_stopping: 6 | _target_: pytorch_lightning.callbacks.EarlyStopping 7 | monitor: ??? # quantity to be monitored, must be specified !!! 8 | min_delta: 0. 
# minimum change in the monitored quantity to qualify as an improvement 9 | patience: 3 # number of checks with no improvement after which training will be stopped 10 | verbose: False # verbosity mode 11 | mode: "min" # "max" means higher metric value is better, can be also "min" 12 | strict: True # whether to crash the training if monitor is not found in the validation metrics 13 | check_finite: True # when set True, stops training when the monitor becomes NaN or infinite 14 | stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold 15 | divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold 16 | check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch 17 | # log_rank_zero_only: False # this keyword argument isn't available in stable version 18 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.ModelCheckpoint.html 2 | 3 | # Save the model periodically by monitoring a quantity. 4 | # Look at the above link for more detailed information. 5 | model_checkpoint: 6 | _target_: pytorch_lightning.callbacks.ModelCheckpoint 7 | dirpath: null # directory to save the model file 8 | filename: null # checkpoint filename 9 | monitor: null # name of the logged metric which determines when model is improving 10 | verbose: False # verbosity mode 11 | save_last: null # additionally always save an exact copy of the last checkpoint to a file last.ckpt 12 | save_top_k: 1 # save k best models (determined by above metric) 13 | mode: "min" # "max" means higher metric value is better, can be also "min" 14 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 15 | save_weights_only: False # if True, then only the model’s weights will be saved 16 | every_n_train_steps: null # number of training steps between checkpoints 17 | train_time_interval: null # checkpoints are monitored at the specified time interval 18 | every_n_epochs: null # number of epochs between checkpoints 19 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 20 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichModelSummary.html 2 | 3 | # Generates a summary of all layers in a LightningModule with rich text formatting. 4 | # Look at the above link for more detailed information. 
5 | model_summary: 6 | _target_: pytorch_lightning.callbacks.RichModelSummary 7 | max_depth: 1 # the maximum depth of layer nesting that the summary will include 8 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/cli/configs/callbacks/none.yaml -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.callbacks.RichProgressBar.html 2 | 3 | # Create a progress bar with rich text formatting. 4 | # Look at the above link for more detailed information. 5 | rich_progress_bar: 6 | _target_: pytorch_lightning.callbacks.RichProgressBar 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/fastspeech2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.fastspeech2_datamodule.Fastspeech2DataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/gan.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.gan_datamodule.GanDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_max_steps: ??? 5 | n_shift: ??? 6 | batch_size: 128 7 | num_workers: 0 8 | pin_memory: False 9 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/tacotron2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.tacotron2_datamodule.Tacaotron2DataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/transformer_tts.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.transformer_tts_datamodule.TransformerTTSDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/vits.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.vits_datamodule.VitsDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_size: 128 5 | num_workers: 0 6 | pin_memory: False 7 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/datamodule/wavernn.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.datamodules.wavernn_datamodule.WaveRNNDataModule 2 | train_metadata: ??? 3 | dev_metadata: ??? 4 | batch_max_steps: ??? 5 | n_shift: ??? 6 | mode: ??? 7 | bits: ??? 
8 | batch_size: 128 9 | num_workers: 0 10 | pin_memory: False 11 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/fastspeech2.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=fastspeech2 5 | 6 | defaults: 7 | - override /datamodule: fastspeech2.yaml 8 | - override /model: fastspeech2.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["fastspeech2", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/hifigan.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=hifigan 5 | 6 | defaults: 7 | - override /datamodule: gan.yaml 8 | - override /model: hifigan.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["hifigan", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/parallel_wavegan.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=parallel_wavegan 5 | 6 | defaults: 7 | - override /datamodule: gan.yaml 8 | - override /model: parallel_wavegan.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["parallel_wavegan", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/tacotron2.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=tacotron2 5 | 6 | defaults: 7 | - override /datamodule: tacotron2.yaml 8 | - override /model: tacotron2.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["tacotron2", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | 
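
The experiment configs in this directory only pin a handful of values (tags, seed, trainer epochs, batch size); everything else in the composed config can still be overridden from the command line, and Hydra's multirun mode can sweep several values in one call. A hedged sketch, assuming a preprocessed LJSpeech setup (the metadata paths are placeholders and the sweep values are arbitrary):

```
# sweep two batch sizes for the tacotron2 experiment on GPU;
# each run goes to the multiruns directory configured in hydra/default.yaml
$ python -m deepaudio.tts.cli.train -m experiment=tacotron2 trainer=gpu \
    datamodule.batch_size=32,64 \
    datamodule.train_metadata=/your/path/to/train_metadata \
    datamodule.dev_metadata=/your/path/to/dev_metadata
```
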
-------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/transformer_tts.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=transformer_tts 5 | 6 | defaults: 7 | - override /datamodule: transformer_tts.yaml 8 | - override /model: transformer_tts.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["transformer_tts", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/experiment/vits.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=vits 5 | 6 | defaults: 7 | - override /datamodule: vits.yaml 8 | - override /model: vits.yaml 9 | - override /callbacks: default.yaml 10 | - override /trainer: default.yaml 11 | 12 | # all parameters below will be merged with parameters from default configurations set above 13 | # this allows you to overwrite only specified parameters 14 | 15 | tags: ["vits", "ljspeech"] 16 | 17 | seed: 12345 18 | 19 | trainer: 20 | min_epochs: 10 21 | max_epochs: 10 22 | gradient_clip_val: 0.5 23 | 24 | datamodule: 25 | batch_size: 64 26 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | # https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: default 6 | - override job_logging: default 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: pytorch_lightning.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | 
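
Any of the logger configs in this directory can be activated per run; `train.yaml` leaves `logger: null`, so logging is opt-in from the command line (or `logger=many_loggers` for several backends at once). A minimal sketch, assuming TensorBoard and the default `paths.log_dir` of `logs/` (metadata paths are placeholders):

```
$ python -m deepaudio.tts.cli.train experiment=tacotron2 logger=tensorboard \
    datamodule.train_metadata=/your/path/to/train_metadata \
    datamodule.dev_metadata=/your/path/to/dev_metadata
# inspect the curves afterwards
$ tensorboard --logdir logs/
```
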
-------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: pytorch_lightning.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet.yaml 5 | - csv.yaml 6 | # - mlflow.yaml 7 | # - neptune.yaml 8 | - tensorboard.yaml 9 | - wandb.yaml 10 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: pytorch_lightning.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: pytorch_lightning.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | wandb: 4 | _target_: pytorch_lightning.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 
9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/fastspeech2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.fastspeech2.model.Fastspeech2Model 2 | 3 | optimizer: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.0 8 | 9 | scheduler: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | model: 17 | _target_: deepaudio.tts.models.fastspeech2.fastspeech2.FastSpeech2 18 | idim: 80 # Dimension of the inputs 19 | odim: 80 # Dimension of the outputs. 20 | adim: 384 # attention dimension 21 | aheads: 2 # number of attention heads 22 | elayers: 4 # number of encoder layers 23 | eunits: 1536 # number of encoder ff units 24 | dlayers: 4 # number of decoder layers 25 | dunits: 1536 # number of decoder ff units 26 | positionwise_layer_type: conv1d # type of position-wise layer 27 | positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer 28 | duration_predictor_layers: 2 # number of layers of duration predictor 29 | duration_predictor_chans: 256 # number of channels of duration predictor 30 | duration_predictor_kernel_size: 3 # filter size of duration predictor 31 | postnet_layers: 5 # number of layers of postnset 32 | postnet_filts: 5 # filter size of conv layers in postnet 33 | postnet_chans: 256 # number of channels of conv layers in postnet 34 | use_scaled_pos_enc: True # whether to use scaled positional encoding 35 | encoder_normalize_before: True # whether to perform layer normalization before the input 36 | decoder_normalize_before: True # whether to perform layer normalization before the input 37 | reduction_factor: 1 # reduction factor 38 | init_type: xavier_uniform # initialization type 39 | init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding 40 | init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding 41 | transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer 42 | transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding 43 | transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer 44 | transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer 45 | transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding 46 | transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer 47 | pitch_predictor_layers: 5 # number of conv layers in pitch predictor 48 | pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor 49 | pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor 50 | pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor 51 | pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch 52 | pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch 53 | stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder 54 | 
energy_predictor_layers: 2 # number of conv layers in energy predictor 55 | energy_predictor_chans: 256 # number of channels of conv layers in energy predictor 56 | energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor 57 | energy_predictor_dropout: 0.5 # dropout rate in energy predictor 58 | energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy 59 | energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy 60 | stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder 61 | 62 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/parallel_wavegan.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.parallel_wavegan.model.ParallelWaveGANModel 2 | 3 | optimizer_d: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.00001 8 | 9 | scheduler_d: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | optimizer_g: 17 | _target_: torch.optim.Adam 18 | _partial_: true 19 | lr: 0.001 20 | weight_decay: 0.00001 21 | 22 | scheduler_g: 23 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 24 | _partial_: true 25 | mode: min 26 | factor: 0.1 27 | patience: 10 28 | 29 | lambda_aux: 1.0 30 | lambda_adv: 4.0 31 | 32 | generator: 33 | _target_: deepaudio.tts.models.parallel_wavegan.parallel_wavegan.ParallelWaveGANGenerator 34 | in_channels: 1 # Number of input channels. 35 | out_channels: 1 # Number of output channels. 36 | kernel_size: 3 # Kernel size of dilated convolution. 37 | layers: 30 # Number of residual block layers. 38 | stacks: 3 # Number of stacks i.e., dilation cycles. 39 | residual_channels: 64 # Number of channels in residual conv. 40 | gate_channels: 128 # Number of channels in gated conv. 41 | skip_channels: 64 # Number of channels in skip conv. 42 | aux_channels: 80 # Number of channels for auxiliary feature conv. 43 | # Must be the same as num_mels. 44 | aux_context_window: 2 # Context window size for auxiliary feature. 45 | # If set to 2, previous 2 and future 2 frames will be considered. 46 | dropout_rate: 0.0 # Dropout rate. 0.0 means no dropout applied. 47 | use_weight_norm: True # Whether to use weight norm. 48 | # If set to true, it will be applied to all of the conv layers. 49 | #upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift 50 | 51 | discriminator: 52 | _target_: deepaudio.tts.models.parallel_wavegan.parallel_wavegan.ParallelWaveGANDiscriminator 53 | in_channels: 1 # Number of input channels. 54 | out_channels: 1 # Number of output channels. 55 | kernel_size: 3 # Kernel size of conv layers. 56 | layers: 10 # Number of conv layers. 57 | conv_channels: 64 # Number of conv channels. 58 | bias: True # Whether to use bias parameter in conv. 59 | use_weight_norm: True # Whether to use weight norm. 60 | # If set to true, it will be applied to all of the conv layers. 61 | nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. 62 | nonlinear_activation_params: # Nonlinear function parameters 63 | negative_slope: 0.2 # Alpha in leakyrelu. 64 | 65 | criterion_stft: 66 | _target_: deepaudio.tts.modules.losses.MultiResolutionSTFTLoss 67 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
68 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 69 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 70 | window: "hann" # Window function for STFT-based loss 71 | 72 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/tacotron2.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.tacotron2.model.Tacotron2Model 2 | 3 | optimizer: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.0 8 | 9 | scheduler: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | loss_type: L1+L2 17 | use_guided_attn_loss: True 18 | 19 | taco2_loss: 20 | _target_: deepaudio.tts.models.tacotron2.loss.Tacotron2Loss 21 | use_masking: True 22 | use_weighted_masking: False 23 | bce_pos_weight: 5.0 24 | 25 | attn_loss: 26 | _target_: deepaudio.tts.models.tacotron2.loss.GuidedAttentionLoss 27 | sigma: 0.4 28 | alpha: 1.0 29 | reset_always: True 30 | 31 | model: 32 | _target_: deepaudio.tts.models.tacotron2.tacotron2.Tacotron2 33 | idim: 80 34 | odim: 80 35 | embed_dim: 512 # char or phn embedding dimension 36 | elayers: 1 # number of blstm layers in encoder 37 | eunits: 512 # number of blstm units 38 | econv_layers: 3 # number of convolutional layers in encoder 39 | econv_chans: 512 # number of channels in convolutional layer 40 | econv_filts: 5 # filter size of convolutional layer 41 | atype: location # attention function type 42 | adim: 512 # attention dimension 43 | aconv_chans: 32 # number of channels in convolutional layer of attention 44 | aconv_filts: 15 # filter size of convolutional layer of attention 45 | cumulate_att_w: True # whether to cumulate attention weight 46 | dlayers: 2 # number of lstm layers in decoder 47 | dunits: 1024 # number of lstm units in decoder 48 | prenet_layers: 2 # number of layers in prenet 49 | prenet_units: 256 # number of units in prenet 50 | postnet_layers: 5 # number of layers in postnet 51 | postnet_chans: 512 # number of channels in postnet 52 | postnet_filts: 5 # filter size of postnet layer 53 | output_activation: null # activation function for the final output 54 | use_batch_norm: True # whether to use batch normalization in encoder 55 | use_concate: True # whether to concatenate encoder embedding with decoder outputs 56 | use_residual: False # whether to use residual connection in encoder 57 | dropout_rate: 0.5 # dropout rate 58 | zoneout_rate: 0.1 # zoneout rate 59 | reduction_factor: 1 # reduction factor 60 | spk_embed_dim: null # speaker embedding dimension 61 | 62 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/model/transformer_tts.yaml: -------------------------------------------------------------------------------- 1 | _target_: deepaudio.tts.models.transformer_tts.model.TransformerTTSModel 2 | 3 | optimizer: 4 | _target_: torch.optim.Adam 5 | _partial_: true 6 | lr: 0.001 7 | weight_decay: 0.00001 8 | 9 | scheduler: 10 | _target_: torch.optim.lr_scheduler.ReduceLROnPlateau 11 | _partial_: true 12 | mode: min 13 | factor: 0.1 14 | patience: 10 15 | 16 | loss_type: L1+L2 17 | modules_applied_guided_attn: encoder-decoder 18 | use_guided_attn_loss: True 19 | 20 | transformer_loss: 21 | _target_: deepaudio.tts.models.transformer_tts.loss.TransformerLoss 22 | use_masking: True 23 | use_weighted_masking: False 24 |
bce_pos_weight: 20.0 25 | 26 | atten_criterion: 27 | _target_: deepaudio.tts.models.transformer_tts.loss.GuidedMultiHeadAttentionLoss 28 | sigma: 0.4 29 | alpha: 1.0 30 | reset_always: True 31 | 32 | 33 | model: 34 | _target_: deepaudio.tts.models.transformer_tts.transformer.Transformer 35 | idim: 80 36 | odim: 80 37 | embed_dim: 0 # embedding dimension in encoder prenet 38 | eprenet_conv_layers: 0 # number of conv layers in encoder prenet 39 | # if set to 0, no encoder prenet will be used 40 | eprenet_conv_filts: 0 # filter size of conv layers in encoder prenet 41 | eprenet_conv_chans: 0 # number of channels of conv layers in encoder prenet 42 | dprenet_layers: 2 # number of layers in decoder prenet 43 | dprenet_units: 256 # number of units in decoder prenet 44 | adim: 512 # attention dimension 45 | aheads: 8 # number of attention heads 46 | elayers: 6 # number of encoder layers 47 | eunits: 1024 # number of encoder ff units 48 | dlayers: 6 # number of decoder layers 49 | dunits: 1024 # number of decoder ff units 50 | positionwise_layer_type: conv1d # type of position-wise layer 51 | positionwise_conv_kernel_size: 1 # kernel size of position wise conv layer 52 | postnet_layers: 5 # number of layers of postnset 53 | postnet_filts: 5 # filter size of conv layers in postnet 54 | postnet_chans: 256 # number of channels of conv layers in postnet 55 | use_scaled_pos_enc: True # whether to use scaled positional encoding 56 | encoder_normalize_before: True # whether to perform layer normalization before the input 57 | decoder_normalize_before: True # whether to perform layer normalization before the input 58 | reduction_factor: 1 # reduction factor 59 | init_type: xavier_uniform # initialization type 60 | init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding 61 | init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding 62 | eprenet_dropout_rate: 0.0 # dropout rate for encoder prenet 63 | dprenet_dropout_rate: 0.5 # dropout rate for decoder prenet 64 | postnet_dropout_rate: 0.5 # dropout rate for postnet 65 | transformer_enc_dropout_rate: 0.1 # dropout rate for transformer encoder layer 66 | transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding 67 | transformer_enc_attn_dropout_rate: 0.1 # dropout rate for transformer encoder attention layer 68 | transformer_dec_dropout_rate: 0.1 # dropout rate for transformer decoder layer 69 | transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding 70 | transformer_dec_attn_dropout_rate: 0.1 # dropout rate for transformer decoder attention layer 71 | transformer_enc_dec_attn_dropout_rate: 0.1 # dropout rate for transformer encoder-decoder attention layer 72 | num_heads_applied_guided_attn: 2 # number of heads to apply guided attention loss 73 | num_layers_applied_guided_attn: 2 # number of layers to apply guided attention loss 74 | 75 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # PROJECT_ROOT is inferred and set by pyrootutils package in `train.py` and `eval.py` 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to logging directory 7 | log_dir: ${paths.root_dir}/logs/ 8 | 9 | # path to output directory, created dynamically by hydra 10 | # path generation 
pattern is specified in `configs/hydra/default.yaml` 11 | # use it to store all files generated during the run, like ckpts and metrics 12 | output_dir: ${hydra:runtime.output_dir} 13 | 14 | # path to working directory 15 | work_dir: ${hydra:runtime.cwd} 16 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # specify here default configuration 4 | # order of defaults determines the order in which configs override each other 5 | defaults: 6 | - _self_ 7 | - datamodule: tacotron2.yaml 8 | - model: tacotron2.yaml 9 | - callbacks: default.yaml 10 | - logger: null # set logger here or use command line (e.g. `python train.py logger=tensorboard`) 11 | - trainer: default.yaml 12 | - paths: default.yaml 13 | - extras: default.yaml 14 | - hydra: default.yaml 15 | 16 | # experiment configs allow for version control of specific hyperparameters 17 | # e.g. best hyperparameters for given model and datamodule 18 | - experiment: null 19 | 20 | # config for hyperparameter optimization 21 | - hparams_search: null 22 | 23 | # optional local config for machine/user specific settings 24 | # it's optional since it doesn't need to exist and is excluded from version control 25 | - optional local: default.yaml 26 | 27 | # debugging config (enable through command line, e.g. `python train.py debug=default) 28 | - debug: null 29 | 30 | # task name, determines output directory path 31 | task_name: "train" 32 | 33 | # tags to help you identify your experiments 34 | # you can overwrite this in experiment configs 35 | # overwrite from command line with `python train.py tags="[first_tag, second_tag]"` 36 | # appending lists from command line is currently not supported :( 37 | # https://github.com/facebookresearch/hydra/issues/1547 38 | tags: ["dev"] 39 | 40 | # set False to skip model training 41 | train: True 42 | 43 | # evaluate on test set, using best model weights achieved during training 44 | # lightning chooses best weights based on the metric specified in checkpoint callback 45 | test: True 46 | 47 | # simply provide checkpoint path to resume training 48 | ckpt_path: null 49 | 50 | # seed for random number generators in pytorch, numpy and python.random 51 | seed: null 52 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | # use "ddp_spawn" instead of "ddp", 5 | # it's slower but normal "ddp" currently doesn't work ideally with hydra 6 | # https://github.com/facebookresearch/hydra/issues/2070 7 | # https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu_intermediate.html#distributed-data-parallel-spawn 8 | strategy: ddp_spawn 9 | 10 | accelerator: gpu 11 | devices: 4 12 | num_nodes: 1 13 | sync_batchnorm: True 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - 
default.yaml 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: pytorch_lightning.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | min_epochs: 1 # prevents early stopping 6 | max_epochs: 10 7 | 8 | accelerator: cpu 9 | devices: 1 10 | 11 | # mixed precision for extra speed-up 12 | # precision: 16 13 | 14 | # perform a validation loop every N training epochs 15 | check_val_every_n_epoch: 1 16 | 17 | # set True to to ensure deterministic results 18 | # makes training slower but gives more reproducibility than just setting seeds 19 | deterministic: False 20 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default.yaml 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/fastspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/gan_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/preprocess/transformer_tts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/train.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import hydra 4 | import pytorch_lightning as pl 5 | from omegaconf import DictConfig 6 | from pytorch_lightning import Callback, LightningDataModule, LightningModule, Trainer 7 | from pytorch_lightning.loggers import LightningLoggerBase 8 | 9 | from deepaudio.tts.cli import utils 10 | 11 | log = utils.get_pylogger(__name__) 12 | 13 | 14 | @utils.task_wrapper 15 | def train(cfg: DictConfig) -> Tuple[dict, dict]: 16 | """Trains the model. Can additionally evaluate on a testset, using best weights obtained during 17 | training. 18 | This method is wrapped in optional @task_wrapper decorator which applies extra utilities 19 | before and after the call. 20 | Args: 21 | cfg (DictConfig): Configuration composed by Hydra. 22 | Returns: 23 | Tuple[dict, dict]: Dict with metrics and dict with all instantiated objects. 
24 | """ 25 | 26 | # set seed for random number generators in pytorch, numpy and python.random 27 | if cfg.get("seed"): 28 | pl.seed_everything(cfg.seed, workers=True) 29 | 30 | log.info(f"Instantiating datamodule <{cfg.datamodule._target_}>") 31 | datamodule: LightningDataModule = hydra.utils.instantiate(cfg.datamodule) 32 | 33 | log.info(f"Instantiating model <{cfg.model._target_}>") 34 | model: LightningModule = hydra.utils.instantiate(cfg.model) 35 | 36 | log.info("Instantiating callbacks...") 37 | callbacks: List[Callback] = utils.instantiate_callbacks(cfg.get("callbacks")) 38 | 39 | log.info("Instantiating loggers...") 40 | logger: List[LightningLoggerBase] = utils.instantiate_loggers(cfg.get("logger")) 41 | 42 | log.info(f"Instantiating trainer <{cfg.trainer._target_}>") 43 | trainer: Trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=logger) 44 | 45 | object_dict = { 46 | "cfg": cfg, 47 | "datamodule": datamodule, 48 | "model": model, 49 | "callbacks": callbacks, 50 | "logger": logger, 51 | "trainer": trainer, 52 | } 53 | 54 | if logger: 55 | log.info("Logging hyperparameters!") 56 | utils.log_hyperparameters(object_dict) 57 | 58 | if cfg.get("train"): 59 | log.info("Starting training!") 60 | trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path")) 61 | 62 | train_metrics = trainer.callback_metrics 63 | 64 | if cfg.get("test"): 65 | log.info("Starting testing!") 66 | ckpt_path = trainer.checkpoint_callback.best_model_path 67 | if ckpt_path == "": 68 | log.warning("Best ckpt not found! Using current weights for testing...") 69 | ckpt_path = None 70 | trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path) 71 | log.info(f"Best ckpt path: {ckpt_path}") 72 | 73 | test_metrics = trainer.callback_metrics 74 | 75 | # merge train and test metrics 76 | metric_dict = {**train_metrics, **test_metrics} 77 | 78 | return metric_dict, object_dict 79 | 80 | 81 | @hydra.main(version_base="1.2", config_path="configs", config_name="train.yaml") 82 | def main(cfg: DictConfig) -> Optional[float]: 83 | 84 | # train the model 85 | metric_dict, _ = train(cfg) 86 | 87 | # safely retrieve metric value for hydra-based hyperparameter optimization 88 | metric_value = utils.get_metric_value( 89 | metric_dict=metric_dict, metric_name=cfg.get("optimized_metric") 90 | ) 91 | 92 | # return optimized metric 93 | return metric_value 94 | 95 | 96 | if __name__ == "__main__": 97 | main() -------------------------------------------------------------------------------- /deepaudio/tts/cli/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from deepaudio.tts.cli.utils.pylogger import get_pylogger 2 | from deepaudio.tts.cli.utils.rich_utils import enforce_tags, print_config_tree 3 | from deepaudio.tts.cli.utils.utils import ( 4 | close_loggers, 5 | extras, 6 | get_metric_value, 7 | instantiate_callbacks, 8 | instantiate_loggers, 9 | log_hyperparameters, 10 | save_file, 11 | task_wrapper, 12 | ) 13 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/utils/pylogger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from pytorch_lightning.utilities import rank_zero_only 4 | 5 | 6 | def get_pylogger(name=__name__) -> logging.Logger: 7 | """Initializes multi-GPU-friendly python command line logger.""" 8 | 9 | logger = logging.getLogger(name) 10 | 11 | # this ensures all logging levels get marked with the 
rank zero decorator 12 | # otherwise logs would get multiplied for each GPU process in multi-GPU setup 13 | logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical") 14 | for level in logging_levels: 15 | setattr(logger, level, rank_zero_only(getattr(logger, level))) 16 | 17 | return logger 18 | -------------------------------------------------------------------------------- /deepaudio/tts/cli/utils/rich_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | import rich 5 | import rich.syntax 6 | import rich.tree 7 | from hydra.core.hydra_config import HydraConfig 8 | from omegaconf import DictConfig, OmegaConf, open_dict 9 | from pytorch_lightning.utilities import rank_zero_only 10 | from rich.prompt import Prompt 11 | 12 | from deepaudio.tts.cli.utils import pylogger 13 | 14 | log = pylogger.get_pylogger(__name__) 15 | 16 | 17 | @rank_zero_only 18 | def print_config_tree( 19 | cfg: DictConfig, 20 | print_order: Sequence[str] = ( 21 | "datamodule", 22 | "model", 23 | "callbacks", 24 | "logger", 25 | "trainer", 26 | "paths", 27 | "extras", 28 | ), 29 | resolve: bool = False, 30 | save_to_file: bool = False, 31 | ) -> None: 32 | """Prints content of DictConfig using Rich library and its tree structure. 33 | 34 | Args: 35 | cfg (DictConfig): Configuration composed by Hydra. 36 | print_order (Sequence[str], optional): Determines in what order config components are printed. 37 | resolve (bool, optional): Whether to resolve reference fields of DictConfig. 38 | save_to_file (bool, optional): Whether to export config to the hydra output folder. 39 | """ 40 | 41 | style = "dim" 42 | tree = rich.tree.Tree("CONFIG", style=style, guide_style=style) 43 | 44 | queue = [] 45 | 46 | # add fields from `print_order` to queue 47 | for field in print_order: 48 | queue.append(field) if field in cfg else log.warning( 49 | f"Field '{field}' not found in config. Skipping '{field}' config printing..." 50 | ) 51 | 52 | # add all the other fields to queue (not specified in `print_order`) 53 | for field in cfg: 54 | if field not in queue: 55 | queue.append(field) 56 | 57 | # generate config tree from queue 58 | for field in queue: 59 | branch = tree.add(field, style=style, guide_style=style) 60 | 61 | config_group = cfg[field] 62 | if isinstance(config_group, DictConfig): 63 | branch_content = OmegaConf.to_yaml(config_group, resolve=resolve) 64 | else: 65 | branch_content = str(config_group) 66 | 67 | branch.add(rich.syntax.Syntax(branch_content, "yaml")) 68 | 69 | # print config tree 70 | rich.print(tree) 71 | 72 | # save config tree to file 73 | if save_to_file: 74 | with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file: 75 | rich.print(tree, file=file) 76 | 77 | 78 | @rank_zero_only 79 | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None: 80 | """Prompts user to input tags from command line if no tags are provided in config.""" 81 | 82 | if not cfg.get("tags"): 83 | if "id" in HydraConfig().cfg.hydra.job: 84 | raise ValueError("Specify tags before launching a multirun!") 85 | 86 | log.warning("No tags provided in config. 
Prompting user to input tags...") 87 | tags = Prompt.ask("Enter a list of comma separated tags", default="dev") 88 | tags = [t.strip() for t in tags.split(",") if t != ""] 89 | 90 | with open_dict(cfg): 91 | cfg.tags = tags 92 | 93 | log.info(f"Tags: {cfg.tags}") 94 | 95 | if save_to_file: 96 | with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file: 97 | rich.print(cfg.tags, file=file) 98 | 99 | 100 | if __name__ == "__main__": 101 | from hydra import compose, initialize 102 | 103 | with initialize(version_base="1.2", config_path="../../configs"): 104 | cfg = compose(config_name="train.yaml", return_hydra_config=False, overrides=[]) 105 | print_config_tree(cfg, resolve=False, save_to_file=False) 106 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/datamodules/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/fastspeech2_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn 9 | from deepaudio.tts.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn 10 | from deepaudio.tts.datasets.data_table import DataTable 11 | 12 | 13 | class Fastspeech2DataModule(LightningDataModule): 14 | def __init__(self, 15 | train_metadata: str, 16 | dev_metadata: str, 17 | batch_size: int = 64, 18 | num_workers: int = 0, 19 | pin_memory: bool = False, 20 | speaker_dict: Optional[str] = None, 21 | voice_cloning: Optional[bool] = False, 22 | ): 23 | super().__init__() 24 | self.save_hyperparameters(logger=False) 25 | self.train_dataset: Optional[Dataset] = None 26 | self.dev_dataset: Optional[Dataset] = None 27 | 28 | def setup(self, stage: Optional[str] = None) -> None: 29 | fields = [ 30 | "text", "text_lengths", "speech", "speech_lengths", "durations", 31 | "pitch", "energy" 32 | ] 33 | converters = {"speech": np.load, "pitch": np.load, "energy": np.load} 34 | spk_num = None 35 | if self.hparams.speaker_dict is not None: 36 | print("multiple speaker fastspeech2!") 37 | self.collate_fn = fastspeech2_multi_spk_batch_fn 38 | with open(self.hparams.speaker_dict, 'rt') as f: 39 | spk_id = [line.strip().split() for line in f.readlines()] 40 | spk_num = len(spk_id) 41 | fields += ["spk_id"] 42 | elif self.hparams.voice_cloning: 43 | print("Training voice cloning!") 44 | self.collate_fn = fastspeech2_multi_spk_batch_fn 45 | fields += ["spk_emb"] 46 | converters["spk_emb"] = np.load 47 | else: 48 | print("single speaker fastspeech2!") 49 | self.collate_fn = fastspeech2_single_spk_batch_fn 50 | print("spk_num:", spk_num) 51 | 52 | # construct dataset for training and validation 53 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 54 | train_metadata = list(reader) 55 | self.train_dataset = DataTable( 56 | data=train_metadata, 57 | fields=fields, 58 | converters=converters, ) 59 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 60 | dev_metadata = list(reader) 61 | self.dev_dataset = DataTable( 62 | data=dev_metadata, 
63 | fields=fields, 64 | converters=converters, ) 65 | 66 | def train_dataloader(self): 67 | return DataLoader( 68 | dataset=self.train_dataset, 69 | batch_size=self.hparams.batch_size, 70 | num_workers=self.hparams.num_workers, 71 | pin_memory=self.hparams.pin_memory, 72 | shuffle=True, 73 | collate_fn=self.collate_fn, 74 | ) 75 | 76 | def val_dataloader(self): 77 | return DataLoader( 78 | dataset=self.dev_dataset, 79 | batch_size=self.hparams.batch_size, 80 | num_workers=self.hparams.num_workers, 81 | pin_memory=self.hparams.pin_memory, 82 | shuffle=False, 83 | collate_fn=self.collate_fn, 84 | ) 85 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/gan_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.vocoder_batch_fn import Clip 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class GanDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_max_steps: int, 17 | n_shift: int, 18 | aux_context_window: Optional[int] = 0, 19 | batch_size: int = 64, 20 | num_workers: int = 0, 21 | pin_memory: bool = False, 22 | ): 23 | super().__init__() 24 | self.save_hyperparameters(logger=False) 25 | self.train_dataset: Optional[Dataset] = None 26 | self.dev_dataset: Optional[Dataset] = None 27 | 28 | def setup(self, stage: Optional[str] = None) -> None: 29 | # construct dataset for training and validation 30 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 31 | train_metadata = list(reader) 32 | self.train_dataset = DataTable( 33 | data=train_metadata, 34 | fields=["wave", "feats"], 35 | converters={ 36 | "wave": np.load, 37 | "feats": np.load, 38 | }, ) 39 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 40 | dev_metadata = list(reader) 41 | self.dev_dataset = DataTable( 42 | data=dev_metadata, 43 | fields=["wave", "feats"], 44 | converters={ 45 | "wave": np.load, 46 | "feats": np.load, 47 | }, ) 48 | 49 | self.collate_fn = Clip( 50 | batch_max_steps=self.hparams.batch_max_steps, 51 | hop_size=self.hparams.n_shift, 52 | aux_context_window=self.hparams.aux_context_window) 53 | 54 | def train_dataloader(self): 55 | return DataLoader( 56 | dataset=self.train_dataset, 57 | batch_size=self.hparams.batch_size, 58 | num_workers=self.hparams.num_workers, 59 | pin_memory=self.hparams.pin_memory, 60 | shuffle=True, 61 | collate_fn=self.collate_fn, 62 | ) 63 | 64 | def val_dataloader(self): 65 | return DataLoader( 66 | dataset=self.dev_dataset, 67 | batch_size=self.hparams.batch_size, 68 | num_workers=self.hparams.num_workers, 69 | pin_memory=self.hparams.pin_memory, 70 | shuffle=False, 71 | collate_fn=self.collate_fn, 72 | ) 73 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/tacotron2_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn 9 | from 
deepaudio.tts.datasets.am_batch_fn import tacotron2_single_spk_batch_fn 10 | from deepaudio.tts.datasets.data_table import DataTable 11 | 12 | class Tacaotron2DataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_size: int = 64, 17 | num_workers: int = 0, 18 | pin_memory: bool = False, 19 | voice_cloning: Optional[bool] = False, 20 | ): 21 | super().__init__() 22 | self.save_hyperparameters(logger=False) 23 | self.train_dataset: Optional[Dataset] = None 24 | self.dev_dataset: Optional[Dataset] = None 25 | 26 | def setup(self, stage: Optional[str] = None) -> None: 27 | fields = [ 28 | "text", 29 | "text_lengths", 30 | "speech", 31 | "speech_lengths", 32 | ] 33 | 34 | converters = { 35 | "speech": np.load, 36 | } 37 | if self.hparams.voice_cloning: 38 | print("Training voice cloning!") 39 | self.collate_fn = tacotron2_multi_spk_batch_fn 40 | fields += ["spk_emb"] 41 | converters["spk_emb"] = np.load 42 | else: 43 | print("single speaker tacotron2!") 44 | self.collate_fn = tacotron2_single_spk_batch_fn 45 | 46 | # construct dataset for training and validation 47 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 48 | train_metadata = list(reader) 49 | self.train_dataset = DataTable( 50 | data=train_metadata, 51 | fields=fields, 52 | converters=converters, ) 53 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 54 | dev_metadata = list(reader) 55 | self.dev_dataset = DataTable( 56 | data=dev_metadata, 57 | fields=fields, 58 | converters=converters, ) 59 | 60 | def train_dataloader(self): 61 | return DataLoader( 62 | dataset=self.train_dataset, 63 | batch_size=self.hparams.batch_size, 64 | num_workers=self.hparams.num_workers, 65 | pin_memory=self.hparams.pin_memory, 66 | shuffle=True, 67 | collate_fn=self.collate_fn, 68 | ) 69 | 70 | def val_dataloader(self): 71 | return DataLoader( 72 | dataset=self.dev_dataset, 73 | batch_size=self.hparams.batch_size, 74 | num_workers=self.hparams.num_workers, 75 | pin_memory=self.hparams.pin_memory, 76 | shuffle=False, 77 | collate_fn=self.collate_fn, 78 | ) 79 | 80 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/transformer_tts_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import transformer_single_spk_batch_fn 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class TransformerTTSDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_size: int = 64, 17 | num_workers: int = 0, 18 | pin_memory: bool = False, 19 | ): 20 | super().__init__() 21 | self.save_hyperparameters(logger=False) 22 | self.train_dataset: Optional[Dataset] = None 23 | self.dev_dataset: Optional[Dataset] = None 24 | 25 | def setup(self, stage: Optional[str] = None) -> None: 26 | # construct dataset for training and validation 27 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 28 | train_metadata = list(reader) 29 | self.train_dataset = DataTable( 30 | data=train_metadata, 31 | fields=[ 32 | "text", 33 | "text_lengths", 34 | "speech", 35 | "speech_lengths", 36 | ], 37 | converters={ 38 | "speech": np.load, 39 | }, ) 40 | with 
jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 41 | dev_metadata = list(reader) 42 | self.dev_dataset = DataTable( 43 | data=dev_metadata, 44 | fields=[ 45 | "text", 46 | "text_lengths", 47 | "speech", 48 | "speech_lengths", 49 | ], 50 | converters={ 51 | "speech": np.load, 52 | }, ) 53 | 54 | def train_dataloader(self): 55 | return DataLoader( 56 | dataset=self.train_dataset, 57 | batch_size=self.hparams.batch_size, 58 | num_workers=self.hparams.num_workers, 59 | pin_memory=self.hparams.pin_memory, 60 | shuffle=True, 61 | collate_fn=transformer_single_spk_batch_fn, 62 | ) 63 | 64 | def val_dataloader(self): 65 | return DataLoader( 66 | dataset=self.dev_dataset, 67 | batch_size=self.hparams.batch_size, 68 | num_workers=self.hparams.num_workers, 69 | pin_memory=self.hparams.pin_memory, 70 | shuffle=False, 71 | collate_fn=transformer_single_spk_batch_fn, 72 | ) 73 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/vits_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.am_batch_fn import vits_single_spk_batch_fn 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class VitsDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_size: int = 64, 17 | num_workers: int = 0, 18 | pin_memory: bool = False, 19 | ): 20 | super().__init__() 21 | self.save_hyperparameters(logger=False) 22 | self.train_dataset: Optional[Dataset] = None 23 | self.dev_dataset: Optional[Dataset] = None 24 | 25 | def setup(self, stage: Optional[str] = None) -> None: 26 | # construct dataset for training and validation 27 | fields = ["text", "text_lengths", "feats", "feats_lengths", "wave"] 28 | 29 | converters = { 30 | "wave": np.load, 31 | "feats": np.load, 32 | } 33 | 34 | # construct dataset for training and validation 35 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 36 | train_metadata = list(reader) 37 | self.train_dataset = DataTable( 38 | data=train_metadata, 39 | fields=fields, 40 | converters=converters, ) 41 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 42 | dev_metadata = list(reader) 43 | self.dev_dataset = DataTable( 44 | data=dev_metadata, 45 | fields=fields, 46 | converters=converters, ) 47 | 48 | 49 | def train_dataloader(self): 50 | return DataLoader( 51 | dataset=self.train_dataset, 52 | batch_size=self.hparams.batch_size, 53 | num_workers=self.hparams.num_workers, 54 | pin_memory=self.hparams.pin_memory, 55 | shuffle=True, 56 | collate_fn=vits_single_spk_batch_fn, 57 | ) 58 | 59 | def val_dataloader(self): 60 | return DataLoader( 61 | dataset=self.dev_dataset, 62 | batch_size=self.hparams.batch_size, 63 | num_workers=self.hparams.num_workers, 64 | pin_memory=self.hparams.pin_memory, 65 | shuffle=False, 66 | collate_fn=vits_single_spk_batch_fn, 67 | ) 68 | -------------------------------------------------------------------------------- /deepaudio/tts/datamodules/wavernn_datamodule.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | import jsonlines 3 | import numpy as np 4 | 5 | from torch.utils.data import DataLoader, Dataset 6 | from 
pytorch_lightning import LightningDataModule 7 | 8 | from deepaudio.tts.datasets.vocoder_batch_fn import WaveRNNClip 9 | from deepaudio.tts.datasets.data_table import DataTable 10 | 11 | 12 | class WaveRNNDataModule(LightningDataModule): 13 | def __init__(self, 14 | train_metadata: str, 15 | dev_metadata: str, 16 | batch_max_steps: int, 17 | n_shift: int, 18 | mode: str, 19 | bits: int, 20 | aux_context_window: Optional[int] = 0, 21 | batch_size: int = 64, 22 | num_workers: int = 0, 23 | pin_memory: bool = False, 24 | ): 25 | super().__init__() 26 | self.save_hyperparameters(logger=False) 27 | self.train_dataset: Optional[Dataset] = None 28 | self.dev_dataset: Optional[Dataset] = None 29 | 30 | def setup(self, stage: Optional[str] = None) -> None: 31 | # construct dataset for training and validation 32 | with jsonlines.open(self.hparams.train_metadata, 'r') as reader: 33 | train_metadata = list(reader) 34 | self.train_dataset = DataTable( 35 | data=train_metadata, 36 | fields=["wave", "feats"], 37 | converters={ 38 | "wave": np.load, 39 | "feats": np.load, 40 | }, ) 41 | 42 | with jsonlines.open(self.hparams.dev_metadata, 'r') as reader: 43 | dev_metadata = list(reader) 44 | self.dev_dataset = DataTable( 45 | data=dev_metadata, 46 | fields=["wave", "feats"], 47 | converters={ 48 | "wave": np.load, 49 | "feats": np.load, 50 | }, ) 51 | 52 | self.collate_fn = WaveRNNClip( 53 | mode=self.hparams.mode, 54 | aux_context_window=self.hparams.aux_context_window, 55 | hop_size=self.hparams.n_shift, 56 | batch_max_steps=self.hparams.batch_max_steps, 57 | bits=self.hparams.bits) 58 | 59 | def train_dataloader(self): 60 | return DataLoader( 61 | dataset=self.train_dataset, 62 | batch_size=self.hparams.batch_size, 63 | num_workers=self.hparams.num_workers, 64 | pin_memory=self.hparams.pin_memory, 65 | shuffle=True, 66 | collate_fn=self.collate_fn, 67 | ) 68 | 69 | def val_dataloader(self): 70 | return DataLoader( 71 | dataset=self.dev_dataset, 72 | batch_size=self.hparams.batch_size, 73 | num_workers=self.hparams.num_workers, 74 | pin_memory=self.hparams.pin_memory, 75 | shuffle=False, 76 | collate_fn=self.collate_fn, 77 | ) 78 | -------------------------------------------------------------------------------- /deepaudio/tts/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .ljspeech import * 15 | -------------------------------------------------------------------------------- /deepaudio/tts/datasets/ljspeech.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from pathlib import Path 15 | 16 | from torch.utils.data import Dataset 17 | 18 | __all__ = ["LJSpeechMetaData"] 19 | 20 | 21 | class LJSpeechMetaData(Dataset): 22 | def __init__(self, root): 23 | self.root = Path(root).expanduser() 24 | wav_dir = self.root / "wavs" 25 | csv_path = self.root / "metadata.csv" 26 | records = [] 27 | speaker_name = "ljspeech" 28 | with open(str(csv_path), 'rt', encoding='utf-8') as f: 29 | for line in f: 30 | filename, _, normalized_text = line.strip().split("|") 31 | filename = str(wav_dir / (filename + ".wav")) 32 | records.append([filename, normalized_text, speaker_name]) 33 | self.records = records 34 | 35 | def __getitem__(self, i): 36 | return self.records[i] 37 | 38 | def __len__(self): 39 | return len(self.records) 40 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepaudio/deepaudio-tts/4bf986616438f77bffdf1859caed230e4d4f4030/deepaudio/tts/feats_extract_from_torch/__init__.py -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/abs_feats_extract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, Tuple 3 | 4 | import torch 5 | 6 | 7 | class AbsFeatsExtract(torch.nn.Module, ABC): 8 | @abstractmethod 9 | def output_size(self) -> int: 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def get_parameters(self) -> Dict[str, Any]: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def forward( 18 | self, input: torch.Tensor, input_lengths: torch.Tensor 19 | ) -> Tuple[torch.Tensor, torch.Tensor]: 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/linear_spectrogram.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from deepaudio.tts.feats_extract_from_torch.stft import Stft 7 | from deepaudio.tts.feats_extract_from_torch.abs_feats_extract import AbsFeatsExtract 8 | 9 | 10 | class LinearSpectrogram(AbsFeatsExtract): 11 | """Linear amplitude spectrogram. 
12 | 13 | Stft -> amplitude-spec 14 | """ 15 | 16 | def __init__( 17 | self, 18 | n_fft: int = 1024, 19 | win_length: int = None, 20 | hop_length: int = 256, 21 | window: Optional[str] = "hann", 22 | center: bool = True, 23 | normalized: bool = False, 24 | onesided: bool = True, 25 | ): 26 | assert check_argument_types() 27 | super().__init__() 28 | self.n_fft = n_fft 29 | self.hop_length = hop_length 30 | self.win_length = win_length 31 | self.window = window 32 | self.stft = Stft( 33 | n_fft=n_fft, 34 | win_length=win_length, 35 | hop_length=hop_length, 36 | window=window, 37 | center=center, 38 | normalized=normalized, 39 | onesided=onesided, 40 | ) 41 | self.n_fft = n_fft 42 | 43 | def output_size(self) -> int: 44 | return self.n_fft // 2 + 1 45 | 46 | def get_parameters(self) -> Dict[str, Any]: 47 | """Return the parameters required by Vocoder.""" 48 | return dict( 49 | n_fft=self.n_fft, 50 | n_shift=self.hop_length, 51 | win_length=self.win_length, 52 | window=self.window, 53 | ) 54 | 55 | def forward( 56 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 57 | ) -> Tuple[torch.Tensor, torch.Tensor]: 58 | # 1. Stft: time -> time-freq 59 | input_stft, feats_lens = self.stft(input, input_lengths) 60 | 61 | assert input_stft.dim() >= 4, input_stft.shape 62 | # "2" refers to the real/imag parts of Complex 63 | assert input_stft.shape[-1] == 2, input_stft.shape 64 | 65 | # STFT -> Power spectrum -> Amp spectrum 66 | # input_stft: (..., F, 2) -> (..., F) 67 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 68 | input_amp = torch.sqrt(torch.clamp(input_power, min=1.0e-10)) 69 | return input_amp, feats_lens 70 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/log_mel.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import librosa 4 | import torch 5 | 6 | from deepaudio.tts.modules.nets_utils import make_pad_mask 7 | 8 | 9 | class LogMel(torch.nn.Module): 10 | """Convert STFT to fbank feats 11 | 12 | The arguments is same as librosa.filters.mel 13 | 14 | Args: 15 | fs: number > 0 [scalar] sampling rate of the incoming signal 16 | n_fft: int > 0 [scalar] number of FFT components 17 | n_mels: int > 0 [scalar] number of Mel bands to generate 18 | fmin: float >= 0 [scalar] lowest frequency (in Hz) 19 | fmax: float >= 0 [scalar] highest frequency (in Hz). 20 | If `None`, use `fmax = fs / 2.0` 21 | htk: use HTK formula instead of Slaney 22 | """ 23 | 24 | def __init__( 25 | self, 26 | fs: int = 16000, 27 | n_fft: int = 512, 28 | n_mels: int = 80, 29 | fmin: float = None, 30 | fmax: float = None, 31 | htk: bool = False, 32 | log_base: float = None, 33 | ): 34 | super().__init__() 35 | 36 | fmin = 0 if fmin is None else fmin 37 | fmax = fs / 2 if fmax is None else fmax 38 | _mel_options = dict( 39 | sr=fs, 40 | n_fft=n_fft, 41 | n_mels=n_mels, 42 | fmin=fmin, 43 | fmax=fmax, 44 | htk=htk, 45 | ) 46 | self.mel_options = _mel_options 47 | self.log_base = log_base 48 | 49 | # Note(kamo): The mel matrix of librosa is different from kaldi. 
50 | melmat = librosa.filters.mel(**_mel_options) 51 | # melmat: (D2, D1) -> (D1, D2) 52 | self.register_buffer("melmat", torch.from_numpy(melmat.T).float()) 53 | 54 | def extra_repr(self): 55 | return ", ".join(f"{k}={v}" for k, v in self.mel_options.items()) 56 | 57 | def forward( 58 | self, 59 | feat: torch.Tensor, 60 | ilens: torch.Tensor = None, 61 | ) -> Tuple[torch.Tensor, torch.Tensor]: 62 | # feat: (B, T, D1) x melmat: (D1, D2) -> mel_feat: (B, T, D2) 63 | mel_feat = torch.matmul(feat, self.melmat) 64 | mel_feat = torch.clamp(mel_feat, min=1e-10) 65 | 66 | if self.log_base is None: 67 | logmel_feat = mel_feat.log() 68 | elif self.log_base == 2.0: 69 | logmel_feat = mel_feat.log2() 70 | elif self.log_base == 10.0: 71 | logmel_feat = mel_feat.log10() 72 | else: 73 | logmel_feat = mel_feat.log() / torch.log(self.log_base) 74 | 75 | # Zero padding 76 | if ilens is not None: 77 | logmel_feat = logmel_feat.masked_fill( 78 | make_pad_mask(ilens, logmel_feat, 1), 0.0 79 | ) 80 | else: 81 | ilens = feat.new_full( 82 | [feat.size(0)], fill_value=feat.size(1), dtype=torch.long 83 | ) 84 | return logmel_feat, ilens 85 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/log_mel_fbank.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple, Union 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from deepaudio.tts.feats_extract_from_torch.log_mel import LogMel 7 | from deepaudio.tts.feats_extract_from_torch.stft import Stft 8 | from deepaudio.tts.feats_extract_from_torch.abs_feats_extract import AbsFeatsExtract 9 | 10 | 11 | 12 | class LogMelFbank(AbsFeatsExtract): 13 | """Conventional frontend structure for TTS. 14 | 15 | Stft -> amplitude-spec -> Log-Mel-Fbank 16 | """ 17 | 18 | def __init__( 19 | self, 20 | fs: int = 16000, 21 | n_fft: int = 1024, 22 | win_length: int = None, 23 | hop_length: int = 256, 24 | window: Optional[str] = "hann", 25 | center: bool = True, 26 | normalized: bool = False, 27 | onesided: bool = True, 28 | n_mels: int = 80, 29 | fmin: Optional[int] = 80, 30 | fmax: Optional[int] = 7600, 31 | htk: bool = False, 32 | log_base: Optional[float] = 10.0, 33 | ): 34 | assert check_argument_types() 35 | super().__init__() 36 | 37 | self.fs = fs 38 | self.n_mels = n_mels 39 | self.n_fft = n_fft 40 | self.hop_length = hop_length 41 | self.win_length = win_length 42 | self.window = window 43 | self.fmin = fmin 44 | self.fmax = fmax 45 | 46 | self.stft = Stft( 47 | n_fft=n_fft, 48 | win_length=win_length, 49 | hop_length=hop_length, 50 | window=window, 51 | center=center, 52 | normalized=normalized, 53 | onesided=onesided, 54 | ) 55 | 56 | self.logmel = LogMel( 57 | fs=fs, 58 | n_fft=n_fft, 59 | n_mels=n_mels, 60 | fmin=fmin, 61 | fmax=fmax, 62 | htk=htk, 63 | log_base=log_base, 64 | ) 65 | 66 | def output_size(self) -> int: 67 | return self.n_mels 68 | 69 | def get_parameters(self) -> Dict[str, Any]: 70 | """Return the parameters required by Vocoder""" 71 | return dict( 72 | fs=self.fs, 73 | n_fft=self.n_fft, 74 | n_shift=self.hop_length, 75 | window=self.window, 76 | n_mels=self.n_mels, 77 | win_length=self.win_length, 78 | fmin=self.fmin, 79 | fmax=self.fmax, 80 | ) 81 | 82 | def forward( 83 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 84 | ) -> Tuple[torch.Tensor, torch.Tensor]: 85 | # 1. Domain-conversion: e.g. 
Stft: time -> time-freq 86 | input_stft, feats_lens = self.stft(input, input_lengths) 87 | 88 | assert input_stft.dim() >= 4, input_stft.shape 89 | # "2" refers to the real/imag parts of Complex 90 | assert input_stft.shape[-1] == 2, input_stft.shape 91 | 92 | # NOTE(kamo): We use different definition for log-spec between TTS and ASR 93 | # TTS: log_10(abs(stft)) 94 | # ASR: log_e(power(stft)) 95 | 96 | # input_stft: (..., F, 2) -> (..., F) 97 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 98 | input_amp = torch.sqrt(torch.clamp(input_power, min=1.0e-10)) 99 | input_feats, _ = self.logmel(input_amp, feats_lens) 100 | return input_feats, feats_lens 101 | -------------------------------------------------------------------------------- /deepaudio/tts/feats_extract_from_torch/log_spectrogram.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional, Tuple 2 | 3 | import torch 4 | from typeguard import check_argument_types 5 | 6 | from deepaudio.tts.feats_extract_from_torch.stft import Stft 7 | from deepaudio.tts.feats_extract_from_torch.abs_feats_extract import AbsFeatsExtract 8 | 9 | 10 | class LogSpectrogram(AbsFeatsExtract): 11 | """Conventional frontend structure for ASR 12 | 13 | Stft -> log-amplitude-spec 14 | """ 15 | 16 | def __init__( 17 | self, 18 | n_fft: int = 1024, 19 | win_length: int = None, 20 | hop_length: int = 256, 21 | window: Optional[str] = "hann", 22 | center: bool = True, 23 | normalized: bool = False, 24 | onesided: bool = True, 25 | ): 26 | assert check_argument_types() 27 | super().__init__() 28 | self.n_fft = n_fft 29 | self.hop_length = hop_length 30 | self.win_length = win_length 31 | self.window = window 32 | self.stft = Stft( 33 | n_fft=n_fft, 34 | win_length=win_length, 35 | hop_length=hop_length, 36 | window=window, 37 | center=center, 38 | normalized=normalized, 39 | onesided=onesided, 40 | ) 41 | self.n_fft = n_fft 42 | 43 | def output_size(self) -> int: 44 | return self.n_fft // 2 + 1 45 | 46 | def get_parameters(self) -> Dict[str, Any]: 47 | """Return the parameters required by Vocoder""" 48 | return dict( 49 | n_fft=self.n_fft, 50 | n_shift=self.hop_length, 51 | win_length=self.win_length, 52 | window=self.window, 53 | ) 54 | 55 | def forward( 56 | self, input: torch.Tensor, input_lengths: torch.Tensor = None 57 | ) -> Tuple[torch.Tensor, torch.Tensor]: 58 | # 1. Stft: time -> time-freq 59 | input_stft, feats_lens = self.stft(input, input_lengths) 60 | 61 | assert input_stft.dim() >= 4, input_stft.shape 62 | # "2" refers to the real/imag parts of Complex 63 | assert input_stft.shape[-1] == 2, input_stft.shape 64 | 65 | # NOTE(kamo): We use different definition for log-spec between TTS and ASR 66 | # TTS: log_10(abs(stft)) 67 | # ASR: log_e(power(stft)) 68 | 69 | # STFT -> Power spectrum 70 | # input_stft: (..., F, 2) -> (..., F) 71 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 72 | log_amp = 0.5 * torch.log10(torch.clamp(input_power, min=1.0e-10)) 73 | return log_amp, feats_lens 74 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .generate_lexicon import * 15 | from .normalizer import * 16 | from .phonectic import * 17 | from .punctuation import * 18 | from .tone_sandhi import * 19 | from .vocab import * 20 | from .zh_normalization import * 21 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from paddlespeech.t2s.frontend.normalizer.normalizer import * 15 | from paddlespeech.t2s.frontend.normalizer.numbers import * 16 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/abbrrviation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/acronyms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | import unicodedata 16 | from builtins import str as unicode 17 | 18 | from paddlespeech.t2s.frontend.normalizer.numbers import normalize_numbers 19 | 20 | 21 | def normalize(sentence): 22 | """ Normalize English text. 23 | """ 24 | # preprocessing 25 | sentence = unicode(sentence) 26 | sentence = normalize_numbers(sentence) 27 | sentence = ''.join( 28 | char for char in unicodedata.normalize('NFD', sentence) 29 | if unicodedata.category(char) != 'Mn') # Strip accents 30 | sentence = sentence.lower() 31 | sentence = re.sub(r"[^ a-z'.,?!\-]", "", sentence) 32 | sentence = sentence.replace("i.e.", "that is") 33 | sentence = sentence.replace("e.g.", "for example") 34 | return sentence 35 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/numbers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # number expansion is not that easy 15 | import re 16 | 17 | import inflect 18 | 19 | _inflect = inflect.engine() 20 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 21 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 22 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 23 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 24 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 25 | _number_re = re.compile(r'[0-9]+') 26 | 27 | 28 | def _remove_commas(m): 29 | return m.group(1).replace(',', '') 30 | 31 | 32 | def _expand_decimal_point(m): 33 | return m.group(1).replace('.', ' point ') 34 | 35 | 36 | def _expand_dollars(m): 37 | match = m.group(1) 38 | parts = match.split('.') 39 | if len(parts) > 2: 40 | return match + ' dollars' # Unexpected format 41 | dollars = int(parts[0]) if parts[0] else 0 42 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 43 | if dollars and cents: 44 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 45 | cent_unit = 'cent' if cents == 1 else 'cents' 46 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 47 | elif dollars: 48 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 49 | return '%s %s' % (dollars, dollar_unit) 50 | elif cents: 51 | cent_unit = 'cent' if cents == 1 else 'cents' 52 | return '%s %s' % (cents, cent_unit) 53 | else: 54 | return 'zero dollars' 55 | 56 | 57 | def _expand_ordinal(m): 58 | return _inflect.number_to_words(m.group(0)) 59 | 60 | 61 | def _expand_number(m): 62 | num = int(m.group(0)) 63 | if num > 1000 and num < 3000: 64 | if num == 2000: 65 | return 'two thousand' 66 | elif num > 2000 and num < 2010: 67 | return 'two thousand ' + _inflect.number_to_words(num % 100) 68 | elif num % 100 == 0: 69 | return _inflect.number_to_words(num // 100) + ' hundred' 70 | else: 71 | return _inflect.number_to_words( 72 | num, andword='', zero='oh', group=2).replace(', ', ' ') 73 | else: 74 | return _inflect.number_to_words(num, andword='') 75 | 76 | 77 | def normalize_numbers(text): 78 | """ Normalize numbers in English text. 79 | """ 80 | text = re.sub(_comma_number_re, _remove_commas, text) 81 | text = re.sub(_pounds_re, r'\1 pounds', text) 82 | text = re.sub(_dollars_re, _expand_dollars, text) 83 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 84 | text = re.sub(_ordinal_re, _expand_ordinal, text) 85 | text = re.sub(_number_re, _expand_number, text) 86 | return text 87 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/normalizer/width.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | def full2half_width(ustr): 17 | half = [] 18 | for u in ustr: 19 | num = ord(u) 20 | if num == 0x3000: # 全角空格变半角 21 | num = 32 22 | elif 0xFF01 <= num <= 0xFF5E: 23 | num -= 0xfee0 24 | u = chr(num) 25 | half.append(u) 26 | return ''.join(half) 27 | 28 | 29 | def half2full_width(ustr): 30 | full = [] 31 | for u in ustr: 32 | num = ord(u) 33 | if num == 32: # 半角空格变全角 34 | num = 0x3000 35 | elif 0x21 <= num <= 0x7E: 36 | num += 0xfee0 37 | u = chr(num) # to unicode 38 | full.append(u) 39 | 40 | return ''.join(full) 41 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/punctuation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | __all__ = ["get_punctuations"] 16 | 17 | EN_PUNCT = [ 18 | " ", 19 | "-", 20 | "...", 21 | ",", 22 | ".", 23 | "?", 24 | "!", 25 | ] 26 | 27 | CN_PUNCT = ["、", ",", ";", ":", "。", "?", "!"] 28 | 29 | 30 | def get_punctuations(lang): 31 | if lang == "en": 32 | return EN_PUNCT 33 | elif lang == "cn": 34 | return CN_PUNCT 35 | else: 36 | raise ValueError(f"language {lang} Not supported") 37 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/vocab.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from collections import OrderedDict 15 | from typing import Iterable 16 | 17 | __all__ = ["Vocab"] 18 | 19 | 20 | class Vocab(object): 21 | """ Vocabulary. 22 | 23 | Args: 24 | symbols (Iterable[str]): Common symbols. 25 | padding_symbol (str, optional): Symbol for pad. Defaults to "". 26 | unk_symbol (str, optional): Symbol for unknow. Defaults to "" 27 | start_symbol (str, optional): Symbol for start. Defaults to "" 28 | end_symbol (str, optional): Symbol for end. 
Defaults to "" 29 | """ 30 | 31 | def __init__(self, 32 | symbols: Iterable[str], 33 | padding_symbol="", 34 | unk_symbol="", 35 | start_symbol="", 36 | end_symbol=""): 37 | self.special_symbols = OrderedDict() 38 | for i, item in enumerate( 39 | [padding_symbol, unk_symbol, start_symbol, end_symbol]): 40 | if item: 41 | self.special_symbols[item] = len(self.special_symbols) 42 | 43 | self.padding_symbol = padding_symbol 44 | self.unk_symbol = unk_symbol 45 | self.start_symbol = start_symbol 46 | self.end_symbol = end_symbol 47 | 48 | self.stoi = OrderedDict() 49 | self.stoi.update(self.special_symbols) 50 | 51 | for i, s in enumerate(symbols): 52 | if s not in self.stoi: 53 | self.stoi[s] = len(self.stoi) 54 | self.itos = {v: k for k, v in self.stoi.items()} 55 | 56 | def __len__(self): 57 | return len(self.stoi) 58 | 59 | @property 60 | def num_specials(self): 61 | """ The number of special symbols. 62 | """ 63 | return len(self.special_symbols) 64 | 65 | # special tokens 66 | @property 67 | def padding_index(self): 68 | """ The index of padding symbol 69 | """ 70 | return self.stoi.get(self.padding_symbol, -1) 71 | 72 | @property 73 | def unk_index(self): 74 | """The index of unknow symbol. 75 | """ 76 | return self.stoi.get(self.unk_symbol, -1) 77 | 78 | @property 79 | def start_index(self): 80 | """The index of start symbol. 81 | """ 82 | return self.stoi.get(self.start_symbol, -1) 83 | 84 | @property 85 | def end_index(self): 86 | """ The index of end symbol. 87 | """ 88 | return self.stoi.get(self.end_symbol, -1) 89 | 90 | def __repr__(self): 91 | fmt = "Vocab(size: {},\nstoi:\n{})" 92 | return fmt.format(len(self), self.stoi) 93 | 94 | def __str__(self): 95 | return self.__repr__() 96 | 97 | def lookup(self, symbol): 98 | """ The index that symbol correspond. 99 | """ 100 | return self.stoi[symbol] 101 | 102 | def reverse(self, index): 103 | """ The symbol thar index cottespond. 104 | """ 105 | return self.itos[index] 106 | 107 | def add_symbol(self, symbol): 108 | """ Add a new symbol in vocab. 109 | """ 110 | if symbol in self.stoi: 111 | return 112 | N = len(self.stoi) 113 | self.stoi[symbol] = N 114 | self.itos[N] = symbol 115 | 116 | def add_symbols(self, symbols): 117 | """ Add multiple symbols in vocab. 118 | """ 119 | for symbol in symbols: 120 | self.add_symbol(symbol) 121 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/README.md: -------------------------------------------------------------------------------- 1 | ## Supported NSW (Non-Standard-Word) Normalization 2 | 3 | |NSW type|raw|normalized| 4 | |:--|:-|:-| 5 | |serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| 6 | |cardinal|这块黄金重达324.75克
我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分| 7 | |numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二| 8 | |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日| 9 | |time|等会请在12:05请通知我|等会请在十二点零五分请通知我| 10 | |temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度| 11 | |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| 12 | |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| 13 | |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五,三十四点五元,二十点一万| 14 | |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二
这是手机八六一八五四四一三九一二一| 15 | ## References 16 | [Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files) 17 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from paddlespeech.t2s.frontend.zh_normalization.text_normlization import * 15 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/chronology.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import DIGITS 17 | from .num import num2str 18 | from .num import verbalize_cardinal 19 | from .num import verbalize_digit 20 | 21 | 22 | def _time_num2str(num_string: str) -> str: 23 | """A special case for verbalizing number in time.""" 24 | result = num2str(num_string.lstrip('0')) 25 | if num_string.startswith('0'): 26 | result = DIGITS['0'] + result 27 | return result 28 | 29 | 30 | # 时刻表达式 31 | RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' 32 | r':([0-5][0-9])' 33 | r'(:([0-5][0-9]))?') 34 | 35 | # 时间范围,如8:30-12:30 36 | RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' 37 | r':([0-5][0-9])' 38 | r'(:([0-5][0-9]))?' 
39 | r'(~|-)' 40 | r'([0-1]?[0-9]|2[0-3])' 41 | r':([0-5][0-9])' 42 | r'(:([0-5][0-9]))?') 43 | 44 | 45 | def replace_time(match) -> str: 46 | """ 47 | Args: 48 | match (re.Match) 49 | Returns: 50 | str 51 | """ 52 | 53 | is_range = len(match.groups()) > 5 54 | 55 | hour = match.group(1) 56 | minute = match.group(2) 57 | second = match.group(4) 58 | 59 | if is_range: 60 | hour_2 = match.group(6) 61 | minute_2 = match.group(7) 62 | second_2 = match.group(9) 63 | 64 | result = f"{num2str(hour)}点" 65 | if minute.lstrip('0'): 66 | if int(minute) == 30: 67 | result += "半" 68 | else: 69 | result += f"{_time_num2str(minute)}分" 70 | if second and second.lstrip('0'): 71 | result += f"{_time_num2str(second)}秒" 72 | 73 | if is_range: 74 | result += "至" 75 | result += f"{num2str(hour_2)}点" 76 | if minute_2.lstrip('0'): 77 | if int(minute) == 30: 78 | result += "半" 79 | else: 80 | result += f"{_time_num2str(minute_2)}分" 81 | if second_2 and second_2.lstrip('0'): 82 | result += f"{_time_num2str(second_2)}秒" 83 | 84 | return result 85 | 86 | 87 | RE_DATE = re.compile(r'(\d{4}|\d{2})年' 88 | r'((0?[1-9]|1[0-2])月)?' 89 | r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?') 90 | 91 | 92 | def replace_date(match) -> str: 93 | """ 94 | Args: 95 | match (re.Match) 96 | Returns: 97 | str 98 | """ 99 | year = match.group(1) 100 | month = match.group(3) 101 | day = match.group(5) 102 | result = "" 103 | if year: 104 | result += f"{verbalize_digit(year)}年" 105 | if month: 106 | result += f"{verbalize_cardinal(month)}月" 107 | if day: 108 | result += f"{verbalize_cardinal(day)}{match.group(9)}" 109 | return result 110 | 111 | 112 | # 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期 113 | RE_DATE2 = re.compile( 114 | r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])') 115 | 116 | 117 | def replace_date2(match) -> str: 118 | """ 119 | Args: 120 | match (re.Match) 121 | Returns: 122 | str 123 | """ 124 | year = match.group(1) 125 | month = match.group(3) 126 | day = match.group(4) 127 | result = "" 128 | if year: 129 | result += f"{verbalize_digit(year)}年" 130 | if month: 131 | result += f"{verbalize_cardinal(month)}月" 132 | if day: 133 | result += f"{verbalize_cardinal(day)}日" 134 | return result 135 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import re 15 | import string 16 | 17 | from pypinyin.constants import SUPPORT_UCS4 18 | 19 | # 全角半角转换 20 | # 英文字符全角 -> 半角映射表 (num: 52) 21 | F2H_ASCII_LETTERS = { 22 | chr(ord(char) + 65248): char 23 | for char in string.ascii_letters 24 | } 25 | 26 | # 英文字符半角 -> 全角映射表 27 | H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} 28 | 29 | # 数字字符全角 -> 半角映射表 (num: 10) 30 | F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits} 31 | # 数字字符半角 -> 全角映射表 32 | H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} 33 | 34 | # 标点符号全角 -> 半角映射表 (num: 32) 35 | F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation} 36 | # 标点符号半角 -> 全角映射表 37 | H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} 38 | 39 | # 空格 (num: 1) 40 | F2H_SPACE = {'\u3000': ' '} 41 | H2F_SPACE = {' ': '\u3000'} 42 | 43 | # 非"有拼音的汉字"的字符串,可用于NSW提取 44 | if SUPPORT_UCS4: 45 | RE_NSW = re.compile(r'(?:[^' 46 | r'\u3007' # 〇 47 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 48 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 49 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 50 | r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF] 51 | r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F] 52 | r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D] 53 | r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F] 54 | r'])+') 55 | else: 56 | RE_NSW = re.compile( # pragma: no cover 57 | r'(?:[^' 58 | r'\u3007' # 〇 59 | r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF] 60 | r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF] 61 | r'\uf900-\ufaff' # CJK兼容:[F900-FAFF] 62 | r'])+') 63 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/phonecode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import verbalize_digit 17 | 18 | # 规范化固话/手机号码 19 | # 手机 20 | # http://www.jihaoba.com/news/show/13680 21 | # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 22 | # 联通:130、131、132、156、155、186、185、176 23 | # 电信:133、153、189、180、181、177 24 | RE_MOBILE_PHONE = re.compile( 25 | r"(? 
str: 34 | if mobile: 35 | sp_parts = phone_string.strip('+').split() 36 | result = ','.join( 37 | [verbalize_digit(part, alt_one=True) for part in sp_parts]) 38 | return result 39 | else: 40 | sil_parts = phone_string.split('-') 41 | result = ','.join( 42 | [verbalize_digit(part, alt_one=True) for part in sil_parts]) 43 | return result 44 | 45 | 46 | def replace_phone(match) -> str: 47 | """ 48 | Args: 49 | match (re.Match) 50 | Returns: 51 | str 52 | """ 53 | return phone2str(match.group(0), mobile=False) 54 | 55 | 56 | def replace_mobile(match) -> str: 57 | """ 58 | Args: 59 | match (re.Match) 60 | Returns: 61 | str 62 | """ 63 | return phone2str(match.group(0)) 64 | -------------------------------------------------------------------------------- /deepaudio/tts/frontend/zh_normalization/quantifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from .num import num2str 17 | 18 | # 温度表达式,温度会影响负号的读法 19 | # -3°C 零下三度 20 | RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') 21 | 22 | 23 | def replace_temperature(match) -> str: 24 | """ 25 | Args: 26 | match (re.Match) 27 | Returns: 28 | str 29 | """ 30 | sign = match.group(1) 31 | temperature = match.group(2) 32 | unit = match.group(3) 33 | sign: str = "零下" if sign else "" 34 | temperature: str = num2str(temperature) 35 | unit: str = "摄氏度" if unit == "摄氏度" else "度" 36 | result = f"{sign}{temperature}{unit}" 37 | return result 38 | -------------------------------------------------------------------------------- /deepaudio/tts/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | from .base import BasePLModel 5 | 6 | 7 | MODEL_REGISTRY = dict() 8 | MODEL_DATACLASS_REGISTRY = dict() 9 | 10 | 11 | def register_model(name: str, dataclass=None): 12 | r""" 13 | New model types can be added to OpenSpeech with the :func:`register_model` function decorator. 14 | 15 | For example:: 16 | @register_model('conformer_lstm') 17 | class ConformerLSTMModel(OpenspeechModel): 18 | (...) 19 | 20 | .. note:: All models must implement the :class:`cls.__name__` interface. 
21 | 22 | Args: 23 | name (str): the name of the model 24 | """ 25 | 26 | def register_model_cls(cls): 27 | if name in MODEL_REGISTRY: 28 | raise ValueError(f"Cannot register duplicate model ({name})") 29 | if not issubclass(cls, BasePLModel): 30 | raise ValueError(f"Model ({name}: {cls.__name__}) must extend BaseModel") 31 | 32 | MODEL_REGISTRY[name] = cls 33 | 34 | cls.__dataclass = dataclass 35 | if dataclass is not None: 36 | if name in MODEL_DATACLASS_REGISTRY: 37 | raise ValueError(f"Cannot register duplicate model ({name})") 38 | MODEL_DATACLASS_REGISTRY[name] = dataclass 39 | 40 | return cls 41 | 42 | return register_model_cls 43 | 44 | 45 | # automatically import any Python files in the models/ directory 46 | models_dir = os.path.dirname(__file__) 47 | for file in os.listdir(models_dir): 48 | if os.path.isdir(os.path.join(models_dir, file)) and not file.startswith('__'): 49 | for subfile in os.listdir(os.path.join(models_dir, file)): 50 | path = os.path.join(models_dir, file, subfile) 51 | if subfile.endswith(".py"): 52 | python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile 53 | module = importlib.import_module(f"deepaudio.tts.models.{file}.{python_file}") 54 | continue 55 | 56 | path = os.path.join(models_dir, file) 57 | if file.endswith(".py"): 58 | model_name = file[: file.find(".py")] if file.endswith(".py") else file 59 | module = importlib.import_module(f"deepaudio.tts.models.{model_name}") -------------------------------------------------------------------------------- /deepaudio/tts/models/fastspeech2/__init__.py: -------------------------------------------------------------------------------- 1 | from .fastspeech2 import * -------------------------------------------------------------------------------- /deepaudio/tts/models/fastspeech2/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, nn 3 | from pytorch_lightning import LightningModule 4 | 5 | from deepaudio.tts.models.fastspeech2 import FastSpeech2 6 | from deepaudio.tts.models.fastspeech2.loss import FastSpeech2Loss 7 | 8 | 9 | class Fastspeech2Model(LightningModule): 10 | def __init__(self, 11 | model: FastSpeech2, 12 | optimizer: torch.optim.Optimizer, 13 | scheduler: torch.optim.lr_scheduler, ): 14 | super(Fastspeech2Model, self).__init__() 15 | 16 | self.save_hyperparameters(logger=False, ignore=["model"]) 17 | self.model = model 18 | self.criterion = FastSpeech2Loss() 19 | 20 | def step(self, batch): 21 | # spk_id!=None in multiple spk fastspeech2 22 | spk_id = batch["spk_id"] if "spk_id" in batch else None 23 | spk_emb = batch["spk_emb"] if "spk_emb" in batch else None 24 | lang_id = batch["lang_id"] if "lang_id" in batch else None 25 | # No explicit speaker identifier labels are used during voice cloning training. 
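# (when an utterance-level speaker embedding is supplied, it already determines the voice,
# so the integer speaker id is dropped to avoid conflicting conditioning)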
26 | if spk_emb is not None: 27 | spk_id = None 28 | 29 | outs = self.model( 30 | text=batch["text"], 31 | text_lengths=batch["text_lengths"], 32 | feats=batch["speech"], 33 | feats_lengths=batch["speech_lengths"], 34 | durations=batch["durations"], 35 | pitch=batch["pitch"], 36 | energy=batch["energy"], 37 | sids=spk_id, 38 | spembs=spk_emb, 39 | lids=lang_id, 40 | ) 41 | return outs 42 | 43 | def training_step(self, batch: dict, batch_idx: int): 44 | before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.step(batch) 45 | l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( 46 | after_outs=after_outs, 47 | before_outs=before_outs, 48 | d_outs=d_outs, 49 | p_outs=p_outs, 50 | e_outs=e_outs, 51 | ys=ys, 52 | ds=batch["durations"], 53 | ps=batch["pitch"], 54 | es=batch["energy"], 55 | ilens=batch["text_lengths"], 56 | olens=olens) 57 | 58 | loss = l1_loss + duration_loss + pitch_loss + energy_loss 59 | return {'loss': loss} 60 | 61 | def validation_step(self, batch: dict, batch_idx: int): 62 | before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.step(batch) 63 | l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( 64 | after_outs=after_outs, 65 | before_outs=before_outs, 66 | d_outs=d_outs, 67 | p_outs=p_outs, 68 | e_outs=e_outs, 69 | ys=ys, 70 | ds=batch["durations"], 71 | ps=batch["pitch"], 72 | es=batch["energy"], 73 | ilens=batch["text_lengths"], 74 | olens=olens) 75 | 76 | loss = l1_loss + duration_loss + pitch_loss + energy_loss 77 | self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=True) 78 | return {'val_loss': loss} 79 | 80 | def configure_optimizers(self): 81 | optimizer = self.hparams.optimizer(params=self.parameters()) 82 | scheduler = self.hparams.scheduler(optimizer=optimizer) 83 | 84 | return { 85 | "optimizer": optimizer, 86 | "lr_scheduler": { 87 | "scheduler": scheduler, 88 | "monitor": "val/loss", 89 | "interval": "epoch", 90 | "frequency": 1, 91 | }, 92 | } 93 | -------------------------------------------------------------------------------- /deepaudio/tts/models/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .hifigan import * -------------------------------------------------------------------------------- /deepaudio/tts/models/hifigan/residual_block.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """HiFiGAN Residual block modules. 5 | 6 | This code is modified from https://github.com/kan-bayashi/ParallelWaveGAN. 7 | 8 | """ 9 | 10 | from typing import Any, Dict, List 11 | 12 | import torch 13 | 14 | 15 | class ResidualBlock(torch.nn.Module): 16 | """Residual block module in HiFiGAN.""" 17 | 18 | def __init__( 19 | self, 20 | kernel_size: int = 3, 21 | channels: int = 512, 22 | dilations: List[int] = [1, 3, 5], 23 | bias: bool = True, 24 | use_additional_convs: bool = True, 25 | nonlinear_activation: str = "LeakyReLU", 26 | nonlinear_activation_params: Dict[str, Any] = {"negative_slope": 0.1}, 27 | ): 28 | """Initialize ResidualBlock module. 29 | 30 | Args: 31 | kernel_size (int): Kernel size of dilation convolution layer. 32 | channels (int): Number of channels for convolution layer. 33 | dilations (List[int]): List of dilation factors. 34 | use_additional_convs (bool): Whether to use additional convolution layers. 35 | bias (bool): Whether to add bias parameter in convolution layers. 
36 | nonlinear_activation (str): Activation function module name. 37 | nonlinear_activation_params (Dict[str, Any]): Hyperparameters for activation 38 | function. 39 | 40 | """ 41 | super().__init__() 42 | self.use_additional_convs = use_additional_convs 43 | self.convs1 = torch.nn.ModuleList() 44 | if use_additional_convs: 45 | self.convs2 = torch.nn.ModuleList() 46 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 47 | for dilation in dilations: 48 | self.convs1 += [ 49 | torch.nn.Sequential( 50 | getattr(torch.nn, nonlinear_activation)( 51 | **nonlinear_activation_params 52 | ), 53 | torch.nn.Conv1d( 54 | channels, 55 | channels, 56 | kernel_size, 57 | 1, 58 | dilation=dilation, 59 | bias=bias, 60 | padding=(kernel_size - 1) // 2 * dilation, 61 | ), 62 | ) 63 | ] 64 | if use_additional_convs: 65 | self.convs2 += [ 66 | torch.nn.Sequential( 67 | getattr(torch.nn, nonlinear_activation)( 68 | **nonlinear_activation_params 69 | ), 70 | torch.nn.Conv1d( 71 | channels, 72 | channels, 73 | kernel_size, 74 | 1, 75 | dilation=1, 76 | bias=bias, 77 | padding=(kernel_size - 1) // 2, 78 | ), 79 | ) 80 | ] 81 | 82 | def forward(self, x: torch.Tensor) -> torch.Tensor: 83 | """Calculate forward propagation. 84 | 85 | Args: 86 | x (Tensor): Input tensor (B, channels, T). 87 | 88 | Returns: 89 | Tensor: Output tensor (B, channels, T). 90 | 91 | """ 92 | for idx in range(len(self.convs1)): 93 | xt = self.convs1[idx](x) 94 | if self.use_additional_convs: 95 | xt = self.convs2[idx](xt) 96 | x = xt + x 97 | return x 98 | -------------------------------------------------------------------------------- /deepaudio/tts/models/melgan/__init__.py: -------------------------------------------------------------------------------- 1 | from .melgan import * -------------------------------------------------------------------------------- /deepaudio/tts/models/melgan/residual_stack.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Residual stack module in MelGAN. 5 | 6 | This code is modified from https://github.com/kan-bayashi/ParallelWaveGAN. 7 | 8 | """ 9 | 10 | from typing import Any, Dict 11 | 12 | import torch 13 | 14 | 15 | class ResidualStack(torch.nn.Module): 16 | """Residual stack module introduced in MelGAN.""" 17 | 18 | def __init__( 19 | self, 20 | kernel_size: int = 3, 21 | channels: int = 32, 22 | dilation: int = 1, 23 | bias: bool = True, 24 | nonlinear_activation: str = "LeakyReLU", 25 | nonlinear_activation_params: Dict[str, Any] = {"negative_slope": 0.2}, 26 | pad: str = "ReflectionPad1d", 27 | pad_params: Dict[str, Any] = {}, 28 | ): 29 | """Initialize ResidualStack module. 30 | 31 | Args: 32 | kernel_size (int): Kernel size of dilation convolution layer. 33 | channels (int): Number of channels of convolution layers. 34 | dilation (int): Dilation factor. 35 | bias (bool): Whether to add bias parameter in convolution layers. 36 | nonlinear_activation (str): Activation function module name. 37 | nonlinear_activation_params (Dict[str, Any]): Hyperparameters for 38 | activation function. 39 | pad (str): Padding function module name before dilated convolution layer. 40 | pad_params (Dict[str, Any]): Hyperparameters for padding function. 41 | 42 | """ 43 | super().__init__() 44 | 45 | # defile residual stack part 46 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." 
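# the reflection padding of (kernel_size - 1) // 2 * dilation samples on each side below
# keeps the dilated convolution length-preserving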
47 | self.stack = torch.nn.Sequential( 48 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 49 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 50 | torch.nn.Conv1d( 51 | channels, channels, kernel_size, dilation=dilation, bias=bias 52 | ), 53 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 54 | torch.nn.Conv1d(channels, channels, 1, bias=bias), 55 | ) 56 | 57 | # defile extra layer for skip connection 58 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 59 | 60 | def forward(self, c: torch.Tensor) -> torch.Tensor: 61 | """Calculate forward propagation. 62 | 63 | Args: 64 | c (Tensor): Input tensor (B, channels, T). 65 | 66 | Returns: 67 | Tensor: Output tensor (B, chennels, T). 68 | 69 | """ 70 | return self.stack(c) + self.skip_layer(c) 71 | -------------------------------------------------------------------------------- /deepaudio/tts/models/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- 1 | from .parallel_wavegan import * -------------------------------------------------------------------------------- /deepaudio/tts/models/parallel_wavegan/model.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | import torch 3 | from torch import Tensor, nn 4 | 5 | from pytorch_lightning import LightningModule 6 | from deepaudio.tts.models.parallel_wavegan import ParallelWaveGANDiscriminator 7 | from deepaudio.tts.models.parallel_wavegan import ParallelWaveGANGenerator 8 | 9 | 10 | from deepaudio.tts.modules.losses import MultiResolutionSTFTLoss 11 | 12 | 13 | class ParallelWaveGANModel(LightningModule): 14 | def __init__(self, 15 | generator: ParallelWaveGANGenerator, 16 | discriminator: ParallelWaveGANDiscriminator, 17 | criterion_stft: MultiResolutionSTFTLoss, 18 | lambda_aux: float, 19 | lambda_adv: float, 20 | optimizer_d: torch.optim.Optimizer, 21 | scheduler_d: torch.optim.lr_scheduler, 22 | optimizer_g: torch.optim.Optimizer, 23 | scheduler_g: torch.optim.lr_scheduler, 24 | discriminator_train_start_steps: int = 100000, 25 | ): 26 | super(ParallelWaveGANModel, self).__init__() 27 | 28 | self.generator = generator 29 | self.discriminator = discriminator 30 | self.criterion_stft = criterion_stft 31 | self.criterion_mse = torch.nn.MSELoss() 32 | self.save_hyperparameters(logger=False, ignore=["generator", 33 | "discriminator", 34 | "criterion_stft"]) 35 | 36 | def step_generator(self, wav, mel, batch_idx): 37 | losses_dict = {} 38 | noise = torch.randn(wav.shape).to(device=wav.device, dtype=wav.dtype) 39 | wav_ = self.generator(mel, noise) 40 | 41 | # initialize 42 | gen_loss = 0.0 43 | aux_loss = 0.0 44 | 45 | # multi-resolution stft loss 46 | sc_loss, mag_loss = self.criterion_stft(wav_, wav) 47 | aux_loss += sc_loss + mag_loss 48 | 49 | gen_loss += aux_loss * self.hparams.lambda_aux 50 | 51 | losses_dict["spectral_convergence_loss"] = sc_loss 52 | losses_dict["log_stft_magnitude_loss"] = mag_loss 53 | 54 | # adversarial loss 55 | if batch_idx > self.hparams.discriminator_train_start_steps: 56 | p_ = self.discriminator(wav_) 57 | adv_loss = self.criterion_mse(p_, torch.ones_like(p_)) 58 | losses_dict["adversarial_loss"] = adv_loss 59 | gen_loss += self.hparams.lambda_adv * adv_loss 60 | losses_dict["generator_loss"] = gen_loss 61 | self.log_dict(losses_dict) 62 | return gen_loss 63 | 64 | def step_disctiminator(self, wav, mel): 65 | losses_dict = {} 66 | with 
torch.no_grad(): 67 | noise = torch.randn(wav.shape) 68 | wav_ = self.generator(mel, noise) 69 | p = self.discriminator(wav) 70 | p_ = self.discriminator(wav_.detach()) 71 | real_loss = self.criterion_mse(p, torch.ones_like(p)) 72 | fake_loss = self.criterion_mse(p_, torch.zeros_like(p_)) 73 | dis_loss = real_loss + fake_loss 74 | 75 | losses_dict["real_loss"] = real_loss 76 | losses_dict["fake_loss"] = fake_loss 77 | losses_dict["discriminator_loss"] = dis_loss 78 | self.log_dict(losses_dict) 79 | return dis_loss 80 | 81 | def training_step(self, batch: tuple, batch_idx: int, optimizer_idx: int): 82 | opt_g, opt_d = self.optimizers() 83 | sch_g, sch_d = self.lr_schedulers() 84 | # parse batch 85 | wav, mel = batch 86 | 87 | # Generator 88 | gen_loss = self.step_generator(wav, mel, batch_idx) 89 | opt_g.zero_grad() 90 | self.manual_backward(gen_loss) 91 | opt_g.step() 92 | sch_g.step() 93 | 94 | # Disctiminator 95 | if batch_idx > self.hparams.discriminator_train_start_steps: 96 | # re-compute wav_ which leads better quality 97 | dis_loss = self.step_disctiminator(wav, mel) 98 | opt_d.zero_grad() 99 | self.manual_backward(dis_loss) 100 | opt_d.step() 101 | sch_d.step() 102 | 103 | 104 | def configure_optimizers(self): 105 | optimizer_g = self.hparams.optimizer_g(params=self.generator.parameters()) 106 | optimizer_d = self.hparams.optimizer_d(params=self.discriminator.parameters()) 107 | scheduler_g = self.hparams.scheduler_g(optimizer=optimizer_g) 108 | scheduler_d = self.hparams.scheduler_d(optimizer=optimizer_d) 109 | 110 | return [optimizer_g, optimizer_d], [scheduler_g, scheduler_d] 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /deepaudio/tts/models/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron2 import * -------------------------------------------------------------------------------- /deepaudio/tts/models/tacotron2/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, nn 3 | 4 | from pytorch_lightning import LightningModule 5 | from deepaudio.tts.models.tacotron2.tacotron2 import Tacotron2 6 | from deepaudio.tts.models.tacotron2.loss import Tacotron2Loss 7 | from deepaudio.tts.models.tacotron2.loss import GuidedAttentionLoss 8 | 9 | 10 | class Tacotron2Model(LightningModule): 11 | def __init__(self, 12 | model: Tacotron2, 13 | loss_type: str, 14 | taco2_loss: Tacotron2Loss, 15 | use_guided_attn_loss: bool, 16 | attn_loss: GuidedAttentionLoss, 17 | optimizer: torch.optim.Optimizer, 18 | scheduler: torch.optim.lr_scheduler 19 | ): 20 | super(Tacotron2Model, self).__init__() 21 | 22 | self.model = model 23 | self.taco2_loss = taco2_loss 24 | self.save_hyperparameters(logger=False, ignore=["model", 25 | "taco2_loss", 26 | "attn_loss"]) 27 | if self.hparams.use_guided_attn_loss: 28 | self.attn_loss = attn_loss 29 | 30 | def compute_loss(self, batch): 31 | losses_dict = {} 32 | # spk_id!=None in multiple spk fastspeech2 33 | spk_id = batch["spk_id"] if "spk_id" in batch else None 34 | spk_emb = batch["spk_emb"] if "spk_emb" in batch else None 35 | if spk_emb is not None: 36 | spk_id = None 37 | 38 | after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model( 39 | text=batch["text"], 40 | text_lengths=batch["text_lengths"], 41 | feats=batch["speech"], 42 | feats_lengths=batch["speech_lengths"], 43 | spk_id=spk_id, 44 | spk_emb=spk_emb) 45 | 46 | # 
calculate taco2 loss 47 | l1_loss, mse_loss, bce_loss = self.taco2_loss( 48 | after_outs=after_outs, 49 | before_outs=before_outs, 50 | logits=logits, 51 | ys=ys, 52 | labels=labels, 53 | olens=olens) 54 | 55 | if self.hparams.loss_type == "L1+L2": 56 | loss = l1_loss + mse_loss + bce_loss 57 | elif self.hparams.loss_type == "L1": 58 | loss = l1_loss + bce_loss 59 | elif self.hparams.loss_type == "L2": 60 | loss = mse_loss + bce_loss 61 | else: 62 | raise ValueError(f"unknown --loss-type {self.loss_type}") 63 | 64 | # calculate attention loss 65 | if self.hparams.use_guided_attn_loss: 66 | # NOTE: length of output for auto-regressive 67 | # input will be changed when r > 1 68 | attn_loss = self.attn_loss( 69 | att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) 70 | losses_dict["attn_loss"] = attn_loss 71 | loss = loss + attn_loss 72 | 73 | losses_dict["l1_loss"] = l1_loss 74 | losses_dict["mse_loss"] = mse_loss 75 | losses_dict["bce_loss"] = bce_loss 76 | losses_dict["loss"] = loss 77 | return losses_dict 78 | 79 | def training_step(self, batch: dict, batch_idx: int): 80 | losses_dict = self.compute_loss(batch) 81 | self.log_dict(losses_dict) 82 | return losses_dict 83 | 84 | def validation_step(self, batch: dict, batch_idx: int): 85 | losses_dict = self.compute_loss(batch) 86 | loss = losses_dict.pop('loss') 87 | losses_dict['val_loss'] = loss 88 | self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=True) 89 | return loss 90 | 91 | def configure_optimizers(self): 92 | optimizer = self.hparams.optimizer(params=self.parameters()) 93 | scheduler = self.hparams.scheduler(optimizer=optimizer) 94 | 95 | return { 96 | "optimizer": optimizer, 97 | "lr_scheduler": { 98 | "scheduler": scheduler, 99 | "monitor": "val/loss", 100 | "interval": "epoch", 101 | "frequency": 1, 102 | }, 103 | } 104 | -------------------------------------------------------------------------------- /deepaudio/tts/models/transformer_tts/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import * -------------------------------------------------------------------------------- /deepaudio/tts/models/transformer_tts/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """TTS-Transformer related modules.""" 5 | 6 | import torch 7 | 8 | from deepaudio.tts.models.tacotron2.loss import GuidedAttentionLoss 9 | from deepaudio.tts.models.tacotron2.loss import ( 10 | Tacotron2Loss as TransformerLoss, 11 | ) 12 | 13 | 14 | class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): 15 | """Guided attention loss function module for multi head attention. 16 | 17 | Args: 18 | sigma (float, optional): Standard deviation to control 19 | how close attention to a diagonal. 20 | alpha (float, optional): Scaling coefficient (lambda). 21 | reset_always (bool, optional): Whether to always reset masks. 22 | 23 | """ 24 | 25 | def forward(self, att_ws, ilens, olens): 26 | """Calculate forward propagation. 27 | 28 | Args: 29 | att_ws (Tensor): 30 | Batch of multi head attention weights (B, H, T_max_out, T_max_in). 31 | ilens (LongTensor): Batch of input lengths (B,). 32 | olens (LongTensor): Batch of output lengths (B,). 33 | 34 | Returns: 35 | Tensor: Guided attention loss value. 
36 | 37 | """ 38 | if self.guided_attn_masks is None: 39 | self.guided_attn_masks = ( 40 | self._make_guided_attention_masks(ilens, olens) 41 | .to(att_ws.device) 42 | .unsqueeze(1) 43 | ) 44 | if self.masks is None: 45 | self.masks = self._make_masks(ilens, olens).to(att_ws.device).unsqueeze(1) 46 | losses = self.guided_attn_masks * att_ws 47 | loss = torch.mean(losses.masked_select(self.masks)) 48 | if self.reset_always: 49 | self._reset_masks() 50 | 51 | return self.alpha * loss 52 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/__init__.py: -------------------------------------------------------------------------------- 1 | from .vits import * 2 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """VITS-related loss modules. 5 | 6 | This code is based on https://github.com/jaywalnut310/vits. 7 | 8 | """ 9 | 10 | import torch 11 | 12 | 13 | class KLDivergenceLoss(torch.nn.Module): 14 | """KL divergence loss.""" 15 | 16 | def forward( 17 | self, 18 | z_p: torch.Tensor, 19 | logs_q: torch.Tensor, 20 | m_p: torch.Tensor, 21 | logs_p: torch.Tensor, 22 | z_mask: torch.Tensor, 23 | ) -> torch.Tensor: 24 | """Calculate KL divergence loss. 25 | 26 | Args: 27 | z_p (Tensor): Flow hidden representation (B, H, T_feats). 28 | logs_q (Tensor): Posterior encoder projected scale (B, H, T_feats). 29 | m_p (Tensor): Expanded text encoder projected mean (B, H, T_feats). 30 | logs_p (Tensor): Expanded text encoder projected scale (B, H, T_feats). 31 | z_mask (Tensor): Mask tensor (B, 1, T_feats). 32 | 33 | Returns: 34 | Tensor: KL divergence loss. 35 | 36 | """ 37 | z_p = z_p.float() 38 | logs_q = logs_q.float() 39 | m_p = m_p.float() 40 | logs_p = logs_p.float() 41 | z_mask = z_mask.float() 42 | kl = logs_p - logs_q - 0.5 43 | kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) 44 | kl = torch.sum(kl * z_mask) 45 | loss = kl / torch.sum(z_mask) 46 | 47 | return loss 48 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Maximum path calculation module. 15 | 16 | This code is based on https://github.com/jaywalnut310/vits. 17 | 18 | """ 19 | import warnings 20 | 21 | import numpy as np 22 | import paddle 23 | from numba import njit 24 | from numba import prange 25 | 26 | try: 27 | from .core import maximum_path_c 28 | 29 | is_cython_avalable = True 30 | except ImportError: 31 | is_cython_avalable = False 32 | warnings.warn( 33 | "Cython version is not available. 
Fallback to 'EXPERIMETAL' numba version. " 34 | "If you want to use the cython version, please build it as follows: " 35 | "`cd paddlespeech/t2s/models/vits/monotonic_align; python setup.py build_ext --inplace`" 36 | ) 37 | 38 | 39 | def maximum_path(neg_x_ent: paddle.Tensor, 40 | attn_mask: paddle.Tensor) -> paddle.Tensor: 41 | """Calculate maximum path. 42 | 43 | Args: 44 | neg_x_ent (Tensor): Negative X entropy tensor (B, T_feats, T_text). 45 | attn_mask (Tensor): Attention mask (B, T_feats, T_text). 46 | 47 | Returns: 48 | Tensor: Maximum path tensor (B, T_feats, T_text). 49 | 50 | """ 51 | dtype = neg_x_ent.dtype 52 | neg_x_ent = neg_x_ent.numpy().astype(np.float32) 53 | path = np.zeros(neg_x_ent.shape, dtype=np.int32) 54 | t_t_max = attn_mask.sum(1)[:, 0].cpu().numpy().astype(np.int32) 55 | t_s_max = attn_mask.sum(2)[:, 0].cpu().numpy().astype(np.int32) 56 | if is_cython_avalable: 57 | maximum_path_c(path, neg_x_ent, t_t_max, t_s_max) 58 | else: 59 | maximum_path_numba(path, neg_x_ent, t_t_max, t_s_max) 60 | 61 | return paddle.cast(paddle.to_tensor(path), dtype=dtype) 62 | 63 | 64 | @njit 65 | def maximum_path_each_numba(path, value, t_y, t_x, max_neg_val=-np.inf): 66 | """Calculate a single maximum path with numba.""" 67 | index = t_x - 1 68 | for y in range(t_y): 69 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 70 | if x == y: 71 | v_cur = max_neg_val 72 | else: 73 | v_cur = value[y - 1, x] 74 | if x == 0: 75 | if y == 0: 76 | v_prev = 0.0 77 | else: 78 | v_prev = max_neg_val 79 | else: 80 | v_prev = value[y - 1, x - 1] 81 | value[y, x] += max(v_prev, v_cur) 82 | 83 | for y in range(t_y - 1, -1, -1): 84 | path[y, index] = 1 85 | if index != 0 and (index == y or 86 | value[y - 1, index] < value[y - 1, index - 1]): 87 | index = index - 1 88 | 89 | 90 | @njit(parallel=True) 91 | def maximum_path_numba(paths, values, t_ys, t_xs): 92 | """Calculate batch maximum path with numba.""" 93 | for i in prange(paths.shape[0]): 94 | maximum_path_each_numba(paths[i], values[i], t_ys[i], t_xs[i]) 95 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Maximum path calculation module with cython optimization. 15 | 16 | This code is copied from https://github.com/jaywalnut310/vits and modifed code format. 
17 | 18 | """ 19 | 20 | cimport cython 21 | 22 | from cython.parallel import prange 23 | 24 | 25 | @cython.boundscheck(False) 26 | @cython.wraparound(False) 27 | cdef void maximum_path_each(int[:, ::1] path, float[:, ::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: 28 | cdef int x 29 | cdef int y 30 | cdef float v_prev 31 | cdef float v_cur 32 | cdef float tmp 33 | cdef int index = t_x - 1 34 | 35 | for y in range(t_y): 36 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 37 | if x == y: 38 | v_cur = max_neg_val 39 | else: 40 | v_cur = value[y - 1, x] 41 | if x == 0: 42 | if y == 0: 43 | v_prev = 0.0 44 | else: 45 | v_prev = max_neg_val 46 | else: 47 | v_prev = value[y - 1, x - 1] 48 | value[y, x] += max(v_prev, v_cur) 49 | 50 | for y in range(t_y - 1, -1, -1): 51 | path[y, index] = 1 52 | if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): 53 | index = index - 1 54 | 55 | 56 | @cython.boundscheck(False) 57 | @cython.wraparound(False) 58 | cpdef void maximum_path_c(int[:, :, ::1] paths, float[:, :, ::1] values, int[::1] t_ys, int[::1] t_xs) nogil: 59 | cdef int b = paths.shape[0] 60 | cdef int i 61 | for i in prange(b, nogil=True): 62 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) 63 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Setup cython code.""" 15 | from Cython.Build import cythonize 16 | from setuptools import Extension 17 | from setuptools import setup 18 | from setuptools.command.build_ext import build_ext as _build_ext 19 | 20 | 21 | class build_ext(_build_ext): 22 | """Overwrite build_ext.""" 23 | 24 | def finalize_options(self): 25 | """Prevent numpy from thinking it is still in its setup process.""" 26 | _build_ext.finalize_options(self) 27 | __builtins__.__NUMPY_SETUP__ = False 28 | import numpy 29 | 30 | self.include_dirs.append(numpy.get_include()) 31 | 32 | 33 | exts = [Extension( 34 | name="core", 35 | sources=["core.pyx"], )] 36 | setup( 37 | name="monotonic_align", 38 | ext_modules=cythonize(exts, language_level=3), 39 | cmdclass={"build_ext": build_ext}, ) 40 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/posterior_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Posterior encoder module in VITS. 5 | 6 | This code is based on https://github.com/jaywalnut310/vits. 
7 | 8 | """ 9 | 10 | from typing import Optional, Tuple 11 | 12 | import torch 13 | 14 | from deepaudio.tts.models.vits.wavenet.wavenet import WaveNet 15 | from deepaudio.tts.models.vits.wavenet.residual_block import Conv1d 16 | from deepaudio.tts.modules.nets_utils import make_non_pad_mask 17 | 18 | 19 | class PosteriorEncoder(torch.nn.Module): 20 | """Posterior encoder module in VITS. 21 | 22 | This is a module of posterior encoder described in `Conditional Variational 23 | Autoencoder with Adversarial Learning for End-to-End Text-to-Speech`_. 24 | 25 | .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End 26 | Text-to-Speech`: https://arxiv.org/abs/2006.04558 27 | """ 28 | 29 | def __init__( 30 | self, 31 | in_channels: int = 513, 32 | out_channels: int = 192, 33 | hidden_channels: int = 192, 34 | kernel_size: int = 5, 35 | layers: int = 16, 36 | stacks: int = 1, 37 | base_dilation: int = 1, 38 | global_channels: int = -1, 39 | dropout_rate: float = 0.0, 40 | bias: bool = True, 41 | use_weight_norm: bool = True, 42 | ): 43 | """Initilialize PosteriorEncoder module. 44 | 45 | Args: 46 | in_channels (int): Number of input channels. 47 | out_channels (int): Number of output channels. 48 | hidden_channels (int): Number of hidden channels. 49 | kernel_size (int): Kernel size in WaveNet. 50 | layers (int): Number of layers of WaveNet. 51 | stacks (int): Number of repeat stacking of WaveNet. 52 | base_dilation (int): Base dilation factor. 53 | global_channels (int): Number of global conditioning channels. 54 | dropout_rate (float): Dropout rate. 55 | bias (bool): Whether to use bias parameters in conv. 56 | use_weight_norm (bool): Whether to apply weight norm. 57 | 58 | """ 59 | super().__init__() 60 | 61 | # define modules 62 | self.input_conv = Conv1d(in_channels, hidden_channels, 1) 63 | self.encoder = WaveNet( 64 | in_channels=-1, 65 | out_channels=-1, 66 | kernel_size=kernel_size, 67 | layers=layers, 68 | stacks=stacks, 69 | base_dilation=base_dilation, 70 | residual_channels=hidden_channels, 71 | aux_channels=-1, 72 | gate_channels=hidden_channels * 2, 73 | skip_channels=hidden_channels, 74 | global_channels=global_channels, 75 | dropout_rate=dropout_rate, 76 | bias=bias, 77 | use_weight_norm=use_weight_norm, 78 | use_first_conv=False, 79 | use_last_conv=False, 80 | scale_residual=False, 81 | scale_skip_connect=True, 82 | ) 83 | self.proj = Conv1d(hidden_channels, out_channels * 2, 1) 84 | 85 | def forward( 86 | self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None 87 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 88 | """Calculate forward propagation. 89 | 90 | Args: 91 | x (Tensor): Input tensor (B, in_channels, T_feats). 92 | x_lengths (Tensor): Length tensor (B,). 93 | g (Optional[Tensor]): Global conditioning tensor (B, global_channels, 1). 94 | 95 | Returns: 96 | Tensor: Encoded hidden representation tensor (B, out_channels, T_feats). 97 | Tensor: Projected mean tensor (B, out_channels, T_feats). 98 | Tensor: Projected scale tensor (B, out_channels, T_feats). 99 | Tensor: Mask tensor for input tensor (B, 1, T_feats). 
100 | 101 | """ 102 | x_mask = ( 103 | make_non_pad_mask(x_lengths) 104 | .unsqueeze(1) 105 | .to( 106 | dtype=x.dtype, 107 | device=x.device, 108 | ) 109 | ) 110 | x = self.input_conv(x) * x_mask 111 | x = self.encoder(x, x_mask, g=g) 112 | stats = self.proj(x) * x_mask 113 | m, logs = stats.split(stats.size(1) // 2, dim=1) 114 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask 115 | 116 | return z, m, logs, x_mask 117 | -------------------------------------------------------------------------------- /deepaudio/tts/models/vits/wavenet/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/models/wavernn/__init__.py: -------------------------------------------------------------------------------- 1 | from .wavernn import * 2 | from .wavernn import * -------------------------------------------------------------------------------- /deepaudio/tts/models/wavernn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, nn 3 | 4 | from deepaudio.tts.models.wavernn import WaveRNN 5 | from deepaudio.tts.modules.losses import discretized_mix_logistic_loss 6 | 7 | 8 | 9 | class WaveRNNModel(BasePLModel): 10 | def __init__(self, 11 | model: WaveRNN, 12 | mode: str, 13 | optimizer: torch.optim.Optimizer, 14 | scheduler: torch.optim.lr_scheduler 15 | ): 16 | super(WaveRNNModel, self).__init__() 17 | 18 | self.model = model 19 | self.save_hyperparameters(logger=False, ignore=["model"]) 20 | if self.hparams.mode == 'RAW': 21 | self.criterion = nn.CrossEntropyLoss() 22 | elif self.hparams.mode == 'MOL': 23 | self.criterion = discretized_mix_logistic_loss() 24 | else: 25 | self.criterion = None 26 | RuntimeError('Unknown model mode value - ', self.configs.model.mode) 27 | 28 | def compute_loss(self, batch): 29 | wav, y, mel = batch 30 | y_hat = self.model(wav, mel) 31 | if self.hparams.mode == 'RAW': 32 | y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1) 33 | elif self.hparams.mode == 'MOL': 34 | y_hat = y_hat.type(torch.float32) 35 | 36 | y = y.unsqueeze(-1) 37 | loss = self.criterion(y_hat, y) 38 | return loss 39 | 40 | def training_step(self, batch: tuple, batch_idx: int): 41 | loss = self.compute_loss(batch) 42 | return { 43 | 'loss': loss 44 | } 45 | 46 | def validation_step(self, batch: tuple, batch_idx: int): 47 | loss = self.compute_loss(batch) 48 | return { 49 | 'val_loss': loss 50 | } 51 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .conv import * 15 | from .geometry import * 16 | from .losses import * 17 | from .positional_encoding import * 18 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | 16 | 17 | def get_activation(act, **kwargs): 18 | """Return activation function.""" 19 | 20 | activation_funcs = { 21 | "hardtanh": torch.nn.Hardtanh, 22 | "tanh": torch.nn.Tanh, 23 | "relu": torch.nn.ReLU, 24 | "selu": torch.nn.SELU, 25 | "leakyrelu": torch.nn.LeakyReLU, 26 | "swish": torch.nn.Swish, 27 | "glu": torch.nn.GLU 28 | } 29 | 30 | return activation_funcs[act](**kwargs) 31 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/causal_conv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Causal convolusion layer modules.""" 15 | import torch 16 | from torch import nn 17 | 18 | 19 | class CausalConv1D(nn.Module): 20 | """CausalConv1D module with customized initialization.""" 21 | 22 | def __init__( 23 | self, 24 | in_channels, 25 | out_channels, 26 | kernel_size, 27 | dilation=1, 28 | bias=True, 29 | pad="Pad1D", 30 | pad_params={"value": 0.0}, ): 31 | """Initialize CausalConv1d module.""" 32 | super().__init__() 33 | self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, 34 | **pad_params) 35 | self.conv = nn.Conv1d( 36 | in_channels, 37 | out_channels, 38 | kernel_size, 39 | dilation=dilation, 40 | bias=bias) 41 | 42 | def forward(self, x): 43 | """Calculate forward propagation. 44 | Args: 45 | x (Tensor): Input tensor (B, in_channels, T). 46 | Returns: 47 | Tensor: Output tensor (B, out_channels, T). 48 | """ 49 | return self.conv(self.pad(x))[:, :, :x.shape[2]] 50 | 51 | 52 | class CausalConv1DTranspose(nn.Module): 53 | """CausalConv1DTranspose module with customized initialization.""" 54 | 55 | def __init__(self, 56 | in_channels, 57 | out_channels, 58 | kernel_size, 59 | stride, 60 | bias=True): 61 | """Initialize CausalConvTranspose1d module.""" 62 | super().__init__() 63 | self.deconv = nn.ConvTranspose1d( 64 | in_channels, out_channels, kernel_size, stride, bias=bias) 65 | self.stride = stride 66 | 67 | def forward(self, x): 68 | """Calculate forward propagation. 69 | Args: 70 | x (Tensor): Input tensor (B, in_channels, T_in). 71 | Returns: 72 | Tensor: Output tensor (B, out_channels, T_out). 73 | """ 74 | return self.deconv(x)[:, :, :-self.stride] 75 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/conformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/conformer/convolution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | 8 | """ConvolutionModule definition.""" 9 | 10 | from torch import nn 11 | 12 | 13 | class ConvolutionModule(nn.Module): 14 | """ConvolutionModule in Conformer model. 15 | 16 | Args: 17 | channels (int): The number of channels of conv layers. 18 | kernel_size (int): Kernerl size of conv layers. 
19 | 20 | """ 21 | 22 | def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): 23 | """Construct an ConvolutionModule object.""" 24 | super(ConvolutionModule, self).__init__() 25 | # kernerl_size should be a odd number for 'SAME' padding 26 | assert (kernel_size - 1) % 2 == 0 27 | 28 | self.pointwise_conv1 = nn.Conv1d( 29 | channels, 30 | 2 * channels, 31 | kernel_size=1, 32 | stride=1, 33 | padding=0, 34 | bias=bias, 35 | ) 36 | self.depthwise_conv = nn.Conv1d( 37 | channels, 38 | channels, 39 | kernel_size, 40 | stride=1, 41 | padding=(kernel_size - 1) // 2, 42 | groups=channels, 43 | bias=bias, 44 | ) 45 | self.norm = nn.BatchNorm1d(channels) 46 | self.pointwise_conv2 = nn.Conv1d( 47 | channels, 48 | channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0, 52 | bias=bias, 53 | ) 54 | self.activation = activation 55 | 56 | def forward(self, x): 57 | """Compute convolution module. 58 | 59 | Args: 60 | x (torch.Tensor): Input tensor (#batch, time, channels). 61 | 62 | Returns: 63 | torch.Tensor: Output tensor (#batch, time, channels). 64 | 65 | """ 66 | # exchange the temporal dimension and the feature dimension 67 | x = x.transpose(1, 2) 68 | 69 | # GLU mechanism 70 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 71 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 72 | 73 | # 1D Depthwise Conv 74 | x = self.depthwise_conv(x) 75 | x = self.activation(self.norm(x)) 76 | 77 | x = self.pointwise_conv2(x) 78 | 79 | return x.transpose(1, 2) 80 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/conformer/swish.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 5 | # Northwestern Polytechnical University (Pengcheng Guo) 6 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 7 | 8 | """Swish() activation function for Conformer.""" 9 | 10 | import torch 11 | 12 | 13 | class Swish(torch.nn.Module): 14 | """Construct an Swish object.""" 15 | 16 | def forward(self, x): 17 | """Return Swich activation function.""" 18 | return x * torch.sigmoid(x) -------------------------------------------------------------------------------- /deepaudio/tts/modules/geometry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import numpy as np 15 | import torch 16 | 17 | 18 | def shuffle_dim(x, axis, perm=None): 19 | """Permute input tensor along aixs given the permutation or randomly. 20 | 21 | Args: 22 | x (Tensor): The input tensor. 23 | axis (int): The axis to shuffle. 24 | perm (List[int], ndarray, optional): 25 | The order to reorder the tensor along the ``axis``-th dimension. 
26 | It is a permutation of ``[0, d)``, where d is the size of the 27 | ``axis``-th dimension of the input tensor. If not provided, 28 | a random permutation is used. Defaults to None. 29 | 30 | Returns: 31 | Tensor: The shuffled tensor, which has the same shape as x does. 32 | """ 33 | size = x.shape[axis] 34 | if perm is not None and len(perm) != size: 35 | raise ValueError("length of permutation should equal the input " 36 | "tensor's axis-th dimension's size") 37 | if perm is not None: 38 | perm = np.array(perm) 39 | else: 40 | perm = np.random.permutation(size) 41 | 42 | perm = torch.as_tensor(perm, device=x.device) 43 | out = torch.index_select(x, axis, perm) 44 | return out 45 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Layer normalization module.""" 15 | import torch 16 | 17 | 18 | class LayerNorm(torch.nn.LayerNorm): 19 | """Layer normalization module. 20 | 21 | Args: 22 | nout (int): Output dim size. 23 | dim (int): Dimension to be normalized. 24 | 25 | """ 26 | 27 | def __init__(self, nout, dim=-1): 28 | """Construct a LayerNorm object.""" 29 | super(LayerNorm, self).__init__(nout, eps=1e-12) 30 | self.dim = dim 31 | 32 | def forward(self, x): 33 | """Apply layer normalization. 34 | 35 | Args: 36 | x (torch.Tensor): Input tensor. 37 | 38 | Returns: 39 | torch.Tensor: Normalized tensor. 40 | 41 | """ 42 | if self.dim == -1: 43 | return super(LayerNorm, self).forward(x) 44 | return ( 45 | super(LayerNorm, self) 46 | .forward(x.transpose(self.dim, -1)) 47 | .transpose(self.dim, -1) 48 | ) 49 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/masked_fill.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | from typing import Union 15 | 16 | import torch 17 | 18 | 19 | def is_broadcastable(shp1, shp2): 20 | for a, b in zip(shp1[::-1], shp2[::-1]): 21 | if a == 1 or b == 1 or a == b: 22 | pass 23 | else: 24 | return False 25 | return True 26 | 27 | 28 | # assume that len(shp1) == len(shp2) 29 | def broadcast_shape(shp1, shp2): 30 | result = [] 31 | for a, b in zip(shp1[::-1], shp2[::-1]): 32 | result.append(max(a, b)) 33 | return result[::-1] 34 | 35 | 36 | def masked_fill(xs: torch.Tensor, 37 | mask: torch.Tensor, 38 | value: Union[float, int]): 39 | # comment following line for converting dygraph to static graph. 40 | # assert is_broadcastable(xs.shape, mask.shape) is True 41 | # bshape = paddle.broadcast_shape(xs.shape, mask.shape) 42 | bshape = broadcast_shape(xs.shape, mask.shape) 43 | mask.stop_gradient = True 44 | mask = mask.broadcast_to(bshape) 45 | 46 | trues = torch.ones_like(xs) * value 47 | mask = mask.type(dtype=torch.bool) 48 | xs = torch.where(mask, trues, xs) 49 | return xs 50 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torch 15 | from torch import nn 16 | 17 | 18 | class ZScore(nn.Module): 19 | # feature last 20 | def __init__(self, mu, sigma): 21 | super().__init__() 22 | self.register_buffer("mu", mu) 23 | self.register_buffer("sigma", sigma) 24 | 25 | def forward(self, x): 26 | # NOTE: to be compatible with torch's to_static, we must explicitly 27 | # call multiply, or add, etc, instead of +-*/, etc. 28 | return torch.divide(torch.subtract(x, self.mu), self.sigma) 29 | 30 | def inverse(self, x): 31 | # NOTE: to be compatible with torch's to_static, we must explicitly 32 | # call multiply, or add, etc, instead of +-*/, etc. 33 | return torch.add(torch.multiply(x, self.sigma), self.mu) 34 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/positional_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
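An illustrative sketch of the two helpers above: masked_fill, the broadcasting helper that fills masked positions with a value (note it expects the mask to have the same number of dimensions as the input), and the ZScore normalizer. The shapes are made up for the example:

import torch

from deepaudio.tts.modules.masked_fill import masked_fill
from deepaudio.tts.modules.normalizer import ZScore

# mask out padded frames in a (B, 1, T) score tensor
scores = torch.randn(2, 1, 4)
padding = torch.tensor([[[False, False, True, True]],
                        [[False, True, True, True]]])
masked = masked_fill(scores, padding, float("-inf"))

# feature-last z-score normalization of mel spectrograms (B, T, n_mels)
mu, sigma = torch.zeros(80), torch.ones(80)
zscore = ZScore(mu, sigma)
mel = torch.randn(2, 100, 80)
assert torch.allclose(zscore.inverse(zscore(mel)), mel, atol=1e-6)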
14 | import numpy as np 15 | import torch 16 | from torch import Tensor 17 | 18 | __all__ = ["sinusoid_position_encoding", "scaled_position_encoding"] 19 | 20 | 21 | def sinusoid_position_encoding(num_positions: int, 22 | feature_size: int, 23 | omega: float=1.0, 24 | start_pos: int=0, 25 | dtype=None) -> torch.Tensor: 26 | # return tensor shape (num_positions, feature_size) 27 | # NOTE: to be compatible with paddle's to_static, we cannnot raise 28 | # an exception here, take care of it by yourself 29 | # if (feature_size % 2 != 0): 30 | # raise ValueError("size should be divisible by 2") 31 | dtype = dtype or torch.get_default_dtype() 32 | 33 | channel = torch.arange(0, feature_size, 2, dtype=dtype) 34 | index = torch.arange(start_pos, start_pos + num_positions, 1, dtype=dtype) 35 | denominator = channel / float(feature_size) 36 | denominator = torch.from_numpy(np.array([10000.0]).astype(np.float32))**denominator 37 | p = (torch.unsqueeze(index, -1) * omega) / denominator 38 | encodings = torch.zeros([num_positions, feature_size], dtype=dtype) 39 | encodings[:, 0::2] = torch.sin(p) 40 | encodings[:, 1::2] = torch.cos(p) 41 | return encodings 42 | 43 | 44 | def scaled_position_encoding(num_positions: int, 45 | feature_size: int, 46 | omega: Tensor, 47 | start_pos: int=0, 48 | dtype=None) -> Tensor: 49 | # omega: Tensor (batch_size, ) 50 | # return tensor shape (batch_size, num_positions, feature_size) 51 | # consider renaming this as batched positioning encoding 52 | if (feature_size % 2 != 0): 53 | raise ValueError("size should be divisible by 2") 54 | dtype = dtype or torch.get_default_dtype() 55 | 56 | channel = torch.arange(0, feature_size, 2, dtype=dtype) 57 | index = torch.arange( 58 | start_pos, start_pos + num_positions, 1, dtype=omega.dtype) 59 | batch_size = omega.shape[0] 60 | omega = torch.unsqueeze(omega, 1) 61 | omega = torch.unsqueeze(omega, 2) 62 | p = (torch.unsqueeze(index, -1) * 63 | omega) / (10000.0**(channel / float(feature_size))) 64 | encodings = torch.zeros( 65 | [batch_size, num_positions, feature_size], dtype=dtype) 66 | # it is nice to have fancy indexing and inplace operations 67 | encodings[:, :, 0::2] = torch.sin(p) 68 | encodings[:, :, 1::2] = torch.cos(p) 69 | return encodings 70 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
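For reference, a small sketch of how the two position encodings above can be called; feature_size must be even, and the sizes shown are illustrative:

import torch

from deepaudio.tts.modules.positional_encoding import (
    scaled_position_encoding, sinusoid_position_encoding)

pe = sinusoid_position_encoding(num_positions=50, feature_size=256)
print(pe.shape)             # torch.Size([50, 256])

# one omega per batch element gives a batch of scaled encodings
omega = torch.tensor([1.0, 0.8])
pe_batched = scaled_position_encoding(num_positions=50, feature_size=256, omega=omega)
print(pe_batched.shape)     # torch.Size([2, 50, 256])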
14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/duration_calculator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Tomoki Hayashi 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Duration calculator related modules.""" 8 | 9 | import torch 10 | 11 | from deepaudio.tts.models.tacotron2.tacotron2 import Tacotron2 12 | from deepaudio.tts.models.transformer_tts import Transformer 13 | from deepaudio.tts.modules.nets_utils import pad_list 14 | 15 | 16 | class DurationCalculator(torch.nn.Module): 17 | """Duration calculator module for FastSpeech. 18 | 19 | Todo: 20 | * Fix the duplicated calculation of diagonal head decision 21 | 22 | """ 23 | 24 | def __init__(self, teacher_model): 25 | """Initialize duration calculator module. 26 | 27 | Args: 28 | teacher_model (e2e_tts_transformer.Transformer): 29 | Pretrained auto-regressive Transformer. 30 | 31 | """ 32 | super(DurationCalculator, self).__init__() 33 | if isinstance(teacher_model, Transformer): 34 | self.register_buffer("diag_head_idx", torch.tensor(-1)) 35 | elif isinstance(teacher_model, Tacotron2): 36 | pass 37 | else: 38 | raise ValueError( 39 | "teacher model should be the instance of " 40 | "e2e_tts_transformer.Transformer or e2e_tts_tacotron2.Tacotron2." 41 | ) 42 | self.teacher_model = teacher_model 43 | 44 | def forward(self, xs, ilens, ys, olens, spembs=None): 45 | """Calculate forward propagation. 46 | 47 | Args: 48 | xs (Tensor): Batch of the padded sequences of character ids (B, Tmax). 49 | ilens (Tensor): Batch of lengths of each input sequence (B,). 50 | ys (Tensor): 51 | Batch of the padded sequence of target features (B, Lmax, odim). 52 | olens (Tensor): Batch of lengths of each output sequence (B,). 53 | spembs (Tensor, optional): 54 | Batch of speaker embedding vectors (B, spk_embed_dim). 55 | 56 | Returns: 57 | Tensor: Batch of durations (B, Tmax). 58 | 59 | """ 60 | if isinstance(self.teacher_model, Transformer): 61 | att_ws = self._calculate_encoder_decoder_attentions( 62 | xs, ilens, ys, olens, spembs=spembs 63 | ) 64 | # TODO(kan-bayashi): fix this issue 65 | # this does not work in multi-gpu case. registered buffer is not saved. 
66 | if int(self.diag_head_idx) == -1: 67 | self._init_diagonal_head(att_ws) 68 | att_ws = att_ws[:, self.diag_head_idx] 69 | else: 70 | # NOTE(kan-bayashi): Here we assume that the teacher is tacotron 2 71 | att_ws = self.teacher_model.calculate_all_attentions( 72 | xs, ilens, ys, spembs=spembs, keep_tensor=True 73 | ) 74 | durations = [ 75 | self._calculate_duration(att_w, ilen, olen) 76 | for att_w, ilen, olen in zip(att_ws, ilens, olens) 77 | ] 78 | 79 | return pad_list(durations, 0) 80 | 81 | @staticmethod 82 | def _calculate_duration(att_w, ilen, olen): 83 | return torch.stack( 84 | [att_w[:olen, :ilen].argmax(-1).eq(i).sum() for i in range(ilen)] 85 | ) 86 | 87 | def _init_diagonal_head(self, att_ws): 88 | diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1).mean(dim=0) # (H * L,) 89 | self.register_buffer("diag_head_idx", diagonal_scores.argmax()) 90 | 91 | def _calculate_encoder_decoder_attentions(self, xs, ilens, ys, olens, spembs=None): 92 | att_dict = self.teacher_model.calculate_all_attentions( 93 | xs, ilens, ys, olens, spembs=spembs, skip_output=True, keep_tensor=True 94 | ) 95 | return torch.cat( 96 | [att_dict[k] for k in att_dict.keys() if "src_attn" in k], dim=1 97 | ) # (B, H*L, Lmax, Tmax) 98 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/length_regulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Tomoki Hayashi 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Length regulator related modules.""" 8 | 9 | import logging 10 | 11 | import torch 12 | 13 | from deepaudio.tts.modules.nets_utils import pad_list 14 | 15 | 16 | class LengthRegulator(torch.nn.Module): 17 | """Length regulator module for feed-forward Transformer. 18 | 19 | This is a module of length regulator described in 20 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 21 | The length regulator expands char or 22 | phoneme-level embedding features to frame-level by repeating each 23 | feature based on the corresponding predicted durations. 24 | 25 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 26 | https://arxiv.org/pdf/1905.09263.pdf 27 | 28 | """ 29 | 30 | def __init__(self, pad_value=0.0): 31 | """Initilize length regulator module. 32 | 33 | Args: 34 | pad_value (float, optional): Value used for padding. 35 | 36 | """ 37 | super().__init__() 38 | self.pad_value = pad_value 39 | 40 | def forward(self, xs, ds, alpha=1.0): 41 | """Calculate forward propagation. 42 | 43 | Args: 44 | xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). 45 | ds (LongTensor): Batch of durations of each frame (B, T). 46 | alpha (float, optional): Alpha value to control speed of speech. 47 | 48 | Returns: 49 | Tensor: replicated input tensor based on durations (B, T*, D). 50 | 51 | """ 52 | if alpha != 1.0: 53 | assert alpha > 0 54 | ds = torch.round(ds.float() * alpha).long() 55 | 56 | if ds.sum() == 0: 57 | logging.warning( 58 | "predicted durations includes all 0 sequences. " 59 | "fill the first element with 1." 60 | ) 61 | # NOTE(kan-bayashi): This case must not be happened in teacher forcing. 62 | # It will be happened in inference with a bad duration predictor. 63 | # So we do not need to care the padded sequence case here. 
64 | ds[ds.sum(dim=1).eq(0)] = 1 65 | 66 | repeat = [torch.repeat_interleave(x, d, dim=0) for x, d in zip(xs, ds)] 67 | return pad_list(repeat, self.pad_value) 68 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/predictor/variance_predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 5 | 6 | """Variance predictor related modules.""" 7 | 8 | import torch 9 | from typeguard import check_argument_types 10 | 11 | from deepaudio.tts.modules.layer_norm import LayerNorm 12 | 13 | 14 | class VariancePredictor(torch.nn.Module): 15 | """Variance predictor module. 16 | 17 | This is a module of variacne predictor described in `FastSpeech 2: 18 | Fast and High-Quality End-to-End Text to Speech`_. 19 | 20 | .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: 21 | https://arxiv.org/abs/2006.04558 22 | 23 | """ 24 | 25 | def __init__( 26 | self, 27 | idim: int, 28 | n_layers: int = 2, 29 | n_chans: int = 384, 30 | kernel_size: int = 3, 31 | bias: bool = True, 32 | dropout_rate: float = 0.5, 33 | ): 34 | """Initilize duration predictor module. 35 | 36 | Args: 37 | idim (int): Input dimension. 38 | n_layers (int): Number of convolutional layers. 39 | n_chans (int): Number of channels of convolutional layers. 40 | kernel_size (int): Kernel size of convolutional layers. 41 | dropout_rate (float): Dropout rate. 42 | 43 | """ 44 | assert check_argument_types() 45 | super().__init__() 46 | self.conv = torch.nn.ModuleList() 47 | for idx in range(n_layers): 48 | in_chans = idim if idx == 0 else n_chans 49 | self.conv += [ 50 | torch.nn.Sequential( 51 | torch.nn.Conv1d( 52 | in_chans, 53 | n_chans, 54 | kernel_size, 55 | stride=1, 56 | padding=(kernel_size - 1) // 2, 57 | bias=bias, 58 | ), 59 | torch.nn.ReLU(), 60 | LayerNorm(n_chans, dim=1), 61 | torch.nn.Dropout(dropout_rate), 62 | ) 63 | ] 64 | self.linear = torch.nn.Linear(n_chans, 1) 65 | 66 | def forward(self, xs: torch.Tensor, x_masks: torch.Tensor = None) -> torch.Tensor: 67 | """Calculate forward propagation. 68 | 69 | Args: 70 | xs (Tensor): Batch of input sequences (B, Tmax, idim). 71 | x_masks (ByteTensor): Batch of masks indicating padded part (B, Tmax). 72 | 73 | Returns: 74 | Tensor: Batch of predicted sequences (B, Tmax, 1). 75 | 76 | """ 77 | xs = xs.transpose(1, -1) # (B, idim, Tmax) 78 | for f in self.conv: 79 | xs = f(xs) # (B, C, Tmax) 80 | 81 | xs = self.linear(xs.transpose(1, 2)) # (B, Tmax, 1) 82 | 83 | if x_masks is not None: 84 | xs = xs.masked_fill(x_masks, 0.0) 85 | 86 | return xs 87 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/residual_stack.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Modified from espnet(https://github.com/espnet/espnet) 15 | """Residual stack module in MelGAN.""" 16 | from typing import Any 17 | from typing import Dict 18 | 19 | from torch import nn 20 | 21 | from deepaudio.tts.modules.activation import get_activation 22 | from deepaudio.tts.modules.causal_conv import CausalConv1D 23 | 24 | 25 | class ResidualStack(nn.Module): 26 | """Residual stack module introduced in MelGAN.""" 27 | 28 | def __init__( 29 | self, 30 | kernel_size: int=3, 31 | channels: int=32, 32 | dilation: int=1, 33 | bias: bool=True, 34 | nonlinear_activation: str="leakyrelu", 35 | nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2}, 36 | pad: str="ReflectionPad1d", 37 | pad_params: Dict[str, Any]={}, 38 | use_causal_conv: bool=False, ): 39 | """Initialize ResidualStack module. 40 | 41 | Args: 42 | kernel_size (int): Kernel size of dilation convolution layer. 43 | channels (int): Number of channels of convolution layers. 44 | dilation (int): Dilation factor. 45 | bias (bool): Whether to add bias parameter in convolution layers. 46 | nonlinear_activation (str): Activation function module name. 47 | nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function. 48 | pad (str): Name of the torch.nn padding layer used before the dilated convolution layer. 49 | pad_params (Dict[str, Any]): Hyperparameters for padding function. 50 | use_causal_conv (bool): Whether to use causal convolution. 51 | """ 52 | super().__init__() 53 | # for compatibility 54 | if nonlinear_activation: 55 | nonlinear_activation = nonlinear_activation.lower() 56 | 57 | # define residual stack part 58 | if not use_causal_conv: 59 | assert (kernel_size - 1 60 | ) % 2 == 0, "Even number kernel size is not supported." 61 | self.stack = nn.Sequential( 62 | get_activation(nonlinear_activation, 63 | **nonlinear_activation_params), 64 | getattr(nn, pad)((kernel_size - 1) // 2 * dilation, 65 | **pad_params), 66 | nn.Conv1d( 67 | channels, 68 | channels, 69 | kernel_size, 70 | dilation=dilation, 71 | bias=bias), 72 | get_activation(nonlinear_activation, 73 | **nonlinear_activation_params), 74 | nn.Conv1d(channels, channels, 1, bias=bias), ) 75 | else: 76 | self.stack = nn.Sequential( 77 | get_activation(nonlinear_activation, 78 | **nonlinear_activation_params), 79 | CausalConv1D( 80 | channels, 81 | channels, 82 | kernel_size, 83 | dilation=dilation, 84 | bias=bias, 85 | pad=pad, 86 | pad_params=pad_params, ), 87 | get_activation(nonlinear_activation, 88 | **nonlinear_activation_params), 89 | nn.Conv1d(channels, channels, 1, bias=bias), ) 90 | 91 | # define extra layer for skip connection 92 | self.skip_layer = nn.Conv1d(channels, channels, 1, bias=bias) 93 | 94 | def forward(self, c): 95 | """Calculate forward propagation. 96 | 97 | Args: 98 | c (Tensor): Input tensor (B, channels, T). 99 | Returns: 100 | Tensor: Output tensor (B, channels, T). 101 | """ 102 | return self.stack(c) + self.skip_layer(c) 103 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/encoder_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Encoder self-attention layer definition.""" 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from .layer_norm import LayerNorm 13 | 14 | 15 | class EncoderLayer(nn.Module): 16 | """Encoder layer module. 17 | 18 | Args: 19 | size (int): Input dimension. 20 | self_attn (torch.nn.Module): Self-attention module instance. 21 | `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance 22 | can be used as the argument. 23 | feed_forward (torch.nn.Module): Feed-forward module instance. 24 | `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance 25 | can be used as the argument. 26 | dropout_rate (float): Dropout rate. 27 | normalize_before (bool): Whether to use layer_norm before the first block. 28 | concat_after (bool): Whether to concat attention layer's input and output. 29 | if True, additional linear will be applied. 30 | i.e. x -> x + linear(concat(x, att(x))) 31 | if False, no additional linear will be applied. i.e. x -> x + att(x) 32 | stochastic_depth_rate (float): Proability to skip this layer. 33 | During training, the layer may skip residual computation and return input 34 | as-is with given probability. 
35 | """ 36 | 37 | def __init__( 38 | self, 39 | size, 40 | self_attn, 41 | feed_forward, 42 | dropout_rate, 43 | normalize_before=True, 44 | concat_after=False, 45 | stochastic_depth_rate=0.0, 46 | ): 47 | """Construct an EncoderLayer object.""" 48 | super(EncoderLayer, self).__init__() 49 | self.self_attn = self_attn 50 | self.feed_forward = feed_forward 51 | self.norm1 = LayerNorm(size) 52 | self.norm2 = LayerNorm(size) 53 | self.dropout = nn.Dropout(dropout_rate) 54 | self.size = size 55 | self.normalize_before = normalize_before 56 | self.concat_after = concat_after 57 | if self.concat_after: 58 | self.concat_linear = nn.Linear(size + size, size) 59 | self.stochastic_depth_rate = stochastic_depth_rate 60 | 61 | def forward(self, x, mask, cache=None): 62 | """Compute encoded features. 63 | 64 | Args: 65 | x_input (torch.Tensor): Input tensor (#batch, time, size). 66 | mask (torch.Tensor): Mask tensor for the input (#batch, time). 67 | cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). 68 | 69 | Returns: 70 | torch.Tensor: Output tensor (#batch, time, size). 71 | torch.Tensor: Mask tensor (#batch, time). 72 | 73 | """ 74 | skip_layer = False 75 | # with stochastic depth, residual connection `x + f(x)` becomes 76 | # `x <- x + 1 / (1 - p) * f(x)` at training time. 77 | stoch_layer_coeff = 1.0 78 | if self.training and self.stochastic_depth_rate > 0: 79 | skip_layer = torch.rand(1).item() < self.stochastic_depth_rate 80 | stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) 81 | 82 | if skip_layer: 83 | if cache is not None: 84 | x = torch.cat([cache, x], dim=1) 85 | return x, mask 86 | 87 | residual = x 88 | if self.normalize_before: 89 | x = self.norm1(x) 90 | 91 | if cache is None: 92 | x_q = x 93 | else: 94 | assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) 95 | x_q = x[:, -1:, :] 96 | residual = residual[:, -1:, :] 97 | mask = None if mask is None else mask[:, -1:, :] 98 | 99 | if self.concat_after: 100 | x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1) 101 | x = residual + stoch_layer_coeff * self.concat_linear(x_concat) 102 | else: 103 | x = residual + stoch_layer_coeff * self.dropout( 104 | self.self_attn(x_q, x, x, mask) 105 | ) 106 | if not self.normalize_before: 107 | x = self.norm1(x) 108 | 109 | residual = x 110 | if self.normalize_before: 111 | x = self.norm2(x) 112 | x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) 113 | if not self.normalize_before: 114 | x = self.norm2(x) 115 | 116 | if cache is not None: 117 | x = torch.cat([cache, x], dim=1) 118 | 119 | return x, mask 120 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/layer_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Layer normalization module.""" 8 | 9 | import torch 10 | 11 | 12 | class LayerNorm(torch.nn.LayerNorm): 13 | """Layer normalization module. 14 | 15 | Args: 16 | nout (int): Output dim size. 17 | dim (int): Dimension to be normalized. 18 | 19 | """ 20 | 21 | def __init__(self, nout, dim=-1): 22 | """Construct an LayerNorm object.""" 23 | super(LayerNorm, self).__init__(nout, eps=1e-12) 24 | self.dim = dim 25 | 26 | def forward(self, x): 27 | """Apply layer normalization. 28 | 29 | Args: 30 | x (torch.Tensor): Input tensor. 
31 | 32 | Returns: 33 | torch.Tensor: Normalized tensor. 34 | 35 | """ 36 | if self.dim == -1: 37 | return super(LayerNorm, self).forward(x) 38 | return ( 39 | super(LayerNorm, self) 40 | .forward(x.transpose(self.dim, -1)) 41 | .transpose(self.dim, -1) 42 | ) 43 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/lightconv.py: -------------------------------------------------------------------------------- 1 | """Lightweight Convolution Module.""" 2 | 3 | import numpy 4 | import torch 5 | import torch.nn.functional as F 6 | from torch import nn 7 | 8 | MIN_VALUE = float(numpy.finfo(numpy.float32).min) 9 | 10 | 11 | class LightweightConvolution(nn.Module): 12 | """Lightweight Convolution layer. 13 | 14 | This implementation is based on 15 | https://github.com/pytorch/fairseq/tree/master/fairseq 16 | 17 | Args: 18 | wshare (int): the number of kernel of convolution 19 | n_feat (int): the number of features 20 | dropout_rate (float): dropout_rate 21 | kernel_size (int): kernel size (length) 22 | use_kernel_mask (bool): Use causal mask or not for convolution kernel 23 | use_bias (bool): Use bias term or not. 24 | 25 | """ 26 | 27 | def __init__( 28 | self, 29 | wshare, 30 | n_feat, 31 | dropout_rate, 32 | kernel_size, 33 | use_kernel_mask=False, 34 | use_bias=False, 35 | ): 36 | """Construct Lightweight Convolution layer.""" 37 | super(LightweightConvolution, self).__init__() 38 | 39 | assert n_feat % wshare == 0 40 | self.wshare = wshare 41 | self.use_kernel_mask = use_kernel_mask 42 | self.dropout_rate = dropout_rate 43 | self.kernel_size = kernel_size 44 | self.padding_size = int(kernel_size / 2) 45 | 46 | # linear -> GLU -> lightconv -> linear 47 | self.linear1 = nn.Linear(n_feat, n_feat * 2) 48 | self.linear2 = nn.Linear(n_feat, n_feat) 49 | self.act = nn.GLU() 50 | 51 | # lightconv related 52 | self.weight = nn.Parameter( 53 | torch.Tensor(self.wshare, 1, kernel_size).uniform_(0, 1) 54 | ) 55 | self.use_bias = use_bias 56 | if self.use_bias: 57 | self.bias = nn.Parameter(torch.Tensor(n_feat)) 58 | 59 | # mask of kernel 60 | kernel_mask0 = torch.zeros(self.wshare, int(kernel_size / 2)) 61 | kernel_mask1 = torch.ones(self.wshare, int(kernel_size / 2 + 1)) 62 | self.kernel_mask = torch.cat((kernel_mask1, kernel_mask0), dim=-1).unsqueeze(1) 63 | 64 | def forward(self, query, key, value, mask): 65 | """Forward of 'Lightweight Convolution'. 66 | 67 | This function takes query, key and value but uses only query. 
68 | This is just for compatibility with self-attention layer (attention.py) 69 | 70 | Args: 71 | query (torch.Tensor): (batch, time1, d_model) input tensor 72 | key (torch.Tensor): (batch, time2, d_model) NOT USED 73 | value (torch.Tensor): (batch, time2, d_model) NOT USED 74 | mask (torch.Tensor): (batch, time1, time2) mask 75 | 76 | Return: 77 | x (torch.Tensor): (batch, time1, d_model) output 78 | 79 | """ 80 | # linear -> GLU -> lightconv -> linear 81 | x = query 82 | B, T, C = x.size() 83 | H = self.wshare 84 | 85 | # first liner layer 86 | x = self.linear1(x) 87 | 88 | # GLU activation 89 | x = self.act(x) 90 | 91 | # lightconv 92 | x = x.transpose(1, 2).contiguous().view(-1, H, T) # B x C x T 93 | weight = F.dropout(self.weight, self.dropout_rate, training=self.training) 94 | if self.use_kernel_mask: 95 | self.kernel_mask = self.kernel_mask.to(x.device) 96 | weight = weight.masked_fill(self.kernel_mask == 0.0, float("-inf")) 97 | weight = F.softmax(weight, dim=-1) 98 | x = F.conv1d(x, weight, padding=self.padding_size, groups=self.wshare).view( 99 | B, C, T 100 | ) 101 | if self.use_bias: 102 | x = x + self.bias.view(1, -1, 1) 103 | x = x.transpose(1, 2) # B x T x C 104 | 105 | if mask is not None and not self.use_kernel_mask: 106 | mask = mask.transpose(-1, -2) 107 | x = x.masked_fill(mask == 0, 0.0) 108 | 109 | # second linear layer 110 | x = self.linear2(x) 111 | return x 112 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/mask.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Shigeki Karita 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | """Mask module.""" 5 | 6 | import torch 7 | 8 | 9 | def subsequent_mask(size, device="cpu", dtype=torch.bool): 10 | """Create mask for subsequent steps (size, size). 11 | 12 | :param int size: size of mask 13 | :param str device: "cpu" or "cuda" or torch.Tensor.device 14 | :param torch.dtype dtype: result dtype 15 | :rtype: torch.Tensor 16 | >>> subsequent_mask(3) 17 | [[1, 0, 0], 18 | [1, 1, 0], 19 | [1, 1, 1]] 20 | """ 21 | ret = torch.ones(size, size, device=device, dtype=dtype) 22 | return torch.tril(ret, out=ret) 23 | 24 | 25 | def target_mask(ys_in_pad, ignore_id): 26 | """Create mask for decoder self-attention. 27 | 28 | :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax) 29 | :param int ignore_id: index of padding 30 | :param torch.dtype dtype: result dtype 31 | :rtype: torch.Tensor (B, Lmax, Lmax) 32 | """ 33 | ys_mask = ys_in_pad != ignore_id 34 | m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0) 35 | return ys_mask.unsqueeze(-2) & m 36 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/multi_layer_conv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Tomoki Hayashi 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" 8 | 9 | import torch 10 | 11 | 12 | class MultiLayeredConv1d(torch.nn.Module): 13 | """Multi-layered conv1d for Transformer block. 14 | 15 | This is a module of multi-leyered conv1d designed 16 | to replace positionwise feed-forward network 17 | in Transforner block, which is introduced in 18 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 
19 | 20 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 21 | https://arxiv.org/pdf/1905.09263.pdf 22 | 23 | """ 24 | 25 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 26 | """Initialize MultiLayeredConv1d module. 27 | 28 | Args: 29 | in_chans (int): Number of input channels. 30 | hidden_chans (int): Number of hidden channels. 31 | kernel_size (int): Kernel size of conv1d. 32 | dropout_rate (float): Dropout rate. 33 | 34 | """ 35 | super(MultiLayeredConv1d, self).__init__() 36 | self.w_1 = torch.nn.Conv1d( 37 | in_chans, 38 | hidden_chans, 39 | kernel_size, 40 | stride=1, 41 | padding=(kernel_size - 1) // 2, 42 | ) 43 | self.w_2 = torch.nn.Conv1d( 44 | hidden_chans, 45 | in_chans, 46 | kernel_size, 47 | stride=1, 48 | padding=(kernel_size - 1) // 2, 49 | ) 50 | self.dropout = torch.nn.Dropout(dropout_rate) 51 | 52 | def forward(self, x): 53 | """Calculate forward propagation. 54 | 55 | Args: 56 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 57 | 58 | Returns: 59 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 60 | 61 | """ 62 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 63 | return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1) 64 | 65 | 66 | class Conv1dLinear(torch.nn.Module): 67 | """Conv1D + Linear for Transformer block. 68 | 69 | A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. 70 | 71 | """ 72 | 73 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 74 | """Initialize Conv1dLinear module. 75 | 76 | Args: 77 | in_chans (int): Number of input channels. 78 | hidden_chans (int): Number of hidden channels. 79 | kernel_size (int): Kernel size of conv1d. 80 | dropout_rate (float): Dropout rate. 81 | 82 | """ 83 | super(Conv1dLinear, self).__init__() 84 | self.w_1 = torch.nn.Conv1d( 85 | in_chans, 86 | hidden_chans, 87 | kernel_size, 88 | stride=1, 89 | padding=(kernel_size - 1) // 2, 90 | ) 91 | self.w_2 = torch.nn.Linear(hidden_chans, in_chans) 92 | self.dropout = torch.nn.Dropout(dropout_rate) 93 | 94 | def forward(self, x): 95 | """Calculate forward propagation. 96 | 97 | Args: 98 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 99 | 100 | Returns: 101 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 102 | 103 | """ 104 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 105 | return self.w_2(self.dropout(x)) 106 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/positionwise_feed_forward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Positionwise feed forward layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class PositionwiseFeedForward(torch.nn.Module): 13 | """Positionwise feed forward layer. 14 | 15 | Args: 16 | idim (int): Input dimenstion. 17 | hidden_units (int): The number of hidden units. 18 | dropout_rate (float): Dropout rate. 
19 | 20 | """ 21 | 22 | def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()): 23 | """Construct an PositionwiseFeedForward object.""" 24 | super(PositionwiseFeedForward, self).__init__() 25 | self.w_1 = torch.nn.Linear(idim, hidden_units) 26 | self.w_2 = torch.nn.Linear(hidden_units, idim) 27 | self.dropout = torch.nn.Dropout(dropout_rate) 28 | self.activation = activation 29 | 30 | def forward(self, x): 31 | """Forward function.""" 32 | return self.w_2(self.dropout(self.activation(self.w_1(x)))) 33 | -------------------------------------------------------------------------------- /deepaudio/tts/modules/transformer/repeat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2019 Shigeki Karita 5 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 6 | 7 | """Repeat the same layer definition.""" 8 | 9 | import torch 10 | 11 | 12 | class MultiSequential(torch.nn.Sequential): 13 | """Multi-input multi-output torch.nn.Sequential.""" 14 | 15 | def forward(self, *args): 16 | """Repeat.""" 17 | for m in self: 18 | args = m(*args) 19 | return args 20 | 21 | 22 | def repeat(N, fn): 23 | """Repeat module N times. 24 | 25 | Args: 26 | N (int): Number of repeat time. 27 | fn (Callable): Function to generate module. 28 | 29 | Returns: 30 | MultiSequential: Repeated model instance. 31 | 32 | """ 33 | return MultiSequential(*[fn(n) for n in range(N)]) 34 | -------------------------------------------------------------------------------- /deepaudio/tts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from . import display 15 | 16 | 17 | def str2bool(str): 18 | return True if str.lower() == 'true' else False 19 | -------------------------------------------------------------------------------- /deepaudio/tts/utils/display.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
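The repeat helper above is the piece that chains encoder/decoder layers into one module. A self-contained sketch (layer sizes chosen arbitrarily) combining it with the EncoderLayer, LightweightConvolution, and PositionwiseFeedForward modules shown earlier:

import torch

from deepaudio.tts.modules.transformer.encoder_layer import EncoderLayer
from deepaudio.tts.modules.transformer.lightconv import LightweightConvolution
from deepaudio.tts.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from deepaudio.tts.modules.transformer.repeat import repeat

adim = 256  # illustrative attention dimension
encoders = repeat(
    3,
    lambda n: EncoderLayer(
        size=adim,
        self_attn=LightweightConvolution(
            wshare=4, n_feat=adim, dropout_rate=0.1, kernel_size=31),
        feed_forward=PositionwiseFeedForward(
            idim=adim, hidden_units=1024, dropout_rate=0.1),
        dropout_rate=0.1,
    ),
)

x = torch.randn(2, 37, adim)                    # (batch, time, adim)
mask = torch.ones(2, 1, 37, dtype=torch.bool)   # all positions valid
y, out_mask = encoders(x, mask)
print(y.shape)                                  # torch.Size([2, 37, 256])

Each EncoderLayer returns an (output, mask) tuple, which is exactly what MultiSequential expects when it unpacks the arguments for the next layer in the stack.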
14 | import librosa.display 15 | import matplotlib.pylab as plt 16 | 17 | __all__ = [ 18 | "plot_alignment", 19 | "plot_spectrogram", 20 | "plot_waveform", 21 | "plot_multihead_alignments", 22 | "plot_multilayer_multihead_alignments", 23 | ] 24 | 25 | 26 | def plot_alignment(alignment, title=None): 27 | # alignment: [encoder_steps, decoder_steps) 28 | fig, ax = plt.subplots(figsize=(6, 4)) 29 | im = ax.imshow( 30 | alignment, aspect='auto', origin='lower', interpolation='none') 31 | fig.colorbar(im, ax=ax) 32 | xlabel = 'Decoder timestep' 33 | if title is not None: 34 | xlabel += '\n\n' + title 35 | plt.xlabel(xlabel) 36 | plt.ylabel('Encoder timestep') 37 | plt.tight_layout() 38 | return fig 39 | 40 | 41 | def plot_multihead_alignments(alignments, title=None): 42 | # alignments: [N, encoder_steps, decoder_steps) 43 | num_subplots = alignments.shape[0] 44 | 45 | fig, axes = plt.subplots( 46 | figsize=(6 * num_subplots, 4), 47 | ncols=num_subplots, 48 | sharey=True, 49 | squeeze=True) 50 | for i, ax in enumerate(axes): 51 | im = ax.imshow( 52 | alignments[i], aspect='auto', origin='lower', interpolation='none') 53 | fig.colorbar(im, ax=ax) 54 | xlabel = 'Decoder timestep' 55 | if title is not None: 56 | xlabel += '\n\n' + title 57 | ax.set_xlabel(xlabel) 58 | if i == 0: 59 | ax.set_ylabel('Encoder timestep') 60 | plt.tight_layout() 61 | return fig 62 | 63 | 64 | def plot_multilayer_multihead_alignments(alignments, title=None): 65 | # alignments: [num_layers, num_heads, encoder_steps, decoder_steps) 66 | num_layers, num_heads, *_ = alignments.shape 67 | 68 | fig, axes = plt.subplots( 69 | figsize=(6 * num_heads, 4 * num_layers), 70 | nrows=num_layers, 71 | ncols=num_heads, 72 | sharex=True, 73 | sharey=True, 74 | squeeze=True) 75 | for i, row in enumerate(axes): 76 | for j, ax in enumerate(row): 77 | im = ax.imshow( 78 | alignments[i, j], 79 | aspect='auto', 80 | origin='lower', 81 | interpolation='none') 82 | fig.colorbar(im, ax=ax) 83 | xlabel = 'Decoder timestep' 84 | if title is not None: 85 | xlabel += '\n\n' + title 86 | if i == num_layers - 1: 87 | ax.set_xlabel(xlabel) 88 | if j == 0: 89 | ax.set_ylabel('Encoder timestep') 90 | plt.tight_layout() 91 | return fig 92 | 93 | 94 | def plot_spectrogram(spec): 95 | # spec: [C, T] librosa convention 96 | fig, ax = plt.subplots(figsize=(12, 3)) 97 | im = ax.imshow(spec, aspect="auto", origin="lower", interpolation='none') 98 | plt.colorbar(im, ax=ax) 99 | plt.xlabel("Frames") 100 | plt.ylabel("Channels") 101 | plt.tight_layout() 102 | return fig 103 | 104 | 105 | def plot_waveform(wav, sr=22050): 106 | fig, ax = plt.subplots(figsize=(12, 3)) 107 | im = librosa.display.waveplot(wav, sr=22050) 108 | plt.colorbar(im, ax=ax) 109 | plt.tight_layout() 110 | return fig 111 | -------------------------------------------------------------------------------- /deepaudio/tts/utils/h5_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import logging 15 | import sys 16 | from pathlib import Path 17 | from typing import Any 18 | from typing import Union 19 | 20 | import h5py 21 | import numpy as np 22 | 23 | 24 | def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: 25 | """Read a dataset from a HDF5 file. 26 | Args: 27 | filename (Union[Path, str]): Path of the HDF5 file. 28 | dataset_name (str): Name of the dataset to read. 29 | 30 | Returns: 31 | Any: The retrieved dataset. 32 | """ 33 | filename = Path(filename) 34 | 35 | if not filename.exists(): 36 | logging.error(f"There is no such a hdf5 file ({filename}).") 37 | sys.exit(1) 38 | 39 | hdf5_file = h5py.File(filename, "r") 40 | 41 | if dataset_name not in hdf5_file: 42 | logging.error(f"There is no such a data in hdf5 file. ({dataset_name})") 43 | sys.exit(1) 44 | 45 | # [()]: a special syntax of h5py to get the dataset as-is 46 | hdf5_data = hdf5_file[dataset_name][()] 47 | hdf5_file.close() 48 | 49 | return hdf5_data 50 | 51 | 52 | def write_hdf5(filename: Union[Path, str], 53 | dataset_name: str, 54 | write_data: np.ndarray, 55 | is_overwrite: bool=True) -> None: 56 | """Write dataset to HDF5 file. 57 | Args: 58 | filename (Union[Path, str]): Path of the HDF5 file. 59 | dataset_name (str): Name of the dataset to write to. 60 | write_data (np.ndarrays): The data to write. 61 | is_overwrite (bool, optional): Whether to overwrite, by default True 62 | """ 63 | # convert to numpy array 64 | filename = Path(filename) 65 | write_data = np.array(write_data) 66 | 67 | # check folder existence 68 | filename.parent.mkdir(parents=True, exist_ok=True) 69 | 70 | # check hdf5 existence 71 | if filename.exists(): 72 | # if already exists, open with r+ mode 73 | hdf5_file = h5py.File(filename, "r+") 74 | # check dataset existence 75 | if dataset_name in hdf5_file: 76 | if is_overwrite: 77 | logging.warning("Dataset in hdf5 file already exists. " 78 | "recreate dataset in hdf5.") 79 | hdf5_file.__delitem__(dataset_name) 80 | else: 81 | logging.error( 82 | "Dataset in hdf5 file already exists. " 83 | "if you want to overwrite, please set is_overwrite = True.") 84 | hdf5_file.close() 85 | sys.exit(1) 86 | else: 87 | # if not exists, open with w mode 88 | hdf5_file = h5py.File(filename, "w") 89 | 90 | # write data to hdf5 91 | hdf5_file.create_dataset(dataset_name, data=write_data) 92 | hdf5_file.flush() 93 | hdf5_file.close() 94 | --------------------------------------------------------------------------------
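To close, a minimal round-trip sketch for the HDF5 helpers above; the file path and dataset name are only illustrative:

import numpy as np

from deepaudio.tts.utils.h5_utils import read_hdf5, write_hdf5

mel = np.random.randn(120, 80).astype(np.float32)   # illustrative feature matrix
write_hdf5("dump/sample-feats.h5", "mel", mel)       # parent directories are created if needed
restored = read_hdf5("dump/sample-feats.h5", "mel")
assert np.allclose(mel, restored)

# writing the same dataset again with is_overwrite=True (the default) replaces it in place
write_hdf5("dump/sample-feats.h5", "mel", mel * 0.5)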