├── .gitignore ├── LICENSE ├── README.md ├── hparams.py ├── preprocess.py └── tts ├── __init__.py ├── dataset ├── __init__.py ├── ljspeech.py └── utils.py ├── frontend ├── __init__.py ├── en │ └── __init__.py └── text │ ├── __init__.py │ ├── cleaners.py │ ├── cmudict.py │ ├── numbers.py │ └── symbols.py └── preprocess ├── __init__.py ├── audio.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # dataset 107 | data_dir 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text2Speech-Pytorch
2 | A Text2Speech engine for PyTorch.
3 | 
4 | ## NOTE: Heavily a Work in Progress
5 | 
6 | This repo will hold various TTS modules, from frontends such as Tacotron and Deep Voice to neural vocoder backends such as WaveNet and WaveRNN.
7 | 
8 | The main goal is to have everything in one place, in one framework, and with a more modular structure.
9 | 
10 | More importantly, this is for self-learning as I implement the various models and architectures.
11 | 
12 | Contributions are welcome!
13 | 
14 | # Features (nice to have)
15 | * single-speaker/multi-speaker dataset support
16 | * modular components
17 | * support for prototyping new models (i.e. reusable modules for seq2seq, attention, fully convolutional nets, transformers, etc.)
--------------------------------------------------------------------------------
/hparams.py:
--------------------------------------------------------------------------------
1 | class hparams:
2 |     # audio parameters
3 |     num_mels = 80
4 |     fmin = 125
5 |     fmax = 7600
6 |     fft_size = 1024
7 |     hop_size = 256
8 |     sample_rate = 22050
9 |     preemphasis = 0.97
10 |     min_level_db = -100
11 |     ref_level_db = 20
12 |     rescaling = False
13 |     rescaling_max = 0.999
14 |     allow_clipping_in_normalization = True
15 | 
16 |     max_iters = 200
17 |     griffin_lim_iters = 60
18 |     power = 1.5
19 | 
20 |     # preprocessing parameters
21 |     min_text = 20
22 | 
23 |     # general parameters
24 |     language = "en"
25 | 
26 |     # training parameters
27 |     replace_pronunciation_prob = 1.0
28 |     outputs_per_step = 5
29 |     batch_size = 60
30 |     batch_split = int(batch_size * 0.5)
31 |     epochs = 2000
32 |     guided_attention_ratio = 0.995
33 |     teacher_forcing_ratio = 0.5
34 |     attention_scale = 15
35 |     attention_decay = 0.997
--------------------------------------------------------------------------------
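As a quick orientation, the STFT settings above fix the model's frame rate. A minimal sketch (run from the repo root) of the same hop_size/sample_rate arithmetic that preprocess.py below uses to report dataset hours:

```python
from hparams import hparams

# Each spectrogram frame advances by hop_size samples.
frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000  # 256 / 22050 * 1000 ~= 11.6 ms
frames_per_second = hparams.sample_rate / hparams.hop_size      # ~= 86.1 frames per second

print('frame shift: %.2f ms (%.1f frames/s)' % (frame_shift_ms, frames_per_second))
```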
/preprocess.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Preprocess dataset
4 | 
5 | usage: preprocess.py [options] <name> <in_dir> <out_dir>
6 | 
7 | options:
8 |     --num_workers=<n>        Num workers.
9 |     --hparams=<params>       Hyper parameters [default: ].
10 |     -h, --help               Show help message.
11 | """
12 | from docopt import docopt
13 | import os
14 | from multiprocessing import cpu_count
15 | from tqdm import tqdm
16 | from hparams import hparams
17 | import importlib
18 | 
19 | 
20 | def preprocess(mode, in_dir, out_dir, num_workers):
21 |     """Run the dataset module's build_from_path and write its metadata to out_dir.
22 |     """
23 |     os.makedirs(out_dir, exist_ok=True)
24 |     metadata = mode.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm)
25 |     write_metadata(metadata, out_dir)
26 | 
27 | 
28 | def write_metadata(metadata, out_dir):
29 |     with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
30 |         for m in metadata:
31 |             f.write('|'.join([str(x) for x in m]) + '\n')
32 |     frames = sum([m[2] for m in metadata])
33 |     frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000
34 |     hours = frames * frame_shift_ms / (3600 * 1000)
35 |     print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
36 |     print('Max input length: %d' % max(len(m[3]) for m in metadata))
37 |     print('Max output length: %d' % max(m[2] for m in metadata))
38 | 
39 | 
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     args = docopt(__doc__)
44 |     name = args["<name>"]
45 |     in_dir = args["<in_dir>"]
46 |     out_dir = args["<out_dir>"]
47 |     num_workers = args["--num_workers"]
48 |     num_workers = cpu_count() if num_workers is None else int(num_workers)
49 | 
50 |     # check that the dataset name is one of the supported ones
51 |     assert name in ["ljspeech"]
52 |     # import the matching dataset module
53 |     mode = importlib.import_module("tts.dataset." + name)
54 |     # preprocess
55 |     preprocess(mode, in_dir, out_dir, num_workers)
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/__init__.py
--------------------------------------------------------------------------------
/tts/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/dataset/__init__.py
--------------------------------------------------------------------------------
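With the docopt placeholders restored, the CLI form is `python preprocess.py ljspeech <in_dir> <out_dir>`. Equivalently, a minimal sketch of driving the same pipeline programmatically (the LJSpeech and output paths here are hypothetical):

```python
import importlib
from multiprocessing import cpu_count

from preprocess import preprocess  # the function defined above

# "ljspeech" is the only dataset name currently accepted by preprocess.py.
mode = importlib.import_module("tts.dataset.ljspeech")
preprocess(mode, in_dir="LJSpeech-1.1", out_dir="data_dir", num_workers=cpu_count())
```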
/tts/dataset/ljspeech.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | from functools import partial
3 | import numpy as np
4 | import os
5 | from tts.preprocess import audio
6 | from hparams import hparams
7 | 
8 | def build_from_path(in_dir, out_dir, num_workers=4, tqdm=lambda x: x):
9 |     '''Preprocesses the LJ Speech dataset from a given input path into a given output directory.
10 | 
11 |     Args:
12 |         in_dir: The directory where you have downloaded the LJ Speech dataset
13 |         out_dir: The directory to write the output into
14 |         num_workers: Optional number of worker processes to parallelize across
15 |         tqdm: You can optionally pass tqdm to get a nice progress bar
16 | 
17 |     Returns:
18 |         A list of tuples describing the training examples. This should be written to train.txt
19 |     '''
20 | 
21 |     # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you
22 |     # can omit it and just call _process_utterance on each input if you want.
23 |     executor = ProcessPoolExecutor(max_workers=num_workers)
24 |     futures = []
25 |     index = 1
26 |     with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
27 |         for line in f:
28 |             parts = line.strip().split('|')
29 |             wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
30 |             text = parts[2]
31 |             if len(text) < hparams.min_text:
32 |                 continue
33 |             futures.append(executor.submit(
34 |                 partial(_process_utterance, out_dir, index, wav_path, text)))
35 |             index += 1
36 |     return [future.result() for future in tqdm(futures)]
37 | 
38 | def _process_utterance(out_dir, index, wav_path, text):
39 |     '''Preprocesses a single utterance audio/text pair.
40 | 
41 |     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
42 |     to the train.txt file.
43 | 
44 |     Args:
45 |         out_dir: The directory to write the spectrograms into
46 |         index: The numeric index to use in the spectrogram filenames.
47 |         wav_path: Path to the audio file containing the speech input
48 |         text: The text spoken in the input audio file
49 | 
50 |     Returns:
51 |         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
52 |     '''
53 | 
54 |     # Load the audio to a numpy array:
55 |     wav = audio.load_wav(wav_path)
56 | 
57 |     if hparams.rescaling:
58 |         wav = wav / np.abs(wav).max() * hparams.rescaling_max
59 | 
60 |     # Compute the linear-scale spectrogram from the wav:
61 |     spectrogram = audio.spectrogram(wav).astype(np.float32)
62 |     n_frames = spectrogram.shape[1]
63 | 
64 |     # Compute a mel-scale spectrogram from the wav:
65 |     mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
66 | 
67 | 
68 |     # Write the spectrograms to disk:
69 |     spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
70 |     mel_filename = 'ljspeech-mel-%05d.npy' % index
71 |     np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
72 |     np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
73 | 
74 |     # Return a tuple describing this training example:
75 |     return (spectrogram_filename, mel_filename, n_frames, text)
--------------------------------------------------------------------------------
/tts/dataset/utils.py:
--------------------------------------------------------------------------------
1 | from nnmnkwii.datasets import FileSourceDataset, FileDataSource
2 | import numpy as np
3 | 
4 | def _pad(seq, max_len, constant_values=0):
5 |     return np.pad(seq, (0, max_len - len(seq)),
6 |                   mode='constant', constant_values=constant_values)
7 | 
--------------------------------------------------------------------------------
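dataset/utils.py only holds the `_pad` helper so far. A minimal sketch of how it would be used to batch variable-length encoded texts (the collate function and sample arrays are hypothetical, not part of the repo):

```python
import numpy as np
from tts.dataset.utils import _pad

def collate_texts(batch):
    """Right-pad a list of 1-D integer sequences to a common length."""
    max_len = max(len(seq) for seq in batch)
    return np.stack([_pad(seq, max_len) for seq in batch])

batch = [np.array([3, 1, 4, 1, 5]), np.array([9, 2, 6])]
print(collate_texts(batch))  # second row is padded with the constant 0
```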
/tts/frontend/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | 
3 | """Text processing frontend
4 | 
5 | All frontend modules should have the following functions:
6 | 
7 | - text_to_sequence(text, p)
8 | - sequence_to_text(sequence)
9 | 
10 | and the property:
11 | 
12 | - n_vocab
13 | 
14 | """
15 | from tts.frontend import en
--------------------------------------------------------------------------------
/tts/frontend/en/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from tts.frontend.text.symbols import symbols
3 | 
4 | import nltk
5 | from random import random
6 | 
7 | n_vocab = len(symbols)
8 | 
9 | _arpabet = nltk.corpus.cmudict.dict()
10 | 
11 | 
12 | def _maybe_get_arpabet(word, p):
13 |     try:
14 |         phonemes = _arpabet[word][0]
15 |         phonemes = " ".join(phonemes)
16 |     except KeyError:
17 |         return word
18 | 
19 |     return '{%s}' % phonemes if random() < p else word
20 | 
21 | 
22 | def mix_pronunciation(text, p):
23 |     text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' '))
24 |     return text
25 | 
26 | 
27 | def text_to_sequence(text, p=0.0):
28 |     if p >= 0:
29 |         text = mix_pronunciation(text, p)
30 |     from tts.frontend.text import text_to_sequence
31 |     text = text_to_sequence(text, ["english_cleaners"])
32 |     return text
33 | 
34 | 
35 | from tts.frontend.text import sequence_to_text
--------------------------------------------------------------------------------
/tts/frontend/text/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | from tts.frontend.text import cleaners
3 | from tts.frontend.text.symbols import symbols
4 | 
5 | 
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 | 
10 | # Regular expression matching text enclosed in curly braces:
11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
12 | 
13 | 
14 | def text_to_sequence(text, cleaner_names):
15 |     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16 | 
17 |     The text can optionally have ARPAbet sequences enclosed in curly braces embedded
18 |     in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
19 | 
20 |     Args:
21 |         text: string to convert to a sequence
22 |         cleaner_names: names of the cleaner functions to run the text through
23 | 
24 |     Returns:
25 |         List of integers corresponding to the symbols in the text
26 |     '''
27 |     sequence = []
28 | 
29 |     # Check for curly braces and treat their contents as ARPAbet:
30 |     while len(text):
31 |         m = _curly_re.match(text)
32 |         if not m:
33 |             sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
34 |             break
35 |         sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
36 |         sequence += _arpabet_to_sequence(m.group(2))
37 |         text = m.group(3)
38 | 
39 |     # Append EOS token
40 |     sequence.append(_symbol_to_id['~'])
41 |     return sequence
42 | 
43 | 
44 | def sequence_to_text(sequence):
45 |     '''Converts a sequence of IDs back to a string'''
46 |     result = ''
47 |     for symbol_id in sequence:
48 |         if symbol_id in _id_to_symbol:
49 |             s = _id_to_symbol[symbol_id]
50 |             # Enclose ARPAbet back in curly braces:
51 |             if len(s) > 1 and s[0] == '@':
52 |                 s = '{%s}' % s[1:]
53 |             result += s
54 |     return result.replace('}{', ' ')
55 | 
56 | 
57 | def _clean_text(text, cleaner_names):
58 |     for name in cleaner_names:
59 |         cleaner = getattr(cleaners, name)
60 |         if not cleaner:
61 |             raise Exception('Unknown cleaner: %s' % name)
62 |         text = cleaner(text)
63 |     return text
64 | 
65 | 
66 | def _symbols_to_sequence(symbols):
67 |     return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
68 | 
69 | 
70 | def _arpabet_to_sequence(text):
71 |     return _symbols_to_sequence(['@' + s for s in text.split()])
72 | 
73 | 
74 | def _should_keep_symbol(s):
75 |     return s in _symbol_to_id and s != '_' and s != '~'
--------------------------------------------------------------------------------
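A minimal round-trip sketch of the frontend (the sample sentence is arbitrary; importing `tts.frontend.en` requires the NLTK CMUdict corpus, e.g. `nltk.download('cmudict')`):

```python
from tts.frontend import en

# Encode text to symbol IDs; ARPAbet in curly braces is passed through as phonemes.
seq = en.text_to_sequence('Turn left on {HH AW1 S S T AH0 N} Street.', p=0.0)
print(seq)                        # list of ints, ending with the EOS id for '~'
print(en.sequence_to_text(seq))   # decoded text; phonemes come back in curly braces
```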
/tts/frontend/text/cleaners.py:
--------------------------------------------------------------------------------
1 | '''
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 | 
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 |   1. "english_cleaners" for English text
7 |   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 |      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 |   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 |      the symbols in symbols.py to match your data).
11 | '''
12 | 
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 | 
17 | 
18 | # Regular expression matching whitespace:
19 | _whitespace_re = re.compile(r'\s+')
20 | 
21 | # List of (regular expression, replacement) pairs for abbreviations:
22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
23 |     ('mrs', 'misess'),
24 |     ('mr', 'mister'),
25 |     ('dr', 'doctor'),
26 |     ('st', 'saint'),
27 |     ('co', 'company'),
28 |     ('jr', 'junior'),
29 |     ('maj', 'major'),
30 |     ('gen', 'general'),
31 |     ('drs', 'doctors'),
32 |     ('rev', 'reverend'),
33 |     ('lt', 'lieutenant'),
34 |     ('hon', 'honorable'),
35 |     ('sgt', 'sergeant'),
36 |     ('capt', 'captain'),
37 |     ('esq', 'esquire'),
38 |     ('ltd', 'limited'),
39 |     ('col', 'colonel'),
40 |     ('ft', 'fort'),
41 | ]]
42 | 
43 | 
44 | def expand_abbreviations(text):
45 |     for regex, replacement in _abbreviations:
46 |         text = re.sub(regex, replacement, text)
47 |     return text
48 | 
49 | 
50 | def expand_numbers(text):
51 |     return normalize_numbers(text)
52 | 
53 | 
54 | def lowercase(text):
55 |     return text.lower()
56 | 
57 | 
58 | def collapse_whitespace(text):
59 |     return re.sub(_whitespace_re, ' ', text)
60 | 
61 | 
62 | def convert_to_ascii(text):
63 |     return unidecode(text)
64 | 
65 | 
66 | def add_punctuation(text):
67 |     if len(text) == 0:
68 |         return text
69 |     if text[-1] not in '!,.:;?':
70 |         text = text + '.'  # without this, the decoder is confused about when to output EOS
71 |     return text
72 | 
73 | 
74 | def basic_cleaners(text):
75 |     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
76 |     text = lowercase(text)
77 |     text = collapse_whitespace(text)
78 |     return text
79 | 
80 | 
81 | def transliteration_cleaners(text):
82 |     '''Pipeline for non-English text that transliterates to ASCII.'''
83 |     text = convert_to_ascii(text)
84 |     text = lowercase(text)
85 |     text = collapse_whitespace(text)
86 |     return text
87 | 
88 | 
89 | def english_cleaners(text):
90 |     '''Pipeline for English text, including number and abbreviation expansion.'''
91 |     text = convert_to_ascii(text)
92 |     text = add_punctuation(text)
93 |     text = lowercase(text)
94 |     text = expand_numbers(text)
95 |     text = expand_abbreviations(text)
96 |     text = collapse_whitespace(text)
97 |     return text
--------------------------------------------------------------------------------
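For concreteness, a short sketch of what the english_cleaners pipeline does to a sentence; the expected output below is worked out by hand from the functions above:

```python
from tts.frontend.text.cleaners import english_cleaners

print(english_cleaners('Dr. Smith bought 2 apples for $1.50'))
# -> 'doctor smith bought two apples for one dollar, fifty cents.'
# add_punctuation appends the final '.', expand_numbers handles "2" and "$1.50",
# and expand_abbreviations rewrites "dr." after lowercasing.
```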
/tts/frontend/text/cmudict.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | 
4 | valid_symbols = [
5 |     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
6 |     'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
7 |     'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
8 |     'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
9 |     'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
10 |     'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
11 |     'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
12 | ]
13 | 
14 | _valid_symbol_set = set(valid_symbols)
15 | 
16 | 
17 | class CMUDict:
18 |     '''Thin wrapper around CMUDict data.
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | _alt_re = re.compile(r'\([0-9]+\)') 39 | 40 | 41 | def _parse_cmudict(file): 42 | cmudict = {} 43 | for line in file: 44 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 45 | parts = line.split(' ') 46 | word = re.sub(_alt_re, '', parts[0]) 47 | pronunciation = _get_pronunciation(parts[1]) 48 | if pronunciation: 49 | if word in cmudict: 50 | cmudict[word].append(pronunciation) 51 | else: 52 | cmudict[word] = [pronunciation] 53 | return cmudict 54 | 55 | 56 | def _get_pronunciation(s): 57 | parts = s.strip().split(' ') 58 | for part in parts: 59 | if part not in _valid_symbol_set: 60 | return None 61 | return ' '.join(parts) -------------------------------------------------------------------------------- /tts/frontend/text/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 
| text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text -------------------------------------------------------------------------------- /tts/frontend/text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from .cmudict import valid_symbols 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet -------------------------------------------------------------------------------- /tts/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/preprocess/__init__.py -------------------------------------------------------------------------------- /tts/preprocess/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav = wav * 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def preemphasis(x): 22 | from nnmnkwii.preprocessing import preemphasis 23 | return preemphasis(x, hparams.preemphasis) 24 | 25 | 26 | def inv_preemphasis(x): 27 | from nnmnkwii.preprocessing import inv_preemphasis 28 | return inv_preemphasis(x, hparams.preemphasis) 29 | 30 | 31 | def spectrogram(y): 32 | D = _lws_processor().stft(preemphasis(y)).T 33 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 34 | return _normalize(S) 35 | 36 | 37 | def inv_spectrogram(spectrogram): 38 | '''Converts spectrogram to waveform using librosa''' 39 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 40 | processor = _lws_processor() 41 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 42 | y = processor.istft(D).astype(np.float32) 43 | return inv_preemphasis(y) 44 | 45 | 46 | def melspectrogram(y): 47 | D = _lws_processor().stft(preemphasis(y)).T 48 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 49 | if not hparams.allow_clipping_in_normalization: 50 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 51 | return _normalize(S) 52 | 53 | 54 | def _lws_processor(): 55 | return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech") 56 | 57 | 58 | def inv_mel_spectrogram(mel_spectrogram): 59 | D = _denormalize(mel_spectrogram) 60 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert 
back to linear 61 | processor = _lws_processor() 62 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 63 | y = processor.istft(D).astype(np.float32) 64 | return inv_preemphasis(y) 65 | 66 | 67 | _mel_basis = None 68 | 69 | _inv_mel_basis = None 70 | 71 | def _mel_to_linear(mel_spectrogram): 72 | global _inv_mel_basis 73 | if _inv_mel_basis is None: 74 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 75 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 76 | 77 | 78 | def _linear_to_mel(spectrogram): 79 | global _mel_basis 80 | if _mel_basis is None: 81 | _mel_basis = _build_mel_basis() 82 | return np.dot(_mel_basis, spectrogram) 83 | 84 | 85 | def _build_mel_basis(): 86 | if hparams.fmax is not None: 87 | assert hparams.fmax <= hparams.sample_rate // 2 88 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, 89 | fmin=hparams.fmin, fmax=hparams.fmax, 90 | n_mels=hparams.num_mels) 91 | 92 | 93 | def _amp_to_db(x): 94 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 95 | return 20 * np.log10(np.maximum(min_level, x)) 96 | 97 | 98 | def _db_to_amp(x): 99 | return np.power(10.0, x * 0.05) 100 | 101 | 102 | def _normalize(S): 103 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 104 | 105 | 106 | def _denormalize(S): 107 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db -------------------------------------------------------------------------------- /tts/preprocess/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/preprocess/utils.py --------------------------------------------------------------------------------
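Finally, a minimal end-to-end sketch of the analysis/synthesis round trip exposed by tts/preprocess/audio.py (the wav filename is hypothetical; inversion goes through the lws phase reconstruction defined above rather than Griffin-Lim):

```python
import numpy as np
from tts.preprocess import audio

wav = audio.load_wav('sample.wav')   # resampled to hparams.sample_rate on load

mel = audio.melspectrogram(wav)      # (num_mels, n_frames), normalized log-magnitude
lin = audio.spectrogram(wav)         # (fft_size // 2 + 1, n_frames)

# Invert the linear spectrogram back to a waveform and undo pre-emphasis:
wav_hat = audio.inv_spectrogram(lin)
audio.save_wav(wav_hat, 'reconstructed.wav')

print(mel.shape, lin.shape, len(wav_hat))
```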