├── .gitignore ├── LICENSE ├── README.md ├── hparams.py ├── preprocess.py └── tts ├── __init__.py ├── dataset ├── __init__.py ├── ljspeech.py └── utils.py ├── frontend ├── __init__.py ├── en │ └── __init__.py └── text │ ├── __init__.py │ ├── cleaners.py │ ├── cmudict.py │ ├── numbers.py │ └── symbols.py └── preprocess ├── __init__.py ├── audio.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # dataset 107 | data_dir 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text2Speech-Pytorch
2 | A Text2Speech engine for PyTorch.
3 | 
4 | ## NOTE: Heavily a Work in Progress
5 | 
6 | This repo will hold various TTS modules, from frontends such as Tacotron and Deep Voice to neural vocoder backends such as WaveNet and WaveRNN.
7 | 
8 | The main goal is to have everything in one place, in one framework, and with a more modular structure.
9 | 
10 | More importantly, this is for self-learning as I implement the various models and architectures.
11 | 
12 | Contributions are welcome!
13 | 
14 | # Features (nice to have)
15 | * single-speaker/multi-speaker dataset support
16 | * modular components
17 | * support for prototyping new models (i.e. reusable modules for seq2seq, attention, fully convolutional nets, transformers, etc.)
--------------------------------------------------------------------------------
/hparams.py:
--------------------------------------------------------------------------------
1 | class hparams:
2 |     # audio parameters
3 |     num_mels = 80
4 |     fmin = 125
5 |     fmax = 7600
6 |     fft_size = 1024
7 |     hop_size = 256
8 |     sample_rate = 22050
9 |     preemphasis = 0.97
10 |     min_level_db = -100
11 |     ref_level_db = 20
12 |     rescaling = False
13 |     rescaling_max = 0.999
14 |     allow_clipping_in_normalization = True
15 | 
16 |     max_iters = 200
17 |     griffin_lim_iters = 60
18 |     power = 1.5
19 | 
20 |     # preprocessing parameters
21 |     min_text = 20
22 | 
23 |     # general parameters
24 |     language = "en"
25 | 
26 |     # training parameters
27 |     replace_pronunciation_prob = 1.0
28 |     outputs_per_step = 5
29 |     batch_size = 60
30 |     batch_split = int(batch_size * 0.5)
31 |     epochs = 2000
32 |     guided_attention_ratio = 0.995
33 |     teacher_forcing_ratio = 0.5
34 |     attention_scale = 15
35 |     attention_decay = 0.997
--------------------------------------------------------------------------------
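As a quick orientation, the STFT settings above fix the model's frame rate. A minimal sketch (run from the repo root) of the same hop_size/sample_rate arithmetic that preprocess.py below uses to report dataset hours:

```python
from hparams import hparams

# Each spectrogram frame advances by hop_size samples.
frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000  # 256 / 22050 * 1000 ~= 11.6 ms
frames_per_second = hparams.sample_rate / hparams.hop_size      # ~= 86.1 frames per second

print('frame shift: %.2f ms (%.1f frames/s)' % (frame_shift_ms, frames_per_second))
```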
/preprocess.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """
3 | Preprocess dataset
4 | 
5 | usage: preprocess.py [options] <name> <in_dir> <out_dir>
6 | 
7 | options:
8 |     --num_workers=<n>        Num workers.
9 |     --hparams=<params>       Hyper parameters [default: ].
10 |     -h, --help               Show help message.
11 | """
12 | from docopt import docopt
13 | import os
14 | from multiprocessing import cpu_count
15 | from tqdm import tqdm
16 | from hparams import hparams
17 | import importlib
18 | 
19 | 
20 | def preprocess(mode, in_dir, out_dir, num_workers):
21 |     """Run the dataset module's build_from_path and write its metadata to out_dir.
22 |     """
23 |     os.makedirs(out_dir, exist_ok=True)
24 |     metadata = mode.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm)
25 |     write_metadata(metadata, out_dir)
26 | 
27 | 
28 | def write_metadata(metadata, out_dir):
29 |     with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
30 |         for m in metadata:
31 |             f.write('|'.join([str(x) for x in m]) + '\n')
32 |     frames = sum([m[2] for m in metadata])
33 |     frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000
34 |     hours = frames * frame_shift_ms / (3600 * 1000)
35 |     print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
36 |     print('Max input length: %d' % max(len(m[3]) for m in metadata))
37 |     print('Max output length: %d' % max(m[2] for m in metadata))
38 | 
39 | 
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     args = docopt(__doc__)
44 |     name = args["<name>"]
45 |     in_dir = args["<in_dir>"]
46 |     out_dir = args["<out_dir>"]
47 |     num_workers = args["--num_workers"]
48 |     num_workers = cpu_count() if num_workers is None else int(num_workers)
49 | 
50 |     # check that the dataset name is one of the supported ones
51 |     assert name in ["ljspeech"]
52 |     # import the matching dataset module
53 |     mode = importlib.import_module("tts.dataset." + name)
54 |     # preprocess
55 |     preprocess(mode, in_dir, out_dir, num_workers)
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/__init__.py
--------------------------------------------------------------------------------
/tts/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/dataset/__init__.py
--------------------------------------------------------------------------------
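With the docopt placeholders restored, the CLI form is `python preprocess.py ljspeech <in_dir> <out_dir>`. Equivalently, a minimal sketch of driving the same pipeline programmatically (the LJSpeech and output paths here are hypothetical):

```python
import importlib
from multiprocessing import cpu_count

from preprocess import preprocess  # the function defined above

# "ljspeech" is the only dataset name currently accepted by preprocess.py.
mode = importlib.import_module("tts.dataset.ljspeech")
preprocess(mode, in_dir="LJSpeech-1.1", out_dir="data_dir", num_workers=cpu_count())
```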
/tts/dataset/ljspeech.py:
--------------------------------------------------------------------------------
1 | from concurrent.futures import ProcessPoolExecutor
2 | from functools import partial
3 | import numpy as np
4 | import os
5 | from tts.preprocess import audio
6 | from hparams import hparams
7 | 
8 | def build_from_path(in_dir, out_dir, num_workers=4, tqdm=lambda x: x):
9 |     '''Preprocesses the LJ Speech dataset from a given input path into a given output directory.
10 | 
11 |     Args:
12 |         in_dir: The directory where you have downloaded the LJ Speech dataset
13 |         out_dir: The directory to write the output into
14 |         num_workers: Optional number of worker processes to parallelize across
15 |         tqdm: You can optionally pass tqdm to get a nice progress bar
16 | 
17 |     Returns:
18 |         A list of tuples describing the training examples. This should be written to train.txt
19 |     '''
20 | 
21 |     # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you
22 |     # can omit it and just call _process_utterance on each input if you want.
23 |     executor = ProcessPoolExecutor(max_workers=num_workers)
24 |     futures = []
25 |     index = 1
26 |     with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
27 |         for line in f:
28 |             parts = line.strip().split('|')
29 |             wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
30 |             text = parts[2]
31 |             if len(text) < hparams.min_text:
32 |                 continue
33 |             futures.append(executor.submit(
34 |                 partial(_process_utterance, out_dir, index, wav_path, text)))
35 |             index += 1
36 |     return [future.result() for future in tqdm(futures)]
37 | 
38 | def _process_utterance(out_dir, index, wav_path, text):
39 |     '''Preprocesses a single utterance audio/text pair.
40 | 
41 |     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
42 |     to the train.txt file.
43 | 
44 |     Args:
45 |         out_dir: The directory to write the spectrograms into
46 |         index: The numeric index to use in the spectrogram filenames.
47 |         wav_path: Path to the audio file containing the speech input
48 |         text: The text spoken in the input audio file
49 | 
50 |     Returns:
51 |         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
52 |     '''
53 | 
54 |     # Load the audio to a numpy array:
55 |     wav = audio.load_wav(wav_path)
56 | 
57 |     if hparams.rescaling:
58 |         wav = wav / np.abs(wav).max() * hparams.rescaling_max
59 | 
60 |     # Compute the linear-scale spectrogram from the wav:
61 |     spectrogram = audio.spectrogram(wav).astype(np.float32)
62 |     n_frames = spectrogram.shape[1]
63 | 
64 |     # Compute a mel-scale spectrogram from the wav:
65 |     mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
66 | 
67 | 
68 |     # Write the spectrograms to disk:
69 |     spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
70 |     mel_filename = 'ljspeech-mel-%05d.npy' % index
71 |     np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
72 |     np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
73 | 
74 |     # Return a tuple describing this training example:
75 |     return (spectrogram_filename, mel_filename, n_frames, text)
--------------------------------------------------------------------------------
/tts/dataset/utils.py:
--------------------------------------------------------------------------------
1 | from nnmnkwii.datasets import FileSourceDataset, FileDataSource
2 | import numpy as np
3 | 
4 | def _pad(seq, max_len, constant_values=0):
5 |     return np.pad(seq, (0, max_len - len(seq)),
6 |                   mode='constant', constant_values=constant_values)
7 | 
--------------------------------------------------------------------------------
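dataset/utils.py only holds the `_pad` helper so far. A minimal sketch of how it would be used to batch variable-length encoded texts (the collate function and sample arrays are hypothetical, not part of the repo):

```python
import numpy as np
from tts.dataset.utils import _pad

def collate_texts(batch):
    """Right-pad a list of 1-D integer sequences to a common length."""
    max_len = max(len(seq) for seq in batch)
    return np.stack([_pad(seq, max_len) for seq in batch])

batch = [np.array([3, 1, 4, 1, 5]), np.array([9, 2, 6])]
print(collate_texts(batch))  # second row is padded with the constant 0
```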
/tts/frontend/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | 
3 | """Text processing frontend
4 | 
5 | All frontend modules should have the following functions:
6 | 
7 | - text_to_sequence(text, p)
8 | - sequence_to_text(sequence)
9 | 
10 | and the property:
11 | 
12 | - n_vocab
13 | 
14 | """
15 | from tts.frontend import en
--------------------------------------------------------------------------------
/tts/frontend/en/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from tts.frontend.text.symbols import symbols
3 | 
4 | import nltk
5 | from random import random
6 | 
7 | n_vocab = len(symbols)
8 | 
9 | _arpabet = nltk.corpus.cmudict.dict()
10 | 
11 | 
12 | def _maybe_get_arpabet(word, p):
13 |     try:
14 |         phonemes = _arpabet[word][0]
15 |         phonemes = " ".join(phonemes)
16 |     except KeyError:
17 |         return word
18 | 
19 |     return '{%s}' % phonemes if random() < p else word
20 | 
21 | 
22 | def mix_pronunciation(text, p):
23 |     text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' '))
24 |     return text
25 | 
26 | 
27 | def text_to_sequence(text, p=0.0):
28 |     if p >= 0:
29 |         text = mix_pronunciation(text, p)
30 |     from tts.frontend.text import text_to_sequence
31 |     text = text_to_sequence(text, ["english_cleaners"])
32 |     return text
33 | 
34 | 
35 | from tts.frontend.text import sequence_to_text
--------------------------------------------------------------------------------
/tts/frontend/text/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | from tts.frontend.text import cleaners
3 | from tts.frontend.text.symbols import symbols
4 | 
5 | 
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 | 
10 | # Regular expression matching text enclosed in curly braces:
11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
12 | 
13 | 
14 | def text_to_sequence(text, cleaner_names):
15 |     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16 | 
17 |     The text can optionally have ARPAbet sequences enclosed in curly braces embedded
18 |     in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
19 | 
20 |     Args:
21 |         text: string to convert to a sequence
22 |         cleaner_names: names of the cleaner functions to run the text through
23 | 
24 |     Returns:
25 |         List of integers corresponding to the symbols in the text
26 |     '''
27 |     sequence = []
28 | 
29 |     # Check for curly braces and treat their contents as ARPAbet:
30 |     while len(text):
31 |         m = _curly_re.match(text)
32 |         if not m:
33 |             sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
34 |             break
35 |         sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
36 |         sequence += _arpabet_to_sequence(m.group(2))
37 |         text = m.group(3)
38 | 
39 |     # Append EOS token
40 |     sequence.append(_symbol_to_id['~'])
41 |     return sequence
42 | 
43 | 
44 | def sequence_to_text(sequence):
45 |     '''Converts a sequence of IDs back to a string'''
46 |     result = ''
47 |     for symbol_id in sequence:
48 |         if symbol_id in _id_to_symbol:
49 |             s = _id_to_symbol[symbol_id]
50 |             # Enclose ARPAbet back in curly braces:
51 |             if len(s) > 1 and s[0] == '@':
52 |                 s = '{%s}' % s[1:]
53 |             result += s
54 |     return result.replace('}{', ' ')
55 | 
56 | 
57 | def _clean_text(text, cleaner_names):
58 |     for name in cleaner_names:
59 |         cleaner = getattr(cleaners, name)
60 |         if not cleaner:
61 |             raise Exception('Unknown cleaner: %s' % name)
62 |         text = cleaner(text)
63 |     return text
64 | 
65 | 
66 | def _symbols_to_sequence(symbols):
67 |     return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
68 | 
69 | 
70 | def _arpabet_to_sequence(text):
71 |     return _symbols_to_sequence(['@' + s for s in text.split()])
72 | 
73 | 
74 | def _should_keep_symbol(s):
75 |     return s in _symbol_to_id and s != '_' and s != '~'
--------------------------------------------------------------------------------
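A minimal round-trip sketch of the frontend (the sample sentence is arbitrary; importing `tts.frontend.en` requires the NLTK CMUdict corpus, e.g. `nltk.download('cmudict')`):

```python
from tts.frontend import en

# Encode text to symbol IDs; ARPAbet in curly braces is passed through as phonemes.
seq = en.text_to_sequence('Turn left on {HH AW1 S S T AH0 N} Street.', p=0.0)
print(seq)                        # list of ints, ending with the EOS id for '~'
print(en.sequence_to_text(seq))   # decoded text; phonemes come back in curly braces
```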
/tts/frontend/text/cleaners.py:
--------------------------------------------------------------------------------
1 | '''
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 | 
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 |   1. "english_cleaners" for English text
7 |   2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 |      the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 |   3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 |      the symbols in symbols.py to match your data).
11 | '''
12 | 
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 | 
17 | 
18 | # Regular expression matching whitespace:
19 | _whitespace_re = re.compile(r'\s+')
20 | 
21 | # List of (regular expression, replacement) pairs for abbreviations:
22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
23 |     ('mrs', 'misess'),
24 |     ('mr', 'mister'),
25 |     ('dr', 'doctor'),
26 |     ('st', 'saint'),
27 |     ('co', 'company'),
28 |     ('jr', 'junior'),
29 |     ('maj', 'major'),
30 |     ('gen', 'general'),
31 |     ('drs', 'doctors'),
32 |     ('rev', 'reverend'),
33 |     ('lt', 'lieutenant'),
34 |     ('hon', 'honorable'),
35 |     ('sgt', 'sergeant'),
36 |     ('capt', 'captain'),
37 |     ('esq', 'esquire'),
38 |     ('ltd', 'limited'),
39 |     ('col', 'colonel'),
40 |     ('ft', 'fort'),
41 | ]]
42 | 
43 | 
44 | def expand_abbreviations(text):
45 |     for regex, replacement in _abbreviations:
46 |         text = re.sub(regex, replacement, text)
47 |     return text
48 | 
49 | 
50 | def expand_numbers(text):
51 |     return normalize_numbers(text)
52 | 
53 | 
54 | def lowercase(text):
55 |     return text.lower()
56 | 
57 | 
58 | def collapse_whitespace(text):
59 |     return re.sub(_whitespace_re, ' ', text)
60 | 
61 | 
62 | def convert_to_ascii(text):
63 |     return unidecode(text)
64 | 
65 | 
66 | def add_punctuation(text):
67 |     if len(text) == 0:
68 |         return text
69 |     if text[-1] not in '!,.:;?':
70 |         text = text + '.'  # without this, the decoder is confused about when to output EOS
71 |     return text
72 | 
73 | 
74 | def basic_cleaners(text):
75 |     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
76 |     text = lowercase(text)
77 |     text = collapse_whitespace(text)
78 |     return text
79 | 
80 | 
81 | def transliteration_cleaners(text):
82 |     '''Pipeline for non-English text that transliterates to ASCII.'''
83 |     text = convert_to_ascii(text)
84 |     text = lowercase(text)
85 |     text = collapse_whitespace(text)
86 |     return text
87 | 
88 | 
89 | def english_cleaners(text):
90 |     '''Pipeline for English text, including number and abbreviation expansion.'''
91 |     text = convert_to_ascii(text)
92 |     text = add_punctuation(text)
93 |     text = lowercase(text)
94 |     text = expand_numbers(text)
95 |     text = expand_abbreviations(text)
96 |     text = collapse_whitespace(text)
97 |     return text
--------------------------------------------------------------------------------
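For concreteness, a short sketch of what the english_cleaners pipeline does to a sentence; the expected output below is worked out by hand from the functions above:

```python
from tts.frontend.text.cleaners import english_cleaners

print(english_cleaners('Dr. Smith bought 2 apples for $1.50'))
# -> 'doctor smith bought two apples for one dollar, fifty cents.'
# add_punctuation appends the final '.', expand_numbers handles "2" and "$1.50",
# and expand_abbreviations rewrites "dr." after lowercasing.
```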
/tts/frontend/text/cmudict.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | 
4 | valid_symbols = [
5 |     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
6 |     'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
7 |     'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
8 |     'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
9 |     'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
10 |     'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
11 |     'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
12 | ]
13 | 
14 | _valid_symbol_set = set(valid_symbols)
15 | 
16 | 
17 | class CMUDict:
18 |     '''Thin wrapper around CMUDict data.
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | _alt_re = re.compile(r'\([0-9]+\)') 39 | 40 | 41 | def _parse_cmudict(file): 42 | cmudict = {} 43 | for line in file: 44 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 45 | parts = line.split(' ') 46 | word = re.sub(_alt_re, '', parts[0]) 47 | pronunciation = _get_pronunciation(parts[1]) 48 | if pronunciation: 49 | if word in cmudict: 50 | cmudict[word].append(pronunciation) 51 | else: 52 | cmudict[word] = [pronunciation] 53 | return cmudict 54 | 55 | 56 | def _get_pronunciation(s): 57 | parts = s.strip().split(' ') 58 | for part in parts: 59 | if part not in _valid_symbol_set: 60 | return None 61 | return ' '.join(parts) -------------------------------------------------------------------------------- /tts/frontend/text/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, _remove_commas, text) 66 
| text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text -------------------------------------------------------------------------------- /tts/frontend/text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from .cmudict import valid_symbols 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet -------------------------------------------------------------------------------- /tts/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/preprocess/__init__.py -------------------------------------------------------------------------------- /tts/preprocess/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav = wav * 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def preemphasis(x): 22 | from nnmnkwii.preprocessing import preemphasis 23 | return preemphasis(x, hparams.preemphasis) 24 | 25 | 26 | def inv_preemphasis(x): 27 | from nnmnkwii.preprocessing import inv_preemphasis 28 | return inv_preemphasis(x, hparams.preemphasis) 29 | 30 | 31 | def spectrogram(y): 32 | D = _lws_processor().stft(preemphasis(y)).T 33 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 34 | return _normalize(S) 35 | 36 | 37 | def inv_spectrogram(spectrogram): 38 | '''Converts spectrogram to waveform using librosa''' 39 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 40 | processor = _lws_processor() 41 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 42 | y = processor.istft(D).astype(np.float32) 43 | return inv_preemphasis(y) 44 | 45 | 46 | def melspectrogram(y): 47 | D = _lws_processor().stft(preemphasis(y)).T 48 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 49 | if not hparams.allow_clipping_in_normalization: 50 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 51 | return _normalize(S) 52 | 53 | 54 | def _lws_processor(): 55 | return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech") 56 | 57 | 58 | def inv_mel_spectrogram(mel_spectrogram): 59 | D = _denormalize(mel_spectrogram) 60 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert 
back to linear 61 | processor = _lws_processor() 62 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 63 | y = processor.istft(D).astype(np.float32) 64 | return inv_preemphasis(y) 65 | 66 | 67 | _mel_basis = None 68 | 69 | _inv_mel_basis = None 70 | 71 | def _mel_to_linear(mel_spectrogram): 72 | global _inv_mel_basis 73 | if _inv_mel_basis is None: 74 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 75 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 76 | 77 | 78 | def _linear_to_mel(spectrogram): 79 | global _mel_basis 80 | if _mel_basis is None: 81 | _mel_basis = _build_mel_basis() 82 | return np.dot(_mel_basis, spectrogram) 83 | 84 | 85 | def _build_mel_basis(): 86 | if hparams.fmax is not None: 87 | assert hparams.fmax <= hparams.sample_rate // 2 88 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, 89 | fmin=hparams.fmin, fmax=hparams.fmax, 90 | n_mels=hparams.num_mels) 91 | 92 | 93 | def _amp_to_db(x): 94 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 95 | return 20 * np.log10(np.maximum(min_level, x)) 96 | 97 | 98 | def _db_to_amp(x): 99 | return np.power(10.0, x * 0.05) 100 | 101 | 102 | def _normalize(S): 103 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 104 | 105 | 106 | def _denormalize(S): 107 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db -------------------------------------------------------------------------------- /tts/preprocess/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/G-Wang/Text2Speech-Pytorch/7bacdd0880825c3eeb08b6899b5c499416e53d0c/tts/preprocess/utils.py --------------------------------------------------------------------------------
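Finally, a minimal end-to-end sketch of the analysis/synthesis round trip exposed by tts/preprocess/audio.py (the wav filename is hypothetical; inversion goes through the lws phase reconstruction defined above rather than Griffin-Lim):

```python
import numpy as np
from tts.preprocess import audio

wav = audio.load_wav('sample.wav')   # resampled to hparams.sample_rate on load

mel = audio.melspectrogram(wav)      # (num_mels, n_frames), normalized log-magnitude
lin = audio.spectrogram(wav)         # (fft_size // 2 + 1, n_frames)

# Invert the linear spectrogram back to a waveform and undo pre-emphasis:
wav_hat = audio.inv_spectrogram(lin)
audio.save_wav(wav_hat, 'reconstructed.wav')

print(mel.shape, lin.shape, len(wav_hat))
```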