├── .gitignore
├── .gitmodules
├── LICENSE
├── backend_wrappers
│   ├── __init__.py
│   ├── tacotron.py
│   └── waveglow.py
├── data
│   ├── config.yaml
│   ├── text_handler_cfg.yaml
│   └── voice.yaml
├── example.py
├── logger.py
├── requirements.txt
├── synthesizer.py
└── utils
    ├── async_utils.py
    └── voice_control.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
**/__pycache__
**/.idea

venv/

Logs/
data/*
!data/*.yaml

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "backend/tacotron2"]
	path = backend/tacotron2
	url = https://github.com/sovaai/sova-tts-engine
[submodule "backend/waveglow"]
	path = backend/waveglow
	url = https://github.com/sovaai/sova-tts-vocoder

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution.
      You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE.
      You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright (c) 2020, Virtual Assistants, LLC
   All rights reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/backend_wrappers/__init__.py:
--------------------------------------------------------------------------------
import os
import sys

backend_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../backend")

# The tacotron2 and waveglow git submodules are plain repositories, not
# installable packages, so each one is temporarily prepended to sys.path
# while the corresponding wrapper performs its imports.
import_path = os.path.join(backend_path, "tacotron2")
sys.path.insert(0, import_path)
from .tacotron import Tacotron2Wrapper
sys.path.pop(0)

import_path = os.path.join(backend_path, "waveglow")
sys.path.insert(0, import_path)
from .waveglow import WaveglowWrapper
sys.path.pop(0)

--------------------------------------------------------------------------------
/backend_wrappers/tacotron.py:
--------------------------------------------------------------------------------
import torch

from hparams import create_hparams
from model import load_model
from modules.layers import TacotronSTFT


class HparamsNotFound(Exception):
    pass


class Tacotron2Wrapper:
    def __init__(self, model_path, device, hparams_path=None, steps_per_symbol=10, gate_threshold=0.5):
        self.device = torch.device("cpu" if not torch.cuda.is_available() else device)
        self.dtype = torch.float if self.device.type == "cpu" else torch.half

        _checkpoint = torch.load(model_path, map_location=self.device)
        _hparams = _checkpoint.get("hparams", None)
        if _hparams is None:
            if hparams_path is None:
                raise HparamsNotFound("The hparams dict was found neither in the checkpoint nor at the provided path.")
            _hparams = hparams_path

        self.hparams = create_hparams(_hparams)

        _charset = self.hparams.get("language", None)  # backward compatibility with old configs
        if _charset is not None:
            self.hparams.charset = _charset
        self.hparams.device = self.device

        self.model = load_model(self.hparams)
        self.model.load_state_dict(_checkpoint["state_dict"])
        self.model.eval().to(device=self.device, dtype=self.dtype)

        self.stft = TacotronSTFT(
            self.hparams.filter_length, self.hparams.hop_length, self.hparams.win_length,
            self.hparams.n_mel_channels, self.hparams.sampling_rate, self.hparams.mel_fmin,
            self.hparams.mel_fmax
        )

        self.steps_per_symbol = steps_per_symbol
        self.gate_threshold = gate_threshold

    def __call__(self, sequence, **kwargs):
        sequence = torch.LongTensor(sequence).view(1, -1)
        sequence = sequence.to(device=self.device)

        # cap the decoder length proportionally to the input length
        kwargs["max_decoder_steps"] = int(self.steps_per_symbol * sequence.size(-1))

        mel_outputs, mel_outputs_postnet, gates, alignments = self.model.inference(sequence, **kwargs)

        return mel_outputs_postnet

--------------------------------------------------------------------------------
/backend_wrappers/waveglow.py:
--------------------------------------------------------------------------------
import sys
import torch
import numpy as np

from denoiser import Denoiser
import glow


_waveglow_path = sys.path[0]


class WaveglowWrapper:
    def __init__(self, model_path, device, sigma=0.666, strength=0.1):
        self.device = torch.device("cpu" if not torch.cuda.is_available() else device)
        self.dtype = torch.float if self.device.type == "cpu" else torch.half

        self.model = torch.load(model_path, map_location=self.device)["model"]
        self.model.device = self.device

        # checkpoints serialized with older PyTorch versions may lack the
        # padding_mode attribute that newer conv layers expect
        for m in self.model.modules():
            if "Conv" in str(type(m)):
                setattr(m, "padding_mode", "zeros")

        self.model.eval().to(device=self.device, dtype=self.dtype)

        # keep the invertible 1x1 convolutions in fp32 for numerical stability
        for k in self.model.convinv:
            k.float()

        self.denoiser = Denoiser(self.model, device=self.device)

        self.sigma = sigma
        self.strength = strength

    def __call__(self, spectrogram):
        with torch.no_grad():
            audio = self.model.infer(spectrogram, self.sigma)

        return audio

    def denoise(self, audio):
        if isinstance(audio, np.ndarray):
            audio = torch.tensor(audio).to(self.device, self.dtype)

        if audio.ndim == 1:
            audio = audio.view(1, -1)
        audio = self.denoiser(audio, self.strength)[:, 0]

        return audio.data.cpu().numpy()
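The two wrappers expose a deliberately small surface: the engine turns an encoded symbol sequence into a mel spectrogram, and the vocoder turns that spectrogram into a waveform. A minimal sketch of driving them directly, bypassing the Synthesizer class — the checkpoint paths and the symbol ids below are placeholders, not files shipped with the repo:

import backend_wrappers as bw

# hypothetical checkpoint locations -- point these at your downloaded models
engine = bw.Tacotron2Wrapper("data/tacotron2.pt", device="cuda")
vocoder = bw.WaveglowWrapper("data/waveglow.pt", device="cuda")

sequence = [24, 15, 37, 2]        # placeholder ids as produced by the text handler
spectrogram = engine(sequence)    # mel spectrogram, shaped (1, n_mel_channels, T)
audio = vocoder(spectrogram)      # raw waveform tensor
audio = vocoder.denoise(audio)    # numpy array, ready for soundfile.write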
--------------------------------------------------------------------------------
/data/config.yaml:
--------------------------------------------------------------------------------
general:
  device: "cuda"
  pause_type: "silence"
  sample_rate: 22050

tacotron2:
  voice_control_cfg: "data/voice.yaml"
  user_dict:

  text_handler:
    config_path: "data/text_handler_cfg.yaml"
    out_max_length: 200

  modules:
    engine: tacotron2
    vocoder: waveglow

  engine:
    tacotron2:
      model_path:
      hparams_path:
      options:
        steps_per_symbol: 10
        gate_threshold: 0.5

  vocoder:
    waveglow:
      model_path:
      options:
        sigma: 0.666
        strength: 0.1

--------------------------------------------------------------------------------
/data/text_handler_cfg.yaml:
--------------------------------------------------------------------------------
handler:
  out_max_length: 200
  charset: ru_trans
  modules: [emphasizer, phonetizer]

emphasizer:
  type: rule_based
  prefer_user: true
  dict_source:

phonetizer:
  type: rule_based
  dict_source:

--------------------------------------------------------------------------------
/data/voice.yaml:
--------------------------------------------------------------------------------
psola:
  max_hz: 1050
  min_hz: 40
  analysis_win_ms: 40
  max_change: 1.455
  min_change: 0.695

phase:
  nfft: 256
  hop: 64

--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
from synthesizer import Synthesizer


def test():
    tacotron = Synthesizer.from_config("data/config.yaml", name="tacotron2")

    # Russian pangrams used as test sentences
    samples = [
        "Съешь же ещё этих мягких французских булок да выпей чаю.",

        "Широкая электрификация южных губерний даст мощный толчок подъёму сельского хозяйства.",

        "В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!"
    ]

    save_path = "data/waves"
    for i, sample in enumerate(samples):
        audio = tacotron.synthesize(
            text=sample
        )

        tacotron.save(audio, save_path, str(i))


if __name__ == "__main__":
    test()
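example.py synthesizes each phrase in full before writing it to disk. Synthesizer.generate() offers a streaming alternative: it yields audio piece by piece through a BackgroundGenerator, so the models keep working on the next unit while earlier ones are being consumed. A sketch under the same config (the model paths in data/config.yaml must be filled in first):

import numpy as np
from synthesizer import Synthesizer

tacotron = Synthesizer.from_config("data/config.yaml", name="tacotron2")

chunks = []
for chunk in tacotron.generate("Съешь же ещё этих мягких французских булок да выпей чаю."):
    chunks.append(chunk)  # one numpy waveform per text unit or pause

tacotron.save(np.concatenate(chunks), "data/waves", prefix="streamed")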
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
import sys
from loguru import logger


# sys.stdout.reconfigure(encoding="utf-8")
class Format:
    time = "{time:YYYY-MM-DD HH:mm:ss.SSS}"
    level = "{level: <8}"
    module = "{name}:{function}:{line}"
    message = "{message}"

LEVEL = "INFO"
SAVE_TO_FILE = False
_filename = "data/logs/user.log"

logger.remove()
logger.add(sys.stdout, level=LEVEL)

if SAVE_TO_FILE:
    logger.add(_filename, encoding="utf8")


# from collections import defaultdict
# from random import choice
#
# colors = ["blue", "green", "magenta", "red", "yellow"]
# color_per_module = defaultdict(lambda: choice(colors))
#
# logger.bind(synthesizer_name=name)
# _color_tag = choice(colors)
# _name_fmt = "<{}>".format(_color_tag) + "{extra[synthesizer_name]}" + "</{}>".format(_color_tag)
# _formatter = " | ".join((Format.time, Format.level, Format.module, _name_fmt, Format.message))
# logger.add(sys.stdout, format=_formatter)
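SAVE_TO_FILE is disabled by default; switching it on appends every record to data/logs/user.log. For long-running services it may be worth bounding that file's growth — loguru's add() accepts rotation and retention policies out of the box. A sketch (the size and count below are arbitrary choices, not project defaults):

from loguru import logger

logger.add("data/logs/user.log", encoding="utf8",
           rotation="10 MB",  # start a new file once the current one reaches 10 MB
           retention=5)       # keep only the five most recent rotated files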
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
torch===1.4.0
soundfile==0.10.3.post1
scipy==1.4.1
librosa==0.7.2
numpy==1.13.3
loguru
-e git+https://github.com/sovaai/sova-tts-tps@v1.0.1#egg=TPS

--------------------------------------------------------------------------------
/synthesizer.py:
--------------------------------------------------------------------------------
import os
import sys
import time
import yaml

import numpy as np
import soundfile

from tps import cleaners, Handler, load_dict, save_dict
from tps.types import Delimiter

root_path = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, root_path)

import backend_wrappers as bw
from utils.async_utils import BackgroundGenerator
from utils.voice_control import shift_pitch, stretch_wave
from logger import logger

sys.path.pop(0)


def uniqid():
    return hex(int(time.time() * 1e7))[2:]


_modules_dict = {
    "tacotron2": bw.Tacotron2Wrapper,
    "waveglow": bw.WaveglowWrapper
}


# pause lengths in samples for each delimiter type (see generate_pause below)
_pauses = {
    Delimiter.eos: 10000,
    Delimiter.semicolon: 5000,
    Delimiter.colon: 3000,
    Delimiter.comma: 2000,
    Delimiter.space: 1000
}


class Synthesizer:
    def __init__(self, name, text_handler, engine, vocoder, sample_rate, device="cuda", pause_type="silence",
                 voice_control_cfg=None, user_dict=None):
        self.name = name

        self.text_handler = text_handler
        self.engine = engine
        self.vocoder = vocoder

        self.sample_rate = sample_rate

        self.device = device

        self.pause_type = pause_type
        self.voice_control_cfg = self.load_config(voice_control_cfg)

        self.user_dict = None
        self._dict_source = None
        self.load_user_dict(user_dict)

        logger.info("Synthesizer {} is ready".format(name))

    def synthesize(self, text, **kwargs):
        logger.info(text)

        mask_stress = kwargs.pop("mask_stress", False)
        mask_phonemes = kwargs.pop("mask_phonemes", False)

        sequence = self.text_handler(
            text=text,
            cleaner=cleaners.light_punctuation_cleaners,
            user_dict=self.user_dict,
            keep_delimiters=True,
            mask_stress=mask_stress, mask_phonemes=mask_phonemes
        )

        audio_list = list(self._generate_audio(sequence, **kwargs))
        audio = np.concatenate(audio_list)

        return audio

    def generate(self, text, **kwargs):
        mask_stress = kwargs.pop("mask_stress", False)
        mask_phonemes = kwargs.pop("mask_phonemes", False)

        sequence = self.text_handler.generate(
            text=text,
            cleaner=cleaners.light_punctuation_cleaners,
            user_dict=self.user_dict,
            keep_delimiters=True,
            mask_stress=mask_stress, mask_phonemes=mask_phonemes
        )

        return BackgroundGenerator(self._generate_audio(sequence, **kwargs))

    def _generate_audio(self, sequence, **kwargs):
        logger.debug("kwargs: {}".format(kwargs))

        for unit in sequence:
            if unit in Delimiter:
                duration = _pauses[unit]
                audio = generate_pause(duration, ptype=self.pause_type)
            else:
                logger.debug(unit)
                unit = self.text_handler.check_eos(unit)
                unit = self.text_handler.text2vec(unit)

                spectrogram = self.engine(unit, **kwargs)
                audio = self.vocoder(spectrogram)
                audio = self.vocoder.denoise(audio)

            audio = self.post_process(audio, **kwargs)

            yield audio

    def post_process(self, audio, **kwargs):
        tone_factor = kwargs.pop("tone_factor", None)
        speed_factor = kwargs.pop("speed_factor", None)

        if tone_factor or speed_factor:
            audio = audio.squeeze()
            if tone_factor:
                audio = self.change_pitch(audio, tone_factor)
            if speed_factor:
                audio = self.change_speed(audio, speed_factor)

            audio = self.vocoder.denoise(audio)

        return audio.squeeze()

    def save(self, audio, path, prefix=None):
        os.makedirs(path, exist_ok=True)
        prefix = [prefix] if prefix is not None else []

        waves_format = ".wav"
        name = "_".join(prefix + [self.name, uniqid(), time.strftime("%Y-%m-%d_%H-%M")]) + waves_format

        file_path = os.path.join(path, name)
        soundfile.write(file_path, audio, self.sample_rate)

        logger.info("Audio was saved as {}".format(os.path.abspath(file_path)))

        return file_path

    def change_speed(self, audio, factor):
        if factor > 2 or factor < 0.5:
            logger.error("speed factor is out of range [0.5, 2.0] -- returning the original signal")
            return audio

        params = self.voice_control_cfg["phase"]

        return stretch_wave(audio, factor, params)

    def change_pitch(self, audio, factor):
        if factor > 1.5 or factor < 0.75:
            logger.error("tone factor is out of range [0.75, 1.5] -- returning the original signal")
            return audio

        params = self.voice_control_cfg["psola"]

        return shift_pitch(audio, self.sample_rate, factor, params)

    def load_user_dict(self, user_dict):
        data_dir = "data"
        if isinstance(user_dict, dict) or user_dict is None:
            if not os.path.exists(data_dir):
                os.makedirs(data_dir)
                logger.info("Data folder was created at {}".format(os.path.abspath(data_dir)))
{}".format(os.path.abspath(data_dir))) 179 | self._dict_source = os.path.join(data_dir, "{}_user_dict.json".format(self.name)) 180 | else: 181 | self._dict_source = user_dict 182 | assert self._dict_source.endswith((".json", ".yaml")) 183 | 184 | self.user_dict = load_dict(user_dict) 185 | logger.info("User dictionary has been loaded") 186 | 187 | 188 | def get_user_dict(self): 189 | logger.info("Request for the user dictionary was received") 190 | return self.user_dict 191 | 192 | 193 | def update_user_dict(self, new_dict): 194 | self.user_dict.update(new_dict) 195 | logger.info("User dictionary has been updated") 196 | 197 | save_dict(self.user_dict, self._dict_source) 198 | logger.info("User dictionary has been saved") 199 | 200 | 201 | def replace_user_dict(self, new_dict): 202 | self.user_dict = new_dict 203 | logger.info("User dictionary has been replaced") 204 | 205 | save_dict(self.user_dict, self._dict_source) 206 | logger.info("User dictionary has been saved") 207 | 208 | 209 | @classmethod 210 | def from_config(cls, config, name): 211 | if isinstance(config, str): 212 | logger.debug("Loading synthesizer from config file {}".format(os.path.abspath(config))) 213 | 214 | config = cls.load_config(config) 215 | 216 | params = config["general"] 217 | params["name"] = name 218 | device = params["device"] 219 | assert device is not None 220 | 221 | modules_config = config.pop(name) 222 | params["voice_control_cfg"] = modules_config["voice_control_cfg"] 223 | params["user_dict"] = modules_config["user_dict"] 224 | 225 | params["text_handler"] = _load_text_handler(modules_config["text_handler"]) 226 | 227 | chosen = modules_config["modules"] 228 | 229 | for mtype, mname in chosen.items(): 230 | params[mtype] = Synthesizer.module_from_config(modules_config, mtype, mname, device) 231 | 232 | return Synthesizer(**params) 233 | 234 | 235 | @staticmethod 236 | def module_from_config(modules_config, mtype, mname, device): 237 | logger.info("Loading {} module".format(mname)) 238 | 239 | module_config = modules_config[mtype][mname] 240 | module_config["device"] = device 241 | 242 | for key, value in module_config.pop("options", {}).items(): 243 | if value is not None: 244 | module_config[key] = value 245 | 246 | return _modules_dict[mname](**module_config) 247 | 248 | 249 | @staticmethod 250 | def load_config(config_source): 251 | if isinstance(config_source, dict): 252 | return config_source 253 | elif isinstance(config_source, str): 254 | pass 255 | else: 256 | raise TypeError 257 | 258 | with open(config_source, "r", encoding="utf-8") as stream: 259 | config = yaml.safe_load(stream) 260 | 261 | assert config is not None 262 | 263 | return config 264 | 265 | 266 | def generate_pause(duration, eps=1e-4, ptype='white_noise'): 267 | if ptype == 'silence': 268 | pause = np.zeros((duration, )) 269 | elif ptype == 'white_noise': 270 | pause = np.random.random((duration, )) * eps 271 | else: 272 | raise TypeError 273 | 274 | return pause.astype(np.float32) 275 | 276 | 277 | def _load_text_handler(config_dict): 278 | logger.info("Loading text handler") 279 | 280 | out_max_length = config_dict["out_max_length"] 281 | 282 | config_path = config_dict["config_path"] 283 | assert config_path is not None 284 | 285 | handler_config = Synthesizer.load_config(config_dict["config_path"]) 286 | handler_config["handler"]["out_max_length"] = out_max_length 287 | 288 | return Handler.from_config(handler_config) -------------------------------------------------------------------------------- /utils/async_utils.py: 
--------------------------------------------------------------------------------
/utils/async_utils.py:
--------------------------------------------------------------------------------
import queue
import threading


class BackgroundGenerator(threading.Thread):
    """Runs a generator in a background thread, prefetching its items into a bounded queue."""

    def __init__(self, generator):
        super().__init__()
        self.queue = queue.Queue(4)  # prefetch at most 4 items ahead of the consumer
        self.generator = generator

        self.daemon = True
        self.start()

    def __iter__(self):
        return self

    def __next__(self):
        item = self.queue.get()
        if item is None:  # sentinel: the wrapped generator is exhausted
            raise StopIteration
        return item

    def run(self):
        for item in self.generator:
            self.queue.put(item)
        self.queue.put(None)
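BackgroundGenerator decouples producing audio from consuming it: the wrapped generator runs in a daemon thread, and the Queue(4) bound means it never gets more than four items ahead, applying backpressure to the producer. A self-contained sketch with an artificially slow producer:

import time
from utils.async_utils import BackgroundGenerator

def slow_producer():
    for i in range(5):
        time.sleep(0.2)  # stand-in for model inference
        yield i

for item in BackgroundGenerator(slow_producer()):
    print(item)  # each item arrives while the next ones are already being computed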
--------------------------------------------------------------------------------
/utils/voice_control.py:
--------------------------------------------------------------------------------
"""
Based on:
https://github.com/gaganbahga/time_stretch
https://github.com/sannawag/TD-PSOLA
"""

import librosa
import numpy as np
from numpy.fft import fft, ifft


def shift_pitch(signal, fs, factor, psola_params):
    """
    Shift the pitch of a speech signal by `factor` times using TD-PSOLA.

    :param signal: 1-D numpy array with the waveform
    :param fs: sampling rate, Hz
    :param factor: pitch shift factor (>1 raises the pitch, <1 lowers it)
    :param psola_params: the `psola` section of data/voice.yaml
    :return: pitch-shifted waveform
    """
    if factor == 1:
        return signal

    peaks = find_peaks(signal, fs, psola_params)
    new_signal = psola(signal, peaks, factor)

    return new_signal


def find_peaks(signal, fs, psola_params):
    max_hz = psola_params['max_hz']
    min_hz = psola_params['min_hz']
    analysis_win_ms = psola_params['analysis_win_ms']
    max_change = psola_params['max_change']
    min_change = psola_params['min_change']

    N = len(signal)
    min_period = fs // max_hz
    max_period = fs // min_hz

    # compute pitch periodicity
    sequence_len = int(analysis_win_ms / 1000 * fs)  # analysis sequence length in samples
    periods = compute_periods_per_sequence(signal, sequence_len, min_period, max_period)

    # simple hack to avoid octave error: assume that the pitch should not vary much, restrict range
    mean_period = np.mean(periods)
    max_period = int(mean_period * 1.1)
    min_period = int(mean_period * 0.9)
    periods = compute_periods_per_sequence(signal, sequence_len, min_period, max_period)

    # find the peaks
    peaks = [np.argmax(signal[:int(periods[0] * 1.1)])]
    while True:
        prev = peaks[-1]
        idx = prev // sequence_len  # current autocorrelation analysis window
        if prev + int(periods[idx] * max_change) >= N:
            break
        # find maximum near expected location
        peaks.append(prev + int(periods[idx] * min_change) +
                     np.argmax(signal[prev + int(periods[idx] * min_change): prev + int(periods[idx] * max_change)]))
    return np.array(peaks)


def compute_periods_per_sequence(signal, sequence_len, min_period, max_period):
    N = len(signal)
    offset = 0  # current sample offset
    periods = []  # period length of each analysis sequence

    while offset < N:
        frame = signal[offset:offset + sequence_len]
        if len(frame) < sequence_len:
            frame_padded = np.zeros((sequence_len, ))
            frame_padded[:len(frame)] = frame
            frame = frame_padded

        fourier = fft(frame)
        fourier[0] = 0  # remove DC component
        autoc = ifft(fourier * np.conj(fourier)).real
        autoc_peak = min_period + np.argmax(autoc[min_period:max_period])
        periods.append(autoc_peak)
        offset += sequence_len

    return periods


def psola(signal, peaks, f_ratio):
    N = len(signal)
    # Interpolate
    new_signal = np.zeros(N)
    new_peaks_ref = np.linspace(0, len(peaks) - 1, int(len(peaks) * f_ratio))
    new_peaks = np.zeros(len(new_peaks_ref)).astype(int)

    for i in range(len(new_peaks)):
        weight = new_peaks_ref[i] % 1
        left = np.floor(new_peaks_ref[i]).astype(int)
        right = np.ceil(new_peaks_ref[i]).astype(int)
        new_peaks[i] = int(peaks[left] * (1 - weight) + peaks[right] * weight)

    # PSOLA
    for j in range(len(new_peaks)):
        # find the corresponding old peak index
        i = np.argmin(np.abs(peaks - new_peaks[j]))
        # get the distances to adjacent peaks
        P1 = [new_peaks[j] if j == 0 else new_peaks[j] - new_peaks[j - 1],
              N - 1 - new_peaks[j] if j == len(new_peaks) - 1 else new_peaks[j + 1] - new_peaks[j]]
        # edge case truncation
        if peaks[i] - P1[0] < 0:
            P1[0] = peaks[i]
        if peaks[i] + P1[1] > N - 1:
            P1[1] = N - 1 - peaks[i]
        # linear OLA window
        window = list(np.linspace(0, 1, P1[0] + 1)[1:]) + list(np.linspace(1, 0, P1[1] + 1)[1:])
        # center window from original signal at the new peak
        new_signal[new_peaks[j] - P1[0]: new_peaks[j] + P1[1]] += window * signal[peaks[i] - P1[0]: peaks[i] + P1[1]]

    return new_signal


def stretch_wave(x, factor, phase_params):
    """
    Change the speed of a speech signal by `factor` times via phase vocoding, preserving its pitch.

    :param x: 1-D numpy array with the waveform
    :param factor: speed factor (>1 speeds up, <1 slows down)
    :param phase_params: the `phase` section of data/voice.yaml
    :return: time-stretched waveform
    """
    if factor == 1:
        return x

    nfft = phase_params['nfft']
    hop = phase_params['hop']

    # librosa's default hop_length is n_fft // 4, which matches hop=64 for nfft=256
    stft = librosa.core.stft(x, n_fft=nfft).transpose()
    stft_cols = stft.shape[1]

    times = np.arange(0, stft.shape[0], factor)
    phase_adv = (2 * np.pi * hop * np.arange(0, stft_cols)) / nfft
    stft = np.concatenate((stft, np.zeros((1, stft_cols))), axis=0)

    indices = np.floor(times).astype(int)
    alpha = np.expand_dims(times - np.floor(times), axis=1)
    mag = (1. - alpha) * np.absolute(stft[indices, :]) + alpha * np.absolute(stft[indices + 1, :])

    dphi = np.angle(stft[indices + 1, :]) - np.angle(stft[indices, :]) - phase_adv
    dphi = dphi - 2 * np.pi * np.floor(dphi / (2 * np.pi))

    phase_adv_acc = np.matmul(np.expand_dims(np.arange(len(times) + 1), axis=1), np.expand_dims(phase_adv, axis=0))
    phase = np.concatenate((np.zeros((1, stft_cols)), np.cumsum(dphi, axis=0)), axis=0) + phase_adv_acc
    phase += np.angle(stft[0, :])

    stft_new = mag * np.exp(phase[:-1, :] * 1j)

    return librosa.core.istft(stft_new.transpose())
--------------------------------------------------------------------------------
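Both transforms work on plain numpy waveforms, so they can be sanity-checked on a synthetic tone without loading any model. A sketch using the exact parameter values from data/voice.yaml — at factor 2, stretch_wave should return a signal roughly half as long, while shift_pitch keeps the length and raises the tone:

import numpy as np
from utils.voice_control import shift_pitch, stretch_wave

fs = 22050
t = np.linspace(0, 1, fs, endpoint=False)
tone = np.sin(2 * np.pi * 220 * t).astype(np.float32)  # one second of a 220 Hz sine

psola_params = {"max_hz": 1050, "min_hz": 40, "analysis_win_ms": 40,
                "max_change": 1.455, "min_change": 0.695}
phase_params = {"nfft": 256, "hop": 64}

higher = shift_pitch(tone, fs, 1.2, psola_params)  # ~264 Hz after shifting
faster = stretch_wave(tone, 2.0, phase_params)
print(len(tone), len(higher), len(faster))  # len(faster) is roughly len(tone) / 2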