├── .gitignore
├── .gitmodules
├── LICENSE
├── backend_wrappers
│   ├── __init__.py
│   ├── tacotron.py
│   └── waveglow.py
├── data
│   ├── config.yaml
│   ├── text_handler_cfg.yaml
│   └── voice.yaml
├── example.py
├── logger.py
├── requirements.txt
├── synthesizer.py
└── utils
    ├── async_utils.py
    └── voice_control.py
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | **/.idea
3 |
4 | venv/
5 |
6 | Logs/
7 | data/*
8 | !data/*.yaml
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "backend/tacotron2"]
2 | path = backend/tacotron2
3 | url = https://github.com/sovaai/sova-tts-engine
4 | [submodule "backend/waveglow"]
5 | path = backend/waveglow
6 | url = https://github.com/sovaai/sova-tts-vocoder
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright (c) 2020, Virtual Assistants, LLC
190 | All rights reserved.
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/backend_wrappers/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | backend_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../backend")
5 |
6 | import_path = os.path.join(backend_path, "tacotron2")
7 | sys.path.insert(0, import_path)
8 | from .tacotron import Tacotron2Wrapper
9 | sys.path.pop(0)
10 |
11 | import_path = os.path.join(backend_path, "waveglow")
12 | sys.path.insert(0, import_path)
13 | from .waveglow import WaveglowWrapper
14 | sys.path.pop(0)
--------------------------------------------------------------------------------
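
Note: the insert/pop around each import above keeps the submodules' bare top-level
module names (hparams, model, denoiser, glow) importable without leaving both
directories on sys.path at once. A reusable sketch of the same pattern as a context
manager (not part of this repo):

    import sys
    from contextlib import contextmanager

    @contextmanager
    def prepended_path(path):
        # temporarily put `path` at the front of the import search path
        sys.path.insert(0, path)
        try:
            yield
        finally:
            sys.path.pop(0)
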
/backend_wrappers/tacotron.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from hparams import create_hparams
4 | from model import load_model
5 | from modules.layers import TacotronSTFT
6 |
7 |
8 | class HparamsNotFound(Exception):
9 | pass
10 |
11 |
12 | class Tacotron2Wrapper:
13 | def __init__(self, model_path, device, hparams_path=None, steps_per_symbol=10, gate_threshold=0.5):
14 | self.device = torch.device("cpu" if not torch.cuda.is_available() else device)
15 | self.dtype = torch.float if self.device.type == "cpu" else torch.half
16 |
17 | _checkpoint = torch.load(model_path, map_location=self.device)
18 | _hparams = _checkpoint.get("hparams", None)
19 |         if _hparams is None:
20 |             # no hparams in the checkpoint; fall back to an external file
21 |             if hparams_path is None:
22 |                 raise HparamsNotFound("The hparams dict is not present either in the checkpoint "
23 |                                       "or as a file.")
24 |             _hparams = hparams_path
25 |
26 | self.hparams = create_hparams(_hparams)
27 |
28 |         _charset = self.hparams.get("language", None)  # backward compatibility with old configs
29 | if _charset is not None:
30 | self.hparams.charset = _charset
31 | self.hparams.device = self.device
32 |
33 | self.model = load_model(self.hparams)
34 | self.model.load_state_dict(_checkpoint["state_dict"])
35 | self.model.eval().to(device=self.device, dtype=self.dtype)
36 |
37 | self.stft = TacotronSTFT(
38 | self.hparams.filter_length, self.hparams.hop_length, self.hparams.win_length,
39 | self.hparams.n_mel_channels, self.hparams.sampling_rate, self.hparams.mel_fmin,
40 | self.hparams.mel_fmax
41 | )
42 |
43 | self.steps_per_symbol = steps_per_symbol
44 | self.gate_threshold = gate_threshold
45 |
46 |
47 | def __call__(self, sequence, **kwargs):
48 | sequence = torch.LongTensor(sequence).view(1, -1)
49 | sequence = sequence.to(device=self.device)
50 |
51 | kwargs["max_decoder_steps"] = int(self.steps_per_symbol * sequence.size(-1))
52 |
53 | mel_outputs, mel_outputs_postnet, gates, alignments = self.model.inference(sequence, **kwargs)
54 |
55 | return mel_outputs_postnet
--------------------------------------------------------------------------------
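
A hypothetical call sketch (the checkpoint path and token IDs are placeholders):
the wrapper caps decoding at steps_per_symbol frames per input symbol, so a
four-token sequence with steps_per_symbol=10 stops after at most 40 decoder steps.

    # Assumes the tacotron2 submodule is on sys.path (see backend_wrappers/__init__.py).
    from backend_wrappers import Tacotron2Wrapper

    engine = Tacotron2Wrapper("data/tacotron2.pt", device="cuda", steps_per_symbol=10)
    mel = engine([12, 5, 42, 7])  # token IDs -> mel spectrogram (max_decoder_steps=40)
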
/backend_wrappers/waveglow.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import numpy as np
4 |
5 | from denoiser import Denoiser
6 | import glow
7 |
8 |
9 | _waveglow_path = sys.path[0]
10 |
11 |
12 | class WaveglowWrapper:
13 | def __init__(self, model_path, device, sigma=0.666, strength=0.1):
14 | self.device = torch.device("cpu" if not torch.cuda.is_available() else device)
15 | self.dtype = torch.float if self.device.type == "cpu" else torch.half
16 |
17 | self.model = torch.load(model_path, map_location=self.device)["model"]
18 | self.model.device = self.device
19 |
20 |         for m in self.model.modules():
21 |             if "Conv" in str(type(m)):
22 |                 m.padding_mode = "zeros"  # attribute missing in checkpoints saved with older torch
23 |
24 | self.model.eval().to(device=self.device, dtype=self.dtype)
25 |
26 | for k in self.model.convinv:
27 | k.float()
28 |
29 | self.denoiser = Denoiser(self.model, device=self.device)
30 |
31 | self.sigma = sigma
32 | self.strength = strength
33 |
34 |
35 | def __call__(self, spectrogram):
36 | with torch.no_grad():
37 | audio = self.model.infer(spectrogram, self.sigma)
38 |
39 | return audio
40 |
41 |
42 | def denoise(self, audio):
43 |         if isinstance(audio, np.ndarray):
44 | audio = torch.tensor(audio).to(self.device, self.dtype)
45 |
46 | if audio.ndim == 1:
47 | audio = audio.view(1, -1)
48 | audio = self.denoiser(audio, self.strength)[:, 0]
49 |
50 | return audio.data.cpu().numpy()
--------------------------------------------------------------------------------
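
Together the two wrappers form the text-to-waveform backend: the engine maps a
token sequence to a mel spectrogram and WaveGlow renders it to audio. A minimal
chaining sketch, assuming both checkpoints exist at the placeholder paths:

    from backend_wrappers import Tacotron2Wrapper, WaveglowWrapper

    engine = Tacotron2Wrapper("data/tacotron2.pt", device="cuda")
    vocoder = WaveglowWrapper("data/waveglow.pt", device="cuda", sigma=0.666)

    mel = engine([12, 5, 42, 7])    # placeholder token IDs -> mel spectrogram
    audio = vocoder(mel)            # mel -> raw waveform (torch tensor)
    audio = vocoder.denoise(audio)  # strip WaveGlow bias noise -> numpy array
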
/data/config.yaml:
--------------------------------------------------------------------------------
1 | general:
2 | device: "cuda"
3 | pause_type: "silence"
4 | sample_rate: 22050
5 |
6 | tacotron2:
7 | voice_control_cfg: "data/voice.yaml"
8 | user_dict:
9 |
10 | text_handler:
11 | config_path: "data/text_handler_cfg.yaml"
12 | out_max_length: 200
13 |
14 | modules:
15 | engine: tacotron2
16 | vocoder: waveglow
17 |
18 | engine:
19 | tacotron2:
20 | model_path:
21 | hparams_path:
22 | options:
23 | steps_per_symbol: 10
24 | gate_threshold: 0.5
25 |
26 | vocoder:
27 | waveglow:
28 | model_path:
29 | options:
30 | sigma: 0.666
31 | strength: 0.1
--------------------------------------------------------------------------------
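
module_from_config in synthesizer.py consumes the engine/vocoder sections above by
flattening the options block into constructor keyword arguments, skipping None
values so the wrapper defaults apply. With this file (model paths left empty), the
tacotron2 entry expands to the equivalent of:

    Tacotron2Wrapper(model_path=None, hparams_path=None, device="cuda",
                     steps_per_symbol=10, gate_threshold=0.5)
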
/data/text_handler_cfg.yaml:
--------------------------------------------------------------------------------
1 | handler:
2 | out_max_length: 200
3 | charset: ru_trans
4 | modules: [emphasizer, phonetizer]
5 |
6 | emphasizer:
7 | type: rule_based
8 | prefer_user: true
9 | dict_source:
10 |
11 | phonetizer:
12 | type: rule_based
13 | dict_source:
--------------------------------------------------------------------------------
/data/voice.yaml:
--------------------------------------------------------------------------------
1 | psola:
2 | max_hz: 1050
3 | min_hz: 40
4 | analysis_win_ms: 40
5 | max_change: 1.455
6 | min_change: 0.695
7 |
8 | phase:
9 | nfft: 256
10 | hop: 64
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | from synthesizer import Synthesizer
2 |
3 |
4 | def test():
5 | tacotron = Synthesizer.from_config("data/config.yaml", name="tacotron2")
6 |
7 | samples = [
8 | "Съешь же ещё этих мягких французских булок да выпей чаю.",
9 |
10 | "Широкая электрификация южных губерний даст мощный толчок подъёму сельского хозяйства.",
11 |
12 | "В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!"
13 | ]
14 |
15 | save_path = "data/waves"
16 | for i, sample in enumerate(samples):
17 | audio = tacotron.synthesize(
18 | text=sample
19 | )
20 |
21 | tacotron.save(audio, save_path, str(i))
22 |
23 |
24 | if __name__ == "__main__":
25 | test()
--------------------------------------------------------------------------------
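
example.py uses the blocking synthesize(); for incremental playback there is also
generate(), which yields audio chunks through BackgroundGenerator while inference
keeps running. A sketch (handle_chunk is a placeholder for an audio sink):

    from synthesizer import Synthesizer

    tts = Synthesizer.from_config("data/config.yaml", name="tacotron2")
    for chunk in tts.generate("Съешь же ещё этих мягких французских булок."):
        handle_chunk(chunk)  # e.g. write to a sound device or a network socket
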
/logger.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from loguru import logger
3 |
4 |
5 | # sys.stdout.reconfigure(encoding="utf-8")
6 | class Format:
7 | time = "{time:YYYY-MM-DD HH:mm:ss.SSS}"
8 | level = "{level: <8}"
9 | module = "{name}:{function}:{line}"
10 | message = "{message}"
11 |
12 | LEVEL = "INFO"
13 | SAVE_TO_FILE = False
14 | _filename = "data/logs/user.log"
15 |
16 | logger.remove()
17 | logger.add(sys.stdout, level=LEVEL)
18 |
19 | if SAVE_TO_FILE:
20 | logger.add(_filename, encoding="utf8")
21 |
22 |
23 | # from collections import defaultdict
24 | # from random import choice
25 | #
26 | # colors = ["blue", "green", "magenta", "red", "yellow"]
27 | # color_per_module = defaultdict(lambda: choice(colors))
28 | #
29 | # logger.bind(synthesizer_name=name)
30 | # _color_tag = choice(colors)
31 | # _name_fmt = "<{}>".format(_color_tag) + "{extra[synthesizer_name]}" + "{}>".format(_color_tag)
32 | # _formatter = " | ".join((Format.time, Format.level, Format.module, _name_fmt, Format.message))
33 | # logger.add(sys.stdout, format=_formatter)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.4.0
2 | soundfile==0.10.3.post1
3 | scipy==1.4.1
4 | librosa==0.7.2
5 | numpy==1.13.3
6 | loguru
7 | -e git+https://github.com/sovaai/sova-tts-tps@v1.0.1#egg=TPS
--------------------------------------------------------------------------------
/synthesizer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import yaml
5 |
6 | import numpy as np
7 | import soundfile
8 |
9 | from tps import cleaners, Handler, load_dict, save_dict
10 | from tps.types import Delimiter
11 |
12 | root_path = os.path.dirname(os.path.abspath(__file__))
13 | sys.path.insert(0, root_path)
14 |
15 | import backend_wrappers as bw
16 | from utils.async_utils import BackgroundGenerator
17 | from utils.voice_control import shift_pitch, stretch_wave
18 | from logger import logger
19 |
20 | sys.path.pop(0)
21 |
22 |
23 | def uniqid():
24 |     # the module-level `import time` is sufficient; no local re-import needed
25 |     return hex(int(time.time() * 1e7))[2:]
26 |
27 |
28 | _modules_dict = {
29 | "tacotron2": bw.Tacotron2Wrapper,
30 | "waveglow": bw.WaveglowWrapper
31 | }
32 |
33 |
34 | _pauses = {
35 | Delimiter.eos: 10000,
36 | Delimiter.semicolon: 5000,
37 | Delimiter.colon: 3000,
38 | Delimiter.comma: 2000,
39 | Delimiter.space: 1000
40 | }
41 |
42 |
43 | class Synthesizer:
44 | def __init__(self, name, text_handler, engine, vocoder, sample_rate, device="cuda", pause_type="silence",
45 | voice_control_cfg=None, user_dict=None):
46 | self.name = name
47 |
48 | self.text_handler = text_handler
49 | self.engine = engine
50 | self.vocoder = vocoder
51 |
52 | self.sample_rate = sample_rate
53 |
54 | self.device = device
55 |
56 | self.pause_type = pause_type
57 | self.voice_control_cfg = self.load_config(voice_control_cfg)
58 |
59 | self.user_dict = None
60 | self._dict_source = None
61 | self.load_user_dict(user_dict)
62 |
63 | logger.info("Synthesizer {} is ready".format(name))
64 |
65 |
66 | def synthesize(self, text, **kwargs):
67 | logger.info(text)
68 |
69 | mask_stress = kwargs.pop("mask_stress", False)
70 | mask_phonemes = kwargs.pop("mask_phonemes", False)
71 |
72 | sequence = self.text_handler(
73 | text=text,
74 | cleaner=cleaners.light_punctuation_cleaners,
75 | user_dict=self.user_dict,
76 | keep_delimiters=True,
77 | mask_stress=mask_stress, mask_phonemes=mask_phonemes
78 | )
79 |
80 | audio_list = list(self._generate_audio(sequence, **kwargs))
81 | audio = np.concatenate(audio_list)
82 |
83 | return audio
84 |
85 |
86 | def generate(self, text, **kwargs):
87 | mask_stress = kwargs.pop("mask_stress", False)
88 | mask_phonemes = kwargs.pop("mask_phonemes", False)
89 |
90 | sequence = self.text_handler.generate(
91 | text=text,
92 | cleaner=cleaners.light_punctuation_cleaners,
93 | user_dict=self.user_dict,
94 | keep_delimiters=True,
95 | mask_stress=mask_stress, mask_phonemes=mask_phonemes
96 | )
97 |
98 | return BackgroundGenerator(self._generate_audio(sequence, **kwargs))
99 |
100 |
101 | def _generate_audio(self, sequence, **kwargs):
102 | logger.debug("kwargs: {}".format(kwargs))
103 |
104 | for unit in sequence:
105 | if unit in Delimiter:
106 | duration = _pauses[unit]
107 | audio = generate_pause(duration, ptype=self.pause_type)
108 | else:
109 | logger.debug(unit)
110 | unit = self.text_handler.check_eos(unit)
111 | unit = self.text_handler.text2vec(unit)
112 |
113 | spectrogram = self.engine(unit, **kwargs)
114 | audio = self.vocoder(spectrogram)
115 | audio = self.vocoder.denoise(audio)
116 |
117 | audio = self.post_process(audio, **kwargs)
118 |
119 | yield audio
120 |
121 |
122 | def post_process(self, audio, **kwargs):
123 | tone_factor = kwargs.pop("tone_factor", None)
124 | speed_factor = kwargs.pop("speed_factor", None)
125 |
126 | if tone_factor or speed_factor:
127 | audio = audio.squeeze()
128 | if tone_factor:
129 | audio = self.change_pitch(audio, tone_factor)
130 | if speed_factor:
131 | audio = self.change_speed(audio, speed_factor)
132 |
133 | audio = self.vocoder.denoise(audio)
134 |
135 | return audio.squeeze()
136 |
137 |
138 | def save(self, audio, path, prefix=None):
139 | os.makedirs(path, exist_ok=True)
140 | prefix = [prefix] if prefix is not None else []
141 |
142 | waves_format = ".wav"
143 | name = "_".join(prefix + [self.name, uniqid(), time.strftime("%Y-%m-%d_%H-%M")]) + waves_format
144 |
145 | file_path = os.path.join(path, name)
146 | soundfile.write(file_path, audio, self.sample_rate)
147 |
148 | logger.info("Audio was saved as {}".format(os.path.abspath(file_path)))
149 |
150 | return file_path
151 |
152 |
153 | def change_speed(self, audio, factor):
154 | if factor > 2 or factor < 0.5:
155 |             logger.error("Speed factor is out of range [0.5, 2.0] -- original signal returned")
156 | return audio
157 |
158 | params = self.voice_control_cfg["phase"]
159 |
160 | return stretch_wave(audio, factor, params)
161 |
162 |
163 | def change_pitch(self, audio, factor):
164 | if factor > 1.5 or factor < 0.75:
165 |             logger.error("Tone factor is out of range [0.75, 1.5] -- original signal returned")
166 | return audio
167 |
168 | params = self.voice_control_cfg["psola"]
169 |
170 | return shift_pitch(audio, self.sample_rate, factor, params)
171 |
172 |
173 | def load_user_dict(self, user_dict):
174 | data_dir = "data"
175 | if isinstance(user_dict, dict) or user_dict is None:
176 | if not os.path.exists(data_dir):
177 | os.makedirs(data_dir)
178 | logger.info("Data folder was created along the path {}".format(os.path.abspath(data_dir)))
179 | self._dict_source = os.path.join(data_dir, "{}_user_dict.json".format(self.name))
180 | else:
181 | self._dict_source = user_dict
182 | assert self._dict_source.endswith((".json", ".yaml"))
183 |
184 | self.user_dict = load_dict(user_dict)
185 | logger.info("User dictionary has been loaded")
186 |
187 |
188 | def get_user_dict(self):
189 | logger.info("Request for the user dictionary was received")
190 | return self.user_dict
191 |
192 |
193 | def update_user_dict(self, new_dict):
194 | self.user_dict.update(new_dict)
195 | logger.info("User dictionary has been updated")
196 |
197 | save_dict(self.user_dict, self._dict_source)
198 | logger.info("User dictionary has been saved")
199 |
200 |
201 | def replace_user_dict(self, new_dict):
202 | self.user_dict = new_dict
203 | logger.info("User dictionary has been replaced")
204 |
205 | save_dict(self.user_dict, self._dict_source)
206 | logger.info("User dictionary has been saved")
207 |
208 |
209 | @classmethod
210 | def from_config(cls, config, name):
211 | if isinstance(config, str):
212 | logger.debug("Loading synthesizer from config file {}".format(os.path.abspath(config)))
213 |
214 | config = cls.load_config(config)
215 |
216 | params = config["general"]
217 | params["name"] = name
218 | device = params["device"]
219 | assert device is not None
220 |
221 | modules_config = config.pop(name)
222 | params["voice_control_cfg"] = modules_config["voice_control_cfg"]
223 | params["user_dict"] = modules_config["user_dict"]
224 |
225 | params["text_handler"] = _load_text_handler(modules_config["text_handler"])
226 |
227 | chosen = modules_config["modules"]
228 |
229 | for mtype, mname in chosen.items():
230 | params[mtype] = Synthesizer.module_from_config(modules_config, mtype, mname, device)
231 |
232 | return Synthesizer(**params)
233 |
234 |
235 | @staticmethod
236 | def module_from_config(modules_config, mtype, mname, device):
237 | logger.info("Loading {} module".format(mname))
238 |
239 | module_config = modules_config[mtype][mname]
240 | module_config["device"] = device
241 |
242 | for key, value in module_config.pop("options", {}).items():
243 | if value is not None:
244 | module_config[key] = value
245 |
246 | return _modules_dict[mname](**module_config)
247 |
248 |
249 | @staticmethod
250 | def load_config(config_source):
251 | if isinstance(config_source, dict):
252 | return config_source
253 | elif isinstance(config_source, str):
254 | pass
255 | else:
256 |             raise TypeError("config_source must be a dict or a path to a yaml file")
257 |
258 | with open(config_source, "r", encoding="utf-8") as stream:
259 | config = yaml.safe_load(stream)
260 |
261 | assert config is not None
262 |
263 | return config
264 |
265 |
266 | def generate_pause(duration, eps=1e-4, ptype='white_noise'):
267 | if ptype == 'silence':
268 | pause = np.zeros((duration, ))
269 | elif ptype == 'white_noise':
270 | pause = np.random.random((duration, )) * eps
271 | else:
272 |         raise ValueError("unknown pause type: {}".format(ptype))
273 |
274 | return pause.astype(np.float32)
275 |
276 |
277 | def _load_text_handler(config_dict):
278 | logger.info("Loading text handler")
279 |
280 | out_max_length = config_dict["out_max_length"]
281 |
282 | config_path = config_dict["config_path"]
283 | assert config_path is not None
284 |
285 |     handler_config = Synthesizer.load_config(config_path)
286 | handler_config["handler"]["out_max_length"] = out_max_length
287 |
288 | return Handler.from_config(handler_config)
--------------------------------------------------------------------------------
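
The _pauses table is denominated in samples, so the audible pause length depends on
the configured sample rate. At the shipped 22050 Hz the values work out roughly to:

    # pause duration in seconds = samples / sample_rate, at sample_rate = 22050
    # eos:       10000 / 22050 ≈ 0.454 s
    # semicolon:  5000 / 22050 ≈ 0.227 s
    # colon:      3000 / 22050 ≈ 0.136 s
    # comma:      2000 / 22050 ≈ 0.091 s
    # space:      1000 / 22050 ≈ 0.045 s
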
/utils/async_utils.py:
--------------------------------------------------------------------------------
1 | import queue
2 | import threading
3 |
4 |
5 | class BackgroundGenerator(threading.Thread):
6 | def __init__(self, generator):
7 | super().__init__()
8 | self.queue = queue.Queue(4)
9 | self.generator = generator
10 |
11 | self.daemon = True
12 | self.start()
13 |
14 |
15 | def __iter__(self):
16 | return self
17 |
18 |
19 | def __next__(self):
20 | item = self.queue.get()
21 | if item is None:
22 | raise StopIteration
23 | return item
24 |
25 |
26 | def run(self):
27 | for item in self.generator:
28 | self.queue.put(item)
29 |         self.queue.put(None)  # sentinel: tells __next__ the generator is exhausted
--------------------------------------------------------------------------------
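
BackgroundGenerator starts its worker thread in __init__ and uses a queue bounded
at 4 items, so the producer runs at most four chunks ahead of the consumer; None is
the exhaustion sentinel, which also means the wrapped generator must never yield
None itself. A self-contained usage sketch:

    import time
    from utils.async_utils import BackgroundGenerator

    def slow_numbers():
        for i in range(3):
            time.sleep(0.1)  # stand-in for GPU inference
            yield i

    for item in BackgroundGenerator(slow_numbers()):
        print(item)          # items arrive as soon as the worker produces them
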
/utils/voice_control.py:
--------------------------------------------------------------------------------
1 | """
2 | Based on:
3 | https://github.com/gaganbahga/time_stretch
4 | https://github.com/sannawag/TD-PSOLA
5 | """
6 |
7 | import librosa
8 | import numpy as np
9 | from numpy.fft import fft, ifft
10 |
11 |
12 | def shift_pitch(signal, fs, factor, psola_params):
13 |     """
14 |     Shifts the speech tone by a factor of 'factor' using TD-PSOLA.
15 |
16 |     :param signal: input waveform (1-D numpy array)
17 |     :param fs: sampling rate in Hz
18 |     :param factor: pitch-shift ratio; > 1 raises the tone, < 1 lowers it
19 |     :param psola_params: dict with max_hz, min_hz, analysis_win_ms, max_change, min_change
20 |     :return: pitch-shifted waveform of the same length
21 |     """
22 | if factor == 1:
23 | return signal
24 |
25 | peaks = find_peaks(signal, fs, psola_params)
26 | new_signal = psola(signal, peaks, factor)
27 |
28 | return new_signal
29 |
30 |
31 | def find_peaks(signal, fs, psola_params):
32 | max_hz = psola_params['max_hz']
33 | min_hz = psola_params['min_hz']
34 | analysis_win_ms = psola_params['analysis_win_ms']
35 | max_change = psola_params['max_change']
36 | min_change = psola_params['min_change']
37 |
38 | N = len(signal)
39 | min_period = fs // max_hz
40 | max_period = fs // min_hz
41 |
42 | # compute pitch periodicity
43 | sequence_len = int(analysis_win_ms / 1000 * fs) # analysis sequence length in samples
44 | periods = compute_periods_per_sequence(signal, sequence_len, min_period, max_period)
45 |
46 | # simple hack to avoid octave error: assume that the pitch should not vary much, restrict range
47 | mean_period = np.mean(periods)
48 | max_period = int(mean_period * 1.1)
49 | min_period = int(mean_period * 0.9)
50 | periods = compute_periods_per_sequence(signal, sequence_len, min_period, max_period)
51 |
52 | # find the peaks
53 | peaks = [np.argmax(signal[:int(periods[0] * 1.1)])]
54 | while True:
55 | prev = peaks[-1]
56 | idx = prev // sequence_len # current autocorrelation analysis window
57 | if prev + int(periods[idx] * max_change) >= N:
58 | break
59 | # find maximum near expected location
60 | peaks.append(prev + int(periods[idx] * min_change) +
61 | np.argmax(signal[prev + int(periods[idx] * min_change): prev + int(periods[idx] * max_change)]))
62 | return np.array(peaks)
63 |
64 |
65 | def compute_periods_per_sequence(signal, sequence_len, min_period, max_period):
66 | N = len(signal)
67 | offset = 0 # current sample offset
68 | periods = [] # period length of each analysis sequence
69 |
70 | while offset < N:
71 | frame = signal[offset:offset + sequence_len]
72 | if len(frame) < sequence_len:
73 | frame_padded = np.zeros((sequence_len, ))
74 | frame_padded[:len(frame)] = frame
75 | frame = frame_padded
76 |
77 | fourier = fft(frame)
78 | fourier[0] = 0 # remove DC component
79 | autoc = ifft(fourier * np.conj(fourier)).real
80 | autoc_peak = min_period + np.argmax(autoc[min_period:max_period])
81 | periods.append(autoc_peak)
82 | offset += sequence_len
83 |
84 | return periods
85 |
86 |
87 | def psola(signal, peaks, f_ratio):
88 | N = len(signal)
89 | # Interpolate
90 | new_signal = np.zeros(N)
91 | new_peaks_ref = np.linspace(0, len(peaks) - 1, int(len(peaks) * f_ratio))
92 | new_peaks = np.zeros(len(new_peaks_ref)).astype(int)
93 |
94 | for i in range(len(new_peaks)):
95 | weight = new_peaks_ref[i] % 1
96 | left = np.floor(new_peaks_ref[i]).astype(int)
97 | right = np.ceil(new_peaks_ref[i]).astype(int)
98 | new_peaks[i] = int(peaks[left] * (1 - weight) + peaks[right] * weight)
99 |
100 | # PSOLA
101 | for j in range(len(new_peaks)):
102 | # find the corresponding old peak index
103 | i = np.argmin(np.abs(peaks - new_peaks[j]))
104 | # get the distances to adjacent peaks
105 | P1 = [new_peaks[j] if j == 0 else new_peaks[j] - new_peaks[j-1],
106 | N - 1 - new_peaks[j] if j == len(new_peaks) - 1 else new_peaks[j+1] - new_peaks[j]]
107 | # edge case truncation
108 | if peaks[i] - P1[0] < 0:
109 | P1[0] = peaks[i]
110 | if peaks[i] + P1[1] > N - 1:
111 | P1[1] = N - 1 - peaks[i]
112 | # linear OLA window
113 | window = list(np.linspace(0, 1, P1[0] + 1)[1:]) + list(np.linspace(1, 0, P1[1] + 1)[1:])
114 | # center window from original signal at the new peak
115 | new_signal[new_peaks[j] - P1[0]: new_peaks[j] + P1[1]] += window * signal[peaks[i] - P1[0]: peaks[i] + P1[1]]
116 |
117 | return new_signal
118 |
119 |
120 | def stretch_wave(x, factor, phase_params):
121 |     """
122 |     Changes the speech speed by a factor of 'factor' via phase vocoding, preserving the tone.
123 |
124 |     :param x: input waveform (1-D numpy array)
125 |     :param factor: speed ratio; > 1 speeds the speech up, < 1 slows it down
126 |     :param phase_params: dict with 'nfft' and 'hop' STFT settings
127 |     :return: time-stretched waveform
128 |     """
129 | if factor == 1:
130 | return x
131 |
132 | nfft = phase_params['nfft']
133 | hop = phase_params['hop']
134 |
135 | stft = librosa.core.stft(x, n_fft=nfft).transpose()
136 | stft_cols = stft.shape[1]
137 |
138 | times = np.arange(0, stft.shape[0], factor)
139 |     phase_adv = (2 * np.pi * hop * np.arange(0, stft_cols)) / nfft
140 | stft = np.concatenate((stft, np.zeros((1, stft_cols))), axis=0)
141 |
142 |     indices = np.floor(times).astype(int)
143 | alpha = np.expand_dims(times - np.floor(times), axis=1)
144 | mag = (1. - alpha) * np.absolute(stft[indices, :]) + alpha * np.absolute(stft[indices + 1, :])
145 |
146 | dphi = np.angle(stft[indices + 1, :]) - np.angle(stft[indices, :]) - phase_adv
147 | dphi = dphi - 2 * np.pi * np.floor(dphi/(2 * np.pi))
148 |
149 |     phase_adv_acc = np.matmul(np.expand_dims(np.arange(len(times) + 1), axis=1), np.expand_dims(phase_adv, axis=0))
150 |     phase = np.concatenate((np.zeros((1, stft_cols)), np.cumsum(dphi, axis=0)), axis=0) + phase_adv_acc
151 | phase += np.angle(stft[0, :])
152 |
153 | stft_new = mag * np.exp(phase[:-1, :] * 1j)
154 |
155 | return librosa.core.istft(stft_new.transpose())
--------------------------------------------------------------------------------
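
A quick sanity note on factor semantics, readable off the code above: in
shift_pitch, factor > 1 packs more pitch peaks into the same signal length and so
raises the tone, while in stretch_wave, factor > 1 samples fewer STFT frames and so
shortens (speeds up) the speech. A minimal check on a synthetic tone (the input
signal is made up, not from the repo):

    import numpy as np
    from utils.voice_control import stretch_wave

    fs = 22050
    t = np.arange(fs) / fs
    tone = np.sin(2 * np.pi * 220 * t).astype(np.float32)  # 1 s test tone at 220 Hz

    fast = stretch_wave(tone, 2.0, {"nfft": 256, "hop": 64})
    print(len(tone), len(fast))  # output is roughly half as long -> 2x speed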