├── test ├── __init__.py ├── conftest.py └── test_engine.py ├── engine ├── vc │ └── __init__.py ├── __init__.py ├── vocoder │ ├── __init__.py │ ├── utils.py │ └── hifi_gan.py ├── loading.py ├── feature_extraction.py └── general.py ├── datasets ├── remembered_dataset_pathes.txt └── test │ └── example │ ├── example.mp3 │ └── example.wav ├── gui ├── __init__.py ├── widgets.py └── gui.py ├── utils ├── __init__.py ├── hparams.py └── data.py ├── img_dashboard.png ├── .gitignore ├── requirements.txt ├── app.py ├── config.yaml ├── README.md ├── dist ├── README.md └── warn_processing.py ├── LICENSE └── toolbox.py /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /engine/vc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/remembered_dataset_pathes.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gui/__init__.py: -------------------------------------------------------------------------------- 1 | from .gui import GUI -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import load_wav -------------------------------------------------------------------------------- /engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .general import VC, Utterance -------------------------------------------------------------------------------- /engine/vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .hifi_gan import Generator as HifiGenerator -------------------------------------------------------------------------------- /img_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/img_dashboard.png -------------------------------------------------------------------------------- /datasets/test/example/example.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/datasets/test/example/example.mp3 -------------------------------------------------------------------------------- /datasets/test/example/example.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/datasets/test/example/example.wav -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | .pytest_cache/ 3 | **/__pycache__/ 4 | 5 | **/.DS_Store 6 | 7 | # deployment cache 8 | dist/build/ 9 | dist/dist/ 10 | dist/hooks/ 11 | dist/VCToolbox.spec -------------------------------------------------------------------------------- /utils/hparams.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | from pathlib import Path 3 | import __main__ 4 | 5 | 6 | root_dir = 
Path(__file__).parent.parent.resolve() 7 | cfg = OmegaConf.load(root_dir / 'config.yaml') 8 | cfg.root_dir = str(root_dir.resolve()) -------------------------------------------------------------------------------- /engine/vocoder/utils.py: -------------------------------------------------------------------------------- 1 | def init_weights(m, mean=0.0, std=0.01): 2 | classname = m.__class__.__name__ 3 | if classname.find("Conv") != -1: 4 | m.weight.data.normal_(mean, std) 5 | 6 | 7 | def get_padding(kernel_size, dilation=1): 8 | return int((kernel_size*dilation - dilation)/2) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.3.2 2 | torch==1.8.1 3 | umap_learn==0.5.1 4 | omegaconf==2.0.6 5 | Resemblyzer==0.1.1.dev0 6 | numpy==1.20.1 7 | matplotlib==3.4.1 8 | sounddevice==0.4.1 9 | pytest==6.2.2 10 | gdown==3.12.2 11 | librosa==0.8.0 12 | PySide6==6.0.3 13 | PyYAML==5.4.1 14 | soundfile==0.10.3.post1 15 | umap-learn==0.5.1 16 | -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import numpy as np 4 | from utils import load_wav 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def example_wav(): 9 | wav = load_wav( 10 | os.path.join(os.path.dirname(__file__), "../datasets/test/example/example.wav") 11 | ) 12 | assert len(wav.shape) == 1 13 | return wav 14 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from toolbox import Toolbox 3 | import argparse 4 | from pathlib import Path 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser( 9 | description="Runs the toolbox", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | parser.add_argument("-d", "--datasets_root", type=Path, help= \ 13 | "Path to the directory containing your datasets.", default=Path(__file__).parent / 'datasets') 14 | parser.add_argument("--seed", type=int, default=17, help=\ 15 | "Optional random number seed value to make toolbox deterministic.") 16 | args = parser.parse_args() 17 | 18 | # Launch the toolbox 19 | Toolbox(**vars(args)) -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | data: # shouldn't be changed 2 | sample_rate: 16000 3 | n_mels: 80 4 | n_fft: 1280 5 | win_length: 1280 6 | hop_length: 320 7 | f_min: 50. 
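  # added note: with sample_rate 16000, hop_length 320 and win_length 1280 correspond to 20 ms and 80 ms analysis frames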
8 | f_max: null 9 | preemph: 0.97 10 | 11 | root_dir: null # will be defined onstart 12 | ckpt_default_path: '.cache' 13 | ckpt_dict: 14 | hifi_gan: 15 | 'generator': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v3.0/generator_pt330', 'wget'] 16 | 'config.yaml': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v3.0/config_v1_4.json', 'wget'] 17 | fragmentvc: 18 | 'model.pt': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v4.5/fragmentvc_v4_5_stage2.pt', 'wget'] -------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | from .hparams import cfg 2 | 3 | import librosa 4 | import numpy as np 5 | import soundfile as sf 6 | from pathlib import Path 7 | import os 8 | 9 | 10 | def load_wav(path): 11 | wav, _ = librosa.load(path, sr=cfg.data.sample_rate) 12 | wav = librosa.util.normalize(wav) * 0.95 13 | 14 | return wav 15 | 16 | 17 | def save_wav(path, wav, sr): 18 | """Save audio to path""" 19 | wav = np.clip(wav, -1.0, 1.0) 20 | sf.write(path, wav, sr) 21 | 22 | 23 | def get_subdirs(dir_path): 24 | dir_path = Path(dir_path) 25 | dirnames = [p.stem for p in dir_path.iterdir() if p.is_dir()] 26 | return dirnames 27 | 28 | 29 | def has_ext(filepath, ext): 30 | if isinstance(ext, str): 31 | return Path(filepath).match(f'*{ext}') 32 | else: 33 | return any(Path(filepath).match(f'*{e}') for e in ext) 34 | 35 | 36 | def get_filepathes(dir_path, ext='.wav'): 37 | for d_path, _, f_names in os.walk(dir_path): 38 | for fn in f_names: 39 | if not has_ext(fn, ext): 40 | continue 41 | 42 | yield os.path.join(d_path, fn) 43 | -------------------------------------------------------------------------------- /gui/widgets.py: -------------------------------------------------------------------------------- 1 | from PySide6.QtGui import QPixmap 2 | from PySide6.QtWidgets import QLabel 3 | from PySide6.QtCore import Signal 4 | 5 | import io 6 | 7 | def get_figure_widget(fig): 8 | buf = io.BytesIO() 9 | fig.savefig(buf, format='png') 10 | buf.seek(0) 11 | 12 | pixmap = QPixmap() 13 | pixmap.loadFromData(buf.read()) 14 | buf.close() 15 | 16 | fig_label = QLabel() 17 | fig_label.setPixmap(pixmap) 18 | 19 | return fig_label 20 | 21 | 22 | def get_figure_pixmap(fig): 23 | buf = io.BytesIO() 24 | fig.savefig(buf, format='png', transparent=True) 25 | buf.seek(0) 26 | 27 | pixmap = QPixmap() 28 | pixmap.loadFromData(buf.read()) 29 | buf.close() 30 | 31 | return pixmap 32 | 33 | 34 | 35 | class FigureWidget(QLabel): 36 | draw_signal = Signal() 37 | 38 | def __init__(self, fig, axis): 39 | super().__init__() 40 | self.fig = fig 41 | self.axis = axis 42 | self.draw_signal.connect(self.draw) 43 | 44 | def update(self): 45 | self.draw_signal.emit() 46 | 47 | def draw(self): 48 | self.setPixmap(get_figure_pixmap(self.fig)) 49 | 50 | -------------------------------------------------------------------------------- /test/test_engine.py: -------------------------------------------------------------------------------- 1 | from utils.hparams import cfg 2 | from engine import VC, Utterance 3 | 4 | import numpy as np 5 | import torch 6 | import pytest 7 | 8 | @pytest.fixture(scope='module') 9 | def engine(): 10 | return VC() 11 | 12 | @pytest.fixture(scope='module') 13 | def src(example_wav): 14 | return example_wav 15 | 16 | @pytest.fixture(scope='module') 17 | def tgts(example_wav): 18 | return [ 19 | example_wav, 20 | 
example_wav[:len(example_wav) // 2],
21 |     ]
22 | 
23 | 
24 | def test_prepare(engine, src, tgts):
25 |     src_features, (tgt_mel, tgt_spk_emb) = engine.prepare(src, tgts)
26 | 
27 |     assert len(src_features.shape) == 3
28 |     assert len(tgt_mel.shape) == 3
29 |     assert len(tgt_spk_emb.shape) == 2
30 | 
31 | 
32 | def test_convert(engine, src, tgts):
33 |     mel = engine.convert(*engine.prepare(src, tgts))
34 | 
35 |     assert len(mel.shape) == 3
36 | 
37 | 
38 | def test_vocode(engine, src):
39 |     src = torch.from_numpy(src).unsqueeze(0)
40 |     mel = engine._get_mel(src)
41 | 
42 |     wav = engine.vocode(mel)
43 | 
44 |     assert len(wav.shape) == 2
45 | 
46 | 
47 | def test_e2e(engine, src, tgts):
48 |     # using built-in __call__
49 |     src_utt = Utterance(wav=src)
50 |     tgt_utts = [Utterance(wav=wav) for wav in tgts]
51 |     out1 = engine(src_utt, tgt_utts)
52 | 
53 |     # using step-by-step
54 |     out2 = engine.convert(*engine.prepare(src, tgts))
55 |     out2 = engine.vocode(out2).cpu().numpy()
56 | 
57 |     assert np.allclose(out1, out2)
58 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-Time Voice Conversion
2 | 
3 | ![Toolbox View](img_dashboard.png)
4 | 
5 | This repository implements a simple toolbox for voice conversion. It builds on one of the latest mel-based source-to-target one-shot voice conversion models and a fast GAN-based vocoder.
6 | 
7 | ## Installation
8 | 
9 | The application file of every release was created with [PyInstaller](https://pypi.org/project/pyinstaller/), which makes it possible to build it for many popular platforms (Windows, Mac OS X, GNU/Linux, etc.). Download the release archive for your OS, then simply start the application file named `VCToolbox` from the unarchived folder.
10 | 
11 | Available OS releases:
12 | 
13 | - MacOS
14 | - Linux
15 | 
16 | If you want to distribute for other platforms, follow the instructions in `dist/README.md`.
17 | 
18 | ## Installation (Development build)
19 | 
20 | 1. Install the requirements (Python 3.7+ was tested OK):
21 | 
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 | 
26 | **Note:** for mp3 support you should have ffmpeg installed (via `brew` on MacOS, `apt-get` on Linux, or static builds on Windows).
27 | 
28 | 2. Everything is now ready to launch the toolbox. The required models will be downloaded on the first application run. Run the toolbox with the following command:
29 | 
30 | ```bash
31 | python3 app.py
32 | ```
33 | 
34 | 3*. You may want to run the tests for this repo. This can be done via:
35 | 
36 | ```
37 | pytest
38 | ```
39 | 
40 | **Note:** this will also download the models if they are not present yet.
41 | 
42 | ## Credits
43 | 
44 | The author thanks the developer of [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) for cool design and implementation ideas.
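
For reference, here is a minimal programmatic usage sketch of the engine behind the toolbox. This script is not part of the repo; it is only an assumption-level example built from the public `engine`/`utils` API exercised by `test/test_engine.py`, using the example audio shipped under `datasets/`:

```python
# Hypothetical standalone script (not shipped with the toolbox), run from the repo root.
from engine import VC, Utterance
from utils import load_wav
from utils.data import save_wav
from utils.hparams import cfg

engine = VC()
for _ in engine.logged_init():   # downloads/loads all checkpoints on first run
    pass

src = Utterance(wav=load_wav("datasets/test/example/example.wav"))
tgts = [Utterance(wav=load_wav("datasets/test/example/example.wav"))]  # target speaker references

out = engine(src, tgts)                                  # numpy array of shape (1, num_samples)
save_wav("converted.wav", out.squeeze(0), cfg.data.sample_rate)
```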
45 | -------------------------------------------------------------------------------- /engine/loading.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from utils.hparams import cfg 3 | from .vocoder import HifiGenerator 4 | 5 | import os 6 | import torch 7 | import json 8 | 9 | ckpt_dir = os.path.join(cfg.root_dir, cfg.ckpt_default_path) 10 | ckpt_dict = cfg.ckpt_dict 11 | 12 | 13 | def get_vocoder(): 14 | group_name = 'hifi_gan' 15 | download_group(group_name) 16 | 17 | config_path = os.path.join(ckpt_dir, group_name, "config.yaml") 18 | json_config = json.loads(open(config_path).read()) 19 | with torch.no_grad(): 20 | generator = HifiGenerator(DictConfig(json_config)).eval() 21 | 22 | ckpt_path = os.path.join(ckpt_dir, group_name, "generator") 23 | state = torch.load(ckpt_path, map_location=torch.device('cpu')) 24 | generator.load_state_dict(state['generator']) 25 | generator.remove_weight_norm() 26 | 27 | return generator 28 | 29 | 30 | def get_vc_model(): 31 | group_name = 'fragmentvc' 32 | download_group(group_name) 33 | 34 | ckpt_path = os.path.join(ckpt_dir, group_name, "model.pt") 35 | model = torch.jit.load(ckpt_path).eval() 36 | 37 | return model 38 | 39 | 40 | def download_group(group_name): 41 | for filename, (url, agent) in ckpt_dict[group_name].items(): 42 | filepath = os.path.join(ckpt_dir, group_name, filename) 43 | _download(filepath, url, agent=agent) 44 | 45 | 46 | def _download(filepath, url, refresh=False, agent='wget'): 47 | ''' 48 | Download from url into filepath using agent if needed 49 | Ref: https://github.com/s3prl/s3prl 50 | ''' 51 | 52 | dirpath = os.path.dirname(filepath) 53 | os.makedirs(dirpath, exist_ok=True) 54 | 55 | if not os.path.isfile(filepath) or refresh: 56 | if agent == 'wget': 57 | os.system(f'wget {url} -O {filepath}') 58 | elif agent == 'gdown': 59 | import gdown 60 | gdown.download(url, filepath, use_cookies=False) 61 | else: 62 | print('[Download] - Unknown download agent. 
Only \'wget\' and \'gdown\' are supported.') 63 | raise NotImplementedError 64 | else: 65 | print(f'Using checkpoint found in {filepath}') 66 | -------------------------------------------------------------------------------- /engine/feature_extraction.py: -------------------------------------------------------------------------------- 1 | from resemblyzer import VoiceEncoder 2 | from transformers import Wav2Vec2Model 3 | 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | from librosa.filters import mel as librosa_mel_fn 8 | from utils.hparams import cfg 9 | 10 | 11 | def load_pretrained_spk_emb(device='cpu'): 12 | """Load speaker embedding model""" 13 | 14 | model = VoiceEncoder().to(device).eval() 15 | model.requires_grad_(False) 16 | return model 17 | 18 | 19 | def load_pretrained_feature_extractor(device='cpu', ckpt_path='facebook/wav2vec2-base-960h'): 20 | """Load pretrained Wav2Vec model.""" 21 | 22 | def extract_features(self, wav, mask): 23 | # wav2vec has window of 400, so we pad to center windows 24 | wav = torch.nn.functional.pad(wav.unsqueeze(1), (200, 200), mode='reflect').squeeze(1) 25 | return [self(wav).last_hidden_state] 26 | 27 | Wav2Vec2Model.extract_features = extract_features # for same behaviour as fairseq.Wav2Vec2Model 28 | model = Wav2Vec2Model.from_pretrained(ckpt_path).eval() 29 | model.requires_grad_(False) 30 | return model 31 | 32 | 33 | class Wav2Mel(nn.Module): 34 | def __init__(self, n_fft, hop_length, win_length, 35 | sample_rate, n_mels, f_min, f_max, preemph 36 | ): 37 | super().__init__() 38 | 39 | window = torch.hann_window(win_length).float() 40 | self.register_buffer("window", window) 41 | 42 | mel_basis = torch.from_numpy(librosa_mel_fn( 43 | sample_rate, n_fft, n_mels, f_min, f_max 44 | )).float() 45 | self.register_buffer("mel_basis", mel_basis) 46 | 47 | preemph_kernel = torch.FloatTensor([[[-preemph, 1]]]) 48 | self.register_buffer("preemph_kernel", preemph_kernel) 49 | 50 | self.n_fft = n_fft 51 | self.hop_length = hop_length 52 | self.win_length = win_length 53 | self.sample_rate = sample_rate 54 | self.n_mels = n_mels 55 | 56 | def forward(self, wav): 57 | n_pad = self.n_fft // 2 58 | 59 | while len(wav.shape) < 3: 60 | wav = wav.unsqueeze(0) 61 | 62 | wav = torch.nn.functional.conv1d(wav, self.preemph_kernel, padding=1)[:, :, :-1] 63 | 64 | wav = F.pad(wav, (n_pad, n_pad), "reflect").squeeze(0) 65 | spec = torch.stft(wav, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, 66 | window=self.window, center=False, return_complex=True 67 | ).abs() 68 | 69 | mel = torch.matmul(self.mel_basis, spec) 70 | log_mel = torch.log(torch.clamp(mel, min=1e-5)) 71 | 72 | return log_mel -------------------------------------------------------------------------------- /dist/README.md: -------------------------------------------------------------------------------- 1 | # Toolbox package building guide 2 | 3 | We use OS dependent utilite PyInstaller as package building tool. Result of steps below is a platform dependent for single application. It can be used directly on same platform as guide instructions are completed. For other platforms build on virtual machines or containers. [Read more about](https://pyinstaller.readthedocs.io/en/stable/usage.html) build tool. 
4 | 
5 | **Note:** using a Python virtual environment is highly recommended:
6 | ```
7 | # creation
8 | virtualenv ~/install_env
9 | source ~/install_env/bin/activate
10 | 
11 | # after use (delete if needed)
12 | deactivate
13 | rm -rf ~/install_env
14 | ```
15 | 
16 | The distribution procedure for this repo's toolbox consists of several steps (all steps are performed from the `dist/` directory).
17 | 
18 | ## MacOS
19 | 
20 | 1. Install all required packages from requirements.txt. Also install pyinstaller and uninstall some deprecated (but installed) packages:
21 | ```bash
22 | pip install -r ../requirements.txt
23 | pip install pyinstaller
24 | pip uninstall typing dataclasses
25 | ```
26 | 
27 | 2. Try a first build. It will definitely fail, but it provides us with an important warning log:
28 | ```bash
29 | pyinstaller --name="VCToolbox" --windowed --add-data="../config.yaml:./" --add-data="../datasets/*:datasets/" --hidden-import=typing_extensions -y --onefile ../app.py
30 | ```
31 | 
32 | **Note:** `--onefile` is optional and is not a fully supported option with PySide6 (the main Qt for Python package used in the toolbox).
33 | 
34 | After the command above, there will be two new directories ('build/' and 'dist/'). We don't actually need the 'dist/' folder yet, even though it stores our releasing app; the generated 'build/VCToolbox/warn-VCToolbox.txt' is what will be used in the next stages.
35 | 
36 | 3. Define installation hooks for the modules that were not found. For this, run the `warn_processing.py` file:
37 | ```bash
38 | python3 warn_processing.py
39 | ```
40 | 
41 | **Important:** If the script fails on the Python installation itself (there will be a string containing a Python path), you should add that string to `remove_list` in `warn_processing.py` and run the script again.
42 | 
43 | This stage is cumbersome and depends on the previous one. Some important modules that weren't found (such as librosa, etc.) are already added in the script. After the command, a `hooks/` directory will be generated with files (~60 files) defining hooks for PyInstaller. The tree will look like:
44 | 
45 | ```
46 | .
47 | ├── README.md
48 | ├── VCToolbox.spec
49 | ├── build
50 | ├── dist
51 | ├── hooks
52 | └── warn_processing.py
53 | ```
54 | 
55 | You can remove the unnecessary 'dist/' and 'build/' folders for now:
56 | ```
57 | rm -rf dist/ build/
58 | ```
59 | 
60 | 4. Build with hooks for the packages that were not found:
61 | ```bash
62 | pyinstaller --name="VCToolbox" --windowed --hidden-import=typing_extensions -y --additional-hooks-dir=hooks --onefile ../app.py
63 | ```
64 | 
65 | If everything is done right, there will be an executable file in the dist/ folder.
66 | 
67 | 5. (optional) For distribution purposes, move the app binary into the root folder, because it uses the `config.yaml` and `datasets/` paths. Then zip it into an archive, or use another utility for `.app` and `.dmg` creation.
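
A minimal sketch of this final packaging step (the archive name and exact paths below are only illustrative assumptions; adjust them to your release layout):

```bash
# run from dist/: copy the built binary next to config.yaml and datasets/ in the repo root, then archive
cp dist/VCToolbox ..
cd .. && zip -r VCToolbox-macos.zip VCToolbox config.yaml datasets/
```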
68 | 69 | -------------------------------------------------------------------------------- /dist/warn_processing.py: -------------------------------------------------------------------------------- 1 | ### Define next the folder to create the hooks files and the warning file to read the modules from 2 | output_hooks_dir = 'hooks' 3 | warning_file = 'build/VCToolbox/warn-VCToolbox.txt' 4 | 5 | import re 6 | import os 7 | import shutil 8 | 9 | shutil.rmtree(output_hooks_dir, ignore_errors=True) 10 | os.makedirs(output_hooks_dir, exist_ok=True) 11 | 12 | with open(warning_file) as file: 13 | files_content = file.readlines() 14 | 15 | clean_content = [] 16 | for line in files_content: 17 | if re.search('missing module named',line): 18 | temp_line = re.sub('.*imported by ','',line) 19 | temp_line = re.sub('\n',', ',temp_line) 20 | clean_content.append(temp_line) 21 | clean_content = list(set(clean_content)) 22 | joined_content = ''.join(clean_content) 23 | clean_content = list(set(joined_content.split('), '))) 24 | 25 | modules_toplevel = [] 26 | for line in clean_content: 27 | if re.search('top-level',line): 28 | temp_mod = re.sub(' \(.*','',line) 29 | temp_mod = re.sub('\..*','',temp_mod) 30 | modules_toplevel.append(temp_mod) 31 | modules_toplevel = list(set(modules_toplevel)) 32 | 33 | modules_conditional = [] 34 | for line in clean_content: 35 | if re.search('conditional',line): 36 | temp_mod = re.sub(' \(.*','',line) 37 | temp_mod = re.sub('\..*','',temp_mod) 38 | modules_conditional.append(temp_mod) 39 | modules_conditional = list(set(modules_conditional)) 40 | 41 | modules_delayed = [] 42 | for line in clean_content: 43 | if re.search('delayed',line): 44 | temp_mod = re.sub(' \(.*','',line) 45 | temp_mod = re.sub('\..*','',temp_mod) 46 | modules_delayed.append(temp_mod) 47 | modules_delayed = list(set(modules_delayed)) 48 | 49 | modules_optional = [] 50 | for line in clean_content: 51 | if re.search('optional',line): 52 | temp_mod = re.sub(' \(.*','',line) 53 | temp_mod = re.sub('\..*','',temp_mod) 54 | modules_optional.append(temp_mod) 55 | modules_optional = list(set(modules_optional)) 56 | 57 | all_modules = modules_toplevel + modules_conditional + modules_delayed + modules_optional 58 | all_modules = list(set(all_modules)) 59 | 60 | print(all_modules) 61 | print('Number of found modules:', len(all_modules)) 62 | 63 | ### Optional: remove any of the modules 64 | remove_list = [ 65 | '/usr/local/lib/python3', 66 | '/Users/sotomi/envs/pyinstaller-env/lib/python3', 67 | 'zipimport', 68 | 'test', 69 | ] 70 | add_list = [ 71 | 'sacremoses', 72 | 'resemblyzer', 73 | 'librosa', 74 | ] 75 | for pkg in remove_list: 76 | if pkg in all_modules: 77 | all_modules.remove(pkg) 78 | for pkg in add_list: 79 | if pkg not in all_modules: 80 | all_modules.append(pkg) 81 | 82 | print('Total number of requested modules:', len(all_modules)) 83 | 84 | ### Optional: Change all_modules by any of the other lists, e.g. 
modules_toplevel 85 | for module in all_modules: 86 | output_content = 'from PyInstaller.utils.hooks import collect_all\n\ndatas, binaries, hiddenimports = collect_all(\''+module+'\')' 87 | with open(output_hooks_dir+'/hook-'+str(module)+'.py', 'w') as f: 88 | f.write(output_content) 89 | -------------------------------------------------------------------------------- /engine/general.py: -------------------------------------------------------------------------------- 1 | from .loading import get_vocoder, get_vc_model 2 | from .feature_extraction import load_pretrained_spk_emb, load_pretrained_feature_extractor, Wav2Mel 3 | from utils.hparams import cfg 4 | 5 | from resemblyzer import preprocess_wav 6 | import torch 7 | import numpy as np 8 | import librosa 9 | from pathlib import Path 10 | from dataclasses import dataclass, field 11 | from typing import List 12 | 13 | 14 | @dataclass 15 | class Utterance: 16 | """Audio with its cached features""" 17 | wav: np.array = field(repr=False, default=None) 18 | sr: int = None 19 | path: str = None 20 | spk_name: str = None 21 | mel: np.ndarray = field(repr=False, default=None) 22 | spk_emb: np.array = field(repr=False, default=None) 23 | features: np.ndarray = field(repr=False, default=None) 24 | 25 | 26 | def clear(self): 27 | self.wav = None 28 | self.mel = None 29 | self.features = None 30 | 31 | def get_id(self): 32 | if self.path is None or self.spk_name is None: 33 | return 34 | return (self.spk_name, Path(self.path).stem) 35 | 36 | def __eq__(self, other): 37 | return self.get_id() == other.get_id() 38 | 39 | def __hash__(self): 40 | return hash(self.get_id()) 41 | 42 | 43 | class VC: 44 | def __init__(self): 45 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 46 | self.feature_extractor = None 47 | self.mel_extractor = None 48 | self.spk_emb_extractor = None 49 | self.model = None 50 | self.vocoder = None 51 | 52 | @staticmethod 53 | def yield_init_stage(stage_num): 54 | return stage_num / 5 55 | 56 | def logged_init(self): 57 | yield self.yield_init_stage(0) 58 | self.feature_extractor = load_pretrained_feature_extractor(device=self.device) 59 | 60 | yield self.yield_init_stage(1) 61 | self.mel_extractor = Wav2Mel(**cfg.data) 62 | 63 | yield self.yield_init_stage(2) 64 | self.spk_emb_extractor = load_pretrained_spk_emb(device=self.device) 65 | 66 | yield self.yield_init_stage(3) 67 | self.model = get_vc_model().to(self.device) 68 | 69 | yield self.yield_init_stage(4) 70 | self.vocoder = get_vocoder().to(self.device) 71 | 72 | yield self.yield_init_stage(5) 73 | 74 | def __call__(self, src: Utterance, tgts: List[Utterance], input_sr: int = cfg.data.sample_rate): 75 | """Convert source utterance from source speaker to target speaker""" 76 | 77 | # preparation 78 | src_features, tgt_features = self.prepare(src.wav, [tgt.wav for tgt in tgts], input_sr=input_sr) 79 | 80 | # conversion 81 | out_mel = self.convert(src_features, tgt_features) 82 | 83 | # vocoding 84 | out_wav = self.vocode(out_mel) 85 | 86 | return out_wav.cpu().numpy() 87 | 88 | def prepare(self, src_wav, tgt_wavs, input_sr=cfg.data.sample_rate): 89 | src_wav = torch.from_numpy(src_wav).to(self.device) 90 | if len(src_wav.shape) == 1: 91 | src_wav = src_wav.unsqueeze(0) 92 | tgt_wav = torch.from_numpy( 93 | np.concatenate(tgt_wavs) 94 | ).to(self.device).unsqueeze(0) 95 | 96 | src_features = self._get_features(src_wav) 97 | tgt_spk_emb = self._get_spk_emb(tgt_wavs, input_sr) 98 | tgt_mel = self.mel_extractor(tgt_wav) 99 | 100 | return src_features, 
(tgt_mel, tgt_spk_emb) 101 | 102 | def convert(self, src_features, tgt_features): 103 | tgt_mel, tgt_spk_emb = tgt_features 104 | with torch.no_grad(): 105 | out_mel, _, _, _ = self.model(src_features, tgt_mel, ref_embs=tgt_spk_emb) 106 | return out_mel 107 | 108 | def vocode(self, mel): 109 | with torch.no_grad(): 110 | wav = self.vocoder(mel).squeeze(1) 111 | return wav 112 | 113 | def _get_mel(self, wav): 114 | return self.mel_extractor(wav) 115 | 116 | def _get_features(self, wav): 117 | with torch.no_grad(): 118 | return self.feature_extractor.extract_features(wav, None)[0] 119 | 120 | def _get_spk_emb(self, wavs, sr=None): 121 | wavs = [preprocess_wav(wav, sr) for wav in wavs] 122 | cat_wav = np.concatenate(wavs, 0) 123 | spk_emb = self.spk_emb_extractor.embed_utterance(cat_wav) 124 | 125 | return torch.from_numpy(spk_emb).to(self.device).unsqueeze(0) 126 | 127 | # @staticmethod 128 | # def preprocess_single_wav(fpath_or_wav: Union[str, Path, np.ndarray], src_sr=None, tgt_sr=None): 129 | # # TODO 130 | # if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 131 | # wav, src_sr = librosa.load(str(fpath_or_wav), sr=None) 132 | # else: 133 | # wav = fpath_or_wav 134 | 135 | # # Resample the wav 136 | # if src_sr is not None and tgt_sr is not None: 137 | # wav = librosa.resample(wav, src_sr, tgt_sr) 138 | 139 | # return wav -------------------------------------------------------------------------------- /engine/vocoder/hifi_gan.py: -------------------------------------------------------------------------------- 1 | # Hifi-GAN Reference: 2 | # https://github.com/jik876/hifi-gan/blob/master/models.py 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 8 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 9 | from .utils import init_weights, get_padding 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class ResBlock1(torch.nn.Module): 15 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 16 | super(ResBlock1, self).__init__() 17 | self.h = h 18 | self.convs1 = nn.ModuleList([ 19 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 20 | padding=get_padding(kernel_size, dilation[0]))), 21 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 22 | padding=get_padding(kernel_size, dilation[1]))), 23 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 24 | padding=get_padding(kernel_size, dilation[2]))) 25 | ]) 26 | self.convs1.apply(init_weights) 27 | 28 | self.convs2 = nn.ModuleList([ 29 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 30 | padding=get_padding(kernel_size, 1))), 31 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 32 | padding=get_padding(kernel_size, 1))), 33 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 34 | padding=get_padding(kernel_size, 1))) 35 | ]) 36 | self.convs2.apply(init_weights) 37 | 38 | def forward(self, x): 39 | for c1, c2 in zip(self.convs1, self.convs2): 40 | xt = F.leaky_relu(x, LRELU_SLOPE) 41 | xt = c1(xt) 42 | xt = F.leaky_relu(xt, LRELU_SLOPE) 43 | xt = c2(xt) 44 | x = xt + x 45 | return x 46 | 47 | def remove_weight_norm(self): 48 | for l in self.convs1: 49 | remove_weight_norm(l) 50 | for l in self.convs2: 51 | remove_weight_norm(l) 52 | 53 | 54 | class ResBlock2(torch.nn.Module): 55 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 56 | 
super(ResBlock2, self).__init__() 57 | self.h = h 58 | self.convs = nn.ModuleList([ 59 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 60 | padding=get_padding(kernel_size, dilation[0]))), 61 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 62 | padding=get_padding(kernel_size, dilation[1]))) 63 | ]) 64 | self.convs.apply(init_weights) 65 | 66 | def forward(self, x): 67 | for c in self.convs: 68 | xt = F.leaky_relu(x, LRELU_SLOPE) 69 | xt = c(xt) 70 | x = xt + x 71 | return x 72 | 73 | def remove_weight_norm(self): 74 | for l in self.convs: 75 | remove_weight_norm(l) 76 | 77 | 78 | class Generator(torch.nn.Module): 79 | def __init__(self, h): 80 | super(Generator, self).__init__() 81 | self.h = h 82 | self.num_kernels = len(h.resblock_kernel_sizes) 83 | self.num_upsamples = len(h.upsample_rates) 84 | self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) 85 | resblock = ResBlock1 if h.resblock == '1' else ResBlock2 86 | 87 | self.ups = nn.ModuleList() 88 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 89 | self.ups.append(weight_norm( 90 | ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), 91 | k, u, padding=(u//2 + u%2), output_padding=u%2))) 92 | 93 | self.resblocks = nn.ModuleList() 94 | for i in range(len(self.ups)): 95 | ch = h.upsample_initial_channel//(2**(i+1)) 96 | for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): 97 | self.resblocks.append(resblock(h, ch, k, d)) 98 | 99 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 100 | self.ups.apply(init_weights) 101 | self.conv_post.apply(init_weights) 102 | 103 | def forward(self, x): 104 | x = self.conv_pre(x) 105 | for i in range(self.num_upsamples): 106 | x = F.leaky_relu(x, LRELU_SLOPE) 107 | x = self.ups[i](x) 108 | xs = None 109 | for j in range(self.num_kernels): 110 | if xs is None: 111 | xs = self.resblocks[i*self.num_kernels+j](x) 112 | else: 113 | xs += self.resblocks[i*self.num_kernels+j](x) 114 | x = xs / self.num_kernels 115 | x = F.leaky_relu(x) 116 | x = self.conv_post(x) 117 | x = torch.tanh(x) 118 | 119 | return x 120 | 121 | def remove_weight_norm(self): 122 | print('Removing weight norm...') 123 | for l in self.ups: 124 | remove_weight_norm(l) 125 | for l in self.resblocks: 126 | l.remove_weight_norm() 127 | remove_weight_norm(self.conv_pre) 128 | remove_weight_norm(self.conv_post) 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 
13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /toolbox.py: -------------------------------------------------------------------------------- 1 | # from audioread.exceptions import NoBackendError 2 | from engine import VC, Utterance 3 | from gui import GUI 4 | from pathlib import Path 5 | import utils 6 | from utils.hparams import cfg 7 | 8 | from time import perf_counter as timer 9 | import traceback 10 | import numpy as np 11 | import torch 12 | import os 13 | import sys 14 | from pathlib import Path 15 | from collections import defaultdict 16 | 17 | 18 | # Maximum of generated wavs to keep on memory 19 | MAX_WAVS = 15 20 | MAX_TARGET_SAMPLES = 10 21 | MAX_LOADED_SAMPLES = 100 22 | 23 | 24 | class Toolbox: 25 | def __init__(self, datasets_root, seed): 26 | sys.excepthook = self.excepthook 27 | self.seed = seed 28 | self.datasets_root = datasets_root 29 | self.recognized_datasets = [] 30 | self.utterances = set() 31 | self.current_generated = (None, None, None, None) # speaker_name, mel, breaks, wav 32 | self.speaker_filepathes = defaultdict(set) 33 | self.audio_ext = {'.wav', '.flac', '.mp3'} 34 | for datafolder in utils.data.get_subdirs(datasets_root): 35 | self.load_dataset_info(os.path.join(self.datasets_root, datafolder)) 36 | 37 | self.engine = None # type: VC 38 | self.current_src_utt = None 39 | self.current_tgt_utts = None 40 | self.current_tgt_spk = None 41 | self.loaded_utts = [] 42 | self.conv_utts_list = [] 43 | self.conv_utts_idlist = [] 44 | self.self_record_count = 0 45 | 46 | self.trim_silences = True 47 | 48 | # Initialize the events and the interface 49 | self.ui = GUI() 50 | self.reset_ui(seed) 51 | self.setup_events() 52 | self.ui.start() 53 | 54 | def excepthook(self, exc_type, exc_value, exc_tb): 55 | traceback.print_exception(exc_type, exc_value, exc_tb) 56 | self.ui.log("Exception: %s" % exc_value) 57 | 58 | def setup_events(self): 59 | # Dataset, speaker and utterance selection 60 | self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser()) 61 | random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root, self.recognized_datasets, level) 62 | self.ui.random_dataset_button.clicked.connect(random_func(0)) 63 | self.ui.random_speaker_button.clicked.connect(random_func(1)) 64 | self.ui.random_utterance_button.clicked.connect(random_func(2)) 65 | self.ui.dataset_box.currentIndexChanged.connect(random_func(1)) 66 | self.ui.src_spk_box.currentIndexChanged.connect(random_func(2)) 67 | self.ui.tgt_spk_box.currentIndexChanged.connect(random_func(2)) 68 | 69 | # Utterance selection 70 | func = lambda: self.load_from_browser(self.ui.browse_file()) 71 | self.ui.browser_browse_button.clicked.connect(func) 72 | func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current") 73 | self.ui.utterance_history.currentIndexChanged.connect(func) 74 | func = lambda: self.ui.play(self.ui.selected_utterance.wav, cfg.data.sample_rate) 75 | 
self.ui.play_button.clicked.connect(func) 76 | self.ui.stop_button.clicked.connect(self.ui.stop) 77 | self.ui.record_button.clicked.connect(self.record) 78 | 79 | # Audio 80 | self.ui.setup_audio_devices(cfg.data.sample_rate) 81 | 82 | # Wav playback & save 83 | func = lambda: self.replay_last_wav() 84 | self.ui.replay_wav_button.clicked.connect(func) 85 | func = lambda: self.export_current_wave() 86 | self.ui.export_wav_button.clicked.connect(func) 87 | self.ui.wavs_cb.currentIndexChanged.connect(self.set_current_utt) 88 | 89 | # Generation 90 | func = lambda: self.convert() or self.vocode() 91 | self.ui.generate_button.clicked.connect(func) 92 | self.ui.synthesize_button.clicked.connect(self.convert) 93 | self.ui.vocode_button.clicked.connect(self.vocode) 94 | self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox) 95 | 96 | # UMAP legend 97 | self.ui.clear_button.clicked.connect(self.clear_utterances) 98 | 99 | def set_current_utt(self, index): 100 | self.current_src_utt = self.conv_utts_list[index] 101 | 102 | def export_current_wave(self): 103 | self.ui.save_audio_file(self.current_src_utt, cfg.data.sample_rate) 104 | 105 | def replay_last_wav(self): 106 | self.ui.play(self.current_src_utt, cfg.data.sample_rate) 107 | 108 | def reset_ui(self, seed): 109 | self.recognized_datasets = [p for p in self.datasets_root.iterdir() if p.is_dir()] 110 | self.ui.populate_browser(self.datasets_root, self.recognized_datasets, 0, True) 111 | self.ui.populate_gen_options(seed, self.trim_silences) 112 | 113 | def load_from_browser(self, fpath=None): 114 | if fpath is None: 115 | fpath = Path(self.datasets_root, self.ui.current_dataset_name, self.ui.current_src_spk, self.ui.current_utterance_name) 116 | name = str(fpath.relative_to(self.datasets_root)) 117 | speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_src_spk 118 | 119 | # Select the next utterance 120 | if self.ui.auto_next_checkbox.isChecked(): 121 | self.ui.browser_select_next() 122 | elif fpath == "": 123 | return 124 | else: 125 | name = fpath.name 126 | speaker_name = fpath.parent.name 127 | 128 | 129 | # Get the wav from the disk. 
We take the wav with the vocoder/synthesizer format for 130 | # playback, so as to have a fair comparison with the generated audio 131 | wav = utils.load_wav(str(fpath)) 132 | self.ui.log("Loaded %s" % name) 133 | 134 | self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name) 135 | 136 | def record(self): 137 | wav = self.ui.record_one(cfg.data.sample_rate, 5) 138 | if wav is None: 139 | return 140 | self.ui.play(wav, cfg.data.sample_rate) 141 | self.self_record_count += 1 142 | 143 | speaker_name = "user_recorder" 144 | name = f"{speaker_name}_{self.self_record_count}" 145 | self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name) 146 | 147 | def add_real_utterance(self, wav, sr, path, spk_name): 148 | if self.engine is None: 149 | self.init_engine() 150 | 151 | # Compute the mel spectrogram 152 | mel = self.engine._get_mel(torch.from_numpy(wav)) 153 | self.ui.draw_mel(mel.squeeze(0), "current") 154 | 155 | # Compute the embedding 156 | embed = self.engine._get_spk_emb([wav], sr=sr) 157 | 158 | # Add the utterance 159 | utterance = Utterance( 160 | wav=wav, sr=sr, 161 | path=path, spk_name=spk_name, 162 | mel=mel.cpu().numpy().squeeze(0), spk_emb=embed.squeeze(0) 163 | ) 164 | if utterance not in self.utterances: 165 | self.utterances.add(utterance) 166 | self.ui.register_utterance(utterance) 167 | 168 | # Plot it 169 | # self.ui.draw_embed(embed, Path(path).stem, "current") 170 | self.ui.draw_umap_projections(self.utterances) 171 | 172 | def clear_utterances(self): 173 | self.reset_ui(self.seed) 174 | self.utterances.clear() 175 | self.ui.draw_umap_projections(self.utterances) 176 | 177 | def convert(self): 178 | self.ui.log("Converting from source to target...") 179 | self.ui.set_loading(1) 180 | 181 | # Update the synthesizer random seed 182 | if self.ui.random_seed_checkbox.isChecked(): 183 | seed = int(self.ui.seed_textbox.text()) 184 | self.ui.populate_gen_options(seed, self.trim_silences) 185 | else: 186 | seed = None 187 | 188 | tgt_spk = self.ui.current_tgt_spk 189 | 190 | # Synthesize the spectrogram 191 | if self.engine is None: 192 | self.init_engine() 193 | 194 | src_wav = self.ui.selected_utterance.wav 195 | if self.current_tgt_spk is None or self.current_tgt_spk != tgt_spk: 196 | self.current_tgt_utts = self.get_spk_utterances(tgt_spk) 197 | 198 | tgt_wavs = [tgt.wav for tgt in self.current_tgt_utts] 199 | prep_data = self.engine.prepare(src_wav, tgt_wavs) 200 | mel = self.engine.convert(*prep_data) 201 | 202 | self.ui.draw_mel(mel.cpu().numpy().squeeze(0), "converted mel") 203 | self.current_generated = (self.ui.selected_utterance.spk_name, Path(self.ui.selected_utterance.path).stem, self.ui.current_tgt_spk, mel) 204 | self.ui.set_loading(0) 205 | 206 | def vocode(self): 207 | src_spk, basename, tgt_spk, mel = self.current_generated 208 | assert mel is not None 209 | 210 | # Synthesize the waveform 211 | if not self.engine: 212 | self.init_engine() 213 | 214 | # def vocoder_progress(i, seq_len, b_size, gen_rate): 215 | # real_time_factor = (gen_rate / cfg.data.sample_rate) * 1000 216 | # line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \ 217 | # % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor) 218 | # self.ui.log(line, "overwrite") 219 | # self.ui.set_loading(i, seq_len) 220 | 221 | # wav = vocoder.infer_waveform(mel, progress_callback=vocoder_progress) 222 | wav = self.engine.vocode(mel).squeeze(0).cpu().numpy() 223 | self.ui.set_loading(0) 224 | self.ui.log("Done!", "append") 225 | 226 | 
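        # added note: the vocoded output below is peak-normalized to 0.95 before playback,
        # matching the 0.95 scaling that utils.load_wav applies to loaded audio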
227 | # Play it 228 | wav = (wav / np.abs(wav).max()) * 0.95 229 | self.ui.play(wav, cfg.data.sample_rate) 230 | 231 | # Name it (history displayed in combobox) 232 | name = f"{src_spk}_to_{tgt_spk}_{basename}" 233 | spk_name = f"{src_spk}_to_{tgt_spk}" 234 | 235 | # Update wavs combobox 236 | if len(self.conv_utts_list) > MAX_WAVS: 237 | self.conv_utts_list.pop() 238 | self.conv_utts_idlist.pop() 239 | self.conv_utts_list.insert(0, wav) 240 | self.conv_utts_idlist.insert(0, name) 241 | 242 | # self.ui.wavs_cb.disconnect() 243 | self.ui.wavs_cb_model.setStringList(self.conv_utts_idlist) 244 | self.ui.wavs_cb.setCurrentIndex(0) 245 | self.ui.wavs_cb.currentIndexChanged.connect(self.set_current_utt) 246 | 247 | # Update current wav 248 | self.set_current_utt(0) 249 | 250 | # Enable replay and save buttons: 251 | self.ui.replay_wav_button.setDisabled(False) 252 | self.ui.export_wav_button.setDisabled(False) 253 | 254 | # Compute speaker embedding 255 | embed = self.engine._get_spk_emb([wav], sr=cfg.data.sample_rate) 256 | 257 | # Add the utterance 258 | utterance = Utterance( 259 | wav=wav, sr=cfg.data.sample_rate, 260 | path=name, spk_name=spk_name, 261 | mel=mel.cpu().numpy().squeeze(0), spk_emb=embed.squeeze(0) 262 | ) 263 | self.utterances.add(utterance) 264 | 265 | # Plot it 266 | # self.ui.draw_embed(embed, name, "generated") 267 | self.ui.draw_umap_projections(self.utterances) 268 | 269 | 270 | def get_spk_utterances(self, spk_name): 271 | utts = list(filter(lambda u: u.spk_name == spk_name, self.loaded_utts)) 272 | if len(utts) >= MAX_TARGET_SAMPLES: 273 | return utts 274 | 275 | utts_pathes = set(map(lambda u: u.path, utts)) 276 | available_utts_pathes = list(filter(lambda p: p not in utts_pathes, self.speaker_filepathes[spk_name])) 277 | available_utts_pathes = available_utts_pathes[:MAX_TARGET_SAMPLES - len(utts_pathes)] 278 | 279 | new_utts = list(map(lambda p: self.load_utterance(spk_name, p), available_utts_pathes)) 280 | self.loaded_utts.extend(new_utts) 281 | self.loaded_utts = self.loaded_utts[-MAX_LOADED_SAMPLES:] 282 | 283 | utts.extend(new_utts) 284 | return utts 285 | 286 | def load_utterance(self, spk_name, path): 287 | wav = utils.load_wav(path) 288 | return Utterance(wav, cfg.data.sample_rate, path=path, spk_name=spk_name) 289 | 290 | def load_dataset_info(self, dataset_path): 291 | speakers = utils.data.get_subdirs(dataset_path) 292 | 293 | for spk in speakers: 294 | self.speaker_filepathes[spk] = { 295 | *self.speaker_filepathes[spk], 296 | *utils.data.get_filepathes(os.path.join(dataset_path, spk), self.audio_ext) 297 | } 298 | 299 | def init_engine(self): 300 | self.ui.log("Creating voice conversion model...") 301 | self.ui.set_loading(1) 302 | start = timer() 303 | self.engine = VC() 304 | for stage in self.engine.logged_init(): 305 | self.ui.set_loading(stage) 306 | self.ui.log("Done (%dms)." 
% int(1000 * (timer() - start)), "append") 307 | self.ui.set_loading(0) 308 | 309 | def update_seed_textbox(self): 310 | self.ui.update_seed_textbox() -------------------------------------------------------------------------------- /gui/gui.py: -------------------------------------------------------------------------------- 1 | from engine import Utterance 2 | from .widgets import get_figure_widget, FigureWidget 3 | 4 | import matplotlib.pyplot as plt 5 | from PySide6.QtCore import Qt, QStringListModel 6 | from PySide6.QtGui import QImage, QPixmap 7 | from PySide6.QtWidgets import * 8 | 9 | from pathlib import Path 10 | from typing import List, Set 11 | import sounddevice as sd 12 | import soundfile as sf 13 | import numpy as np 14 | from time import sleep 15 | import umap 16 | import sys 17 | from warnings import filterwarnings, warn 18 | filterwarnings("ignore") 19 | 20 | 21 | colormap = np.array([ 22 | [0, 127, 70], 23 | [255, 0, 0], 24 | [255, 217, 38], 25 | [0, 135, 255], 26 | [165, 0, 165], 27 | [255, 167, 255], 28 | [97, 142, 151], 29 | [0, 255, 255], 30 | [255, 96, 38], 31 | [142, 76, 0], 32 | [33, 0, 127], 33 | [0, 0, 0], 34 | [183, 183, 183], 35 | [76, 255, 0], 36 | ], dtype=np.float) / 255 37 | 38 | 39 | class GUI(QDialog): 40 | min_umap_points = 4 41 | max_log_lines = 5 42 | max_saved_utterances = 20 43 | 44 | def draw_utterance(self, utterance: Utterance, which): 45 | self.draw_mel(utterance.mel, which) 46 | # self.draw_embed(utterance.spk_emb, Path(utterance.path).stem, which) 47 | 48 | def draw_embed(self, spk_emb, name, which): 49 | widget = self.cur_ax_widget if which == "current" else self.gen_ax_widget 50 | embed_ax, _ = widget.axis 51 | embed_ax.figure.suptitle("" if spk_emb is None else name) 52 | 53 | ## Embedding 54 | # Clear the plot 55 | if len(embed_ax.images) > 0: 56 | embed_ax.images[0].colorbar.remove() 57 | embed_ax.clear() 58 | 59 | # Draw speaker embedding 60 | if spk_emb is not None: 61 | embed_ax.set_title("embedding") 62 | embed_ax.set_aspect("equal", "datalim") 63 | embed_ax.set_xticks([]) 64 | embed_ax.set_yticks([]) 65 | embed_ax.figure.canvas.draw() 66 | widget.update() 67 | 68 | def draw_mel(self, mel, which): 69 | widget = self.cur_ax_widget if which == "current" else self.gen_ax_widget 70 | # _, mel_ax = widget.axis 71 | mel_ax = widget.axis 72 | 73 | ## Spectrogram 74 | # Draw the spectrogram 75 | mel_ax.clear() 76 | if mel is not None: 77 | im = mel_ax.imshow(mel, aspect="auto", origin="lower", interpolation='none') 78 | mel_ax.set_title("mel spectrogram") 79 | 80 | mel_ax.set_xticks([]) 81 | mel_ax.set_yticks([]) 82 | mel_ax.figure.canvas.draw() 83 | widget.update() 84 | if which != "current": 85 | self.vocode_button.setDisabled(mel is None) 86 | 87 | def draw_umap_projections(self, utterances: Set[Utterance]): 88 | self.umap_ax.clear() 89 | 90 | speakers = np.unique([u.spk_name for u in utterances]) 91 | colors = {spk_name: colormap[i] for i, spk_name in enumerate(speakers)} 92 | embeds = [u.spk_emb for u in utterances] 93 | 94 | # Display a message if there aren't enough points 95 | if len(utterances) < self.min_umap_points: 96 | self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" % 97 | (self.min_umap_points - len(utterances)), 98 | horizontalalignment='center', fontsize=15) 99 | self.umap_ax.set_title("") 100 | 101 | # Compute the projections 102 | else: 103 | if not self.umap_hot: 104 | self.log( 105 | "Drawing UMAP projections for the first time, this will take a few seconds.") 106 | self.umap_hot = True 107 | 108 | 
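            # added note: the first positional argument of umap.UMAP is n_neighbors,
            # scaled here with the square root of the number of embeddings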
reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine") 109 | projections = reducer.fit_transform(embeds) 110 | 111 | speakers_done = set() 112 | for projection, utterance in zip(projections, utterances): 113 | color = colors[utterance.spk_name] 114 | mark = "x" if "_gen_" in Path(utterance.path).stem else "o" 115 | label = None if utterance.spk_name in speakers_done else utterance.spk_name 116 | speakers_done.add(utterance.spk_name) 117 | self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark, label=label) 118 | # self.umap_ax.set_title("UMAP projections") 119 | self.umap_ax.legend(prop={'size': 10}) 120 | 121 | # Draw the plot 122 | self.umap_ax.set_aspect("equal", "datalim") 123 | self.umap_ax.set_xticks([]) 124 | self.umap_ax.set_yticks([]) 125 | self.umap_ax.figure.canvas.draw() 126 | 127 | def save_audio_file(self, wav, sample_rate): 128 | dialog = QFileDialog() 129 | dialog.setDefaultSuffix(".wav") 130 | fpath, _ = dialog.getSaveFileName( 131 | parent=self, 132 | caption="Select a path to save the audio file", 133 | filter="Audio Files (*.flac *.wav)" 134 | ) 135 | if fpath: 136 | #Default format is wav 137 | if Path(fpath).suffix == "": 138 | fpath += ".wav" 139 | sf.write(fpath, wav, sample_rate) 140 | 141 | def setup_audio_devices(self, sample_rate): 142 | input_devices = [] 143 | output_devices = [] 144 | for device in sd.query_devices(): 145 | # Check if valid input 146 | try: 147 | sd.check_input_settings(device=device["name"], samplerate=sample_rate) 148 | input_devices.append(device["name"]) 149 | except: 150 | pass 151 | 152 | # Check if valid output 153 | try: 154 | sd.check_output_settings(device=device["name"], samplerate=sample_rate) 155 | output_devices.append(device["name"]) 156 | except Exception as e: 157 | # Log a warning only if the device is not an input 158 | if not device["name"] in input_devices: 159 | warn("Unsupported output device %s for the sample rate: %d \nError: %s" % (device["name"], sample_rate, str(e))) 160 | 161 | if len(input_devices) == 0: 162 | self.log("No audio input device detected. Recording may not work.") 163 | self.audio_in_device = None 164 | else: 165 | self.audio_in_device = input_devices[0] 166 | 167 | if len(output_devices) == 0: 168 | self.log("No supported output audio devices were found! Audio output may not work.") 169 | self.audio_out_devices_cb.addItems(["None"]) 170 | self.audio_out_devices_cb.setDisabled(True) 171 | else: 172 | self.audio_out_devices_cb.clear() 173 | self.audio_out_devices_cb.addItems(output_devices) 174 | self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device) 175 | 176 | self.set_audio_device() 177 | 178 | def set_audio_device(self): 179 | 180 | output_device = self.audio_out_devices_cb.currentText() 181 | if output_device == "None": 182 | output_device = None 183 | 184 | # If None, sounddevice queries portaudio 185 | sd.default.device = (self.audio_in_device, output_device) 186 | 187 | def play(self, wav, sample_rate): 188 | try: 189 | sd.stop() 190 | sd.play(wav, sample_rate) 191 | except Exception as e: 192 | print(e) 193 | self.log("Error in audio playback. 
Try selecting a different audio output device.") 194 | self.log("Your device must be connected before you start the toolbox.") 195 | 196 | def stop(self): 197 | sd.stop() 198 | 199 | def record_one(self, sample_rate, duration): 200 | self.record_button.setText("Recording...") 201 | self.record_button.setDisabled(True) 202 | 203 | self.log("Recording %d seconds of audio" % duration) 204 | sd.stop() 205 | try: 206 | wav = sd.rec(duration * sample_rate, sample_rate, 1) 207 | except Exception as e: 208 | print(e) 209 | self.log("Could not record anything. Is your recording device enabled?") 210 | self.log("Your device must be connected before you start the toolbox.") 211 | return None 212 | 213 | for i in np.arange(0, duration, 0.1): 214 | self.set_loading(i, duration) 215 | sleep(0.1) 216 | self.set_loading(duration, duration) 217 | sd.wait() 218 | 219 | self.log("Done recording.") 220 | self.record_button.setText("Record") 221 | self.record_button.setDisabled(False) 222 | 223 | return wav.squeeze() 224 | 225 | @property 226 | def current_dataset_name(self): 227 | return self.dataset_box.currentText() 228 | 229 | @property 230 | def current_src_spk(self): 231 | return self.src_spk_box.currentText() 232 | 233 | @property 234 | def current_tgt_spk(self): 235 | return self.tgt_spk_box.currentText() 236 | 237 | @property 238 | def current_utterance_name(self): 239 | return self.utterance_box.currentText() 240 | 241 | def browse_file(self): 242 | fpath = QFileDialog().getOpenFileName( 243 | parent=self, 244 | caption="Select an audio file", 245 | filter="Audio Files (*.mp3 *.flac *.wav *.m4a)" 246 | ) 247 | return Path(fpath[0]) if fpath[0] != "" else "" 248 | 249 | @staticmethod 250 | def repopulate_box(box, items, random=False): 251 | """ 252 | Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join 253 | data to the items 254 | """ 255 | box.blockSignals(True) 256 | box.clear() 257 | for item in items: 258 | item = list(item) if isinstance(item, tuple) else [item] 259 | box.addItem(str(item[0]), *item[1:]) 260 | if len(items) > 0: 261 | box.setCurrentIndex(np.random.randint(len(items)) if random else 0) 262 | box.setDisabled(len(items) == 0) 263 | box.blockSignals(False) 264 | 265 | def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int, random=True): 266 | # Select a random dataset 267 | if level <= 0: 268 | if datasets_root is not None: 269 | datasets = [datasets_root.joinpath(d) for d in recognized_datasets] 270 | datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()] 271 | self.browser_load_button.setDisabled(len(datasets) == 0) 272 | if datasets_root is None or len(datasets) == 0: 273 | msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \ 274 | if datasets_root is None else "o not have any of the recognized datasets" \ 275 | " in %s" % datasets_root) 276 | self.log(msg) 277 | msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \ 278 | "can still use the toolbox by recording samples yourself." 
% \ 279 | ("\n\t".join(map(str, recognized_datasets))) 280 | print(msg, file=sys.stderr) 281 | 282 | self.random_utterance_button.setDisabled(True) 283 | self.random_speaker_button.setDisabled(True) 284 | self.random_dataset_button.setDisabled(True) 285 | self.utterance_box.setDisabled(True) 286 | self.src_spk_box.setDisabled(True) 287 | self.tgt_spk_box.setDisabled(True) 288 | self.dataset_box.setDisabled(True) 289 | self.browser_load_button.setDisabled(True) 290 | self.auto_next_checkbox.setDisabled(True) 291 | return 292 | self.repopulate_box(self.dataset_box, datasets, random) 293 | 294 | # Select a random src and tgt speakers 295 | if level <= 1: 296 | speakers_root = datasets_root.joinpath(self.current_dataset_name) 297 | speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()] 298 | self.repopulate_box(self.src_spk_box, speaker_names, random) 299 | self.repopulate_box(self.tgt_spk_box, speaker_names, random) 300 | 301 | # Select a random utterance 302 | if level <= 2: 303 | utterances_root = datasets_root.joinpath( 304 | self.current_dataset_name, 305 | self.current_src_spk 306 | ) 307 | utterances = [] 308 | for extension in ['mp3', 'flac', 'wav']: 309 | utterances.extend(Path(utterances_root).glob("**/*.%s" % extension)) 310 | utterances = [fpath.relative_to(utterances_root) for fpath in utterances] 311 | self.repopulate_box(self.utterance_box, utterances, random) 312 | 313 | def browser_select_next(self): 314 | index = (self.utterance_box.currentIndex() + 1) % self.utterance_box.count() 315 | self.utterance_box.setCurrentIndex(index) 316 | 317 | @property 318 | def selected_utterance(self): 319 | return self.utterance_history.itemData(self.utterance_history.currentIndex()) 320 | 321 | def register_utterance(self, utterance: Utterance): 322 | self.utterance_history.blockSignals(True) 323 | self.utterance_history.insertItem(0, Path(utterance.path).stem, utterance) 324 | self.utterance_history.setCurrentIndex(0) 325 | self.utterance_history.blockSignals(False) 326 | 327 | if self.utterance_history.count() > self.max_saved_utterances: 328 | self.utterance_history.removeItem(self.max_saved_utterances) 329 | 330 | self.play_button.setDisabled(False) 331 | self.generate_button.setDisabled(False) 332 | self.synthesize_button.setDisabled(False) 333 | 334 | def log(self, line, mode="newline"): 335 | if mode == "newline": 336 | self.logs.append(line) 337 | if len(self.logs) > self.max_log_lines: 338 | del self.logs[0] 339 | elif mode == "append": 340 | self.logs[-1] += line 341 | elif mode == "overwrite": 342 | self.logs[-1] = line 343 | log_text = '\n'.join(self.logs) 344 | 345 | self.log_window.setText(log_text) 346 | self.app.processEvents() 347 | 348 | def set_loading(self, value, maximum=1): 349 | self.loading_bar.setValue(value * 100) 350 | self.loading_bar.setMaximum(maximum * 100) 351 | self.loading_bar.setTextVisible(value != 0) 352 | self.app.processEvents() 353 | 354 | def populate_gen_options(self, seed, trim_silences): 355 | if seed is not None: 356 | self.random_seed_checkbox.setChecked(True) 357 | self.seed_textbox.setText(str(seed)) 358 | self.seed_textbox.setEnabled(True) 359 | else: 360 | self.random_seed_checkbox.setChecked(False) 361 | self.seed_textbox.setText(str(0)) 362 | self.seed_textbox.setEnabled(False) 363 | 364 | def update_seed_textbox(self): 365 | if self.random_seed_checkbox.isChecked(): 366 | self.seed_textbox.setEnabled(True) 367 | else: 368 | self.seed_textbox.setEnabled(False) 369 | 370 | def reset_interface(self): 371 | # 
self.draw_embed(None, None, "current") 372 | # self.draw_embed(None, None, "generated") 373 | self.draw_mel(None, "current") 374 | self.draw_mel(None, "generated") 375 | # self.draw_umap_projections(set()) 376 | self.set_loading(0) 377 | self.play_button.setDisabled(True) 378 | self.generate_button.setDisabled(True) 379 | self.synthesize_button.setDisabled(True) 380 | self.vocode_button.setDisabled(True) 381 | self.replay_wav_button.setDisabled(True) 382 | self.export_wav_button.setDisabled(True) 383 | [self.log("") for _ in range(self.max_log_lines)] 384 | 385 | def __init__(self): 386 | ## Initialize the application 387 | self.app = QApplication(sys.argv) 388 | super().__init__(None) 389 | self.setWindowTitle("Voice Conversion app") 390 | 391 | 392 | ## Main layouts 393 | # Root 394 | root_layout = QGridLayout() 395 | self.setLayout(root_layout) 396 | 397 | # Browser 398 | browser_layout = QGridLayout() 399 | root_layout.addLayout(browser_layout, 0, 0, 1, 2) 400 | 401 | # Generation 402 | gen_layout = QVBoxLayout() 403 | root_layout.addLayout(gen_layout, 1, 0, 1, 2) 404 | 405 | # Projections 406 | self.projections_layout = QVBoxLayout() 407 | root_layout.addLayout(self.projections_layout, 1, 2, 1, 1) 408 | 409 | # Visualizations 410 | vis_layout = QVBoxLayout() 411 | root_layout.addLayout(vis_layout, 0, 2, 1, 1) 412 | 413 | 414 | ## Projections 415 | # UMap 416 | fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0") 417 | fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98) 418 | self.projections_layout.addWidget(get_figure_widget(fig)) 419 | self.umap_hot = False 420 | self.clear_button = QPushButton("Clear") 421 | self.projections_layout.addWidget(self.clear_button) 422 | 423 | 424 | ## Browser 425 | # Dataset, speaker and utterance selection 426 | i = 0 427 | self.dataset_box = QComboBox() 428 | browser_layout.addWidget(QLabel("Dataset"), i, 0) 429 | browser_layout.addWidget(self.dataset_box, i + 1, 0) 430 | self.src_spk_box = QComboBox() 431 | browser_layout.addWidget(QLabel("Source speaker"), i, 1) 432 | browser_layout.addWidget(self.src_spk_box, i + 1, 1) 433 | self.utterance_box = QComboBox() 434 | browser_layout.addWidget(QLabel("Utterance"), i, 2) 435 | browser_layout.addWidget(self.utterance_box, i + 1, 2) 436 | self.browser_load_button = QPushButton("Load") 437 | browser_layout.addWidget(self.browser_load_button, i + 1, 3) 438 | i += 2 439 | 440 | # Random buttons 441 | self.random_dataset_button = QPushButton("Random") 442 | browser_layout.addWidget(self.random_dataset_button, i, 0) 443 | self.random_speaker_button = QPushButton("Random") 444 | browser_layout.addWidget(self.random_speaker_button, i, 1) 445 | self.random_utterance_button = QPushButton("Random") 446 | browser_layout.addWidget(self.random_utterance_button, i, 2) 447 | self.auto_next_checkbox = QCheckBox("Auto select next") 448 | self.auto_next_checkbox.setChecked(True) 449 | browser_layout.addWidget(self.auto_next_checkbox, i, 3) 450 | i += 1 451 | 452 | # Utterance box 453 | browser_layout.addWidget(QLabel("Use source from:"), i, 0) 454 | self.utterance_history = QComboBox() 455 | browser_layout.addWidget(self.utterance_history, i, 1, 1, 3) 456 | i += 1 457 | 458 | # Random & next utterance buttons 459 | self.browser_browse_button = QPushButton("Browse") 460 | browser_layout.addWidget(self.browser_browse_button, i, 0) 461 | self.record_button = QPushButton("Record") 462 | browser_layout.addWidget(self.record_button, i, 1) 463 | self.play_button = QPushButton("Play") 464 | 
browser_layout.addWidget(self.play_button, i, 2) 465 | self.stop_button = QPushButton("Stop") 466 | browser_layout.addWidget(self.stop_button, i, 3) 467 | i += 1 468 | 469 | 470 | # Model and audio output selection 471 | self.tgt_spk_box = QComboBox() 472 | browser_layout.addWidget(QLabel("Target speaker"), i, 0) 473 | browser_layout.addWidget(self.tgt_spk_box, i + 1, 0) 474 | 475 | self.audio_out_devices_cb = QComboBox() 476 | browser_layout.addWidget(QLabel("Audio Output"), i, 1) 477 | browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 1) 478 | i += 2 479 | 480 | # Replay & Save Audio 481 | browser_layout.addWidget(QLabel("Toolbox Output:"), i, 0) 482 | self.wavs_cb = QComboBox() 483 | self.wavs_cb_model = QStringListModel() 484 | self.wavs_cb.setModel(self.wavs_cb_model) 485 | self.wavs_cb.setToolTip("Select one of the last generated wavs in this section for replaying or exporting") 486 | browser_layout.addWidget(self.wavs_cb, i, 1) 487 | self.replay_wav_button = QPushButton("Replay") 488 | self.replay_wav_button.setToolTip("Replay the last generated vocoder output") 489 | browser_layout.addWidget(self.replay_wav_button, i, 2) 490 | self.export_wav_button = QPushButton("Export") 491 | self.export_wav_button.setToolTip("Save the last generated vocoder audio to the filesystem as a wav file") 492 | browser_layout.addWidget(self.export_wav_button, i, 3) 493 | i += 1 494 | 495 | 496 | ## Embed & spectrograms 497 | vis_layout.addStretch() 498 | 499 | gridspec_kw = {"width_ratios": [1]} 500 | fig, cur_ax = plt.subplots( 501 | 1, 1, figsize=(5, 2), gridspec_kw=gridspec_kw 502 | ) 503 | fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) 504 | self.cur_ax_widget = FigureWidget(fig, cur_ax) 505 | vis_layout.addWidget(self.cur_ax_widget) 506 | 507 | fig, gen_ax = plt.subplots( 508 | 1, 1, figsize=(5, 2), gridspec_kw=gridspec_kw 509 | ) 510 | fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) 511 | self.gen_ax_widget = FigureWidget(fig, gen_ax) 512 | vis_layout.addWidget(self.gen_ax_widget) 513 | 514 | # for ax in self.cur_ax_widget.axis.tolist() + self.gen_ax_widget.axis.tolist(): 515 | for ax in [self.cur_ax_widget.axis, self.gen_ax_widget.axis]: 516 | ax.set_facecolor("#F0F0F0") 517 | for side in ["top", "right", "bottom", "left"]: 518 | ax.spines[side].set_visible(False) 519 | 520 | 521 | ## Generation 522 | layout = QHBoxLayout() 523 | self.generate_button = QPushButton("Synthesize and vocode") 524 | layout.addWidget(self.generate_button) 525 | self.synthesize_button = QPushButton("Synthesize only") 526 | layout.addWidget(self.synthesize_button) 527 | self.vocode_button = QPushButton("Vocode only") 528 | layout.addWidget(self.vocode_button) 529 | gen_layout.addLayout(layout) 530 | 531 | layout_seed = QGridLayout() 532 | self.random_seed_checkbox = QCheckBox("Random seed:") 533 | self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.") 534 | layout_seed.addWidget(self.random_seed_checkbox, 0, 0) 535 | self.seed_textbox = QLineEdit() 536 | self.seed_textbox.setMaximumWidth(80) 537 | layout_seed.addWidget(self.seed_textbox, 0, 1) 538 | gen_layout.addLayout(layout_seed) 539 | 540 | self.loading_bar = QProgressBar() 541 | gen_layout.addWidget(self.loading_bar) 542 | 543 | self.log_window = QLabel() 544 | self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft) 545 | gen_layout.addWidget(self.log_window) 546 | self.logs = [] 547 | gen_layout.addStretch() 548 | 549 | 550 | ## Set the size of the window and of the elements 551 | max_size = 
self.screen().availableGeometry().size() * 0.7 552 | self.resize(max_size) 553 | 554 | ## Finalize the display 555 | self.reset_interface() 556 | self.show() 557 | 558 | def start(self): 559 | self.app.exec_() --------------------------------------------------------------------------------
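A minimal standalone sketch of the projection step performed in GUI.draw_umap_projections (gui.py): once at least min_umap_points utterances are registered, their speaker embeddings are reduced to 2-D with UMAP and scatter-plotted per speaker. The random stand-in data and the embedding dimension below are illustrative assumptions, not part of the repository; in the toolbox the vectors come from Utterance.spk_emb.

# Illustrative sketch only; mirrors the n_neighbors heuristic used in gui.py.
import numpy as np
import umap

# Stand-in speaker embeddings (dimension assumed here, e.g. 256 for Resemblyzer-style embeddings);
# the toolbox gathers the real ones from Utterance.spk_emb.
embeds = np.random.rand(8, 256)

# n_neighbors grows with the square root of the number of registered utterances.
n_neighbors = int(np.ceil(np.sqrt(len(embeds))))
reducer = umap.UMAP(n_neighbors, metric="cosine")

# One 2-D point per utterance, ready to be drawn on the UMAP panel.
projections = reducer.fit_transform(embeds)
print(projections.shape)  # (8, 2)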