├── test
│   ├── __init__.py
│   ├── conftest.py
│   └── test_engine.py
├── engine
│   ├── vc
│   │   └── __init__.py
│   ├── __init__.py
│   ├── vocoder
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   └── hifi_gan.py
│   ├── loading.py
│   ├── feature_extraction.py
│   └── general.py
├── datasets
│   ├── remembered_dataset_pathes.txt
│   └── test
│       └── example
│           ├── example.mp3
│           └── example.wav
├── gui
│   ├── __init__.py
│   ├── widgets.py
│   └── gui.py
├── utils
│   ├── __init__.py
│   ├── hparams.py
│   └── data.py
├── img_dashboard.png
├── .gitignore
├── requirements.txt
├── app.py
├── config.yaml
├── README.md
├── dist
│   ├── README.md
│   └── warn_processing.py
├── LICENSE
└── toolbox.py
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/engine/vc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/remembered_dataset_pathes.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gui/__init__.py:
--------------------------------------------------------------------------------
1 | from .gui import GUI
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import load_wav
--------------------------------------------------------------------------------
/engine/__init__.py:
--------------------------------------------------------------------------------
1 | from .general import VC, Utterance
--------------------------------------------------------------------------------
/engine/vocoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .hifi_gan import Generator as HifiGenerator
--------------------------------------------------------------------------------
/img_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/img_dashboard.png
--------------------------------------------------------------------------------
/datasets/test/example/example.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/datasets/test/example/example.mp3
--------------------------------------------------------------------------------
/datasets/test/example/example.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/datasets/test/example/example.wav
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .cache/
2 | .pytest_cache/
3 | **/__pycache__/
4 |
5 | **/.DS_Store
6 |
7 | # deployment cache
8 | dist/build/
9 | dist/dist/
10 | dist/hooks/
11 | dist/VCToolbox.spec
--------------------------------------------------------------------------------
/utils/hparams.py:
--------------------------------------------------------------------------------
1 | from omegaconf import OmegaConf
2 | from pathlib import Path
3 | import __main__
4 |
5 |
6 | root_dir = Path(__file__).parent.parent.resolve()
7 | cfg = OmegaConf.load(root_dir / 'config.yaml')
8 | cfg.root_dir = str(root_dir.resolve())
--------------------------------------------------------------------------------
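A short usage sketch of the `cfg` object defined above (illustrative, not a file in this repo); the values come from `config.yaml`, shown further down in this dump:

```python
from utils.hparams import cfg

print(cfg.root_dir)            # absolute path of the repo root, filled in at import time
print(cfg.data.sample_rate)    # 16000, from config.yaml
print(cfg.ckpt_default_path)   # '.cache', where checkpoints are downloaded
```
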
/engine/vocoder/utils.py:
--------------------------------------------------------------------------------
1 | def init_weights(m, mean=0.0, std=0.01):
2 | classname = m.__class__.__name__
3 | if classname.find("Conv") != -1:
4 | m.weight.data.normal_(mean, std)
5 |
6 |
7 | def get_padding(kernel_size, dilation=1):
8 | return int((kernel_size*dilation - dilation)/2)
9 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.3.2
2 | torch==1.8.1
3 | umap_learn==0.5.1
4 | omegaconf==2.0.6
5 | Resemblyzer==0.1.1.dev0
6 | numpy==1.20.1
7 | matplotlib==3.4.1
8 | sounddevice==0.4.1
9 | pytest==6.2.2
10 | gdown==3.12.2
11 | librosa==0.8.0
12 | PySide6==6.0.3
13 | PyYAML==5.4.1
14 | soundfile==0.10.3.post1
15 |
--------------------------------------------------------------------------------
/test/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 | import numpy as np
4 | from utils import load_wav
5 |
6 |
7 | @pytest.fixture(scope="session")
8 | def example_wav():
9 | wav = load_wav(
10 | os.path.join(os.path.dirname(__file__), "../datasets/test/example/example.wav")
11 | )
12 | assert len(wav.shape) == 1
13 | return wav
14 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from toolbox import Toolbox
3 | import argparse
4 |
5 |
6 |
7 | if __name__ == '__main__':
8 | parser = argparse.ArgumentParser(
9 | description="Runs the toolbox",
10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
11 | )
12 | parser.add_argument("-d", "--datasets_root", type=Path, help= \
13 | "Path to the directory containing your datasets.", default=Path(__file__).parent / 'datasets')
14 | parser.add_argument("--seed", type=int, default=17, help=\
15 | "Optional random number seed value to make toolbox deterministic.")
16 | args = parser.parse_args()
17 |
18 | # Launch the toolbox
19 | Toolbox(**vars(args))
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | data: # shouldn't be changed
2 | sample_rate: 16000
3 | n_mels: 80
4 | n_fft: 1280
5 | win_length: 1280
6 | hop_length: 320
7 | f_min: 50.
8 | f_max: null
9 | preemph: 0.97
10 |
11 | root_dir: null # will be defined on start
12 | ckpt_default_path: '.cache'
13 | ckpt_dict:
14 | hifi_gan:
15 | 'generator': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v3.0/generator_pt330', 'wget']
16 | 'config.yaml': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v3.0/config_v1_4.json', 'wget']
17 | fragmentvc:
18 | 'model.pt': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v4.5/fragmentvc_v4_5_stage2.pt', 'wget']
--------------------------------------------------------------------------------
/utils/data.py:
--------------------------------------------------------------------------------
1 | from .hparams import cfg
2 |
3 | import librosa
4 | import numpy as np
5 | import soundfile as sf
6 | from pathlib import Path
7 | import os
8 |
9 |
10 | def load_wav(path):
11 | wav, _ = librosa.load(path, sr=cfg.data.sample_rate)
12 | wav = librosa.util.normalize(wav) * 0.95
13 |
14 | return wav
15 |
16 |
17 | def save_wav(path, wav, sr):
18 | """Save audio to path"""
19 | wav = np.clip(wav, -1.0, 1.0)
20 | sf.write(path, wav, sr)
21 |
22 |
23 | def get_subdirs(dir_path):
24 | dir_path = Path(dir_path)
25 | dirnames = [p.stem for p in dir_path.iterdir() if p.is_dir()]
26 | return dirnames
27 |
28 |
29 | def has_ext(filepath, ext):
30 | if isinstance(ext, str):
31 | return Path(filepath).match(f'*{ext}')
32 | else:
33 | return any(Path(filepath).match(f'*{e}') for e in ext)
34 |
35 |
36 | def get_filepathes(dir_path, ext='.wav'):
37 | for d_path, _, f_names in os.walk(dir_path):
38 | for fn in f_names:
39 | if not has_ext(fn, ext):
40 | continue
41 |
42 | yield os.path.join(d_path, fn)
43 |
--------------------------------------------------------------------------------
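A small sketch of the helpers above (illustrative only; it assumes the repo root as the working directory), mirroring how `toolbox.py` walks the `datasets/` folder:

```python
from utils.data import load_wav, get_subdirs, get_filepathes

# Enumerate speaker folders of a dataset and collect their audio files.
for speaker in get_subdirs('datasets/test'):                       # -> ['example']
    for path in get_filepathes(f'datasets/test/{speaker}', ext='.wav'):
        wav = load_wav(path)   # mono, peak-normalized, resampled to cfg.data.sample_rate
        print(speaker, path, wav.shape)
```
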
/gui/widgets.py:
--------------------------------------------------------------------------------
1 | from PySide6.QtGui import QPixmap
2 | from PySide6.QtWidgets import QLabel
3 | from PySide6.QtCore import Signal
4 |
5 | import io
6 |
7 | def get_figure_widget(fig):
8 | buf = io.BytesIO()
9 | fig.savefig(buf, format='png')
10 | buf.seek(0)
11 |
12 | pixmap = QPixmap()
13 | pixmap.loadFromData(buf.read())
14 | buf.close()
15 |
16 | fig_label = QLabel()
17 | fig_label.setPixmap(pixmap)
18 |
19 | return fig_label
20 |
21 |
22 | def get_figure_pixmap(fig):
23 | buf = io.BytesIO()
24 | fig.savefig(buf, format='png', transparent=True)
25 | buf.seek(0)
26 |
27 | pixmap = QPixmap()
28 | pixmap.loadFromData(buf.read())
29 | buf.close()
30 |
31 | return pixmap
32 |
33 |
34 |
35 | class FigureWidget(QLabel):
36 | draw_signal = Signal()
37 |
38 | def __init__(self, fig, axis):
39 | super().__init__()
40 | self.fig = fig
41 | self.axis = axis
42 | self.draw_signal.connect(self.draw)
43 |
44 | def update(self):
45 | self.draw_signal.emit()
46 |
47 | def draw(self):
48 | self.setPixmap(get_figure_pixmap(self.fig))
49 |
50 |
--------------------------------------------------------------------------------
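A minimal sketch (illustrative, not a repo file) of how `FigureWidget` can wrap a matplotlib figure; the toolbox GUI uses it in the same spirit:

```python
import matplotlib.pyplot as plt
from PySide6.QtWidgets import QApplication
from gui.widgets import FigureWidget

app = QApplication([])              # Qt widgets need an application instance
fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 0])

widget = FigureWidget(fig, ax)      # a QLabel that re-renders `fig` on update()
widget.update()                     # emits draw_signal -> draw() -> setPixmap(get_figure_pixmap(fig))
widget.show()
app.exec()
```
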
/test/test_engine.py:
--------------------------------------------------------------------------------
1 | from utils.hparams import cfg
2 | from engine import VC, Utterance
3 |
4 | import numpy as np
5 | import torch
6 | import pytest
7 |
8 | @pytest.fixture(scope='module')
9 | def engine():
10 | return VC()
11 |
12 | @pytest.fixture(scope='module')
13 | def src(example_wav):
14 | return example_wav
15 |
16 | @pytest.fixture(scope='module')
17 | def tgts(example_wav):
18 | return [
19 | example_wav,
20 | example_wav[:len(example_wav) // 2],
21 | ]
22 |
23 |
24 | def test_prepare(engine, src, tgts):
25 | src_features, (tgt_mel, tgt_spk_emb) = engine.prepare(src, tgts)
26 |
27 | assert len(src_features.shape) == 3
28 | assert len(tgt_mel.shape) == 3
29 | assert len(tgt_spk_emb.shape) == 2
30 |
31 |
32 | def test_convert(engine, src, tgts):
33 | mel = engine.convert(*engine.prepare(src, tgts))
34 |
35 | assert len(mel.shape) == 3
36 |
37 |
38 | def test_vocode(engine, src):
39 | src = torch.from_numpy(src).unsqueeze(0)
40 | mel = engine._get_mel(src)
41 |
42 | wav = engine.vocode(mel)
43 |
44 | assert len(wav.shape) == 2
45 |
46 |
47 | def test_e2e(engine, src, tgts):
48 | # using built-in __call__
49 | src_utt = Utterance(wav=src)
50 | tgt_utts = [Utterance(wav=wav) for wav in tgts]
51 | out1 = engine(src_utt, tgt_utts)
52 |
53 | # using step-by-step
54 | out2 = engine.convert(*engine.prepare(src, tgts))
55 | out2 = engine.vocode(out2).cpu().numpy()
56 |
57 | assert np.allclose(out1, out2)
58 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-Time Voice Conversion
2 |
3 | 
4 |
5 | This repository implements a simple toolbox for voice conversion. It uses one of the latest results in mel-based source-to-target one-shot voice conversion together with a fast GAN-based vocoder.
6 |
7 | ## Installation
8 |
9 | The application file of every release was created with [PyInstaller](https://pypi.org/project/pyinstaller/), which makes it possible to build for many popular platforms (Windows, macOS, GNU/Linux, etc.). Download the release archive for your OS, then simply start the application file named `VCToolbox` from the unarchived zip.
10 |
11 | Available OS releases:
12 |
13 | - MacOS
14 | - Linux
15 |
16 | If you want to distribute for other platforms, follow the instructions in `dist/README.md`.
17 |
18 | ## Installation (Development build)
19 |
20 | 1. Install the requirements (Python 3.7+ was tested and works):
21 |
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 |
26 | **Note:** for mp3 support you should have ffmpeg installed (e.g. via `brew` on macOS, `apt-get` on Linux, or static builds on Windows).
27 |
28 | 2. Everything is now ready for launching the toolbox. Required models will be downloaded on the first application run. Run the toolbox with the following command:
29 |
30 | ```bash
31 | python3 app.py
32 | ```
33 |
34 | 3*. You may want to run the tests for this repo. This can be done via:
35 |
36 | ```bash
37 | pytest
38 | ```
39 |
40 | **Note:** this will also download the models if they haven't been downloaded yet.
41 |
42 | ## Credits
43 |
44 | The author thanks the developer of [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) for the cool design and implementation ideas.
45 |
--------------------------------------------------------------------------------
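For orientation, a minimal sketch (illustrative, not a repo file) of driving the conversion engine programmatically instead of through the GUI; it mirrors `engine/general.py` and `test/test_engine.py` shown later in this dump, and the paths and output filename are examples:

```python
from engine import VC, Utterance
from utils import load_wav
from utils.data import save_wav
from utils.hparams import cfg

engine = VC()
for _ in engine.logged_init():   # loads the feature extractors, the FragmentVC model and the HiFi-GAN vocoder
    pass

src_wav = load_wav('datasets/test/example/example.wav')
tgt_wavs = [load_wav('datasets/test/example/example.wav')]   # reference audio of the target speaker

out_wav = engine(Utterance(wav=src_wav), [Utterance(wav=w) for w in tgt_wavs])
save_wav('converted.wav', out_wav.squeeze(0), cfg.data.sample_rate)
```
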
/engine/loading.py:
--------------------------------------------------------------------------------
1 | from omegaconf import DictConfig
2 | from utils.hparams import cfg
3 | from .vocoder import HifiGenerator
4 |
5 | import os
6 | import torch
7 | import json
8 |
9 | ckpt_dir = os.path.join(cfg.root_dir, cfg.ckpt_default_path)
10 | ckpt_dict = cfg.ckpt_dict
11 |
12 |
13 | def get_vocoder():
14 | group_name = 'hifi_gan'
15 | download_group(group_name)
16 |
17 | config_path = os.path.join(ckpt_dir, group_name, "config.yaml")
18 | json_config = json.loads(open(config_path).read())
19 | with torch.no_grad():
20 | generator = HifiGenerator(DictConfig(json_config)).eval()
21 |
22 | ckpt_path = os.path.join(ckpt_dir, group_name, "generator")
23 | state = torch.load(ckpt_path, map_location=torch.device('cpu'))
24 | generator.load_state_dict(state['generator'])
25 | generator.remove_weight_norm()
26 |
27 | return generator
28 |
29 |
30 | def get_vc_model():
31 | group_name = 'fragmentvc'
32 | download_group(group_name)
33 |
34 | ckpt_path = os.path.join(ckpt_dir, group_name, "model.pt")
35 | model = torch.jit.load(ckpt_path).eval()
36 |
37 | return model
38 |
39 |
40 | def download_group(group_name):
41 | for filename, (url, agent) in ckpt_dict[group_name].items():
42 | filepath = os.path.join(ckpt_dir, group_name, filename)
43 | _download(filepath, url, agent=agent)
44 |
45 |
46 | def _download(filepath, url, refresh=False, agent='wget'):
47 | '''
48 | Download from url into filepath using agent if needed
49 | Ref: https://github.com/s3prl/s3prl
50 | '''
51 |
52 | dirpath = os.path.dirname(filepath)
53 | os.makedirs(dirpath, exist_ok=True)
54 |
55 | if not os.path.isfile(filepath) or refresh:
56 | if agent == 'wget':
57 | os.system(f'wget {url} -O {filepath}')
58 | elif agent == 'gdown':
59 | import gdown
60 | gdown.download(url, filepath, use_cookies=False)
61 | else:
62 | print('[Download] - Unknown download agent. Only \'wget\' and \'gdown\' are supported.')
63 | raise NotImplementedError
64 | else:
65 | print(f'Using checkpoint found in {filepath}')
66 |
--------------------------------------------------------------------------------
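A tiny usage sketch (illustrative only) of the loaders above; the checkpoint URLs and target filenames come from `ckpt_dict` in `config.yaml`:

```python
from engine.loading import get_vocoder, get_vc_model

vocoder = get_vocoder()    # downloads the hifi_gan 'generator' checkpoint and 'config.yaml' into .cache/hifi_gan/ if missing
vc_model = get_vc_model()  # downloads the TorchScript FragmentVC checkpoint into .cache/fragmentvc/
```
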
/engine/feature_extraction.py:
--------------------------------------------------------------------------------
1 | from resemblyzer import VoiceEncoder
2 | from transformers import Wav2Vec2Model
3 |
4 | import torch
5 | from torch import nn
6 | import torch.nn.functional as F
7 | from librosa.filters import mel as librosa_mel_fn
8 | from utils.hparams import cfg
9 |
10 |
11 | def load_pretrained_spk_emb(device='cpu'):
12 | """Load speaker embedding model"""
13 |
14 | model = VoiceEncoder().to(device).eval()
15 | model.requires_grad_(False)
16 | return model
17 |
18 |
19 | def load_pretrained_feature_extractor(device='cpu', ckpt_path='facebook/wav2vec2-base-960h'):
20 | """Load pretrained Wav2Vec model."""
21 |
22 | def extract_features(self, wav, mask):
23 | # wav2vec has window of 400, so we pad to center windows
24 | wav = torch.nn.functional.pad(wav.unsqueeze(1), (200, 200), mode='reflect').squeeze(1)
25 | return [self(wav).last_hidden_state]
26 |
27 | Wav2Vec2Model.extract_features = extract_features # for same behaviour as fairseq.Wav2Vec2Model
28 | model = Wav2Vec2Model.from_pretrained(ckpt_path).eval()
29 | model.requires_grad_(False)
30 | return model
31 |
32 |
33 | class Wav2Mel(nn.Module):
34 | def __init__(self, n_fft, hop_length, win_length,
35 | sample_rate, n_mels, f_min, f_max, preemph
36 | ):
37 | super().__init__()
38 |
39 | window = torch.hann_window(win_length).float()
40 | self.register_buffer("window", window)
41 |
42 | mel_basis = torch.from_numpy(librosa_mel_fn(
43 | sample_rate, n_fft, n_mels, f_min, f_max
44 | )).float()
45 | self.register_buffer("mel_basis", mel_basis)
46 |
47 | preemph_kernel = torch.FloatTensor([[[-preemph, 1]]])
48 | self.register_buffer("preemph_kernel", preemph_kernel)
49 |
50 | self.n_fft = n_fft
51 | self.hop_length = hop_length
52 | self.win_length = win_length
53 | self.sample_rate = sample_rate
54 | self.n_mels = n_mels
55 |
56 | def forward(self, wav):
57 | n_pad = self.n_fft // 2
58 |
59 | while len(wav.shape) < 3:
60 | wav = wav.unsqueeze(0)
61 |
62 | wav = torch.nn.functional.conv1d(wav, self.preemph_kernel, padding=1)[:, :, :-1]
63 |
64 | wav = F.pad(wav, (n_pad, n_pad), "reflect").squeeze(0)
65 | spec = torch.stft(wav, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length,
66 | window=self.window, center=False, return_complex=True
67 | ).abs()
68 |
69 | mel = torch.matmul(self.mel_basis, spec)
70 | log_mel = torch.log(torch.clamp(mel, min=1e-5))
71 |
72 | return log_mel
--------------------------------------------------------------------------------
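A small shape-checking sketch (illustrative, not a repo file) for the `Wav2Mel` front end above; the engine builds it the same way with `Wav2Mel(**cfg.data)`:

```python
import torch
from engine.feature_extraction import Wav2Mel
from utils.hparams import cfg

wav2mel = Wav2Mel(**cfg.data)                 # constructed from config.yaml, as engine/general.py does
wav = torch.randn(1, cfg.data.sample_rate)    # one second of audio, shape (batch=1, samples)
log_mel = wav2mel(wav)                        # log-mel spectrogram, shape (1, n_mels, n_frames)
print(log_mel.shape)
```
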
/dist/README.md:
--------------------------------------------------------------------------------
1 | # Toolbox package building guide
2 |
3 | We use the OS-dependent utility PyInstaller as the package building tool. The result of the steps below is a platform-dependent build of a single application. It can be used directly on the same platform where the guide instructions were completed. For other platforms, build on virtual machines or containers. [Read more about](https://pyinstaller.readthedocs.io/en/stable/usage.html) the build tool.
4 |
5 | **Note:** using a Python virtual environment is highly recommended:
6 | ```bash
7 | # creation
8 | virtualenv ~/install_env
9 | source ~/install_env/bin/activate
10 |
11 | # after use (delete if needed)
12 | deactivate
13 | rm -rf ~/install_env
14 | ```
15 |
16 | The procedure for distributing this repo's toolbox consists of several steps (all performed from the `dist/` directory).
17 |
18 | ## MacOS
19 |
20 | 1. Install all required packages from requirements.txt. Also install pyinstaller and uninstall some deprecated (but installed) packages:
21 | ```bash
22 | pip install -r ../requirements.txt
23 | pip install pyinstaller
24 | pip uninstall typing dataclasses
25 | ```
26 |
27 | 2. Try a first build, which will definitely fail but will provide us with an important warning log.
28 | ```bash
29 | pyinstaller --name="VCToolbox" --windowed --add-data="../config.yaml:./" --add-data="../datasets/*:datasets/" --hidden-import=typing_extensions -y --onefile ../app.py
30 | ```
31 |
32 | **Note:** `--onefile` is optional and is not a fully supported option with PySide6 (the main Qt for Python package used in the toolbox).
33 |
34 | After the command above, there will be two new directories ('build/' and 'dist/'). We don't actually need the 'dist/' folder yet (it stores the released app), but the generated 'build/VCToolbox/warn-VCToolbox.txt' will be used in the next stages.
35 |
36 | 3. Define installation hooks for the modules that were not found. To do this, run the `warn_processing.py` file:
37 | ```bash
38 | python3 warn_processing.py
39 | ```
40 |
41 | **Important:** If the script fails on the Python installation itself (there will be a string containing the Python path), you should add that string to `remove_list` in the `warn_processing.py` file and run the script again.
42 |
43 | This stage is cumbersome and depends on the previous one. Some important modules that weren't found (such as librosa) are already added in the script. After the command, a `hooks/` directory will be generated with files (~60 files) defining hooks for PyInstaller (a sample generated hook file is shown after this guide). The tree will look like:
44 |
45 | ```
46 | .
47 | ├── README.md
48 | ├── VCToolbox.spec
49 | ├── build
50 | ├── dist
51 | ├── hooks
52 | └── warn_processing.py
53 | ```
54 |
55 | You can remove unnecessary 'dist/' and 'build/' folders for now:
56 | ```bash
57 | rm -rf dist/ build/
58 | ```
59 |
60 | 4. Build with hooks for the packages that were not found:
61 | ```bash
62 | pyinstaller --name="VCToolbox" --windowed --hidden-import=typing_extensions -y --additional-hooks-dir=hooks --onefile ../app.py
63 | ```
64 |
65 | If everything is done right, there will be an executable file in the dist/ folder.
66 |
67 | 5. (optional) For distribution purposes, move the app binary into the root folder, because it uses the `config.yaml` and `datasets/` paths. Then zip it into an archive, or use another utility for `.app` and `.dmg` creation.
68 |
69 |
--------------------------------------------------------------------------------
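For reference, every file that `warn_processing.py` generates under `hooks/` has the same two-line body; for example, `hooks/hook-librosa.py` would contain:

```python
# hooks/hook-librosa.py (as written by warn_processing.py)
from PyInstaller.utils.hooks import collect_all

datas, binaries, hiddenimports = collect_all('librosa')
```
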
/dist/warn_processing.py:
--------------------------------------------------------------------------------
1 | ### Define below the folder where the hook files will be created, and the warning file to read the modules from
2 | output_hooks_dir = 'hooks'
3 | warning_file = 'build/VCToolbox/warn-VCToolbox.txt'
4 |
5 | import re
6 | import os
7 | import shutil
8 |
9 | shutil.rmtree(output_hooks_dir, ignore_errors=True)
10 | os.makedirs(output_hooks_dir, exist_ok=True)
11 |
12 | with open(warning_file) as file:
13 | files_content = file.readlines()
14 |
15 | clean_content = []
16 | for line in files_content:
17 | if re.search('missing module named',line):
18 | temp_line = re.sub('.*imported by ','',line)
19 | temp_line = re.sub('\n',', ',temp_line)
20 | clean_content.append(temp_line)
21 | clean_content = list(set(clean_content))
22 | joined_content = ''.join(clean_content)
23 | clean_content = list(set(joined_content.split('), ')))
24 |
25 | modules_toplevel = []
26 | for line in clean_content:
27 | if re.search('top-level',line):
28 | temp_mod = re.sub(' \(.*','',line)
29 | temp_mod = re.sub('\..*','',temp_mod)
30 | modules_toplevel.append(temp_mod)
31 | modules_toplevel = list(set(modules_toplevel))
32 |
33 | modules_conditional = []
34 | for line in clean_content:
35 | if re.search('conditional',line):
36 | temp_mod = re.sub(' \(.*','',line)
37 | temp_mod = re.sub('\..*','',temp_mod)
38 | modules_conditional.append(temp_mod)
39 | modules_conditional = list(set(modules_conditional))
40 |
41 | modules_delayed = []
42 | for line in clean_content:
43 | if re.search('delayed',line):
44 | temp_mod = re.sub(' \(.*','',line)
45 | temp_mod = re.sub('\..*','',temp_mod)
46 | modules_delayed.append(temp_mod)
47 | modules_delayed = list(set(modules_delayed))
48 |
49 | modules_optional = []
50 | for line in clean_content:
51 | if re.search('optional',line):
52 | temp_mod = re.sub(' \(.*','',line)
53 | temp_mod = re.sub('\..*','',temp_mod)
54 | modules_optional.append(temp_mod)
55 | modules_optional = list(set(modules_optional))
56 |
57 | all_modules = modules_toplevel + modules_conditional + modules_delayed + modules_optional
58 | all_modules = list(set(all_modules))
59 |
60 | print(all_modules)
61 | print('Number of found modules:', len(all_modules))
62 |
63 | ### Optional: remove any of the modules
64 | remove_list = [
65 | '/usr/local/lib/python3',
66 | '/Users/sotomi/envs/pyinstaller-env/lib/python3',
67 | 'zipimport',
68 | 'test',
69 | ]
70 | add_list = [
71 | 'sacremoses',
72 | 'resemblyzer',
73 | 'librosa',
74 | ]
75 | for pkg in remove_list:
76 | if pkg in all_modules:
77 | all_modules.remove(pkg)
78 | for pkg in add_list:
79 | if pkg not in all_modules:
80 | all_modules.append(pkg)
81 |
82 | print('Total number of requested modules:', len(all_modules))
83 |
84 | ### Optional: Change all_modules by any of the other lists, e.g. modules_toplevel
85 | for module in all_modules:
86 | output_content = 'from PyInstaller.utils.hooks import collect_all\n\ndatas, binaries, hiddenimports = collect_all(\''+module+'\')'
87 | with open(output_hooks_dir+'/hook-'+str(module)+'.py', 'w') as f:
88 | f.write(output_content)
89 |
--------------------------------------------------------------------------------
/engine/general.py:
--------------------------------------------------------------------------------
1 | from .loading import get_vocoder, get_vc_model
2 | from .feature_extraction import load_pretrained_spk_emb, load_pretrained_feature_extractor, Wav2Mel
3 | from utils.hparams import cfg
4 |
5 | from resemblyzer import preprocess_wav
6 | import torch
7 | import numpy as np
8 | import librosa
9 | from pathlib import Path
10 | from dataclasses import dataclass, field
11 | from typing import List
12 |
13 |
14 | @dataclass
15 | class Utterance:
16 | """Audio with its cached features"""
17 | wav: np.array = field(repr=False, default=None)
18 | sr: int = None
19 | path: str = None
20 | spk_name: str = None
21 | mel: np.ndarray = field(repr=False, default=None)
22 | spk_emb: np.array = field(repr=False, default=None)
23 | features: np.ndarray = field(repr=False, default=None)
24 |
25 |
26 | def clear(self):
27 | self.wav = None
28 | self.mel = None
29 | self.features = None
30 |
31 | def get_id(self):
32 | if self.path is None or self.spk_name is None:
33 | return
34 | return (self.spk_name, Path(self.path).stem)
35 |
36 | def __eq__(self, other):
37 | return self.get_id() == other.get_id()
38 |
39 | def __hash__(self):
40 | return hash(self.get_id())
41 |
42 |
43 | class VC:
44 | def __init__(self):
45 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
46 | self.feature_extractor = None
47 | self.mel_extractor = None
48 | self.spk_emb_extractor = None
49 | self.model = None
50 | self.vocoder = None
51 |
52 | @staticmethod
53 | def yield_init_stage(stage_num):
54 | return stage_num / 5
55 |
56 | def logged_init(self):
57 | yield self.yield_init_stage(0)
58 | self.feature_extractor = load_pretrained_feature_extractor(device=self.device)
59 |
60 | yield self.yield_init_stage(1)
61 | self.mel_extractor = Wav2Mel(**cfg.data)
62 |
63 | yield self.yield_init_stage(2)
64 | self.spk_emb_extractor = load_pretrained_spk_emb(device=self.device)
65 |
66 | yield self.yield_init_stage(3)
67 | self.model = get_vc_model().to(self.device)
68 |
69 | yield self.yield_init_stage(4)
70 | self.vocoder = get_vocoder().to(self.device)
71 |
72 | yield self.yield_init_stage(5)
73 |
74 | def __call__(self, src: Utterance, tgts: List[Utterance], input_sr: int = cfg.data.sample_rate):
75 | """Convert source utterance from source speaker to target speaker"""
76 |
77 | # preparation
78 | src_features, tgt_features = self.prepare(src.wav, [tgt.wav for tgt in tgts], input_sr=input_sr)
79 |
80 | # conversion
81 | out_mel = self.convert(src_features, tgt_features)
82 |
83 | # vocoding
84 | out_wav = self.vocode(out_mel)
85 |
86 | return out_wav.cpu().numpy()
87 |
88 | def prepare(self, src_wav, tgt_wavs, input_sr=cfg.data.sample_rate):
89 | src_wav = torch.from_numpy(src_wav).to(self.device)
90 | if len(src_wav.shape) == 1:
91 | src_wav = src_wav.unsqueeze(0)
92 | tgt_wav = torch.from_numpy(
93 | np.concatenate(tgt_wavs)
94 | ).to(self.device).unsqueeze(0)
95 |
96 | src_features = self._get_features(src_wav)
97 | tgt_spk_emb = self._get_spk_emb(tgt_wavs, input_sr)
98 | tgt_mel = self.mel_extractor(tgt_wav)
99 |
100 | return src_features, (tgt_mel, tgt_spk_emb)
101 |
102 | def convert(self, src_features, tgt_features):
103 | tgt_mel, tgt_spk_emb = tgt_features
104 | with torch.no_grad():
105 | out_mel, _, _, _ = self.model(src_features, tgt_mel, ref_embs=tgt_spk_emb)
106 | return out_mel
107 |
108 | def vocode(self, mel):
109 | with torch.no_grad():
110 | wav = self.vocoder(mel).squeeze(1)
111 | return wav
112 |
113 | def _get_mel(self, wav):
114 | return self.mel_extractor(wav)
115 |
116 | def _get_features(self, wav):
117 | with torch.no_grad():
118 | return self.feature_extractor.extract_features(wav, None)[0]
119 |
120 | def _get_spk_emb(self, wavs, sr=None):
121 | wavs = [preprocess_wav(wav, sr) for wav in wavs]
122 | cat_wav = np.concatenate(wavs, 0)
123 | spk_emb = self.spk_emb_extractor.embed_utterance(cat_wav)
124 |
125 | return torch.from_numpy(spk_emb).to(self.device).unsqueeze(0)
126 |
127 | # @staticmethod
128 | # def preprocess_single_wav(fpath_or_wav: Union[str, Path, np.ndarray], src_sr=None, tgt_sr=None):
129 | # # TODO
130 | # if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
131 | # wav, src_sr = librosa.load(str(fpath_or_wav), sr=None)
132 | # else:
133 | # wav = fpath_or_wav
134 |
135 | # # Resample the wav
136 | # if src_sr is not None and tgt_sr is not None:
137 | # wav = librosa.resample(wav, src_sr, tgt_sr)
138 |
139 | # return wav
--------------------------------------------------------------------------------
/engine/vocoder/hifi_gan.py:
--------------------------------------------------------------------------------
1 | # Hifi-GAN Reference:
2 | # https://github.com/jik876/hifi-gan/blob/master/models.py
3 |
4 | import torch
5 | import torch.nn.functional as F
6 | import torch.nn as nn
7 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
8 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
9 | from .utils import init_weights, get_padding
10 |
11 | LRELU_SLOPE = 0.1
12 |
13 |
14 | class ResBlock1(torch.nn.Module):
15 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
16 | super(ResBlock1, self).__init__()
17 | self.h = h
18 | self.convs1 = nn.ModuleList([
19 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
20 | padding=get_padding(kernel_size, dilation[0]))),
21 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
22 | padding=get_padding(kernel_size, dilation[1]))),
23 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
24 | padding=get_padding(kernel_size, dilation[2])))
25 | ])
26 | self.convs1.apply(init_weights)
27 |
28 | self.convs2 = nn.ModuleList([
29 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
30 | padding=get_padding(kernel_size, 1))),
31 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
32 | padding=get_padding(kernel_size, 1))),
33 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
34 | padding=get_padding(kernel_size, 1)))
35 | ])
36 | self.convs2.apply(init_weights)
37 |
38 | def forward(self, x):
39 | for c1, c2 in zip(self.convs1, self.convs2):
40 | xt = F.leaky_relu(x, LRELU_SLOPE)
41 | xt = c1(xt)
42 | xt = F.leaky_relu(xt, LRELU_SLOPE)
43 | xt = c2(xt)
44 | x = xt + x
45 | return x
46 |
47 | def remove_weight_norm(self):
48 | for l in self.convs1:
49 | remove_weight_norm(l)
50 | for l in self.convs2:
51 | remove_weight_norm(l)
52 |
53 |
54 | class ResBlock2(torch.nn.Module):
55 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
56 | super(ResBlock2, self).__init__()
57 | self.h = h
58 | self.convs = nn.ModuleList([
59 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
60 | padding=get_padding(kernel_size, dilation[0]))),
61 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
62 | padding=get_padding(kernel_size, dilation[1])))
63 | ])
64 | self.convs.apply(init_weights)
65 |
66 | def forward(self, x):
67 | for c in self.convs:
68 | xt = F.leaky_relu(x, LRELU_SLOPE)
69 | xt = c(xt)
70 | x = xt + x
71 | return x
72 |
73 | def remove_weight_norm(self):
74 | for l in self.convs:
75 | remove_weight_norm(l)
76 |
77 |
78 | class Generator(torch.nn.Module):
79 | def __init__(self, h):
80 | super(Generator, self).__init__()
81 | self.h = h
82 | self.num_kernels = len(h.resblock_kernel_sizes)
83 | self.num_upsamples = len(h.upsample_rates)
84 | self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
85 | resblock = ResBlock1 if h.resblock == '1' else ResBlock2
86 |
87 | self.ups = nn.ModuleList()
88 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
89 | self.ups.append(weight_norm(
90 | ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
91 | k, u, padding=(u//2 + u%2), output_padding=u%2)))
92 |
93 | self.resblocks = nn.ModuleList()
94 | for i in range(len(self.ups)):
95 | ch = h.upsample_initial_channel//(2**(i+1))
96 | for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
97 | self.resblocks.append(resblock(h, ch, k, d))
98 |
99 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
100 | self.ups.apply(init_weights)
101 | self.conv_post.apply(init_weights)
102 |
103 | def forward(self, x):
104 | x = self.conv_pre(x)
105 | for i in range(self.num_upsamples):
106 | x = F.leaky_relu(x, LRELU_SLOPE)
107 | x = self.ups[i](x)
108 | xs = None
109 | for j in range(self.num_kernels):
110 | if xs is None:
111 | xs = self.resblocks[i*self.num_kernels+j](x)
112 | else:
113 | xs += self.resblocks[i*self.num_kernels+j](x)
114 | x = xs / self.num_kernels
115 | x = F.leaky_relu(x)
116 | x = self.conv_post(x)
117 | x = torch.tanh(x)
118 |
119 | return x
120 |
121 | def remove_weight_norm(self):
122 | print('Removing weight norm...')
123 | for l in self.ups:
124 | remove_weight_norm(l)
125 | for l in self.resblocks:
126 | l.remove_weight_norm()
127 | remove_weight_norm(self.conv_pre)
128 | remove_weight_norm(self.conv_post)
129 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/toolbox.py:
--------------------------------------------------------------------------------
1 | # from audioread.exceptions import NoBackendError
2 | from engine import VC, Utterance
3 | from gui import GUI
4 | from pathlib import Path
5 | import utils
6 | from utils.hparams import cfg
7 |
8 | from time import perf_counter as timer
9 | import traceback
10 | import numpy as np
11 | import torch
12 | import os
13 | import sys
14 | from collections import defaultdict
15 |
16 |
17 |
18 | # Maximum of generated wavs to keep on memory
19 | MAX_WAVS = 15
20 | MAX_TARGET_SAMPLES = 10
21 | MAX_LOADED_SAMPLES = 100
22 |
23 |
24 | class Toolbox:
25 | def __init__(self, datasets_root, seed):
26 | sys.excepthook = self.excepthook
27 | self.seed = seed
28 | self.datasets_root = datasets_root
29 | self.recognized_datasets = []
30 | self.utterances = set()
31 | self.current_generated = (None, None, None, None) # speaker_name, mel, breaks, wav
32 | self.speaker_filepathes = defaultdict(set)
33 | self.audio_ext = {'.wav', '.flac', '.mp3'}
34 | for datafolder in utils.data.get_subdirs(datasets_root):
35 | self.load_dataset_info(os.path.join(self.datasets_root, datafolder))
36 |
37 | self.engine = None # type: VC
38 | self.current_src_utt = None
39 | self.current_tgt_utts = None
40 | self.current_tgt_spk = None
41 | self.loaded_utts = []
42 | self.conv_utts_list = []
43 | self.conv_utts_idlist = []
44 | self.self_record_count = 0
45 |
46 | self.trim_silences = True
47 |
48 | # Initialize the events and the interface
49 | self.ui = GUI()
50 | self.reset_ui(seed)
51 | self.setup_events()
52 | self.ui.start()
53 |
54 | def excepthook(self, exc_type, exc_value, exc_tb):
55 | traceback.print_exception(exc_type, exc_value, exc_tb)
56 | self.ui.log("Exception: %s" % exc_value)
57 |
58 | def setup_events(self):
59 | # Dataset, speaker and utterance selection
60 | self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser())
61 | random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root, self.recognized_datasets, level)
62 | self.ui.random_dataset_button.clicked.connect(random_func(0))
63 | self.ui.random_speaker_button.clicked.connect(random_func(1))
64 | self.ui.random_utterance_button.clicked.connect(random_func(2))
65 | self.ui.dataset_box.currentIndexChanged.connect(random_func(1))
66 | self.ui.src_spk_box.currentIndexChanged.connect(random_func(2))
67 | self.ui.tgt_spk_box.currentIndexChanged.connect(random_func(2))
68 |
69 | # Utterance selection
70 | func = lambda: self.load_from_browser(self.ui.browse_file())
71 | self.ui.browser_browse_button.clicked.connect(func)
72 | func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
73 | self.ui.utterance_history.currentIndexChanged.connect(func)
74 | func = lambda: self.ui.play(self.ui.selected_utterance.wav, cfg.data.sample_rate)
75 | self.ui.play_button.clicked.connect(func)
76 | self.ui.stop_button.clicked.connect(self.ui.stop)
77 | self.ui.record_button.clicked.connect(self.record)
78 |
79 | # Audio
80 | self.ui.setup_audio_devices(cfg.data.sample_rate)
81 |
82 | # Wav playback & save
83 | func = lambda: self.replay_last_wav()
84 | self.ui.replay_wav_button.clicked.connect(func)
85 | func = lambda: self.export_current_wave()
86 | self.ui.export_wav_button.clicked.connect(func)
87 | self.ui.wavs_cb.currentIndexChanged.connect(self.set_current_utt)
88 |
89 | # Generation
90 | func = lambda: self.convert() or self.vocode()
91 | self.ui.generate_button.clicked.connect(func)
92 | self.ui.synthesize_button.clicked.connect(self.convert)
93 | self.ui.vocode_button.clicked.connect(self.vocode)
94 | self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox)
95 |
96 | # UMAP legend
97 | self.ui.clear_button.clicked.connect(self.clear_utterances)
98 |
99 | def set_current_utt(self, index):
100 | self.current_src_utt = self.conv_utts_list[index]
101 |
102 | def export_current_wave(self):
103 | self.ui.save_audio_file(self.current_src_utt, cfg.data.sample_rate)
104 |
105 | def replay_last_wav(self):
106 | self.ui.play(self.current_src_utt, cfg.data.sample_rate)
107 |
108 | def reset_ui(self, seed):
109 | self.recognized_datasets = [p for p in self.datasets_root.iterdir() if p.is_dir()]
110 | self.ui.populate_browser(self.datasets_root, self.recognized_datasets, 0, True)
111 | self.ui.populate_gen_options(seed, self.trim_silences)
112 |
113 | def load_from_browser(self, fpath=None):
114 | if fpath is None:
115 | fpath = Path(self.datasets_root, self.ui.current_dataset_name, self.ui.current_src_spk, self.ui.current_utterance_name)
116 | name = str(fpath.relative_to(self.datasets_root))
117 | speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_src_spk
118 |
119 | # Select the next utterance
120 | if self.ui.auto_next_checkbox.isChecked():
121 | self.ui.browser_select_next()
122 | elif fpath == "":
123 | return
124 | else:
125 | name = fpath.name
126 | speaker_name = fpath.parent.name
127 |
128 |
129 | # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
130 | # playback, so as to have a fair comparison with the generated audio
131 | wav = utils.load_wav(str(fpath))
132 | self.ui.log("Loaded %s" % name)
133 |
134 | self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name)
135 |
136 | def record(self):
137 | wav = self.ui.record_one(cfg.data.sample_rate, 5)
138 | if wav is None:
139 | return
140 | self.ui.play(wav, cfg.data.sample_rate)
141 | self.self_record_count += 1
142 |
143 | speaker_name = "user_recorder"
144 | name = f"{speaker_name}_{self.self_record_count}"
145 | self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name)
146 |
147 | def add_real_utterance(self, wav, sr, path, spk_name):
148 | if self.engine is None:
149 | self.init_engine()
150 |
151 | # Compute the mel spectrogram
152 | mel = self.engine._get_mel(torch.from_numpy(wav))
153 | self.ui.draw_mel(mel.squeeze(0), "current")
154 |
155 | # Compute the embedding
156 | embed = self.engine._get_spk_emb([wav], sr=sr)
157 |
158 | # Add the utterance
159 | utterance = Utterance(
160 | wav=wav, sr=sr,
161 | path=path, spk_name=spk_name,
162 | mel=mel.cpu().numpy().squeeze(0), spk_emb=embed.squeeze(0)
163 | )
164 | if utterance not in self.utterances:
165 | self.utterances.add(utterance)
166 | self.ui.register_utterance(utterance)
167 |
168 | # Plot it
169 | # self.ui.draw_embed(embed, Path(path).stem, "current")
170 | self.ui.draw_umap_projections(self.utterances)
171 |
172 | def clear_utterances(self):
173 | self.reset_ui(self.seed)
174 | self.utterances.clear()
175 | self.ui.draw_umap_projections(self.utterances)
176 |
177 | def convert(self):
178 | self.ui.log("Converting from source to target...")
179 | self.ui.set_loading(1)
180 |
181 | # Update the synthesizer random seed
182 | if self.ui.random_seed_checkbox.isChecked():
183 | seed = int(self.ui.seed_textbox.text())
184 | self.ui.populate_gen_options(seed, self.trim_silences)
185 | else:
186 | seed = None
187 |
188 | tgt_spk = self.ui.current_tgt_spk
189 |
190 | # Synthesize the spectrogram
191 | if self.engine is None:
192 | self.init_engine()
193 |
194 | src_wav = self.ui.selected_utterance.wav
195 | if self.current_tgt_spk is None or self.current_tgt_spk != tgt_spk:
196 | self.current_tgt_utts = self.get_spk_utterances(tgt_spk)
197 |
198 | tgt_wavs = [tgt.wav for tgt in self.current_tgt_utts]
199 | prep_data = self.engine.prepare(src_wav, tgt_wavs)
200 | mel = self.engine.convert(*prep_data)
201 |
202 | self.ui.draw_mel(mel.cpu().numpy().squeeze(0), "converted mel")
203 | self.current_generated = (self.ui.selected_utterance.spk_name, Path(self.ui.selected_utterance.path).stem, self.ui.current_tgt_spk, mel)
204 | self.ui.set_loading(0)
205 |
206 | def vocode(self):
207 | src_spk, basename, tgt_spk, mel = self.current_generated
208 | assert mel is not None
209 |
210 | # Synthesize the waveform
211 | if not self.engine:
212 | self.init_engine()
213 |
214 | # def vocoder_progress(i, seq_len, b_size, gen_rate):
215 | # real_time_factor = (gen_rate / cfg.data.sample_rate) * 1000
216 | # line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
217 | # % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
218 | # self.ui.log(line, "overwrite")
219 | # self.ui.set_loading(i, seq_len)
220 |
221 | # wav = vocoder.infer_waveform(mel, progress_callback=vocoder_progress)
222 | wav = self.engine.vocode(mel).squeeze(0).cpu().numpy()
223 | self.ui.set_loading(0)
224 | self.ui.log("Done!", "append")
225 |
226 |
227 | # Play it
228 | wav = (wav / np.abs(wav).max()) * 0.95
229 | self.ui.play(wav, cfg.data.sample_rate)
230 |
231 | # Name it (history displayed in combobox)
232 | name = f"{src_spk}_to_{tgt_spk}_{basename}"
233 | spk_name = f"{src_spk}_to_{tgt_spk}"
234 |
235 | # Update wavs combobox
236 | if len(self.conv_utts_list) > MAX_WAVS:
237 | self.conv_utts_list.pop()
238 | self.conv_utts_idlist.pop()
239 | self.conv_utts_list.insert(0, wav)
240 | self.conv_utts_idlist.insert(0, name)
241 |
242 | # self.ui.wavs_cb.disconnect()
243 | self.ui.wavs_cb_model.setStringList(self.conv_utts_idlist)
244 | self.ui.wavs_cb.setCurrentIndex(0)
245 | self.ui.wavs_cb.currentIndexChanged.connect(self.set_current_utt)
246 |
247 | # Update current wav
248 | self.set_current_utt(0)
249 |
250 | # Enable replay and save buttons:
251 | self.ui.replay_wav_button.setDisabled(False)
252 | self.ui.export_wav_button.setDisabled(False)
253 |
254 | # Compute speaker embedding
255 | embed = self.engine._get_spk_emb([wav], sr=cfg.data.sample_rate)
256 |
257 | # Add the utterance
258 | utterance = Utterance(
259 | wav=wav, sr=cfg.data.sample_rate,
260 | path=name, spk_name=spk_name,
261 | mel=mel.cpu().numpy().squeeze(0), spk_emb=embed.squeeze(0)
262 | )
263 | self.utterances.add(utterance)
264 |
265 | # Plot it
266 | # self.ui.draw_embed(embed, name, "generated")
267 | self.ui.draw_umap_projections(self.utterances)
268 |
269 |
270 | def get_spk_utterances(self, spk_name):
271 | utts = list(filter(lambda u: u.spk_name == spk_name, self.loaded_utts))
272 | if len(utts) >= MAX_TARGET_SAMPLES:
273 | return utts
274 |
275 | utts_pathes = set(map(lambda u: u.path, utts))
276 | available_utts_pathes = list(filter(lambda p: p not in utts_pathes, self.speaker_filepathes[spk_name]))
277 | available_utts_pathes = available_utts_pathes[:MAX_TARGET_SAMPLES - len(utts_pathes)]
278 |
279 | new_utts = list(map(lambda p: self.load_utterance(spk_name, p), available_utts_pathes))
280 | self.loaded_utts.extend(new_utts)
281 | self.loaded_utts = self.loaded_utts[-MAX_LOADED_SAMPLES:]
282 |
283 | utts.extend(new_utts)
284 | return utts
285 |
286 | def load_utterance(self, spk_name, path):
287 | wav = utils.load_wav(path)
288 | return Utterance(wav, cfg.data.sample_rate, path=path, spk_name=spk_name)
289 |
290 | def load_dataset_info(self, dataset_path):
291 | speakers = utils.data.get_subdirs(dataset_path)
292 |
293 | for spk in speakers:
294 | self.speaker_filepathes[spk] = {
295 | *self.speaker_filepathes[spk],
296 | *utils.data.get_filepathes(os.path.join(dataset_path, spk), self.audio_ext)
297 | }
298 |
299 | def init_engine(self):
300 | self.ui.log("Creating voice conversion model...")
301 | self.ui.set_loading(1)
302 | start = timer()
303 | self.engine = VC()
304 | for stage in self.engine.logged_init():
305 | self.ui.set_loading(stage)
306 | self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
307 | self.ui.set_loading(0)
308 |
309 | def update_seed_textbox(self):
310 | self.ui.update_seed_textbox()
--------------------------------------------------------------------------------
/gui/gui.py:
--------------------------------------------------------------------------------
1 | from engine import Utterance
2 | from .widgets import get_figure_widget, FigureWidget
3 |
4 | import matplotlib.pyplot as plt
5 | from PySide6.QtCore import Qt, QStringListModel
6 | from PySide6.QtGui import QImage, QPixmap
7 | from PySide6.QtWidgets import *
8 |
9 | from pathlib import Path
10 | from typing import List, Set
11 | import sounddevice as sd
12 | import soundfile as sf
13 | import numpy as np
14 | from time import sleep
15 | import umap
16 | import sys
17 | from warnings import filterwarnings, warn
18 | filterwarnings("ignore")
19 |
20 |
21 | colormap = np.array([
22 | [0, 127, 70],
23 | [255, 0, 0],
24 | [255, 217, 38],
25 | [0, 135, 255],
26 | [165, 0, 165],
27 | [255, 167, 255],
28 | [97, 142, 151],
29 | [0, 255, 255],
30 | [255, 96, 38],
31 | [142, 76, 0],
32 | [33, 0, 127],
33 | [0, 0, 0],
34 | [183, 183, 183],
35 | [76, 255, 0],
36 | ], dtype=float) / 255
37 |
38 |
39 | class GUI(QDialog):
40 | min_umap_points = 4
41 | max_log_lines = 5
42 | max_saved_utterances = 20
43 |
44 | def draw_utterance(self, utterance: Utterance, which):
45 | self.draw_mel(utterance.mel, which)
46 | # self.draw_embed(utterance.spk_emb, Path(utterance.path).stem, which)
47 |
48 | def draw_embed(self, spk_emb, name, which):
49 | widget = self.cur_ax_widget if which == "current" else self.gen_ax_widget
50 | embed_ax, _ = widget.axis
51 | embed_ax.figure.suptitle("" if spk_emb is None else name)
52 |
53 | ## Embedding
54 | # Clear the plot
55 | if len(embed_ax.images) > 0:
56 | embed_ax.images[0].colorbar.remove()
57 | embed_ax.clear()
58 |
59 | # Draw speaker embedding
60 | if spk_emb is not None:
61 | embed_ax.set_title("embedding")
62 | embed_ax.set_aspect("equal", "datalim")
63 | embed_ax.set_xticks([])
64 | embed_ax.set_yticks([])
65 | embed_ax.figure.canvas.draw()
66 | widget.update()
67 |
68 | def draw_mel(self, mel, which):
69 | widget = self.cur_ax_widget if which == "current" else self.gen_ax_widget
70 | # _, mel_ax = widget.axis
71 | mel_ax = widget.axis
72 |
73 | ## Spectrogram
74 | # Draw the spectrogram
75 | mel_ax.clear()
76 | if mel is not None:
77 | im = mel_ax.imshow(mel, aspect="auto", origin="lower", interpolation='none')
78 | mel_ax.set_title("mel spectrogram")
79 |
80 | mel_ax.set_xticks([])
81 | mel_ax.set_yticks([])
82 | mel_ax.figure.canvas.draw()
83 | widget.update()
84 | if which != "current":
85 | self.vocode_button.setDisabled(mel is None)
86 |
87 | def draw_umap_projections(self, utterances: Set[Utterance]):
88 | self.umap_ax.clear()
89 |
90 | speakers = np.unique([u.spk_name for u in utterances])
91 | colors = {spk_name: colormap[i] for i, spk_name in enumerate(speakers)}
92 | embeds = [u.spk_emb for u in utterances]
93 |
94 | # Display a message if there aren't enough points
95 | if len(utterances) < self.min_umap_points:
96 | self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" %
97 | (self.min_umap_points - len(utterances)),
98 | horizontalalignment='center', fontsize=15)
99 | self.umap_ax.set_title("")
100 |
101 | # Compute the projections
102 | else:
103 | if not self.umap_hot:
104 | self.log(
105 | "Drawing UMAP projections for the first time, this will take a few seconds.")
106 | self.umap_hot = True
107 |
108 | reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine")
109 | projections = reducer.fit_transform(embeds)
110 |
111 | speakers_done = set()
112 | for projection, utterance in zip(projections, utterances):
113 | color = colors[utterance.spk_name]
114 | mark = "x" if "_gen_" in Path(utterance.path).stem else "o"
115 | label = None if utterance.spk_name in speakers_done else utterance.spk_name
116 | speakers_done.add(utterance.spk_name)
117 | self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark, label=label)
118 | # self.umap_ax.set_title("UMAP projections")
119 | self.umap_ax.legend(prop={'size': 10})
120 |
121 | # Draw the plot
122 | self.umap_ax.set_aspect("equal", "datalim")
123 | self.umap_ax.set_xticks([])
124 | self.umap_ax.set_yticks([])
125 | self.umap_ax.figure.canvas.draw()
126 |
127 | def save_audio_file(self, wav, sample_rate):
128 | dialog = QFileDialog()
129 | dialog.setDefaultSuffix(".wav")
130 | fpath, _ = dialog.getSaveFileName(
131 | parent=self,
132 | caption="Select a path to save the audio file",
133 | filter="Audio Files (*.flac *.wav)"
134 | )
135 | if fpath:
136 |       # Default format is wav
137 | if Path(fpath).suffix == "":
138 | fpath += ".wav"
139 | sf.write(fpath, wav, sample_rate)
140 |
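    |   # NOTE: setup_audio_devices() probes every device reported by sounddevice and
    |   # keeps only those that accept the requested sample rate: the first valid input
    |   # is used for recording and the valid outputs populate the output combo box.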
141 | def setup_audio_devices(self, sample_rate):
142 | input_devices = []
143 | output_devices = []
144 | for device in sd.query_devices():
145 | # Check if valid input
146 | try:
147 | sd.check_input_settings(device=device["name"], samplerate=sample_rate)
148 | input_devices.append(device["name"])
149 |       except Exception:
150 | pass
151 |
152 | # Check if valid output
153 | try:
154 | sd.check_output_settings(device=device["name"], samplerate=sample_rate)
155 | output_devices.append(device["name"])
156 | except Exception as e:
157 | # Log a warning only if the device is not an input
158 |         if device["name"] not in input_devices:
159 | warn("Unsupported output device %s for the sample rate: %d \nError: %s" % (device["name"], sample_rate, str(e)))
160 |
161 | if len(input_devices) == 0:
162 | self.log("No audio input device detected. Recording may not work.")
163 | self.audio_in_device = None
164 | else:
165 | self.audio_in_device = input_devices[0]
166 |
167 | if len(output_devices) == 0:
168 | self.log("No supported output audio devices were found! Audio output may not work.")
169 | self.audio_out_devices_cb.addItems(["None"])
170 | self.audio_out_devices_cb.setDisabled(True)
171 | else:
172 | self.audio_out_devices_cb.clear()
173 | self.audio_out_devices_cb.addItems(output_devices)
174 | self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device)
175 |
176 | self.set_audio_device()
177 |
178 | def set_audio_device(self):
179 |
180 | output_device = self.audio_out_devices_cb.currentText()
181 | if output_device == "None":
182 | output_device = None
183 |
184 | # If None, sounddevice queries portaudio
185 | sd.default.device = (self.audio_in_device, output_device)
186 |
187 | def play(self, wav, sample_rate):
188 | try:
189 | sd.stop()
190 | sd.play(wav, sample_rate)
191 | except Exception as e:
192 | print(e)
193 | self.log("Error in audio playback. Try selecting a different audio output device.")
194 | self.log("Your device must be connected before you start the toolbox.")
195 |
196 | def stop(self):
197 | sd.stop()
198 |
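    |   # NOTE: sd.rec() starts recording asynchronously; the sleep loop below only
    |   # animates the progress bar while audio is captured, and sd.wait() then blocks
    |   # until the full `duration` seconds have been recorded.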
199 | def record_one(self, sample_rate, duration):
200 | self.record_button.setText("Recording...")
201 | self.record_button.setDisabled(True)
202 |
203 | self.log("Recording %d seconds of audio" % duration)
204 | sd.stop()
205 | try:
206 |       wav = sd.rec(int(duration * sample_rate), sample_rate, 1)
207 | except Exception as e:
208 | print(e)
209 | self.log("Could not record anything. Is your recording device enabled?")
210 | self.log("Your device must be connected before you start the toolbox.")
211 |       self.record_button.setText("Record")  # restore the button after a failed recording
    |       self.record_button.setDisabled(False)
    |       return None
212 |
213 | for i in np.arange(0, duration, 0.1):
214 | self.set_loading(i, duration)
215 | sleep(0.1)
216 | self.set_loading(duration, duration)
217 | sd.wait()
218 |
219 | self.log("Done recording.")
220 | self.record_button.setText("Record")
221 | self.record_button.setDisabled(False)
222 |
223 | return wav.squeeze()
224 |
225 | @property
226 | def current_dataset_name(self):
227 | return self.dataset_box.currentText()
228 |
229 | @property
230 | def current_src_spk(self):
231 | return self.src_spk_box.currentText()
232 |
233 | @property
234 | def current_tgt_spk(self):
235 | return self.tgt_spk_box.currentText()
236 |
237 | @property
238 | def current_utterance_name(self):
239 | return self.utterance_box.currentText()
240 |
241 | def browse_file(self):
242 | fpath = QFileDialog().getOpenFileName(
243 | parent=self,
244 | caption="Select an audio file",
245 | filter="Audio Files (*.mp3 *.flac *.wav *.m4a)"
246 | )
247 | return Path(fpath[0]) if fpath[0] != "" else ""
248 |
249 | @staticmethod
250 | def repopulate_box(box, items, random=False):
251 | """
252 | Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join
253 | data to the items
254 | """
255 | box.blockSignals(True)
256 | box.clear()
257 | for item in items:
258 | item = list(item) if isinstance(item, tuple) else [item]
259 | box.addItem(str(item[0]), *item[1:])
260 | if len(items) > 0:
261 | box.setCurrentIndex(np.random.randint(len(items)) if random else 0)
262 | box.setDisabled(len(items) == 0)
263 | box.blockSignals(False)
264 |
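    |   # NOTE: `level` controls how much of the browser is refreshed: level <= 0
    |   # repopulates the dataset box (and disables the browser when no recognized
    |   # dataset is found), level <= 1 repopulates the source/target speaker boxes,
    |   # and level <= 2 repopulates the utterance box.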
265 | def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int, random=True):
266 | # Select a random dataset
267 | if level <= 0:
268 | if datasets_root is not None:
269 | datasets = [datasets_root.joinpath(d) for d in recognized_datasets]
270 | datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
271 | self.browser_load_button.setDisabled(len(datasets) == 0)
272 | if datasets_root is None or len(datasets) == 0:
273 |         msg = ("Warning: you did not pass a root directory for datasets as argument"
274 |           if datasets_root is None else
275 |           "Warning: you do not have any of the recognized datasets in %s" % datasets_root)
276 | self.log(msg)
277 | msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
278 | "can still use the toolbox by recording samples yourself." % \
279 | ("\n\t".join(map(str, recognized_datasets)))
280 | print(msg, file=sys.stderr)
281 |
282 | self.random_utterance_button.setDisabled(True)
283 | self.random_speaker_button.setDisabled(True)
284 | self.random_dataset_button.setDisabled(True)
285 | self.utterance_box.setDisabled(True)
286 | self.src_spk_box.setDisabled(True)
287 | self.tgt_spk_box.setDisabled(True)
288 | self.dataset_box.setDisabled(True)
289 | self.browser_load_button.setDisabled(True)
290 | self.auto_next_checkbox.setDisabled(True)
291 | return
292 | self.repopulate_box(self.dataset_box, datasets, random)
293 |
294 | # Select a random src and tgt speakers
295 | if level <= 1:
296 | speakers_root = datasets_root.joinpath(self.current_dataset_name)
297 | speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()]
298 | self.repopulate_box(self.src_spk_box, speaker_names, random)
299 | self.repopulate_box(self.tgt_spk_box, speaker_names, random)
300 |
301 | # Select a random utterance
302 | if level <= 2:
303 | utterances_root = datasets_root.joinpath(
304 | self.current_dataset_name,
305 | self.current_src_spk
306 | )
307 | utterances = []
308 | for extension in ['mp3', 'flac', 'wav']:
309 | utterances.extend(Path(utterances_root).glob("**/*.%s" % extension))
310 | utterances = [fpath.relative_to(utterances_root) for fpath in utterances]
311 | self.repopulate_box(self.utterance_box, utterances, random)
312 |
313 | def browser_select_next(self):
314 | index = (self.utterance_box.currentIndex() + 1) % self.utterance_box.count()
315 | self.utterance_box.setCurrentIndex(index)
316 |
317 | @property
318 | def selected_utterance(self):
319 | return self.utterance_history.itemData(self.utterance_history.currentIndex())
320 |
321 | def register_utterance(self, utterance: Utterance):
322 | self.utterance_history.blockSignals(True)
323 | self.utterance_history.insertItem(0, Path(utterance.path).stem, utterance)
324 | self.utterance_history.setCurrentIndex(0)
325 | self.utterance_history.blockSignals(False)
326 |
327 | if self.utterance_history.count() > self.max_saved_utterances:
328 | self.utterance_history.removeItem(self.max_saved_utterances)
329 |
330 | self.play_button.setDisabled(False)
331 | self.generate_button.setDisabled(False)
332 | self.synthesize_button.setDisabled(False)
333 |
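    |   # NOTE: log() keeps a rolling buffer of at most `max_log_lines` entries;
    |   # mode="append" extends the last entry and mode="overwrite" replaces it.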
334 | def log(self, line, mode="newline"):
335 | if mode == "newline":
336 | self.logs.append(line)
337 | if len(self.logs) > self.max_log_lines:
338 | del self.logs[0]
339 | elif mode == "append":
340 | self.logs[-1] += line
341 | elif mode == "overwrite":
342 | self.logs[-1] = line
343 | log_text = '\n'.join(self.logs)
344 |
345 | self.log_window.setText(log_text)
346 | self.app.processEvents()
347 |
348 | def set_loading(self, value, maximum=1):
349 |     self.loading_bar.setValue(int(value * 100))
350 |     self.loading_bar.setMaximum(int(maximum * 100))
351 | self.loading_bar.setTextVisible(value != 0)
352 | self.app.processEvents()
353 |
354 | def populate_gen_options(self, seed, trim_silences):
355 | if seed is not None:
356 | self.random_seed_checkbox.setChecked(True)
357 | self.seed_textbox.setText(str(seed))
358 | self.seed_textbox.setEnabled(True)
359 | else:
360 | self.random_seed_checkbox.setChecked(False)
361 | self.seed_textbox.setText(str(0))
362 | self.seed_textbox.setEnabled(False)
363 |
364 | def update_seed_textbox(self):
365 | if self.random_seed_checkbox.isChecked():
366 | self.seed_textbox.setEnabled(True)
367 | else:
368 | self.seed_textbox.setEnabled(False)
369 |
370 | def reset_interface(self):
371 | # self.draw_embed(None, None, "current")
372 | # self.draw_embed(None, None, "generated")
373 | self.draw_mel(None, "current")
374 | self.draw_mel(None, "generated")
375 | # self.draw_umap_projections(set())
376 | self.set_loading(0)
377 | self.play_button.setDisabled(True)
378 | self.generate_button.setDisabled(True)
379 | self.synthesize_button.setDisabled(True)
380 | self.vocode_button.setDisabled(True)
381 | self.replay_wav_button.setDisabled(True)
382 | self.export_wav_button.setDisabled(True)
383 |     for _ in range(self.max_log_lines):
    |       self.log("")
384 |
385 | def __init__(self):
386 | ## Initialize the application
387 | self.app = QApplication(sys.argv)
388 | super().__init__(None)
389 | self.setWindowTitle("Voice Conversion app")
390 |
391 |
392 | ## Main layouts
393 | # Root
394 | root_layout = QGridLayout()
395 | self.setLayout(root_layout)
396 |
397 | # Browser
398 | browser_layout = QGridLayout()
399 | root_layout.addLayout(browser_layout, 0, 0, 1, 2)
400 |
401 | # Generation
402 | gen_layout = QVBoxLayout()
403 | root_layout.addLayout(gen_layout, 1, 0, 1, 2)
404 |
405 | # Projections
406 | self.projections_layout = QVBoxLayout()
407 | root_layout.addLayout(self.projections_layout, 1, 2, 1, 1)
408 |
409 | # Visualizations
410 | vis_layout = QVBoxLayout()
411 | root_layout.addLayout(vis_layout, 0, 2, 1, 1)
412 |
413 |
414 | ## Projections
415 | # UMap
416 | fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0")
417 | fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98)
418 | self.projections_layout.addWidget(get_figure_widget(fig))
419 | self.umap_hot = False
420 | self.clear_button = QPushButton("Clear")
421 | self.projections_layout.addWidget(self.clear_button)
422 |
423 |
424 | ## Browser
425 | # Dataset, speaker and utterance selection
426 | i = 0
427 | self.dataset_box = QComboBox()
428 | browser_layout.addWidget(QLabel("Dataset"), i, 0)
429 | browser_layout.addWidget(self.dataset_box, i + 1, 0)
430 | self.src_spk_box = QComboBox()
431 | browser_layout.addWidget(QLabel("Source speaker"), i, 1)
432 | browser_layout.addWidget(self.src_spk_box, i + 1, 1)
433 | self.utterance_box = QComboBox()
434 | browser_layout.addWidget(QLabel("Utterance"), i, 2)
435 | browser_layout.addWidget(self.utterance_box, i + 1, 2)
436 | self.browser_load_button = QPushButton("Load")
437 | browser_layout.addWidget(self.browser_load_button, i + 1, 3)
438 | i += 2
439 |
440 | # Random buttons
441 | self.random_dataset_button = QPushButton("Random")
442 | browser_layout.addWidget(self.random_dataset_button, i, 0)
443 | self.random_speaker_button = QPushButton("Random")
444 | browser_layout.addWidget(self.random_speaker_button, i, 1)
445 | self.random_utterance_button = QPushButton("Random")
446 | browser_layout.addWidget(self.random_utterance_button, i, 2)
447 | self.auto_next_checkbox = QCheckBox("Auto select next")
448 | self.auto_next_checkbox.setChecked(True)
449 | browser_layout.addWidget(self.auto_next_checkbox, i, 3)
450 | i += 1
451 |
452 | # Utterance box
453 | browser_layout.addWidget(QLabel("Use source from:"), i, 0)
454 | self.utterance_history = QComboBox()
455 | browser_layout.addWidget(self.utterance_history, i, 1, 1, 3)
456 | i += 1
457 |
458 | # Random & next utterance buttons
459 | self.browser_browse_button = QPushButton("Browse")
460 | browser_layout.addWidget(self.browser_browse_button, i, 0)
461 | self.record_button = QPushButton("Record")
462 | browser_layout.addWidget(self.record_button, i, 1)
463 | self.play_button = QPushButton("Play")
464 | browser_layout.addWidget(self.play_button, i, 2)
465 | self.stop_button = QPushButton("Stop")
466 | browser_layout.addWidget(self.stop_button, i, 3)
467 | i += 1
468 |
469 |
470 | # Model and audio output selection
471 | self.tgt_spk_box = QComboBox()
472 | browser_layout.addWidget(QLabel("Target speaker"), i, 0)
473 | browser_layout.addWidget(self.tgt_spk_box, i + 1, 0)
474 |
475 |     self.audio_out_devices_cb = QComboBox()
476 | browser_layout.addWidget(QLabel("Audio Output"), i, 1)
477 | browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 1)
478 | i += 2
479 |
480 |     # Replay & Save Audio
481 | browser_layout.addWidget(QLabel("Toolbox Output:"), i, 0)
482 | self.wavs_cb = QComboBox()
483 | self.wavs_cb_model = QStringListModel()
484 | self.wavs_cb.setModel(self.wavs_cb_model)
485 |     self.wavs_cb.setToolTip("Select one of the wavs generated in this session for replaying or exporting")
486 | browser_layout.addWidget(self.wavs_cb, i, 1)
487 | self.replay_wav_button = QPushButton("Replay")
488 |     self.replay_wav_button.setToolTip("Replay the last generated vocoder output")
489 | browser_layout.addWidget(self.replay_wav_button, i, 2)
490 | self.export_wav_button = QPushButton("Export")
491 |     self.export_wav_button.setToolTip("Save the last generated vocoder audio to the filesystem as a wav file")
492 | browser_layout.addWidget(self.export_wav_button, i, 3)
493 | i += 1
494 |
495 |
496 | ## Embed & spectrograms
497 | vis_layout.addStretch()
498 |
499 | gridspec_kw = {"width_ratios": [1]}
500 | fig, cur_ax = plt.subplots(
501 | 1, 1, figsize=(5, 2), gridspec_kw=gridspec_kw
502 | )
503 | fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
504 | self.cur_ax_widget = FigureWidget(fig, cur_ax)
505 | vis_layout.addWidget(self.cur_ax_widget)
506 |
507 | fig, gen_ax = plt.subplots(
508 | 1, 1, figsize=(5, 2), gridspec_kw=gridspec_kw
509 | )
510 | fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
511 | self.gen_ax_widget = FigureWidget(fig, gen_ax)
512 | vis_layout.addWidget(self.gen_ax_widget)
513 |
514 | # for ax in self.cur_ax_widget.axis.tolist() + self.gen_ax_widget.axis.tolist():
515 | for ax in [self.cur_ax_widget.axis, self.gen_ax_widget.axis]:
516 | ax.set_facecolor("#F0F0F0")
517 | for side in ["top", "right", "bottom", "left"]:
518 | ax.spines[side].set_visible(False)
519 |
520 |
521 | ## Generation
522 | layout = QHBoxLayout()
523 | self.generate_button = QPushButton("Synthesize and vocode")
524 | layout.addWidget(self.generate_button)
525 | self.synthesize_button = QPushButton("Synthesize only")
526 | layout.addWidget(self.synthesize_button)
527 | self.vocode_button = QPushButton("Vocode only")
528 | layout.addWidget(self.vocode_button)
529 | gen_layout.addLayout(layout)
530 |
531 | layout_seed = QGridLayout()
532 | self.random_seed_checkbox = QCheckBox("Random seed:")
533 | self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
534 | layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
535 | self.seed_textbox = QLineEdit()
536 | self.seed_textbox.setMaximumWidth(80)
537 | layout_seed.addWidget(self.seed_textbox, 0, 1)
538 | gen_layout.addLayout(layout_seed)
539 |
540 | self.loading_bar = QProgressBar()
541 | gen_layout.addWidget(self.loading_bar)
542 |
543 | self.log_window = QLabel()
544 | self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft)
545 | gen_layout.addWidget(self.log_window)
546 | self.logs = []
547 | gen_layout.addStretch()
548 |
549 |
550 | ## Set the size of the window and of the elements
551 | max_size = self.screen().availableGeometry().size() * 0.7
552 | self.resize(max_size)
553 |
554 | ## Finalize the display
555 | self.reset_interface()
556 | self.show()
557 |
558 | def start(self):
559 | self.app.exec_()
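    |
    | # Minimal usage sketch (for orientation only): the real entry point elsewhere in
    | # the repository may wire the GUI to the conversion engine differently, and the
    | # sample rate below is an assumed placeholder rather than a project constant.
    | #
    | #   gui = GUI()                      # builds the Qt layout and shows the window
    | #   gui.setup_audio_devices(22050)   # probe audio devices at an assumed rate
    | #   gui.start()                      # enter the Qt event loop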
--------------------------------------------------------------------------------