├── test ├── __init__.py ├── conftest.py └── test_engine.py ├── engine ├── vc │ └── __init__.py ├── __init__.py ├── vocoder │ ├── __init__.py │ ├── utils.py │ └── hifi_gan.py ├── loading.py ├── feature_extraction.py └── general.py ├── datasets ├── remembered_dataset_pathes.txt └── test │ └── example │ ├── example.mp3 │ └── example.wav ├── gui ├── __init__.py ├── widgets.py └── gui.py ├── utils ├── __init__.py ├── hparams.py └── data.py ├── img_dashboard.png ├── .gitignore ├── requirements.txt ├── app.py ├── config.yaml ├── README.md ├── dist ├── README.md └── warn_processing.py ├── LICENSE └── toolbox.py /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /engine/vc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/remembered_dataset_pathes.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gui/__init__.py: -------------------------------------------------------------------------------- 1 | from .gui import GUI -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import load_wav -------------------------------------------------------------------------------- /engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .general import VC, Utterance -------------------------------------------------------------------------------- /engine/vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .hifi_gan import Generator as HifiGenerator -------------------------------------------------------------------------------- /img_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/img_dashboard.png -------------------------------------------------------------------------------- /datasets/test/example/example.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/datasets/test/example/example.mp3 -------------------------------------------------------------------------------- /datasets/test/example/example.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SolomidHero/real-time-voice-conversion/HEAD/datasets/test/example/example.wav -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache/ 2 | .pytest_cache/ 3 | **/__pycache__/ 4 | 5 | **/.DS_Store 6 | 7 | # deployment cache 8 | dist/build/ 9 | dist/dist/ 10 | dist/hooks/ 11 | dist/VCToolbox.spec -------------------------------------------------------------------------------- /utils/hparams.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | from pathlib import Path 3 | import __main__ 4 | 5 | 6 | root_dir = 
Path(__file__).parent.parent.resolve() 7 | cfg = OmegaConf.load(root_dir / 'config.yaml') 8 | cfg.root_dir = str(root_dir.resolve()) -------------------------------------------------------------------------------- /engine/vocoder/utils.py: -------------------------------------------------------------------------------- 1 | def init_weights(m, mean=0.0, std=0.01): 2 | classname = m.__class__.__name__ 3 | if classname.find("Conv") != -1: 4 | m.weight.data.normal_(mean, std) 5 | 6 | 7 | def get_padding(kernel_size, dilation=1): 8 | return int((kernel_size*dilation - dilation)/2) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.3.2 2 | torch==1.8.1 3 | umap_learn==0.5.1 4 | omegaconf==2.0.6 5 | Resemblyzer==0.1.1.dev0 6 | numpy==1.20.1 7 | matplotlib==3.4.1 8 | sounddevice==0.4.1 9 | pytest==6.2.2 10 | gdown==3.12.2 11 | librosa==0.8.0 12 | PySide6==6.0.3 13 | PyYAML==5.4.1 14 | soundfile==0.10.3.post1 15 | umap-learn==0.5.1 16 | -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import numpy as np 4 | from utils import load_wav 5 | 6 | 7 | @pytest.fixture(scope="session") 8 | def example_wav(): 9 | wav = load_wav( 10 | os.path.join(os.path.dirname(__file__), "../datasets/test/example/example.wav") 11 | ) 12 | assert len(wav.shape) == 1 13 | return wav 14 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from toolbox import Toolbox 3 | import argparse 4 | from pathlib import Path 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser( 9 | description="Runs the toolbox", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | parser.add_argument("-d", "--datasets_root", type=Path, help= \ 13 | "Path to the directory containing your datasets.", default=Path(__file__).parent / 'datasets') 14 | parser.add_argument("--seed", type=int, default=17, help=\ 15 | "Optional random number seed value to make toolbox deterministic.") 16 | args = parser.parse_args() 17 | 18 | # Launch the toolbox 19 | Toolbox(**vars(args)) -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | data: # shouldn't be changed 2 | sample_rate: 16000 3 | n_mels: 80 4 | n_fft: 1280 5 | win_length: 1280 6 | hop_length: 320 7 | f_min: 50. 
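  # added note: with sample_rate 16000, hop_length 320 and win_length 1280 correspond to 20 ms and 80 ms analysis frames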
8 | f_max: null 9 | preemph: 0.97 10 | 11 | root_dir: null # will be defined onstart 12 | ckpt_default_path: '.cache' 13 | ckpt_dict: 14 | hifi_gan: 15 | 'generator': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v3.0/generator_pt330', 'wget'] 16 | 'config.yaml': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v3.0/config_v1_4.json', 'wget'] 17 | fragmentvc: 18 | 'model.pt': ['https://github.com/SolomidHero/FragmentVC-with-RAdam/releases/download/v4.5/fragmentvc_v4_5_stage2.pt', 'wget'] -------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | from .hparams import cfg 2 | 3 | import librosa 4 | import numpy as np 5 | import soundfile as sf 6 | from pathlib import Path 7 | import os 8 | 9 | 10 | def load_wav(path): 11 | wav, _ = librosa.load(path, sr=cfg.data.sample_rate) 12 | wav = librosa.util.normalize(wav) * 0.95 13 | 14 | return wav 15 | 16 | 17 | def save_wav(path, wav, sr): 18 | """Save audio to path""" 19 | wav = np.clip(wav, -1.0, 1.0) 20 | sf.write(path, wav, sr) 21 | 22 | 23 | def get_subdirs(dir_path): 24 | dir_path = Path(dir_path) 25 | dirnames = [p.stem for p in dir_path.iterdir() if p.is_dir()] 26 | return dirnames 27 | 28 | 29 | def has_ext(filepath, ext): 30 | if isinstance(ext, str): 31 | return Path(filepath).match(f'*{ext}') 32 | else: 33 | return any(Path(filepath).match(f'*{e}') for e in ext) 34 | 35 | 36 | def get_filepathes(dir_path, ext='.wav'): 37 | for d_path, _, f_names in os.walk(dir_path): 38 | for fn in f_names: 39 | if not has_ext(fn, ext): 40 | continue 41 | 42 | yield os.path.join(d_path, fn) 43 | -------------------------------------------------------------------------------- /gui/widgets.py: -------------------------------------------------------------------------------- 1 | from PySide6.QtGui import QPixmap 2 | from PySide6.QtWidgets import QLabel 3 | from PySide6.QtCore import Signal 4 | 5 | import io 6 | 7 | def get_figure_widget(fig): 8 | buf = io.BytesIO() 9 | fig.savefig(buf, format='png') 10 | buf.seek(0) 11 | 12 | pixmap = QPixmap() 13 | pixmap.loadFromData(buf.read()) 14 | buf.close() 15 | 16 | fig_label = QLabel() 17 | fig_label.setPixmap(pixmap) 18 | 19 | return fig_label 20 | 21 | 22 | def get_figure_pixmap(fig): 23 | buf = io.BytesIO() 24 | fig.savefig(buf, format='png', transparent=True) 25 | buf.seek(0) 26 | 27 | pixmap = QPixmap() 28 | pixmap.loadFromData(buf.read()) 29 | buf.close() 30 | 31 | return pixmap 32 | 33 | 34 | 35 | class FigureWidget(QLabel): 36 | draw_signal = Signal() 37 | 38 | def __init__(self, fig, axis): 39 | super().__init__() 40 | self.fig = fig 41 | self.axis = axis 42 | self.draw_signal.connect(self.draw) 43 | 44 | def update(self): 45 | self.draw_signal.emit() 46 | 47 | def draw(self): 48 | self.setPixmap(get_figure_pixmap(self.fig)) 49 | 50 | -------------------------------------------------------------------------------- /test/test_engine.py: -------------------------------------------------------------------------------- 1 | from utils.hparams import cfg 2 | from engine import VC, Utterance 3 | 4 | import numpy as np 5 | import torch 6 | import pytest 7 | 8 | @pytest.fixture(scope='module') 9 | def engine(): 10 | return VC() 11 | 12 | @pytest.fixture(scope='module') 13 | def src(example_wav): 14 | return example_wav 15 | 16 | @pytest.fixture(scope='module') 17 | def tgts(example_wav): 18 | return [ 19 | example_wav, 20 | 
example_wav[:len(example_wav) // 2],
21 |     ]
22 | 
23 | 
24 | def test_prepare(engine, src, tgts):
25 |     src_features, (tgt_mel, tgt_spk_emb) = engine.prepare(src, tgts)
26 | 
27 |     assert len(src_features.shape) == 3
28 |     assert len(tgt_mel.shape) == 3
29 |     assert len(tgt_spk_emb.shape) == 2
30 | 
31 | 
32 | def test_convert(engine, src, tgts):
33 |     mel = engine.convert(*engine.prepare(src, tgts))
34 | 
35 |     assert len(mel.shape) == 3
36 | 
37 | 
38 | def test_vocode(engine, src):
39 |     src = torch.from_numpy(src).unsqueeze(0)
40 |     mel = engine._get_mel(src)
41 | 
42 |     wav = engine.vocode(mel)
43 | 
44 |     assert len(wav.shape) == 2
45 | 
46 | 
47 | def test_e2e(engine, src, tgts):
48 |     # using built-in __call__
49 |     src_utt = Utterance(wav=src)
50 |     tgt_utts = [Utterance(wav=wav) for wav in tgts]
51 |     out1 = engine(src_utt, tgt_utts)
52 | 
53 |     # using step-by-step
54 |     out2 = engine.convert(*engine.prepare(src, tgts))
55 |     out2 = engine.vocode(out2).cpu().numpy()
56 | 
57 |     assert np.allclose(out1, out2)
58 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-Time Voice Conversion
2 | 
3 | ![Toolbox View](img_dashboard.png)
4 | 
5 | This repository implements a simple toolbox for voice conversion. It builds on one of the latest mel-based source-to-target one-shot voice conversion models and a fast GAN-based vocoder.
6 | 
7 | ## Installation
8 | 
9 | The application file of every release was created with [PyInstaller](https://pypi.org/project/pyinstaller/), which makes it possible to build it for many popular platforms (Windows, Mac OS X, GNU/Linux, etc.). Download the release archive for your OS, then simply start the application file named `VCToolbox` from the unarchived folder.
10 | 
11 | Available OS releases:
12 | 
13 | - MacOS
14 | - Linux
15 | 
16 | If you want to distribute for other platforms, follow the instructions in `dist/README.md`.
17 | 
18 | ## Installation (Development build)
19 | 
20 | 1. Install the requirements (Python 3.7+ was tested OK):
21 | 
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 | 
26 | **Note:** for mp3 support you should have ffmpeg installed (via `brew` on MacOS, `apt-get` on Linux, or static builds on Windows).
27 | 
28 | 2. Everything is now ready to launch the toolbox. The required models will be downloaded on the first application run. Run the toolbox with the following command:
29 | 
30 | ```bash
31 | python3 app.py
32 | ```
33 | 
34 | 3*. You may want to run the tests for this repo. This can be done via:
35 | 
36 | ```
37 | pytest
38 | ```
39 | 
40 | **Note:** this will also download the models if they are not present yet.
41 | 
42 | ## Credits
43 | 
44 | The author thanks the developer of [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) for cool design and implementation ideas.
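
For reference, here is a minimal programmatic usage sketch of the engine behind the toolbox. This script is not part of the repo; it is only an assumption-level example built from the public `engine`/`utils` API exercised by `test/test_engine.py`, using the example audio shipped under `datasets/`:

```python
# Hypothetical standalone script (not shipped with the toolbox), run from the repo root.
from engine import VC, Utterance
from utils import load_wav
from utils.data import save_wav
from utils.hparams import cfg

engine = VC()
for _ in engine.logged_init():   # downloads/loads all checkpoints on first run
    pass

src = Utterance(wav=load_wav("datasets/test/example/example.wav"))
tgts = [Utterance(wav=load_wav("datasets/test/example/example.wav"))]  # target speaker references

out = engine(src, tgts)                                  # numpy array of shape (1, num_samples)
save_wav("converted.wav", out.squeeze(0), cfg.data.sample_rate)
```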
45 | -------------------------------------------------------------------------------- /engine/loading.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | from utils.hparams import cfg 3 | from .vocoder import HifiGenerator 4 | 5 | import os 6 | import torch 7 | import json 8 | 9 | ckpt_dir = os.path.join(cfg.root_dir, cfg.ckpt_default_path) 10 | ckpt_dict = cfg.ckpt_dict 11 | 12 | 13 | def get_vocoder(): 14 | group_name = 'hifi_gan' 15 | download_group(group_name) 16 | 17 | config_path = os.path.join(ckpt_dir, group_name, "config.yaml") 18 | json_config = json.loads(open(config_path).read()) 19 | with torch.no_grad(): 20 | generator = HifiGenerator(DictConfig(json_config)).eval() 21 | 22 | ckpt_path = os.path.join(ckpt_dir, group_name, "generator") 23 | state = torch.load(ckpt_path, map_location=torch.device('cpu')) 24 | generator.load_state_dict(state['generator']) 25 | generator.remove_weight_norm() 26 | 27 | return generator 28 | 29 | 30 | def get_vc_model(): 31 | group_name = 'fragmentvc' 32 | download_group(group_name) 33 | 34 | ckpt_path = os.path.join(ckpt_dir, group_name, "model.pt") 35 | model = torch.jit.load(ckpt_path).eval() 36 | 37 | return model 38 | 39 | 40 | def download_group(group_name): 41 | for filename, (url, agent) in ckpt_dict[group_name].items(): 42 | filepath = os.path.join(ckpt_dir, group_name, filename) 43 | _download(filepath, url, agent=agent) 44 | 45 | 46 | def _download(filepath, url, refresh=False, agent='wget'): 47 | ''' 48 | Download from url into filepath using agent if needed 49 | Ref: https://github.com/s3prl/s3prl 50 | ''' 51 | 52 | dirpath = os.path.dirname(filepath) 53 | os.makedirs(dirpath, exist_ok=True) 54 | 55 | if not os.path.isfile(filepath) or refresh: 56 | if agent == 'wget': 57 | os.system(f'wget {url} -O {filepath}') 58 | elif agent == 'gdown': 59 | import gdown 60 | gdown.download(url, filepath, use_cookies=False) 61 | else: 62 | print('[Download] - Unknown download agent. 
Only \'wget\' and \'gdown\' are supported.') 63 | raise NotImplementedError 64 | else: 65 | print(f'Using checkpoint found in {filepath}') 66 | -------------------------------------------------------------------------------- /engine/feature_extraction.py: -------------------------------------------------------------------------------- 1 | from resemblyzer import VoiceEncoder 2 | from transformers import Wav2Vec2Model 3 | 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | from librosa.filters import mel as librosa_mel_fn 8 | from utils.hparams import cfg 9 | 10 | 11 | def load_pretrained_spk_emb(device='cpu'): 12 | """Load speaker embedding model""" 13 | 14 | model = VoiceEncoder().to(device).eval() 15 | model.requires_grad_(False) 16 | return model 17 | 18 | 19 | def load_pretrained_feature_extractor(device='cpu', ckpt_path='facebook/wav2vec2-base-960h'): 20 | """Load pretrained Wav2Vec model.""" 21 | 22 | def extract_features(self, wav, mask): 23 | # wav2vec has window of 400, so we pad to center windows 24 | wav = torch.nn.functional.pad(wav.unsqueeze(1), (200, 200), mode='reflect').squeeze(1) 25 | return [self(wav).last_hidden_state] 26 | 27 | Wav2Vec2Model.extract_features = extract_features # for same behaviour as fairseq.Wav2Vec2Model 28 | model = Wav2Vec2Model.from_pretrained(ckpt_path).eval() 29 | model.requires_grad_(False) 30 | return model 31 | 32 | 33 | class Wav2Mel(nn.Module): 34 | def __init__(self, n_fft, hop_length, win_length, 35 | sample_rate, n_mels, f_min, f_max, preemph 36 | ): 37 | super().__init__() 38 | 39 | window = torch.hann_window(win_length).float() 40 | self.register_buffer("window", window) 41 | 42 | mel_basis = torch.from_numpy(librosa_mel_fn( 43 | sample_rate, n_fft, n_mels, f_min, f_max 44 | )).float() 45 | self.register_buffer("mel_basis", mel_basis) 46 | 47 | preemph_kernel = torch.FloatTensor([[[-preemph, 1]]]) 48 | self.register_buffer("preemph_kernel", preemph_kernel) 49 | 50 | self.n_fft = n_fft 51 | self.hop_length = hop_length 52 | self.win_length = win_length 53 | self.sample_rate = sample_rate 54 | self.n_mels = n_mels 55 | 56 | def forward(self, wav): 57 | n_pad = self.n_fft // 2 58 | 59 | while len(wav.shape) < 3: 60 | wav = wav.unsqueeze(0) 61 | 62 | wav = torch.nn.functional.conv1d(wav, self.preemph_kernel, padding=1)[:, :, :-1] 63 | 64 | wav = F.pad(wav, (n_pad, n_pad), "reflect").squeeze(0) 65 | spec = torch.stft(wav, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, 66 | window=self.window, center=False, return_complex=True 67 | ).abs() 68 | 69 | mel = torch.matmul(self.mel_basis, spec) 70 | log_mel = torch.log(torch.clamp(mel, min=1e-5)) 71 | 72 | return log_mel -------------------------------------------------------------------------------- /dist/README.md: -------------------------------------------------------------------------------- 1 | # Toolbox package building guide 2 | 3 | We use OS dependent utilite PyInstaller as package building tool. Result of steps below is a platform dependent for single application. It can be used directly on same platform as guide instructions are completed. For other platforms build on virtual machines or containers. [Read more about](https://pyinstaller.readthedocs.io/en/stable/usage.html) build tool. 
4 | 
5 | **Note:** using a Python virtual environment is highly recommended:
6 | ```
7 | # creation
8 | virtualenv ~/install_env
9 | source ~/install_env/bin/activate
10 | 
11 | # after use (delete if needed)
12 | deactivate
13 | rm -rf ~/install_env
14 | ```
15 | 
16 | The distribution procedure for this repo's toolbox consists of several steps (all steps are performed from the `dist/` directory).
17 | 
18 | ## MacOS
19 | 
20 | 1. Install all required packages from requirements.txt. Also install pyinstaller and uninstall some deprecated (but installed) packages:
21 | ```bash
22 | pip install -r ../requirements.txt
23 | pip install pyinstaller
24 | pip uninstall typing dataclasses
25 | ```
26 | 
27 | 2. Try a first build. It will definitely fail, but it provides us with an important warning log:
28 | ```bash
29 | pyinstaller --name="VCToolbox" --windowed --add-data="../config.yaml:./" --add-data="../datasets/*:datasets/" --hidden-import=typing_extensions -y --onefile ../app.py
30 | ```
31 | 
32 | **Note:** `--onefile` is optional and is not a fully supported option with PySide6 (the main Qt for Python package used in the toolbox).
33 | 
34 | After the command above, there will be two new directories ('build/' and 'dist/'). We don't actually need the 'dist/' folder yet, even though it stores our releasing app; the generated 'build/VCToolbox/warn-VCToolbox.txt' is what will be used in the next stages.
35 | 
36 | 3. Define installation hooks for the modules that were not found. For this, run the `warn_processing.py` file:
37 | ```bash
38 | python3 warn_processing.py
39 | ```
40 | 
41 | **Important:** If the script fails on the Python installation itself (there will be a string containing a Python path), you should add that string to `remove_list` in `warn_processing.py` and run the script again.
42 | 
43 | This stage is cumbersome and depends on the previous one. Some important modules that weren't found (such as librosa, etc.) are already added in the script. After the command, a `hooks/` directory will be generated with files (~60 files) defining hooks for PyInstaller. The tree will look like:
44 | 
45 | ```
46 | .
47 | ├── README.md
48 | ├── VCToolbox.spec
49 | ├── build
50 | ├── dist
51 | ├── hooks
52 | └── warn_processing.py
53 | ```
54 | 
55 | You can remove the unnecessary 'dist/' and 'build/' folders for now:
56 | ```
57 | rm -rf dist/ build/
58 | ```
59 | 
60 | 4. Build with hooks for the packages that were not found:
61 | ```bash
62 | pyinstaller --name="VCToolbox" --windowed --hidden-import=typing_extensions -y --additional-hooks-dir=hooks --onefile ../app.py
63 | ```
64 | 
65 | If everything is done right, there will be an executable file in the dist/ folder.
66 | 
67 | 5. (optional) For distribution purposes, move the app binary into the root folder, because it uses the `config.yaml` and `datasets/` paths. Then zip it into an archive, or use another utility for `.app` and `.dmg` creation.
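
A minimal sketch of this final packaging step (the archive name and exact paths below are only illustrative assumptions; adjust them to your release layout):

```bash
# run from dist/: copy the built binary next to config.yaml and datasets/ in the repo root, then archive
cp dist/VCToolbox ..
cd .. && zip -r VCToolbox-macos.zip VCToolbox config.yaml datasets/
```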
68 | 69 | -------------------------------------------------------------------------------- /dist/warn_processing.py: -------------------------------------------------------------------------------- 1 | ### Define next the folder to create the hooks files and the warning file to read the modules from 2 | output_hooks_dir = 'hooks' 3 | warning_file = 'build/VCToolbox/warn-VCToolbox.txt' 4 | 5 | import re 6 | import os 7 | import shutil 8 | 9 | shutil.rmtree(output_hooks_dir, ignore_errors=True) 10 | os.makedirs(output_hooks_dir, exist_ok=True) 11 | 12 | with open(warning_file) as file: 13 | files_content = file.readlines() 14 | 15 | clean_content = [] 16 | for line in files_content: 17 | if re.search('missing module named',line): 18 | temp_line = re.sub('.*imported by ','',line) 19 | temp_line = re.sub('\n',', ',temp_line) 20 | clean_content.append(temp_line) 21 | clean_content = list(set(clean_content)) 22 | joined_content = ''.join(clean_content) 23 | clean_content = list(set(joined_content.split('), '))) 24 | 25 | modules_toplevel = [] 26 | for line in clean_content: 27 | if re.search('top-level',line): 28 | temp_mod = re.sub(' \(.*','',line) 29 | temp_mod = re.sub('\..*','',temp_mod) 30 | modules_toplevel.append(temp_mod) 31 | modules_toplevel = list(set(modules_toplevel)) 32 | 33 | modules_conditional = [] 34 | for line in clean_content: 35 | if re.search('conditional',line): 36 | temp_mod = re.sub(' \(.*','',line) 37 | temp_mod = re.sub('\..*','',temp_mod) 38 | modules_conditional.append(temp_mod) 39 | modules_conditional = list(set(modules_conditional)) 40 | 41 | modules_delayed = [] 42 | for line in clean_content: 43 | if re.search('delayed',line): 44 | temp_mod = re.sub(' \(.*','',line) 45 | temp_mod = re.sub('\..*','',temp_mod) 46 | modules_delayed.append(temp_mod) 47 | modules_delayed = list(set(modules_delayed)) 48 | 49 | modules_optional = [] 50 | for line in clean_content: 51 | if re.search('optional',line): 52 | temp_mod = re.sub(' \(.*','',line) 53 | temp_mod = re.sub('\..*','',temp_mod) 54 | modules_optional.append(temp_mod) 55 | modules_optional = list(set(modules_optional)) 56 | 57 | all_modules = modules_toplevel + modules_conditional + modules_delayed + modules_optional 58 | all_modules = list(set(all_modules)) 59 | 60 | print(all_modules) 61 | print('Number of found modules:', len(all_modules)) 62 | 63 | ### Optional: remove any of the modules 64 | remove_list = [ 65 | '/usr/local/lib/python3', 66 | '/Users/sotomi/envs/pyinstaller-env/lib/python3', 67 | 'zipimport', 68 | 'test', 69 | ] 70 | add_list = [ 71 | 'sacremoses', 72 | 'resemblyzer', 73 | 'librosa', 74 | ] 75 | for pkg in remove_list: 76 | if pkg in all_modules: 77 | all_modules.remove(pkg) 78 | for pkg in add_list: 79 | if pkg not in all_modules: 80 | all_modules.append(pkg) 81 | 82 | print('Total number of requested modules:', len(all_modules)) 83 | 84 | ### Optional: Change all_modules by any of the other lists, e.g. 
modules_toplevel 85 | for module in all_modules: 86 | output_content = 'from PyInstaller.utils.hooks import collect_all\n\ndatas, binaries, hiddenimports = collect_all(\''+module+'\')' 87 | with open(output_hooks_dir+'/hook-'+str(module)+'.py', 'w') as f: 88 | f.write(output_content) 89 | -------------------------------------------------------------------------------- /engine/general.py: -------------------------------------------------------------------------------- 1 | from .loading import get_vocoder, get_vc_model 2 | from .feature_extraction import load_pretrained_spk_emb, load_pretrained_feature_extractor, Wav2Mel 3 | from utils.hparams import cfg 4 | 5 | from resemblyzer import preprocess_wav 6 | import torch 7 | import numpy as np 8 | import librosa 9 | from pathlib import Path 10 | from dataclasses import dataclass, field 11 | from typing import List 12 | 13 | 14 | @dataclass 15 | class Utterance: 16 | """Audio with its cached features""" 17 | wav: np.array = field(repr=False, default=None) 18 | sr: int = None 19 | path: str = None 20 | spk_name: str = None 21 | mel: np.ndarray = field(repr=False, default=None) 22 | spk_emb: np.array = field(repr=False, default=None) 23 | features: np.ndarray = field(repr=False, default=None) 24 | 25 | 26 | def clear(self): 27 | self.wav = None 28 | self.mel = None 29 | self.features = None 30 | 31 | def get_id(self): 32 | if self.path is None or self.spk_name is None: 33 | return 34 | return (self.spk_name, Path(self.path).stem) 35 | 36 | def __eq__(self, other): 37 | return self.get_id() == other.get_id() 38 | 39 | def __hash__(self): 40 | return hash(self.get_id()) 41 | 42 | 43 | class VC: 44 | def __init__(self): 45 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 46 | self.feature_extractor = None 47 | self.mel_extractor = None 48 | self.spk_emb_extractor = None 49 | self.model = None 50 | self.vocoder = None 51 | 52 | @staticmethod 53 | def yield_init_stage(stage_num): 54 | return stage_num / 5 55 | 56 | def logged_init(self): 57 | yield self.yield_init_stage(0) 58 | self.feature_extractor = load_pretrained_feature_extractor(device=self.device) 59 | 60 | yield self.yield_init_stage(1) 61 | self.mel_extractor = Wav2Mel(**cfg.data) 62 | 63 | yield self.yield_init_stage(2) 64 | self.spk_emb_extractor = load_pretrained_spk_emb(device=self.device) 65 | 66 | yield self.yield_init_stage(3) 67 | self.model = get_vc_model().to(self.device) 68 | 69 | yield self.yield_init_stage(4) 70 | self.vocoder = get_vocoder().to(self.device) 71 | 72 | yield self.yield_init_stage(5) 73 | 74 | def __call__(self, src: Utterance, tgts: List[Utterance], input_sr: int = cfg.data.sample_rate): 75 | """Convert source utterance from source speaker to target speaker""" 76 | 77 | # preparation 78 | src_features, tgt_features = self.prepare(src.wav, [tgt.wav for tgt in tgts], input_sr=input_sr) 79 | 80 | # conversion 81 | out_mel = self.convert(src_features, tgt_features) 82 | 83 | # vocoding 84 | out_wav = self.vocode(out_mel) 85 | 86 | return out_wav.cpu().numpy() 87 | 88 | def prepare(self, src_wav, tgt_wavs, input_sr=cfg.data.sample_rate): 89 | src_wav = torch.from_numpy(src_wav).to(self.device) 90 | if len(src_wav.shape) == 1: 91 | src_wav = src_wav.unsqueeze(0) 92 | tgt_wav = torch.from_numpy( 93 | np.concatenate(tgt_wavs) 94 | ).to(self.device).unsqueeze(0) 95 | 96 | src_features = self._get_features(src_wav) 97 | tgt_spk_emb = self._get_spk_emb(tgt_wavs, input_sr) 98 | tgt_mel = self.mel_extractor(tgt_wav) 99 | 100 | return src_features, 
(tgt_mel, tgt_spk_emb) 101 | 102 | def convert(self, src_features, tgt_features): 103 | tgt_mel, tgt_spk_emb = tgt_features 104 | with torch.no_grad(): 105 | out_mel, _, _, _ = self.model(src_features, tgt_mel, ref_embs=tgt_spk_emb) 106 | return out_mel 107 | 108 | def vocode(self, mel): 109 | with torch.no_grad(): 110 | wav = self.vocoder(mel).squeeze(1) 111 | return wav 112 | 113 | def _get_mel(self, wav): 114 | return self.mel_extractor(wav) 115 | 116 | def _get_features(self, wav): 117 | with torch.no_grad(): 118 | return self.feature_extractor.extract_features(wav, None)[0] 119 | 120 | def _get_spk_emb(self, wavs, sr=None): 121 | wavs = [preprocess_wav(wav, sr) for wav in wavs] 122 | cat_wav = np.concatenate(wavs, 0) 123 | spk_emb = self.spk_emb_extractor.embed_utterance(cat_wav) 124 | 125 | return torch.from_numpy(spk_emb).to(self.device).unsqueeze(0) 126 | 127 | # @staticmethod 128 | # def preprocess_single_wav(fpath_or_wav: Union[str, Path, np.ndarray], src_sr=None, tgt_sr=None): 129 | # # TODO 130 | # if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 131 | # wav, src_sr = librosa.load(str(fpath_or_wav), sr=None) 132 | # else: 133 | # wav = fpath_or_wav 134 | 135 | # # Resample the wav 136 | # if src_sr is not None and tgt_sr is not None: 137 | # wav = librosa.resample(wav, src_sr, tgt_sr) 138 | 139 | # return wav -------------------------------------------------------------------------------- /engine/vocoder/hifi_gan.py: -------------------------------------------------------------------------------- 1 | # Hifi-GAN Reference: 2 | # https://github.com/jik876/hifi-gan/blob/master/models.py 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 8 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 9 | from .utils import init_weights, get_padding 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class ResBlock1(torch.nn.Module): 15 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 16 | super(ResBlock1, self).__init__() 17 | self.h = h 18 | self.convs1 = nn.ModuleList([ 19 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 20 | padding=get_padding(kernel_size, dilation[0]))), 21 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 22 | padding=get_padding(kernel_size, dilation[1]))), 23 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 24 | padding=get_padding(kernel_size, dilation[2]))) 25 | ]) 26 | self.convs1.apply(init_weights) 27 | 28 | self.convs2 = nn.ModuleList([ 29 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 30 | padding=get_padding(kernel_size, 1))), 31 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 32 | padding=get_padding(kernel_size, 1))), 33 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 34 | padding=get_padding(kernel_size, 1))) 35 | ]) 36 | self.convs2.apply(init_weights) 37 | 38 | def forward(self, x): 39 | for c1, c2 in zip(self.convs1, self.convs2): 40 | xt = F.leaky_relu(x, LRELU_SLOPE) 41 | xt = c1(xt) 42 | xt = F.leaky_relu(xt, LRELU_SLOPE) 43 | xt = c2(xt) 44 | x = xt + x 45 | return x 46 | 47 | def remove_weight_norm(self): 48 | for l in self.convs1: 49 | remove_weight_norm(l) 50 | for l in self.convs2: 51 | remove_weight_norm(l) 52 | 53 | 54 | class ResBlock2(torch.nn.Module): 55 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 56 | 
super(ResBlock2, self).__init__() 57 | self.h = h 58 | self.convs = nn.ModuleList([ 59 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 60 | padding=get_padding(kernel_size, dilation[0]))), 61 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 62 | padding=get_padding(kernel_size, dilation[1]))) 63 | ]) 64 | self.convs.apply(init_weights) 65 | 66 | def forward(self, x): 67 | for c in self.convs: 68 | xt = F.leaky_relu(x, LRELU_SLOPE) 69 | xt = c(xt) 70 | x = xt + x 71 | return x 72 | 73 | def remove_weight_norm(self): 74 | for l in self.convs: 75 | remove_weight_norm(l) 76 | 77 | 78 | class Generator(torch.nn.Module): 79 | def __init__(self, h): 80 | super(Generator, self).__init__() 81 | self.h = h 82 | self.num_kernels = len(h.resblock_kernel_sizes) 83 | self.num_upsamples = len(h.upsample_rates) 84 | self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) 85 | resblock = ResBlock1 if h.resblock == '1' else ResBlock2 86 | 87 | self.ups = nn.ModuleList() 88 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 89 | self.ups.append(weight_norm( 90 | ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), 91 | k, u, padding=(u//2 + u%2), output_padding=u%2))) 92 | 93 | self.resblocks = nn.ModuleList() 94 | for i in range(len(self.ups)): 95 | ch = h.upsample_initial_channel//(2**(i+1)) 96 | for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): 97 | self.resblocks.append(resblock(h, ch, k, d)) 98 | 99 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 100 | self.ups.apply(init_weights) 101 | self.conv_post.apply(init_weights) 102 | 103 | def forward(self, x): 104 | x = self.conv_pre(x) 105 | for i in range(self.num_upsamples): 106 | x = F.leaky_relu(x, LRELU_SLOPE) 107 | x = self.ups[i](x) 108 | xs = None 109 | for j in range(self.num_kernels): 110 | if xs is None: 111 | xs = self.resblocks[i*self.num_kernels+j](x) 112 | else: 113 | xs += self.resblocks[i*self.num_kernels+j](x) 114 | x = xs / self.num_kernels 115 | x = F.leaky_relu(x) 116 | x = self.conv_post(x) 117 | x = torch.tanh(x) 118 | 119 | return x 120 | 121 | def remove_weight_norm(self): 122 | print('Removing weight norm...') 123 | for l in self.ups: 124 | remove_weight_norm(l) 125 | for l in self.resblocks: 126 | l.remove_weight_norm() 127 | remove_weight_norm(self.conv_pre) 128 | remove_weight_norm(self.conv_post) 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 
13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /toolbox.py: -------------------------------------------------------------------------------- 1 | # from audioread.exceptions import NoBackendError 2 | from engine import VC, Utterance 3 | from gui import GUI 4 | from pathlib import Path 5 | import utils 6 | from utils.hparams import cfg 7 | 8 | from time import perf_counter as timer 9 | import traceback 10 | import numpy as np 11 | import torch 12 | import os 13 | import sys 14 | from pathlib import Path 15 | from collections import defaultdict 16 | 17 | 18 | # Maximum of generated wavs to keep on memory 19 | MAX_WAVS = 15 20 | MAX_TARGET_SAMPLES = 10 21 | MAX_LOADED_SAMPLES = 100 22 | 23 | 24 | class Toolbox: 25 | def __init__(self, datasets_root, seed): 26 | sys.excepthook = self.excepthook 27 | self.seed = seed 28 | self.datasets_root = datasets_root 29 | self.recognized_datasets = [] 30 | self.utterances = set() 31 | self.current_generated = (None, None, None, None) # speaker_name, mel, breaks, wav 32 | self.speaker_filepathes = defaultdict(set) 33 | self.audio_ext = {'.wav', '.flac', '.mp3'} 34 | for datafolder in utils.data.get_subdirs(datasets_root): 35 | self.load_dataset_info(os.path.join(self.datasets_root, datafolder)) 36 | 37 | self.engine = None # type: VC 38 | self.current_src_utt = None 39 | self.current_tgt_utts = None 40 | self.current_tgt_spk = None 41 | self.loaded_utts = [] 42 | self.conv_utts_list = [] 43 | self.conv_utts_idlist = [] 44 | self.self_record_count = 0 45 | 46 | self.trim_silences = True 47 | 48 | # Initialize the events and the interface 49 | self.ui = GUI() 50 | self.reset_ui(seed) 51 | self.setup_events() 52 | self.ui.start() 53 | 54 | def excepthook(self, exc_type, exc_value, exc_tb): 55 | traceback.print_exception(exc_type, exc_value, exc_tb) 56 | self.ui.log("Exception: %s" % exc_value) 57 | 58 | def setup_events(self): 59 | # Dataset, speaker and utterance selection 60 | self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser()) 61 | random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root, self.recognized_datasets, level) 62 | self.ui.random_dataset_button.clicked.connect(random_func(0)) 63 | self.ui.random_speaker_button.clicked.connect(random_func(1)) 64 | self.ui.random_utterance_button.clicked.connect(random_func(2)) 65 | self.ui.dataset_box.currentIndexChanged.connect(random_func(1)) 66 | self.ui.src_spk_box.currentIndexChanged.connect(random_func(2)) 67 | self.ui.tgt_spk_box.currentIndexChanged.connect(random_func(2)) 68 | 69 | # Utterance selection 70 | func = lambda: self.load_from_browser(self.ui.browse_file()) 71 | self.ui.browser_browse_button.clicked.connect(func) 72 | func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current") 73 | self.ui.utterance_history.currentIndexChanged.connect(func) 74 | func = lambda: self.ui.play(self.ui.selected_utterance.wav, cfg.data.sample_rate) 75 | 
self.ui.play_button.clicked.connect(func) 76 | self.ui.stop_button.clicked.connect(self.ui.stop) 77 | self.ui.record_button.clicked.connect(self.record) 78 | 79 | # Audio 80 | self.ui.setup_audio_devices(cfg.data.sample_rate) 81 | 82 | # Wav playback & save 83 | func = lambda: self.replay_last_wav() 84 | self.ui.replay_wav_button.clicked.connect(func) 85 | func = lambda: self.export_current_wave() 86 | self.ui.export_wav_button.clicked.connect(func) 87 | self.ui.wavs_cb.currentIndexChanged.connect(self.set_current_utt) 88 | 89 | # Generation 90 | func = lambda: self.convert() or self.vocode() 91 | self.ui.generate_button.clicked.connect(func) 92 | self.ui.synthesize_button.clicked.connect(self.convert) 93 | self.ui.vocode_button.clicked.connect(self.vocode) 94 | self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox) 95 | 96 | # UMAP legend 97 | self.ui.clear_button.clicked.connect(self.clear_utterances) 98 | 99 | def set_current_utt(self, index): 100 | self.current_src_utt = self.conv_utts_list[index] 101 | 102 | def export_current_wave(self): 103 | self.ui.save_audio_file(self.current_src_utt, cfg.data.sample_rate) 104 | 105 | def replay_last_wav(self): 106 | self.ui.play(self.current_src_utt, cfg.data.sample_rate) 107 | 108 | def reset_ui(self, seed): 109 | self.recognized_datasets = [p for p in self.datasets_root.iterdir() if p.is_dir()] 110 | self.ui.populate_browser(self.datasets_root, self.recognized_datasets, 0, True) 111 | self.ui.populate_gen_options(seed, self.trim_silences) 112 | 113 | def load_from_browser(self, fpath=None): 114 | if fpath is None: 115 | fpath = Path(self.datasets_root, self.ui.current_dataset_name, self.ui.current_src_spk, self.ui.current_utterance_name) 116 | name = str(fpath.relative_to(self.datasets_root)) 117 | speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_src_spk 118 | 119 | # Select the next utterance 120 | if self.ui.auto_next_checkbox.isChecked(): 121 | self.ui.browser_select_next() 122 | elif fpath == "": 123 | return 124 | else: 125 | name = fpath.name 126 | speaker_name = fpath.parent.name 127 | 128 | 129 | # Get the wav from the disk. 
We take the wav with the vocoder/synthesizer format for 130 | # playback, so as to have a fair comparison with the generated audio 131 | wav = utils.load_wav(str(fpath)) 132 | self.ui.log("Loaded %s" % name) 133 | 134 | self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name) 135 | 136 | def record(self): 137 | wav = self.ui.record_one(cfg.data.sample_rate, 5) 138 | if wav is None: 139 | return 140 | self.ui.play(wav, cfg.data.sample_rate) 141 | self.self_record_count += 1 142 | 143 | speaker_name = "user_recorder" 144 | name = f"{speaker_name}_{self.self_record_count}" 145 | self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name) 146 | 147 | def add_real_utterance(self, wav, sr, path, spk_name): 148 | if self.engine is None: 149 | self.init_engine() 150 | 151 | # Compute the mel spectrogram 152 | mel = self.engine._get_mel(torch.from_numpy(wav)) 153 | self.ui.draw_mel(mel.squeeze(0), "current") 154 | 155 | # Compute the embedding 156 | embed = self.engine._get_spk_emb([wav], sr=sr) 157 | 158 | # Add the utterance 159 | utterance = Utterance( 160 | wav=wav, sr=sr, 161 | path=path, spk_name=spk_name, 162 | mel=mel.cpu().numpy().squeeze(0), spk_emb=embed.squeeze(0) 163 | ) 164 | if utterance not in self.utterances: 165 | self.utterances.add(utterance) 166 | self.ui.register_utterance(utterance) 167 | 168 | # Plot it 169 | # self.ui.draw_embed(embed, Path(path).stem, "current") 170 | self.ui.draw_umap_projections(self.utterances) 171 | 172 | def clear_utterances(self): 173 | self.reset_ui(self.seed) 174 | self.utterances.clear() 175 | self.ui.draw_umap_projections(self.utterances) 176 | 177 | def convert(self): 178 | self.ui.log("Converting from source to target...") 179 | self.ui.set_loading(1) 180 | 181 | # Update the synthesizer random seed 182 | if self.ui.random_seed_checkbox.isChecked(): 183 | seed = int(self.ui.seed_textbox.text()) 184 | self.ui.populate_gen_options(seed, self.trim_silences) 185 | else: 186 | seed = None 187 | 188 | tgt_spk = self.ui.current_tgt_spk 189 | 190 | # Synthesize the spectrogram 191 | if self.engine is None: 192 | self.init_engine() 193 | 194 | src_wav = self.ui.selected_utterance.wav 195 | if self.current_tgt_spk is None or self.current_tgt_spk != tgt_spk: 196 | self.current_tgt_utts = self.get_spk_utterances(tgt_spk) 197 | 198 | tgt_wavs = [tgt.wav for tgt in self.current_tgt_utts] 199 | prep_data = self.engine.prepare(src_wav, tgt_wavs) 200 | mel = self.engine.convert(*prep_data) 201 | 202 | self.ui.draw_mel(mel.cpu().numpy().squeeze(0), "converted mel") 203 | self.current_generated = (self.ui.selected_utterance.spk_name, Path(self.ui.selected_utterance.path).stem, self.ui.current_tgt_spk, mel) 204 | self.ui.set_loading(0) 205 | 206 | def vocode(self): 207 | src_spk, basename, tgt_spk, mel = self.current_generated 208 | assert mel is not None 209 | 210 | # Synthesize the waveform 211 | if not self.engine: 212 | self.init_engine() 213 | 214 | # def vocoder_progress(i, seq_len, b_size, gen_rate): 215 | # real_time_factor = (gen_rate / cfg.data.sample_rate) * 1000 216 | # line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \ 217 | # % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor) 218 | # self.ui.log(line, "overwrite") 219 | # self.ui.set_loading(i, seq_len) 220 | 221 | # wav = vocoder.infer_waveform(mel, progress_callback=vocoder_progress) 222 | wav = self.engine.vocode(mel).squeeze(0).cpu().numpy() 223 | self.ui.set_loading(0) 224 | self.ui.log("Done!", "append") 225 | 226 | 
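        # added note: the vocoded output below is peak-normalized to 0.95 before playback,
        # matching the 0.95 scaling that utils.load_wav applies to loaded audio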
227 | # Play it 228 | wav = (wav / np.abs(wav).max()) * 0.95 229 | self.ui.play(wav, cfg.data.sample_rate) 230 | 231 | # Name it (history displayed in combobox) 232 | name = f"{src_spk}_to_{tgt_spk}_{basename}" 233 | spk_name = f"{src_spk}_to_{tgt_spk}" 234 | 235 | # Update wavs combobox 236 | if len(self.conv_utts_list) > MAX_WAVS: 237 | self.conv_utts_list.pop() 238 | self.conv_utts_idlist.pop() 239 | self.conv_utts_list.insert(0, wav) 240 | self.conv_utts_idlist.insert(0, name) 241 | 242 | # self.ui.wavs_cb.disconnect() 243 | self.ui.wavs_cb_model.setStringList(self.conv_utts_idlist) 244 | self.ui.wavs_cb.setCurrentIndex(0) 245 | self.ui.wavs_cb.currentIndexChanged.connect(self.set_current_utt) 246 | 247 | # Update current wav 248 | self.set_current_utt(0) 249 | 250 | # Enable replay and save buttons: 251 | self.ui.replay_wav_button.setDisabled(False) 252 | self.ui.export_wav_button.setDisabled(False) 253 | 254 | # Compute speaker embedding 255 | embed = self.engine._get_spk_emb([wav], sr=cfg.data.sample_rate) 256 | 257 | # Add the utterance 258 | utterance = Utterance( 259 | wav=wav, sr=cfg.data.sample_rate, 260 | path=name, spk_name=spk_name, 261 | mel=mel.cpu().numpy().squeeze(0), spk_emb=embed.squeeze(0) 262 | ) 263 | self.utterances.add(utterance) 264 | 265 | # Plot it 266 | # self.ui.draw_embed(embed, name, "generated") 267 | self.ui.draw_umap_projections(self.utterances) 268 | 269 | 270 | def get_spk_utterances(self, spk_name): 271 | utts = list(filter(lambda u: u.spk_name == spk_name, self.loaded_utts)) 272 | if len(utts) >= MAX_TARGET_SAMPLES: 273 | return utts 274 | 275 | utts_pathes = set(map(lambda u: u.path, utts)) 276 | available_utts_pathes = list(filter(lambda p: p not in utts_pathes, self.speaker_filepathes[spk_name])) 277 | available_utts_pathes = available_utts_pathes[:MAX_TARGET_SAMPLES - len(utts_pathes)] 278 | 279 | new_utts = list(map(lambda p: self.load_utterance(spk_name, p), available_utts_pathes)) 280 | self.loaded_utts.extend(new_utts) 281 | self.loaded_utts = self.loaded_utts[-MAX_LOADED_SAMPLES:] 282 | 283 | utts.extend(new_utts) 284 | return utts 285 | 286 | def load_utterance(self, spk_name, path): 287 | wav = utils.load_wav(path) 288 | return Utterance(wav, cfg.data.sample_rate, path=path, spk_name=spk_name) 289 | 290 | def load_dataset_info(self, dataset_path): 291 | speakers = utils.data.get_subdirs(dataset_path) 292 | 293 | for spk in speakers: 294 | self.speaker_filepathes[spk] = { 295 | *self.speaker_filepathes[spk], 296 | *utils.data.get_filepathes(os.path.join(dataset_path, spk), self.audio_ext) 297 | } 298 | 299 | def init_engine(self): 300 | self.ui.log("Creating voice conversion model...") 301 | self.ui.set_loading(1) 302 | start = timer() 303 | self.engine = VC() 304 | for stage in self.engine.logged_init(): 305 | self.ui.set_loading(stage) 306 | self.ui.log("Done (%dms)." 
% int(1000 * (timer() - start)), "append") 307 | self.ui.set_loading(0) 308 | 309 | def update_seed_textbox(self): 310 | self.ui.update_seed_textbox() -------------------------------------------------------------------------------- /gui/gui.py: -------------------------------------------------------------------------------- 1 | from engine import Utterance 2 | from .widgets import get_figure_widget, FigureWidget 3 | 4 | import matplotlib.pyplot as plt 5 | from PySide6.QtCore import Qt, QStringListModel 6 | from PySide6.QtGui import QImage, QPixmap 7 | from PySide6.QtWidgets import * 8 | 9 | from pathlib import Path 10 | from typing import List, Set 11 | import sounddevice as sd 12 | import soundfile as sf 13 | import numpy as np 14 | from time import sleep 15 | import umap 16 | import sys 17 | from warnings import filterwarnings, warn 18 | filterwarnings("ignore") 19 | 20 | 21 | colormap = np.array([ 22 | [0, 127, 70], 23 | [255, 0, 0], 24 | [255, 217, 38], 25 | [0, 135, 255], 26 | [165, 0, 165], 27 | [255, 167, 255], 28 | [97, 142, 151], 29 | [0, 255, 255], 30 | [255, 96, 38], 31 | [142, 76, 0], 32 | [33, 0, 127], 33 | [0, 0, 0], 34 | [183, 183, 183], 35 | [76, 255, 0], 36 | ], dtype=np.float) / 255 37 | 38 | 39 | class GUI(QDialog): 40 | min_umap_points = 4 41 | max_log_lines = 5 42 | max_saved_utterances = 20 43 | 44 | def draw_utterance(self, utterance: Utterance, which): 45 | self.draw_mel(utterance.mel, which) 46 | # self.draw_embed(utterance.spk_emb, Path(utterance.path).stem, which) 47 | 48 | def draw_embed(self, spk_emb, name, which): 49 | widget = self.cur_ax_widget if which == "current" else self.gen_ax_widget 50 | embed_ax, _ = widget.axis 51 | embed_ax.figure.suptitle("" if spk_emb is None else name) 52 | 53 | ## Embedding 54 | # Clear the plot 55 | if len(embed_ax.images) > 0: 56 | embed_ax.images[0].colorbar.remove() 57 | embed_ax.clear() 58 | 59 | # Draw speaker embedding 60 | if spk_emb is not None: 61 | embed_ax.set_title("embedding") 62 | embed_ax.set_aspect("equal", "datalim") 63 | embed_ax.set_xticks([]) 64 | embed_ax.set_yticks([]) 65 | embed_ax.figure.canvas.draw() 66 | widget.update() 67 | 68 | def draw_mel(self, mel, which): 69 | widget = self.cur_ax_widget if which == "current" else self.gen_ax_widget 70 | # _, mel_ax = widget.axis 71 | mel_ax = widget.axis 72 | 73 | ## Spectrogram 74 | # Draw the spectrogram 75 | mel_ax.clear() 76 | if mel is not None: 77 | im = mel_ax.imshow(mel, aspect="auto", origin="lower", interpolation='none') 78 | mel_ax.set_title("mel spectrogram") 79 | 80 | mel_ax.set_xticks([]) 81 | mel_ax.set_yticks([]) 82 | mel_ax.figure.canvas.draw() 83 | widget.update() 84 | if which != "current": 85 | self.vocode_button.setDisabled(mel is None) 86 | 87 | def draw_umap_projections(self, utterances: Set[Utterance]): 88 | self.umap_ax.clear() 89 | 90 | speakers = np.unique([u.spk_name for u in utterances]) 91 | colors = {spk_name: colormap[i] for i, spk_name in enumerate(speakers)} 92 | embeds = [u.spk_emb for u in utterances] 93 | 94 | # Display a message if there aren't enough points 95 | if len(utterances) < self.min_umap_points: 96 | self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" % 97 | (self.min_umap_points - len(utterances)), 98 | horizontalalignment='center', fontsize=15) 99 | self.umap_ax.set_title("") 100 | 101 | # Compute the projections 102 | else: 103 | if not self.umap_hot: 104 | self.log( 105 | "Drawing UMAP projections for the first time, this will take a few seconds.") 106 | self.umap_hot = True 107 | 108 | 
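            # added note: the first positional argument of umap.UMAP is n_neighbors,
            # scaled here with the square root of the number of embeddings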
reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine") 109 | projections = reducer.fit_transform(embeds) 110 | 111 | speakers_done = set() 112 | for projection, utterance in zip(projections, utterances): 113 | color = colors[utterance.spk_name] 114 | mark = "x" if "_gen_" in Path(utterance.path).stem else "o" 115 | label = None if utterance.spk_name in speakers_done else utterance.spk_name 116 | speakers_done.add(utterance.spk_name) 117 | self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark, label=label) 118 | # self.umap_ax.set_title("UMAP projections") 119 | self.umap_ax.legend(prop={'size': 10}) 120 | 121 | # Draw the plot 122 | self.umap_ax.set_aspect("equal", "datalim") 123 | self.umap_ax.set_xticks([]) 124 | self.umap_ax.set_yticks([]) 125 | self.umap_ax.figure.canvas.draw() 126 | 127 | def save_audio_file(self, wav, sample_rate): 128 | dialog = QFileDialog() 129 | dialog.setDefaultSuffix(".wav") 130 | fpath, _ = dialog.getSaveFileName( 131 | parent=self, 132 | caption="Select a path to save the audio file", 133 | filter="Audio Files (*.flac *.wav)" 134 | ) 135 | if fpath: 136 | #Default format is wav 137 | if Path(fpath).suffix == "": 138 | fpath += ".wav" 139 | sf.write(fpath, wav, sample_rate) 140 | 141 | def setup_audio_devices(self, sample_rate): 142 | input_devices = [] 143 | output_devices = [] 144 | for device in sd.query_devices(): 145 | # Check if valid input 146 | try: 147 | sd.check_input_settings(device=device["name"], samplerate=sample_rate) 148 | input_devices.append(device["name"]) 149 | except: 150 | pass 151 | 152 | # Check if valid output 153 | try: 154 | sd.check_output_settings(device=device["name"], samplerate=sample_rate) 155 | output_devices.append(device["name"]) 156 | except Exception as e: 157 | # Log a warning only if the device is not an input 158 | if not device["name"] in input_devices: 159 | warn("Unsupported output device %s for the sample rate: %d \nError: %s" % (device["name"], sample_rate, str(e))) 160 | 161 | if len(input_devices) == 0: 162 | self.log("No audio input device detected. Recording may not work.") 163 | self.audio_in_device = None 164 | else: 165 | self.audio_in_device = input_devices[0] 166 | 167 | if len(output_devices) == 0: 168 | self.log("No supported output audio devices were found! Audio output may not work.") 169 | self.audio_out_devices_cb.addItems(["None"]) 170 | self.audio_out_devices_cb.setDisabled(True) 171 | else: 172 | self.audio_out_devices_cb.clear() 173 | self.audio_out_devices_cb.addItems(output_devices) 174 | self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device) 175 | 176 | self.set_audio_device() 177 | 178 | def set_audio_device(self): 179 | 180 | output_device = self.audio_out_devices_cb.currentText() 181 | if output_device == "None": 182 | output_device = None 183 | 184 | # If None, sounddevice queries portaudio 185 | sd.default.device = (self.audio_in_device, output_device) 186 | 187 | def play(self, wav, sample_rate): 188 | try: 189 | sd.stop() 190 | sd.play(wav, sample_rate) 191 | except Exception as e: 192 | print(e) 193 | self.log("Error in audio playback. 
Try selecting a different audio output device.") 194 | self.log("Your device must be connected before you start the toolbox.") 195 | 196 | def stop(self): 197 | sd.stop() 198 | 199 | def record_one(self, sample_rate, duration): 200 | self.record_button.setText("Recording...") 201 | self.record_button.setDisabled(True) 202 | 203 | self.log("Recording %d seconds of audio" % duration) 204 | sd.stop() 205 | try: 206 | wav = sd.rec(duration * sample_rate, sample_rate, 1) 207 | except Exception as e: 208 | print(e) 209 | self.log("Could not record anything. Is your recording device enabled?") 210 | self.log("Your device must be connected before you start the toolbox.") 211 | return None 212 | 213 | for i in np.arange(0, duration, 0.1): 214 | self.set_loading(i, duration) 215 | sleep(0.1) 216 | self.set_loading(duration, duration) 217 | sd.wait() 218 | 219 | self.log("Done recording.") 220 | self.record_button.setText("Record") 221 | self.record_button.setDisabled(False) 222 | 223 | return wav.squeeze() 224 | 225 | @property 226 | def current_dataset_name(self): 227 | return self.dataset_box.currentText() 228 | 229 | @property 230 | def current_src_spk(self): 231 | return self.src_spk_box.currentText() 232 | 233 | @property 234 | def current_tgt_spk(self): 235 | return self.tgt_spk_box.currentText() 236 | 237 | @property 238 | def current_utterance_name(self): 239 | return self.utterance_box.currentText() 240 | 241 | def browse_file(self): 242 | fpath = QFileDialog().getOpenFileName( 243 | parent=self, 244 | caption="Select an audio file", 245 | filter="Audio Files (*.mp3 *.flac *.wav *.m4a)" 246 | ) 247 | return Path(fpath[0]) if fpath[0] != "" else "" 248 | 249 | @staticmethod 250 | def repopulate_box(box, items, random=False): 251 | """ 252 | Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join 253 | data to the items 254 | """ 255 | box.blockSignals(True) 256 | box.clear() 257 | for item in items: 258 | item = list(item) if isinstance(item, tuple) else [item] 259 | box.addItem(str(item[0]), *item[1:]) 260 | if len(items) > 0: 261 | box.setCurrentIndex(np.random.randint(len(items)) if random else 0) 262 | box.setDisabled(len(items) == 0) 263 | box.blockSignals(False) 264 | 265 | def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int, random=True): 266 | # Select a random dataset 267 | if level <= 0: 268 | if datasets_root is not None: 269 | datasets = [datasets_root.joinpath(d) for d in recognized_datasets] 270 | datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()] 271 | self.browser_load_button.setDisabled(len(datasets) == 0) 272 | if datasets_root is None or len(datasets) == 0: 273 | msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \ 274 | if datasets_root is None else "o not have any of the recognized datasets" \ 275 | " in %s" % datasets_root) 276 | self.log(msg) 277 | msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \ 278 | "can still use the toolbox by recording samples yourself." 
% \ 279 | ("\n\t".join(map(str, recognized_datasets))) 280 | print(msg, file=sys.stderr) 281 | 282 | self.random_utterance_button.setDisabled(True) 283 | self.random_speaker_button.setDisabled(True) 284 | self.random_dataset_button.setDisabled(True) 285 | self.utterance_box.setDisabled(True) 286 | self.src_spk_box.setDisabled(True) 287 | self.tgt_spk_box.setDisabled(True) 288 | self.dataset_box.setDisabled(True) 289 | self.browser_load_button.setDisabled(True) 290 | self.auto_next_checkbox.setDisabled(True) 291 | return 292 | self.repopulate_box(self.dataset_box, datasets, random) 293 | 294 | # Select a random src and tgt speakers 295 | if level <= 1: 296 | speakers_root = datasets_root.joinpath(self.current_dataset_name) 297 | speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()] 298 | self.repopulate_box(self.src_spk_box, speaker_names, random) 299 | self.repopulate_box(self.tgt_spk_box, speaker_names, random) 300 | 301 | # Select a random utterance 302 | if level <= 2: 303 | utterances_root = datasets_root.joinpath( 304 | self.current_dataset_name, 305 | self.current_src_spk 306 | ) 307 | utterances = [] 308 | for extension in ['mp3', 'flac', 'wav']: 309 | utterances.extend(Path(utterances_root).glob("**/*.%s" % extension)) 310 | utterances = [fpath.relative_to(utterances_root) for fpath in utterances] 311 | self.repopulate_box(self.utterance_box, utterances, random) 312 | 313 | def browser_select_next(self): 314 | index = (self.utterance_box.currentIndex() + 1) % self.utterance_box.count() 315 | self.utterance_box.setCurrentIndex(index) 316 | 317 | @property 318 | def selected_utterance(self): 319 | return self.utterance_history.itemData(self.utterance_history.currentIndex()) 320 | 321 | def register_utterance(self, utterance: Utterance): 322 | self.utterance_history.blockSignals(True) 323 | self.utterance_history.insertItem(0, Path(utterance.path).stem, utterance) 324 | self.utterance_history.setCurrentIndex(0) 325 | self.utterance_history.blockSignals(False) 326 | 327 | if self.utterance_history.count() > self.max_saved_utterances: 328 | self.utterance_history.removeItem(self.max_saved_utterances) 329 | 330 | self.play_button.setDisabled(False) 331 | self.generate_button.setDisabled(False) 332 | self.synthesize_button.setDisabled(False) 333 | 334 | def log(self, line, mode="newline"): 335 | if mode == "newline": 336 | self.logs.append(line) 337 | if len(self.logs) > self.max_log_lines: 338 | del self.logs[0] 339 | elif mode == "append": 340 | self.logs[-1] += line 341 | elif mode == "overwrite": 342 | self.logs[-1] = line 343 | log_text = '\n'.join(self.logs) 344 | 345 | self.log_window.setText(log_text) 346 | self.app.processEvents() 347 | 348 | def set_loading(self, value, maximum=1): 349 | self.loading_bar.setValue(value * 100) 350 | self.loading_bar.setMaximum(maximum * 100) 351 | self.loading_bar.setTextVisible(value != 0) 352 | self.app.processEvents() 353 | 354 | def populate_gen_options(self, seed, trim_silences): 355 | if seed is not None: 356 | self.random_seed_checkbox.setChecked(True) 357 | self.seed_textbox.setText(str(seed)) 358 | self.seed_textbox.setEnabled(True) 359 | else: 360 | self.random_seed_checkbox.setChecked(False) 361 | self.seed_textbox.setText(str(0)) 362 | self.seed_textbox.setEnabled(False) 363 | 364 | def update_seed_textbox(self): 365 | if self.random_seed_checkbox.isChecked(): 366 | self.seed_textbox.setEnabled(True) 367 | else: 368 | self.seed_textbox.setEnabled(False) 369 | 370 | def reset_interface(self): 371 | # 
self.draw_embed(None, None, "current") 372 | # self.draw_embed(None, None, "generated") 373 | self.draw_mel(None, "current") 374 | self.draw_mel(None, "generated") 375 | # self.draw_umap_projections(set()) 376 | self.set_loading(0) 377 | self.play_button.setDisabled(True) 378 | self.generate_button.setDisabled(True) 379 | self.synthesize_button.setDisabled(True) 380 | self.vocode_button.setDisabled(True) 381 | self.replay_wav_button.setDisabled(True) 382 | self.export_wav_button.setDisabled(True) 383 | [self.log("") for _ in range(self.max_log_lines)] 384 | 385 | def __init__(self): 386 | ## Initialize the application 387 | self.app = QApplication(sys.argv) 388 | super().__init__(None) 389 | self.setWindowTitle("Voice Conversion app") 390 | 391 | 392 | ## Main layouts 393 | # Root 394 | root_layout = QGridLayout() 395 | self.setLayout(root_layout) 396 | 397 | # Browser 398 | browser_layout = QGridLayout() 399 | root_layout.addLayout(browser_layout, 0, 0, 1, 2) 400 | 401 | # Generation 402 | gen_layout = QVBoxLayout() 403 | root_layout.addLayout(gen_layout, 1, 0, 1, 2) 404 | 405 | # Projections 406 | self.projections_layout = QVBoxLayout() 407 | root_layout.addLayout(self.projections_layout, 1, 2, 1, 1) 408 | 409 | # Visualizations 410 | vis_layout = QVBoxLayout() 411 | root_layout.addLayout(vis_layout, 0, 2, 1, 1) 412 | 413 | 414 | ## Projections 415 | # UMap 416 | fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0") 417 | fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98) 418 | self.projections_layout.addWidget(get_figure_widget(fig)) 419 | self.umap_hot = False 420 | self.clear_button = QPushButton("Clear") 421 | self.projections_layout.addWidget(self.clear_button) 422 | 423 | 424 | ## Browser 425 | # Dataset, speaker and utterance selection 426 | i = 0 427 | self.dataset_box = QComboBox() 428 | browser_layout.addWidget(QLabel("Dataset"), i, 0) 429 | browser_layout.addWidget(self.dataset_box, i + 1, 0) 430 | self.src_spk_box = QComboBox() 431 | browser_layout.addWidget(QLabel("Source speaker"), i, 1) 432 | browser_layout.addWidget(self.src_spk_box, i + 1, 1) 433 | self.utterance_box = QComboBox() 434 | browser_layout.addWidget(QLabel("Utterance"), i, 2) 435 | browser_layout.addWidget(self.utterance_box, i + 1, 2) 436 | self.browser_load_button = QPushButton("Load") 437 | browser_layout.addWidget(self.browser_load_button, i + 1, 3) 438 | i += 2 439 | 440 | # Random buttons 441 | self.random_dataset_button = QPushButton("Random") 442 | browser_layout.addWidget(self.random_dataset_button, i, 0) 443 | self.random_speaker_button = QPushButton("Random") 444 | browser_layout.addWidget(self.random_speaker_button, i, 1) 445 | self.random_utterance_button = QPushButton("Random") 446 | browser_layout.addWidget(self.random_utterance_button, i, 2) 447 | self.auto_next_checkbox = QCheckBox("Auto select next") 448 | self.auto_next_checkbox.setChecked(True) 449 | browser_layout.addWidget(self.auto_next_checkbox, i, 3) 450 | i += 1 451 | 452 | # Utterance box 453 | browser_layout.addWidget(QLabel("Use source from:"), i, 0) 454 | self.utterance_history = QComboBox() 455 | browser_layout.addWidget(self.utterance_history, i, 1, 1, 3) 456 | i += 1 457 | 458 | # Random & next utterance buttons 459 | self.browser_browse_button = QPushButton("Browse") 460 | browser_layout.addWidget(self.browser_browse_button, i, 0) 461 | self.record_button = QPushButton("Record") 462 | browser_layout.addWidget(self.record_button, i, 1) 463 | self.play_button = QPushButton("Play") 464 | 
browser_layout.addWidget(self.play_button, i, 2) 465 | self.stop_button = QPushButton("Stop") 466 | browser_layout.addWidget(self.stop_button, i, 3) 467 | i += 1 468 | 469 | 470 | # Model and audio output selection 471 | self.tgt_spk_box = QComboBox() 472 | browser_layout.addWidget(QLabel("Target speaker"), i, 0) 473 | browser_layout.addWidget(self.tgt_spk_box, i + 1, 0) 474 | 475 | self.audio_out_devices_cb = QComboBox() 476 | browser_layout.addWidget(QLabel("Audio Output"), i, 1) 477 | browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 1) 478 | i += 2 479 | 480 | # Replay & Save Audio 481 | browser_layout.addWidget(QLabel("Toolbox Output:"), i, 0) 482 | self.wavs_cb = QComboBox() 483 | self.wavs_cb_model = QStringListModel() 484 | self.wavs_cb.setModel(self.wavs_cb_model) 485 | self.wavs_cb.setToolTip("Select one of the last generated wavs in this section for replaying or exporting") 486 | browser_layout.addWidget(self.wavs_cb, i, 1) 487 | self.replay_wav_button = QPushButton("Replay") 488 | self.replay_wav_button.setToolTip("Replay the last generated vocoder output") 489 | browser_layout.addWidget(self.replay_wav_button, i, 2) 490 | self.export_wav_button = QPushButton("Export") 491 | self.export_wav_button.setToolTip("Save the last generated vocoder audio to the filesystem as a wav file") 492 | browser_layout.addWidget(self.export_wav_button, i, 3) 493 | i += 1 494 | 495 | 496 | ## Embed & spectrograms 497 | vis_layout.addStretch() 498 | 499 | gridspec_kw = {"width_ratios": [1]} 500 | fig, cur_ax = plt.subplots( 501 | 1, 1, figsize=(5, 2), gridspec_kw=gridspec_kw 502 | ) 503 | fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) 504 | self.cur_ax_widget = FigureWidget(fig, cur_ax) 505 | vis_layout.addWidget(self.cur_ax_widget) 506 | 507 | fig, gen_ax = plt.subplots( 508 | 1, 1, figsize=(5, 2), gridspec_kw=gridspec_kw 509 | ) 510 | fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) 511 | self.gen_ax_widget = FigureWidget(fig, gen_ax) 512 | vis_layout.addWidget(self.gen_ax_widget) 513 | 514 | # for ax in self.cur_ax_widget.axis.tolist() + self.gen_ax_widget.axis.tolist(): 515 | for ax in [self.cur_ax_widget.axis, self.gen_ax_widget.axis]: 516 | ax.set_facecolor("#F0F0F0") 517 | for side in ["top", "right", "bottom", "left"]: 518 | ax.spines[side].set_visible(False) 519 | 520 | 521 | ## Generation 522 | layout = QHBoxLayout() 523 | self.generate_button = QPushButton("Synthesize and vocode") 524 | layout.addWidget(self.generate_button) 525 | self.synthesize_button = QPushButton("Synthesize only") 526 | layout.addWidget(self.synthesize_button) 527 | self.vocode_button = QPushButton("Vocode only") 528 | layout.addWidget(self.vocode_button) 529 | gen_layout.addLayout(layout) 530 | 531 | layout_seed = QGridLayout() 532 | self.random_seed_checkbox = QCheckBox("Random seed:") 533 | self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.") 534 | layout_seed.addWidget(self.random_seed_checkbox, 0, 0) 535 | self.seed_textbox = QLineEdit() 536 | self.seed_textbox.setMaximumWidth(80) 537 | layout_seed.addWidget(self.seed_textbox, 0, 1) 538 | gen_layout.addLayout(layout_seed) 539 | 540 | self.loading_bar = QProgressBar() 541 | gen_layout.addWidget(self.loading_bar) 542 | 543 | self.log_window = QLabel() 544 | self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft) 545 | gen_layout.addWidget(self.log_window) 546 | self.logs = [] 547 | gen_layout.addStretch() 548 | 549 | 550 | ## Set the size of the window and of the elements 551 | max_size = 
self.screen().availableGeometry().size() * 0.7 552 | self.resize(max_size) 553 | 554 | ## Finalize the display 555 | self.reset_interface() 556 | self.show() 557 | 558 | def start(self): 559 | self.app.exec_() --------------------------------------------------------------------------------
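A minimal standalone sketch of the projection step performed in GUI.draw_umap_projections (gui.py): once at least min_umap_points utterances are registered, their speaker embeddings are reduced to 2-D with UMAP and scatter-plotted per speaker. The random stand-in data and the embedding dimension below are illustrative assumptions, not part of the repository; in the toolbox the vectors come from Utterance.spk_emb.

# Illustrative sketch only; mirrors the n_neighbors heuristic used in gui.py.
import numpy as np
import umap

# Stand-in speaker embeddings (dimension assumed here, e.g. 256 for Resemblyzer-style embeddings);
# the toolbox gathers the real ones from Utterance.spk_emb.
embeds = np.random.rand(8, 256)

# n_neighbors grows with the square root of the number of registered utterances.
n_neighbors = int(np.ceil(np.sqrt(len(embeds))))
reducer = umap.UMAP(n_neighbors, metric="cosine")

# One 2-D point per utterance, ready to be drawn on the UMAP panel.
projections = reducer.fit_transform(embeds)
print(projections.shape)  # (8, 2)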