├── .dockerignore ├── .gitignore ├── Docker-compose.yml ├── Dockerfile ├── LICENSE ├── README.md ├── THIRD_PARTY_NOTICE ├── api.py ├── app_gradio.py ├── checkpts ├── spk_encoder │ └── LICENSE ├── vc │ ├── train_dec_libritts_wodyn.log │ ├── train_dec_vctk_wodyn.log │ ├── train_enc_libritts.log │ └── train_enc_vctk.log └── vocoder │ ├── LICENSE │ └── config.json ├── deploy ├── Dockerfile └── model_repository │ ├── vc_pipeline_python │ ├── 1 │ │ ├── model.py │ │ └── pipeline │ │ │ └── __init__.py │ └── config.pbtxt │ ├── vc_spk_encoder │ └── config.pbtxt │ └── vc_vocoder │ └── config.pbtxt ├── example ├── 6415_111615_000012_000005.wav └── 8534_216567_000015_000010.wav ├── export_onnx ├── __init__.py ├── export_hifigan.py ├── export_spk_enc.py └── onnx_check.py ├── filelists ├── exceptions_libritts.txt ├── exceptions_vctk.txt └── valid.txt ├── get_avg_mels.ipynb ├── hifi-gan ├── LICENSE ├── README.md ├── env.py ├── meldataset.py ├── models.py └── xutils.py ├── inference.py ├── inference_pipeline.ipynb ├── model ├── __init__.py ├── base.py ├── diffusion.py ├── encoder.py ├── modules.py ├── postnet.py ├── utils.py └── vc.py ├── params.py ├── requirements.txt ├── run-container.sh ├── scenario ├── __init__.py ├── prepare_data.py ├── train_dec.py └── train_enc.py ├── speaker_encoder ├── LICENSE ├── README.md ├── __init__.py ├── encoder │ ├── __init__.py │ ├── audio.py │ ├── config.py │ ├── data_objects │ │ ├── __init__.py │ │ ├── random_cycler.py │ │ ├── speaker.py │ │ ├── speaker_batch.py │ │ ├── speaker_verification_dataset.py │ │ └── utterance.py │ ├── inference.py │ ├── model.py │ ├── params_data.py │ ├── params_model.py │ ├── preprocess.py │ ├── train.py │ └── visualizations.py └── utils │ ├── __init__.py │ ├── argutils.py │ ├── logmmse.py │ └── profiler.py ├── utils.py └── var.env /.dockerignore: -------------------------------------------------------------------------------- 1 | /__pycache__/* 2 | /checkpts 3 | /deploy 4 | /example 5 | /export_oxxn 6 | /filelists 7 | trash*.* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # myself 3 | requirement_env.txt 4 | checkpts/spk_encoder/pretrained.pt 5 | checkpts/vc/vc_libritts_wodyn.pt 6 | checkpts/vocoder/generator 7 | trash*.py 8 | deploy/* 9 | output_demo/* 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | **/__pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | share/python-wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | 115 | # pdm 116 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 117 | #pdm.lock 118 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 119 | # in version control. 120 | # https://pdm.fming.dev/#use-with-ide 121 | .pdm.toml 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
171 | #.idea/ -------------------------------------------------------------------------------- /Docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | vc-triton: 5 | image: nvcr.io/nvidia/tritonserver:21.10-py3 6 | shm_size: '1gb' #<-- when RUNNING 7 | container_name: triton_multi_ensemble 8 | restart: unless-stopped 9 | networks: 10 | - vcnetwork 11 | hostname: vctriton 12 | ports: 13 | - "8030-8032:8000-8002" 14 | environment: 15 | - HOME=/config # fix "Home directory not accessible: Permission denied docker" when calling espeak 16 | volumes: 17 | - ./hifi-gan:/hifi-gan 18 | - ./speaker_encoder:/speaker_encoder 19 | - ./deploy/model_repository:/models # model repository expected by --model-repository=/models 20 | command: bash -c "tritonserver --model-repository=/models --log-verbose 1" 21 | deploy: 22 | resources: 23 | reservations: 24 | devices: 25 | - driver: nvidia 26 | count: 1 27 | capabilities: [gpu] 28 | 29 | 30 | vc-api: 31 | container_name: voice-conversion-api 32 | build: . 33 | restart: always 34 | networks: 35 | - vcnetwork 36 | environment: 37 | TRITON_URL: "vc-triton:8001" # container-side gRPC port; 8031 is only the host-mapped port 38 | ports: 39 | - 1513:1513 40 | depends_on: 41 | - "vc-triton" 42 | command: bash -c "uvicorn api:app --host 0.0.0.0 --port 1513 --workers 1" 43 | 44 | 45 | 46 | networks: 47 | vcnetwork: 48 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime 2 | 3 | WORKDIR /workspace 4 | 5 | RUN apt-get update \ 6 | && apt-get install curl libcurl4-openssl-dev libb64-dev -y \ 7 | && apt-get install libsndfile1-dev -y \ 8 | && pip install --upgrade pip 9 | RUN pip install torchaudio==0.8.1 10 | # setup for librosa 11 | RUN apt-get install -y libsndfile1 12 | 13 | COPY requirements.txt requirements.txt 14 | 15 | RUN pip install -r requirements.txt --no-cache-dir 16 | 17 | COPY . . 18 | # CMD ["python3", "app_gradio.py"] 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Huawei Technologies Co., Ltd. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE.
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Diffusion-Based Any-to-Any Voice Conversion 2 | 3 | ### Introduction 4 | - This repository is a derivative of the official implementation of the paper "Diffusion-Based Voice Conversion with Fast Maximum Likelihood Sampling Scheme" [Link](https://arxiv.org/abs/2109.13821). It builds upon that work and incorporates additional features and modifications specific to this project. 5 | 6 | 7 | - [The Official Demo Page](https://diffvc-fast-ml-solver.github.io/). 8 | 9 | # Pre-trained models 10 | 11 | - Please check `inference_pipeline.ipynb` for detailed instructions. 12 | 13 | - The pre-trained speaker encoder we use is available at https://drive.google.com/file/d/1Y8IO2_OqeT85P1kks9I9eeAq--S65YFb/view?usp=sharing 14 | Please put it in `checkpts/spk_encoder/` 15 | 16 | - The pre-trained universal HiFi-GAN vocoder we use is available at https://drive.google.com/file/d/10khlrM645pTbQ4rc2aNEYPba8RFDBkW-/view?usp=sharing. It is taken from the official HiFi-GAN repository. Please put it in `checkpts/vocoder/` 17 | 18 | - You have to download the voice conversion model trained on LibriTTS from here: https://drive.google.com/file/d/18Xbme0CTVo58p2vOHoTQm8PBGW7oEjAy/view?usp=sharing 19 | 20 | - Additionally, we provide a voice conversion model trained on VCTK: https://drive.google.com/file/d/12s9RPmwp9suleMkBCVetD8pub7wsDAy4/view?usp=sharing 21 | Please put both models in `checkpts/vc/` 22 | 23 | # Build docker environment 24 | 25 | - To build the image, run: 26 | ```bash 27 | docker build -t diffvc . 28 | ``` 29 | 30 | - To run a container for development, run: 31 | ```bash 32 | bash run-container.sh 33 | ``` 34 | 35 | # Training your own model 36 | 37 | - To train the model on your own data, first create a data directory with three folders: "wavs", "mels" and "embeds". Put raw audio files sampled at 22.05kHz into the "wavs" directory. The functions for calculating mel-spectrograms and extracting 256-dimensional speaker embeddings with the pre-trained speaker verification network located at *checkpts/spk_encoder/* can be found in the *inference_pipeline.ipynb* notebook (*get_mel* and *get_embed* respectively); a minimal preprocessing sketch is also given after this section. Please put these data in the "mels" and "embeds" folders respectively. Note that all the folders in your data directory should have subfolders corresponding to particular speakers and containing data only for the corresponding speakers. 38 | 39 | - If you want to train the encoder, create a "logs_enc" directory and run *train_enc.py*. Before that, you have to prepare another folder "mels_mode" with mel-spectrograms of the "average voice" (i.e. target mels for the encoder) in the data directory. To obtain them, you have to run Montreal Forced Aligner on the input mels, get *.TextGrid* files and put them in a "textgrids" folder in the data directory. Once you have the "mels" and "textgrids" folders, run *get_avg_mels.ipynb*. 40 | `python3 -m scenario.train_enc` 41 | - Alternatively, you may load the encoder trained on LibriTTS from https://drive.google.com/file/d/1JdoC5hh7k6Nz_oTcumH0nXNEib-GDbSq/view?usp=sharing and put it in the "logs_enc" directory. 42 | 43 | - Once you have the encoder *enc.pt* in the "logs_enc" directory, create a "logs_dec" directory and run *train_dec.py* to train the diffusion-based decoder. 44 | `python3 -m scenario.train_dec` 45 | - Please check *params.py* for the most important hyperparameters.
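The snippet below is a minimal preprocessing sketch for filling the "mels" and "embeds" folders described above. It is an illustration rather than the reference implementation from the notebook: it reuses the mel settings from *checkpts/vocoder/config.json* (22050 Hz, 1024-point FFT, hop 256, 80 mel bins, fmax 8000) and the pre-trained speaker encoder, and it assumes a hypothetical `data/` directory laid out as `data/wavs/<speaker>/*.wav`. Check *scenario/prepare_data.py* and *params.py* for the exact file naming the training scripts expect.

```python
import os
import sys
import numpy as np
import torch
from pathlib import Path
from librosa.core import load
from librosa.filters import mel as librosa_mel_fn

sys.path.append('speaker_encoder/')
from encoder import inference as spk_encoder

# Mel filterbank matching checkpts/vocoder/config.json (22050 Hz, n_fft 1024, 80 mels, fmax 8000)
mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000)

def get_mel(wav_path):
    """Load a 22.05 kHz wav and return an (80, T) log-mel spectrogram."""
    wav, _ = load(wav_path, sr=22050)
    spec = torch.stft(torch.from_numpy(wav).float().unsqueeze(0), n_fft=1024,
                      hop_length=256, win_length=1024,
                      window=torch.hann_window(1024), return_complex=True)
    spec = spec.abs().squeeze(0).numpy()
    return np.log(np.clip(np.matmul(mel_basis, spec), 1e-5, None))

def get_embed(wav_path):
    """Return the 256-dim speaker embedding from the pre-trained verification network."""
    return spk_encoder.embed_utterance(spk_encoder.preprocess_wav(wav_path))

if __name__ == '__main__':
    spk_encoder.load_model(Path('checkpts/spk_encoder/pretrained.pt'), device='cpu')
    data_dir = 'data'  # hypothetical data directory: data/wavs/<speaker>/*.wav
    for speaker in sorted(os.listdir(os.path.join(data_dir, 'wavs'))):
        os.makedirs(os.path.join(data_dir, 'mels', speaker), exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'embeds', speaker), exist_ok=True)
        for name in sorted(os.listdir(os.path.join(data_dir, 'wavs', speaker))):
            wav_path = os.path.join(data_dir, 'wavs', speaker, name)
            base = os.path.splitext(name)[0]
            np.save(os.path.join(data_dir, 'mels', speaker, base + '_mel.npy'), get_mel(wav_path))
            np.save(os.path.join(data_dir, 'embeds', speaker, base + '_embed.npy'), get_embed(wav_path))
```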
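For completeness, the FastAPI service in *api.py* (started by the `vc-api` service in *Docker-compose.yml*, which publishes port 1513) can be exercised with a short client such as the sketch below. The host, port, and the use of the bundled example wavs are assumptions taken from those files, not an official client.

```python
import requests

# Hypothetical client for the /convert endpoint defined in api.py.
# Assumes the vc-api container from Docker-compose.yml is listening on localhost:1513.
url = "http://localhost:1513/convert"
with open("example/8534_216567_000015_000010.wav", "rb") as src, \
     open("example/6415_111615_000012_000005.wav", "rb") as tgt:
    # file1 = source speech, file2 = target-speaker reference (parameter names from api.py)
    response = requests.post(url, files={"file1": src, "file2": tgt})

response.raise_for_status()
with open("converted.wav", "wb") as out:
    out.write(response.content)  # the endpoint returns the converted audio file
```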
46 | 47 | # Demo 48 | 49 | - To launch gradio demo app, run: 50 | ```bash 51 | python3 app_gradio.py 52 | ``` 53 | 54 | # Serve model (developing) 55 | 56 | 1. Convert model from .pt to .onnx 57 | ```bash 58 | python3 -m export_onnx.export_hifigan 59 | ``` 60 | 61 | ```bash 62 | python3 -m export_onnx.export_spk_enc 63 | ``` 64 | 65 | 2. Deploy pipeline using Triton Inference Server: 66 | 67 | -------------------------------------------------------------------------------- /THIRD_PARTY_NOTICE: -------------------------------------------------------------------------------- 1 | Please note we provide an open source software notice for the third party 2 | open source software along with this software and/or this software component 3 | contributed by Huawei (in the following just “this SOFTWARE”). The open source 4 | software licenses are granted by the respective right holders. 5 | 6 | WARRANTY DISCLAIMER 7 | THE OPEN SOURCE SOFTWARE IN THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT WILL 8 | BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF 9 | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE APPLICABLE LICENSES 10 | FOR MORE DETAILS. 11 | 12 | COPYRIGHT NOTICE AND LICENSE TEXTS 13 | 14 | SOFTWARE: HiFi-GAN 15 | Copyright (c) 2020 Jungil Kong 16 | License: MIT 17 | Permission is hereby granted, free of charge, to any person obtaining a copy 18 | of this software and associated documentation files (the "Software"), to deal 19 | in the Software without restriction, including without limitation the rights 20 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 21 | copies of the Software, and to permit persons to whom the Software is 22 | furnished to do so, subject to the following conditions: 23 | 24 | The above copyright notice and this permission notice shall be included in all 25 | copies or substantial portions of the Software. 26 | 27 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 28 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 30 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 31 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 32 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 33 | SOFTWARE. 
34 | 35 | SOFTWARE: Real-Time Voice Cloning 36 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 37 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 38 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 39 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 40 | License: MIT 41 | Text: See above -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import uuid 4 | import json 5 | from time import time 6 | from loguru import logger 7 | import numpy as np 8 | import torch 9 | 10 | from fastapi import FastAPI, Response, status, File, UploadFile, Body 11 | from starlette.middleware.cors import CORSMiddleware 12 | from pydantic import BaseModel, Field 13 | 14 | 15 | from inference import Inferencer 16 | 17 | import params 18 | from model import DiffVC 19 | 20 | import sys 21 | sys.path.append('hifi-gan/') 22 | from env import AttrDict 23 | from models import Generator as HiFiGAN 24 | 25 | sys.path.append('speaker_encoder/') 26 | from encoder import inference as spk_encoder 27 | from pathlib import Path 28 | 29 | 30 | use_gpu = torch.cuda.is_available() 31 | vc_path = 'checkpts/vc/vc_libritts_wodyn.pt' # path to voice conversion model 32 | 33 | generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads, 34 | params.layers, params.kernel, params.dropout, params.window_size, 35 | params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim, 36 | params.beta_min, params.beta_max) 37 | if use_gpu: 38 | generator = generator.cuda() 39 | generator.load_state_dict(torch.load(vc_path)) 40 | else: 41 | generator.load_state_dict(torch.load(vc_path, map_location='cpu')) 42 | generator.eval() 43 | 44 | 45 | # loading HiFi-GAN vocoder 46 | hfg_path = 'checkpts/vocoder/' # HiFi-GAN path 47 | 48 | with open(hfg_path + 'config.json') as f: 49 | h = AttrDict(json.load(f)) 50 | 51 | if use_gpu: 52 | hifigan_universal = HiFiGAN(h).cuda() 53 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator')['generator']) 54 | else: 55 | hifigan_universal = HiFiGAN(h) 56 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', map_location='cpu')['generator']) 57 | 58 | _ = hifigan_universal.eval() 59 | hifigan_universal.remove_weight_norm() 60 | 61 | 62 | # loading speaker encoder 63 | enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt') # speaker encoder path 64 | if use_gpu: 65 | spk_encoder.load_model(enc_model_fpath, device="cuda") 66 | else: 67 | spk_encoder.load_model(enc_model_fpath, device="cpu") 68 | 69 | # Make dir to save audio files log 70 | MEDIA_ROOT = os.path.join('/logs', 'media') 71 | if not os.path.exists(MEDIA_ROOT): 72 | os.makedirs(MEDIA_ROOT) 73 | 74 | # Make dir to save json response log 75 | LOG_ROOT = os.path.join('/logs', 'json') 76 | if not os.path.exists(LOG_ROOT): 77 | os.makedirs(LOG_ROOT) 78 | 79 | # Define Inferencer (MEDIA_ROOT must be defined before this point) 80 | _inferencer = Inferencer(generator, spk_encoder, hifigan_universal, MEDIA_ROOT, True) 81 | 82 | def save_audio(file): 83 | job_id = str(uuid.uuid4()) 84 | output_dir = os.path.join(MEDIA_ROOT, str(job_id)) 85 | if not os.path.exists(output_dir): 86 | os.makedirs(output_dir) 87 | audio_save_path = os.path.join(output_dir, file.filename) 88 | with open(audio_save_path, "wb+") as file_object: 89 | file_object.write(file.file.read()) 90 | 91 | return
audio_save_path 92 | 93 | 94 | 95 | 96 | app = FastAPI( 97 | title="Voice Conversion", 98 | ) 99 | 100 | app.add_middleware( 101 | CORSMiddleware, 102 | allow_origins=["*"], 103 | allow_credentials=True, 104 | allow_methods=["*"], 105 | allow_headers=["*"], 106 | ) 107 | 108 | 109 | @app.get('/', status_code=status.HTTP_200_OK) 110 | async def check_status(response: Response): 111 | api_status = {"API Status": "Running"} 112 | return api_status 113 | 114 | 115 | 116 | @app.post('/convert', status_code=200) 117 | async def convert(response:Response, file1: UploadFile = File(...), file2: UploadFile = File(...) ): 118 | # Save source and target to MEDIA 119 | source_fpath = save_audio(file1) 120 | target_fpath = save_audio(file2) 121 | 122 | audio = _inferencer.infer(src_path=audio_path, tgt_path=target_path, return_output_path=False) 123 | 124 | return audio 125 | 126 | 127 | -------------------------------------------------------------------------------- /app_gradio.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import os 3 | import uuid 4 | import torch 5 | import json 6 | from inference import Inferencer 7 | 8 | import params 9 | from model import DiffVC 10 | 11 | import sys 12 | sys.path.append('hifi-gan/') 13 | from env import AttrDict 14 | from models import Generator as HiFiGAN 15 | 16 | sys.path.append('speaker_encoder/') 17 | from encoder import inference as spk_encoder 18 | from pathlib import Path 19 | 20 | use_gpu = torch.cuda.is_available() 21 | 22 | MEDIA_ROOT = os.path.join('/logs', 'media') 23 | if not os.path.exists(MEDIA_ROOT): 24 | os.makedirs(MEDIA_ROOT) 25 | 26 | # load voice conversion 27 | vc_path = 'checkpts/vc/vc_libritts_wodyn.pt' # path to voice conversion model 28 | 29 | generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads, 30 | params.layers, params.kernel, params.dropout, params.window_size, 31 | params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim, 32 | params.beta_min, params.beta_max) 33 | if use_gpu: 34 | generator = generator.cuda() 35 | generator.load_state_dict(torch.load(vc_path)) 36 | else: 37 | generator.load_state_dict(torch.load(vc_path, map_location='cpu')) 38 | generator.eval() 39 | 40 | 41 | # loading HiFi-GAN vocoder 42 | hfg_path = 'checkpts/vocoder/' # HiFi-GAN path 43 | 44 | with open(hfg_path + 'config.json') as f: 45 | h = AttrDict(json.load(f)) 46 | 47 | if use_gpu: 48 | hifigan_universal = HiFiGAN(h).cuda() 49 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator')['generator']) 50 | else: 51 | hifigan_universal = HiFiGAN(h) 52 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', map_location='cpu')['generator']) 53 | 54 | _ = hifigan_universal.eval() 55 | hifigan_universal.remove_weight_norm() 56 | 57 | 58 | # loading speaker encoder 59 | enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt') # speaker encoder path 60 | if use_gpu: 61 | spk_encoder.load_model(enc_model_fpath, device="cuda") 62 | else: 63 | spk_encoder.load_model(enc_model_fpath, device="cpu") 64 | 65 | 66 | # define inference object 67 | _inferencer = Inferencer(generator, spk_encoder, hifigan_universal, MEDIA_ROOT, True) 68 | 69 | 70 | def _inference(audio_path, target_path, mic_path1=None, mic_path2=None): 71 | 72 | if mic_path1: 73 | audio_path = mic_path1 74 | if mic_path2: 75 | target_path = mic_path2 76 | 77 | output_path = _inferencer.infer(src_path=audio_path, tgt_path=target_path, return_output_path=True) 78 | 79 | return 
output_path 80 | 81 | # gradio app 82 | 83 | title = "VC-DEMO" 84 | description = "Gradio demo for Voice Conversion" 85 | # examples = [['./test_wav/p225_001.wav', "./test_wav/p226_001.wav"]] 86 | 87 | 88 | def toggle(choice): 89 | if choice == "mic": 90 | return gr.update(visible=True, value=None), gr.update(visible=False, value=None) 91 | else: 92 | return gr.update(visible=False, value=None), gr.update(visible=True, value=None) 93 | 94 | 95 | with gr.Blocks() as demo: 96 | with gr.Row(): 97 | with gr.Column(): 98 | radio1 = gr.Radio(["mic", "file"], value="file", 99 | label="How would you like to upload your audio?") 100 | 101 | mic_input1 = gr.Mic(label="Input", type="filepath", visible=False) 102 | audio_input = gr.Audio( 103 | type="filepath", label="Input", visible=True) 104 | 105 | radio2 = gr.Radio(["mic", "file"], value="file", 106 | label="How would you like to upload your audio?") 107 | mic_input2 = gr.Mic(label="Target", type="filepath", visible=False) 108 | audio_target = gr.Audio( 109 | type="filepath", label="Target", visible=True) 110 | with gr.Column(): 111 | audio_output = gr.Audio(label="Output") 112 | 113 | # gr.Examples(examples, fn=_inference, inputs=[audio_input, audio_target], 114 | # outputs=audio_output, cache_examples=True) 115 | 116 | btn = gr.Button("Generate") 117 | btn.click(_inference, inputs=[audio_input, 118 | audio_target, mic_input1, mic_input2], outputs=audio_output) 119 | radio1.change(toggle, radio1, [mic_input1, audio_input]) 120 | radio2.change(toggle, radio2, [mic_input2, audio_target]) 121 | 122 | demo.launch(enable_queue=True, server_port=1402, server_name="0.0.0.0", share=True) -------------------------------------------------------------------------------- /checkpts/spk_encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 
25 | -------------------------------------------------------------------------------- /checkpts/vc/train_dec_libritts_wodyn.log: -------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.1397 2 | Epoch 2: loss = 0.1209 3 | Epoch 3: loss = 0.1192 4 | Epoch 4: loss = 0.1185 5 | Epoch 5: loss = 0.1170 6 | Epoch 6: loss = 0.1161 7 | Epoch 7: loss = 0.1157 8 | Epoch 8: loss = 0.1157 9 | Epoch 9: loss = 0.1152 10 | Epoch 10: loss = 0.1141 11 | Epoch 11: loss = 0.1140 12 | Epoch 12: loss = 0.1139 13 | Epoch 13: loss = 0.1132 14 | Epoch 14: loss = 0.1137 15 | Epoch 15: loss = 0.1136 16 | Epoch 16: loss = 0.1138 17 | Epoch 17: loss = 0.1130 18 | Epoch 18: loss = 0.1124 19 | Epoch 19: loss = 0.1121 20 | Epoch 20: loss = 0.1123 21 | Epoch 21: loss = 0.1121 22 | Epoch 22: loss = 0.1122 23 | Epoch 23: loss = 0.1126 24 | Epoch 24: loss = 0.1122 25 | Epoch 25: loss = 0.1118 26 | Epoch 26: loss = 0.1118 27 | Epoch 27: loss = 0.1120 28 | Epoch 28: loss = 0.1112 29 | Epoch 29: loss = 0.1106 30 | Epoch 30: loss = 0.1111 31 | Epoch 31: loss = 0.1111 32 | Epoch 32: loss = 0.1107 33 | Epoch 33: loss = 0.1115 34 | Epoch 34: loss = 0.1111 35 | Epoch 35: loss = 0.1118 36 | Epoch 36: loss = 0.1111 37 | Epoch 37: loss = 0.1106 38 | Epoch 38: loss = 0.1108 39 | Epoch 39: loss = 0.1101 40 | Epoch 40: loss = 0.1109 41 | Epoch 41: loss = 0.1110 42 | Epoch 42: loss = 0.1106 43 | Epoch 43: loss = 0.1107 44 | Epoch 44: loss = 0.1104 45 | Epoch 45: loss = 0.1099 46 | Epoch 46: loss = 0.1093 47 | Epoch 47: loss = 0.1105 48 | Epoch 48: loss = 0.1107 49 | Epoch 49: loss = 0.1092 50 | Epoch 50: loss = 0.1100 51 | Epoch 51: loss = 0.1098 52 | Epoch 52: loss = 0.1097 53 | Epoch 53: loss = 0.1103 54 | Epoch 54: loss = 0.1103 55 | Epoch 55: loss = 0.1101 56 | Epoch 56: loss = 0.1090 57 | Epoch 57: loss = 0.1095 58 | Epoch 58: loss = 0.1105 59 | Epoch 59: loss = 0.1098 60 | Epoch 60: loss = 0.1098 61 | Epoch 61: loss = 0.1098 62 | Epoch 62: loss = 0.1095 63 | Epoch 63: loss = 0.1107 64 | Epoch 64: loss = 0.1097 65 | Epoch 65: loss = 0.1088 66 | Epoch 66: loss = 0.1099 67 | Epoch 67: loss = 0.1085 68 | Epoch 68: loss = 0.1091 69 | Epoch 69: loss = 0.1092 70 | Epoch 70: loss = 0.1093 71 | Epoch 71: loss = 0.1094 72 | Epoch 72: loss = 0.1094 73 | Epoch 73: loss = 0.1084 74 | Epoch 74: loss = 0.1090 75 | Epoch 75: loss = 0.1102 76 | Epoch 76: loss = 0.1083 77 | Epoch 77: loss = 0.1085 78 | Epoch 78: loss = 0.1092 79 | Epoch 79: loss = 0.1088 80 | Epoch 80: loss = 0.1083 81 | Epoch 81: loss = 0.1082 82 | Epoch 82: loss = 0.1083 83 | Epoch 83: loss = 0.1089 84 | Epoch 84: loss = 0.1077 85 | Epoch 85: loss = 0.1089 86 | Epoch 86: loss = 0.1087 87 | Epoch 87: loss = 0.1086 88 | Epoch 88: loss = 0.1086 89 | Epoch 89: loss = 0.1089 90 | Epoch 90: loss = 0.1086 91 | Epoch 91: loss = 0.1082 92 | Epoch 92: loss = 0.1090 93 | Epoch 93: loss = 0.1087 94 | Epoch 94: loss = 0.1081 95 | Epoch 95: loss = 0.1082 96 | Epoch 96: loss = 0.1082 97 | Epoch 97: loss = 0.1079 98 | Epoch 98: loss = 0.1079 99 | Epoch 99: loss = 0.1094 100 | Epoch 100: loss = 0.1092 101 | Epoch 101: loss = 0.1084 102 | Epoch 102: loss = 0.1086 103 | Epoch 103: loss = 0.1082 104 | Epoch 104: loss = 0.1081 105 | Epoch 105: loss = 0.1084 106 | Epoch 106: loss = 0.1081 107 | Epoch 107: loss = 0.1086 108 | Epoch 108: loss = 0.1093 109 | Epoch 109: loss = 0.1070 110 | Epoch 110: loss = 0.1081 111 | -------------------------------------------------------------------------------- /checkpts/vc/train_dec_vctk_wodyn.log: 
-------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.1779 2 | Epoch 2: loss = 0.1237 3 | Epoch 3: loss = 0.1198 4 | Epoch 4: loss = 0.1165 5 | Epoch 5: loss = 0.1158 6 | Epoch 6: loss = 0.1162 7 | Epoch 7: loss = 0.1158 8 | Epoch 8: loss = 0.1129 9 | Epoch 9: loss = 0.1115 10 | Epoch 10: loss = 0.1124 11 | Epoch 11: loss = 0.1107 12 | Epoch 12: loss = 0.1116 13 | Epoch 13: loss = 0.1095 14 | Epoch 14: loss = 0.1079 15 | Epoch 15: loss = 0.1108 16 | Epoch 16: loss = 0.1060 17 | Epoch 17: loss = 0.1081 18 | Epoch 18: loss = 0.1066 19 | Epoch 19: loss = 0.1087 20 | Epoch 20: loss = 0.1057 21 | Epoch 21: loss = 0.1062 22 | Epoch 22: loss = 0.1070 23 | Epoch 23: loss = 0.1078 24 | Epoch 24: loss = 0.1064 25 | Epoch 25: loss = 0.1063 26 | Epoch 26: loss = 0.1066 27 | Epoch 27: loss = 0.1068 28 | Epoch 28: loss = 0.1058 29 | Epoch 29: loss = 0.1052 30 | Epoch 30: loss = 0.1057 31 | Epoch 31: loss = 0.1057 32 | Epoch 32: loss = 0.1055 33 | Epoch 33: loss = 0.1046 34 | Epoch 34: loss = 0.1046 35 | Epoch 35: loss = 0.1052 36 | Epoch 36: loss = 0.1046 37 | Epoch 37: loss = 0.1053 38 | Epoch 38: loss = 0.1049 39 | Epoch 39: loss = 0.1034 40 | Epoch 40: loss = 0.1037 41 | Epoch 41: loss = 0.1051 42 | Epoch 42: loss = 0.1039 43 | Epoch 43: loss = 0.1033 44 | Epoch 44: loss = 0.1058 45 | Epoch 45: loss = 0.1039 46 | Epoch 46: loss = 0.1025 47 | Epoch 47: loss = 0.1031 48 | Epoch 48: loss = 0.1037 49 | Epoch 49: loss = 0.1034 50 | Epoch 50: loss = 0.1046 51 | Epoch 51: loss = 0.1037 52 | Epoch 52: loss = 0.1044 53 | Epoch 53: loss = 0.1029 54 | Epoch 54: loss = 0.1022 55 | Epoch 55: loss = 0.1026 56 | Epoch 56: loss = 0.1031 57 | Epoch 57: loss = 0.1031 58 | Epoch 58: loss = 0.1030 59 | Epoch 59: loss = 0.1036 60 | Epoch 60: loss = 0.1025 61 | Epoch 61: loss = 0.1031 62 | Epoch 62: loss = 0.1042 63 | Epoch 63: loss = 0.1038 64 | Epoch 64: loss = 0.1034 65 | Epoch 65: loss = 0.1031 66 | Epoch 66: loss = 0.1023 67 | Epoch 67: loss = 0.1029 68 | Epoch 68: loss = 0.1018 69 | Epoch 69: loss = 0.1007 70 | Epoch 70: loss = 0.1022 71 | Epoch 71: loss = 0.1020 72 | Epoch 72: loss = 0.1026 73 | Epoch 73: loss = 0.1008 74 | Epoch 74: loss = 0.1024 75 | Epoch 75: loss = 0.1012 76 | Epoch 76: loss = 0.1016 77 | Epoch 77: loss = 0.1036 78 | Epoch 78: loss = 0.1018 79 | Epoch 79: loss = 0.1009 80 | Epoch 80: loss = 0.1009 81 | Epoch 81: loss = 0.1011 82 | Epoch 82: loss = 0.1012 83 | Epoch 83: loss = 0.1024 84 | Epoch 84: loss = 0.1025 85 | Epoch 85: loss = 0.1015 86 | Epoch 86: loss = 0.0998 87 | Epoch 87: loss = 0.1011 88 | Epoch 88: loss = 0.1033 89 | Epoch 89: loss = 0.1024 90 | Epoch 90: loss = 0.1032 91 | Epoch 91: loss = 0.1033 92 | Epoch 92: loss = 0.1014 93 | Epoch 93: loss = 0.1008 94 | Epoch 94: loss = 0.1011 95 | Epoch 95: loss = 0.1010 96 | Epoch 96: loss = 0.1001 97 | Epoch 97: loss = 0.1001 98 | Epoch 98: loss = 0.1011 99 | Epoch 99: loss = 0.1024 100 | Epoch 100: loss = 0.1007 101 | Epoch 101: loss = 0.0998 102 | Epoch 102: loss = 0.1010 103 | Epoch 103: loss = 0.1004 104 | Epoch 104: loss = 0.1014 105 | Epoch 105: loss = 0.1002 106 | Epoch 106: loss = 0.1003 107 | Epoch 107: loss = 0.0998 108 | Epoch 108: loss = 0.0996 109 | Epoch 109: loss = 0.0994 110 | Epoch 110: loss = 0.0997 111 | Epoch 111: loss = 0.1007 112 | Epoch 112: loss = 0.0990 113 | Epoch 113: loss = 0.0997 114 | Epoch 114: loss = 0.0994 115 | Epoch 115: loss = 0.1003 116 | Epoch 116: loss = 0.1011 117 | Epoch 117: loss = 0.1009 118 | Epoch 118: loss = 0.0991 119 | Epoch 119: loss = 0.0992 
120 | Epoch 120: loss = 0.0998 121 | Epoch 121: loss = 0.1002 122 | Epoch 122: loss = 0.1007 123 | Epoch 123: loss = 0.1004 124 | Epoch 124: loss = 0.0995 125 | Epoch 125: loss = 0.1004 126 | Epoch 126: loss = 0.0998 127 | Epoch 127: loss = 0.0994 128 | Epoch 128: loss = 0.1007 129 | Epoch 129: loss = 0.0991 130 | Epoch 130: loss = 0.1009 131 | Epoch 131: loss = 0.0994 132 | Epoch 132: loss = 0.0990 133 | Epoch 133: loss = 0.1015 134 | Epoch 134: loss = 0.0986 135 | Epoch 135: loss = 0.1002 136 | Epoch 136: loss = 0.1000 137 | Epoch 137: loss = 0.0996 138 | Epoch 138: loss = 0.0994 139 | Epoch 139: loss = 0.0988 140 | Epoch 140: loss = 0.0996 141 | Epoch 141: loss = 0.0989 142 | Epoch 142: loss = 0.0991 143 | Epoch 143: loss = 0.1002 144 | Epoch 144: loss = 0.0985 145 | Epoch 145: loss = 0.1004 146 | Epoch 146: loss = 0.0998 147 | Epoch 147: loss = 0.0981 148 | Epoch 148: loss = 0.0989 149 | Epoch 149: loss = 0.0997 150 | Epoch 150: loss = 0.0993 151 | Epoch 151: loss = 0.0984 152 | Epoch 152: loss = 0.0993 153 | Epoch 153: loss = 0.0993 154 | Epoch 154: loss = 0.1006 155 | Epoch 155: loss = 0.1009 156 | Epoch 156: loss = 0.0989 157 | Epoch 157: loss = 0.0974 158 | Epoch 158: loss = 0.0978 159 | Epoch 159: loss = 0.0988 160 | Epoch 160: loss = 0.0984 161 | Epoch 161: loss = 0.0985 162 | Epoch 162: loss = 0.1005 163 | Epoch 163: loss = 0.0987 164 | Epoch 164: loss = 0.0992 165 | Epoch 165: loss = 0.0987 166 | Epoch 166: loss = 0.1003 167 | Epoch 167: loss = 0.1000 168 | Epoch 168: loss = 0.0983 169 | Epoch 169: loss = 0.0988 170 | Epoch 170: loss = 0.1004 171 | Epoch 171: loss = 0.0991 172 | Epoch 172: loss = 0.0985 173 | Epoch 173: loss = 0.0999 174 | Epoch 174: loss = 0.1012 175 | Epoch 175: loss = 0.0993 176 | Epoch 176: loss = 0.0980 177 | Epoch 177: loss = 0.0987 178 | Epoch 178: loss = 0.0991 179 | Epoch 179: loss = 0.0987 180 | Epoch 180: loss = 0.0986 181 | Epoch 181: loss = 0.0985 182 | Epoch 182: loss = 0.0968 183 | Epoch 183: loss = 0.0993 184 | Epoch 184: loss = 0.0973 185 | Epoch 185: loss = 0.0981 186 | Epoch 186: loss = 0.0993 187 | Epoch 187: loss = 0.0974 188 | Epoch 188: loss = 0.0989 189 | Epoch 189: loss = 0.0974 190 | Epoch 190: loss = 0.0985 191 | Epoch 191: loss = 0.0989 192 | Epoch 192: loss = 0.0992 193 | Epoch 193: loss = 0.0973 194 | Epoch 194: loss = 0.0980 195 | Epoch 195: loss = 0.0975 196 | Epoch 196: loss = 0.0990 197 | Epoch 197: loss = 0.0969 198 | Epoch 198: loss = 0.0973 199 | Epoch 199: loss = 0.0981 200 | Epoch 200: loss = 0.0978 201 | -------------------------------------------------------------------------------- /checkpts/vc/train_enc_libritts.log: -------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.5523 2 | Epoch 2: loss = 0.2962 3 | Epoch 3: loss = 0.2634 4 | Epoch 4: loss = 0.2445 5 | Epoch 5: loss = 0.2324 6 | Epoch 6: loss = 0.2246 7 | Epoch 7: loss = 0.2179 8 | Epoch 8: loss = 0.2124 9 | Epoch 9: loss = 0.2083 10 | Epoch 10: loss = 0.2052 11 | Epoch 11: loss = 0.2023 12 | Epoch 12: loss = 0.2001 13 | Epoch 13: loss = 0.1970 14 | Epoch 14: loss = 0.1947 15 | Epoch 15: loss = 0.1933 16 | Epoch 16: loss = 0.1918 17 | Epoch 17: loss = 0.1904 18 | Epoch 18: loss = 0.1890 19 | Epoch 19: loss = 0.1874 20 | Epoch 20: loss = 0.1867 21 | Epoch 21: loss = 0.1859 22 | Epoch 22: loss = 0.1833 23 | Epoch 23: loss = 0.1827 24 | Epoch 24: loss = 0.1822 25 | Epoch 25: loss = 0.1815 26 | Epoch 26: loss = 0.1803 27 | Epoch 27: loss = 0.1795 28 | Epoch 28: loss = 0.1790 29 | Epoch 29: loss = 0.1784 30 | Epoch 30: 
loss = 0.1777 31 | Epoch 31: loss = 0.1771 32 | Epoch 32: loss = 0.1761 33 | Epoch 33: loss = 0.1761 34 | Epoch 34: loss = 0.1748 35 | Epoch 35: loss = 0.1740 36 | Epoch 36: loss = 0.1735 37 | Epoch 37: loss = 0.1730 38 | Epoch 38: loss = 0.1722 39 | Epoch 39: loss = 0.1717 40 | Epoch 40: loss = 0.1715 41 | Epoch 41: loss = 0.1705 42 | Epoch 42: loss = 0.1706 43 | Epoch 43: loss = 0.1700 44 | Epoch 44: loss = 0.1694 45 | Epoch 45: loss = 0.1688 46 | Epoch 46: loss = 0.1686 47 | Epoch 47: loss = 0.1684 48 | Epoch 48: loss = 0.1678 49 | Epoch 49: loss = 0.1670 50 | Epoch 50: loss = 0.1670 51 | Epoch 51: loss = 0.1666 52 | Epoch 52: loss = 0.1666 53 | Epoch 53: loss = 0.1659 54 | Epoch 54: loss = 0.1656 55 | Epoch 55: loss = 0.1651 56 | Epoch 56: loss = 0.1647 57 | Epoch 57: loss = 0.1646 58 | Epoch 58: loss = 0.1639 59 | Epoch 59: loss = 0.1638 60 | Epoch 60: loss = 0.1635 61 | Epoch 61: loss = 0.1629 62 | Epoch 62: loss = 0.1635 63 | Epoch 63: loss = 0.1625 64 | Epoch 64: loss = 0.1622 65 | Epoch 65: loss = 0.1622 66 | Epoch 66: loss = 0.1617 67 | Epoch 67: loss = 0.1614 68 | Epoch 68: loss = 0.1614 69 | Epoch 69: loss = 0.1606 70 | Epoch 70: loss = 0.1607 71 | Epoch 71: loss = 0.1603 72 | Epoch 72: loss = 0.1601 73 | Epoch 73: loss = 0.1600 74 | Epoch 74: loss = 0.1594 75 | Epoch 75: loss = 0.1593 76 | Epoch 76: loss = 0.1594 77 | Epoch 77: loss = 0.1590 78 | Epoch 78: loss = 0.1584 79 | Epoch 79: loss = 0.1582 80 | Epoch 80: loss = 0.1581 81 | Epoch 81: loss = 0.1578 82 | Epoch 82: loss = 0.1581 83 | Epoch 83: loss = 0.1578 84 | Epoch 84: loss = 0.1571 85 | Epoch 85: loss = 0.1571 86 | Epoch 86: loss = 0.1572 87 | Epoch 87: loss = 0.1566 88 | Epoch 88: loss = 0.1562 89 | Epoch 89: loss = 0.1566 90 | Epoch 90: loss = 0.1556 91 | Epoch 91: loss = 0.1553 92 | Epoch 92: loss = 0.1559 93 | Epoch 93: loss = 0.1562 94 | Epoch 94: loss = 0.1556 95 | Epoch 95: loss = 0.1553 96 | Epoch 96: loss = 0.1553 97 | Epoch 97: loss = 0.1548 98 | Epoch 98: loss = 0.1544 99 | Epoch 99: loss = 0.1544 100 | Epoch 100: loss = 0.1545 101 | Epoch 101: loss = 0.1538 102 | Epoch 102: loss = 0.1538 103 | Epoch 103: loss = 0.1538 104 | Epoch 104: loss = 0.1538 105 | Epoch 105: loss = 0.1533 106 | Epoch 106: loss = 0.1535 107 | Epoch 107: loss = 0.1528 108 | Epoch 108: loss = 0.1529 109 | Epoch 109: loss = 0.1528 110 | Epoch 110: loss = 0.1523 111 | Epoch 111: loss = 0.1526 112 | Epoch 112: loss = 0.1522 113 | Epoch 113: loss = 0.1518 114 | Epoch 114: loss = 0.1518 115 | Epoch 115: loss = 0.1522 116 | Epoch 116: loss = 0.1514 117 | Epoch 117: loss = 0.1510 118 | Epoch 118: loss = 0.1517 119 | Epoch 119: loss = 0.1519 120 | Epoch 120: loss = 0.1508 121 | Epoch 121: loss = 0.1508 122 | Epoch 122: loss = 0.1515 123 | Epoch 123: loss = 0.1508 124 | Epoch 124: loss = 0.1505 125 | Epoch 125: loss = 0.1507 126 | Epoch 126: loss = 0.1508 127 | Epoch 127: loss = 0.1497 128 | Epoch 128: loss = 0.1497 129 | Epoch 129: loss = 0.1497 130 | Epoch 130: loss = 0.1498 131 | Epoch 131: loss = 0.1498 132 | Epoch 132: loss = 0.1493 133 | Epoch 133: loss = 0.1498 134 | Epoch 134: loss = 0.1488 135 | Epoch 135: loss = 0.1490 136 | Epoch 136: loss = 0.1493 137 | Epoch 137: loss = 0.1488 138 | Epoch 138: loss = 0.1485 139 | Epoch 139: loss = 0.1486 140 | Epoch 140: loss = 0.1486 141 | Epoch 141: loss = 0.1481 142 | Epoch 142: loss = 0.1483 143 | Epoch 143: loss = 0.1475 144 | Epoch 144: loss = 0.1483 145 | Epoch 145: loss = 0.1483 146 | Epoch 146: loss = 0.1476 147 | Epoch 147: loss = 0.1477 148 | Epoch 148: loss = 0.1475 149 | Epoch 149: 
loss = 0.1473 150 | Epoch 150: loss = 0.1474 151 | Epoch 151: loss = 0.1469 152 | Epoch 152: loss = 0.1473 153 | Epoch 153: loss = 0.1472 154 | Epoch 154: loss = 0.1465 155 | Epoch 155: loss = 0.1467 156 | Epoch 156: loss = 0.1469 157 | Epoch 157: loss = 0.1466 158 | Epoch 158: loss = 0.1468 159 | Epoch 159: loss = 0.1459 160 | Epoch 160: loss = 0.1463 161 | Epoch 161: loss = 0.1461 162 | Epoch 162: loss = 0.1459 163 | Epoch 163: loss = 0.1461 164 | Epoch 164: loss = 0.1455 165 | Epoch 165: loss = 0.1458 166 | Epoch 166: loss = 0.1457 167 | Epoch 167: loss = 0.1455 168 | Epoch 168: loss = 0.1457 169 | Epoch 169: loss = 0.1452 170 | Epoch 170: loss = 0.1457 171 | Epoch 171: loss = 0.1451 172 | Epoch 172: loss = 0.1448 173 | Epoch 173: loss = 0.1445 174 | Epoch 174: loss = 0.1451 175 | Epoch 175: loss = 0.1451 176 | Epoch 176: loss = 0.1451 177 | Epoch 177: loss = 0.1446 178 | Epoch 178: loss = 0.1442 179 | Epoch 179: loss = 0.1452 180 | Epoch 180: loss = 0.1447 181 | Epoch 181: loss = 0.1445 182 | Epoch 182: loss = 0.1444 183 | Epoch 183: loss = 0.1440 184 | Epoch 184: loss = 0.1446 185 | Epoch 185: loss = 0.1442 186 | Epoch 186: loss = 0.1442 187 | Epoch 187: loss = 0.1441 188 | Epoch 188: loss = 0.1438 189 | Epoch 189: loss = 0.1441 190 | Epoch 190: loss = 0.1433 191 | Epoch 191: loss = 0.1436 192 | Epoch 192: loss = 0.1435 193 | Epoch 193: loss = 0.1431 194 | Epoch 194: loss = 0.1431 195 | Epoch 195: loss = 0.1431 196 | Epoch 196: loss = 0.1432 197 | Epoch 197: loss = 0.1434 198 | Epoch 198: loss = 0.1427 199 | Epoch 199: loss = 0.1429 200 | Epoch 200: loss = 0.1428 201 | Epoch 201: loss = 0.1425 202 | Epoch 202: loss = 0.1420 203 | Epoch 203: loss = 0.1431 204 | Epoch 204: loss = 0.1424 205 | Epoch 205: loss = 0.1422 206 | Epoch 206: loss = 0.1425 207 | Epoch 207: loss = 0.1426 208 | Epoch 208: loss = 0.1425 209 | Epoch 209: loss = 0.1419 210 | Epoch 210: loss = 0.1422 211 | Epoch 211: loss = 0.1420 212 | Epoch 212: loss = 0.1419 213 | Epoch 213: loss = 0.1418 214 | Epoch 214: loss = 0.1416 215 | Epoch 215: loss = 0.1415 216 | Epoch 216: loss = 0.1418 217 | Epoch 217: loss = 0.1414 218 | Epoch 218: loss = 0.1417 219 | Epoch 219: loss = 0.1418 220 | Epoch 220: loss = 0.1418 221 | Epoch 221: loss = 0.1414 222 | Epoch 222: loss = 0.1414 223 | Epoch 223: loss = 0.1414 224 | Epoch 224: loss = 0.1410 225 | Epoch 225: loss = 0.1410 226 | Epoch 226: loss = 0.1408 227 | Epoch 227: loss = 0.1409 228 | Epoch 228: loss = 0.1406 229 | Epoch 229: loss = 0.1409 230 | Epoch 230: loss = 0.1407 231 | Epoch 231: loss = 0.1406 232 | Epoch 232: loss = 0.1407 233 | Epoch 233: loss = 0.1412 234 | Epoch 234: loss = 0.1405 235 | Epoch 235: loss = 0.1398 236 | Epoch 236: loss = 0.1402 237 | Epoch 237: loss = 0.1405 238 | Epoch 238: loss = 0.1401 239 | Epoch 239: loss = 0.1401 240 | Epoch 240: loss = 0.1401 241 | Epoch 241: loss = 0.1402 242 | Epoch 242: loss = 0.1398 243 | Epoch 243: loss = 0.1400 244 | Epoch 244: loss = 0.1399 245 | Epoch 245: loss = 0.1395 246 | Epoch 246: loss = 0.1398 247 | Epoch 247: loss = 0.1391 248 | Epoch 248: loss = 0.1397 249 | Epoch 249: loss = 0.1391 250 | Epoch 250: loss = 0.1398 251 | Epoch 251: loss = 0.1394 252 | Epoch 252: loss = 0.1394 253 | Epoch 253: loss = 0.1400 254 | Epoch 254: loss = 0.1395 255 | Epoch 255: loss = 0.1396 256 | Epoch 256: loss = 0.1388 257 | Epoch 257: loss = 0.1391 258 | Epoch 258: loss = 0.1390 259 | Epoch 259: loss = 0.1392 260 | Epoch 260: loss = 0.1391 261 | Epoch 261: loss = 0.1390 262 | Epoch 262: loss = 0.1385 263 | Epoch 263: loss = 0.1383 264 | 
Epoch 264: loss = 0.1395 265 | Epoch 265: loss = 0.1386 266 | Epoch 266: loss = 0.1382 267 | Epoch 267: loss = 0.1387 268 | Epoch 268: loss = 0.1382 269 | Epoch 269: loss = 0.1384 270 | Epoch 270: loss = 0.1385 271 | Epoch 271: loss = 0.1382 272 | Epoch 272: loss = 0.1385 273 | Epoch 273: loss = 0.1380 274 | Epoch 274: loss = 0.1381 275 | Epoch 275: loss = 0.1385 276 | Epoch 276: loss = 0.1384 277 | Epoch 277: loss = 0.1381 278 | Epoch 278: loss = 0.1380 279 | Epoch 279: loss = 0.1382 280 | Epoch 280: loss = 0.1384 281 | Epoch 281: loss = 0.1376 282 | Epoch 282: loss = 0.1379 283 | Epoch 283: loss = 0.1379 284 | Epoch 284: loss = 0.1378 285 | Epoch 285: loss = 0.1379 286 | Epoch 286: loss = 0.1376 287 | Epoch 287: loss = 0.1373 288 | Epoch 288: loss = 0.1374 289 | Epoch 289: loss = 0.1375 290 | Epoch 290: loss = 0.1372 291 | Epoch 291: loss = 0.1378 292 | Epoch 292: loss = 0.1373 293 | Epoch 293: loss = 0.1375 294 | Epoch 294: loss = 0.1373 295 | Epoch 295: loss = 0.1375 296 | Epoch 296: loss = 0.1372 297 | Epoch 297: loss = 0.1372 298 | Epoch 298: loss = 0.1370 299 | Epoch 299: loss = 0.1367 300 | Epoch 300: loss = 0.1368 301 | -------------------------------------------------------------------------------- /checkpts/vocoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /checkpts/vocoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8192, 18 | "num_mels": 80, 19 | "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 256, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 22050, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /deploy/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/deploy/Dockerfile -------------------------------------------------------------------------------- /deploy/model_repository/vc_pipeline_python/1/model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import torch 4 | import triton_python_backend_utils as pb_utils 5 | 6 | class TritonPythonModel: 7 | 8 | def initialize(self, args): 9 | 10 | self.model_config = model_config = json.loads(args['model_config']) 11 | 12 | def execute(self, requests): 13 | responses = [] 14 | for request in requests: 15 | inp = pb_utils.get_input_tensor_by_name(request, "prompt") 16 | -------------------------------------------------------------------------------- /deploy/model_repository/vc_pipeline_python/1/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/deploy/model_repository/vc_pipeline_python/1/pipeline/__init__.py -------------------------------------------------------------------------------- /deploy/model_repository/vc_pipeline_python/config.pbtxt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/deploy/model_repository/vc_pipeline_python/config.pbtxt -------------------------------------------------------------------------------- /deploy/model_repository/vc_spk_encoder/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "vc_spk_encoder" 2 | platform: "onnxruntime_onnx" 3 | max_batch_size : 8 4 | version_policy: { 5 | specific: { 6 | versions: [1] 7 | } 8 | } 9 | input [ 10 | { 11 | name: "frame_input" 12 | data_type: TYPE_FP16 13 | dims: [-1, 160, 40] 14 | } 15 | ] 16 | output [ 17 | { 18 | name: "embed_output" 19 | data_type: TYPE_FP16 20 | dims: [-1, 256] 21 | } 22 | ] -------------------------------------------------------------------------------- /deploy/model_repository/vc_vocoder/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "vc_vocoder" 2 | platform: "onnxruntime_onnx" 3 | 
max_batch_size : 8 4 | version_policy: { 5 | specific: { 6 | versions: [1] 7 | } 8 | } 9 | input [ 10 | { 11 | name: "mel_input" 12 | data_type: TYPE_FP16 13 | dims: [-1, 80, -1] 14 | } 15 | ] 16 | output [ 17 | { 18 | name: "audio_output" 19 | data_type: TYPE_FP16 20 | dims: [-1,1,-1] 21 | } 22 | ] -------------------------------------------------------------------------------- /example/6415_111615_000012_000005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/example/6415_111615_000012_000005.wav -------------------------------------------------------------------------------- /example/8534_216567_000015_000010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/example/8534_216567_000015_000010.wav -------------------------------------------------------------------------------- /export_onnx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/export_onnx/__init__.py -------------------------------------------------------------------------------- /export_onnx/export_hifigan.py: -------------------------------------------------------------------------------- 1 | # vocoder 2 | 3 | import argparse 4 | import json 5 | import os 6 | import numpy as np 7 | import IPython.display as ipd 8 | from tqdm import tqdm 9 | from scipy.io.wavfile import write 10 | 11 | import torch 12 | use_gpu = torch.cuda.is_available() 13 | 14 | import librosa 15 | from librosa.core import load 16 | from librosa.filters import mel as librosa_mel_fn 17 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 18 | 19 | import params 20 | from model import DiffVC 21 | 22 | import sys 23 | sys.path.append('hifi-gan/') 24 | from env import AttrDict 25 | from models import Generator as HiFiGAN 26 | 27 | sys.path.append('speaker_encoder/') 28 | from encoder import inference as spk_encoder 29 | from pathlib import Path 30 | 31 | os.environ["CUDA_VISIBLE_DEVICES"]= "1" 32 | 33 | 34 | # loading HiFi-GAN vocoder 35 | hfg_path = 'checkpts/vocoder/' # HiFi-GAN path 36 | 37 | with open(hfg_path + 'config.json') as f: 38 | h = AttrDict(json.load(f)) 39 | 40 | if use_gpu: 41 | hifigan_universal = HiFiGAN(h).cuda() 42 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator')['generator']) 43 | else: 44 | hifigan_universal = HiFiGAN(h) 45 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', map_location='cpu')['generator']) 46 | 47 | _ = hifigan_universal.eval() 48 | hifigan_universal.remove_weight_norm() 49 | 50 | 51 | 52 | 53 | def convert_torch_to_onnx_batch(model, output_path, dummy_input, device=None): 54 | 55 | input_names = ["mel_input"] 56 | output_names = ["audio_output"] 57 | 58 | if device!=None: 59 | model = model.to(device) 60 | dummy_input = dummy_input.to(device) 61 | 62 | torch.onnx.export(model, 63 | dummy_input, 64 | output_path, 65 | verbose=True, 66 | input_names=input_names, 67 | output_names=output_names, 68 | dynamic_axes={'mel_input' : {0: 'batch_size', 2 : 'mel_leghths'}, # variable length axes 69 | 'audio_output' : {0:'batch_size', 2 : 'audio_lenghts'}}) 70 | 71 | device = torch.device('cuda') 72 | output_path = "hifigan.onnx" 73 | # dummy_input = mel_source 74 | dummy_input = 
torch.rand(2,80,200) 75 | dummy_output = torch.rand(2,1,124321) 76 | convert_torch_to_onnx_batch(hifigan_universal, output_path, dummy_input, device=device) 77 | 78 | print(device) -------------------------------------------------------------------------------- /export_onnx/export_spk_enc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import numpy as np 5 | import IPython.display as ipd 6 | from tqdm import tqdm 7 | from scipy.io.wavfile import write 8 | 9 | import torch 10 | use_gpu = torch.cuda.is_available() 11 | 12 | import librosa 13 | from librosa.core import load 14 | from librosa.filters import mel as librosa_mel_fn 15 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 16 | 17 | import params 18 | from model import DiffVC 19 | 20 | import sys 21 | # sys.path.append('hifi-gan/') 22 | # from env import AttrDict 23 | # from models import Generator as HiFiGAN 24 | 25 | sys.path.append('speaker_encoder/') 26 | from encoder import inference as spk_encoder 27 | from pathlib import Path 28 | 29 | os.environ["CUDA_VISIBLE_DEVICES"]= "1" 30 | 31 | from encoder.model import SpeakerEncoder 32 | 33 | 34 | weights_fpath = Path('checkpts/spk_encoder/pretrained.pt') 35 | _device = torch.device('cuda') 36 | 37 | _model = SpeakerEncoder(_device, torch.device("cpu")) 38 | checkpoint = torch.load(weights_fpath, map_location="cuda") 39 | _model.load_state_dict(checkpoint["model_state"]) 40 | _model.eval() 41 | 42 | def convert_torch_to_onnx_batch(model, output_path, dummy_input, device=None): 43 | 44 | input_names = ["frame_input"] 45 | output_names = ["embed_output"] 46 | 47 | if device!=None: 48 | model = model.to(device) 49 | dummy_input = dummy_input.to(device) 50 | 51 | torch.onnx.export(model, 52 | dummy_input, 53 | output_path, 54 | verbose=True, 55 | input_names=input_names, 56 | output_names=output_names, 57 | dynamic_axes={'frame_input' : {0: 'batch_size'}, # variable length axes 58 | 'embed_output' : {0:'batch_size'}}) 59 | print("hihi") 60 | device = torch.device('cuda') 61 | output_path = "spk_enc.onnx" 62 | # dummy_input = mel_source 63 | dummy_input = torch.rand(2, 10, 160, 40) 64 | dummpy_ouput = torch.rand(10,256) 65 | 66 | convert_torch_to_onnx_batch(_model, output_path, dummy_input, device=device) 67 | 68 | # print(device) 69 | 70 | 71 | -------------------------------------------------------------------------------- /export_onnx/onnx_check.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | 3 | # load model from onnx 4 | 5 | model = onnx.load('./path/to/onnx.onnx') 6 | 7 | # confirm model has valid schema 8 | onnx.checker.check_model(model) 9 | 10 | 11 | # Print a human readable representation of the graph 12 | onnx.helper.printable_graph(model.graph) -------------------------------------------------------------------------------- /filelists/exceptions_libritts.txt: -------------------------------------------------------------------------------- 1 | 1027_125147_000139_000000 2 | 7739_8592_000126_000000 3 | 1535_141642_000009_000000 4 | 1974_139741_000015_000001 5 | 8855_283242_000010_000000 6 | 7120_118112_000003_000000 7 | 5583_41919_000016_000001 8 | 1509_145742_000007_000000 9 | 1265_135635_000018_000000 10 | 176_122025_000000_000001 11 | 5655_46267_000030_000000 12 | 2012_139358_000006_000000 13 | 207_143321_000019_000000 14 | 176_123269_000011_000000 15 | 8699_291107_000027_000000 16 | 1638_84447_000036_000000 17 | 
1050_134119_000035_000002 18 | 4108_2777_000010_000000 19 | 6233_61741_000020_000000 20 | 4088_158079_000094_000000 21 | 1731_142320_000096_000000 22 | 2401_144485_000071_000001 23 | 1553_140048_000009_000000 24 | 3240_131232_000080_000000 25 | 7402_59171_000003_000000 26 | 8396_120280_000033_000001 27 | 8238_283452_000006_000000 28 | 166_352_000004_000000 29 | 589_146346_000020_000005 30 | 3513_163606_000046_000001 31 | 500_125123_000112_000000 32 | 4586_96498_000028_000000 33 | 7538_100045_000021_000001 34 | 1958_144503_000020_000001 35 | 3857_182315_000006_000000 36 | 176_122025_000017_000001 37 | 8063_274112_000077_000000 38 | 176_122025_000009_000001 39 | 6458_232057_000041_000001 40 | 7495_102612_000071_000000 41 | 4807_26852_000071_000000 42 | 78_369_000023_000000 43 | 14_212_000011_000009 44 | 4363_11049_000058_000000 45 | 3224_167024_000041_000000 46 | 16_122828_000022_000000 47 | 207_143321_000019_000001 48 | 1271_136861_000021_000000 49 | 339_132718_000019_000002 50 | 9023_296467_000008_000000 51 | 5660_101884_000031_000000 52 | 1845_145083_000010_000002 53 | 2060_150855_000024_000000 54 | 2045_158081_000020_000001 55 | 4088_158079_000088_000000 56 | 922_132300_000030_000001 57 | 5333_5083_000012_000011 58 | 6385_34655_000022_000000 59 | 340_124368_000004_000000 60 | 4044_9010_000022_000000 61 | 2204_131732_000017_000017 62 | 64_76974_000089_000000 63 | 2436_2476_000048_000000 64 | 4806_26894_000004_000000 65 | 28_12332_000061_000000 66 | 2531_156724_000012_000000 67 | 4957_30119_000014_000000 68 | 1182_134981_000027_000000 69 | 3540_163612_000169_000000 70 | 6104_58845_000020_000000 71 | 60_121082_000029_000000 72 | 4363_11049_000177_000000 73 | 1958_144503_000027_000000 74 | 7278_246956_000020_000000 75 | 2401_144485_000068_000001 76 | 1264_129805_000026_000000 77 | 6098_57836_000021_000000 78 | 2517_135227_000015_000004 79 | 4680_16026_000096_000000 80 | 4116_3582_000039_000001 81 | 7511_102420_000005_000001 82 | 4297_13009_000042_000000 83 | 307_127535_000033_000003 84 | 1841_179183_000009_000000 85 | 8770_295462_000051_000000 86 | 3513_7741_000060_000001 87 | 7800_283492_000025_000000 88 | 8479_276730_000026_000000 89 | 3728_105386_000010_000001 90 | 4800_73729_000026_000004 91 | 4427_20023_000004_000007 92 | 8063_274112_000030_000000 93 | 7145_87280_000100_000004 94 | 4243_187023_000016_000000 95 | 6643_67857_000002_000000 96 | 1885_136863_000025_000000 97 | 7120_118112_000035_000000 98 | 3513_163607_000044_000002 99 | 1553_140048_000001_000000 100 | 2053_138901_000004_000004 101 | 510_130101_000054_000000 102 | 899_126233_000033_000000 103 | 1265_135636_000067_000001 104 | 6981_70843_000127_000000 105 | 6104_58843_000033_000000 106 | 7078_271888_000015_000000 107 | 335_125945_000035_000001 108 | 1265_135635_000052_000000 109 | 5339_14134_000072_000000 110 | 7939_120318_000016_000000 111 | 6032_58192_000008_000000 112 | 1743_142912_000015_000001 113 | 839_130898_000020_000000 114 | 1121_176698_000015_000000 115 | 501_125128_000068_000000 116 | 7783_107486_000060_000001 117 | 806_124221_000037_000000 118 | 78_369_000043_000006 119 | 1731_142320_000053_000000 120 | 4837_285896_000040_000000 121 | 5622_44586_000017_000000 122 | 7991_102381_000019_000000 123 | 14_208_000021_000002 124 | 157_121907_000017_000000 125 | 4108_2777_000059_000000 126 | 14_212_000011_000004 127 | 3983_5331_000002_000000 128 | 497_125118_000079_000000 129 | 8699_291107_000302_000000 130 | 512_124520_000071_000008 131 | 7120_118112_000019_000000 132 | 78_369_000030_000009 133 | 954_130627_000038_000001 
134 | 6574_70756_000008_000007 135 | 4381_14897_000005_000006 136 | 6006_60489_000033_000005 137 | 4267_72637_000007_000000 138 | 5731_50776_000030_000001 139 | 2053_138901_000037_000001 140 | 2592_5341_000039_000000 141 | 1283_136983_000009_000000 142 | 1731_142320_000127_000000 143 | 1265_135635_000003_000000 144 | 1731_142320_000059_000000 145 | 6895_96175_000051_000000 146 | 1027_125147_000080_000000 147 | 8479_276730_000042_000000 148 | 1335_163935_000018_000001 149 | 1731_142320_000114_000000 150 | 374_180298_000028_000001 151 | 4088_158079_000154_000000 152 | 8875_293959_000083_000000 153 | 5876_8675_000009_000000 154 | 7665_104979_000053_000000 155 | 5968_55202_000071_000000 156 | 500_125123_000032_000000 157 | 1705_142318_000022_000000 158 | 4586_96498_000035_000001 159 | 6104_58845_000023_000000 160 | 7739_8592_000024_000000 161 | 2473_157859_000047_000004 162 | 249_121331_000003_000000 163 | 2012_139358_000012_000000 164 | 6104_58843_000080_000000 165 | 454_134728_000083_000000 166 | 6904_262305_000001_000000 167 | 1974_139741_000048_000001 168 | 549_126410_000049_000001 169 | 1603_139325_000039_000000 170 | 8770_295465_000020_000000 171 | 816_53638_000055_000000 172 | 6701_71404_000089_000000 173 | 78_369_000035_000003 174 | 4495_18533_000041_000000 175 | 2436_2477_000061_000001 176 | 118_47824_000109_000000 177 | 8479_276730_000034_000000 178 | 298_126791_000064_000000 179 | 8176_115047_000053_000004 180 | 7511_102419_000004_000001 181 | 1027_125140_000073_000000 182 | 5583_41259_000007_000005 183 | 8465_246947_000028_000000 184 | 4535_279856_000055_000000 185 | 6880_216547_000039_000000 186 | 2045_158081_000012_000000 187 | 1958_144503_000083_000000 188 | 1974_139742_000065_000000 189 | 576_129623_000056_000005 190 | 5519_39481_000017_000000 191 | 1027_125147_000045_000001 192 | 5304_55856_000010_000000 193 | 205_159056_000013_000000 194 | 337_123025_000026_000003 195 | 2368_157056_000070_000000 196 | 806_124221_000045_000000 197 | 2092_145709_000002_000001 198 | 14_212_000018_000001 199 | 1974_139742_000069_000000 200 | 1731_142320_000069_000000 201 | 211_122442_000144_000000 202 | 7945_112011_000069_000000 203 | 7000_83706_000006_000003 204 | 78_369_000065_000003 205 | 8190_284435_000073_000000 206 | 806_124221_000040_000000 207 | 1271_136861_000062_000000 208 | 2401_144485_000092_000000 -------------------------------------------------------------------------------- /filelists/exceptions_vctk.txt: -------------------------------------------------------------------------------- 1 | p234_280_mic2 2 | p234_122_mic2 3 | p234_010_mic2 4 | p234_097_mic2 5 | p234_304_mic2 6 | p234_124_mic2 7 | p234_075_mic2 8 | p234_318_mic2 9 | p234_125_mic2 10 | p234_355_mic2 11 | p234_157_mic2 12 | p234_089_mic2 13 | p234_062_mic2 14 | p234_317_mic2 15 | p234_279_mic2 16 | p234_094_mic2 17 | p234_199_mic2 18 | p234_272_mic2 19 | p234_054_mic2 20 | p234_083_mic2 21 | p234_336_mic2 22 | p234_030_mic2 23 | p234_091_mic2 24 | p234_055_mic2 25 | p234_191_mic2 26 | p234_258_mic2 27 | p234_038_mic2 28 | p234_035_mic2 29 | p234_346_mic2 30 | p234_222_mic2 31 | p234_200_mic2 32 | p234_173_mic2 33 | p234_262_mic2 34 | p234_334_mic2 35 | p234_253_mic2 36 | p234_241_mic2 37 | p234_139_mic2 38 | p234_316_mic2 39 | p234_099_mic2 40 | p234_207_mic2 41 | p234_325_mic2 42 | p234_093_mic2 43 | p234_118_mic2 44 | p234_194_mic2 45 | p234_006_mic2 46 | p234_155_mic2 47 | p234_259_mic2 48 | p234_081_mic2 49 | p234_063_mic2 50 | p234_046_mic2 51 | p234_177_mic2 52 | p234_024_mic2 53 | p234_213_mic2 54 | 
p234_333_mic2 55 | p234_189_mic2 56 | p234_236_mic2 57 | p234_135_mic2 58 | p234_228_mic2 59 | p234_005_mic2 60 | p234_108_mic2 61 | p234_257_mic2 62 | p234_100_mic2 63 | p234_179_mic2 64 | p234_309_mic2 65 | p234_165_mic2 66 | p234_040_mic2 67 | p234_074_mic2 68 | p234_181_mic2 69 | p234_242_mic2 70 | p234_170_mic2 71 | p234_327_mic2 72 | p234_013_mic2 73 | p234_132_mic2 74 | p234_204_mic2 75 | p234_342_mic2 76 | p234_056_mic2 77 | p234_111_mic2 78 | p234_095_mic2 79 | p234_031_mic2 80 | p234_275_mic2 81 | p234_137_mic2 82 | p234_130_mic2 83 | p234_245_mic2 84 | p234_290_mic2 85 | p234_129_mic2 86 | p234_288_mic2 87 | p234_221_mic2 88 | p234_019_mic2 89 | p234_043_mic2 90 | p234_077_mic2 91 | p234_050_mic2 92 | p234_350_mic2 93 | p234_167_mic2 94 | p234_273_mic2 95 | p234_294_mic2 96 | p234_187_mic2 97 | p234_156_mic2 98 | p234_266_mic2 99 | p234_254_mic2 100 | p234_227_mic2 101 | p360_262_mic2 102 | p234_303_mic2 103 | p234_295_mic2 104 | p234_032_mic2 105 | p234_025_mic2 106 | p234_003_mic2 107 | p234_328_mic2 108 | p234_291_mic2 109 | p234_016_mic2 110 | p234_322_mic2 111 | p234_248_mic2 112 | p234_102_mic2 113 | p234_356_mic2 114 | p234_087_mic2 115 | p234_012_mic2 116 | p234_270_mic2 117 | p234_104_mic2 118 | p234_073_mic2 119 | p234_209_mic2 120 | p234_026_mic2 121 | p234_205_mic2 122 | p234_017_mic2 123 | p234_343_mic2 124 | p234_086_mic2 125 | p234_212_mic2 126 | p234_027_mic2 127 | p234_018_mic2 128 | p234_105_mic2 129 | p234_249_mic2 130 | p234_311_mic2 131 | p234_041_mic2 132 | p234_326_mic2 133 | p234_123_mic2 134 | p234_329_mic2 135 | p234_299_mic2 136 | p234_296_mic2 137 | p234_171_mic2 138 | p234_263_mic2 139 | p234_216_mic2 140 | p234_321_mic2 141 | p234_090_mic2 142 | p234_069_mic2 143 | p234_282_mic2 144 | p234_117_mic2 145 | p234_286_mic2 146 | p234_233_mic2 147 | p234_214_mic2 148 | p234_047_mic2 149 | p234_022_mic2 150 | p234_106_mic2 151 | p234_239_mic2 152 | p234_219_mic2 153 | p234_133_mic2 154 | p234_353_mic2 155 | p234_052_mic2 156 | p234_277_mic2 157 | p234_208_mic2 158 | p234_033_mic2 159 | p234_186_mic2 160 | p234_256_mic2 161 | p234_064_mic2 162 | p234_140_mic2 163 | p234_354_mic2 164 | p234_182_mic2 165 | p234_240_mic2 166 | p234_298_mic2 167 | p234_127_mic2 168 | p234_071_mic2 169 | p234_034_mic2 170 | p234_324_mic2 171 | p234_175_mic2 172 | p234_308_mic2 173 | p234_159_mic2 174 | p234_152_mic2 175 | p234_183_mic2 176 | p234_079_mic2 177 | p234_053_mic2 178 | p234_112_mic2 179 | p234_072_mic2 180 | p234_176_mic2 181 | p234_323_mic2 182 | p234_285_mic2 183 | p234_314_mic2 184 | p234_349_mic2 185 | p234_115_mic2 186 | p234_061_mic2 187 | p234_174_mic2 188 | p234_060_mic2 189 | p234_110_mic2 190 | p234_224_mic2 191 | p234_229_mic2 192 | p234_261_mic2 193 | p234_250_mic2 194 | p234_188_mic2 195 | p234_310_mic2 196 | p234_276_mic2 197 | p234_202_mic2 198 | p234_265_mic2 199 | p234_169_mic2 200 | p234_339_mic2 201 | p234_193_mic2 202 | p234_168_mic2 203 | p234_274_mic2 204 | p234_082_mic2 205 | p234_029_mic2 206 | p234_210_mic2 207 | p234_068_mic2 208 | p234_107_mic2 209 | p234_340_mic2 210 | p234_301_mic2 211 | p234_103_mic2 212 | p234_048_mic2 213 | p234_058_mic2 214 | p234_185_mic2 215 | p234_120_mic2 216 | p234_218_mic2 217 | p234_001_mic2 218 | p234_237_mic2 219 | p234_154_mic2 220 | p234_161_mic2 221 | p234_109_mic2 222 | p234_143_mic2 223 | p234_085_mic2 224 | p234_180_mic2 225 | p234_057_mic2 226 | p234_009_mic2 227 | p234_198_mic2 228 | p234_313_mic2 229 | p234_195_mic2 230 | p234_348_mic2 231 | p234_306_mic2 232 | p234_337_mic2 233 | p234_178_mic2 234 | 
p234_243_mic2 235 | p234_044_mic2 236 | p234_347_mic2 237 | p234_359_mic2 238 | p234_126_mic2 239 | p234_002_mic2 240 | p234_023_mic2 241 | p234_246_mic2 242 | p234_039_mic2 243 | p234_092_mic2 244 | p234_096_mic2 245 | p234_315_mic2 246 | p234_147_mic2 247 | p234_004_mic2 248 | p234_358_mic2 249 | p234_160_mic2 250 | p234_217_mic2 251 | p234_164_mic2 252 | p234_149_mic2 253 | p234_289_mic2 254 | p234_252_mic2 255 | p234_020_mic2 256 | p234_021_mic2 257 | p234_172_mic2 258 | p234_244_mic2 259 | p234_113_mic2 260 | p234_264_mic2 261 | p234_153_mic2 262 | p234_220_mic2 263 | p234_247_mic2 264 | p234_360_mic2 265 | p234_101_mic2 266 | p234_338_mic2 267 | p234_225_mic2 268 | p234_284_mic2 269 | p234_302_mic2 270 | p234_260_mic2 271 | p234_145_mic2 272 | p234_144_mic2 273 | p234_190_mic2 274 | p234_235_mic2 275 | p234_320_mic2 276 | p234_098_mic2 277 | p234_138_mic2 278 | p234_226_mic2 279 | p234_345_mic2 280 | p234_197_mic2 281 | p234_331_mic2 282 | p234_271_mic2 283 | p234_230_mic2 284 | p234_119_mic2 285 | p234_335_mic2 286 | p234_344_mic2 287 | p234_341_mic2 288 | p234_148_mic2 289 | p234_059_mic2 290 | p234_307_mic2 291 | p323_011_mic2 292 | p234_114_mic2 293 | p234_319_mic2 294 | p234_116_mic2 295 | p234_008_mic2 296 | p234_166_mic2 297 | p234_361_mic2 298 | p234_231_mic2 299 | p234_076_mic2 300 | p234_015_mic2 301 | p234_070_mic2 302 | p234_158_mic2 303 | p234_131_mic2 304 | p234_088_mic2 305 | p234_142_mic2 306 | p234_080_mic2 307 | p234_121_mic2 308 | p234_192_mic2 309 | p234_312_mic2 310 | p234_234_mic2 311 | p234_281_mic2 312 | p234_162_mic2 313 | p234_268_mic2 314 | p234_352_mic2 315 | p234_028_mic2 316 | p234_049_mic2 317 | p234_293_mic2 318 | p234_151_mic2 319 | p234_196_mic2 320 | p234_037_mic2 321 | p234_042_mic2 322 | p234_201_mic2 323 | p234_332_mic2 324 | p234_067_mic2 325 | p234_292_mic2 326 | p234_146_mic2 327 | p234_223_mic2 328 | p234_287_mic2 329 | p234_141_mic2 330 | p234_203_mic2 331 | p234_211_mic2 332 | p234_136_mic2 333 | p234_036_mic2 334 | p234_150_mic2 335 | p234_255_mic2 336 | p234_134_mic2 337 | p234_128_mic2 338 | p234_238_mic2 339 | p234_014_mic2 340 | p234_297_mic2 341 | p234_278_mic2 342 | p234_184_mic2 343 | p234_267_mic2 344 | p234_330_mic2 345 | p234_251_mic2 346 | p234_066_mic2 347 | p234_351_mic2 348 | p234_084_mic2 349 | p234_051_mic2 350 | p234_300_mic2 351 | p234_232_mic2 352 | p234_045_mic2 353 | p234_283_mic2 354 | p234_305_mic2 355 | p234_065_mic2 356 | p234_007_mic2 357 | p234_357_mic2 358 | p234_269_mic2 359 | p234_163_mic2 -------------------------------------------------------------------------------- /filelists/valid.txt: -------------------------------------------------------------------------------- 1 | 240_144999_000031_000000 2 | 240_160592_000061_000000 3 | 240_160593_000049_000000 4 | 240_144999_000033_000000 5 | 240_160592_000063_000000 6 | 240_160593_000050_000000 7 | 240_144999_000036_000000 8 | 240_160592_000073_000000 9 | 240_160593_000052_000000 10 | 240_144999_000038_000000 11 | 240_160592_000075_000000 12 | 240_160593_000054_000000 13 | 240_144999_000042_000000 14 | 240_160592_000077_000000 15 | 240_160593_000057_000000 16 | 4133_6541_000001_000001 17 | 4133_6541_000019_000004 18 | 4133_6541_000031_000001 19 | 4133_6541_000047_000003 20 | 4133_6541_000002_000000 21 | 4133_6541_000019_000005 22 | 4133_6541_000032_000000 23 | 4133_6541_000047_000004 24 | 4133_6541_000004_000000 25 | 4133_6541_000020_000000 26 | 4133_6541_000033_000000 27 | 4133_6541_000049_000000 28 | 479_107479_000011_000000 29 | 479_107479_000043_000002 30 | 
479_107480_000017_000002 31 | 479_126480_000009_000000 32 | 479_107479_000013_000002 33 | 479_107479_000044_000000 34 | 479_107480_000017_000005 35 | 479_126480_000011_000000 36 | 479_107479_000014_000001 37 | 479_107479_000045_000001 38 | 479_107480_000017_000006 39 | 479_126480_000014_000000 40 | 5093_29101_000019_000005 41 | 5093_29101_000041_000000 42 | 5093_39749_000007_000004 43 | 5093_26496_000002_000012 44 | 5093_29101_000020_000000 45 | 5093_29101_000042_000000 46 | 5093_39749_000007_000005 47 | 5093_26496_000002_000013 48 | 5093_29101_000020_000001 49 | 5093_29101_000042_000001 50 | 5093_39749_000007_000007 51 | 5339_14133_000018_000004 52 | 5339_14134_000012_000000 53 | 5339_14134_000042_000002 54 | 5339_14134_000091_000008 55 | 5339_14133_000018_000006 56 | 5339_14134_000013_000000 57 | 5339_14134_000043_000000 58 | 5339_14134_000091_000009 59 | 5339_14133_000018_000007 60 | 5339_14134_000013_000001 61 | 5339_14134_000047_000000 62 | 5339_14134_000091_000010 63 | 5660_101883_000012_000000 64 | 5660_101884_000021_000000 65 | 5660_101892_000029_000003 66 | 5660_101883_000013_000000 67 | 5660_101884_000021_000002 68 | 5660_101892_000030_000000 69 | 5660_101883_000015_000000 70 | 5660_101884_000022_000000 71 | 5660_101892_000031_000001 72 | 5808_48608_000005_000001 73 | 5808_54425_000010_000005 74 | 5808_54425_000029_000000 75 | 5808_54425_000056_000000 76 | 5808_48608_000005_000003 77 | 5808_54425_000010_000006 78 | 5808_54425_000029_000002 79 | 5808_54425_000058_000000 80 | 5808_48608_000005_000004 81 | 5808_54425_000010_000008 82 | 5808_54425_000029_000003 83 | 5808_54425_000059_000000 84 | 7789_103120_000032_000004 85 | 7789_103120_000065_000000 86 | 7789_258266_000018_000000 87 | 7789_103120_000033_000000 88 | 7789_103120_000065_000001 89 | 7789_258266_000019_000001 90 | 7789_103120_000034_000000 91 | 7789_103120_000066_000000 92 | 7789_258266_000021_000000 93 | 7832_114468_000017_000001 94 | 7832_114468_000042_000003 95 | 7832_258250_000015_000012 96 | 7832_114468_000017_000002 97 | 7832_114468_000042_000004 98 | 7832_258250_000015_000019 99 | 7832_114468_000017_000003 100 | 7832_114468_000042_000005 101 | 7832_258250_000015_000020 102 | 8797_294123_000011_000005 103 | 8797_294123_000027_000001 104 | 8797_294123_000036_000005 105 | 8797_294123_000011_000007 106 | 8797_294123_000027_000002 107 | 8797_294123_000036_000006 108 | 8797_294123_000012_000005 109 | 8797_294123_000027_000003 110 | 8797_294123_000036_000007 -------------------------------------------------------------------------------- /hifi-gan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /hifi-gan/README.md: -------------------------------------------------------------------------------- 1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis 2 | 3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae 4 | 5 | In our [paper](https://arxiv.org/abs/2010.05646), 6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository. 8 | 9 | **Abstract :** 10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. 11 | Although such methods improve the sampling efficiency and memory usage, 12 | their sample quality has not yet reached that of autoregressive and flow-based generative models. 13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. 14 | As speech audio consists of sinusoidal signals with various periods, 15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. 16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method 17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than 18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen 19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times 20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart. 21 | 22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. 23 | 24 | 25 | ## Pre-requisites 26 | 1. Python >= 3.6 27 | 2. Clone this repository. 28 | 3. Install python requirements. Please refer [requirements.txt](requirements.txt) 29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). 30 | And move all wav files to `LJSpeech-1.1/wavs` 31 | 32 | 33 | ## Training 34 | ``` 35 | python train.py --config config_v1.json 36 | ``` 37 | To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
38 | Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.
39 | You can change the path by adding `--checkpoint_path` option. 40 | 41 | Validation loss during training with V1 generator.
42 | ![validation loss](./validation_loss.png) 43 | 44 | ## Pretrained Model 45 | You can also use pretrained models we provide.
46 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
47 | Details of each folder are as follows: 48 | 49 | |Folder Name|Generator|Dataset|Fine-Tuned| 50 | |------|---|---|---| 51 | |LJ_V1|V1|LJSpeech|No| 52 | |LJ_V2|V2|LJSpeech|No| 53 | |LJ_V3|V3|LJSpeech|No| 54 | |LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 55 | |LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 56 | |LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 57 | |VCTK_V1|V1|VCTK|No| 58 | |VCTK_V2|V2|VCTK|No| 59 | |VCTK_V3|V3|VCTK|No| 60 | |UNIVERSAL_V1|V1|Universal|No| 61 | 62 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. 63 | 64 | ## Fine-Tuning 65 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
66 | The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.
67 | Example: 68 | ``` 69 | Audio File : LJ001-0001.wav 70 | Mel-Spectrogram File : LJ001-0001.npy 71 | ``` 72 | 2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.
73 | 3. Run the following command. 74 | ``` 75 | python train.py --fine_tuning True --config config_v1.json 76 | ``` 77 | For other command line options, please refer to the training section. 78 | 79 | 80 | ## Inference from wav file 81 | 1. Make `test_files` directory and copy wav files into the directory. 82 | 2. Run the following command. 83 | ``` 84 | python inference.py --checkpoint_file [generator checkpoint file path] 85 | ``` 86 | Generated wav files are saved in `generated_files` by default.
87 | You can change the path by adding `--output_dir` option. 88 | 89 | 90 | ## Inference for end-to-end speech synthesis 91 | 1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.
92 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), 93 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. 94 | 2. Run the following command. 95 | ``` 96 | python inference_e2e.py --checkpoint_file [generator checkpoint file path] 97 | ``` 98 | Generated wav files are saved in `generated_files_from_mel` by default.
99 | You can change the path by adding `--output_dir` option. 100 | 101 | 102 | ## Acknowledgements 103 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) 104 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. 105 | 106 | -------------------------------------------------------------------------------- /hifi-gan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super(AttrDict, self).__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /hifi-gan/meldataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | import torch 7 | import torch.utils.data 8 | import numpy as np 9 | from librosa.util import normalize 10 | from scipy.io.wavfile import read 11 | from librosa.filters import mel as librosa_mel_fn 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | 16 | def load_wav(full_path): 17 | sampling_rate, data = read(full_path) 18 | return data, sampling_rate 19 | 20 | 21 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 22 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 23 | 24 | 25 | def dynamic_range_decompression(x, C=1): 26 | return np.exp(x) / C 27 | 28 | 29 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 30 | return torch.log(torch.clamp(x, min=clip_val) * C) 31 | 32 | 33 | def dynamic_range_decompression_torch(x, C=1): 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 52 | if torch.min(y) < -1.: 53 | print('min value is ', torch.min(y)) 54 | if torch.max(y) > 1.: 55 | print('max value is ', torch.max(y)) 56 | 57 | global mel_basis, hann_window 58 | if fmax not in mel_basis: 59 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 60 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 61 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True) 68 | 69 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 70 | 71 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 72 | spec = spectral_normalize_torch(spec) 73 | 74 | return spec 75 | 76 | 77 | def get_dataset_filelist(a): 78 | with open(a.input_training_file, 'r', 
encoding='utf-8') as fi: 79 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 80 | for x in fi.read().split('\n') if len(x) > 0] 81 | 82 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 83 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 84 | for x in fi.read().split('\n') if len(x) > 0] 85 | return training_files, validation_files 86 | 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, training_files, segment_size, n_fft, num_mels, 90 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 91 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 92 | self.audio_files = training_files 93 | random.seed(1234) 94 | if shuffle: 95 | random.shuffle(self.audio_files) 96 | self.segment_size = segment_size 97 | self.sampling_rate = sampling_rate 98 | self.split = split 99 | self.n_fft = n_fft 100 | self.num_mels = num_mels 101 | self.hop_size = hop_size 102 | self.win_size = win_size 103 | self.fmin = fmin 104 | self.fmax = fmax 105 | self.fmax_loss = fmax_loss 106 | self.cached_wav = None 107 | self.n_cache_reuse = n_cache_reuse 108 | self._cache_ref_count = 0 109 | self.device = device 110 | self.fine_tuning = fine_tuning 111 | self.base_mels_path = base_mels_path 112 | 113 | def __getitem__(self, index): 114 | filename = self.audio_files[index] 115 | if self._cache_ref_count == 0: 116 | audio, sampling_rate = load_wav(filename) 117 | audio = audio / MAX_WAV_VALUE 118 | if not self.fine_tuning: 119 | audio = normalize(audio) * 0.95 120 | self.cached_wav = audio 121 | if sampling_rate != self.sampling_rate: 122 | raise ValueError("{} SR doesn't match target {} SR".format( 123 | sampling_rate, self.sampling_rate)) 124 | self._cache_ref_count = self.n_cache_reuse 125 | else: 126 | audio = self.cached_wav 127 | self._cache_ref_count -= 1 128 | 129 | audio = torch.FloatTensor(audio) 130 | audio = audio.unsqueeze(0) 131 | 132 | if not self.fine_tuning: 133 | if self.split: 134 | if audio.size(1) >= self.segment_size: 135 | max_audio_start = audio.size(1) - self.segment_size 136 | audio_start = random.randint(0, max_audio_start) 137 | audio = audio[:, audio_start:audio_start+self.segment_size] 138 | else: 139 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 140 | 141 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 142 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 143 | center=False) 144 | else: 145 | mel = np.load( 146 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 147 | mel = torch.from_numpy(mel) 148 | 149 | if len(mel.shape) < 3: 150 | mel = mel.unsqueeze(0) 151 | 152 | if self.split: 153 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 154 | 155 | if audio.size(1) >= self.segment_size: 156 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 157 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 158 | audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 159 | else: 160 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant') 161 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 162 | 163 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 164 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 165 | center=False) 166 | 167 | return 
(mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 168 | 169 | def __len__(self): 170 | return len(self.audio_files) 171 | -------------------------------------------------------------------------------- /hifi-gan/models.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import torch.nn as nn 6 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 7 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 8 | from xutils import init_weights, get_padding 9 | 10 | LRELU_SLOPE = 0.1 11 | 12 | 13 | class ResBlock1(torch.nn.Module): 14 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 15 | super(ResBlock1, self).__init__() 16 | self.h = h 17 | self.convs1 = nn.ModuleList([ 18 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 19 | padding=get_padding(kernel_size, dilation[0]))), 20 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 21 | padding=get_padding(kernel_size, dilation[1]))), 22 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 23 | padding=get_padding(kernel_size, dilation[2]))) 24 | ]) 25 | self.convs1.apply(init_weights) 26 | 27 | self.convs2 = nn.ModuleList([ 28 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 29 | padding=get_padding(kernel_size, 1))), 30 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 31 | padding=get_padding(kernel_size, 1))), 32 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 33 | padding=get_padding(kernel_size, 1))) 34 | ]) 35 | self.convs2.apply(init_weights) 36 | 37 | def forward(self, x): 38 | for c1, c2 in zip(self.convs1, self.convs2): 39 | xt = F.leaky_relu(x, LRELU_SLOPE) 40 | xt = c1(xt) 41 | xt = F.leaky_relu(xt, LRELU_SLOPE) 42 | xt = c2(xt) 43 | x = xt + x 44 | return x 45 | 46 | def remove_weight_norm(self): 47 | for l in self.convs1: 48 | remove_weight_norm(l) 49 | for l in self.convs2: 50 | remove_weight_norm(l) 51 | 52 | 53 | class ResBlock2(torch.nn.Module): 54 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 55 | super(ResBlock2, self).__init__() 56 | self.h = h 57 | self.convs = nn.ModuleList([ 58 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 59 | padding=get_padding(kernel_size, dilation[0]))), 60 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 61 | padding=get_padding(kernel_size, dilation[1]))) 62 | ]) 63 | self.convs.apply(init_weights) 64 | 65 | def forward(self, x): 66 | for c in self.convs: 67 | xt = F.leaky_relu(x, LRELU_SLOPE) 68 | xt = c(xt) 69 | x = xt + x 70 | return x 71 | 72 | def remove_weight_norm(self): 73 | for l in self.convs: 74 | remove_weight_norm(l) 75 | 76 | 77 | class Generator(torch.nn.Module): 78 | def __init__(self, h): 79 | super(Generator, self).__init__() 80 | self.h = h 81 | self.num_kernels = len(h.resblock_kernel_sizes) 82 | self.num_upsamples = len(h.upsample_rates) 83 | self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) 84 | resblock = ResBlock1 if h.resblock == '1' else ResBlock2 85 | 86 | self.ups = nn.ModuleList() 87 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 88 | self.ups.append(weight_norm( 89 | ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), 90 | k, u, 
padding=(k-u)//2))) 91 | 92 | self.resblocks = nn.ModuleList() 93 | for i in range(len(self.ups)): 94 | ch = h.upsample_initial_channel//(2**(i+1)) 95 | for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): 96 | self.resblocks.append(resblock(h, ch, k, d)) 97 | 98 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 99 | self.ups.apply(init_weights) 100 | self.conv_post.apply(init_weights) 101 | 102 | def forward(self, x): 103 | x = self.conv_pre(x) 104 | for i in range(self.num_upsamples): 105 | x = F.leaky_relu(x, LRELU_SLOPE) 106 | x = self.ups[i](x) 107 | xs = None 108 | for j in range(self.num_kernels): 109 | if xs is None: 110 | xs = self.resblocks[i*self.num_kernels+j](x) 111 | else: 112 | xs += self.resblocks[i*self.num_kernels+j](x) 113 | x = xs / self.num_kernels 114 | x = F.leaky_relu(x) 115 | x = self.conv_post(x) 116 | x = torch.tanh(x) 117 | 118 | return x 119 | 120 | def remove_weight_norm(self): 121 | print('Removing weight norm...') 122 | for l in self.ups: 123 | remove_weight_norm(l) 124 | for l in self.resblocks: 125 | l.remove_weight_norm() 126 | remove_weight_norm(self.conv_pre) 127 | remove_weight_norm(self.conv_post) 128 | 129 | 130 | class DiscriminatorP(torch.nn.Module): 131 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 132 | super(DiscriminatorP, self).__init__() 133 | self.period = period 134 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 135 | self.convs = nn.ModuleList([ 136 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 137 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 138 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 139 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 140 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 141 | ]) 142 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 143 | 144 | def forward(self, x): 145 | fmap = [] 146 | 147 | # 1d to 2d 148 | b, c, t = x.shape 149 | if t % self.period != 0: # pad first 150 | n_pad = self.period - (t % self.period) 151 | x = F.pad(x, (0, n_pad), "reflect") 152 | t = t + n_pad 153 | x = x.view(b, c, t // self.period, self.period) 154 | 155 | for l in self.convs: 156 | x = l(x) 157 | x = F.leaky_relu(x, LRELU_SLOPE) 158 | fmap.append(x) 159 | x = self.conv_post(x) 160 | fmap.append(x) 161 | x = torch.flatten(x, 1, -1) 162 | 163 | return x, fmap 164 | 165 | 166 | class MultiPeriodDiscriminator(torch.nn.Module): 167 | def __init__(self): 168 | super(MultiPeriodDiscriminator, self).__init__() 169 | self.discriminators = nn.ModuleList([ 170 | DiscriminatorP(2), 171 | DiscriminatorP(3), 172 | DiscriminatorP(5), 173 | DiscriminatorP(7), 174 | DiscriminatorP(11), 175 | ]) 176 | 177 | def forward(self, y, y_hat): 178 | y_d_rs = [] 179 | y_d_gs = [] 180 | fmap_rs = [] 181 | fmap_gs = [] 182 | for i, d in enumerate(self.discriminators): 183 | y_d_r, fmap_r = d(y) 184 | y_d_g, fmap_g = d(y_hat) 185 | y_d_rs.append(y_d_r) 186 | fmap_rs.append(fmap_r) 187 | y_d_gs.append(y_d_g) 188 | fmap_gs.append(fmap_g) 189 | 190 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 191 | 192 | 193 | class DiscriminatorS(torch.nn.Module): 194 | def __init__(self, use_spectral_norm=False): 195 | super(DiscriminatorS, self).__init__() 196 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 197 | self.convs = nn.ModuleList([ 
198 | norm_f(Conv1d(1, 128, 15, 1, padding=7)), 199 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 200 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 201 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 202 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 203 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 204 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 205 | ]) 206 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 207 | 208 | def forward(self, x): 209 | fmap = [] 210 | for l in self.convs: 211 | x = l(x) 212 | x = F.leaky_relu(x, LRELU_SLOPE) 213 | fmap.append(x) 214 | x = self.conv_post(x) 215 | fmap.append(x) 216 | x = torch.flatten(x, 1, -1) 217 | 218 | return x, fmap 219 | 220 | 221 | class MultiScaleDiscriminator(torch.nn.Module): 222 | def __init__(self): 223 | super(MultiScaleDiscriminator, self).__init__() 224 | self.discriminators = nn.ModuleList([ 225 | DiscriminatorS(use_spectral_norm=True), 226 | DiscriminatorS(), 227 | DiscriminatorS(), 228 | ]) 229 | self.meanpools = nn.ModuleList([ 230 | AvgPool1d(4, 2, padding=2), 231 | AvgPool1d(4, 2, padding=2) 232 | ]) 233 | 234 | def forward(self, y, y_hat): 235 | y_d_rs = [] 236 | y_d_gs = [] 237 | fmap_rs = [] 238 | fmap_gs = [] 239 | for i, d in enumerate(self.discriminators): 240 | if i != 0: 241 | y = self.meanpools[i-1](y) 242 | y_hat = self.meanpools[i-1](y_hat) 243 | y_d_r, fmap_r = d(y) 244 | y_d_g, fmap_g = d(y_hat) 245 | y_d_rs.append(y_d_r) 246 | fmap_rs.append(fmap_r) 247 | y_d_gs.append(y_d_g) 248 | fmap_gs.append(fmap_g) 249 | 250 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 251 | 252 | 253 | def feature_loss(fmap_r, fmap_g): 254 | loss = 0 255 | for dr, dg in zip(fmap_r, fmap_g): 256 | for rl, gl in zip(dr, dg): 257 | loss += torch.mean(torch.abs(rl - gl)) 258 | 259 | return loss*2 260 | 261 | 262 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 263 | loss = 0 264 | r_losses = [] 265 | g_losses = [] 266 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 267 | r_loss = torch.mean((1-dr)**2) 268 | g_loss = torch.mean(dg**2) 269 | loss += (r_loss + g_loss) 270 | r_losses.append(r_loss.item()) 271 | g_losses.append(g_loss.item()) 272 | 273 | return loss, r_losses, g_losses 274 | 275 | 276 | def generator_loss(disc_outputs): 277 | loss = 0 278 | gen_losses = [] 279 | for dg in disc_outputs: 280 | l = torch.mean((1-dg)**2) 281 | gen_losses.append(l) 282 | loss += l 283 | 284 | return loss, gen_losses 285 | 286 | -------------------------------------------------------------------------------- /hifi-gan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | import matplotlib 6 | import torch 7 | from torch.nn.utils import weight_norm 8 | matplotlib.use("Agg") 9 | import matplotlib.pylab as plt 10 | 11 | 12 | def plot_spectrogram(spectrogram): 13 | fig, ax = plt.subplots(figsize=(10, 2)) 14 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 15 | interpolation='none') 16 | plt.colorbar(im, ax=ax) 17 | 18 | fig.canvas.draw() 19 | plt.close() 20 | 21 | return fig 22 | 23 | 24 | def init_weights(m, mean=0.0, std=0.01): 25 | classname = m.__class__.__name__ 26 | if classname.find("Conv") != -1: 27 | m.weight.data.normal_(mean, std) 28 | 29 | 30 | def apply_weight_norm(m): 31 | classname = m.__class__.__name__ 32 | if classname.find("Conv") != -1: 33 | weight_norm(m) 34 | 35 | 36 | def 
get_padding(kernel_size, dilation=1): 37 | return int((kernel_size*dilation - dilation)/2) 38 | 39 | 40 | def load_checkpoint(filepath, device): 41 | assert os.path.isfile(filepath) 42 | print("Loading '{}'".format(filepath)) 43 | checkpoint_dict = torch.load(filepath, map_location=device) 44 | print("Complete.") 45 | return checkpoint_dict 46 | 47 | 48 | def save_checkpoint(filepath, obj): 49 | print("Saving checkpoint to {}".format(filepath)) 50 | torch.save(obj, filepath) 51 | print("Complete.") 52 | 53 | 54 | def scan_checkpoint(cp_dir, prefix): 55 | pattern = os.path.join(cp_dir, prefix + '????????') 56 | cp_list = glob.glob(pattern) 57 | if len(cp_list) == 0: 58 | return None 59 | return sorted(cp_list)[-1] 60 | 61 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import numpy as np 5 | from tqdm import tqdm 6 | import soundfile as sf 7 | import torch 8 | use_gpu = torch.cuda.is_available() 9 | 10 | import librosa 11 | from librosa.core import load 12 | from librosa.filters import mel as librosa_mel_fn 13 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 14 | 15 | import params 16 | from model import DiffVC 17 | 18 | import sys 19 | sys.path.append('hifi-gan/') 20 | from env import AttrDict 21 | from models import Generator as HiFiGAN 22 | 23 | sys.path.append('speaker_encoder/') 24 | from encoder import inference as spk_encoder 25 | from pathlib import Path 26 | 27 | 28 | class Inferencer(): 29 | def __init__(self, generator, spk_encoder, hifigan_universal, output_path="./output_demo", use_gpu=False): 30 | 31 | self.generator = generator 32 | self.spk_encoder = spk_encoder 33 | self.hifigan_universal = hifigan_universal 34 | # if not os.path.isdir(output_path): 35 | # os.makedirs(output_path) 36 | 37 | self.output_path = output_path 38 | 39 | self.use_gpu = use_gpu 40 | 41 | 42 | def get_mel(self, wav_path): 43 | wav, _ = load(wav_path, sr=22050) 44 | wav = wav[:(wav.shape[0] // 256)*256] 45 | wav = np.pad(wav, 384, mode='reflect') 46 | stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False) 47 | stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9)) 48 | mel_spectrogram = np.matmul(mel_basis, stftm) 49 | log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None)) 50 | return log_mel_spectrogram 51 | 52 | def get_embed(self, wav_path): 53 | wav_preprocessed = spk_encoder.preprocess_wav(wav_path) 54 | embed = spk_encoder.embed_utterance(wav_preprocessed) 55 | return embed 56 | 57 | def noise_median_smoothing(self, x, w=5): 58 | y = np.copy(x) 59 | x = np.pad(x, w, "edge") 60 | for i in range(y.shape[0]): 61 | med = np.median(x[i:i+2*w+1]) 62 | y[i] = min(x[i+w+1], med) 63 | return y 64 | 65 | def mel_spectral_subtraction(self, mel_synth, mel_source, spectral_floor=0.02, silence_window=5, smoothing_window=5): 66 | mel_len = mel_source.shape[-1] 67 | energy_min = 100000.0 68 | i_min = 0 69 | for i in range(mel_len - silence_window): 70 | energy_cur = np.sum(np.exp(2.0 * mel_source[:, i:i+silence_window])) 71 | if energy_cur < energy_min: 72 | i_min = i 73 | energy_min = energy_cur 74 | estimated_noise_energy = np.min(np.exp(2.0 * mel_synth[:, i_min:i_min+silence_window]), axis=-1) 75 | if smoothing_window is not None: 76 | estimated_noise_energy = self.noise_median_smoothing(estimated_noise_energy, smoothing_window) 77 | 
mel_denoised = np.copy(mel_synth) 78 | for i in range(mel_len): 79 | signal_subtract_noise = np.exp(2.0 * mel_synth[:, i]) - estimated_noise_energy 80 | estimated_signal_energy = np.maximum(signal_subtract_noise, spectral_floor * estimated_noise_energy) 81 | mel_denoised[:, i] = np.log(np.sqrt(estimated_signal_energy)) 82 | return mel_denoised 83 | 84 | 85 | def infer(self, src_path, tgt_path, n_timesteps=30, return_output_path=False, sr=16000): 86 | 87 | source_basename = os.path.basename(src_path).split('.wav')[0] 88 | target_basename = os.path.basename(tgt_path).split('.wav')[0] 89 | output_basename = f'{source_basename}_to_{target_basename}' 90 | output_wav = os.path.join(self.output_path, output_basename+'.wav') 91 | 92 | mel_source = torch.from_numpy(self.get_mel(src_path)).float().unsqueeze(0) 93 | if self.use_gpu: 94 | mel_source = mel_source.cuda() 95 | mel_source_lengths = torch.LongTensor([mel_source.shape[-1]]) 96 | if self.use_gpu: 97 | mel_source_lengths = mel_source_lengths.cuda() 98 | 99 | mel_target = torch.from_numpy(self.get_mel(tgt_path)).float().unsqueeze(0) 100 | if self.use_gpu: 101 | mel_target = mel_target.cuda() 102 | mel_target_lengths = torch.LongTensor([mel_target.shape[-1]]) 103 | if self.use_gpu: 104 | mel_target_lengths = mel_target_lengths.cuda() 105 | 106 | embed_target = torch.from_numpy(self.get_embed(tgt_path)).float().unsqueeze(0) 107 | if self.use_gpu: 108 | embed_target = embed_target.cuda() 109 | 110 | 111 | # performing voice conversion 112 | mel_encoded, mel_ = self.generator.forward(mel_source, mel_source_lengths, mel_target, mel_target_lengths, embed_target, 113 | n_timesteps=n_timesteps, mode='ml') 114 | mel_synth_np = mel_.cpu().detach().squeeze().numpy() 115 | mel_source_np = mel_.cpu().detach().squeeze().numpy() 116 | mel = torch.from_numpy(self.mel_spectral_subtraction(mel_synth_np, mel_source_np, smoothing_window=1)).float().unsqueeze(0) 117 | if self.use_gpu: 118 | mel = mel.cuda() 119 | 120 | with torch.no_grad(): 121 | audio = self.hifigan_universal.forward(mel).cpu().squeeze().clamp(-1, 1) 122 | print(audio.shape) 123 | sf.write(f'{output_wav}', audio, sr) 124 | 125 | if return_output_path: 126 | return output_wav 127 | else: 128 | return audio 129 | 130 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | from .vc import DiffVC -------------------------------------------------------------------------------- /model/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
8 | 9 | import numpy as np 10 | import torch 11 | 12 | 13 | class BaseModule(torch.nn.Module): 14 | def __init__(self): 15 | super(BaseModule, self).__init__() 16 | 17 | @property 18 | def nparams(self): 19 | num_params = 0 20 | for name, param in self.named_parameters(): 21 | if param.requires_grad: 22 | num_params += np.prod(param.detach().cpu().numpy().shape) 23 | return num_params 24 | 25 | 26 | def relocate_input(self, x: list): 27 | device = next(self.parameters()).device 28 | for i in range(len(x)): 29 | if isinstance(x[i], torch.Tensor) and x[i].device != device: 30 | x[i] = x[i].to(device) 31 | return x 32 | -------------------------------------------------------------------------------- /model/diffusion.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import math 10 | import torch 11 | 12 | from model.base import BaseModule 13 | from model.modules import Mish, Upsample, Downsample, Rezero, Block, ResnetBlock 14 | from model.modules import LinearAttention, Residual, SinusoidalPosEmb, RefBlock 15 | 16 | 17 | class GradLogPEstimator(BaseModule): 18 | def __init__(self, dim_base, dim_cond, use_ref_t, dim_mults=(1, 2, 4)): 19 | super(GradLogPEstimator, self).__init__() 20 | self.use_ref_t = use_ref_t 21 | dims = [2 + dim_cond, *map(lambda m: dim_base * m, dim_mults)] 22 | in_out = list(zip(dims[:-1], dims[1:])) 23 | 24 | self.time_pos_emb = SinusoidalPosEmb(dim_base) 25 | self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_base, dim_base * 4), 26 | Mish(), torch.nn.Linear(dim_base * 4, dim_base)) 27 | 28 | cond_total = dim_base + 256 29 | if use_ref_t: 30 | self.ref_block = RefBlock(out_dim=dim_cond, time_emb_dim=dim_base) 31 | cond_total += dim_cond 32 | self.cond_block = torch.nn.Sequential(torch.nn.Linear(cond_total, 4 * dim_cond), 33 | Mish(), torch.nn.Linear(4 * dim_cond, dim_cond)) 34 | 35 | self.downs = torch.nn.ModuleList([]) 36 | self.ups = torch.nn.ModuleList([]) 37 | num_resolutions = len(in_out) 38 | 39 | for ind, (dim_in, dim_out) in enumerate(in_out): 40 | is_last = ind >= (num_resolutions - 1) 41 | self.downs.append(torch.nn.ModuleList([ 42 | ResnetBlock(dim_in, dim_out, time_emb_dim=dim_base), 43 | ResnetBlock(dim_out, dim_out, time_emb_dim=dim_base), 44 | Residual(Rezero(LinearAttention(dim_out))), 45 | Downsample(dim_out) if not is_last else torch.nn.Identity()])) 46 | 47 | mid_dim = dims[-1] 48 | self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) 49 | self.mid_attn = Residual(Rezero(LinearAttention(mid_dim))) 50 | self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) 51 | 52 | for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])): 53 | self.ups.append(torch.nn.ModuleList([ 54 | ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim_base), 55 | ResnetBlock(dim_in, dim_in, time_emb_dim=dim_base), 56 | Residual(Rezero(LinearAttention(dim_in))), 57 | Upsample(dim_in)])) 58 | self.final_block = Block(dim_base, dim_base) 59 | self.final_conv = torch.nn.Conv2d(dim_base, 1, 1) 60 | 61 | def forward(self, x, x_mask, mean, ref, ref_mask, c, t): 62 | condition = 
self.time_pos_emb(t) 63 | t = self.mlp(condition) 64 | 65 | x = torch.stack([mean, x], 1) 66 | x_mask = x_mask.unsqueeze(1) 67 | ref_mask = ref_mask.unsqueeze(1) 68 | 69 | if self.use_ref_t: 70 | condition = torch.cat([condition, self.ref_block(ref, ref_mask, t)], 1) 71 | condition = torch.cat([condition, c], 1) 72 | 73 | condition = self.cond_block(condition).unsqueeze(-1).unsqueeze(-1) 74 | condition = torch.cat(x.shape[2]*[condition], 2) 75 | condition = torch.cat(x.shape[3]*[condition], 3) 76 | x = torch.cat([x, condition], 1) 77 | 78 | hiddens = [] 79 | masks = [x_mask] 80 | for resnet1, resnet2, attn, downsample in self.downs: 81 | mask_down = masks[-1] 82 | x = resnet1(x, mask_down, t) 83 | x = resnet2(x, mask_down, t) 84 | x = attn(x) 85 | hiddens.append(x) 86 | x = downsample(x * mask_down) 87 | masks.append(mask_down[:, :, :, ::2]) 88 | 89 | masks = masks[:-1] 90 | mask_mid = masks[-1] 91 | x = self.mid_block1(x, mask_mid, t) 92 | x = self.mid_attn(x) 93 | x = self.mid_block2(x, mask_mid, t) 94 | 95 | for resnet1, resnet2, attn, upsample in self.ups: 96 | mask_up = masks.pop() 97 | x = torch.cat((x, hiddens.pop()), dim=1) 98 | x = resnet1(x, mask_up, t) 99 | x = resnet2(x, mask_up, t) 100 | x = attn(x) 101 | x = upsample(x * mask_up) 102 | 103 | x = self.final_block(x, x_mask) 104 | output = self.final_conv(x * x_mask) 105 | 106 | return (output * x_mask).squeeze(1) 107 | 108 | 109 | class Diffusion(BaseModule): 110 | def __init__(self, n_feats, dim_unet, dim_spk, use_ref_t, beta_min, beta_max): 111 | super(Diffusion, self).__init__() 112 | self.estimator = GradLogPEstimator(dim_unet, dim_spk, use_ref_t) 113 | self.n_feats = n_feats 114 | self.dim_unet = dim_unet 115 | self.dim_spk = dim_spk 116 | self.use_ref_t = use_ref_t 117 | self.beta_min = beta_min 118 | self.beta_max = beta_max 119 | 120 | def get_beta(self, t): 121 | beta = self.beta_min + (self.beta_max - self.beta_min) * t 122 | return beta 123 | 124 | def get_gamma(self, s, t, p=1.0, use_torch=False): 125 | beta_integral = self.beta_min + 0.5*(self.beta_max - self.beta_min)*(t + s) 126 | beta_integral *= (t - s) 127 | if use_torch: 128 | gamma = torch.exp(-0.5*p*beta_integral).unsqueeze(-1).unsqueeze(-1) 129 | else: 130 | gamma = math.exp(-0.5*p*beta_integral) 131 | return gamma 132 | 133 | def get_mu(self, s, t): 134 | a = self.get_gamma(s, t) 135 | b = 1.0 - self.get_gamma(0, s, p=2.0) 136 | c = 1.0 - self.get_gamma(0, t, p=2.0) 137 | return a * b / c 138 | 139 | def get_nu(self, s, t): 140 | a = self.get_gamma(0, s) 141 | b = 1.0 - self.get_gamma(s, t, p=2.0) 142 | c = 1.0 - self.get_gamma(0, t, p=2.0) 143 | return a * b / c 144 | 145 | def get_sigma(self, s, t): 146 | a = 1.0 - self.get_gamma(0, s, p=2.0) 147 | b = 1.0 - self.get_gamma(s, t, p=2.0) 148 | c = 1.0 - self.get_gamma(0, t, p=2.0) 149 | return math.sqrt(a * b / c) 150 | 151 | def compute_diffused_mean(self, x0, mask, mean, t, use_torch=False): 152 | x0_weight = self.get_gamma(0, t, use_torch=use_torch) 153 | mean_weight = 1.0 - x0_weight 154 | xt_mean = x0 * x0_weight + mean * mean_weight 155 | return xt_mean * mask 156 | 157 | def forward_diffusion(self, x0, mask, mean, t): 158 | xt_mean = self.compute_diffused_mean(x0, mask, mean, t, use_torch=True) 159 | variance = 1.0 - self.get_gamma(0, t, p=2.0, use_torch=True) 160 | z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device, requires_grad=False) 161 | xt = xt_mean + z * torch.sqrt(variance) 162 | return xt * mask, z * mask 163 | 164 | @torch.no_grad() 165 | def reverse_diffusion(self, z, mask, 
mean, ref, ref_mask, mean_ref, c, 166 | n_timesteps, mode): 167 | h = 1.0 / n_timesteps 168 | xt = z * mask 169 | for i in range(n_timesteps): 170 | t = 1.0 - i*h 171 | time = t * torch.ones(z.shape[0], dtype=z.dtype, device=z.device) 172 | beta_t = self.get_beta(t) 173 | xt_ref = [self.compute_diffused_mean(ref, ref_mask, mean_ref, t)] 174 | # for j in range(15): 175 | # xt_ref += [self.compute_diffused_mean(ref, ref_mask, mean_ref, (j+0.5)/15.0)] 176 | xt_ref = torch.stack(xt_ref, 1) 177 | if mode == 'pf': 178 | dxt = 0.5 * (mean - xt - self.estimator(xt, mask, mean, xt_ref, ref_mask, c, time)) * (beta_t * h) 179 | else: 180 | if mode == 'ml': 181 | kappa = self.get_gamma(0, t - h) * (1.0 - self.get_gamma(t - h, t, p=2.0)) 182 | kappa /= (self.get_gamma(0, t) * beta_t * h) 183 | kappa -= 1.0 184 | omega = self.get_nu(t - h, t) / self.get_gamma(0, t) 185 | omega += self.get_mu(t - h, t) 186 | omega -= (0.5 * beta_t * h + 1.0) 187 | sigma = self.get_sigma(t - h, t) 188 | else: 189 | kappa = 0.0 190 | omega = 0.0 191 | sigma = math.sqrt(beta_t * h) 192 | dxt = (mean - xt) * (0.5 * beta_t * h + omega) 193 | dxt -= self.estimator(xt, mask, mean, xt_ref, ref_mask, c, time) * (1.0 + kappa) * (beta_t * h) 194 | dxt += torch.randn_like(z, device=z.device) * sigma 195 | xt = (xt - dxt) * mask 196 | return xt 197 | 198 | @torch.no_grad() 199 | def forward(self, z, mask, mean, ref, ref_mask, mean_ref, c, 200 | n_timesteps, mode): 201 | if mode not in ['pf', 'em', 'ml']: 202 | print('Inference mode must be one of [pf, em, ml]!') 203 | return z 204 | return self.reverse_diffusion(z, mask, mean, ref, ref_mask, mean_ref, c, 205 | n_timesteps, mode) 206 | 207 | def loss_t(self, x0, mask, mean, x_ref, mean_ref, c, t): 208 | xt, z = self.forward_diffusion(x0, mask, mean, t) 209 | xt_ref = [self.compute_diffused_mean(x_ref, mask, mean_ref, t, use_torch=True)] 210 | # for j in range(15): 211 | # xt_ref += [self.compute_diffused_mean(x_ref, mask, mean_ref, (j+0.5)/15.0)] 212 | xt_ref = torch.stack(xt_ref, 1) 213 | z_estimation = self.estimator(xt, mask, mean, xt_ref, mask, c, t) 214 | z_estimation *= torch.sqrt(1.0 - self.get_gamma(0, t, p=2.0, use_torch=True)) 215 | loss = torch.sum((z_estimation + z)**2) / (torch.sum(mask)*self.n_feats) 216 | return loss 217 | 218 | def compute_loss(self, x0, mask, mean, x_ref, mean_ref, c, offset=1e-5): 219 | b = x0.shape[0] 220 | t = torch.rand(b, dtype=x0.dtype, device=x0.device, requires_grad=False) 221 | t = torch.clamp(t, offset, 1.0 - offset) 222 | return self.loss_t(x0, mask, mean, x_ref, mean_ref, c, t) 223 | -------------------------------------------------------------------------------- /model/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
8 | 9 | import math 10 | import torch 11 | from einops import rearrange 12 | 13 | from model.base import BaseModule 14 | 15 | 16 | class Mish(BaseModule): 17 | def forward(self, x): 18 | return x * torch.tanh(torch.nn.functional.softplus(x)) 19 | 20 | 21 | class Upsample(BaseModule): 22 | def __init__(self, dim): 23 | super(Upsample, self).__init__() 24 | self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1) 25 | 26 | def forward(self, x): 27 | return self.conv(x) 28 | 29 | 30 | class Downsample(BaseModule): 31 | def __init__(self, dim): 32 | super(Downsample, self).__init__() 33 | self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1) 34 | 35 | def forward(self, x): 36 | return self.conv(x) 37 | 38 | 39 | class Rezero(BaseModule): 40 | def __init__(self, fn): 41 | super(Rezero, self).__init__() 42 | self.fn = fn 43 | self.g = torch.nn.Parameter(torch.zeros(1)) 44 | 45 | def forward(self, x): 46 | return self.fn(x) * self.g 47 | 48 | 49 | class Block(BaseModule): 50 | def __init__(self, dim, dim_out, groups=8): 51 | super(Block, self).__init__() 52 | self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3, 53 | padding=1), torch.nn.GroupNorm( 54 | groups, dim_out), Mish()) 55 | 56 | def forward(self, x, mask): 57 | output = self.block(x * mask) 58 | return output * mask 59 | 60 | 61 | class ResnetBlock(BaseModule): 62 | def __init__(self, dim, dim_out, time_emb_dim, groups=8): 63 | super(ResnetBlock, self).__init__() 64 | self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 65 | dim_out)) 66 | 67 | self.block1 = Block(dim, dim_out, groups=groups) 68 | self.block2 = Block(dim_out, dim_out, groups=groups) 69 | if dim != dim_out: 70 | self.res_conv = torch.nn.Conv2d(dim, dim_out, 1) 71 | else: 72 | self.res_conv = torch.nn.Identity() 73 | 74 | def forward(self, x, mask, time_emb): 75 | h = self.block1(x, mask) 76 | h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1) 77 | h = self.block2(h, mask) 78 | output = h + self.res_conv(x * mask) 79 | return output 80 | 81 | 82 | class LinearAttention(BaseModule): 83 | def __init__(self, dim, heads=4, dim_head=32): 84 | super(LinearAttention, self).__init__() 85 | self.heads = heads 86 | hidden_dim = dim_head * heads 87 | self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) 88 | self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1) 89 | 90 | def forward(self, x): 91 | b, c, h, w = x.shape 92 | qkv = self.to_qkv(x) 93 | q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', 94 | heads = self.heads, qkv=3) 95 | k = k.softmax(dim=-1) 96 | context = torch.einsum('bhdn,bhen->bhde', k, v) 97 | out = torch.einsum('bhde,bhdn->bhen', context, q) 98 | out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', 99 | heads=self.heads, h=h, w=w) 100 | return self.to_out(out) 101 | 102 | 103 | class Residual(BaseModule): 104 | def __init__(self, fn): 105 | super(Residual, self).__init__() 106 | self.fn = fn 107 | 108 | def forward(self, x, *args, **kwargs): 109 | output = self.fn(x, *args, **kwargs) + x 110 | return output 111 | 112 | 113 | class SinusoidalPosEmb(BaseModule): 114 | def __init__(self, dim): 115 | super(SinusoidalPosEmb, self).__init__() 116 | self.dim = dim 117 | 118 | def forward(self, x): 119 | device = x.device 120 | half_dim = self.dim // 2 121 | emb = math.log(10000) / (half_dim - 1) 122 | emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) 123 | emb = 1000.0 * x.unsqueeze(1) * emb.unsqueeze(0) 124 | emb = torch.cat((emb.sin(), emb.cos()), dim=-1) 125 | return emb 126 | 127 | 128 | 
class RefBlock(BaseModule): 129 | def __init__(self, out_dim, time_emb_dim): 130 | super(RefBlock, self).__init__() 131 | base_dim = out_dim // 4 132 | self.mlp1 = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 133 | base_dim)) 134 | self.mlp2 = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 135 | 2 * base_dim)) 136 | self.block11 = torch.nn.Sequential(torch.nn.Conv2d(1, 2 * base_dim, 137 | 3, 1, 1), torch.nn.InstanceNorm2d(2 * base_dim, affine=True), 138 | torch.nn.GLU(dim=1)) 139 | self.block12 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 2 * base_dim, 140 | 3, 1, 1), torch.nn.InstanceNorm2d(2 * base_dim, affine=True), 141 | torch.nn.GLU(dim=1)) 142 | self.block21 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 4 * base_dim, 143 | 3, 1, 1), torch.nn.InstanceNorm2d(4 * base_dim, affine=True), 144 | torch.nn.GLU(dim=1)) 145 | self.block22 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 4 * base_dim, 146 | 3, 1, 1), torch.nn.InstanceNorm2d(4 * base_dim, affine=True), 147 | torch.nn.GLU(dim=1)) 148 | self.block31 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 8 * base_dim, 149 | 3, 1, 1), torch.nn.InstanceNorm2d(8 * base_dim, affine=True), 150 | torch.nn.GLU(dim=1)) 151 | self.block32 = torch.nn.Sequential(torch.nn.Conv2d(4 * base_dim, 8 * base_dim, 152 | 3, 1, 1), torch.nn.InstanceNorm2d(8 * base_dim, affine=True), 153 | torch.nn.GLU(dim=1)) 154 | self.final_conv = torch.nn.Conv2d(4 * base_dim, out_dim, 1) 155 | 156 | def forward(self, x, mask, time_emb): 157 | y = self.block11(x * mask) 158 | y = self.block12(y * mask) 159 | y += self.mlp1(time_emb).unsqueeze(-1).unsqueeze(-1) 160 | y = self.block21(y * mask) 161 | y = self.block22(y * mask) 162 | y += self.mlp2(time_emb).unsqueeze(-1).unsqueeze(-1) 163 | y = self.block31(y * mask) 164 | y = self.block32(y * mask) 165 | y = self.final_conv(y * mask) 166 | return (y * mask).sum((2, 3)) / (mask.sum((2, 3)) * x.shape[2]) 167 | -------------------------------------------------------------------------------- /model/postnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
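# A minimal shape check for RefBlock above, which pools a reference mel-spectrogram into one
# conditioning vector per utterance (sketch with arbitrary sizes; assumes model.modules is importable):
import torch
from model.modules import RefBlock

block = RefBlock(out_dim=128, time_emb_dim=64)
ref = torch.randn(2, 1, 80, 172)        # (batch, 1, n_mels, ref_frames)
ref_mask = torch.ones(2, 1, 1, 172)     # 1 where reference frames are valid
t_emb = torch.randn(2, 64)              # time embedding, e.g. a SinusoidalPosEmb output
cond = block(ref, ref_mask, t_emb)      # -> (2, 128): one pooled vector per reference utterance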
8 | 9 | import torch 10 | 11 | from model.base import BaseModule 12 | from model.modules import Mish 13 | 14 | 15 | class Block(BaseModule): 16 | def __init__(self, dim, groups=8): 17 | super(Block, self).__init__() 18 | self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim, 7, 19 | padding=3), torch.nn.GroupNorm(groups, dim), Mish()) 20 | 21 | def forward(self, x, mask): 22 | output = self.block(x * mask) 23 | return output * mask 24 | 25 | 26 | class ResnetBlock(BaseModule): 27 | def __init__(self, dim, groups=8): 28 | super(ResnetBlock, self).__init__() 29 | self.block1 = Block(dim, groups=groups) 30 | self.block2 = Block(dim, groups=groups) 31 | self.res = torch.nn.Conv2d(dim, dim, 1) 32 | 33 | def forward(self, x, mask): 34 | h = self.block1(x, mask) 35 | h = self.block2(h, mask) 36 | output = self.res(x * mask) + h 37 | return output 38 | 39 | 40 | class PostNet(BaseModule): 41 | def __init__(self, dim, groups=8): 42 | super(PostNet, self).__init__() 43 | self.init_conv = torch.nn.Conv2d(1, dim, 1) 44 | self.res_block = ResnetBlock(dim, groups=groups) 45 | self.final_conv = torch.nn.Conv2d(dim, 1, 1) 46 | 47 | def forward(self, x, mask): 48 | x = x.unsqueeze(1) 49 | mask = mask.unsqueeze(1) 50 | x = self.init_conv(x * mask) 51 | x = self.res_block(x, mask) 52 | output = self.final_conv(x * mask) 53 | return output.squeeze(1) 54 | -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
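# A minimal shape check for PostNet above (sketch; assumes model.postnet is importable; the
# repository itself constructs PostNet(enc_dim) inside model/vc.py):
import torch
from model.postnet import PostNet

postnet = PostNet(dim=16)               # dim must be divisible by the GroupNorm groups (default 8)
mel = torch.randn(2, 80, 212)           # (batch, n_mels, frames), e.g. the "average voice" encoder output
mask = torch.ones(2, 1, 212)            # 1 for valid frames
out = postnet(mel, mask)                # -> (2, 80, 212), a frame-wise refinement of the input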
8 | 9 | import torch 10 | import torchaudio 11 | import numpy as np 12 | from librosa.filters import mel as librosa_mel_fn 13 | 14 | from model.base import BaseModule 15 | 16 | 17 | def mse_loss(x, y, mask, n_feats): 18 | loss = torch.sum(((x - y)**2) * mask) 19 | return loss / (torch.sum(mask) * n_feats) 20 | 21 | 22 | def sequence_mask(length, max_length=None): 23 | if max_length is None: 24 | max_length = length.max() 25 | x = torch.arange(int(max_length), dtype=length.dtype, device=length.device) 26 | return x.unsqueeze(0) < length.unsqueeze(1) 27 | 28 | 29 | def convert_pad_shape(pad_shape): 30 | l = pad_shape[::-1] 31 | pad_shape = [item for sublist in l for item in sublist] 32 | return pad_shape 33 | 34 | 35 | def fix_len_compatibility(length, num_downsamplings_in_unet=2): 36 | while True: 37 | if length % (2**num_downsamplings_in_unet) == 0: 38 | return length 39 | length += 1 40 | 41 | 42 | class PseudoInversion(BaseModule): 43 | def __init__(self, n_mels, sampling_rate, n_fft): 44 | super(PseudoInversion, self).__init__() 45 | self.n_mels = n_mels 46 | self.sampling_rate = sampling_rate 47 | self.n_fft = n_fft 48 | mel_basis = librosa_mel_fn(sampling_rate, n_fft, n_mels, 0, 8000) 49 | mel_basis_inverse = np.linalg.pinv(mel_basis) 50 | mel_basis_inverse = torch.from_numpy(mel_basis_inverse).float() 51 | self.register_buffer("mel_basis_inverse", mel_basis_inverse) 52 | 53 | def forward(self, log_mel_spectrogram): 54 | mel_spectrogram = torch.exp(log_mel_spectrogram) 55 | stftm = torch.matmul(self.mel_basis_inverse, mel_spectrogram) 56 | return stftm 57 | 58 | 59 | class InitialReconstruction(BaseModule): 60 | def __init__(self, n_fft, hop_size): 61 | super(InitialReconstruction, self).__init__() 62 | self.n_fft = n_fft 63 | self.hop_size = hop_size 64 | window = torch.hann_window(n_fft).float() 65 | self.register_buffer("window", window) 66 | 67 | def forward(self, stftm): 68 | real_part = torch.ones_like(stftm, device=stftm.device) 69 | imag_part = torch.zeros_like(stftm, device=stftm.device) 70 | stft = torch.stack([real_part, imag_part], -1)*stftm.unsqueeze(-1) 71 | istft = torchaudio.functional.istft(stft, n_fft=self.n_fft, 72 | hop_length=self.hop_size, win_length=self.n_fft, 73 | window=self.window, center=True) 74 | return istft.unsqueeze(1) 75 | 76 | 77 | # Fast Griffin-Lim algorithm as a PyTorch module 78 | class FastGL(BaseModule): 79 | def __init__(self, n_mels, sampling_rate, n_fft, hop_size, momentum=0.99): 80 | super(FastGL, self).__init__() 81 | self.n_mels = n_mels 82 | self.sampling_rate = sampling_rate 83 | self.n_fft = n_fft 84 | self.hop_size = hop_size 85 | self.momentum = momentum 86 | self.pi = PseudoInversion(n_mels, sampling_rate, n_fft) 87 | self.ir = InitialReconstruction(n_fft, hop_size) 88 | window = torch.hann_window(n_fft).float() 89 | self.register_buffer("window", window) 90 | 91 | @torch.no_grad() 92 | def forward(self, s, n_iters=32): 93 | c = self.pi(s) 94 | x = self.ir(c) 95 | x = x.squeeze(1) 96 | c = c.unsqueeze(-1) 97 | prev_angles = torch.zeros_like(c, device=c.device) 98 | for _ in range(n_iters): 99 | s = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_size, 100 | win_length=self.n_fft, window=self.window, 101 | center=True) 102 | real_part, imag_part = s.unbind(-1) 103 | stftm = torch.sqrt(torch.clamp(real_part**2 + imag_part**2, min=1e-8)) 104 | angles = s / stftm.unsqueeze(-1) 105 | s = c * (angles + self.momentum * (angles - prev_angles)) 106 | x = torchaudio.functional.istft(s, n_fft=self.n_fft, hop_length=self.hop_size, 107 | 
win_length=self.n_fft, window=self.window, 108 | center=True) 109 | prev_angles = angles 110 | return x.unsqueeze(1) 111 | -------------------------------------------------------------------------------- /model/vc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import torch 10 | 11 | from model.base import BaseModule 12 | from model.encoder import MelEncoder 13 | from model.postnet import PostNet 14 | from model.diffusion import Diffusion 15 | from model.utils import sequence_mask, fix_len_compatibility, mse_loss 16 | 17 | 18 | # "average voice" encoder as the module parameterizing the diffusion prior 19 | class FwdDiffusion(BaseModule): 20 | def __init__(self, n_feats, channels, filters, heads, layers, kernel, 21 | dropout, window_size, dim): 22 | super(FwdDiffusion, self).__init__() 23 | self.n_feats = n_feats 24 | self.channels = channels 25 | self.filters = filters 26 | self.heads = heads 27 | self.layers = layers 28 | self.kernel = kernel 29 | self.dropout = dropout 30 | self.window_size = window_size 31 | self.dim = dim 32 | self.encoder = MelEncoder(n_feats, channels, filters, heads, layers, 33 | kernel, dropout, window_size) 34 | self.postnet = PostNet(dim) 35 | 36 | @torch.no_grad() 37 | def forward(self, x, mask): 38 | x, mask = self.relocate_input([x, mask]) 39 | z = self.encoder(x, mask) 40 | z_output = self.postnet(z, mask) 41 | return z_output 42 | 43 | def compute_loss(self, x, y, mask): 44 | x, y, mask = self.relocate_input([x, y, mask]) 45 | z = self.encoder(x, mask) 46 | z_output = self.postnet(z, mask) 47 | loss = mse_loss(z_output, y, mask, self.n_feats) 48 | return loss 49 | 50 | 51 | # the whole voice conversion model consisting of the "average voice" encoder 52 | # and the diffusion-based speaker-conditional decoder 53 | class DiffVC(BaseModule): 54 | def __init__(self, n_feats, channels, filters, heads, layers, kernel, 55 | dropout, window_size, enc_dim, spk_dim, use_ref_t, dec_dim, 56 | beta_min, beta_max): 57 | super(DiffVC, self).__init__() 58 | self.n_feats = n_feats 59 | self.channels = channels 60 | self.filters = filters 61 | self.heads = heads 62 | self.layers = layers 63 | self.kernel = kernel 64 | self.dropout = dropout 65 | self.window_size = window_size 66 | self.enc_dim = enc_dim 67 | self.spk_dim = spk_dim 68 | self.use_ref_t = use_ref_t 69 | self.dec_dim = dec_dim 70 | self.beta_min = beta_min 71 | self.beta_max = beta_max 72 | self.encoder = FwdDiffusion(n_feats, channels, filters, heads, layers, 73 | kernel, dropout, window_size, enc_dim) 74 | self.decoder = Diffusion(n_feats, dec_dim, spk_dim, use_ref_t, 75 | beta_min, beta_max) 76 | 77 | def load_encoder(self, enc_path): 78 | enc_dict = torch.load(enc_path, map_location=lambda loc, storage: loc) 79 | self.encoder.load_state_dict(enc_dict, strict=False) 80 | 81 | @torch.no_grad() 82 | def forward(self, x, x_lengths, x_ref, x_ref_lengths, c, n_timesteps, 83 | mode='ml'): 84 | """ 85 | Generates mel-spectrogram from source mel-spectrogram conditioned on 86 | target speaker embedding. Returns: 87 | 1. 
'average voice' encoder outputs 88 | 2. decoder outputs 89 | 90 | Args: 91 | x (torch.Tensor): batch of source mel-spectrograms. 92 | x_lengths (torch.Tensor): numbers of frames in source mel-spectrograms. 93 | x_ref (torch.Tensor): batch of reference mel-spectrograms. 94 | x_ref_lengths (torch.Tensor): numbers of frames in reference mel-spectrograms. 95 | c (torch.Tensor): batch of reference speaker embeddings 96 | n_timesteps (int): number of steps to use for reverse diffusion in decoder. 97 | mode (string, optional): sampling method. Can be one of: 98 | 'pf' - probability flow sampling (Euler scheme for ODE) 99 | 'em' - Euler-Maruyama SDE solver 100 | 'ml' - Maximum Likelihood SDE solver 101 | """ 102 | x, x_lengths = self.relocate_input([x, x_lengths]) 103 | x_ref, x_ref_lengths, c = self.relocate_input([x_ref, x_ref_lengths, c]) 104 | x_mask = sequence_mask(x_lengths).unsqueeze(1).to(x.dtype) 105 | x_ref_mask = sequence_mask(x_ref_lengths).unsqueeze(1).to(x_ref.dtype) 106 | mean = self.encoder(x, x_mask) 107 | mean_x = self.decoder.compute_diffused_mean(x, x_mask, mean, 1.0) 108 | mean_ref = self.encoder(x_ref, x_ref_mask) 109 | 110 | b = x.shape[0] 111 | max_length = int(x_lengths.max()) 112 | max_length_new = fix_len_compatibility(max_length) 113 | x_mask_new = sequence_mask(x_lengths, max_length_new).unsqueeze(1).to(x.dtype) 114 | mean_new = torch.zeros((b, self.n_feats, max_length_new), dtype=x.dtype, 115 | device=x.device) 116 | mean_x_new = torch.zeros((b, self.n_feats, max_length_new), dtype=x.dtype, 117 | device=x.device) 118 | for i in range(b): 119 | mean_new[i, :, :x_lengths[i]] = mean[i, :, :x_lengths[i]] 120 | mean_x_new[i, :, :x_lengths[i]] = mean_x[i, :, :x_lengths[i]] 121 | 122 | z = mean_x_new 123 | z += torch.randn_like(mean_x_new, device=mean_x_new.device) 124 | 125 | y = self.decoder(z, x_mask_new, mean_new, x_ref, x_ref_mask, mean_ref, c, 126 | n_timesteps, mode) 127 | return mean_x, y[:, :, :max_length] 128 | 129 | def compute_loss(self, x, x_lengths, x_ref, c): 130 | """ 131 | Computes diffusion (score matching) loss. 132 | 133 | Args: 134 | x (torch.Tensor): batch of source mel-spectrograms. 135 | x_lengths (torch.Tensor): numbers of frames in source mel-spectrograms. 136 | x_ref (torch.Tensor): batch of reference mel-spectrograms. 137 | c (torch.Tensor): batch of reference speaker embeddings 138 | """ 139 | x, x_lengths, x_ref, c = self.relocate_input([x, x_lengths, x_ref, c]) 140 | x_mask = sequence_mask(x_lengths).unsqueeze(1).to(x.dtype) 141 | mean = self.encoder(x, x_mask).detach() 142 | mean_ref = self.encoder(x_ref, x_mask).detach() 143 | diff_loss = self.decoder.compute_loss(x, x_mask, mean, x_ref, mean_ref, c) 144 | return diff_loss 145 | -------------------------------------------------------------------------------- /params.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
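# A minimal end-to-end inference sketch for DiffVC.forward above, using the hyperparameters from
# params.py (listed below). The checkpoint path comes from the checkpts/ tree and is assumed to hold
# a plain state_dict (scenario/train_dec.py saves model.state_dict()); the speaker-embedding size is
# assumed to equal spk_dim here, while in practice it is fixed by the pretrained speaker encoder.
import torch
import params
from model.vc import DiffVC

model = DiffVC(params.n_mels, params.channels, params.filters, params.heads, params.layers,
               params.kernel, params.dropout, params.window_size, params.enc_dim,
               params.spk_dim, params.use_ref_t, params.dec_dim,
               params.beta_min, params.beta_max)
model.load_state_dict(torch.load('checkpts/vc/vc_libritts_wodyn.pt', map_location='cpu'))
model.eval()

mel_src = torch.randn(1, params.n_mels, 212)    # source log-mel spectrogram (batch, n_mels, frames)
mel_ref = torch.randn(1, params.n_mels, 180)    # reference (target-speaker) log-mel spectrogram
src_lengths = torch.LongTensor([212])
ref_lengths = torch.LongTensor([180])
spk_emb = torch.randn(1, params.spk_dim)        # stand-in for the target-speaker embedding (see speaker_encoder/)

mel_avg, mel_converted = model(mel_src, src_lengths, mel_ref, ref_lengths, spk_emb,
                               n_timesteps=30, mode='ml')
# mel_avg: "average voice" prediction; mel_converted: (1, 80, 212) converted spectrogram, which the
# HiFi-GAN vocoder (hifi-gan/) turns back into a waveform.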
8 | 9 | # data parameters 10 | n_mels = 80 11 | sampling_rate = 22050 12 | n_fft = 1024 13 | hop_size = 256 14 | 15 | # "average voice" encoder parameters 16 | channels = 192 17 | filters = 768 18 | layers = 6 19 | kernel = 3 20 | dropout = 0.1 21 | heads = 2 22 | window_size = 4 23 | enc_dim = 128 24 | 25 | # diffusion-based decoder parameters 26 | dec_dim = 256 27 | spk_dim = 128 28 | use_ref_t = True 29 | beta_min = 0.05 30 | beta_max = 20.0 31 | 32 | # training parameters 33 | seed = 37 34 | test_size = 1 35 | train_frames = 128 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datetime == 4.4 2 | datasets == 1.17.0 3 | protobuf == 3.19.4 4 | pydub == 0.25.1 5 | numpy == 1.21.5 6 | onnx == 1.11.0 7 | onnxruntime == 1.11.1 8 | requests == 2.22.0 9 | soundfile == 0.10.2 10 | uvicorn == 0.17.5 11 | gunicorn == 20.1.0 12 | fastapi == 0.81.0 13 | python-multipart == 0.0.5 14 | tritonclient[all] 15 | python-dotenv 16 | loguru 17 | inflect 18 | webrtcvad-wheels 19 | einops==0.3.0 20 | librosa==0.8.0 21 | tb-nightly 22 | future 23 | tqdm 24 | tgt 25 | matplotlib==3.7.2 -------------------------------------------------------------------------------- /run-container.sh: -------------------------------------------------------------------------------- 1 | IMAGE_NAME=diffvc 2 | CONTAINER_NAME=diff-vc-dev 3 | PORT=1402 4 | GPUS=all 5 | 6 | 7 | docker run -itd --gpus $GPUS \ 8 | --name $CONTAINER_NAME \ 9 | -p $PORT:$PORT \ 10 | -v $(pwd)/:/workspace \ 11 | $IMAGE_NAME -------------------------------------------------------------------------------- /scenario/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/scenario/__init__.py -------------------------------------------------------------------------------- /scenario/train_dec.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details.
8 | 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import torch 14 | from torch.utils.data import DataLoader 15 | 16 | import params 17 | from data import VCDecDataset, VCDecBatchCollate 18 | from model.vc import DiffVC 19 | from model.utils import FastGL 20 | from utils import save_plot, save_audio 21 | 22 | n_mels = params.n_mels 23 | sampling_rate = params.sampling_rate 24 | n_fft = params.n_fft 25 | hop_size = params.hop_size 26 | 27 | channels = params.channels 28 | filters = params.filters 29 | layers = params.layers 30 | kernel = params.kernel 31 | dropout = params.dropout 32 | heads = params.heads 33 | window_size = params.window_size 34 | enc_dim = params.enc_dim 35 | 36 | dec_dim = params.dec_dim 37 | spk_dim = params.spk_dim 38 | use_ref_t = params.use_ref_t 39 | beta_min = params.beta_min 40 | beta_max = params.beta_max 41 | 42 | random_seed = params.seed 43 | test_size = params.test_size 44 | 45 | data_dir = '../data/LibriTTS' 46 | val_file = 'filelists/valid.txt' 47 | exc_file = 'filelists/exceptions_libritts.txt' 48 | 49 | log_dir = 'logs_dec' 50 | enc_dir = 'logs_enc' 51 | epochs = 110 52 | batch_size = 32 53 | learning_rate = 1e-4 54 | save_every = 1 55 | 56 | 57 | if __name__ == "__main__": 58 | 59 | torch.manual_seed(random_seed) 60 | np.random.seed(random_seed) 61 | 62 | os.makedirs(log_dir, exist_ok=True) 63 | 64 | print('Initializing data loaders...') 65 | train_set = VCDecDataset(data_dir, val_file, exc_file) 66 | collate_fn = VCDecBatchCollate() 67 | train_loader = DataLoader(train_set, batch_size=batch_size, 68 | collate_fn=collate_fn, num_workers=4, drop_last=True) 69 | 70 | print('Initializing and loading models...') 71 | fgl = FastGL(n_mels, sampling_rate, n_fft, hop_size).cuda() 72 | model = DiffVC(n_mels, channels, filters, heads, layers, kernel, 73 | dropout, window_size, enc_dim, spk_dim, use_ref_t, 74 | dec_dim, beta_min, beta_max).cuda() 75 | model.load_encoder(os.path.join(enc_dir, 'enc.pt')) 76 | 77 | print('Encoder:') 78 | print(model.encoder) 79 | print('Number of parameters = %.2fm\n' % (model.encoder.nparams/1e6)) 80 | print('Decoder:') 81 | print(model.decoder) 82 | print('Number of parameters = %.2fm\n' % (model.decoder.nparams/1e6)) 83 | 84 | print('Initializing optimizers...') 85 | optimizer = torch.optim.Adam(params=model.decoder.parameters(), lr=learning_rate) 86 | 87 | print('Start training.') 88 | torch.backends.cudnn.benchmark = True 89 | iteration = 0 90 | for epoch in range(1, epochs + 1): 91 | print(f'Epoch: {epoch} [iteration: {iteration}]') 92 | model.train() 93 | losses = [] 94 | for batch in tqdm(train_loader, total=len(train_set)//batch_size): 95 | mel, mel_ref = batch['mel1'].cuda(), batch['mel2'].cuda() 96 | c, mel_lengths = batch['c'].cuda(), batch['mel_lengths'].cuda() 97 | model.zero_grad() 98 | loss = model.compute_loss(mel, mel_lengths, mel_ref, c) 99 | loss.backward() 100 | torch.nn.utils.clip_grad_norm_(model.decoder.parameters(), max_norm=1) 101 | optimizer.step() 102 | losses.append(loss.item()) 103 | iteration += 1 104 | 105 | losses = np.asarray(losses) 106 | msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) 107 | print(msg) 108 | with open(f'{log_dir}/train_dec.log', 'a') as f: 109 | f.write(msg) 110 | losses = [] 111 | 112 | if epoch % save_every > 0: 113 | continue 114 | 115 | model.eval() 116 | print('Inference...\n') 117 | with torch.no_grad(): 118 | mels = train_set.get_valid_dataset() 119 | for i, (mel, c) in enumerate(mels): 120 | if i >= test_size: 121 | break 122 | mel = 
mel.unsqueeze(0).float().cuda() 123 | c = c.unsqueeze(0).float().cuda() 124 | mel_lengths = torch.LongTensor([mel.shape[-1]]).cuda() 125 | mel_avg, mel_rec = model(mel, mel_lengths, mel, mel_lengths, c, 126 | n_timesteps=100) 127 | if epoch == save_every: 128 | save_plot(mel.squeeze().cpu(), f'{log_dir}/original_{i}.png') 129 | audio = fgl(mel) 130 | save_audio(f'{log_dir}/original_{i}.wav', sampling_rate, audio) 131 | save_plot(mel_avg.squeeze().cpu(), f'{log_dir}/average_{i}.png') 132 | audio = fgl(mel_avg) 133 | save_audio(f'{log_dir}/average_{i}.wav', sampling_rate, audio) 134 | save_plot(mel_rec.squeeze().cpu(), f'{log_dir}/reconstructed_{i}.png') 135 | audio = fgl(mel_rec) 136 | save_audio(f'{log_dir}/reconstructed_{i}.wav', sampling_rate, audio) 137 | 138 | print('Saving model...\n') 139 | ckpt = model.state_dict() 140 | torch.save(ckpt, f=f"{log_dir}/vc_{epoch}.pt") 141 | -------------------------------------------------------------------------------- /scenario/train_enc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import torch 14 | from torch.utils.data import DataLoader 15 | 16 | import params 17 | from data import VCEncDataset, VCEncBatchCollate 18 | from model.vc import FwdDiffusion 19 | from model.utils import FastGL, sequence_mask 20 | from utils import save_plot, save_audio 21 | 22 | n_mels = params.n_mels 23 | sampling_rate = params.sampling_rate 24 | n_fft = params.n_fft 25 | hop_size = params.hop_size 26 | 27 | channels = params.channels 28 | filters = params.filters 29 | layers = params.layers 30 | kernel = params.kernel 31 | dropout = params.dropout 32 | heads = params.heads 33 | window_size = params.window_size 34 | dim = params.enc_dim 35 | 36 | random_seed = params.seed 37 | test_size = params.test_size 38 | 39 | data_dir = '../data/LibriTTS' 40 | exc_file = 'filelists/exceptions_libritts.txt' 41 | avg_type = 'mode' 42 | 43 | log_dir = 'logs_enc' 44 | epochs = 300 45 | batch_size = 128 46 | learning_rate = 5e-4 47 | save_every = 1 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | torch.manual_seed(random_seed) 53 | np.random.seed(random_seed) 54 | 55 | os.makedirs(log_dir, exist_ok=True) 56 | 57 | print('Initializing data loaders...') 58 | train_set = VCEncDataset(data_dir, exc_file, avg_type) 59 | collate_fn = VCEncBatchCollate() 60 | train_loader = DataLoader(train_set, batch_size=batch_size, 61 | collate_fn=collate_fn, num_workers=4, 62 | drop_last=True) 63 | 64 | print('Initializing models...') 65 | fgl = FastGL(n_mels, sampling_rate, n_fft, hop_size).cuda() 66 | model = FwdDiffusion(n_mels, channels, filters, heads, layers, kernel, 67 | dropout, window_size, dim).cuda() 68 | 69 | print('Encoder:') 70 | print(model) 71 | print('Number of parameters = %.2fm\n' % (model.nparams/1e6)) 72 | 73 | print('Initializing optimizers...') 74 | optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate) 75 | 76 | print('Start training.') 77 | torch.backends.cudnn.benchmark = True 78 | iteration = 0 79 | for epoch 
in range(1, epochs + 1): 80 | print(f'Epoch: {epoch} [iteration: {iteration}]') 81 | model.train() 82 | losses = [] 83 | for batch in tqdm(train_loader, total=len(train_set)//batch_size): 84 | mel_x, mel_y = batch['x'].cuda(), batch['y'].cuda() 85 | mel_lengths = batch['lengths'].cuda() 86 | mel_mask = sequence_mask(mel_lengths).unsqueeze(1).to(mel_x.dtype) 87 | 88 | model.zero_grad() 89 | loss = model.compute_loss(mel_x, mel_y, mel_mask) 90 | loss.backward() 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) 92 | optimizer.step() 93 | 94 | losses.append(loss.item()) 95 | iteration += 1 96 | 97 | losses = np.asarray(losses) 98 | msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) 99 | print(msg) 100 | with open(f'{log_dir}/train_enc.log', 'a') as f: 101 | f.write(msg) 102 | losses = [] 103 | 104 | if epoch % save_every > 0: 105 | continue 106 | 107 | model.eval() 108 | print('Inference...\n') 109 | with torch.no_grad(): 110 | mels = train_set.get_test_dataset() 111 | for i, (mel_x, mel_y) in enumerate(mels): 112 | if i >= test_size: 113 | break 114 | mel_x = mel_x.unsqueeze(0).float().cuda() 115 | mel_y = mel_y.unsqueeze(0).float().cuda() 116 | mel_lengths = torch.LongTensor([mel_x.shape[-1]]).cuda() 117 | mel_mask = sequence_mask(mel_lengths).unsqueeze(1).to(mel_x.dtype) 118 | mel = model(mel_x, mel_mask) 119 | save_plot(mel.squeeze().cpu(), f'{log_dir}/generated_{i}.png') 120 | audio = fgl(mel) 121 | save_audio(f'{log_dir}/generated_{i}.wav', sampling_rate, audio) 122 | if epoch == save_every: 123 | save_plot(mel_x.squeeze().cpu(), f'{log_dir}/source_{i}.png') 124 | audio = fgl(mel_x) 125 | save_audio(f'{log_dir}/source_{i}.wav', sampling_rate, audio) 126 | save_plot(mel_y.squeeze().cpu(), f'{log_dir}/target_{i}.png') 127 | audio = fgl(mel_y) 128 | save_audio(f'{log_dir}/target_{i}.wav', sampling_rate, audio) 129 | 130 | print('Saving model...\n') 131 | ckpt = model.state_dict() 132 | torch.save(ckpt, f=f"{log_dir}/enc.pt") 133 | -------------------------------------------------------------------------------- /speaker_encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /speaker_encoder/README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Voice Cloning 2 | This repository is an implementation of [Transfer Learning from Speaker Verification to 3 | Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801). 4 | 5 | SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text. 6 | 7 | **Video demonstration** (click the picture): 8 | 9 | [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) 10 | 11 | 12 | 13 | ### Papers implemented 14 | | URL | Designation | Title | Implementation source | 15 | | --- | ----------- | ----- | --------------------- | 16 | |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | 17 | |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | 18 | |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) 19 | |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | 20 | 21 | ## News 22 | **10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion. 23 | 24 | **28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below. 25 | 26 | **14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. 27 | 28 | **13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this: 29 | - **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors. 30 | - **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info. 31 | 32 | **20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it. 
33 | 34 | 35 | ## Setup 36 | 37 | ### 1. Install Requirements 38 | 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. 39 | 2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional. 40 | 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files. 41 | 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command. 42 | 5. Install the remaining requirements with `pip install -r requirements.txt` 43 | 44 | ### 2. (Optional) Download Pretrained Models 45 | Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 46 | 47 | ### 3. (Optional) Test Configuration 48 | Before you download any dataset, you can begin by testing your configuration with: 49 | 50 | `python demo_cli.py` 51 | 52 | If all tests pass, you're good to go. 53 | 54 | ### 4. (Optional) Download Datasets 55 | For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. 56 | 57 | ### 5. Launch the Toolbox 58 | You can then try the toolbox: 59 | 60 | `python demo_toolbox.py -d ` 61 | or 62 | `python demo_toolbox.py` 63 | 64 | depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). 
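Within Diff-VC itself, this encoder is only used at inference time to produce the target-speaker embedding passed to the voice-conversion model. A minimal sketch, assuming `speaker_encoder/` has been added to `sys.path` (its modules import the package as `encoder`) and that the pretrained weights sit at `checkpts/spk_encoder/pretrained.pt` as in the checkpoint tree:

```python
import sys
sys.path.append('speaker_encoder')   # lets the package-internal "from encoder..." imports resolve
from pathlib import Path
from encoder import inference as spk_encoder

spk_encoder.load_model(Path('checkpts/spk_encoder/pretrained.pt'), device='cpu')
wav = spk_encoder.preprocess_wav('example/8534_216567_000015_000010.wav')
spk_emb = spk_encoder.embed_utterance(wav)   # L2-normalised float32 vector for one utterance
```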
65 | -------------------------------------------------------------------------------- /speaker_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/speaker_encoder/__init__.py -------------------------------------------------------------------------------- /speaker_encoder/encoder/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/audio.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from scipy.ndimage.morphology import binary_dilation 4 | from encoder.params_data import * 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | import numpy as np 8 | import webrtcvad 9 | import librosa 10 | import struct 11 | 12 | import torch 13 | from torchaudio.transforms import Resample 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | 17 | int16_max = (2 ** 15) - 1 18 | 19 | 20 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 21 | source_sr: Optional[int] = None): 22 | """ 23 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 24 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 25 | 26 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 27 | just .wav), either the waveform as a numpy array of floats. 28 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 29 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 30 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 31 | this argument will be ignored. 32 | """ 33 | # Load the wav from disk if needed 34 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 35 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 36 | else: 37 | wav = fpath_or_wav 38 | 39 | # Resample the wav if needed 40 | if source_sr is not None and source_sr != sampling_rate: 41 | wav = librosa.resample(wav, source_sr, sampling_rate) 42 | 43 | # Apply the preprocessing: normalize volume and shorten long silences 44 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 45 | wav = trim_long_silences(wav) 46 | 47 | return wav 48 | 49 | 50 | def preprocess_wav_batch(wavs, source_sr=22050): 51 | # This torch version is designed to cope with a batch of same lengths wavs 52 | if sampling_rate != source_sr: 53 | resample = Resample(source_sr, sampling_rate) 54 | wavs = resample(wavs) 55 | wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS, 56 | increase_only=True) 57 | # Trimming silence is not implemented in this version yet! 58 | return wavs_preprocessed 59 | 60 | 61 | def wav_to_mel_spectrogram(wav): 62 | """ 63 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 64 | Note: this not a log-mel spectrogram. 
65 | """ 66 | frames = librosa.feature.melspectrogram( 67 | wav, 68 | sampling_rate, 69 | n_fft=int(sampling_rate * mel_window_length / 1000), 70 | hop_length=int(sampling_rate * mel_window_step / 1000), 71 | n_mels=mel_n_channels 72 | ) 73 | return frames.astype(np.float32).T 74 | 75 | 76 | def wav_to_mel_spectrogram_batch(wavs): 77 | # This torch version is designed to cope with a batch of same lengths wavs 78 | n_fft = int(sampling_rate * mel_window_length / 1000) 79 | hop_length = int(sampling_rate * mel_window_step / 1000) 80 | win_length = int(sampling_rate * mel_window_length / 1000) 81 | window = torch.hann_window(n_fft).to(wavs) 82 | mel_basis = torch.from_numpy(librosa_mel_fn(sampling_rate, n_fft, 83 | mel_n_channels)).to(wavs) 84 | s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length, 85 | win_length=win_length, window=window, center=True) 86 | real_part, imag_part = s.unbind(-1) 87 | stftm = real_part**2 + imag_part**2 88 | mels = torch.matmul(mel_basis, stftm) 89 | return torch.transpose(mels, 1, 2) 90 | 91 | 92 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 93 | if increase_only and decrease_only: 94 | raise ValueError("Both increase only and decrease only are set") 95 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 96 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 97 | return wav 98 | return wav * (10 ** (dBFS_change / 20)) 99 | 100 | 101 | def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False): 102 | # This torch version is designed to cope with a batch of same lengths wavs 103 | if increase_only and decrease_only: 104 | raise ValueError("Both increase only and decrease only are set") 105 | dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1)) 106 | scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype) 107 | if increase_only: 108 | mask = (dBFS_change > 0).to(scales) 109 | elif decrease_only: 110 | mask = (dBFS_change < 0).to(scales) 111 | else: 112 | mask = torch.zeros_like(scales) 113 | scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0) 114 | return wavs * scales.unsqueeze(-1) 115 | 116 | 117 | def trim_long_silences(wav): 118 | """ 119 | Ensures that segments without voice in the waveform remain no longer than a 120 | threshold determined by the VAD parameters in params.py. 
121 | 122 | :param wav: the raw waveform as a numpy array of floats 123 | :return: the same waveform with silences trimmed away (length <= original wav length) 124 | """ 125 | # Compute the voice detection window size 126 | samples_per_window = (vad_window_length * sampling_rate) // 1000 127 | 128 | # Trim the end of the audio to have a multiple of the window size 129 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 130 | 131 | # Convert the float waveform to 16-bit mono PCM 132 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 133 | 134 | # Perform voice activation detection 135 | voice_flags = [] 136 | vad = webrtcvad.Vad(mode=3) 137 | for window_start in range(0, len(wav), samples_per_window): 138 | window_end = window_start + samples_per_window 139 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 140 | sample_rate=sampling_rate)) 141 | voice_flags = np.array(voice_flags) 142 | 143 | # Smooth the voice detection with a moving average 144 | def moving_average(array, width): 145 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 146 | ret = np.cumsum(array_padded, dtype=float) 147 | ret[width:] = ret[width:] - ret[:-width] 148 | return ret[width - 1:] / width 149 | 150 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 151 | audio_mask = np.round(audio_mask).astype(np.bool) 152 | 153 | # Dilate the voiced regions 154 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 155 | audio_mask = np.repeat(audio_mask, samples_per_window) 156 | 157 | return wav[audio_mask == True] 158 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/config.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | librispeech_datasets = { 4 | "train": { 5 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 6 | "other": ["LibriSpeech/train-other-500"] 7 | }, 8 | "test": { 9 | "clean": ["LibriSpeech/test-clean"], 10 | "other": ["LibriSpeech/test-other"] 11 | }, 12 | "dev": { 13 | "clean": ["LibriSpeech/dev-clean"], 14 | "other": ["LibriSpeech/dev-other"] 15 | }, 16 | } 17 | libritts_datasets = { 18 | "train": { 19 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 20 | "other": ["LibriTTS/train-other-500"] 21 | }, 22 | "test": { 23 | "clean": ["LibriTTS/test-clean"], 24 | "other": ["LibriTTS/test-other"] 25 | }, 26 | "dev": { 27 | "clean": ["LibriTTS/dev-clean"], 28 | "other": ["LibriTTS/dev-other"] 29 | }, 30 | } 31 | voxceleb_datasets = { 32 | "voxceleb1" : { 33 | "train": ["VoxCeleb1/wav"], 34 | "test": ["VoxCeleb1/test_wav"] 35 | }, 36 | "voxceleb2" : { 37 | "train": ["VoxCeleb2/dev/aac"], 38 | "test": ["VoxCeleb2/test_wav"] 39 | } 40 | } 41 | 42 | other_datasets = [ 43 | "LJSpeech-1.1", 44 | "VCTK-Corpus/wav48", 45 | ] 46 | 47 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 48 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 4 | from encoder.data_objects.speaker_verification_dataset import 
SpeakerVerificationDataLoader 5 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/random_cycler.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import random 4 | 5 | class RandomCycler: 6 | """ 7 | Creates an internal copy of a sequence and allows access to its items in a constrained random 8 | order. For a source sequence of n items and one or several consecutive queries of a total 9 | of m items, the following guarantees hold (one implies the other): 10 | - Each item will be returned between m // n and ((m - 1) // n) + 1 times. 11 | - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. 12 | """ 13 | 14 | def __init__(self, source): 15 | if len(source) == 0: 16 | raise Exception("Can't create RandomCycler from an empty collection") 17 | self.all_items = list(source) 18 | self.next_items = [] 19 | 20 | def sample(self, count: int): 21 | shuffle = lambda l: random.sample(l, len(l)) 22 | 23 | out = [] 24 | while count > 0: 25 | if count >= len(self.all_items): 26 | out.extend(shuffle(list(self.all_items))) 27 | count -= len(self.all_items) 28 | continue 29 | n = min(count, len(self.next_items)) 30 | out.extend(self.next_items[:n]) 31 | count -= n 32 | self.next_items = self.next_items[n:] 33 | if len(self.next_items) == 0: 34 | self.next_items = shuffle(list(self.all_items)) 35 | return out 36 | 37 | def __next__(self): 38 | return self.sample(1)[0] 39 | 40 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.random_cycler import RandomCycler 4 | from encoder.data_objects.utterance import Utterance 5 | from pathlib import Path 6 | 7 | # Contains the set of utterances of a single speaker 8 | class Speaker: 9 | def __init__(self, root: Path): 10 | self.root = root 11 | self.name = root.name 12 | self.utterances = None 13 | self.utterance_cycler = None 14 | 15 | def _load_utterances(self): 16 | with self.root.joinpath("_sources.txt").open("r") as sources_file: 17 | sources = [l.split(",") for l in sources_file] 18 | sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} 19 | self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] 20 | self.utterance_cycler = RandomCycler(self.utterances) 21 | 22 | def random_partial(self, count, n_frames): 23 | """ 24 | Samples a batch of unique partial utterances from the disk in a way that all 25 | utterances come up at least once every two cycles and in a random order every time. 26 | 27 | :param count: The number of partial utterances to sample from the set of utterances from 28 | that speaker. Utterances are guaranteed not to be repeated if is not larger than 29 | the number of utterances available. 30 | :param n_frames: The number of frames in the partial utterance. 31 | :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 32 | frames are the frames of the partial utterances and range is the range of the partial 33 | utterance with regard to the complete utterance. 
34 | """ 35 | if self.utterances is None: 36 | self._load_utterances() 37 | 38 | utterances = self.utterance_cycler.sample(count) 39 | 40 | a = [(u,) + u.random_partial(n_frames) for u in utterances] 41 | 42 | return a 43 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker_batch.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | from typing import List 5 | from encoder.data_objects.speaker import Speaker 6 | 7 | class SpeakerBatch: 8 | def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): 9 | self.speakers = speakers 10 | self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} 11 | 12 | # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with 13 | # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) 14 | self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) 15 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker_verification_dataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.random_cycler import RandomCycler 4 | from encoder.data_objects.speaker_batch import SpeakerBatch 5 | from encoder.data_objects.speaker import Speaker 6 | from encoder.params_data import partials_n_frames 7 | from torch.utils.data import Dataset, DataLoader 8 | from pathlib import Path 9 | 10 | # TODO: improve with a pool of speakers for data efficiency 11 | 12 | class SpeakerVerificationDataset(Dataset): 13 | def __init__(self, datasets_root: Path): 14 | self.root = datasets_root 15 | speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] 16 | if len(speaker_dirs) == 0: 17 | raise Exception("No speakers found. 
Make sure you are pointing to the directory " 18 | "containing all preprocessed speaker directories.") 19 | self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] 20 | self.speaker_cycler = RandomCycler(self.speakers) 21 | 22 | def __len__(self): 23 | return int(1e10) 24 | 25 | def __getitem__(self, index): 26 | return next(self.speaker_cycler) 27 | 28 | def get_logs(self): 29 | log_string = "" 30 | for log_fpath in self.root.glob("*.txt"): 31 | with log_fpath.open("r") as log_file: 32 | log_string += "".join(log_file.readlines()) 33 | return log_string 34 | 35 | 36 | class SpeakerVerificationDataLoader(DataLoader): 37 | def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, 38 | batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, 39 | worker_init_fn=None): 40 | self.utterances_per_speaker = utterances_per_speaker 41 | 42 | super().__init__( 43 | dataset=dataset, 44 | batch_size=speakers_per_batch, 45 | shuffle=False, 46 | sampler=sampler, 47 | batch_sampler=batch_sampler, 48 | num_workers=num_workers, 49 | collate_fn=self.collate, 50 | pin_memory=pin_memory, 51 | drop_last=False, 52 | timeout=timeout, 53 | worker_init_fn=worker_init_fn 54 | ) 55 | 56 | def collate(self, speakers): 57 | return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) 58 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/utterance.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | 5 | 6 | class Utterance: 7 | def __init__(self, frames_fpath, wave_fpath): 8 | self.frames_fpath = frames_fpath 9 | self.wave_fpath = wave_fpath 10 | 11 | def get_frames(self): 12 | return np.load(self.frames_fpath) 13 | 14 | def random_partial(self, n_frames): 15 | """ 16 | Crops the frames into a partial utterance of n_frames 17 | 18 | :param n_frames: The number of frames of the partial utterance 19 | :return: the partial utterance frames and a tuple indicating the start and end of the 20 | partial utterance in the complete utterance. 21 | """ 22 | frames = self.get_frames() 23 | if frames.shape[0] == n_frames: 24 | start = 0 25 | else: 26 | start = np.random.randint(0, frames.shape[0] - n_frames) 27 | end = start + n_frames 28 | return frames[start:end], (start, end) -------------------------------------------------------------------------------- /speaker_encoder/encoder/inference.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.params_data import * 4 | from encoder.model import SpeakerEncoder 5 | from encoder.audio import preprocess_wav, preprocess_wav_batch 6 | from matplotlib import cm 7 | from encoder import audio 8 | from pathlib import Path 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import torch 12 | 13 | _model = None # type: SpeakerEncoder 14 | _device = None # type: torch.device 15 | 16 | 17 | def load_model(weights_fpath: Path, device="cpu"): 18 | """ 19 | Loads the model in memory. If this function is not explicitely called, it will be run on the 20 | first call to embed_frames() with the default weights file. 21 | 22 | :param weights_fpath: the path to saved model weights. 23 | :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). 
The 24 | model will be loaded and will run on this device. Outputs will however always be on the cpu. 25 | If None, will default to your GPU if it"s available, otherwise your CPU. 26 | """ 27 | # TODO: I think the slow loading of the encoder might have something to do with the device it 28 | # was saved on. Worth investigating. 29 | global _model, _device 30 | if device is None: 31 | _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | elif isinstance(device, str): 33 | _device = torch.device(device) 34 | _model = SpeakerEncoder(_device, torch.device("cpu")) 35 | checkpoint = torch.load(weights_fpath, map_location="cpu") 36 | _model.load_state_dict(checkpoint["model_state"]) 37 | _model.eval() 38 | print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) 39 | 40 | 41 | def is_loaded(): 42 | return _model is not None 43 | 44 | 45 | def embed_frames_batch(frames, use_torch=False): 46 | if _model is None: 47 | raise Exception("Model was not loaded. Call load_model() before inference.") 48 | 49 | if not use_torch: 50 | frames = torch.from_numpy(frames) 51 | frames = frames.to(_device) 52 | print(frames.shape) 53 | embeds = _model.forward(frames) 54 | print(embeds.shape) 55 | if not use_torch: 56 | embeds = embeds.detach().cpu().numpy() 57 | return embeds 58 | 59 | 60 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, 61 | min_pad_coverage=0.75, overlap=0.5): 62 | """ 63 | Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 64 | partial utterances of each. Both the waveform and the mel 65 | spectrogram slices are returned, so as to make each partial utterance waveform correspond to 66 | its spectrogram. This function assumes that the mel spectrogram parameters used are those 67 | defined in params_data.py. 68 | 69 | The returned ranges may be indexing further than the length of the waveform. It is 70 | recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 71 | 72 | :param n_samples: the number of samples in the waveform 73 | :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 74 | utterance 75 | :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 76 | enough frames. If at least of are present, 77 | then the last partial utterance will be considered, as if we padded the audio. Otherwise, 78 | it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 79 | utterance, this parameter is ignored so that the function always returns at least 1 slice. 80 | :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 81 | utterances are entirely disjoint. 82 | :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 83 | respectively the waveform and the mel spectrogram with these slices to obtain the partial 84 | utterances. 
85 | """ 86 | assert 0 <= overlap < 1 87 | assert 0 < min_pad_coverage <= 1 88 | 89 | samples_per_frame = int((sampling_rate * mel_window_step / 1000)) 90 | n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) 91 | frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) 92 | 93 | # Compute the slices 94 | wav_slices, mel_slices = [], [] 95 | steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) 96 | for i in range(0, steps, frame_step): 97 | mel_range = np.array([i, i + partial_utterance_n_frames]) 98 | wav_range = mel_range * samples_per_frame 99 | mel_slices.append(slice(*mel_range)) 100 | wav_slices.append(slice(*wav_range)) 101 | 102 | # Evaluate whether extra padding is warranted or not 103 | last_wav_range = wav_slices[-1] 104 | coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) 105 | if coverage < min_pad_coverage and len(mel_slices) > 1: 106 | mel_slices = mel_slices[:-1] 107 | wav_slices = wav_slices[:-1] 108 | 109 | return wav_slices, mel_slices 110 | 111 | 112 | def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): 113 | """ 114 | Computes an embedding for a single utterance. 115 | 116 | # TODO: handle multiple wavs to benefit from batching on GPU 117 | :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 118 | :param using_partials: if True, then the utterance is split in partial utterances of 119 | partial_utterance_n_frames frames and the utterance embedding is computed from their 120 | normalized average. If False, the embedding is instead computed from feeding the entire 121 | spectrogram to the network. 122 | :param return_partials: if True, the partial embeddings will also be returned along with the 123 | wav slices that correspond to the partial embeddings. 124 | :param kwargs: additional arguments to compute_partial_slices() 125 | :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 126 | return_partials is True, the partial embeddings as a numpy array of float32 of shape 127 | (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 128 | returned. If using_partials is simultaneously set to False, both these values will be None 129 | instead.
130 | """ 131 | # Process the entire utterance if not using partials 132 | if not using_partials: 133 | frames = audio.wav_to_mel_spectrogram(wav) 134 | embed = embed_frames_batch(frames[None, ...])[0] 135 | if return_partials: 136 | return embed, None, None 137 | return embed 138 | 139 | # Compute where to split the utterance into partials and pad if necessary 140 | wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) 141 | max_wave_length = wave_slices[-1].stop 142 | if max_wave_length >= len(wav): 143 | wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") 144 | 145 | # Split the utterance into partials 146 | frames = audio.wav_to_mel_spectrogram(wav) 147 | frames_batch = np.array([frames[s] for s in mel_slices]) 148 | partial_embeds = embed_frames_batch(frames_batch) 149 | 150 | # Compute the utterance embedding from the partial embeddings 151 | raw_embed = np.mean(partial_embeds, axis=0) 152 | embed = raw_embed / np.linalg.norm(raw_embed, 2) 153 | 154 | if return_partials: 155 | return embed, partial_embeds, wave_slices 156 | return embed 157 | 158 | 159 | def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): 160 | # This torch version is designed to cope with a batch of same lengths wavs 161 | if not using_partials: 162 | print(wavs.shape) 163 | frames = audio.wav_to_mel_spectrogram_batch(wavs) 164 | embeds = embed_frames_batch(frames) 165 | if return_partials: 166 | return embeds, None, None 167 | return embeds 168 | 169 | wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) 170 | max_wave_length = wave_slices[-1].stop 171 | if max_wave_length >= wavs.shape[-1]: 172 | wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), 173 | dtype=wavs.dtype, device=wavs.device)], 1) 174 | 175 | frames = audio.wav_to_mel_spectrogram_batch(wavs) 176 | frames_batch = [] 177 | for i in range(len(frames)): 178 | frames_batch += [frames[i][s] for s in mel_slices] 179 | frames_batch = torch.stack(frames_batch, 0) 180 | partial_embeds = embed_frames_batch(frames_batch, use_torch=True) 181 | partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) 182 | 183 | raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) 184 | embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) 185 | 186 | if return_partials: 187 | return embeds, partial_embeds, wave_slices 188 | return embeds 189 | 190 | 191 | def embed_speaker(wavs, **kwargs): 192 | raise NotImplemented() 193 | 194 | 195 | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): 196 | if ax is None: 197 | ax = plt.gca() 198 | 199 | if shape is None: 200 | height = int(np.sqrt(len(embed))) 201 | shape = (height, -1) 202 | embed = embed.reshape(shape) 203 | 204 | cmap = cm.get_cmap() 205 | mappable = ax.imshow(embed, cmap=cmap) 206 | cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) 207 | cbar.set_clim(*color_range) 208 | 209 | ax.set_xticks([]), ax.set_yticks([]) 210 | ax.set_title(title) 211 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.params_model import * 4 | from encoder.params_data import * 5 | from scipy.interpolate import interp1d 6 | from sklearn.metrics import roc_curve 7 | from torch.nn.utils import 
clip_grad_norm_ 8 | from scipy.optimize import brentq 9 | from torch import nn 10 | import numpy as np 11 | import torch 12 | 13 | 14 | class SpeakerEncoder(nn.Module): 15 | def __init__(self, device, loss_device): 16 | super().__init__() 17 | self.loss_device = loss_device 18 | 19 | # Network defition 20 | self.lstm = nn.LSTM(input_size=mel_n_channels, 21 | hidden_size=model_hidden_size, 22 | num_layers=model_num_layers, 23 | batch_first=True).to(device) 24 | self.linear = nn.Linear(in_features=model_hidden_size, 25 | out_features=model_embedding_size).to(device) 26 | self.relu = torch.nn.ReLU().to(device) 27 | 28 | # Cosine similarity scaling (with fixed initial parameter values) 29 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 30 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 31 | 32 | # Loss 33 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 34 | 35 | def do_gradient_ops(self): 36 | # Gradient scale 37 | self.similarity_weight.grad *= 0.01 38 | self.similarity_bias.grad *= 0.01 39 | 40 | # Gradient clipping 41 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 42 | 43 | def forward(self, utterances, hidden_init=None): 44 | """ 45 | Computes the embeddings of a batch of utterance spectrograms. 46 | 47 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 48 | (batch_size, n_frames, n_channels) 49 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 50 | batch_size, hidden_size). Will default to a tensor of zeros if None. 51 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 52 | """ 53 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 54 | # and the final cell state. 55 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 56 | 57 | # We take only the hidden state of the last layer 58 | embeds_raw = self.relu(self.linear(hidden[-1])) 59 | 60 | # L2-normalize it 61 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 62 | 63 | return embeds 64 | 65 | def similarity_matrix(self, embeds): 66 | """ 67 | Computes the similarity matrix according the section 2.1 of GE2E. 68 | 69 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 70 | utterances_per_speaker, embedding_size) 71 | :return: the similarity matrix as a tensor of shape (speakers_per_batch, 72 | utterances_per_speaker, speakers_per_batch) 73 | """ 74 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 75 | 76 | # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation 77 | centroids_incl = torch.mean(embeds, dim=1, keepdim=True) 78 | centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) 79 | 80 | # Exclusive centroids (1 per utterance) 81 | centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) 82 | centroids_excl /= (utterances_per_speaker - 1) 83 | centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) 84 | 85 | # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot 86 | # product of these vectors (which is just an element-wise multiplication reduced by a sum). 87 | # We vectorize the computation for efficiency. 
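        # In GE2E terms (section 2.1), the loop below fills S[j, i, k] = cos(e_ji, c_k) for k != j
        # using the inclusive centroid of speaker k, and S[j, i, j] = cos(e_ji, c_j^(-i)) using the
        # exclusive centroid that leaves utterance i out; the learned scale (similarity_weight)
        # and bias (similarity_bias) are applied to the whole matrix after the loop.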
88 | sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, 89 | speakers_per_batch).to(self.loss_device) 90 | mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) 91 | for j in range(speakers_per_batch): 92 | mask = np.where(mask_matrix[j])[0] 93 | sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) 94 | sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) 95 | 96 | ## Even more vectorized version (slower maybe because of transpose) 97 | # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker 98 | # ).to(self.loss_device) 99 | # eye = np.eye(speakers_per_batch, dtype=np.int) 100 | # mask = np.where(1 - eye) 101 | # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) 102 | # mask = np.where(eye) 103 | # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) 104 | # sim_matrix2 = sim_matrix2.transpose(1, 2) 105 | 106 | sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias 107 | return sim_matrix 108 | 109 | def loss(self, embeds): 110 | """ 111 | Computes the softmax loss according the section 2.1 of GE2E. 112 | 113 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 114 | utterances_per_speaker, embedding_size) 115 | :return: the loss and the EER for this batch of embeddings. 116 | """ 117 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 118 | 119 | # Loss 120 | sim_matrix = self.similarity_matrix(embeds) 121 | sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 122 | speakers_per_batch)) 123 | ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) 124 | target = torch.from_numpy(ground_truth).long().to(self.loss_device) 125 | loss = self.loss_fn(sim_matrix, target) 126 | 127 | # EER (not backpropagated) 128 | with torch.no_grad(): 129 | inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] 130 | labels = np.array([inv_argmax(i) for i in ground_truth]) 131 | preds = sim_matrix.detach().cpu().numpy() 132 | 133 | # Snippet from https://yangcha.github.io/EER-ROC/ 134 | fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) 135 | eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 136 | 137 | return loss, eer -------------------------------------------------------------------------------- /speaker_encoder/encoder/params_data.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | ## Mel-filterbank 4 | mel_window_length = 25 # In milliseconds 5 | mel_window_step = 10 # In milliseconds 6 | mel_n_channels = 40 7 | 8 | 9 | ## Audio 10 | sampling_rate = 16000 11 | # Number of spectrogram frames in a partial utterance 12 | partials_n_frames = 160 # 1600 ms 13 | # Number of spectrogram frames at inference 14 | inference_n_frames = 80 # 800 ms 15 | 16 | 17 | ## Voice Activation Detection 18 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 19 | # This sets the granularity of the VAD. Should not need to be changed. 20 | vad_window_length = 30 # In milliseconds 21 | # Number of frames to average together when performing the moving average smoothing. 22 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 23 | vad_moving_average_width = 8 24 | # Maximum number of consecutive silent frames a segment can have. 
25 | vad_max_silence_length = 6 26 | 27 | 28 | ## Audio volume normalization 29 | audio_norm_target_dBFS = -30 30 | 31 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/params_model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | ## Model parameters 4 | model_hidden_size = 256 5 | model_embedding_size = 256 6 | model_num_layers = 3 7 | 8 | 9 | ## Training parameters 10 | learning_rate_init = 1e-4 11 | speakers_per_batch = 64 12 | utterances_per_speaker = 10 13 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/preprocess.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from multiprocess.pool import ThreadPool 4 | from encoder.params_data import * 5 | from encoder.config import librispeech_datasets, anglophone_nationalites 6 | from datetime import datetime 7 | from encoder import audio 8 | from pathlib import Path 9 | from tqdm import tqdm 10 | import numpy as np 11 | 12 | 13 | class DatasetLog: 14 | """ 15 | Registers metadata about the dataset in a text file. 16 | """ 17 | def __init__(self, root, name): 18 | self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") 19 | self.sample_data = dict() 20 | 21 | start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 22 | self.write_line("Creating dataset %s on %s" % (name, start_time)) 23 | self.write_line("-----") 24 | self._log_params() 25 | 26 | def _log_params(self): 27 | from encoder import params_data 28 | self.write_line("Parameter values:") 29 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 30 | value = getattr(params_data, param_name) 31 | self.write_line("\t%s: %s" % (param_name, value)) 32 | self.write_line("-----") 33 | 34 | def write_line(self, line): 35 | self.text_file.write("%s\n" % line) 36 | 37 | def add_sample(self, **kwargs): 38 | for param_name, value in kwargs.items(): 39 | if not param_name in self.sample_data: 40 | self.sample_data[param_name] = [] 41 | self.sample_data[param_name].append(value) 42 | 43 | def finalize(self): 44 | self.write_line("Statistics:") 45 | for param_name, values in self.sample_data.items(): 46 | self.write_line("\t%s:" % param_name) 47 | self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) 48 | self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) 49 | self.write_line("-----") 50 | end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 51 | self.write_line("Finished on %s" % end_time) 52 | self.text_file.close() 53 | 54 | 55 | def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): 56 | dataset_root = datasets_root.joinpath(dataset_name) 57 | if not dataset_root.exists(): 58 | print("Couldn\'t find %s, skipping this dataset." % dataset_root) 59 | return None, None 60 | return dataset_root, DatasetLog(out_dir, dataset_name) 61 | 62 | 63 | def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, 64 | skip_existing, logger): 65 | print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) 66 | 67 | # Function to preprocess utterances for one speaker 68 | def preprocess_speaker(speaker_dir: Path): 69 | # Give a name to the speaker that includes its dataset 70 | speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) 71 | 72 | # Create an output directory with that name, as well as a txt file containing a 73 | # reference to each source file. 74 | speaker_out_dir = out_dir.joinpath(speaker_name) 75 | speaker_out_dir.mkdir(exist_ok=True) 76 | sources_fpath = speaker_out_dir.joinpath("_sources.txt") 77 | 78 | # There's a possibility that the preprocessing was interrupted earlier, check if 79 | # there already is a sources file. 80 | if sources_fpath.exists(): 81 | try: 82 | with sources_fpath.open("r") as sources_file: 83 | existing_fnames = {line.split(",")[0] for line in sources_file} 84 | except: 85 | existing_fnames = {} 86 | else: 87 | existing_fnames = {} 88 | 89 | # Gather all audio files for that speaker recursively 90 | sources_file = sources_fpath.open("a" if skip_existing else "w") 91 | for in_fpath in speaker_dir.glob("**/*.%s" % extension): 92 | # Check if the target output file already exists 93 | out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) 94 | out_fname = out_fname.replace(".%s" % extension, ".npy") 95 | if skip_existing and out_fname in existing_fnames: 96 | continue 97 | 98 | # Load and preprocess the waveform 99 | wav = audio.preprocess_wav(in_fpath) 100 | if len(wav) == 0: 101 | continue 102 | 103 | # Create the mel spectrogram, discard those that are too short 104 | frames = audio.wav_to_mel_spectrogram(wav) 105 | if len(frames) < partials_n_frames: 106 | continue 107 | 108 | out_fpath = speaker_out_dir.joinpath(out_fname) 109 | np.save(out_fpath, frames) 110 | logger.add_sample(duration=len(wav) / sampling_rate) 111 | sources_file.write("%s,%s\n" % (out_fname, in_fpath)) 112 | 113 | sources_file.close() 114 | 115 | # Process the utterances for each speaker 116 | with ThreadPool(8) as pool: 117 | list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), 118 | unit="speakers")) 119 | logger.finalize() 120 | print("Done preprocessing %s.\n" % dataset_name) 121 | 122 | 123 | def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): 124 | for dataset_name in librispeech_datasets["train"]["other"]: 125 | # Initialize the preprocessing 126 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 127 | if not dataset_root: 128 | return 129 | 130 | # Preprocess all speakers 131 | speaker_dirs = list(dataset_root.glob("*")) 132 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", 133 | skip_existing, logger) 134 | 135 | 136 | def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): 137 | # Initialize the preprocessing 138 | dataset_name = "VoxCeleb1" 139 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 140 | if not dataset_root: 141 | return 142 | 143 | # Get the contents of the meta file 144 | with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: 145 | metadata = [line.split("\t") for line in metafile][1:] 146 | 147 | # Select the ID and the nationality, filter out non-anglophone speakers 148 | nationalities = {line[0]: line[3] for line in metadata} 149 | keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if 150 | nationality.lower() in anglophone_nationalites] 151 | 
print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % 152 | (len(keep_speaker_ids), len(nationalities))) 153 | 154 | # Get the speaker directories for anglophone speakers only 155 | speaker_dirs = dataset_root.joinpath("wav").glob("*") 156 | speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if 157 | speaker_dir.name in keep_speaker_ids] 158 | print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % 159 | (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) 160 | 161 | # Preprocess all speakers 162 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", 163 | skip_existing, logger) 164 | 165 | 166 | def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): 167 | # Initialize the preprocessing 168 | dataset_name = "VoxCeleb2" 169 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 170 | if not dataset_root: 171 | return 172 | 173 | # Get the speaker directories 174 | # Preprocess all speakers 175 | speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) 176 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", 177 | skip_existing, logger) 178 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/train.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.visualizations import Visualizations 4 | from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset 5 | from encoder.params_model import * 6 | from encoder.model import SpeakerEncoder 7 | from utils.profiler import Profiler 8 | from pathlib import Path 9 | import torch 10 | 11 | def sync(device: torch.device): 12 | # FIXME 13 | return 14 | # For correct profiling (cuda operations are async) 15 | if device.type == "cuda": 16 | torch.cuda.synchronize(device) 17 | 18 | def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, 19 | backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, 20 | no_visdom: bool): 21 | # Create a dataset and a dataloader 22 | dataset = SpeakerVerificationDataset(clean_data_root) 23 | loader = SpeakerVerificationDataLoader( 24 | dataset, 25 | speakers_per_batch, 26 | utterances_per_speaker, 27 | num_workers=8, 28 | ) 29 | 30 | # Setup the device on which to run the forward pass and the loss. These can be different, 31 | # because the forward pass is faster on the GPU whereas the loss is often (depending on your 32 | # hyperparameters) faster on the CPU. 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | # FIXME: currently, the gradient is None if loss_device is cuda 35 | loss_device = torch.device("cpu") 36 | 37 | # Create the model and the optimizer 38 | model = SpeakerEncoder(device, loss_device) 39 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) 40 | init_step = 1 41 | 42 | # Configure file path for the model 43 | state_fpath = models_dir.joinpath(run_id + ".pt") 44 | backup_dir = models_dir.joinpath(run_id + "_backups") 45 | 46 | # Load any existing model 47 | if not force_restart: 48 | if state_fpath.exists(): 49 | print("Found existing model \"%s\", loading it and resuming training." 
% run_id) 50 | checkpoint = torch.load(state_fpath) 51 | init_step = checkpoint["step"] 52 | model.load_state_dict(checkpoint["model_state"]) 53 | optimizer.load_state_dict(checkpoint["optimizer_state"]) 54 | optimizer.param_groups[0]["lr"] = learning_rate_init 55 | else: 56 | print("No model \"%s\" found, starting training from scratch." % run_id) 57 | else: 58 | print("Starting the training from scratch.") 59 | model.train() 60 | 61 | # Initialize the visualization environment 62 | vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) 63 | vis.log_dataset(dataset) 64 | vis.log_params() 65 | device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") 66 | vis.log_implementation({"Device": device_name}) 67 | 68 | # Training loop 69 | profiler = Profiler(summarize_every=10, disabled=False) 70 | for step, speaker_batch in enumerate(loader, init_step): 71 | profiler.tick("Blocking, waiting for batch (threaded)") 72 | 73 | # Forward pass 74 | inputs = torch.from_numpy(speaker_batch.data).to(device) 75 | sync(device) 76 | profiler.tick("Data to %s" % device) 77 | embeds = model(inputs) 78 | sync(device) 79 | profiler.tick("Forward pass") 80 | embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) 81 | loss, eer = model.loss(embeds_loss) 82 | sync(loss_device) 83 | profiler.tick("Loss") 84 | 85 | # Backward pass 86 | model.zero_grad() 87 | loss.backward() 88 | profiler.tick("Backward pass") 89 | model.do_gradient_ops() 90 | optimizer.step() 91 | profiler.tick("Parameter update") 92 | 93 | # Update visualizations 94 | # learning_rate = optimizer.param_groups[0]["lr"] 95 | vis.update(loss.item(), eer, step) 96 | 97 | # Draw projections and save them to the backup folder 98 | if umap_every != 0 and step % umap_every == 0: 99 | print("Drawing and saving projections (step %d)" % step) 100 | backup_dir.mkdir(exist_ok=True) 101 | projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) 102 | embeds = embeds.detach().cpu().numpy() 103 | vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) 104 | vis.save() 105 | 106 | # Overwrite the latest version of the model 107 | if save_every != 0 and step % save_every == 0: 108 | print("Saving the model (step %d)" % step) 109 | torch.save({ 110 | "step": step + 1, 111 | "model_state": model.state_dict(), 112 | "optimizer_state": optimizer.state_dict(), 113 | }, state_fpath) 114 | 115 | # Make a backup 116 | if backup_every != 0 and step % backup_every == 0: 117 | print("Making a backup (step %d)" % step) 118 | backup_dir.mkdir(exist_ok=True) 119 | backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) 120 | torch.save({ 121 | "step": step + 1, 122 | "model_state": model.state_dict(), 123 | "optimizer_state": optimizer.state_dict(), 124 | }, backup_fpath) 125 | 126 | profiler.tick("Extras (visualizations, saving)") 127 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/visualizations.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 4 | from datetime import datetime 5 | from time import perf_counter as timer 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | # import webbrowser 9 | import visdom 10 | import umap 11 | 12 | colormap = np.array([ 13 | 
[76, 255, 0], 14 | [0, 127, 70], 15 | [255, 0, 0], 16 | [255, 217, 38], 17 | [0, 135, 255], 18 | [165, 0, 165], 19 | [255, 167, 255], 20 | [0, 255, 255], 21 | [255, 96, 38], 22 | [142, 76, 0], 23 | [33, 0, 127], 24 | [0, 0, 0], 25 | [183, 183, 183], 26 | ], dtype=float) / 255 27 | 28 | 29 | class Visualizations: 30 | def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): 31 | # Tracking data 32 | self.last_update_timestamp = timer() 33 | self.update_every = update_every 34 | self.step_times = [] 35 | self.losses = [] 36 | self.eers = [] 37 | print("Updating the visualizations every %d steps." % update_every) 38 | 
39 | # If visdom is disabled TODO: use a better paradigm for that 40 | self.disabled = disabled 41 | if self.disabled: 42 | return 43 | 44 | # Set the environment name 45 | now = str(datetime.now().strftime("%d-%m %Hh%M")) 46 | if env_name is None: 47 | self.env_name = now 48 | else: 49 | self.env_name = "%s (%s)" % (env_name, now) 50 | 51 | # Connect to visdom and open the corresponding window in the browser 52 | try: 53 | self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) 54 | except ConnectionError: 55 | raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " 56 | "start it.") 57 | # webbrowser.open("http://localhost:8097/env/" + self.env_name) 58 | 
59 | # Create the windows 60 | self.loss_win = None 61 | self.eer_win = None 62 | # self.lr_win = None 63 | self.implementation_win = None 64 | self.projection_win = None 65 | self.implementation_string = "" 66 | 
67 | def log_params(self): 68 | if self.disabled: 69 | return 70 | from encoder import params_data 71 | from encoder import params_model 72 | param_string = "Model parameters:<br>" 73 | for param_name in (p for p in dir(params_model) if not p.startswith("__")): 74 | value = getattr(params_model, param_name) 75 | param_string += "\t%s: %s<br>" % (param_name, value) 76 | param_string += "Data parameters:<br>" 77 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 78 | value = getattr(params_data, param_name) 79 | param_string += "\t%s: %s<br>" % (param_name, value) 80 | self.vis.text(param_string, opts={"title": "Parameters"}) 81 | 
82 | def log_dataset(self, dataset: SpeakerVerificationDataset): 83 | if self.disabled: 84 | return 85 | dataset_string = "" 86 | dataset_string += "Speakers: %s\n" % len(dataset.speakers) 87 | dataset_string += "\n" + dataset.get_logs() 88 | dataset_string = dataset_string.replace("\n", "<br>") 89 | self.vis.text(dataset_string, opts={"title": "Dataset"}) 90 | 
91 | def log_implementation(self, params): 92 | if self.disabled: 93 | return 94 | implementation_string = "" 95 | for param, value in params.items(): 96 | implementation_string += "%s: %s\n" % (param, value) 97 | implementation_string = implementation_string.replace("\n", "<br>
") 98 | self.implementation_string = implementation_string 99 | self.implementation_win = self.vis.text( 100 | implementation_string, 101 | opts={"title": "Training implementation"} 102 | ) 103 | 104 | def update(self, loss, eer, step): 105 | # Update the tracking data 106 | now = timer() 107 | self.step_times.append(1000 * (now - self.last_update_timestamp)) 108 | self.last_update_timestamp = now 109 | self.losses.append(loss) 110 | self.eers.append(eer) 111 | print(".", end="") 112 | 113 | # Update the plots every steps 114 | if step % self.update_every != 0: 115 | return 116 | time_string = "Step time: mean: %5dms std: %5dms" % \ 117 | (int(np.mean(self.step_times)), int(np.std(self.step_times))) 118 | print("\nStep %6d Loss: %.4f EER: %.4f %s" % 119 | (step, np.mean(self.losses), np.mean(self.eers), time_string)) 120 | if not self.disabled: 121 | self.loss_win = self.vis.line( 122 | [np.mean(self.losses)], 123 | [step], 124 | win=self.loss_win, 125 | update="append" if self.loss_win else None, 126 | opts=dict( 127 | legend=["Avg. loss"], 128 | xlabel="Step", 129 | ylabel="Loss", 130 | title="Loss", 131 | ) 132 | ) 133 | self.eer_win = self.vis.line( 134 | [np.mean(self.eers)], 135 | [step], 136 | win=self.eer_win, 137 | update="append" if self.eer_win else None, 138 | opts=dict( 139 | legend=["Avg. EER"], 140 | xlabel="Step", 141 | ylabel="EER", 142 | title="Equal error rate" 143 | ) 144 | ) 145 | if self.implementation_win is not None: 146 | self.vis.text( 147 | self.implementation_string + ("%s" % time_string), 148 | win=self.implementation_win, 149 | opts={"title": "Training implementation"}, 150 | ) 151 | 152 | # Reset the tracking 153 | self.losses.clear() 154 | self.eers.clear() 155 | self.step_times.clear() 156 | 157 | def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, 158 | max_speakers=10): 159 | max_speakers = min(max_speakers, len(colormap)) 160 | embeds = embeds[:max_speakers * utterances_per_speaker] 161 | 162 | n_speakers = len(embeds) // utterances_per_speaker 163 | ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) 164 | colors = [colormap[i] for i in ground_truth] 165 | 166 | reducer = umap.UMAP() 167 | projected = reducer.fit_transform(embeds) 168 | plt.scatter(projected[:, 0], projected[:, 1], c=colors) 169 | plt.gca().set_aspect("equal", "datalim") 170 | plt.title("UMAP projection (step %d)" % step) 171 | if not self.disabled: 172 | self.projection_win = self.vis.matplot(plt, win=self.projection_win) 173 | if out_fpath is not None: 174 | plt.savefig(out_fpath) 175 | plt.clf() 176 | 177 | def save(self): 178 | if not self.disabled: 179 | self.vis.save([self.env_name]) 180 | -------------------------------------------------------------------------------- /speaker_encoder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | -------------------------------------------------------------------------------- /speaker_encoder/utils/argutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from pathlib import Path 4 | import numpy as np 5 | import argparse 6 | 7 | _type_priorities = [ # In decreasing order 8 | Path, 9 | str, 10 | int, 11 | float, 12 | bool, 13 | ] 14 | 15 | def _priority(o): 16 | p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) 17 | if p is not None: 
18 | return p 19 | p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None) 20 | if p is not None: 21 | return p 22 | return len(_type_priorities) 23 | 24 | def print_args(args: argparse.Namespace, parser=None): 25 | args = vars(args) 26 | if parser is None: 27 | priorities = list(map(_priority, args.values())) 28 | else: 29 | all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] 30 | priority = lambda p: all_params.index(p) if p in all_params else len(all_params) 31 | priorities = list(map(priority, args.keys())) 32 | 33 | pad = max(map(len, args.keys())) + 3 34 | indices = np.lexsort((list(args.keys()), priorities)) 35 | items = list(args.items()) 36 | 37 | print("Arguments:") 38 | for i in indices: 39 | param, value = items[i] 40 | print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) 41 | print("") 42 | -------------------------------------------------------------------------------- /speaker_encoder/utils/logmmse.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | import math 5 | from scipy.special import expn 6 | from collections import namedtuple 7 | 8 | NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") 9 | 10 | 11 | def profile_noise(noise, sampling_rate, window_size=0): 12 | """ 13 | Creates a profile of the noise in a given waveform. 14 | 15 | :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. 16 | :param sampling_rate: the sampling rate of the audio 17 | :param window_size: the size of the window the logmmse algorithm operates on. A default value 18 | will be picked if left as 0. 19 | :return: a NoiseProfile object 20 | """ 21 | noise, dtype = to_float(noise) 22 | noise += np.finfo(np.float64).eps 23 | 24 | if window_size == 0: 25 | window_size = int(math.floor(0.02 * sampling_rate)) 26 | 27 | if window_size % 2 == 1: 28 | window_size = window_size + 1 29 | 30 | perc = 50 31 | len1 = int(math.floor(window_size * perc / 100)) 32 | len2 = int(window_size - len1) 33 | 34 | win = np.hanning(window_size) 35 | win = win * len2 / np.sum(win) 36 | n_fft = 2 * window_size 37 | 38 | noise_mean = np.zeros(n_fft) 39 | n_frames = len(noise) // window_size 40 | for j in range(0, window_size * n_frames, window_size): 41 | noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) 42 | noise_mu2 = (noise_mean / n_frames) ** 2 43 | 44 | return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) 45 | 46 | 47 | def denoise(wav, noise_profile: NoiseProfile, eta=0.15): 48 | """ 49 | Cleans the noise from a speech waveform given a noise profile. The waveform must have the 50 | same sampling rate as the one used to create the noise profile. 51 | 52 | :param wav: a speech waveform as a numpy array of floats or ints. 53 | :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of 54 | the same) waveform. 55 | :param eta: voice threshold for noise update. While the voice activation detection value is 56 | below this threshold, the noise profile will be continuously updated throughout the audio. 57 | Set to 0 to disable updating the noise profile. 58 | :return: the clean wav as a numpy array of floats or ints of the same length. 
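Typical usage (a minimal sketch using only the two functions defined in this module): profile = profile_noise(noise_only_wav, sampling_rate) followed by clean_wav = denoise(noisy_wav, profile), where both waveforms share the sampling rate the profile was built from.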
59 | """ 60 | wav, dtype = to_float(wav) 61 | wav += np.finfo(np.float64).eps 62 | p = noise_profile 63 | 64 | nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) 65 | x_final = np.zeros(nframes * p.len2) 66 | 67 | aa = 0.98 68 | mu = 0.98 69 | ksi_min = 10 ** (-25 / 10) 70 | 71 | x_old = np.zeros(p.len1) 72 | xk_prev = np.zeros(p.len1) 73 | noise_mu2 = p.noise_mu2 74 | for k in range(0, nframes * p.len2, p.len2): 75 | insign = p.win * wav[k:k + p.window_size] 76 | 77 | spec = np.fft.fft(insign, p.n_fft, axis=0) 78 | sig = np.absolute(spec) 79 | sig2 = sig ** 2 80 | 81 | gammak = np.minimum(sig2 / noise_mu2, 40) 82 | 83 | if xk_prev.all() == 0: 84 | ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 85 | else: 86 | ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 87 | ksi = np.maximum(ksi_min, ksi) 88 | 89 | log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) 90 | vad_decision = np.sum(log_sigma_k) / p.window_size 91 | if vad_decision < eta: 92 | noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 93 | 94 | a = ksi / (1 + ksi) 95 | vk = a * gammak 96 | ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 97 | hw = a * np.exp(ei_vk) 98 | sig = sig * hw 99 | xk_prev = sig ** 2 100 | xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) 101 | xi_w = np.real(xi_w) 102 | 103 | x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] 104 | x_old = xi_w[p.len1:p.window_size] 105 | 106 | output = from_float(x_final, dtype) 107 | output = np.pad(output, (0, len(wav) - len(output)), mode="constant") 108 | return output 109 | 110 | 111 | ## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that 112 | ## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of 113 | ## webrctvad 114 | # def vad(wav, sampling_rate, eta=0.15, window_size=0): 115 | # """ 116 | # TODO: fix doc 117 | # Creates a profile of the noise in a given waveform. 118 | # 119 | # :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. 120 | # :param sampling_rate: the sampling rate of the audio 121 | # :param window_size: the size of the window the logmmse algorithm operates on. A default value 122 | # will be picked if left as 0. 123 | # :param eta: voice threshold for noise update. While the voice activation detection value is 124 | # below this threshold, the noise profile will be continuously updated throughout the audio. 125 | # Set to 0 to disable updating the noise profile. 
126 | # """ 127 | # wav, dtype = to_float(wav) 128 | # wav += np.finfo(np.float64).eps 129 | # 130 | # if window_size == 0: 131 | # window_size = int(math.floor(0.02 * sampling_rate)) 132 | # 133 | # if window_size % 2 == 1: 134 | # window_size = window_size + 1 135 | # 136 | # perc = 50 137 | # len1 = int(math.floor(window_size * perc / 100)) 138 | # len2 = int(window_size - len1) 139 | # 140 | # win = np.hanning(window_size) 141 | # win = win * len2 / np.sum(win) 142 | # n_fft = 2 * window_size 143 | # 144 | # wav_mean = np.zeros(n_fft) 145 | # n_frames = len(wav) // window_size 146 | # for j in range(0, window_size * n_frames, window_size): 147 | # wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) 148 | # noise_mu2 = (wav_mean / n_frames) ** 2 149 | # 150 | # wav, dtype = to_float(wav) 151 | # wav += np.finfo(np.float64).eps 152 | # 153 | # nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) 154 | # vad = np.zeros(nframes * len2, dtype=np.bool) 155 | # 156 | # aa = 0.98 157 | # mu = 0.98 158 | # ksi_min = 10 ** (-25 / 10) 159 | # 160 | # xk_prev = np.zeros(len1) 161 | # noise_mu2 = noise_mu2 162 | # for k in range(0, nframes * len2, len2): 163 | # insign = win * wav[k:k + window_size] 164 | # 165 | # spec = np.fft.fft(insign, n_fft, axis=0) 166 | # sig = np.absolute(spec) 167 | # sig2 = sig ** 2 168 | # 169 | # gammak = np.minimum(sig2 / noise_mu2, 40) 170 | # 171 | # if xk_prev.all() == 0: 172 | # ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 173 | # else: 174 | # ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 175 | # ksi = np.maximum(ksi_min, ksi) 176 | # 177 | # log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) 178 | # vad_decision = np.sum(log_sigma_k) / window_size 179 | # if vad_decision < eta: 180 | # noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 181 | # print(vad_decision) 182 | # 183 | # a = ksi / (1 + ksi) 184 | # vk = a * gammak 185 | # ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 186 | # hw = a * np.exp(ei_vk) 187 | # sig = sig * hw 188 | # xk_prev = sig ** 2 189 | # 190 | # vad[k:k + len2] = vad_decision >= eta 191 | # 192 | # vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") 193 | # return vad 194 | 195 | 196 | def to_float(_input): 197 | if _input.dtype == np.float64: 198 | return _input, _input.dtype 199 | elif _input.dtype == np.float32: 200 | return _input.astype(np.float64), _input.dtype 201 | elif _input.dtype == np.uint8: 202 | return (_input - 128) / 128., _input.dtype 203 | elif _input.dtype == np.int16: 204 | return _input / 32768., _input.dtype 205 | elif _input.dtype == np.int32: 206 | return _input / 2147483648., _input.dtype 207 | raise ValueError('Unsupported wave file format') 208 | 209 | 210 | def from_float(_input, dtype): 211 | if dtype == np.float64: 212 | return _input, np.float64 213 | elif dtype == np.float32: 214 | return _input.astype(np.float32) 215 | elif dtype == np.uint8: 216 | return ((_input * 128) + 128).astype(np.uint8) 217 | elif dtype == np.int16: 218 | return (_input * 32768).astype(np.int16) 219 | elif dtype == np.int32: 220 | print(_input) 221 | return (_input * 2147483648).astype(np.int32) 222 | raise ValueError('Unsupported wave file format') 223 | -------------------------------------------------------------------------------- /speaker_encoder/utils/profiler.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from time import 
perf_counter as timer 4 | from collections import OrderedDict 5 | import numpy as np 6 | 7 | 8 | class Profiler: 9 | def __init__(self, summarize_every=5, disabled=False): 10 | self.last_tick = timer() 11 | self.logs = OrderedDict() 12 | self.summarize_every = summarize_every 13 | self.disabled = disabled 14 | 15 | def tick(self, name): 16 | if self.disabled: 17 | return 18 | 19 | # Log the time needed to execute that function 20 | if not name in self.logs: 21 | self.logs[name] = [] 22 | if len(self.logs[name]) >= self.summarize_every: 23 | self.summarize() 24 | self.purge_logs() 25 | self.logs[name].append(timer() - self.last_tick) 26 | 27 | self.reset_timer() 28 | 29 | def purge_logs(self): 30 | for name in self.logs: 31 | self.logs[name].clear() 32 | 33 | def reset_timer(self): 34 | self.last_tick = timer() 35 | 36 | def summarize(self): 37 | n = max(map(len, self.logs.values())) 38 | assert n == self.summarize_every 39 | print("\nAverage execution time over %d steps:" % n) 40 | 41 | name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] 42 | pad = max(map(len, name_msgs)) 43 | for name_msg, deltas in zip(name_msgs, self.logs.values()): 44 | print(" %s mean: %4.0fms std: %4.0fms" % 45 | (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) 46 | print("", flush=True) 47 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy.io import wavfile 12 | 13 | 14 | def save_plot(tensor, savepath): 15 | plt.style.use('default') 16 | fig, ax = plt.subplots(figsize=(12, 3)) 17 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') 18 | plt.colorbar(im, ax=ax) 19 | plt.tight_layout() 20 | fig.canvas.draw() 21 | plt.savefig(savepath) 22 | plt.close() 23 | 24 | 25 | def save_audio(file_path, sampling_rate, audio): 26 | audio = np.clip(audio.detach().cpu().squeeze().numpy(), -0.999, 0.999) 27 | wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) 28 | -------------------------------------------------------------------------------- /var.env: -------------------------------------------------------------------------------- 1 | 2 | MODEL_VERSION = "1" 3 | TRITON_MODEL_NAME = "vc_pipeline_python" 4 | 5 | INPUT_NAME = "INPUT" 6 | OUTPUT_NAME= "OUTPUT" 7 | 8 | 9 | PYTHONUNBUFFERED=TRUE 10 | 11 | --------------------------------------------------------------------------------