├── .dockerignore ├── .gitignore ├── Docker-compose.yml ├── Dockerfile ├── LICENSE ├── README.md ├── THIRD_PARTY_NOTICE ├── api.py ├── app_gradio.py ├── checkpts ├── spk_encoder │ └── LICENSE ├── vc │ ├── train_dec_libritts_wodyn.log │ ├── train_dec_vctk_wodyn.log │ ├── train_enc_libritts.log │ └── train_enc_vctk.log └── vocoder │ ├── LICENSE │ └── config.json ├── deploy ├── Dockerfile └── model_repository │ ├── vc_pipeline_python │ ├── 1 │ │ ├── model.py │ │ └── pipeline │ │ │ └── __init__.py │ └── config.pbtxt │ ├── vc_spk_encoder │ └── config.pbtxt │ └── vc_vocoder │ └── config.pbtxt ├── example ├── 6415_111615_000012_000005.wav └── 8534_216567_000015_000010.wav ├── export_onnx ├── __init__.py ├── export_hifigan.py ├── export_spk_enc.py └── onnx_check.py ├── filelists ├── exceptions_libritts.txt ├── exceptions_vctk.txt └── valid.txt ├── get_avg_mels.ipynb ├── hifi-gan ├── LICENSE ├── README.md ├── env.py ├── meldataset.py ├── models.py └── xutils.py ├── inference.py ├── inference_pipeline.ipynb ├── model ├── __init__.py ├── base.py ├── diffusion.py ├── encoder.py ├── modules.py ├── postnet.py ├── utils.py └── vc.py ├── params.py ├── requirements.txt ├── run-container.sh ├── scenario ├── __init__.py ├── prepare_data.py ├── train_dec.py └── train_enc.py ├── speaker_encoder ├── LICENSE ├── README.md ├── __init__.py ├── encoder │ ├── __init__.py │ ├── audio.py │ ├── config.py │ ├── data_objects │ │ ├── __init__.py │ │ ├── random_cycler.py │ │ ├── speaker.py │ │ ├── speaker_batch.py │ │ ├── speaker_verification_dataset.py │ │ └── utterance.py │ ├── inference.py │ ├── model.py │ ├── params_data.py │ ├── params_model.py │ ├── preprocess.py │ ├── train.py │ └── visualizations.py └── utils │ ├── __init__.py │ ├── argutils.py │ ├── logmmse.py │ └── profiler.py ├── utils.py └── var.env /.dockerignore: -------------------------------------------------------------------------------- 1 | /__pycache__/* 2 | /checkpts 3 | /deploy 4 | /example 5 | /export_oxxn 6 | /filelists 7 | trash*.* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # myself 3 | requirement_env.txt 4 | checkpts/spk_encoder/pretrained.pt 5 | checkpts/vc/vc_libritts_wodyn.pt 6 | checkpts/vocoder/generator 7 | trash*.py 8 | deploy/* 9 | output_demo/* 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | **/__pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | share/python-wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | 115 | # pdm 116 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 117 | #pdm.lock 118 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 119 | # in version control. 120 | # https://pdm.fming.dev/#use-with-ide 121 | .pdm.toml 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
171 | #.idea/ -------------------------------------------------------------------------------- /Docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | vc-triton: 5 | image: nvcr.io/nvidia/tritonserver:21.10-py3 6 | shm_size: '1gb' #<-- when RUNNING 7 | container_name: triton_multi_ensemble 8 | restart: unless-stopped 9 | networks: 10 | - vcnetwork 11 | hostname: vctriton 12 | ports: 13 | - "8030-8032:8000-8002" 14 | environment: 15 | - HOME=/config # fix "Home directory not accessible: Permission denied docker" when calling espeak 16 | volumes: 17 | - ./hifi-gan:/hifi-gan 18 | - ./speaker_encoder:/speaker_encoder 19 | - ./deploy/model_repository:/models # model repository expected by --model-repository=/models 20 | command: bash -c "tritonserver --model-repository=/models --log-verbose 1" 21 | deploy: 22 | resources: 23 | reservations: 24 | devices: 25 | - driver: nvidia 26 | count: 1 27 | capabilities: [gpu] 28 | 29 | 30 | vc-api: 31 | container_name: voice-conversion-api 32 | build: . 33 | restart: always 34 | networks: 35 | - vcnetwork 36 | environment: 37 | TRITON_URL: "vc-triton:8001" # container-side gRPC port; 8031 is only the host-mapped port 38 | ports: 39 | - 1513:1513 40 | depends_on: 41 | - "vc-triton" 42 | command: bash -c "uvicorn api:app --host 0.0.0.0 --port 1513 --workers 1" 43 | 44 | 45 | 46 | networks: 47 | vcnetwork: 48 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime 2 | 3 | WORKDIR /workspace 4 | 5 | RUN apt-get update \ 6 | && apt-get install curl libcurl4-openssl-dev libb64-dev -y \ 7 | && apt-get install libsndfile1-dev -y \ 8 | && pip install --upgrade pip 9 | RUN pip install torchaudio==0.8.1 10 | # setup for librosa 11 | RUN apt-get install -y libsndfile1 12 | 13 | COPY requirements.txt requirements.txt 14 | 15 | RUN pip install -r requirements.txt --no-cache-dir 16 | 17 | COPY . . 18 | # CMD ["python3", "app_gradio.py"] 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Huawei Technologies Co., Ltd. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE.
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Diffusion-Based Any-to-Any Voice Conversion 2 | 3 | ### Introduction 4 | - This repository is a derivative of the official implementation of the paper "Diffusion-Based Voice Conversion with Fast Maximum Likelihood Sampling Scheme" [Link](https://arxiv.org/abs/2109.13821). It builds upon that work and incorporates additional features and modifications specific to this project. 5 | 6 | 7 | - [The Official Demo Page](https://diffvc-fast-ml-solver.github.io/). 8 | 9 | # Pre-trained models 10 | 11 | - Please check `inference_pipeline.ipynb` for detailed instructions. 12 | 13 | - The pre-trained speaker encoder we use is available at https://drive.google.com/file/d/1Y8IO2_OqeT85P1kks9I9eeAq--S65YFb/view?usp=sharing 14 | Please put it in `checkpts/spk_encoder/` 15 | 16 | - The pre-trained universal HiFi-GAN vocoder we use is available at https://drive.google.com/file/d/10khlrM645pTbQ4rc2aNEYPba8RFDBkW-/view?usp=sharing. It is taken from the official HiFi-GAN repository. Please put it in `checkpts/vocoder/` 17 | 18 | - You have to download the voice conversion model trained on LibriTTS from here: https://drive.google.com/file/d/18Xbme0CTVo58p2vOHoTQm8PBGW7oEjAy/view?usp=sharing 19 | 20 | - Additionally, we provide a voice conversion model trained on VCTK: https://drive.google.com/file/d/12s9RPmwp9suleMkBCVetD8pub7wsDAy4/view?usp=sharing 21 | Please put both models in `checkpts/vc/` 22 | 23 | # Build docker environment 24 | 25 | - To build the image, run: 26 | ```bash 27 | docker build -t diffvc . 28 | ``` 29 | 30 | - To run a container for development, run: 31 | ```bash 32 | bash run-container.sh 33 | ``` 34 | 35 | # Training your own model 36 | 37 | - To train the model on your own data, first create a data directory with three folders: "wavs", "mels" and "embeds". Put raw audio files sampled at 22.05kHz into the "wavs" directory. The functions for calculating mel-spectrograms and extracting 256-dimensional speaker embeddings with the pre-trained speaker verification network located at *checkpts/spk_encoder/* can be found in the *inference_pipeline.ipynb* notebook (*get_mel* and *get_embed* respectively); a minimal preprocessing sketch is also given after this section. Please put these data in the "mels" and "embeds" folders respectively. Note that all the folders in your data directory should have subfolders corresponding to particular speakers and containing data only for the corresponding speakers. 38 | 39 | - If you want to train the encoder, create a "logs_enc" directory and run *train_enc.py*. Before that, you have to prepare another folder "mels_mode" with mel-spectrograms of the "average voice" (i.e. target mels for the encoder) in the data directory. To obtain them, you have to run Montreal Forced Aligner on the input mels, get *.TextGrid* files and put them in a "textgrids" folder in the data directory. Once you have the "mels" and "textgrids" folders, run *get_avg_mels.ipynb*. 40 | `python3 -m scenario.train_enc` 41 | - Alternatively, you may load the encoder trained on LibriTTS from https://drive.google.com/file/d/1JdoC5hh7k6Nz_oTcumH0nXNEib-GDbSq/view?usp=sharing and put it in the "logs_enc" directory. 42 | 43 | - Once you have the encoder *enc.pt* in the "logs_enc" directory, create a "logs_dec" directory and run *train_dec.py* to train the diffusion-based decoder. 44 | `python3 -m scenario.train_dec` 45 | - Please check *params.py* for the most important hyperparameters.
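The snippet below is a minimal preprocessing sketch for filling the "mels" and "embeds" folders described above. It is an illustration rather than the reference implementation from the notebook: it reuses the mel settings from *checkpts/vocoder/config.json* (22050 Hz, 1024-point FFT, hop 256, 80 mel bins, fmax 8000) and the pre-trained speaker encoder, and it assumes a hypothetical `data/` directory laid out as `data/wavs/<speaker>/*.wav`. Check *scenario/prepare_data.py* and *params.py* for the exact file naming the training scripts expect.

```python
import os
import sys
import numpy as np
import torch
from pathlib import Path
from librosa.core import load
from librosa.filters import mel as librosa_mel_fn

sys.path.append('speaker_encoder/')
from encoder import inference as spk_encoder

# Mel filterbank matching checkpts/vocoder/config.json (22050 Hz, n_fft 1024, 80 mels, fmax 8000)
mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000)

def get_mel(wav_path):
    """Load a 22.05 kHz wav and return an (80, T) log-mel spectrogram."""
    wav, _ = load(wav_path, sr=22050)
    spec = torch.stft(torch.from_numpy(wav).float().unsqueeze(0), n_fft=1024,
                      hop_length=256, win_length=1024,
                      window=torch.hann_window(1024), return_complex=True)
    spec = spec.abs().squeeze(0).numpy()
    return np.log(np.clip(np.matmul(mel_basis, spec), 1e-5, None))

def get_embed(wav_path):
    """Return the 256-dim speaker embedding from the pre-trained verification network."""
    return spk_encoder.embed_utterance(spk_encoder.preprocess_wav(wav_path))

if __name__ == '__main__':
    spk_encoder.load_model(Path('checkpts/spk_encoder/pretrained.pt'), device='cpu')
    data_dir = 'data'  # hypothetical data directory: data/wavs/<speaker>/*.wav
    for speaker in sorted(os.listdir(os.path.join(data_dir, 'wavs'))):
        os.makedirs(os.path.join(data_dir, 'mels', speaker), exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'embeds', speaker), exist_ok=True)
        for name in sorted(os.listdir(os.path.join(data_dir, 'wavs', speaker))):
            wav_path = os.path.join(data_dir, 'wavs', speaker, name)
            base = os.path.splitext(name)[0]
            np.save(os.path.join(data_dir, 'mels', speaker, base + '_mel.npy'), get_mel(wav_path))
            np.save(os.path.join(data_dir, 'embeds', speaker, base + '_embed.npy'), get_embed(wav_path))
```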
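For completeness, the FastAPI service in *api.py* (started by the `vc-api` service in *Docker-compose.yml*, which publishes port 1513) can be exercised with a short client such as the sketch below. The host, port, and the use of the bundled example wavs are assumptions taken from those files, not an official client.

```python
import requests

# Hypothetical client for the /convert endpoint defined in api.py.
# Assumes the vc-api container from Docker-compose.yml is listening on localhost:1513.
url = "http://localhost:1513/convert"
with open("example/8534_216567_000015_000010.wav", "rb") as src, \
     open("example/6415_111615_000012_000005.wav", "rb") as tgt:
    # file1 = source speech, file2 = target-speaker reference (parameter names from api.py)
    response = requests.post(url, files={"file1": src, "file2": tgt})

response.raise_for_status()
with open("converted.wav", "wb") as out:
    out.write(response.content)  # the endpoint returns the converted audio file
```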
46 | 47 | # Demo 48 | 49 | - To launch gradio demo app, run: 50 | ```bash 51 | python3 app_gradio.py 52 | ``` 53 | 54 | # Serve model (developing) 55 | 56 | 1. Convert model from .pt to .onnx 57 | ```bash 58 | python3 -m export_onnx.export_hifigan 59 | ``` 60 | 61 | ```bash 62 | python3 -m export_onnx.export_spk_enc 63 | ``` 64 | 65 | 2. Deploy pipeline using Triton Inference Server: 66 | 67 | -------------------------------------------------------------------------------- /THIRD_PARTY_NOTICE: -------------------------------------------------------------------------------- 1 | Please note we provide an open source software notice for the third party 2 | open source software along with this software and/or this software component 3 | contributed by Huawei (in the following just “this SOFTWARE”). The open source 4 | software licenses are granted by the respective right holders. 5 | 6 | WARRANTY DISCLAIMER 7 | THE OPEN SOURCE SOFTWARE IN THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT WILL 8 | BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF 9 | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE APPLICABLE LICENSES 10 | FOR MORE DETAILS. 11 | 12 | COPYRIGHT NOTICE AND LICENSE TEXTS 13 | 14 | SOFTWARE: HiFi-GAN 15 | Copyright (c) 2020 Jungil Kong 16 | License: MIT 17 | Permission is hereby granted, free of charge, to any person obtaining a copy 18 | of this software and associated documentation files (the "Software"), to deal 19 | in the Software without restriction, including without limitation the rights 20 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 21 | copies of the Software, and to permit persons to whom the Software is 22 | furnished to do so, subject to the following conditions: 23 | 24 | The above copyright notice and this permission notice shall be included in all 25 | copies or substantial portions of the Software. 26 | 27 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 28 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 30 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 31 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 32 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 33 | SOFTWARE. 
34 | 35 | SOFTWARE: Real-Time Voice Cloning 36 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 37 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 38 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 39 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 40 | License: MIT 41 | Text: See above -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import uuid 4 | import json 5 | from time import time 6 | from loguru import logger 7 | import numpy as np 8 | import torch 9 | 10 | from fastapi import FastAPI, Response, status, File, UploadFile, Body 11 | from starlette.middleware.cors import CORSMiddleware 12 | from pydantic import BaseModel, Field 13 | 14 | 15 | from inference import Inferencer 16 | 17 | import params 18 | from model import DiffVC 19 | 20 | import sys 21 | sys.path.append('hifi-gan/') 22 | from env import AttrDict 23 | from models import Generator as HiFiGAN 24 | 25 | sys.path.append('speaker_encoder/') 26 | from encoder import inference as spk_encoder 27 | from pathlib import Path 28 | 29 | 30 | use_gpu = torch.cuda.is_available() 31 | vc_path = 'checkpts/vc/vc_libritts_wodyn.pt' # path to voice conversion model 32 | 33 | generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads, 34 | params.layers, params.kernel, params.dropout, params.window_size, 35 | params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim, 36 | params.beta_min, params.beta_max) 37 | if use_gpu: 38 | generator = generator.cuda() 39 | generator.load_state_dict(torch.load(vc_path)) 40 | else: 41 | generator.load_state_dict(torch.load(vc_path, map_location='cpu')) 42 | generator.eval() 43 | 44 | 45 | # loading HiFi-GAN vocoder 46 | hfg_path = 'checkpts/vocoder/' # HiFi-GAN path 47 | 48 | with open(hfg_path + 'config.json') as f: 49 | h = AttrDict(json.load(f)) 50 | 51 | if use_gpu: 52 | hifigan_universal = HiFiGAN(h).cuda() 53 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator')['generator']) 54 | else: 55 | hifigan_universal = HiFiGAN(h) 56 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', map_location='cpu')['generator']) 57 | 58 | _ = hifigan_universal.eval() 59 | hifigan_universal.remove_weight_norm() 60 | 61 | 62 | # loading speaker encoder 63 | enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt') # speaker encoder path 64 | if use_gpu: 65 | spk_encoder.load_model(enc_model_fpath, device="cuda") 66 | else: 67 | spk_encoder.load_model(enc_model_fpath, device="cpu") 68 | 69 | # Make dir to save audio files log 70 | MEDIA_ROOT = os.path.join('/logs', 'media') 71 | if not os.path.exists(MEDIA_ROOT): 72 | os.makedirs(MEDIA_ROOT) 73 | 74 | # Make dir to save json response log 75 | LOG_ROOT = os.path.join('/logs', 'json') 76 | if not os.path.exists(LOG_ROOT): 77 | os.makedirs(LOG_ROOT) 78 | 79 | # Define Inferencer (MEDIA_ROOT must be defined before this point) 80 | _inferencer = Inferencer(generator, spk_encoder, hifigan_universal, MEDIA_ROOT, True) 81 | 82 | def save_audio(file): 83 | job_id = str(uuid.uuid4()) 84 | output_dir = os.path.join(MEDIA_ROOT, str(job_id)) 85 | if not os.path.exists(output_dir): 86 | os.makedirs(output_dir) 87 | audio_save_path = os.path.join(output_dir, file.filename) 88 | with open(audio_save_path, "wb+") as file_object: 89 | file_object.write(file.file.read()) 90 | 91 | return
audio_save_path 92 | 93 | 94 | 95 | 96 | app = FastAPI( 97 | title="Voice Conversion", 98 | ) 99 | 100 | app.add_middleware( 101 | CORSMiddleware, 102 | allow_origins=["*"], 103 | allow_credentials=True, 104 | allow_methods=["*"], 105 | allow_headers=["*"], 106 | ) 107 | 108 | 109 | @app.get('/', status_code=status.HTTP_200_OK) 110 | async def check_status(response: Response): 111 | api_status = {"API Status": "Running"} 112 | return api_status 113 | 114 | 115 | 116 | @app.post('/convert', status_code=200) 117 | async def convert(response:Response, file1: UploadFile = File(...), file2: UploadFile = File(...) ): 118 | # Save source and target to MEDIA 119 | source_fpath = save_audio(file1) 120 | target_fpath = save_audio(file2) 121 | 122 | audio = _inferencer.infer(src_path=audio_path, tgt_path=target_path, return_output_path=False) 123 | 124 | return audio 125 | 126 | 127 | -------------------------------------------------------------------------------- /app_gradio.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import os 3 | import uuid 4 | import torch 5 | import json 6 | from inference import Inferencer 7 | 8 | import params 9 | from model import DiffVC 10 | 11 | import sys 12 | sys.path.append('hifi-gan/') 13 | from env import AttrDict 14 | from models import Generator as HiFiGAN 15 | 16 | sys.path.append('speaker_encoder/') 17 | from encoder import inference as spk_encoder 18 | from pathlib import Path 19 | 20 | use_gpu = torch.cuda.is_available() 21 | 22 | MEDIA_ROOT = os.path.join('/logs', 'media') 23 | if not os.path.exists(MEDIA_ROOT): 24 | os.makedirs(MEDIA_ROOT) 25 | 26 | # load voice conversion 27 | vc_path = 'checkpts/vc/vc_libritts_wodyn.pt' # path to voice conversion model 28 | 29 | generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads, 30 | params.layers, params.kernel, params.dropout, params.window_size, 31 | params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim, 32 | params.beta_min, params.beta_max) 33 | if use_gpu: 34 | generator = generator.cuda() 35 | generator.load_state_dict(torch.load(vc_path)) 36 | else: 37 | generator.load_state_dict(torch.load(vc_path, map_location='cpu')) 38 | generator.eval() 39 | 40 | 41 | # loading HiFi-GAN vocoder 42 | hfg_path = 'checkpts/vocoder/' # HiFi-GAN path 43 | 44 | with open(hfg_path + 'config.json') as f: 45 | h = AttrDict(json.load(f)) 46 | 47 | if use_gpu: 48 | hifigan_universal = HiFiGAN(h).cuda() 49 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator')['generator']) 50 | else: 51 | hifigan_universal = HiFiGAN(h) 52 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', map_location='cpu')['generator']) 53 | 54 | _ = hifigan_universal.eval() 55 | hifigan_universal.remove_weight_norm() 56 | 57 | 58 | # loading speaker encoder 59 | enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt') # speaker encoder path 60 | if use_gpu: 61 | spk_encoder.load_model(enc_model_fpath, device="cuda") 62 | else: 63 | spk_encoder.load_model(enc_model_fpath, device="cpu") 64 | 65 | 66 | # define inference object 67 | _inferencer = Inferencer(generator, spk_encoder, hifigan_universal, MEDIA_ROOT, True) 68 | 69 | 70 | def _inference(audio_path, target_path, mic_path1=None, mic_path2=None): 71 | 72 | if mic_path1: 73 | audio_path = mic_path1 74 | if mic_path2: 75 | target_path = mic_path2 76 | 77 | output_path = _inferencer.infer(src_path=audio_path, tgt_path=target_path, return_output_path=True) 78 | 79 | return 
output_path 80 | 81 | # gradio app 82 | 83 | title = "VC-DEMO" 84 | description = "Gradio demo for Voice Conversion" 85 | # examples = [['./test_wav/p225_001.wav', "./test_wav/p226_001.wav"]] 86 | 87 | 88 | def toggle(choice): 89 | if choice == "mic": 90 | return gr.update(visible=True, value=None), gr.update(visible=False, value=None) 91 | else: 92 | return gr.update(visible=False, value=None), gr.update(visible=True, value=None) 93 | 94 | 95 | with gr.Blocks() as demo: 96 | with gr.Row(): 97 | with gr.Column(): 98 | radio1 = gr.Radio(["mic", "file"], value="file", 99 | label="How would you like to upload your audio?") 100 | 101 | mic_input1 = gr.Mic(label="Input", type="filepath", visible=False) 102 | audio_input = gr.Audio( 103 | type="filepath", label="Input", visible=True) 104 | 105 | radio2 = gr.Radio(["mic", "file"], value="file", 106 | label="How would you like to upload your audio?") 107 | mic_input2 = gr.Mic(label="Target", type="filepath", visible=False) 108 | audio_target = gr.Audio( 109 | type="filepath", label="Target", visible=True) 110 | with gr.Column(): 111 | audio_output = gr.Audio(label="Output") 112 | 113 | # gr.Examples(examples, fn=_inference, inputs=[audio_input, audio_target], 114 | # outputs=audio_output, cache_examples=True) 115 | 116 | btn = gr.Button("Generate") 117 | btn.click(_inference, inputs=[audio_input, 118 | audio_target, mic_input1, mic_input2], outputs=audio_output) 119 | radio1.change(toggle, radio1, [mic_input1, audio_input]) 120 | radio2.change(toggle, radio2, [mic_input2, audio_target]) 121 | 122 | demo.launch(enable_queue=True, server_port=1402, server_name="0.0.0.0", share=True) -------------------------------------------------------------------------------- /checkpts/spk_encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 
25 | -------------------------------------------------------------------------------- /checkpts/vc/train_dec_libritts_wodyn.log: -------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.1397 2 | Epoch 2: loss = 0.1209 3 | Epoch 3: loss = 0.1192 4 | Epoch 4: loss = 0.1185 5 | Epoch 5: loss = 0.1170 6 | Epoch 6: loss = 0.1161 7 | Epoch 7: loss = 0.1157 8 | Epoch 8: loss = 0.1157 9 | Epoch 9: loss = 0.1152 10 | Epoch 10: loss = 0.1141 11 | Epoch 11: loss = 0.1140 12 | Epoch 12: loss = 0.1139 13 | Epoch 13: loss = 0.1132 14 | Epoch 14: loss = 0.1137 15 | Epoch 15: loss = 0.1136 16 | Epoch 16: loss = 0.1138 17 | Epoch 17: loss = 0.1130 18 | Epoch 18: loss = 0.1124 19 | Epoch 19: loss = 0.1121 20 | Epoch 20: loss = 0.1123 21 | Epoch 21: loss = 0.1121 22 | Epoch 22: loss = 0.1122 23 | Epoch 23: loss = 0.1126 24 | Epoch 24: loss = 0.1122 25 | Epoch 25: loss = 0.1118 26 | Epoch 26: loss = 0.1118 27 | Epoch 27: loss = 0.1120 28 | Epoch 28: loss = 0.1112 29 | Epoch 29: loss = 0.1106 30 | Epoch 30: loss = 0.1111 31 | Epoch 31: loss = 0.1111 32 | Epoch 32: loss = 0.1107 33 | Epoch 33: loss = 0.1115 34 | Epoch 34: loss = 0.1111 35 | Epoch 35: loss = 0.1118 36 | Epoch 36: loss = 0.1111 37 | Epoch 37: loss = 0.1106 38 | Epoch 38: loss = 0.1108 39 | Epoch 39: loss = 0.1101 40 | Epoch 40: loss = 0.1109 41 | Epoch 41: loss = 0.1110 42 | Epoch 42: loss = 0.1106 43 | Epoch 43: loss = 0.1107 44 | Epoch 44: loss = 0.1104 45 | Epoch 45: loss = 0.1099 46 | Epoch 46: loss = 0.1093 47 | Epoch 47: loss = 0.1105 48 | Epoch 48: loss = 0.1107 49 | Epoch 49: loss = 0.1092 50 | Epoch 50: loss = 0.1100 51 | Epoch 51: loss = 0.1098 52 | Epoch 52: loss = 0.1097 53 | Epoch 53: loss = 0.1103 54 | Epoch 54: loss = 0.1103 55 | Epoch 55: loss = 0.1101 56 | Epoch 56: loss = 0.1090 57 | Epoch 57: loss = 0.1095 58 | Epoch 58: loss = 0.1105 59 | Epoch 59: loss = 0.1098 60 | Epoch 60: loss = 0.1098 61 | Epoch 61: loss = 0.1098 62 | Epoch 62: loss = 0.1095 63 | Epoch 63: loss = 0.1107 64 | Epoch 64: loss = 0.1097 65 | Epoch 65: loss = 0.1088 66 | Epoch 66: loss = 0.1099 67 | Epoch 67: loss = 0.1085 68 | Epoch 68: loss = 0.1091 69 | Epoch 69: loss = 0.1092 70 | Epoch 70: loss = 0.1093 71 | Epoch 71: loss = 0.1094 72 | Epoch 72: loss = 0.1094 73 | Epoch 73: loss = 0.1084 74 | Epoch 74: loss = 0.1090 75 | Epoch 75: loss = 0.1102 76 | Epoch 76: loss = 0.1083 77 | Epoch 77: loss = 0.1085 78 | Epoch 78: loss = 0.1092 79 | Epoch 79: loss = 0.1088 80 | Epoch 80: loss = 0.1083 81 | Epoch 81: loss = 0.1082 82 | Epoch 82: loss = 0.1083 83 | Epoch 83: loss = 0.1089 84 | Epoch 84: loss = 0.1077 85 | Epoch 85: loss = 0.1089 86 | Epoch 86: loss = 0.1087 87 | Epoch 87: loss = 0.1086 88 | Epoch 88: loss = 0.1086 89 | Epoch 89: loss = 0.1089 90 | Epoch 90: loss = 0.1086 91 | Epoch 91: loss = 0.1082 92 | Epoch 92: loss = 0.1090 93 | Epoch 93: loss = 0.1087 94 | Epoch 94: loss = 0.1081 95 | Epoch 95: loss = 0.1082 96 | Epoch 96: loss = 0.1082 97 | Epoch 97: loss = 0.1079 98 | Epoch 98: loss = 0.1079 99 | Epoch 99: loss = 0.1094 100 | Epoch 100: loss = 0.1092 101 | Epoch 101: loss = 0.1084 102 | Epoch 102: loss = 0.1086 103 | Epoch 103: loss = 0.1082 104 | Epoch 104: loss = 0.1081 105 | Epoch 105: loss = 0.1084 106 | Epoch 106: loss = 0.1081 107 | Epoch 107: loss = 0.1086 108 | Epoch 108: loss = 0.1093 109 | Epoch 109: loss = 0.1070 110 | Epoch 110: loss = 0.1081 111 | -------------------------------------------------------------------------------- /checkpts/vc/train_dec_vctk_wodyn.log: 
-------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.1779 2 | Epoch 2: loss = 0.1237 3 | Epoch 3: loss = 0.1198 4 | Epoch 4: loss = 0.1165 5 | Epoch 5: loss = 0.1158 6 | Epoch 6: loss = 0.1162 7 | Epoch 7: loss = 0.1158 8 | Epoch 8: loss = 0.1129 9 | Epoch 9: loss = 0.1115 10 | Epoch 10: loss = 0.1124 11 | Epoch 11: loss = 0.1107 12 | Epoch 12: loss = 0.1116 13 | Epoch 13: loss = 0.1095 14 | Epoch 14: loss = 0.1079 15 | Epoch 15: loss = 0.1108 16 | Epoch 16: loss = 0.1060 17 | Epoch 17: loss = 0.1081 18 | Epoch 18: loss = 0.1066 19 | Epoch 19: loss = 0.1087 20 | Epoch 20: loss = 0.1057 21 | Epoch 21: loss = 0.1062 22 | Epoch 22: loss = 0.1070 23 | Epoch 23: loss = 0.1078 24 | Epoch 24: loss = 0.1064 25 | Epoch 25: loss = 0.1063 26 | Epoch 26: loss = 0.1066 27 | Epoch 27: loss = 0.1068 28 | Epoch 28: loss = 0.1058 29 | Epoch 29: loss = 0.1052 30 | Epoch 30: loss = 0.1057 31 | Epoch 31: loss = 0.1057 32 | Epoch 32: loss = 0.1055 33 | Epoch 33: loss = 0.1046 34 | Epoch 34: loss = 0.1046 35 | Epoch 35: loss = 0.1052 36 | Epoch 36: loss = 0.1046 37 | Epoch 37: loss = 0.1053 38 | Epoch 38: loss = 0.1049 39 | Epoch 39: loss = 0.1034 40 | Epoch 40: loss = 0.1037 41 | Epoch 41: loss = 0.1051 42 | Epoch 42: loss = 0.1039 43 | Epoch 43: loss = 0.1033 44 | Epoch 44: loss = 0.1058 45 | Epoch 45: loss = 0.1039 46 | Epoch 46: loss = 0.1025 47 | Epoch 47: loss = 0.1031 48 | Epoch 48: loss = 0.1037 49 | Epoch 49: loss = 0.1034 50 | Epoch 50: loss = 0.1046 51 | Epoch 51: loss = 0.1037 52 | Epoch 52: loss = 0.1044 53 | Epoch 53: loss = 0.1029 54 | Epoch 54: loss = 0.1022 55 | Epoch 55: loss = 0.1026 56 | Epoch 56: loss = 0.1031 57 | Epoch 57: loss = 0.1031 58 | Epoch 58: loss = 0.1030 59 | Epoch 59: loss = 0.1036 60 | Epoch 60: loss = 0.1025 61 | Epoch 61: loss = 0.1031 62 | Epoch 62: loss = 0.1042 63 | Epoch 63: loss = 0.1038 64 | Epoch 64: loss = 0.1034 65 | Epoch 65: loss = 0.1031 66 | Epoch 66: loss = 0.1023 67 | Epoch 67: loss = 0.1029 68 | Epoch 68: loss = 0.1018 69 | Epoch 69: loss = 0.1007 70 | Epoch 70: loss = 0.1022 71 | Epoch 71: loss = 0.1020 72 | Epoch 72: loss = 0.1026 73 | Epoch 73: loss = 0.1008 74 | Epoch 74: loss = 0.1024 75 | Epoch 75: loss = 0.1012 76 | Epoch 76: loss = 0.1016 77 | Epoch 77: loss = 0.1036 78 | Epoch 78: loss = 0.1018 79 | Epoch 79: loss = 0.1009 80 | Epoch 80: loss = 0.1009 81 | Epoch 81: loss = 0.1011 82 | Epoch 82: loss = 0.1012 83 | Epoch 83: loss = 0.1024 84 | Epoch 84: loss = 0.1025 85 | Epoch 85: loss = 0.1015 86 | Epoch 86: loss = 0.0998 87 | Epoch 87: loss = 0.1011 88 | Epoch 88: loss = 0.1033 89 | Epoch 89: loss = 0.1024 90 | Epoch 90: loss = 0.1032 91 | Epoch 91: loss = 0.1033 92 | Epoch 92: loss = 0.1014 93 | Epoch 93: loss = 0.1008 94 | Epoch 94: loss = 0.1011 95 | Epoch 95: loss = 0.1010 96 | Epoch 96: loss = 0.1001 97 | Epoch 97: loss = 0.1001 98 | Epoch 98: loss = 0.1011 99 | Epoch 99: loss = 0.1024 100 | Epoch 100: loss = 0.1007 101 | Epoch 101: loss = 0.0998 102 | Epoch 102: loss = 0.1010 103 | Epoch 103: loss = 0.1004 104 | Epoch 104: loss = 0.1014 105 | Epoch 105: loss = 0.1002 106 | Epoch 106: loss = 0.1003 107 | Epoch 107: loss = 0.0998 108 | Epoch 108: loss = 0.0996 109 | Epoch 109: loss = 0.0994 110 | Epoch 110: loss = 0.0997 111 | Epoch 111: loss = 0.1007 112 | Epoch 112: loss = 0.0990 113 | Epoch 113: loss = 0.0997 114 | Epoch 114: loss = 0.0994 115 | Epoch 115: loss = 0.1003 116 | Epoch 116: loss = 0.1011 117 | Epoch 117: loss = 0.1009 118 | Epoch 118: loss = 0.0991 119 | Epoch 119: loss = 0.0992 
120 | Epoch 120: loss = 0.0998 121 | Epoch 121: loss = 0.1002 122 | Epoch 122: loss = 0.1007 123 | Epoch 123: loss = 0.1004 124 | Epoch 124: loss = 0.0995 125 | Epoch 125: loss = 0.1004 126 | Epoch 126: loss = 0.0998 127 | Epoch 127: loss = 0.0994 128 | Epoch 128: loss = 0.1007 129 | Epoch 129: loss = 0.0991 130 | Epoch 130: loss = 0.1009 131 | Epoch 131: loss = 0.0994 132 | Epoch 132: loss = 0.0990 133 | Epoch 133: loss = 0.1015 134 | Epoch 134: loss = 0.0986 135 | Epoch 135: loss = 0.1002 136 | Epoch 136: loss = 0.1000 137 | Epoch 137: loss = 0.0996 138 | Epoch 138: loss = 0.0994 139 | Epoch 139: loss = 0.0988 140 | Epoch 140: loss = 0.0996 141 | Epoch 141: loss = 0.0989 142 | Epoch 142: loss = 0.0991 143 | Epoch 143: loss = 0.1002 144 | Epoch 144: loss = 0.0985 145 | Epoch 145: loss = 0.1004 146 | Epoch 146: loss = 0.0998 147 | Epoch 147: loss = 0.0981 148 | Epoch 148: loss = 0.0989 149 | Epoch 149: loss = 0.0997 150 | Epoch 150: loss = 0.0993 151 | Epoch 151: loss = 0.0984 152 | Epoch 152: loss = 0.0993 153 | Epoch 153: loss = 0.0993 154 | Epoch 154: loss = 0.1006 155 | Epoch 155: loss = 0.1009 156 | Epoch 156: loss = 0.0989 157 | Epoch 157: loss = 0.0974 158 | Epoch 158: loss = 0.0978 159 | Epoch 159: loss = 0.0988 160 | Epoch 160: loss = 0.0984 161 | Epoch 161: loss = 0.0985 162 | Epoch 162: loss = 0.1005 163 | Epoch 163: loss = 0.0987 164 | Epoch 164: loss = 0.0992 165 | Epoch 165: loss = 0.0987 166 | Epoch 166: loss = 0.1003 167 | Epoch 167: loss = 0.1000 168 | Epoch 168: loss = 0.0983 169 | Epoch 169: loss = 0.0988 170 | Epoch 170: loss = 0.1004 171 | Epoch 171: loss = 0.0991 172 | Epoch 172: loss = 0.0985 173 | Epoch 173: loss = 0.0999 174 | Epoch 174: loss = 0.1012 175 | Epoch 175: loss = 0.0993 176 | Epoch 176: loss = 0.0980 177 | Epoch 177: loss = 0.0987 178 | Epoch 178: loss = 0.0991 179 | Epoch 179: loss = 0.0987 180 | Epoch 180: loss = 0.0986 181 | Epoch 181: loss = 0.0985 182 | Epoch 182: loss = 0.0968 183 | Epoch 183: loss = 0.0993 184 | Epoch 184: loss = 0.0973 185 | Epoch 185: loss = 0.0981 186 | Epoch 186: loss = 0.0993 187 | Epoch 187: loss = 0.0974 188 | Epoch 188: loss = 0.0989 189 | Epoch 189: loss = 0.0974 190 | Epoch 190: loss = 0.0985 191 | Epoch 191: loss = 0.0989 192 | Epoch 192: loss = 0.0992 193 | Epoch 193: loss = 0.0973 194 | Epoch 194: loss = 0.0980 195 | Epoch 195: loss = 0.0975 196 | Epoch 196: loss = 0.0990 197 | Epoch 197: loss = 0.0969 198 | Epoch 198: loss = 0.0973 199 | Epoch 199: loss = 0.0981 200 | Epoch 200: loss = 0.0978 201 | -------------------------------------------------------------------------------- /checkpts/vc/train_enc_libritts.log: -------------------------------------------------------------------------------- 1 | Epoch 1: loss = 0.5523 2 | Epoch 2: loss = 0.2962 3 | Epoch 3: loss = 0.2634 4 | Epoch 4: loss = 0.2445 5 | Epoch 5: loss = 0.2324 6 | Epoch 6: loss = 0.2246 7 | Epoch 7: loss = 0.2179 8 | Epoch 8: loss = 0.2124 9 | Epoch 9: loss = 0.2083 10 | Epoch 10: loss = 0.2052 11 | Epoch 11: loss = 0.2023 12 | Epoch 12: loss = 0.2001 13 | Epoch 13: loss = 0.1970 14 | Epoch 14: loss = 0.1947 15 | Epoch 15: loss = 0.1933 16 | Epoch 16: loss = 0.1918 17 | Epoch 17: loss = 0.1904 18 | Epoch 18: loss = 0.1890 19 | Epoch 19: loss = 0.1874 20 | Epoch 20: loss = 0.1867 21 | Epoch 21: loss = 0.1859 22 | Epoch 22: loss = 0.1833 23 | Epoch 23: loss = 0.1827 24 | Epoch 24: loss = 0.1822 25 | Epoch 25: loss = 0.1815 26 | Epoch 26: loss = 0.1803 27 | Epoch 27: loss = 0.1795 28 | Epoch 28: loss = 0.1790 29 | Epoch 29: loss = 0.1784 30 | Epoch 30: 
loss = 0.1777 31 | Epoch 31: loss = 0.1771 32 | Epoch 32: loss = 0.1761 33 | Epoch 33: loss = 0.1761 34 | Epoch 34: loss = 0.1748 35 | Epoch 35: loss = 0.1740 36 | Epoch 36: loss = 0.1735 37 | Epoch 37: loss = 0.1730 38 | Epoch 38: loss = 0.1722 39 | Epoch 39: loss = 0.1717 40 | Epoch 40: loss = 0.1715 41 | Epoch 41: loss = 0.1705 42 | Epoch 42: loss = 0.1706 43 | Epoch 43: loss = 0.1700 44 | Epoch 44: loss = 0.1694 45 | Epoch 45: loss = 0.1688 46 | Epoch 46: loss = 0.1686 47 | Epoch 47: loss = 0.1684 48 | Epoch 48: loss = 0.1678 49 | Epoch 49: loss = 0.1670 50 | Epoch 50: loss = 0.1670 51 | Epoch 51: loss = 0.1666 52 | Epoch 52: loss = 0.1666 53 | Epoch 53: loss = 0.1659 54 | Epoch 54: loss = 0.1656 55 | Epoch 55: loss = 0.1651 56 | Epoch 56: loss = 0.1647 57 | Epoch 57: loss = 0.1646 58 | Epoch 58: loss = 0.1639 59 | Epoch 59: loss = 0.1638 60 | Epoch 60: loss = 0.1635 61 | Epoch 61: loss = 0.1629 62 | Epoch 62: loss = 0.1635 63 | Epoch 63: loss = 0.1625 64 | Epoch 64: loss = 0.1622 65 | Epoch 65: loss = 0.1622 66 | Epoch 66: loss = 0.1617 67 | Epoch 67: loss = 0.1614 68 | Epoch 68: loss = 0.1614 69 | Epoch 69: loss = 0.1606 70 | Epoch 70: loss = 0.1607 71 | Epoch 71: loss = 0.1603 72 | Epoch 72: loss = 0.1601 73 | Epoch 73: loss = 0.1600 74 | Epoch 74: loss = 0.1594 75 | Epoch 75: loss = 0.1593 76 | Epoch 76: loss = 0.1594 77 | Epoch 77: loss = 0.1590 78 | Epoch 78: loss = 0.1584 79 | Epoch 79: loss = 0.1582 80 | Epoch 80: loss = 0.1581 81 | Epoch 81: loss = 0.1578 82 | Epoch 82: loss = 0.1581 83 | Epoch 83: loss = 0.1578 84 | Epoch 84: loss = 0.1571 85 | Epoch 85: loss = 0.1571 86 | Epoch 86: loss = 0.1572 87 | Epoch 87: loss = 0.1566 88 | Epoch 88: loss = 0.1562 89 | Epoch 89: loss = 0.1566 90 | Epoch 90: loss = 0.1556 91 | Epoch 91: loss = 0.1553 92 | Epoch 92: loss = 0.1559 93 | Epoch 93: loss = 0.1562 94 | Epoch 94: loss = 0.1556 95 | Epoch 95: loss = 0.1553 96 | Epoch 96: loss = 0.1553 97 | Epoch 97: loss = 0.1548 98 | Epoch 98: loss = 0.1544 99 | Epoch 99: loss = 0.1544 100 | Epoch 100: loss = 0.1545 101 | Epoch 101: loss = 0.1538 102 | Epoch 102: loss = 0.1538 103 | Epoch 103: loss = 0.1538 104 | Epoch 104: loss = 0.1538 105 | Epoch 105: loss = 0.1533 106 | Epoch 106: loss = 0.1535 107 | Epoch 107: loss = 0.1528 108 | Epoch 108: loss = 0.1529 109 | Epoch 109: loss = 0.1528 110 | Epoch 110: loss = 0.1523 111 | Epoch 111: loss = 0.1526 112 | Epoch 112: loss = 0.1522 113 | Epoch 113: loss = 0.1518 114 | Epoch 114: loss = 0.1518 115 | Epoch 115: loss = 0.1522 116 | Epoch 116: loss = 0.1514 117 | Epoch 117: loss = 0.1510 118 | Epoch 118: loss = 0.1517 119 | Epoch 119: loss = 0.1519 120 | Epoch 120: loss = 0.1508 121 | Epoch 121: loss = 0.1508 122 | Epoch 122: loss = 0.1515 123 | Epoch 123: loss = 0.1508 124 | Epoch 124: loss = 0.1505 125 | Epoch 125: loss = 0.1507 126 | Epoch 126: loss = 0.1508 127 | Epoch 127: loss = 0.1497 128 | Epoch 128: loss = 0.1497 129 | Epoch 129: loss = 0.1497 130 | Epoch 130: loss = 0.1498 131 | Epoch 131: loss = 0.1498 132 | Epoch 132: loss = 0.1493 133 | Epoch 133: loss = 0.1498 134 | Epoch 134: loss = 0.1488 135 | Epoch 135: loss = 0.1490 136 | Epoch 136: loss = 0.1493 137 | Epoch 137: loss = 0.1488 138 | Epoch 138: loss = 0.1485 139 | Epoch 139: loss = 0.1486 140 | Epoch 140: loss = 0.1486 141 | Epoch 141: loss = 0.1481 142 | Epoch 142: loss = 0.1483 143 | Epoch 143: loss = 0.1475 144 | Epoch 144: loss = 0.1483 145 | Epoch 145: loss = 0.1483 146 | Epoch 146: loss = 0.1476 147 | Epoch 147: loss = 0.1477 148 | Epoch 148: loss = 0.1475 149 | Epoch 149: 
loss = 0.1473 150 | Epoch 150: loss = 0.1474 151 | Epoch 151: loss = 0.1469 152 | Epoch 152: loss = 0.1473 153 | Epoch 153: loss = 0.1472 154 | Epoch 154: loss = 0.1465 155 | Epoch 155: loss = 0.1467 156 | Epoch 156: loss = 0.1469 157 | Epoch 157: loss = 0.1466 158 | Epoch 158: loss = 0.1468 159 | Epoch 159: loss = 0.1459 160 | Epoch 160: loss = 0.1463 161 | Epoch 161: loss = 0.1461 162 | Epoch 162: loss = 0.1459 163 | Epoch 163: loss = 0.1461 164 | Epoch 164: loss = 0.1455 165 | Epoch 165: loss = 0.1458 166 | Epoch 166: loss = 0.1457 167 | Epoch 167: loss = 0.1455 168 | Epoch 168: loss = 0.1457 169 | Epoch 169: loss = 0.1452 170 | Epoch 170: loss = 0.1457 171 | Epoch 171: loss = 0.1451 172 | Epoch 172: loss = 0.1448 173 | Epoch 173: loss = 0.1445 174 | Epoch 174: loss = 0.1451 175 | Epoch 175: loss = 0.1451 176 | Epoch 176: loss = 0.1451 177 | Epoch 177: loss = 0.1446 178 | Epoch 178: loss = 0.1442 179 | Epoch 179: loss = 0.1452 180 | Epoch 180: loss = 0.1447 181 | Epoch 181: loss = 0.1445 182 | Epoch 182: loss = 0.1444 183 | Epoch 183: loss = 0.1440 184 | Epoch 184: loss = 0.1446 185 | Epoch 185: loss = 0.1442 186 | Epoch 186: loss = 0.1442 187 | Epoch 187: loss = 0.1441 188 | Epoch 188: loss = 0.1438 189 | Epoch 189: loss = 0.1441 190 | Epoch 190: loss = 0.1433 191 | Epoch 191: loss = 0.1436 192 | Epoch 192: loss = 0.1435 193 | Epoch 193: loss = 0.1431 194 | Epoch 194: loss = 0.1431 195 | Epoch 195: loss = 0.1431 196 | Epoch 196: loss = 0.1432 197 | Epoch 197: loss = 0.1434 198 | Epoch 198: loss = 0.1427 199 | Epoch 199: loss = 0.1429 200 | Epoch 200: loss = 0.1428 201 | Epoch 201: loss = 0.1425 202 | Epoch 202: loss = 0.1420 203 | Epoch 203: loss = 0.1431 204 | Epoch 204: loss = 0.1424 205 | Epoch 205: loss = 0.1422 206 | Epoch 206: loss = 0.1425 207 | Epoch 207: loss = 0.1426 208 | Epoch 208: loss = 0.1425 209 | Epoch 209: loss = 0.1419 210 | Epoch 210: loss = 0.1422 211 | Epoch 211: loss = 0.1420 212 | Epoch 212: loss = 0.1419 213 | Epoch 213: loss = 0.1418 214 | Epoch 214: loss = 0.1416 215 | Epoch 215: loss = 0.1415 216 | Epoch 216: loss = 0.1418 217 | Epoch 217: loss = 0.1414 218 | Epoch 218: loss = 0.1417 219 | Epoch 219: loss = 0.1418 220 | Epoch 220: loss = 0.1418 221 | Epoch 221: loss = 0.1414 222 | Epoch 222: loss = 0.1414 223 | Epoch 223: loss = 0.1414 224 | Epoch 224: loss = 0.1410 225 | Epoch 225: loss = 0.1410 226 | Epoch 226: loss = 0.1408 227 | Epoch 227: loss = 0.1409 228 | Epoch 228: loss = 0.1406 229 | Epoch 229: loss = 0.1409 230 | Epoch 230: loss = 0.1407 231 | Epoch 231: loss = 0.1406 232 | Epoch 232: loss = 0.1407 233 | Epoch 233: loss = 0.1412 234 | Epoch 234: loss = 0.1405 235 | Epoch 235: loss = 0.1398 236 | Epoch 236: loss = 0.1402 237 | Epoch 237: loss = 0.1405 238 | Epoch 238: loss = 0.1401 239 | Epoch 239: loss = 0.1401 240 | Epoch 240: loss = 0.1401 241 | Epoch 241: loss = 0.1402 242 | Epoch 242: loss = 0.1398 243 | Epoch 243: loss = 0.1400 244 | Epoch 244: loss = 0.1399 245 | Epoch 245: loss = 0.1395 246 | Epoch 246: loss = 0.1398 247 | Epoch 247: loss = 0.1391 248 | Epoch 248: loss = 0.1397 249 | Epoch 249: loss = 0.1391 250 | Epoch 250: loss = 0.1398 251 | Epoch 251: loss = 0.1394 252 | Epoch 252: loss = 0.1394 253 | Epoch 253: loss = 0.1400 254 | Epoch 254: loss = 0.1395 255 | Epoch 255: loss = 0.1396 256 | Epoch 256: loss = 0.1388 257 | Epoch 257: loss = 0.1391 258 | Epoch 258: loss = 0.1390 259 | Epoch 259: loss = 0.1392 260 | Epoch 260: loss = 0.1391 261 | Epoch 261: loss = 0.1390 262 | Epoch 262: loss = 0.1385 263 | Epoch 263: loss = 0.1383 264 | 
Epoch 264: loss = 0.1395 265 | Epoch 265: loss = 0.1386 266 | Epoch 266: loss = 0.1382 267 | Epoch 267: loss = 0.1387 268 | Epoch 268: loss = 0.1382 269 | Epoch 269: loss = 0.1384 270 | Epoch 270: loss = 0.1385 271 | Epoch 271: loss = 0.1382 272 | Epoch 272: loss = 0.1385 273 | Epoch 273: loss = 0.1380 274 | Epoch 274: loss = 0.1381 275 | Epoch 275: loss = 0.1385 276 | Epoch 276: loss = 0.1384 277 | Epoch 277: loss = 0.1381 278 | Epoch 278: loss = 0.1380 279 | Epoch 279: loss = 0.1382 280 | Epoch 280: loss = 0.1384 281 | Epoch 281: loss = 0.1376 282 | Epoch 282: loss = 0.1379 283 | Epoch 283: loss = 0.1379 284 | Epoch 284: loss = 0.1378 285 | Epoch 285: loss = 0.1379 286 | Epoch 286: loss = 0.1376 287 | Epoch 287: loss = 0.1373 288 | Epoch 288: loss = 0.1374 289 | Epoch 289: loss = 0.1375 290 | Epoch 290: loss = 0.1372 291 | Epoch 291: loss = 0.1378 292 | Epoch 292: loss = 0.1373 293 | Epoch 293: loss = 0.1375 294 | Epoch 294: loss = 0.1373 295 | Epoch 295: loss = 0.1375 296 | Epoch 296: loss = 0.1372 297 | Epoch 297: loss = 0.1372 298 | Epoch 298: loss = 0.1370 299 | Epoch 299: loss = 0.1367 300 | Epoch 300: loss = 0.1368 301 | -------------------------------------------------------------------------------- /checkpts/vocoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /checkpts/vocoder/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8192, 18 | "num_mels": 80, 19 | "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 256, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 22050, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /deploy/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/deploy/Dockerfile -------------------------------------------------------------------------------- /deploy/model_repository/vc_pipeline_python/1/model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import torch 4 | import triton_python_backend_utils as pb_utils 5 | 6 | class TritonPythonModel: 7 | 8 | def initialize(self, args): 9 | 10 | self.model_config = model_config = json.loads(args['model_config']) 11 | 12 | def execute(self, requests): 13 | responses = [] 14 | for request in requests: 15 | inp = pb_utils.get_input_tensor_by_name(request, "prompt") 16 | -------------------------------------------------------------------------------- /deploy/model_repository/vc_pipeline_python/1/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/deploy/model_repository/vc_pipeline_python/1/pipeline/__init__.py -------------------------------------------------------------------------------- /deploy/model_repository/vc_pipeline_python/config.pbtxt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/deploy/model_repository/vc_pipeline_python/config.pbtxt -------------------------------------------------------------------------------- /deploy/model_repository/vc_spk_encoder/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "vc_spk_encoder" 2 | platform: "onnxruntime_onnx" 3 | max_batch_size : 8 4 | version_policy: { 5 | specific: { 6 | versions: [1] 7 | } 8 | } 9 | input [ 10 | { 11 | name: "frame_input" 12 | data_type: TYPE_FP16 13 | dims: [-1, 160, 40] 14 | } 15 | ] 16 | output [ 17 | { 18 | name: "embed_output" 19 | data_type: TYPE_FP16 20 | dims: [-1, 256] 21 | } 22 | ] -------------------------------------------------------------------------------- /deploy/model_repository/vc_vocoder/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "vc_vocoder" 2 | platform: "onnxruntime_onnx" 3 | 
max_batch_size : 8 4 | version_policy: { 5 | specific: { 6 | versions: [1] 7 | } 8 | } 9 | input [ 10 | { 11 | name: "mel_input" 12 | data_type: TYPE_FP16 13 | dims: [-1, 80, -1] 14 | } 15 | ] 16 | output [ 17 | { 18 | name: "audio_output" 19 | data_type: TYPE_FP16 20 | dims: [-1,1,-1] 21 | } 22 | ] -------------------------------------------------------------------------------- /example/6415_111615_000012_000005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/example/6415_111615_000012_000005.wav -------------------------------------------------------------------------------- /example/8534_216567_000015_000010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/example/8534_216567_000015_000010.wav -------------------------------------------------------------------------------- /export_onnx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/export_onnx/__init__.py -------------------------------------------------------------------------------- /export_onnx/export_hifigan.py: -------------------------------------------------------------------------------- 1 | # vocoder 2 | 3 | import argparse 4 | import json 5 | import os 6 | import numpy as np 7 | import IPython.display as ipd 8 | from tqdm import tqdm 9 | from scipy.io.wavfile import write 10 | 11 | import torch 12 | use_gpu = torch.cuda.is_available() 13 | 14 | import librosa 15 | from librosa.core import load 16 | from librosa.filters import mel as librosa_mel_fn 17 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 18 | 19 | import params 20 | from model import DiffVC 21 | 22 | import sys 23 | sys.path.append('hifi-gan/') 24 | from env import AttrDict 25 | from models import Generator as HiFiGAN 26 | 27 | sys.path.append('speaker_encoder/') 28 | from encoder import inference as spk_encoder 29 | from pathlib import Path 30 | 31 | os.environ["CUDA_VISIBLE_DEVICES"]= "1" 32 | 33 | 34 | # loading HiFi-GAN vocoder 35 | hfg_path = 'checkpts/vocoder/' # HiFi-GAN path 36 | 37 | with open(hfg_path + 'config.json') as f: 38 | h = AttrDict(json.load(f)) 39 | 40 | if use_gpu: 41 | hifigan_universal = HiFiGAN(h).cuda() 42 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator')['generator']) 43 | else: 44 | hifigan_universal = HiFiGAN(h) 45 | hifigan_universal.load_state_dict(torch.load(hfg_path + 'generator', map_location='cpu')['generator']) 46 | 47 | _ = hifigan_universal.eval() 48 | hifigan_universal.remove_weight_norm() 49 | 50 | 51 | 52 | 53 | def convert_torch_to_onnx_batch(model, output_path, dummy_input, device=None): 54 | 55 | input_names = ["mel_input"] 56 | output_names = ["audio_output"] 57 | 58 | if device!=None: 59 | model = model.to(device) 60 | dummy_input = dummy_input.to(device) 61 | 62 | torch.onnx.export(model, 63 | dummy_input, 64 | output_path, 65 | verbose=True, 66 | input_names=input_names, 67 | output_names=output_names, 68 | dynamic_axes={'mel_input' : {0: 'batch_size', 2 : 'mel_leghths'}, # variable length axes 69 | 'audio_output' : {0:'batch_size', 2 : 'audio_lenghts'}}) 70 | 71 | device = torch.device('cuda') 72 | output_path = "hifigan.onnx" 73 | # dummy_input = mel_source 74 | dummy_input = 
torch.rand(2,80,200) 75 | dummy_output = torch.rand(2,1,124321) 76 | convert_torch_to_onnx_batch(hifigan_universal, output_path, dummy_input, device=device) 77 | 78 | print(device) -------------------------------------------------------------------------------- /export_onnx/export_spk_enc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import numpy as np 5 | import IPython.display as ipd 6 | from tqdm import tqdm 7 | from scipy.io.wavfile import write 8 | 9 | import torch 10 | use_gpu = torch.cuda.is_available() 11 | 12 | import librosa 13 | from librosa.core import load 14 | from librosa.filters import mel as librosa_mel_fn 15 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 16 | 17 | import params 18 | from model import DiffVC 19 | 20 | import sys 21 | # sys.path.append('hifi-gan/') 22 | # from env import AttrDict 23 | # from models import Generator as HiFiGAN 24 | 25 | sys.path.append('speaker_encoder/') 26 | from encoder import inference as spk_encoder 27 | from pathlib import Path 28 | 29 | os.environ["CUDA_VISIBLE_DEVICES"]= "1" 30 | 31 | from encoder.model import SpeakerEncoder 32 | 33 | 34 | weights_fpath = Path('checkpts/spk_encoder/pretrained.pt') 35 | _device = torch.device('cuda') 36 | 37 | _model = SpeakerEncoder(_device, torch.device("cpu")) 38 | checkpoint = torch.load(weights_fpath, map_location="cuda") 39 | _model.load_state_dict(checkpoint["model_state"]) 40 | _model.eval() 41 | 42 | def convert_torch_to_onnx_batch(model, output_path, dummy_input, device=None): 43 | 44 | input_names = ["frame_input"] 45 | output_names = ["embed_output"] 46 | 47 | if device!=None: 48 | model = model.to(device) 49 | dummy_input = dummy_input.to(device) 50 | 51 | torch.onnx.export(model, 52 | dummy_input, 53 | output_path, 54 | verbose=True, 55 | input_names=input_names, 56 | output_names=output_names, 57 | dynamic_axes={'frame_input' : {0: 'batch_size'}, # variable length axes 58 | 'embed_output' : {0:'batch_size'}}) 59 | print("hihi") 60 | device = torch.device('cuda') 61 | output_path = "spk_enc.onnx" 62 | # dummy_input = mel_source 63 | dummy_input = torch.rand(2, 10, 160, 40) 64 | dummpy_ouput = torch.rand(10,256) 65 | 66 | convert_torch_to_onnx_batch(_model, output_path, dummy_input, device=device) 67 | 68 | # print(device) 69 | 70 | 71 | -------------------------------------------------------------------------------- /export_onnx/onnx_check.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | 3 | # load model from onnx 4 | 5 | model = onnx.load('./path/to/onnx.onnx') 6 | 7 | # confirm model has valid schema 8 | onnx.checker.check_model(model) 9 | 10 | 11 | # Print a human readable representation of the graph 12 | onnx.helper.printable_graph(model.graph) -------------------------------------------------------------------------------- /filelists/exceptions_libritts.txt: -------------------------------------------------------------------------------- 1 | 1027_125147_000139_000000 2 | 7739_8592_000126_000000 3 | 1535_141642_000009_000000 4 | 1974_139741_000015_000001 5 | 8855_283242_000010_000000 6 | 7120_118112_000003_000000 7 | 5583_41919_000016_000001 8 | 1509_145742_000007_000000 9 | 1265_135635_000018_000000 10 | 176_122025_000000_000001 11 | 5655_46267_000030_000000 12 | 2012_139358_000006_000000 13 | 207_143321_000019_000000 14 | 176_123269_000011_000000 15 | 8699_291107_000027_000000 16 | 1638_84447_000036_000000 17 | 
1050_134119_000035_000002 18 | 4108_2777_000010_000000 19 | 6233_61741_000020_000000 20 | 4088_158079_000094_000000 21 | 1731_142320_000096_000000 22 | 2401_144485_000071_000001 23 | 1553_140048_000009_000000 24 | 3240_131232_000080_000000 25 | 7402_59171_000003_000000 26 | 8396_120280_000033_000001 27 | 8238_283452_000006_000000 28 | 166_352_000004_000000 29 | 589_146346_000020_000005 30 | 3513_163606_000046_000001 31 | 500_125123_000112_000000 32 | 4586_96498_000028_000000 33 | 7538_100045_000021_000001 34 | 1958_144503_000020_000001 35 | 3857_182315_000006_000000 36 | 176_122025_000017_000001 37 | 8063_274112_000077_000000 38 | 176_122025_000009_000001 39 | 6458_232057_000041_000001 40 | 7495_102612_000071_000000 41 | 4807_26852_000071_000000 42 | 78_369_000023_000000 43 | 14_212_000011_000009 44 | 4363_11049_000058_000000 45 | 3224_167024_000041_000000 46 | 16_122828_000022_000000 47 | 207_143321_000019_000001 48 | 1271_136861_000021_000000 49 | 339_132718_000019_000002 50 | 9023_296467_000008_000000 51 | 5660_101884_000031_000000 52 | 1845_145083_000010_000002 53 | 2060_150855_000024_000000 54 | 2045_158081_000020_000001 55 | 4088_158079_000088_000000 56 | 922_132300_000030_000001 57 | 5333_5083_000012_000011 58 | 6385_34655_000022_000000 59 | 340_124368_000004_000000 60 | 4044_9010_000022_000000 61 | 2204_131732_000017_000017 62 | 64_76974_000089_000000 63 | 2436_2476_000048_000000 64 | 4806_26894_000004_000000 65 | 28_12332_000061_000000 66 | 2531_156724_000012_000000 67 | 4957_30119_000014_000000 68 | 1182_134981_000027_000000 69 | 3540_163612_000169_000000 70 | 6104_58845_000020_000000 71 | 60_121082_000029_000000 72 | 4363_11049_000177_000000 73 | 1958_144503_000027_000000 74 | 7278_246956_000020_000000 75 | 2401_144485_000068_000001 76 | 1264_129805_000026_000000 77 | 6098_57836_000021_000000 78 | 2517_135227_000015_000004 79 | 4680_16026_000096_000000 80 | 4116_3582_000039_000001 81 | 7511_102420_000005_000001 82 | 4297_13009_000042_000000 83 | 307_127535_000033_000003 84 | 1841_179183_000009_000000 85 | 8770_295462_000051_000000 86 | 3513_7741_000060_000001 87 | 7800_283492_000025_000000 88 | 8479_276730_000026_000000 89 | 3728_105386_000010_000001 90 | 4800_73729_000026_000004 91 | 4427_20023_000004_000007 92 | 8063_274112_000030_000000 93 | 7145_87280_000100_000004 94 | 4243_187023_000016_000000 95 | 6643_67857_000002_000000 96 | 1885_136863_000025_000000 97 | 7120_118112_000035_000000 98 | 3513_163607_000044_000002 99 | 1553_140048_000001_000000 100 | 2053_138901_000004_000004 101 | 510_130101_000054_000000 102 | 899_126233_000033_000000 103 | 1265_135636_000067_000001 104 | 6981_70843_000127_000000 105 | 6104_58843_000033_000000 106 | 7078_271888_000015_000000 107 | 335_125945_000035_000001 108 | 1265_135635_000052_000000 109 | 5339_14134_000072_000000 110 | 7939_120318_000016_000000 111 | 6032_58192_000008_000000 112 | 1743_142912_000015_000001 113 | 839_130898_000020_000000 114 | 1121_176698_000015_000000 115 | 501_125128_000068_000000 116 | 7783_107486_000060_000001 117 | 806_124221_000037_000000 118 | 78_369_000043_000006 119 | 1731_142320_000053_000000 120 | 4837_285896_000040_000000 121 | 5622_44586_000017_000000 122 | 7991_102381_000019_000000 123 | 14_208_000021_000002 124 | 157_121907_000017_000000 125 | 4108_2777_000059_000000 126 | 14_212_000011_000004 127 | 3983_5331_000002_000000 128 | 497_125118_000079_000000 129 | 8699_291107_000302_000000 130 | 512_124520_000071_000008 131 | 7120_118112_000019_000000 132 | 78_369_000030_000009 133 | 954_130627_000038_000001 
134 | 6574_70756_000008_000007 135 | 4381_14897_000005_000006 136 | 6006_60489_000033_000005 137 | 4267_72637_000007_000000 138 | 5731_50776_000030_000001 139 | 2053_138901_000037_000001 140 | 2592_5341_000039_000000 141 | 1283_136983_000009_000000 142 | 1731_142320_000127_000000 143 | 1265_135635_000003_000000 144 | 1731_142320_000059_000000 145 | 6895_96175_000051_000000 146 | 1027_125147_000080_000000 147 | 8479_276730_000042_000000 148 | 1335_163935_000018_000001 149 | 1731_142320_000114_000000 150 | 374_180298_000028_000001 151 | 4088_158079_000154_000000 152 | 8875_293959_000083_000000 153 | 5876_8675_000009_000000 154 | 7665_104979_000053_000000 155 | 5968_55202_000071_000000 156 | 500_125123_000032_000000 157 | 1705_142318_000022_000000 158 | 4586_96498_000035_000001 159 | 6104_58845_000023_000000 160 | 7739_8592_000024_000000 161 | 2473_157859_000047_000004 162 | 249_121331_000003_000000 163 | 2012_139358_000012_000000 164 | 6104_58843_000080_000000 165 | 454_134728_000083_000000 166 | 6904_262305_000001_000000 167 | 1974_139741_000048_000001 168 | 549_126410_000049_000001 169 | 1603_139325_000039_000000 170 | 8770_295465_000020_000000 171 | 816_53638_000055_000000 172 | 6701_71404_000089_000000 173 | 78_369_000035_000003 174 | 4495_18533_000041_000000 175 | 2436_2477_000061_000001 176 | 118_47824_000109_000000 177 | 8479_276730_000034_000000 178 | 298_126791_000064_000000 179 | 8176_115047_000053_000004 180 | 7511_102419_000004_000001 181 | 1027_125140_000073_000000 182 | 5583_41259_000007_000005 183 | 8465_246947_000028_000000 184 | 4535_279856_000055_000000 185 | 6880_216547_000039_000000 186 | 2045_158081_000012_000000 187 | 1958_144503_000083_000000 188 | 1974_139742_000065_000000 189 | 576_129623_000056_000005 190 | 5519_39481_000017_000000 191 | 1027_125147_000045_000001 192 | 5304_55856_000010_000000 193 | 205_159056_000013_000000 194 | 337_123025_000026_000003 195 | 2368_157056_000070_000000 196 | 806_124221_000045_000000 197 | 2092_145709_000002_000001 198 | 14_212_000018_000001 199 | 1974_139742_000069_000000 200 | 1731_142320_000069_000000 201 | 211_122442_000144_000000 202 | 7945_112011_000069_000000 203 | 7000_83706_000006_000003 204 | 78_369_000065_000003 205 | 8190_284435_000073_000000 206 | 806_124221_000040_000000 207 | 1271_136861_000062_000000 208 | 2401_144485_000092_000000 -------------------------------------------------------------------------------- /filelists/exceptions_vctk.txt: -------------------------------------------------------------------------------- 1 | p234_280_mic2 2 | p234_122_mic2 3 | p234_010_mic2 4 | p234_097_mic2 5 | p234_304_mic2 6 | p234_124_mic2 7 | p234_075_mic2 8 | p234_318_mic2 9 | p234_125_mic2 10 | p234_355_mic2 11 | p234_157_mic2 12 | p234_089_mic2 13 | p234_062_mic2 14 | p234_317_mic2 15 | p234_279_mic2 16 | p234_094_mic2 17 | p234_199_mic2 18 | p234_272_mic2 19 | p234_054_mic2 20 | p234_083_mic2 21 | p234_336_mic2 22 | p234_030_mic2 23 | p234_091_mic2 24 | p234_055_mic2 25 | p234_191_mic2 26 | p234_258_mic2 27 | p234_038_mic2 28 | p234_035_mic2 29 | p234_346_mic2 30 | p234_222_mic2 31 | p234_200_mic2 32 | p234_173_mic2 33 | p234_262_mic2 34 | p234_334_mic2 35 | p234_253_mic2 36 | p234_241_mic2 37 | p234_139_mic2 38 | p234_316_mic2 39 | p234_099_mic2 40 | p234_207_mic2 41 | p234_325_mic2 42 | p234_093_mic2 43 | p234_118_mic2 44 | p234_194_mic2 45 | p234_006_mic2 46 | p234_155_mic2 47 | p234_259_mic2 48 | p234_081_mic2 49 | p234_063_mic2 50 | p234_046_mic2 51 | p234_177_mic2 52 | p234_024_mic2 53 | p234_213_mic2 54 | 
p234_333_mic2 55 | p234_189_mic2 56 | p234_236_mic2 57 | p234_135_mic2 58 | p234_228_mic2 59 | p234_005_mic2 60 | p234_108_mic2 61 | p234_257_mic2 62 | p234_100_mic2 63 | p234_179_mic2 64 | p234_309_mic2 65 | p234_165_mic2 66 | p234_040_mic2 67 | p234_074_mic2 68 | p234_181_mic2 69 | p234_242_mic2 70 | p234_170_mic2 71 | p234_327_mic2 72 | p234_013_mic2 73 | p234_132_mic2 74 | p234_204_mic2 75 | p234_342_mic2 76 | p234_056_mic2 77 | p234_111_mic2 78 | p234_095_mic2 79 | p234_031_mic2 80 | p234_275_mic2 81 | p234_137_mic2 82 | p234_130_mic2 83 | p234_245_mic2 84 | p234_290_mic2 85 | p234_129_mic2 86 | p234_288_mic2 87 | p234_221_mic2 88 | p234_019_mic2 89 | p234_043_mic2 90 | p234_077_mic2 91 | p234_050_mic2 92 | p234_350_mic2 93 | p234_167_mic2 94 | p234_273_mic2 95 | p234_294_mic2 96 | p234_187_mic2 97 | p234_156_mic2 98 | p234_266_mic2 99 | p234_254_mic2 100 | p234_227_mic2 101 | p360_262_mic2 102 | p234_303_mic2 103 | p234_295_mic2 104 | p234_032_mic2 105 | p234_025_mic2 106 | p234_003_mic2 107 | p234_328_mic2 108 | p234_291_mic2 109 | p234_016_mic2 110 | p234_322_mic2 111 | p234_248_mic2 112 | p234_102_mic2 113 | p234_356_mic2 114 | p234_087_mic2 115 | p234_012_mic2 116 | p234_270_mic2 117 | p234_104_mic2 118 | p234_073_mic2 119 | p234_209_mic2 120 | p234_026_mic2 121 | p234_205_mic2 122 | p234_017_mic2 123 | p234_343_mic2 124 | p234_086_mic2 125 | p234_212_mic2 126 | p234_027_mic2 127 | p234_018_mic2 128 | p234_105_mic2 129 | p234_249_mic2 130 | p234_311_mic2 131 | p234_041_mic2 132 | p234_326_mic2 133 | p234_123_mic2 134 | p234_329_mic2 135 | p234_299_mic2 136 | p234_296_mic2 137 | p234_171_mic2 138 | p234_263_mic2 139 | p234_216_mic2 140 | p234_321_mic2 141 | p234_090_mic2 142 | p234_069_mic2 143 | p234_282_mic2 144 | p234_117_mic2 145 | p234_286_mic2 146 | p234_233_mic2 147 | p234_214_mic2 148 | p234_047_mic2 149 | p234_022_mic2 150 | p234_106_mic2 151 | p234_239_mic2 152 | p234_219_mic2 153 | p234_133_mic2 154 | p234_353_mic2 155 | p234_052_mic2 156 | p234_277_mic2 157 | p234_208_mic2 158 | p234_033_mic2 159 | p234_186_mic2 160 | p234_256_mic2 161 | p234_064_mic2 162 | p234_140_mic2 163 | p234_354_mic2 164 | p234_182_mic2 165 | p234_240_mic2 166 | p234_298_mic2 167 | p234_127_mic2 168 | p234_071_mic2 169 | p234_034_mic2 170 | p234_324_mic2 171 | p234_175_mic2 172 | p234_308_mic2 173 | p234_159_mic2 174 | p234_152_mic2 175 | p234_183_mic2 176 | p234_079_mic2 177 | p234_053_mic2 178 | p234_112_mic2 179 | p234_072_mic2 180 | p234_176_mic2 181 | p234_323_mic2 182 | p234_285_mic2 183 | p234_314_mic2 184 | p234_349_mic2 185 | p234_115_mic2 186 | p234_061_mic2 187 | p234_174_mic2 188 | p234_060_mic2 189 | p234_110_mic2 190 | p234_224_mic2 191 | p234_229_mic2 192 | p234_261_mic2 193 | p234_250_mic2 194 | p234_188_mic2 195 | p234_310_mic2 196 | p234_276_mic2 197 | p234_202_mic2 198 | p234_265_mic2 199 | p234_169_mic2 200 | p234_339_mic2 201 | p234_193_mic2 202 | p234_168_mic2 203 | p234_274_mic2 204 | p234_082_mic2 205 | p234_029_mic2 206 | p234_210_mic2 207 | p234_068_mic2 208 | p234_107_mic2 209 | p234_340_mic2 210 | p234_301_mic2 211 | p234_103_mic2 212 | p234_048_mic2 213 | p234_058_mic2 214 | p234_185_mic2 215 | p234_120_mic2 216 | p234_218_mic2 217 | p234_001_mic2 218 | p234_237_mic2 219 | p234_154_mic2 220 | p234_161_mic2 221 | p234_109_mic2 222 | p234_143_mic2 223 | p234_085_mic2 224 | p234_180_mic2 225 | p234_057_mic2 226 | p234_009_mic2 227 | p234_198_mic2 228 | p234_313_mic2 229 | p234_195_mic2 230 | p234_348_mic2 231 | p234_306_mic2 232 | p234_337_mic2 233 | p234_178_mic2 234 | 
p234_243_mic2 235 | p234_044_mic2 236 | p234_347_mic2 237 | p234_359_mic2 238 | p234_126_mic2 239 | p234_002_mic2 240 | p234_023_mic2 241 | p234_246_mic2 242 | p234_039_mic2 243 | p234_092_mic2 244 | p234_096_mic2 245 | p234_315_mic2 246 | p234_147_mic2 247 | p234_004_mic2 248 | p234_358_mic2 249 | p234_160_mic2 250 | p234_217_mic2 251 | p234_164_mic2 252 | p234_149_mic2 253 | p234_289_mic2 254 | p234_252_mic2 255 | p234_020_mic2 256 | p234_021_mic2 257 | p234_172_mic2 258 | p234_244_mic2 259 | p234_113_mic2 260 | p234_264_mic2 261 | p234_153_mic2 262 | p234_220_mic2 263 | p234_247_mic2 264 | p234_360_mic2 265 | p234_101_mic2 266 | p234_338_mic2 267 | p234_225_mic2 268 | p234_284_mic2 269 | p234_302_mic2 270 | p234_260_mic2 271 | p234_145_mic2 272 | p234_144_mic2 273 | p234_190_mic2 274 | p234_235_mic2 275 | p234_320_mic2 276 | p234_098_mic2 277 | p234_138_mic2 278 | p234_226_mic2 279 | p234_345_mic2 280 | p234_197_mic2 281 | p234_331_mic2 282 | p234_271_mic2 283 | p234_230_mic2 284 | p234_119_mic2 285 | p234_335_mic2 286 | p234_344_mic2 287 | p234_341_mic2 288 | p234_148_mic2 289 | p234_059_mic2 290 | p234_307_mic2 291 | p323_011_mic2 292 | p234_114_mic2 293 | p234_319_mic2 294 | p234_116_mic2 295 | p234_008_mic2 296 | p234_166_mic2 297 | p234_361_mic2 298 | p234_231_mic2 299 | p234_076_mic2 300 | p234_015_mic2 301 | p234_070_mic2 302 | p234_158_mic2 303 | p234_131_mic2 304 | p234_088_mic2 305 | p234_142_mic2 306 | p234_080_mic2 307 | p234_121_mic2 308 | p234_192_mic2 309 | p234_312_mic2 310 | p234_234_mic2 311 | p234_281_mic2 312 | p234_162_mic2 313 | p234_268_mic2 314 | p234_352_mic2 315 | p234_028_mic2 316 | p234_049_mic2 317 | p234_293_mic2 318 | p234_151_mic2 319 | p234_196_mic2 320 | p234_037_mic2 321 | p234_042_mic2 322 | p234_201_mic2 323 | p234_332_mic2 324 | p234_067_mic2 325 | p234_292_mic2 326 | p234_146_mic2 327 | p234_223_mic2 328 | p234_287_mic2 329 | p234_141_mic2 330 | p234_203_mic2 331 | p234_211_mic2 332 | p234_136_mic2 333 | p234_036_mic2 334 | p234_150_mic2 335 | p234_255_mic2 336 | p234_134_mic2 337 | p234_128_mic2 338 | p234_238_mic2 339 | p234_014_mic2 340 | p234_297_mic2 341 | p234_278_mic2 342 | p234_184_mic2 343 | p234_267_mic2 344 | p234_330_mic2 345 | p234_251_mic2 346 | p234_066_mic2 347 | p234_351_mic2 348 | p234_084_mic2 349 | p234_051_mic2 350 | p234_300_mic2 351 | p234_232_mic2 352 | p234_045_mic2 353 | p234_283_mic2 354 | p234_305_mic2 355 | p234_065_mic2 356 | p234_007_mic2 357 | p234_357_mic2 358 | p234_269_mic2 359 | p234_163_mic2 -------------------------------------------------------------------------------- /filelists/valid.txt: -------------------------------------------------------------------------------- 1 | 240_144999_000031_000000 2 | 240_160592_000061_000000 3 | 240_160593_000049_000000 4 | 240_144999_000033_000000 5 | 240_160592_000063_000000 6 | 240_160593_000050_000000 7 | 240_144999_000036_000000 8 | 240_160592_000073_000000 9 | 240_160593_000052_000000 10 | 240_144999_000038_000000 11 | 240_160592_000075_000000 12 | 240_160593_000054_000000 13 | 240_144999_000042_000000 14 | 240_160592_000077_000000 15 | 240_160593_000057_000000 16 | 4133_6541_000001_000001 17 | 4133_6541_000019_000004 18 | 4133_6541_000031_000001 19 | 4133_6541_000047_000003 20 | 4133_6541_000002_000000 21 | 4133_6541_000019_000005 22 | 4133_6541_000032_000000 23 | 4133_6541_000047_000004 24 | 4133_6541_000004_000000 25 | 4133_6541_000020_000000 26 | 4133_6541_000033_000000 27 | 4133_6541_000049_000000 28 | 479_107479_000011_000000 29 | 479_107479_000043_000002 30 | 
479_107480_000017_000002 31 | 479_126480_000009_000000 32 | 479_107479_000013_000002 33 | 479_107479_000044_000000 34 | 479_107480_000017_000005 35 | 479_126480_000011_000000 36 | 479_107479_000014_000001 37 | 479_107479_000045_000001 38 | 479_107480_000017_000006 39 | 479_126480_000014_000000 40 | 5093_29101_000019_000005 41 | 5093_29101_000041_000000 42 | 5093_39749_000007_000004 43 | 5093_26496_000002_000012 44 | 5093_29101_000020_000000 45 | 5093_29101_000042_000000 46 | 5093_39749_000007_000005 47 | 5093_26496_000002_000013 48 | 5093_29101_000020_000001 49 | 5093_29101_000042_000001 50 | 5093_39749_000007_000007 51 | 5339_14133_000018_000004 52 | 5339_14134_000012_000000 53 | 5339_14134_000042_000002 54 | 5339_14134_000091_000008 55 | 5339_14133_000018_000006 56 | 5339_14134_000013_000000 57 | 5339_14134_000043_000000 58 | 5339_14134_000091_000009 59 | 5339_14133_000018_000007 60 | 5339_14134_000013_000001 61 | 5339_14134_000047_000000 62 | 5339_14134_000091_000010 63 | 5660_101883_000012_000000 64 | 5660_101884_000021_000000 65 | 5660_101892_000029_000003 66 | 5660_101883_000013_000000 67 | 5660_101884_000021_000002 68 | 5660_101892_000030_000000 69 | 5660_101883_000015_000000 70 | 5660_101884_000022_000000 71 | 5660_101892_000031_000001 72 | 5808_48608_000005_000001 73 | 5808_54425_000010_000005 74 | 5808_54425_000029_000000 75 | 5808_54425_000056_000000 76 | 5808_48608_000005_000003 77 | 5808_54425_000010_000006 78 | 5808_54425_000029_000002 79 | 5808_54425_000058_000000 80 | 5808_48608_000005_000004 81 | 5808_54425_000010_000008 82 | 5808_54425_000029_000003 83 | 5808_54425_000059_000000 84 | 7789_103120_000032_000004 85 | 7789_103120_000065_000000 86 | 7789_258266_000018_000000 87 | 7789_103120_000033_000000 88 | 7789_103120_000065_000001 89 | 7789_258266_000019_000001 90 | 7789_103120_000034_000000 91 | 7789_103120_000066_000000 92 | 7789_258266_000021_000000 93 | 7832_114468_000017_000001 94 | 7832_114468_000042_000003 95 | 7832_258250_000015_000012 96 | 7832_114468_000017_000002 97 | 7832_114468_000042_000004 98 | 7832_258250_000015_000019 99 | 7832_114468_000017_000003 100 | 7832_114468_000042_000005 101 | 7832_258250_000015_000020 102 | 8797_294123_000011_000005 103 | 8797_294123_000027_000001 104 | 8797_294123_000036_000005 105 | 8797_294123_000011_000007 106 | 8797_294123_000027_000002 107 | 8797_294123_000036_000006 108 | 8797_294123_000012_000005 109 | 8797_294123_000027_000003 110 | 8797_294123_000036_000007 -------------------------------------------------------------------------------- /hifi-gan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /hifi-gan/README.md: -------------------------------------------------------------------------------- 1 | # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis 2 | 3 | ### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae 4 | 5 | In our [paper](https://arxiv.org/abs/2010.05646), 6 | we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.
7 | We provide our implementation and pretrained models as open source in this repository. 8 | 9 | **Abstract :** 10 | Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms. 11 | Although such methods improve the sampling efficiency and memory usage, 12 | their sample quality has not yet reached that of autoregressive and flow-based generative models. 13 | In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis. 14 | As speech audio consists of sinusoidal signals with various periods, 15 | we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality. 16 | A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method 17 | demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than 18 | real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen 19 | speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times 20 | faster than real-time on CPU with comparable quality to an autoregressive counterpart. 21 | 22 | Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples. 23 | 24 | 25 | ## Pre-requisites 26 | 1. Python >= 3.6 27 | 2. Clone this repository. 28 | 3. Install python requirements. Please refer [requirements.txt](requirements.txt) 29 | 4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/). 30 | And move all wav files to `LJSpeech-1.1/wavs` 31 | 32 | 33 | ## Training 34 | ``` 35 | python train.py --config config_v1.json 36 | ``` 37 | To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.
38 | Checkpoints and a copy of the configuration file are saved in the `cp_hifigan` directory by default.
39 | You can change the path by adding `--checkpoint_path` option. 40 | 41 | Validation loss during training with V1 generator.
42 | ![validation loss](./validation_loss.png) 43 | 44 | ## Pretrained Model 45 | You can also use pretrained models we provide.
46 | [Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)
47 | Details of each folder are as follows: 48 | 49 | |Folder Name|Generator|Dataset|Fine-Tuned| 50 | |------|---|---|---| 51 | |LJ_V1|V1|LJSpeech|No| 52 | |LJ_V2|V2|LJSpeech|No| 53 | |LJ_V3|V3|LJSpeech|No| 54 | |LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 55 | |LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 56 | |LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))| 57 | |VCTK_V1|V1|VCTK|No| 58 | |VCTK_V2|V2|VCTK|No| 59 | |VCTK_V3|V3|VCTK|No| 60 | |UNIVERSAL_V1|V1|Universal|No| 61 | 62 | We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets. 63 | 64 | ## Fine-Tuning 65 | 1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.
66 | The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.
67 | Example: 68 | ``` 69 | Audio File : LJ001-0001.wav 70 | Mel-Spectrogram File : LJ001-0001.npy 71 | ``` 72 | 2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.
73 | 3. Run the following command. 74 | ``` 75 | python train.py --fine_tuning True --config config_v1.json 76 | ``` 77 | For other command line options, please refer to the training section. 78 | 79 | 80 | ## Inference from wav file 81 | 1. Make `test_files` directory and copy wav files into the directory. 82 | 2. Run the following command. 83 | ``` 84 | python inference.py --checkpoint_file [generator checkpoint file path] 85 | ``` 86 | Generated wav files are saved in `generated_files` by default.
87 | You can change the path by adding `--output_dir` option. 88 | 89 | 90 | ## Inference for end-to-end speech synthesis 91 | 1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.
92 | You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2), 93 | [Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth. 94 | 2. Run the following command. 95 | ``` 96 | python inference_e2e.py --checkpoint_file [generator checkpoint file path] 97 | ``` 98 | Generated wav files are saved in `generated_files_from_mel` by default.
99 | You can change the path by adding `--output_dir` option. 100 | 101 | 102 | ## Acknowledgements 103 | We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips) 104 | and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this. 105 | 106 | -------------------------------------------------------------------------------- /hifi-gan/env.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import os 4 | import shutil 5 | 6 | 7 | class AttrDict(dict): 8 | def __init__(self, *args, **kwargs): 9 | super(AttrDict, self).__init__(*args, **kwargs) 10 | self.__dict__ = self 11 | 12 | 13 | def build_env(config, config_name, path): 14 | t_path = os.path.join(path, config_name) 15 | if config != t_path: 16 | os.makedirs(path, exist_ok=True) 17 | shutil.copyfile(config, os.path.join(path, config_name)) 18 | -------------------------------------------------------------------------------- /hifi-gan/meldataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import math 4 | import os 5 | import random 6 | import torch 7 | import torch.utils.data 8 | import numpy as np 9 | from librosa.util import normalize 10 | from scipy.io.wavfile import read 11 | from librosa.filters import mel as librosa_mel_fn 12 | 13 | MAX_WAV_VALUE = 32768.0 14 | 15 | 16 | def load_wav(full_path): 17 | sampling_rate, data = read(full_path) 18 | return data, sampling_rate 19 | 20 | 21 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 22 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 23 | 24 | 25 | def dynamic_range_decompression(x, C=1): 26 | return np.exp(x) / C 27 | 28 | 29 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 30 | return torch.log(torch.clamp(x, min=clip_val) * C) 31 | 32 | 33 | def dynamic_range_decompression_torch(x, C=1): 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 52 | if torch.min(y) < -1.: 53 | print('min value is ', torch.min(y)) 54 | if torch.max(y) > 1.: 55 | print('max value is ', torch.max(y)) 56 | 57 | global mel_basis, hann_window 58 | if fmax not in mel_basis: 59 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 60 | mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device) 61 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True) 68 | 69 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 70 | 71 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 72 | spec = spectral_normalize_torch(spec) 73 | 74 | return spec 75 | 76 | 77 | def get_dataset_filelist(a): 78 | with open(a.input_training_file, 'r', 
encoding='utf-8') as fi: 79 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 80 | for x in fi.read().split('\n') if len(x) > 0] 81 | 82 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 83 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 84 | for x in fi.read().split('\n') if len(x) > 0] 85 | return training_files, validation_files 86 | 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, training_files, segment_size, n_fft, num_mels, 90 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 91 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 92 | self.audio_files = training_files 93 | random.seed(1234) 94 | if shuffle: 95 | random.shuffle(self.audio_files) 96 | self.segment_size = segment_size 97 | self.sampling_rate = sampling_rate 98 | self.split = split 99 | self.n_fft = n_fft 100 | self.num_mels = num_mels 101 | self.hop_size = hop_size 102 | self.win_size = win_size 103 | self.fmin = fmin 104 | self.fmax = fmax 105 | self.fmax_loss = fmax_loss 106 | self.cached_wav = None 107 | self.n_cache_reuse = n_cache_reuse 108 | self._cache_ref_count = 0 109 | self.device = device 110 | self.fine_tuning = fine_tuning 111 | self.base_mels_path = base_mels_path 112 | 113 | def __getitem__(self, index): 114 | filename = self.audio_files[index] 115 | if self._cache_ref_count == 0: 116 | audio, sampling_rate = load_wav(filename) 117 | audio = audio / MAX_WAV_VALUE 118 | if not self.fine_tuning: 119 | audio = normalize(audio) * 0.95 120 | self.cached_wav = audio 121 | if sampling_rate != self.sampling_rate: 122 | raise ValueError("{} SR doesn't match target {} SR".format( 123 | sampling_rate, self.sampling_rate)) 124 | self._cache_ref_count = self.n_cache_reuse 125 | else: 126 | audio = self.cached_wav 127 | self._cache_ref_count -= 1 128 | 129 | audio = torch.FloatTensor(audio) 130 | audio = audio.unsqueeze(0) 131 | 132 | if not self.fine_tuning: 133 | if self.split: 134 | if audio.size(1) >= self.segment_size: 135 | max_audio_start = audio.size(1) - self.segment_size 136 | audio_start = random.randint(0, max_audio_start) 137 | audio = audio[:, audio_start:audio_start+self.segment_size] 138 | else: 139 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 140 | 141 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 142 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 143 | center=False) 144 | else: 145 | mel = np.load( 146 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 147 | mel = torch.from_numpy(mel) 148 | 149 | if len(mel.shape) < 3: 150 | mel = mel.unsqueeze(0) 151 | 152 | if self.split: 153 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 154 | 155 | if audio.size(1) >= self.segment_size: 156 | mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1) 157 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 158 | audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 159 | else: 160 | mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant') 161 | audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant') 162 | 163 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 164 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 165 | center=False) 166 | 167 | return 
(mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 168 | 169 | def __len__(self): 170 | return len(self.audio_files) 171 | -------------------------------------------------------------------------------- /hifi-gan/models.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import torch.nn as nn 6 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 7 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 8 | from xutils import init_weights, get_padding 9 | 10 | LRELU_SLOPE = 0.1 11 | 12 | 13 | class ResBlock1(torch.nn.Module): 14 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 15 | super(ResBlock1, self).__init__() 16 | self.h = h 17 | self.convs1 = nn.ModuleList([ 18 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 19 | padding=get_padding(kernel_size, dilation[0]))), 20 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 21 | padding=get_padding(kernel_size, dilation[1]))), 22 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 23 | padding=get_padding(kernel_size, dilation[2]))) 24 | ]) 25 | self.convs1.apply(init_weights) 26 | 27 | self.convs2 = nn.ModuleList([ 28 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 29 | padding=get_padding(kernel_size, 1))), 30 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 31 | padding=get_padding(kernel_size, 1))), 32 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 33 | padding=get_padding(kernel_size, 1))) 34 | ]) 35 | self.convs2.apply(init_weights) 36 | 37 | def forward(self, x): 38 | for c1, c2 in zip(self.convs1, self.convs2): 39 | xt = F.leaky_relu(x, LRELU_SLOPE) 40 | xt = c1(xt) 41 | xt = F.leaky_relu(xt, LRELU_SLOPE) 42 | xt = c2(xt) 43 | x = xt + x 44 | return x 45 | 46 | def remove_weight_norm(self): 47 | for l in self.convs1: 48 | remove_weight_norm(l) 49 | for l in self.convs2: 50 | remove_weight_norm(l) 51 | 52 | 53 | class ResBlock2(torch.nn.Module): 54 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 55 | super(ResBlock2, self).__init__() 56 | self.h = h 57 | self.convs = nn.ModuleList([ 58 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 59 | padding=get_padding(kernel_size, dilation[0]))), 60 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 61 | padding=get_padding(kernel_size, dilation[1]))) 62 | ]) 63 | self.convs.apply(init_weights) 64 | 65 | def forward(self, x): 66 | for c in self.convs: 67 | xt = F.leaky_relu(x, LRELU_SLOPE) 68 | xt = c(xt) 69 | x = xt + x 70 | return x 71 | 72 | def remove_weight_norm(self): 73 | for l in self.convs: 74 | remove_weight_norm(l) 75 | 76 | 77 | class Generator(torch.nn.Module): 78 | def __init__(self, h): 79 | super(Generator, self).__init__() 80 | self.h = h 81 | self.num_kernels = len(h.resblock_kernel_sizes) 82 | self.num_upsamples = len(h.upsample_rates) 83 | self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) 84 | resblock = ResBlock1 if h.resblock == '1' else ResBlock2 85 | 86 | self.ups = nn.ModuleList() 87 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 88 | self.ups.append(weight_norm( 89 | ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), 90 | k, u, 
padding=(k-u)//2))) 91 | 92 | self.resblocks = nn.ModuleList() 93 | for i in range(len(self.ups)): 94 | ch = h.upsample_initial_channel//(2**(i+1)) 95 | for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): 96 | self.resblocks.append(resblock(h, ch, k, d)) 97 | 98 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 99 | self.ups.apply(init_weights) 100 | self.conv_post.apply(init_weights) 101 | 102 | def forward(self, x): 103 | x = self.conv_pre(x) 104 | for i in range(self.num_upsamples): 105 | x = F.leaky_relu(x, LRELU_SLOPE) 106 | x = self.ups[i](x) 107 | xs = None 108 | for j in range(self.num_kernels): 109 | if xs is None: 110 | xs = self.resblocks[i*self.num_kernels+j](x) 111 | else: 112 | xs += self.resblocks[i*self.num_kernels+j](x) 113 | x = xs / self.num_kernels 114 | x = F.leaky_relu(x) 115 | x = self.conv_post(x) 116 | x = torch.tanh(x) 117 | 118 | return x 119 | 120 | def remove_weight_norm(self): 121 | print('Removing weight norm...') 122 | for l in self.ups: 123 | remove_weight_norm(l) 124 | for l in self.resblocks: 125 | l.remove_weight_norm() 126 | remove_weight_norm(self.conv_pre) 127 | remove_weight_norm(self.conv_post) 128 | 129 | 130 | class DiscriminatorP(torch.nn.Module): 131 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): 132 | super(DiscriminatorP, self).__init__() 133 | self.period = period 134 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 135 | self.convs = nn.ModuleList([ 136 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 137 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 138 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 139 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), 140 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 141 | ]) 142 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 143 | 144 | def forward(self, x): 145 | fmap = [] 146 | 147 | # 1d to 2d 148 | b, c, t = x.shape 149 | if t % self.period != 0: # pad first 150 | n_pad = self.period - (t % self.period) 151 | x = F.pad(x, (0, n_pad), "reflect") 152 | t = t + n_pad 153 | x = x.view(b, c, t // self.period, self.period) 154 | 155 | for l in self.convs: 156 | x = l(x) 157 | x = F.leaky_relu(x, LRELU_SLOPE) 158 | fmap.append(x) 159 | x = self.conv_post(x) 160 | fmap.append(x) 161 | x = torch.flatten(x, 1, -1) 162 | 163 | return x, fmap 164 | 165 | 166 | class MultiPeriodDiscriminator(torch.nn.Module): 167 | def __init__(self): 168 | super(MultiPeriodDiscriminator, self).__init__() 169 | self.discriminators = nn.ModuleList([ 170 | DiscriminatorP(2), 171 | DiscriminatorP(3), 172 | DiscriminatorP(5), 173 | DiscriminatorP(7), 174 | DiscriminatorP(11), 175 | ]) 176 | 177 | def forward(self, y, y_hat): 178 | y_d_rs = [] 179 | y_d_gs = [] 180 | fmap_rs = [] 181 | fmap_gs = [] 182 | for i, d in enumerate(self.discriminators): 183 | y_d_r, fmap_r = d(y) 184 | y_d_g, fmap_g = d(y_hat) 185 | y_d_rs.append(y_d_r) 186 | fmap_rs.append(fmap_r) 187 | y_d_gs.append(y_d_g) 188 | fmap_gs.append(fmap_g) 189 | 190 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 191 | 192 | 193 | class DiscriminatorS(torch.nn.Module): 194 | def __init__(self, use_spectral_norm=False): 195 | super(DiscriminatorS, self).__init__() 196 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 197 | self.convs = nn.ModuleList([ 
198 | norm_f(Conv1d(1, 128, 15, 1, padding=7)), 199 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 200 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 201 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 202 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 203 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 204 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 205 | ]) 206 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 207 | 208 | def forward(self, x): 209 | fmap = [] 210 | for l in self.convs: 211 | x = l(x) 212 | x = F.leaky_relu(x, LRELU_SLOPE) 213 | fmap.append(x) 214 | x = self.conv_post(x) 215 | fmap.append(x) 216 | x = torch.flatten(x, 1, -1) 217 | 218 | return x, fmap 219 | 220 | 221 | class MultiScaleDiscriminator(torch.nn.Module): 222 | def __init__(self): 223 | super(MultiScaleDiscriminator, self).__init__() 224 | self.discriminators = nn.ModuleList([ 225 | DiscriminatorS(use_spectral_norm=True), 226 | DiscriminatorS(), 227 | DiscriminatorS(), 228 | ]) 229 | self.meanpools = nn.ModuleList([ 230 | AvgPool1d(4, 2, padding=2), 231 | AvgPool1d(4, 2, padding=2) 232 | ]) 233 | 234 | def forward(self, y, y_hat): 235 | y_d_rs = [] 236 | y_d_gs = [] 237 | fmap_rs = [] 238 | fmap_gs = [] 239 | for i, d in enumerate(self.discriminators): 240 | if i != 0: 241 | y = self.meanpools[i-1](y) 242 | y_hat = self.meanpools[i-1](y_hat) 243 | y_d_r, fmap_r = d(y) 244 | y_d_g, fmap_g = d(y_hat) 245 | y_d_rs.append(y_d_r) 246 | fmap_rs.append(fmap_r) 247 | y_d_gs.append(y_d_g) 248 | fmap_gs.append(fmap_g) 249 | 250 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 251 | 252 | 253 | def feature_loss(fmap_r, fmap_g): 254 | loss = 0 255 | for dr, dg in zip(fmap_r, fmap_g): 256 | for rl, gl in zip(dr, dg): 257 | loss += torch.mean(torch.abs(rl - gl)) 258 | 259 | return loss*2 260 | 261 | 262 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 263 | loss = 0 264 | r_losses = [] 265 | g_losses = [] 266 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 267 | r_loss = torch.mean((1-dr)**2) 268 | g_loss = torch.mean(dg**2) 269 | loss += (r_loss + g_loss) 270 | r_losses.append(r_loss.item()) 271 | g_losses.append(g_loss.item()) 272 | 273 | return loss, r_losses, g_losses 274 | 275 | 276 | def generator_loss(disc_outputs): 277 | loss = 0 278 | gen_losses = [] 279 | for dg in disc_outputs: 280 | l = torch.mean((1-dg)**2) 281 | gen_losses.append(l) 282 | loss += l 283 | 284 | return loss, gen_losses 285 | 286 | -------------------------------------------------------------------------------- /hifi-gan/xutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/jik876/hifi-gan """ 2 | 3 | import glob 4 | import os 5 | import matplotlib 6 | import torch 7 | from torch.nn.utils import weight_norm 8 | matplotlib.use("Agg") 9 | import matplotlib.pylab as plt 10 | 11 | 12 | def plot_spectrogram(spectrogram): 13 | fig, ax = plt.subplots(figsize=(10, 2)) 14 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 15 | interpolation='none') 16 | plt.colorbar(im, ax=ax) 17 | 18 | fig.canvas.draw() 19 | plt.close() 20 | 21 | return fig 22 | 23 | 24 | def init_weights(m, mean=0.0, std=0.01): 25 | classname = m.__class__.__name__ 26 | if classname.find("Conv") != -1: 27 | m.weight.data.normal_(mean, std) 28 | 29 | 30 | def apply_weight_norm(m): 31 | classname = m.__class__.__name__ 32 | if classname.find("Conv") != -1: 33 | weight_norm(m) 34 | 35 | 36 | def 
get_padding(kernel_size, dilation=1): 37 | return int((kernel_size*dilation - dilation)/2) 38 | 39 | 40 | def load_checkpoint(filepath, device): 41 | assert os.path.isfile(filepath) 42 | print("Loading '{}'".format(filepath)) 43 | checkpoint_dict = torch.load(filepath, map_location=device) 44 | print("Complete.") 45 | return checkpoint_dict 46 | 47 | 48 | def save_checkpoint(filepath, obj): 49 | print("Saving checkpoint to {}".format(filepath)) 50 | torch.save(obj, filepath) 51 | print("Complete.") 52 | 53 | 54 | def scan_checkpoint(cp_dir, prefix): 55 | pattern = os.path.join(cp_dir, prefix + '????????') 56 | cp_list = glob.glob(pattern) 57 | if len(cp_list) == 0: 58 | return None 59 | return sorted(cp_list)[-1] 60 | 61 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import numpy as np 5 | from tqdm import tqdm 6 | import soundfile as sf 7 | import torch 8 | use_gpu = torch.cuda.is_available() 9 | 10 | import librosa 11 | from librosa.core import load 12 | from librosa.filters import mel as librosa_mel_fn 13 | mel_basis = librosa_mel_fn(22050, 1024, 80, 0, 8000) 14 | 15 | import params 16 | from model import DiffVC 17 | 18 | import sys 19 | sys.path.append('hifi-gan/') 20 | from env import AttrDict 21 | from models import Generator as HiFiGAN 22 | 23 | sys.path.append('speaker_encoder/') 24 | from encoder import inference as spk_encoder 25 | from pathlib import Path 26 | 27 | 28 | class Inferencer(): 29 | def __init__(self, generator, spk_encoder, hifigan_universal, output_path="./output_demo", use_gpu=False): 30 | 31 | self.generator = generator 32 | self.spk_encoder = spk_encoder 33 | self.hifigan_universal = hifigan_universal 34 | # if not os.path.isdir(output_path): 35 | # os.makedirs(output_path) 36 | 37 | self.output_path = output_path 38 | 39 | self.use_gpu = use_gpu 40 | 41 | 42 | def get_mel(self, wav_path): 43 | wav, _ = load(wav_path, sr=22050) 44 | wav = wav[:(wav.shape[0] // 256)*256] 45 | wav = np.pad(wav, 384, mode='reflect') 46 | stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False) 47 | stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9)) 48 | mel_spectrogram = np.matmul(mel_basis, stftm) 49 | log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None)) 50 | return log_mel_spectrogram 51 | 52 | def get_embed(self, wav_path): 53 | wav_preprocessed = spk_encoder.preprocess_wav(wav_path) 54 | embed = spk_encoder.embed_utterance(wav_preprocessed) 55 | return embed 56 | 57 | def noise_median_smoothing(self, x, w=5): 58 | y = np.copy(x) 59 | x = np.pad(x, w, "edge") 60 | for i in range(y.shape[0]): 61 | med = np.median(x[i:i+2*w+1]) 62 | y[i] = min(x[i+w+1], med) 63 | return y 64 | 65 | def mel_spectral_subtraction(self, mel_synth, mel_source, spectral_floor=0.02, silence_window=5, smoothing_window=5): 66 | mel_len = mel_source.shape[-1] 67 | energy_min = 100000.0 68 | i_min = 0 69 | for i in range(mel_len - silence_window): 70 | energy_cur = np.sum(np.exp(2.0 * mel_source[:, i:i+silence_window])) 71 | if energy_cur < energy_min: 72 | i_min = i 73 | energy_min = energy_cur 74 | estimated_noise_energy = np.min(np.exp(2.0 * mel_synth[:, i_min:i_min+silence_window]), axis=-1) 75 | if smoothing_window is not None: 76 | estimated_noise_energy = self.noise_median_smoothing(estimated_noise_energy, smoothing_window) 77 | 
mel_denoised = np.copy(mel_synth) 78 | for i in range(mel_len): 79 | signal_subtract_noise = np.exp(2.0 * mel_synth[:, i]) - estimated_noise_energy 80 | estimated_signal_energy = np.maximum(signal_subtract_noise, spectral_floor * estimated_noise_energy) 81 | mel_denoised[:, i] = np.log(np.sqrt(estimated_signal_energy)) 82 | return mel_denoised 83 | 84 | 85 | def infer(self, src_path, tgt_path, n_timesteps=30, return_output_path=False, sr=16000): 86 | 87 | source_basename = os.path.basename(src_path).split('.wav')[0] 88 | target_basename = os.path.basename(tgt_path).split('.wav')[0] 89 | output_basename = f'{source_basename}_to_{target_basename}' 90 | output_wav = os.path.join(self.output_path, output_basename+'.wav') 91 | 92 | mel_source = torch.from_numpy(self.get_mel(src_path)).float().unsqueeze(0) 93 | if self.use_gpu: 94 | mel_source = mel_source.cuda() 95 | mel_source_lengths = torch.LongTensor([mel_source.shape[-1]]) 96 | if self.use_gpu: 97 | mel_source_lengths = mel_source_lengths.cuda() 98 | 99 | mel_target = torch.from_numpy(self.get_mel(tgt_path)).float().unsqueeze(0) 100 | if self.use_gpu: 101 | mel_target = mel_target.cuda() 102 | mel_target_lengths = torch.LongTensor([mel_target.shape[-1]]) 103 | if self.use_gpu: 104 | mel_target_lengths = mel_target_lengths.cuda() 105 | 106 | embed_target = torch.from_numpy(self.get_embed(tgt_path)).float().unsqueeze(0) 107 | if self.use_gpu: 108 | embed_target = embed_target.cuda() 109 | 110 | 111 | # performing voice conversion 112 | mel_encoded, mel_ = self.generator.forward(mel_source, mel_source_lengths, mel_target, mel_target_lengths, embed_target, 113 | n_timesteps=n_timesteps, mode='ml') 114 | mel_synth_np = mel_.cpu().detach().squeeze().numpy() 115 | mel_source_np = mel_.cpu().detach().squeeze().numpy() 116 | mel = torch.from_numpy(self.mel_spectral_subtraction(mel_synth_np, mel_source_np, smoothing_window=1)).float().unsqueeze(0) 117 | if self.use_gpu: 118 | mel = mel.cuda() 119 | 120 | with torch.no_grad(): 121 | audio = self.hifigan_universal.forward(mel).cpu().squeeze().clamp(-1, 1) 122 | print(audio.shape) 123 | sf.write(f'{output_wav}', audio, sr) 124 | 125 | if return_output_path: 126 | return output_wav 127 | else: 128 | return audio 129 | 130 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | from .vc import DiffVC -------------------------------------------------------------------------------- /model/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
8 | 9 | import numpy as np 10 | import torch 11 | 12 | 13 | class BaseModule(torch.nn.Module): 14 | def __init__(self): 15 | super(BaseModule, self).__init__() 16 | 17 | @property 18 | def nparams(self): 19 | num_params = 0 20 | for name, param in self.named_parameters(): 21 | if param.requires_grad: 22 | num_params += np.prod(param.detach().cpu().numpy().shape) 23 | return num_params 24 | 25 | 26 | def relocate_input(self, x: list): 27 | device = next(self.parameters()).device 28 | for i in range(len(x)): 29 | if isinstance(x[i], torch.Tensor) and x[i].device != device: 30 | x[i] = x[i].to(device) 31 | return x 32 | -------------------------------------------------------------------------------- /model/diffusion.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import math 10 | import torch 11 | 12 | from model.base import BaseModule 13 | from model.modules import Mish, Upsample, Downsample, Rezero, Block, ResnetBlock 14 | from model.modules import LinearAttention, Residual, SinusoidalPosEmb, RefBlock 15 | 16 | 17 | class GradLogPEstimator(BaseModule): 18 | def __init__(self, dim_base, dim_cond, use_ref_t, dim_mults=(1, 2, 4)): 19 | super(GradLogPEstimator, self).__init__() 20 | self.use_ref_t = use_ref_t 21 | dims = [2 + dim_cond, *map(lambda m: dim_base * m, dim_mults)] 22 | in_out = list(zip(dims[:-1], dims[1:])) 23 | 24 | self.time_pos_emb = SinusoidalPosEmb(dim_base) 25 | self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_base, dim_base * 4), 26 | Mish(), torch.nn.Linear(dim_base * 4, dim_base)) 27 | 28 | cond_total = dim_base + 256 29 | if use_ref_t: 30 | self.ref_block = RefBlock(out_dim=dim_cond, time_emb_dim=dim_base) 31 | cond_total += dim_cond 32 | self.cond_block = torch.nn.Sequential(torch.nn.Linear(cond_total, 4 * dim_cond), 33 | Mish(), torch.nn.Linear(4 * dim_cond, dim_cond)) 34 | 35 | self.downs = torch.nn.ModuleList([]) 36 | self.ups = torch.nn.ModuleList([]) 37 | num_resolutions = len(in_out) 38 | 39 | for ind, (dim_in, dim_out) in enumerate(in_out): 40 | is_last = ind >= (num_resolutions - 1) 41 | self.downs.append(torch.nn.ModuleList([ 42 | ResnetBlock(dim_in, dim_out, time_emb_dim=dim_base), 43 | ResnetBlock(dim_out, dim_out, time_emb_dim=dim_base), 44 | Residual(Rezero(LinearAttention(dim_out))), 45 | Downsample(dim_out) if not is_last else torch.nn.Identity()])) 46 | 47 | mid_dim = dims[-1] 48 | self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) 49 | self.mid_attn = Residual(Rezero(LinearAttention(mid_dim))) 50 | self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) 51 | 52 | for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])): 53 | self.ups.append(torch.nn.ModuleList([ 54 | ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim_base), 55 | ResnetBlock(dim_in, dim_in, time_emb_dim=dim_base), 56 | Residual(Rezero(LinearAttention(dim_in))), 57 | Upsample(dim_in)])) 58 | self.final_block = Block(dim_base, dim_base) 59 | self.final_conv = torch.nn.Conv2d(dim_base, 1, 1) 60 | 61 | def forward(self, x, x_mask, mean, ref, ref_mask, c, t): 62 | condition = 
self.time_pos_emb(t) 63 | t = self.mlp(condition) 64 | 65 | x = torch.stack([mean, x], 1) 66 | x_mask = x_mask.unsqueeze(1) 67 | ref_mask = ref_mask.unsqueeze(1) 68 | 69 | if self.use_ref_t: 70 | condition = torch.cat([condition, self.ref_block(ref, ref_mask, t)], 1) 71 | condition = torch.cat([condition, c], 1) 72 | 73 | condition = self.cond_block(condition).unsqueeze(-1).unsqueeze(-1) 74 | condition = torch.cat(x.shape[2]*[condition], 2) 75 | condition = torch.cat(x.shape[3]*[condition], 3) 76 | x = torch.cat([x, condition], 1) 77 | 78 | hiddens = [] 79 | masks = [x_mask] 80 | for resnet1, resnet2, attn, downsample in self.downs: 81 | mask_down = masks[-1] 82 | x = resnet1(x, mask_down, t) 83 | x = resnet2(x, mask_down, t) 84 | x = attn(x) 85 | hiddens.append(x) 86 | x = downsample(x * mask_down) 87 | masks.append(mask_down[:, :, :, ::2]) 88 | 89 | masks = masks[:-1] 90 | mask_mid = masks[-1] 91 | x = self.mid_block1(x, mask_mid, t) 92 | x = self.mid_attn(x) 93 | x = self.mid_block2(x, mask_mid, t) 94 | 95 | for resnet1, resnet2, attn, upsample in self.ups: 96 | mask_up = masks.pop() 97 | x = torch.cat((x, hiddens.pop()), dim=1) 98 | x = resnet1(x, mask_up, t) 99 | x = resnet2(x, mask_up, t) 100 | x = attn(x) 101 | x = upsample(x * mask_up) 102 | 103 | x = self.final_block(x, x_mask) 104 | output = self.final_conv(x * x_mask) 105 | 106 | return (output * x_mask).squeeze(1) 107 | 108 | 109 | class Diffusion(BaseModule): 110 | def __init__(self, n_feats, dim_unet, dim_spk, use_ref_t, beta_min, beta_max): 111 | super(Diffusion, self).__init__() 112 | self.estimator = GradLogPEstimator(dim_unet, dim_spk, use_ref_t) 113 | self.n_feats = n_feats 114 | self.dim_unet = dim_unet 115 | self.dim_spk = dim_spk 116 | self.use_ref_t = use_ref_t 117 | self.beta_min = beta_min 118 | self.beta_max = beta_max 119 | 120 | def get_beta(self, t): 121 | beta = self.beta_min + (self.beta_max - self.beta_min) * t 122 | return beta 123 | 124 | def get_gamma(self, s, t, p=1.0, use_torch=False): 125 | beta_integral = self.beta_min + 0.5*(self.beta_max - self.beta_min)*(t + s) 126 | beta_integral *= (t - s) 127 | if use_torch: 128 | gamma = torch.exp(-0.5*p*beta_integral).unsqueeze(-1).unsqueeze(-1) 129 | else: 130 | gamma = math.exp(-0.5*p*beta_integral) 131 | return gamma 132 | 133 | def get_mu(self, s, t): 134 | a = self.get_gamma(s, t) 135 | b = 1.0 - self.get_gamma(0, s, p=2.0) 136 | c = 1.0 - self.get_gamma(0, t, p=2.0) 137 | return a * b / c 138 | 139 | def get_nu(self, s, t): 140 | a = self.get_gamma(0, s) 141 | b = 1.0 - self.get_gamma(s, t, p=2.0) 142 | c = 1.0 - self.get_gamma(0, t, p=2.0) 143 | return a * b / c 144 | 145 | def get_sigma(self, s, t): 146 | a = 1.0 - self.get_gamma(0, s, p=2.0) 147 | b = 1.0 - self.get_gamma(s, t, p=2.0) 148 | c = 1.0 - self.get_gamma(0, t, p=2.0) 149 | return math.sqrt(a * b / c) 150 | 151 | def compute_diffused_mean(self, x0, mask, mean, t, use_torch=False): 152 | x0_weight = self.get_gamma(0, t, use_torch=use_torch) 153 | mean_weight = 1.0 - x0_weight 154 | xt_mean = x0 * x0_weight + mean * mean_weight 155 | return xt_mean * mask 156 | 157 | def forward_diffusion(self, x0, mask, mean, t): 158 | xt_mean = self.compute_diffused_mean(x0, mask, mean, t, use_torch=True) 159 | variance = 1.0 - self.get_gamma(0, t, p=2.0, use_torch=True) 160 | z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device, requires_grad=False) 161 | xt = xt_mean + z * torch.sqrt(variance) 162 | return xt * mask, z * mask 163 | 164 | @torch.no_grad() 165 | def reverse_diffusion(self, z, mask, 
mean, ref, ref_mask, mean_ref, c, 166 | n_timesteps, mode): 167 | h = 1.0 / n_timesteps 168 | xt = z * mask 169 | for i in range(n_timesteps): 170 | t = 1.0 - i*h 171 | time = t * torch.ones(z.shape[0], dtype=z.dtype, device=z.device) 172 | beta_t = self.get_beta(t) 173 | xt_ref = [self.compute_diffused_mean(ref, ref_mask, mean_ref, t)] 174 | # for j in range(15): 175 | # xt_ref += [self.compute_diffused_mean(ref, ref_mask, mean_ref, (j+0.5)/15.0)] 176 | xt_ref = torch.stack(xt_ref, 1) 177 | if mode == 'pf': 178 | dxt = 0.5 * (mean - xt - self.estimator(xt, mask, mean, xt_ref, ref_mask, c, time)) * (beta_t * h) 179 | else: 180 | if mode == 'ml': 181 | kappa = self.get_gamma(0, t - h) * (1.0 - self.get_gamma(t - h, t, p=2.0)) 182 | kappa /= (self.get_gamma(0, t) * beta_t * h) 183 | kappa -= 1.0 184 | omega = self.get_nu(t - h, t) / self.get_gamma(0, t) 185 | omega += self.get_mu(t - h, t) 186 | omega -= (0.5 * beta_t * h + 1.0) 187 | sigma = self.get_sigma(t - h, t) 188 | else: 189 | kappa = 0.0 190 | omega = 0.0 191 | sigma = math.sqrt(beta_t * h) 192 | dxt = (mean - xt) * (0.5 * beta_t * h + omega) 193 | dxt -= self.estimator(xt, mask, mean, xt_ref, ref_mask, c, time) * (1.0 + kappa) * (beta_t * h) 194 | dxt += torch.randn_like(z, device=z.device) * sigma 195 | xt = (xt - dxt) * mask 196 | return xt 197 | 198 | @torch.no_grad() 199 | def forward(self, z, mask, mean, ref, ref_mask, mean_ref, c, 200 | n_timesteps, mode): 201 | if mode not in ['pf', 'em', 'ml']: 202 | print('Inference mode must be one of [pf, em, ml]!') 203 | return z 204 | return self.reverse_diffusion(z, mask, mean, ref, ref_mask, mean_ref, c, 205 | n_timesteps, mode) 206 | 207 | def loss_t(self, x0, mask, mean, x_ref, mean_ref, c, t): 208 | xt, z = self.forward_diffusion(x0, mask, mean, t) 209 | xt_ref = [self.compute_diffused_mean(x_ref, mask, mean_ref, t, use_torch=True)] 210 | # for j in range(15): 211 | # xt_ref += [self.compute_diffused_mean(x_ref, mask, mean_ref, (j+0.5)/15.0)] 212 | xt_ref = torch.stack(xt_ref, 1) 213 | z_estimation = self.estimator(xt, mask, mean, xt_ref, mask, c, t) 214 | z_estimation *= torch.sqrt(1.0 - self.get_gamma(0, t, p=2.0, use_torch=True)) 215 | loss = torch.sum((z_estimation + z)**2) / (torch.sum(mask)*self.n_feats) 216 | return loss 217 | 218 | def compute_loss(self, x0, mask, mean, x_ref, mean_ref, c, offset=1e-5): 219 | b = x0.shape[0] 220 | t = torch.rand(b, dtype=x0.dtype, device=x0.device, requires_grad=False) 221 | t = torch.clamp(t, offset, 1.0 - offset) 222 | return self.loss_t(x0, mask, mean, x_ref, mean_ref, c, t) 223 | -------------------------------------------------------------------------------- /model/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
8 | 9 | import math 10 | import torch 11 | from einops import rearrange 12 | 13 | from model.base import BaseModule 14 | 15 | 16 | class Mish(BaseModule): 17 | def forward(self, x): 18 | return x * torch.tanh(torch.nn.functional.softplus(x)) 19 | 20 | 21 | class Upsample(BaseModule): 22 | def __init__(self, dim): 23 | super(Upsample, self).__init__() 24 | self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1) 25 | 26 | def forward(self, x): 27 | return self.conv(x) 28 | 29 | 30 | class Downsample(BaseModule): 31 | def __init__(self, dim): 32 | super(Downsample, self).__init__() 33 | self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1) 34 | 35 | def forward(self, x): 36 | return self.conv(x) 37 | 38 | 39 | class Rezero(BaseModule): 40 | def __init__(self, fn): 41 | super(Rezero, self).__init__() 42 | self.fn = fn 43 | self.g = torch.nn.Parameter(torch.zeros(1)) 44 | 45 | def forward(self, x): 46 | return self.fn(x) * self.g 47 | 48 | 49 | class Block(BaseModule): 50 | def __init__(self, dim, dim_out, groups=8): 51 | super(Block, self).__init__() 52 | self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3, 53 | padding=1), torch.nn.GroupNorm( 54 | groups, dim_out), Mish()) 55 | 56 | def forward(self, x, mask): 57 | output = self.block(x * mask) 58 | return output * mask 59 | 60 | 61 | class ResnetBlock(BaseModule): 62 | def __init__(self, dim, dim_out, time_emb_dim, groups=8): 63 | super(ResnetBlock, self).__init__() 64 | self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 65 | dim_out)) 66 | 67 | self.block1 = Block(dim, dim_out, groups=groups) 68 | self.block2 = Block(dim_out, dim_out, groups=groups) 69 | if dim != dim_out: 70 | self.res_conv = torch.nn.Conv2d(dim, dim_out, 1) 71 | else: 72 | self.res_conv = torch.nn.Identity() 73 | 74 | def forward(self, x, mask, time_emb): 75 | h = self.block1(x, mask) 76 | h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1) 77 | h = self.block2(h, mask) 78 | output = h + self.res_conv(x * mask) 79 | return output 80 | 81 | 82 | class LinearAttention(BaseModule): 83 | def __init__(self, dim, heads=4, dim_head=32): 84 | super(LinearAttention, self).__init__() 85 | self.heads = heads 86 | hidden_dim = dim_head * heads 87 | self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) 88 | self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1) 89 | 90 | def forward(self, x): 91 | b, c, h, w = x.shape 92 | qkv = self.to_qkv(x) 93 | q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', 94 | heads = self.heads, qkv=3) 95 | k = k.softmax(dim=-1) 96 | context = torch.einsum('bhdn,bhen->bhde', k, v) 97 | out = torch.einsum('bhde,bhdn->bhen', context, q) 98 | out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', 99 | heads=self.heads, h=h, w=w) 100 | return self.to_out(out) 101 | 102 | 103 | class Residual(BaseModule): 104 | def __init__(self, fn): 105 | super(Residual, self).__init__() 106 | self.fn = fn 107 | 108 | def forward(self, x, *args, **kwargs): 109 | output = self.fn(x, *args, **kwargs) + x 110 | return output 111 | 112 | 113 | class SinusoidalPosEmb(BaseModule): 114 | def __init__(self, dim): 115 | super(SinusoidalPosEmb, self).__init__() 116 | self.dim = dim 117 | 118 | def forward(self, x): 119 | device = x.device 120 | half_dim = self.dim // 2 121 | emb = math.log(10000) / (half_dim - 1) 122 | emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) 123 | emb = 1000.0 * x.unsqueeze(1) * emb.unsqueeze(0) 124 | emb = torch.cat((emb.sin(), emb.cos()), dim=-1) 125 | return emb 126 | 127 | 128 | 
class RefBlock(BaseModule): 129 | def __init__(self, out_dim, time_emb_dim): 130 | super(RefBlock, self).__init__() 131 | base_dim = out_dim // 4 132 | self.mlp1 = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 133 | base_dim)) 134 | self.mlp2 = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, 135 | 2 * base_dim)) 136 | self.block11 = torch.nn.Sequential(torch.nn.Conv2d(1, 2 * base_dim, 137 | 3, 1, 1), torch.nn.InstanceNorm2d(2 * base_dim, affine=True), 138 | torch.nn.GLU(dim=1)) 139 | self.block12 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 2 * base_dim, 140 | 3, 1, 1), torch.nn.InstanceNorm2d(2 * base_dim, affine=True), 141 | torch.nn.GLU(dim=1)) 142 | self.block21 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 4 * base_dim, 143 | 3, 1, 1), torch.nn.InstanceNorm2d(4 * base_dim, affine=True), 144 | torch.nn.GLU(dim=1)) 145 | self.block22 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 4 * base_dim, 146 | 3, 1, 1), torch.nn.InstanceNorm2d(4 * base_dim, affine=True), 147 | torch.nn.GLU(dim=1)) 148 | self.block31 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 8 * base_dim, 149 | 3, 1, 1), torch.nn.InstanceNorm2d(8 * base_dim, affine=True), 150 | torch.nn.GLU(dim=1)) 151 | self.block32 = torch.nn.Sequential(torch.nn.Conv2d(4 * base_dim, 8 * base_dim, 152 | 3, 1, 1), torch.nn.InstanceNorm2d(8 * base_dim, affine=True), 153 | torch.nn.GLU(dim=1)) 154 | self.final_conv = torch.nn.Conv2d(4 * base_dim, out_dim, 1) 155 | 156 | def forward(self, x, mask, time_emb): 157 | y = self.block11(x * mask) 158 | y = self.block12(y * mask) 159 | y += self.mlp1(time_emb).unsqueeze(-1).unsqueeze(-1) 160 | y = self.block21(y * mask) 161 | y = self.block22(y * mask) 162 | y += self.mlp2(time_emb).unsqueeze(-1).unsqueeze(-1) 163 | y = self.block31(y * mask) 164 | y = self.block32(y * mask) 165 | y = self.final_conv(y * mask) 166 | return (y * mask).sum((2, 3)) / (mask.sum((2, 3)) * x.shape[2]) 167 | -------------------------------------------------------------------------------- /model/postnet.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
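# A minimal shape check for RefBlock above, which pools a reference mel-spectrogram into one
# conditioning vector per utterance (sketch with arbitrary sizes; assumes model.modules is importable):
import torch
from model.modules import RefBlock

block = RefBlock(out_dim=128, time_emb_dim=64)
ref = torch.randn(2, 1, 80, 172)        # (batch, 1, n_mels, ref_frames)
ref_mask = torch.ones(2, 1, 1, 172)     # 1 where reference frames are valid
t_emb = torch.randn(2, 64)              # time embedding, e.g. a SinusoidalPosEmb output
cond = block(ref, ref_mask, t_emb)      # -> (2, 128): one pooled vector per reference utterance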
8 | 9 | import torch 10 | 11 | from model.base import BaseModule 12 | from model.modules import Mish 13 | 14 | 15 | class Block(BaseModule): 16 | def __init__(self, dim, groups=8): 17 | super(Block, self).__init__() 18 | self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim, 7, 19 | padding=3), torch.nn.GroupNorm(groups, dim), Mish()) 20 | 21 | def forward(self, x, mask): 22 | output = self.block(x * mask) 23 | return output * mask 24 | 25 | 26 | class ResnetBlock(BaseModule): 27 | def __init__(self, dim, groups=8): 28 | super(ResnetBlock, self).__init__() 29 | self.block1 = Block(dim, groups=groups) 30 | self.block2 = Block(dim, groups=groups) 31 | self.res = torch.nn.Conv2d(dim, dim, 1) 32 | 33 | def forward(self, x, mask): 34 | h = self.block1(x, mask) 35 | h = self.block2(h, mask) 36 | output = self.res(x * mask) + h 37 | return output 38 | 39 | 40 | class PostNet(BaseModule): 41 | def __init__(self, dim, groups=8): 42 | super(PostNet, self).__init__() 43 | self.init_conv = torch.nn.Conv2d(1, dim, 1) 44 | self.res_block = ResnetBlock(dim, groups=groups) 45 | self.final_conv = torch.nn.Conv2d(dim, 1, 1) 46 | 47 | def forward(self, x, mask): 48 | x = x.unsqueeze(1) 49 | mask = mask.unsqueeze(1) 50 | x = self.init_conv(x * mask) 51 | x = self.res_block(x, mask) 52 | output = self.final_conv(x * mask) 53 | return output.squeeze(1) 54 | -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
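# A minimal shape check for PostNet above (sketch; assumes model.postnet is importable; the
# repository itself constructs PostNet(enc_dim) inside model/vc.py):
import torch
from model.postnet import PostNet

postnet = PostNet(dim=16)               # dim must be divisible by the GroupNorm groups (default 8)
mel = torch.randn(2, 80, 212)           # (batch, n_mels, frames), e.g. the "average voice" encoder output
mask = torch.ones(2, 1, 212)            # 1 for valid frames
out = postnet(mel, mask)                # -> (2, 80, 212), a frame-wise refinement of the input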
8 | 9 | import torch 10 | import torchaudio 11 | import numpy as np 12 | from librosa.filters import mel as librosa_mel_fn 13 | 14 | from model.base import BaseModule 15 | 16 | 17 | def mse_loss(x, y, mask, n_feats): 18 | loss = torch.sum(((x - y)**2) * mask) 19 | return loss / (torch.sum(mask) * n_feats) 20 | 21 | 22 | def sequence_mask(length, max_length=None): 23 | if max_length is None: 24 | max_length = length.max() 25 | x = torch.arange(int(max_length), dtype=length.dtype, device=length.device) 26 | return x.unsqueeze(0) < length.unsqueeze(1) 27 | 28 | 29 | def convert_pad_shape(pad_shape): 30 | l = pad_shape[::-1] 31 | pad_shape = [item for sublist in l for item in sublist] 32 | return pad_shape 33 | 34 | 35 | def fix_len_compatibility(length, num_downsamplings_in_unet=2): 36 | while True: 37 | if length % (2**num_downsamplings_in_unet) == 0: 38 | return length 39 | length += 1 40 | 41 | 42 | class PseudoInversion(BaseModule): 43 | def __init__(self, n_mels, sampling_rate, n_fft): 44 | super(PseudoInversion, self).__init__() 45 | self.n_mels = n_mels 46 | self.sampling_rate = sampling_rate 47 | self.n_fft = n_fft 48 | mel_basis = librosa_mel_fn(sampling_rate, n_fft, n_mels, 0, 8000) 49 | mel_basis_inverse = np.linalg.pinv(mel_basis) 50 | mel_basis_inverse = torch.from_numpy(mel_basis_inverse).float() 51 | self.register_buffer("mel_basis_inverse", mel_basis_inverse) 52 | 53 | def forward(self, log_mel_spectrogram): 54 | mel_spectrogram = torch.exp(log_mel_spectrogram) 55 | stftm = torch.matmul(self.mel_basis_inverse, mel_spectrogram) 56 | return stftm 57 | 58 | 59 | class InitialReconstruction(BaseModule): 60 | def __init__(self, n_fft, hop_size): 61 | super(InitialReconstruction, self).__init__() 62 | self.n_fft = n_fft 63 | self.hop_size = hop_size 64 | window = torch.hann_window(n_fft).float() 65 | self.register_buffer("window", window) 66 | 67 | def forward(self, stftm): 68 | real_part = torch.ones_like(stftm, device=stftm.device) 69 | imag_part = torch.zeros_like(stftm, device=stftm.device) 70 | stft = torch.stack([real_part, imag_part], -1)*stftm.unsqueeze(-1) 71 | istft = torchaudio.functional.istft(stft, n_fft=self.n_fft, 72 | hop_length=self.hop_size, win_length=self.n_fft, 73 | window=self.window, center=True) 74 | return istft.unsqueeze(1) 75 | 76 | 77 | # Fast Griffin-Lim algorithm as a PyTorch module 78 | class FastGL(BaseModule): 79 | def __init__(self, n_mels, sampling_rate, n_fft, hop_size, momentum=0.99): 80 | super(FastGL, self).__init__() 81 | self.n_mels = n_mels 82 | self.sampling_rate = sampling_rate 83 | self.n_fft = n_fft 84 | self.hop_size = hop_size 85 | self.momentum = momentum 86 | self.pi = PseudoInversion(n_mels, sampling_rate, n_fft) 87 | self.ir = InitialReconstruction(n_fft, hop_size) 88 | window = torch.hann_window(n_fft).float() 89 | self.register_buffer("window", window) 90 | 91 | @torch.no_grad() 92 | def forward(self, s, n_iters=32): 93 | c = self.pi(s) 94 | x = self.ir(c) 95 | x = x.squeeze(1) 96 | c = c.unsqueeze(-1) 97 | prev_angles = torch.zeros_like(c, device=c.device) 98 | for _ in range(n_iters): 99 | s = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_size, 100 | win_length=self.n_fft, window=self.window, 101 | center=True) 102 | real_part, imag_part = s.unbind(-1) 103 | stftm = torch.sqrt(torch.clamp(real_part**2 + imag_part**2, min=1e-8)) 104 | angles = s / stftm.unsqueeze(-1) 105 | s = c * (angles + self.momentum * (angles - prev_angles)) 106 | x = torchaudio.functional.istft(s, n_fft=self.n_fft, hop_length=self.hop_size, 107 | 
win_length=self.n_fft, window=self.window, 108 | center=True) 109 | prev_angles = angles 110 | return x.unsqueeze(1) 111 | -------------------------------------------------------------------------------- /model/vc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import torch 10 | 11 | from model.base import BaseModule 12 | from model.encoder import MelEncoder 13 | from model.postnet import PostNet 14 | from model.diffusion import Diffusion 15 | from model.utils import sequence_mask, fix_len_compatibility, mse_loss 16 | 17 | 18 | # "average voice" encoder as the module parameterizing the diffusion prior 19 | class FwdDiffusion(BaseModule): 20 | def __init__(self, n_feats, channels, filters, heads, layers, kernel, 21 | dropout, window_size, dim): 22 | super(FwdDiffusion, self).__init__() 23 | self.n_feats = n_feats 24 | self.channels = channels 25 | self.filters = filters 26 | self.heads = heads 27 | self.layers = layers 28 | self.kernel = kernel 29 | self.dropout = dropout 30 | self.window_size = window_size 31 | self.dim = dim 32 | self.encoder = MelEncoder(n_feats, channels, filters, heads, layers, 33 | kernel, dropout, window_size) 34 | self.postnet = PostNet(dim) 35 | 36 | @torch.no_grad() 37 | def forward(self, x, mask): 38 | x, mask = self.relocate_input([x, mask]) 39 | z = self.encoder(x, mask) 40 | z_output = self.postnet(z, mask) 41 | return z_output 42 | 43 | def compute_loss(self, x, y, mask): 44 | x, y, mask = self.relocate_input([x, y, mask]) 45 | z = self.encoder(x, mask) 46 | z_output = self.postnet(z, mask) 47 | loss = mse_loss(z_output, y, mask, self.n_feats) 48 | return loss 49 | 50 | 51 | # the whole voice conversion model consisting of the "average voice" encoder 52 | # and the diffusion-based speaker-conditional decoder 53 | class DiffVC(BaseModule): 54 | def __init__(self, n_feats, channels, filters, heads, layers, kernel, 55 | dropout, window_size, enc_dim, spk_dim, use_ref_t, dec_dim, 56 | beta_min, beta_max): 57 | super(DiffVC, self).__init__() 58 | self.n_feats = n_feats 59 | self.channels = channels 60 | self.filters = filters 61 | self.heads = heads 62 | self.layers = layers 63 | self.kernel = kernel 64 | self.dropout = dropout 65 | self.window_size = window_size 66 | self.enc_dim = enc_dim 67 | self.spk_dim = spk_dim 68 | self.use_ref_t = use_ref_t 69 | self.dec_dim = dec_dim 70 | self.beta_min = beta_min 71 | self.beta_max = beta_max 72 | self.encoder = FwdDiffusion(n_feats, channels, filters, heads, layers, 73 | kernel, dropout, window_size, enc_dim) 74 | self.decoder = Diffusion(n_feats, dec_dim, spk_dim, use_ref_t, 75 | beta_min, beta_max) 76 | 77 | def load_encoder(self, enc_path): 78 | enc_dict = torch.load(enc_path, map_location=lambda loc, storage: loc) 79 | self.encoder.load_state_dict(enc_dict, strict=False) 80 | 81 | @torch.no_grad() 82 | def forward(self, x, x_lengths, x_ref, x_ref_lengths, c, n_timesteps, 83 | mode='ml'): 84 | """ 85 | Generates mel-spectrogram from source mel-spectrogram conditioned on 86 | target speaker embedding. Returns: 87 | 1. 
'average voice' encoder outputs 88 | 2. decoder outputs 89 | 90 | Args: 91 | x (torch.Tensor): batch of source mel-spectrograms. 92 | x_lengths (torch.Tensor): numbers of frames in source mel-spectrograms. 93 | x_ref (torch.Tensor): batch of reference mel-spectrograms. 94 | x_ref_lengths (torch.Tensor): numbers of frames in reference mel-spectrograms. 95 | c (torch.Tensor): batch of reference speaker embeddings 96 | n_timesteps (int): number of steps to use for reverse diffusion in decoder. 97 | mode (string, optional): sampling method. Can be one of: 98 | 'pf' - probability flow sampling (Euler scheme for ODE) 99 | 'em' - Euler-Maruyama SDE solver 100 | 'ml' - Maximum Likelihood SDE solver 101 | """ 102 | x, x_lengths = self.relocate_input([x, x_lengths]) 103 | x_ref, x_ref_lengths, c = self.relocate_input([x_ref, x_ref_lengths, c]) 104 | x_mask = sequence_mask(x_lengths).unsqueeze(1).to(x.dtype) 105 | x_ref_mask = sequence_mask(x_ref_lengths).unsqueeze(1).to(x_ref.dtype) 106 | mean = self.encoder(x, x_mask) 107 | mean_x = self.decoder.compute_diffused_mean(x, x_mask, mean, 1.0) 108 | mean_ref = self.encoder(x_ref, x_ref_mask) 109 | 110 | b = x.shape[0] 111 | max_length = int(x_lengths.max()) 112 | max_length_new = fix_len_compatibility(max_length) 113 | x_mask_new = sequence_mask(x_lengths, max_length_new).unsqueeze(1).to(x.dtype) 114 | mean_new = torch.zeros((b, self.n_feats, max_length_new), dtype=x.dtype, 115 | device=x.device) 116 | mean_x_new = torch.zeros((b, self.n_feats, max_length_new), dtype=x.dtype, 117 | device=x.device) 118 | for i in range(b): 119 | mean_new[i, :, :x_lengths[i]] = mean[i, :, :x_lengths[i]] 120 | mean_x_new[i, :, :x_lengths[i]] = mean_x[i, :, :x_lengths[i]] 121 | 122 | z = mean_x_new 123 | z += torch.randn_like(mean_x_new, device=mean_x_new.device) 124 | 125 | y = self.decoder(z, x_mask_new, mean_new, x_ref, x_ref_mask, mean_ref, c, 126 | n_timesteps, mode) 127 | return mean_x, y[:, :, :max_length] 128 | 129 | def compute_loss(self, x, x_lengths, x_ref, c): 130 | """ 131 | Computes diffusion (score matching) loss. 132 | 133 | Args: 134 | x (torch.Tensor): batch of source mel-spectrograms. 135 | x_lengths (torch.Tensor): numbers of frames in source mel-spectrograms. 136 | x_ref (torch.Tensor): batch of reference mel-spectrograms. 137 | c (torch.Tensor): batch of reference speaker embeddings 138 | """ 139 | x, x_lengths, x_ref, c = self.relocate_input([x, x_lengths, x_ref, c]) 140 | x_mask = sequence_mask(x_lengths).unsqueeze(1).to(x.dtype) 141 | mean = self.encoder(x, x_mask).detach() 142 | mean_ref = self.encoder(x_ref, x_mask).detach() 143 | diff_loss = self.decoder.compute_loss(x, x_mask, mean, x_ref, mean_ref, c) 144 | return diff_loss 145 | -------------------------------------------------------------------------------- /params.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 
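# A minimal end-to-end inference sketch for DiffVC.forward above, using the hyperparameters from
# params.py (listed below). The checkpoint path comes from the checkpts/ tree and is assumed to hold
# a plain state_dict (scenario/train_dec.py saves model.state_dict()); the speaker-embedding size is
# assumed to equal spk_dim here, while in practice it is fixed by the pretrained speaker encoder.
import torch
import params
from model.vc import DiffVC

model = DiffVC(params.n_mels, params.channels, params.filters, params.heads, params.layers,
               params.kernel, params.dropout, params.window_size, params.enc_dim,
               params.spk_dim, params.use_ref_t, params.dec_dim,
               params.beta_min, params.beta_max)
model.load_state_dict(torch.load('checkpts/vc/vc_libritts_wodyn.pt', map_location='cpu'))
model.eval()

mel_src = torch.randn(1, params.n_mels, 212)    # source log-mel spectrogram (batch, n_mels, frames)
mel_ref = torch.randn(1, params.n_mels, 180)    # reference (target-speaker) log-mel spectrogram
src_lengths = torch.LongTensor([212])
ref_lengths = torch.LongTensor([180])
spk_emb = torch.randn(1, params.spk_dim)        # stand-in for the target-speaker embedding (see speaker_encoder/)

mel_avg, mel_converted = model(mel_src, src_lengths, mel_ref, ref_lengths, spk_emb,
                               n_timesteps=30, mode='ml')
# mel_avg: "average voice" prediction; mel_converted: (1, 80, 212) converted spectrogram, which the
# HiFi-GAN vocoder (hifi-gan/) turns back into a waveform.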
8 | 9 | # data parameters 10 | n_mels = 80 11 | sampling_rate = 22050 12 | n_fft = 1024 13 | hop_size = 256 14 | 15 | # "average voice" encoder parameters 16 | channels = 192 17 | filters = 768 18 | layers = 6 19 | kernel = 3 20 | dropout = 0.1 21 | heads = 2 22 | window_size = 4 23 | enc_dim = 128 24 | 25 | # diffusion-based decoder parameters 26 | dec_dim = 256 27 | spk_dim = 128 28 | use_ref_t = True 29 | beta_min = 0.05 30 | beta_max = 20.0 31 | 32 | # training parameters 33 | seed = 37 34 | test_size = 1 35 | train_frames = 128 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datetime == 4.4 2 | datasets == 1.17.0 3 | protobuf == 3.19.4 4 | pydub == 0.25.1 5 | numpy == 1.21.5 6 | onnx == 1.11.0 7 | onnxruntime == 1.11.1 8 | requests == 2.22.0 9 | soundfile == 0.10.2 10 | uvicorn == 0.17.5 11 | gunicorn == 20.1.0 12 | fastapi == 0.81.0 13 | python-multipart == 0.0.5 14 | tritonclient[all] 15 | python-dotenv 16 | loguru 17 | inflect 18 | webrtcvad-wheels 19 | einops==0.3.0 20 | librosa==0.8.0 21 | tb-nightly 22 | future 23 | tqdm 24 | tgt 25 | matplotlib==3.7.2 -------------------------------------------------------------------------------- /run-container.sh: -------------------------------------------------------------------------------- 1 | IMAGE_NAME=diffvc 2 | CONTAINER_NAME=diff-vc-dev 3 | PORT=1402 4 | GPUS=all 5 | 6 | 7 | docker run -itd --gpus $GPUS \ 8 | --name $CONTAINER_NAME \ 9 | -p $PORT:$PORT \ 10 | -v $(pwd)/:/workspace \ 11 | $IMAGE_NAME -------------------------------------------------------------------------------- /scenario/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/scenario/__init__.py -------------------------------------------------------------------------------- /scenario/train_dec.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details.
8 | 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import torch 14 | from torch.utils.data import DataLoader 15 | 16 | import params 17 | from data import VCDecDataset, VCDecBatchCollate 18 | from model.vc import DiffVC 19 | from model.utils import FastGL 20 | from utils import save_plot, save_audio 21 | 22 | n_mels = params.n_mels 23 | sampling_rate = params.sampling_rate 24 | n_fft = params.n_fft 25 | hop_size = params.hop_size 26 | 27 | channels = params.channels 28 | filters = params.filters 29 | layers = params.layers 30 | kernel = params.kernel 31 | dropout = params.dropout 32 | heads = params.heads 33 | window_size = params.window_size 34 | enc_dim = params.enc_dim 35 | 36 | dec_dim = params.dec_dim 37 | spk_dim = params.spk_dim 38 | use_ref_t = params.use_ref_t 39 | beta_min = params.beta_min 40 | beta_max = params.beta_max 41 | 42 | random_seed = params.seed 43 | test_size = params.test_size 44 | 45 | data_dir = '../data/LibriTTS' 46 | val_file = 'filelists/valid.txt' 47 | exc_file = 'filelists/exceptions_libritts.txt' 48 | 49 | log_dir = 'logs_dec' 50 | enc_dir = 'logs_enc' 51 | epochs = 110 52 | batch_size = 32 53 | learning_rate = 1e-4 54 | save_every = 1 55 | 56 | 57 | if __name__ == "__main__": 58 | 59 | torch.manual_seed(random_seed) 60 | np.random.seed(random_seed) 61 | 62 | os.makedirs(log_dir, exist_ok=True) 63 | 64 | print('Initializing data loaders...') 65 | train_set = VCDecDataset(data_dir, val_file, exc_file) 66 | collate_fn = VCDecBatchCollate() 67 | train_loader = DataLoader(train_set, batch_size=batch_size, 68 | collate_fn=collate_fn, num_workers=4, drop_last=True) 69 | 70 | print('Initializing and loading models...') 71 | fgl = FastGL(n_mels, sampling_rate, n_fft, hop_size).cuda() 72 | model = DiffVC(n_mels, channels, filters, heads, layers, kernel, 73 | dropout, window_size, enc_dim, spk_dim, use_ref_t, 74 | dec_dim, beta_min, beta_max).cuda() 75 | model.load_encoder(os.path.join(enc_dir, 'enc.pt')) 76 | 77 | print('Encoder:') 78 | print(model.encoder) 79 | print('Number of parameters = %.2fm\n' % (model.encoder.nparams/1e6)) 80 | print('Decoder:') 81 | print(model.decoder) 82 | print('Number of parameters = %.2fm\n' % (model.decoder.nparams/1e6)) 83 | 84 | print('Initializing optimizers...') 85 | optimizer = torch.optim.Adam(params=model.decoder.parameters(), lr=learning_rate) 86 | 87 | print('Start training.') 88 | torch.backends.cudnn.benchmark = True 89 | iteration = 0 90 | for epoch in range(1, epochs + 1): 91 | print(f'Epoch: {epoch} [iteration: {iteration}]') 92 | model.train() 93 | losses = [] 94 | for batch in tqdm(train_loader, total=len(train_set)//batch_size): 95 | mel, mel_ref = batch['mel1'].cuda(), batch['mel2'].cuda() 96 | c, mel_lengths = batch['c'].cuda(), batch['mel_lengths'].cuda() 97 | model.zero_grad() 98 | loss = model.compute_loss(mel, mel_lengths, mel_ref, c) 99 | loss.backward() 100 | torch.nn.utils.clip_grad_norm_(model.decoder.parameters(), max_norm=1) 101 | optimizer.step() 102 | losses.append(loss.item()) 103 | iteration += 1 104 | 105 | losses = np.asarray(losses) 106 | msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) 107 | print(msg) 108 | with open(f'{log_dir}/train_dec.log', 'a') as f: 109 | f.write(msg) 110 | losses = [] 111 | 112 | if epoch % save_every > 0: 113 | continue 114 | 115 | model.eval() 116 | print('Inference...\n') 117 | with torch.no_grad(): 118 | mels = train_set.get_valid_dataset() 119 | for i, (mel, c) in enumerate(mels): 120 | if i >= test_size: 121 | break 122 | mel = 
mel.unsqueeze(0).float().cuda() 123 | c = c.unsqueeze(0).float().cuda() 124 | mel_lengths = torch.LongTensor([mel.shape[-1]]).cuda() 125 | mel_avg, mel_rec = model(mel, mel_lengths, mel, mel_lengths, c, 126 | n_timesteps=100) 127 | if epoch == save_every: 128 | save_plot(mel.squeeze().cpu(), f'{log_dir}/original_{i}.png') 129 | audio = fgl(mel) 130 | save_audio(f'{log_dir}/original_{i}.wav', sampling_rate, audio) 131 | save_plot(mel_avg.squeeze().cpu(), f'{log_dir}/average_{i}.png') 132 | audio = fgl(mel_avg) 133 | save_audio(f'{log_dir}/average_{i}.wav', sampling_rate, audio) 134 | save_plot(mel_rec.squeeze().cpu(), f'{log_dir}/reconstructed_{i}.png') 135 | audio = fgl(mel_rec) 136 | save_audio(f'{log_dir}/reconstructed_{i}.wav', sampling_rate, audio) 137 | 138 | print('Saving model...\n') 139 | ckpt = model.state_dict() 140 | torch.save(ckpt, f=f"{log_dir}/vc_{epoch}.pt") 141 | -------------------------------------------------------------------------------- /scenario/train_enc.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import os 10 | import numpy as np 11 | from tqdm import tqdm 12 | 13 | import torch 14 | from torch.utils.data import DataLoader 15 | 16 | import params 17 | from data import VCEncDataset, VCEncBatchCollate 18 | from model.vc import FwdDiffusion 19 | from model.utils import FastGL, sequence_mask 20 | from utils import save_plot, save_audio 21 | 22 | n_mels = params.n_mels 23 | sampling_rate = params.sampling_rate 24 | n_fft = params.n_fft 25 | hop_size = params.hop_size 26 | 27 | channels = params.channels 28 | filters = params.filters 29 | layers = params.layers 30 | kernel = params.kernel 31 | dropout = params.dropout 32 | heads = params.heads 33 | window_size = params.window_size 34 | dim = params.enc_dim 35 | 36 | random_seed = params.seed 37 | test_size = params.test_size 38 | 39 | data_dir = '../data/LibriTTS' 40 | exc_file = 'filelists/exceptions_libritts.txt' 41 | avg_type = 'mode' 42 | 43 | log_dir = 'logs_enc' 44 | epochs = 300 45 | batch_size = 128 46 | learning_rate = 5e-4 47 | save_every = 1 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | torch.manual_seed(random_seed) 53 | np.random.seed(random_seed) 54 | 55 | os.makedirs(log_dir, exist_ok=True) 56 | 57 | print('Initializing data loaders...') 58 | train_set = VCEncDataset(data_dir, exc_file, avg_type) 59 | collate_fn = VCEncBatchCollate() 60 | train_loader = DataLoader(train_set, batch_size=batch_size, 61 | collate_fn=collate_fn, num_workers=4, 62 | drop_last=True) 63 | 64 | print('Initializing models...') 65 | fgl = FastGL(n_mels, sampling_rate, n_fft, hop_size).cuda() 66 | model = FwdDiffusion(n_mels, channels, filters, heads, layers, kernel, 67 | dropout, window_size, dim).cuda() 68 | 69 | print('Encoder:') 70 | print(model) 71 | print('Number of parameters = %.2fm\n' % (model.nparams/1e6)) 72 | 73 | print('Initializing optimizers...') 74 | optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate) 75 | 76 | print('Start training.') 77 | torch.backends.cudnn.benchmark = True 78 | iteration = 0 79 | for epoch 
in range(1, epochs + 1): 80 | print(f'Epoch: {epoch} [iteration: {iteration}]') 81 | model.train() 82 | losses = [] 83 | for batch in tqdm(train_loader, total=len(train_set)//batch_size): 84 | mel_x, mel_y = batch['x'].cuda(), batch['y'].cuda() 85 | mel_lengths = batch['lengths'].cuda() 86 | mel_mask = sequence_mask(mel_lengths).unsqueeze(1).to(mel_x.dtype) 87 | 88 | model.zero_grad() 89 | loss = model.compute_loss(mel_x, mel_y, mel_mask) 90 | loss.backward() 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) 92 | optimizer.step() 93 | 94 | losses.append(loss.item()) 95 | iteration += 1 96 | 97 | losses = np.asarray(losses) 98 | msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) 99 | print(msg) 100 | with open(f'{log_dir}/train_enc.log', 'a') as f: 101 | f.write(msg) 102 | losses = [] 103 | 104 | if epoch % save_every > 0: 105 | continue 106 | 107 | model.eval() 108 | print('Inference...\n') 109 | with torch.no_grad(): 110 | mels = train_set.get_test_dataset() 111 | for i, (mel_x, mel_y) in enumerate(mels): 112 | if i >= test_size: 113 | break 114 | mel_x = mel_x.unsqueeze(0).float().cuda() 115 | mel_y = mel_y.unsqueeze(0).float().cuda() 116 | mel_lengths = torch.LongTensor([mel_x.shape[-1]]).cuda() 117 | mel_mask = sequence_mask(mel_lengths).unsqueeze(1).to(mel_x.dtype) 118 | mel = model(mel_x, mel_mask) 119 | save_plot(mel.squeeze().cpu(), f'{log_dir}/generated_{i}.png') 120 | audio = fgl(mel) 121 | save_audio(f'{log_dir}/generated_{i}.wav', sampling_rate, audio) 122 | if epoch == save_every: 123 | save_plot(mel_x.squeeze().cpu(), f'{log_dir}/source_{i}.png') 124 | audio = fgl(mel_x) 125 | save_audio(f'{log_dir}/source_{i}.wav', sampling_rate, audio) 126 | save_plot(mel_y.squeeze().cpu(), f'{log_dir}/target_{i}.png') 127 | audio = fgl(mel_y) 128 | save_audio(f'{log_dir}/target_{i}.wav', sampling_rate, audio) 129 | 130 | print('Saving model...\n') 131 | ckpt = model.state_dict() 132 | torch.save(ckpt, f=f"{log_dir}/enc.pt") 133 | -------------------------------------------------------------------------------- /speaker_encoder/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /speaker_encoder/README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Voice Cloning 2 | This repository is an implementation of [Transfer Learning from Speaker Verification to 3 | Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801). 4 | 5 | SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text. 6 | 7 | **Video demonstration** (click the picture): 8 | 9 | [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) 10 | 11 | 12 | 13 | ### Papers implemented 14 | | URL | Designation | Title | Implementation source | 15 | | --- | ----------- | ----- | --------------------- | 16 | |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | 17 | |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | 18 | |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) 19 | |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | 20 | 21 | ## News 22 | **10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion. 23 | 24 | **28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below. 25 | 26 | **14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. 27 | 28 | **13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this: 29 | - **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors. 30 | - **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info. 31 | 32 | **20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it. 
33 | 34 | 35 | ## Setup 36 | 37 | ### 1. Install Requirements 38 | 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory. 39 | 2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional. 40 | 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files. 41 | 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command. 42 | 5. Install the remaining requirements with `pip install -r requirements.txt` 43 | 44 | ### 2. (Optional) Download Pretrained Models 45 | Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 46 | 47 | ### 3. (Optional) Test Configuration 48 | Before you download any dataset, you can begin by testing your configuration with: 49 | 50 | `python demo_cli.py` 51 | 52 | If all tests pass, you're good to go. 53 | 54 | ### 4. (Optional) Download Datasets 55 | For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. 56 | 57 | ### 5. Launch the Toolbox 58 | You can then try the toolbox: 59 | 60 | `python demo_toolbox.py -d ` 61 | or 62 | `python demo_toolbox.py` 63 | 64 | depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). 
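Within Diff-VC itself, this encoder is only used at inference time to produce the target-speaker embedding passed to the voice-conversion model. A minimal sketch, assuming `speaker_encoder/` has been added to `sys.path` (its modules import the package as `encoder`) and that the pretrained weights sit at `checkpts/spk_encoder/pretrained.pt` as in the checkpoint tree:

```python
import sys
sys.path.append('speaker_encoder')   # lets the package-internal "from encoder..." imports resolve
from pathlib import Path
from encoder import inference as spk_encoder

spk_encoder.load_model(Path('checkpts/spk_encoder/pretrained.pt'), device='cpu')
wav = spk_encoder.preprocess_wav('example/8534_216567_000015_000010.wav')
spk_emb = spk_encoder.embed_utterance(wav)   # L2-normalised float32 vector for one utterance
```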
65 | -------------------------------------------------------------------------------- /speaker_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinhtuanvubk/Diff-VC/d7d9d5f61ef51caf67af56707cd9143c17b69900/speaker_encoder/__init__.py -------------------------------------------------------------------------------- /speaker_encoder/encoder/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/audio.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from scipy.ndimage.morphology import binary_dilation 4 | from encoder.params_data import * 5 | from pathlib import Path 6 | from typing import Optional, Union 7 | import numpy as np 8 | import webrtcvad 9 | import librosa 10 | import struct 11 | 12 | import torch 13 | from torchaudio.transforms import Resample 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | 17 | int16_max = (2 ** 15) - 1 18 | 19 | 20 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 21 | source_sr: Optional[int] = None): 22 | """ 23 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 24 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 25 | 26 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 27 | just .wav), either the waveform as a numpy array of floats. 28 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 29 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 30 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 31 | this argument will be ignored. 32 | """ 33 | # Load the wav from disk if needed 34 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 35 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 36 | else: 37 | wav = fpath_or_wav 38 | 39 | # Resample the wav if needed 40 | if source_sr is not None and source_sr != sampling_rate: 41 | wav = librosa.resample(wav, source_sr, sampling_rate) 42 | 43 | # Apply the preprocessing: normalize volume and shorten long silences 44 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 45 | wav = trim_long_silences(wav) 46 | 47 | return wav 48 | 49 | 50 | def preprocess_wav_batch(wavs, source_sr=22050): 51 | # This torch version is designed to cope with a batch of same lengths wavs 52 | if sampling_rate != source_sr: 53 | resample = Resample(source_sr, sampling_rate) 54 | wavs = resample(wavs) 55 | wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS, 56 | increase_only=True) 57 | # Trimming silence is not implemented in this version yet! 58 | return wavs_preprocessed 59 | 60 | 61 | def wav_to_mel_spectrogram(wav): 62 | """ 63 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 64 | Note: this not a log-mel spectrogram. 
65 | """ 66 | frames = librosa.feature.melspectrogram( 67 | wav, 68 | sampling_rate, 69 | n_fft=int(sampling_rate * mel_window_length / 1000), 70 | hop_length=int(sampling_rate * mel_window_step / 1000), 71 | n_mels=mel_n_channels 72 | ) 73 | return frames.astype(np.float32).T 74 | 75 | 76 | def wav_to_mel_spectrogram_batch(wavs): 77 | # This torch version is designed to cope with a batch of same lengths wavs 78 | n_fft = int(sampling_rate * mel_window_length / 1000) 79 | hop_length = int(sampling_rate * mel_window_step / 1000) 80 | win_length = int(sampling_rate * mel_window_length / 1000) 81 | window = torch.hann_window(n_fft).to(wavs) 82 | mel_basis = torch.from_numpy(librosa_mel_fn(sampling_rate, n_fft, 83 | mel_n_channels)).to(wavs) 84 | s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length, 85 | win_length=win_length, window=window, center=True) 86 | real_part, imag_part = s.unbind(-1) 87 | stftm = real_part**2 + imag_part**2 88 | mels = torch.matmul(mel_basis, stftm) 89 | return torch.transpose(mels, 1, 2) 90 | 91 | 92 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 93 | if increase_only and decrease_only: 94 | raise ValueError("Both increase only and decrease only are set") 95 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 96 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 97 | return wav 98 | return wav * (10 ** (dBFS_change / 20)) 99 | 100 | 101 | def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False): 102 | # This torch version is designed to cope with a batch of same lengths wavs 103 | if increase_only and decrease_only: 104 | raise ValueError("Both increase only and decrease only are set") 105 | dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1)) 106 | scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype) 107 | if increase_only: 108 | mask = (dBFS_change > 0).to(scales) 109 | elif decrease_only: 110 | mask = (dBFS_change < 0).to(scales) 111 | else: 112 | mask = torch.zeros_like(scales) 113 | scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0) 114 | return wavs * scales.unsqueeze(-1) 115 | 116 | 117 | def trim_long_silences(wav): 118 | """ 119 | Ensures that segments without voice in the waveform remain no longer than a 120 | threshold determined by the VAD parameters in params.py. 
121 | 122 | :param wav: the raw waveform as a numpy array of floats 123 | :return: the same waveform with silences trimmed away (length <= original wav length) 124 | """ 125 | # Compute the voice detection window size 126 | samples_per_window = (vad_window_length * sampling_rate) // 1000 127 | 128 | # Trim the end of the audio to have a multiple of the window size 129 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 130 | 131 | # Convert the float waveform to 16-bit mono PCM 132 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 133 | 134 | # Perform voice activation detection 135 | voice_flags = [] 136 | vad = webrtcvad.Vad(mode=3) 137 | for window_start in range(0, len(wav), samples_per_window): 138 | window_end = window_start + samples_per_window 139 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 140 | sample_rate=sampling_rate)) 141 | voice_flags = np.array(voice_flags) 142 | 143 | # Smooth the voice detection with a moving average 144 | def moving_average(array, width): 145 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 146 | ret = np.cumsum(array_padded, dtype=float) 147 | ret[width:] = ret[width:] - ret[:-width] 148 | return ret[width - 1:] / width 149 | 150 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 151 | audio_mask = np.round(audio_mask).astype(np.bool) 152 | 153 | # Dilate the voiced regions 154 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 155 | audio_mask = np.repeat(audio_mask, samples_per_window) 156 | 157 | return wav[audio_mask == True] 158 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/config.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | librispeech_datasets = { 4 | "train": { 5 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 6 | "other": ["LibriSpeech/train-other-500"] 7 | }, 8 | "test": { 9 | "clean": ["LibriSpeech/test-clean"], 10 | "other": ["LibriSpeech/test-other"] 11 | }, 12 | "dev": { 13 | "clean": ["LibriSpeech/dev-clean"], 14 | "other": ["LibriSpeech/dev-other"] 15 | }, 16 | } 17 | libritts_datasets = { 18 | "train": { 19 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 20 | "other": ["LibriTTS/train-other-500"] 21 | }, 22 | "test": { 23 | "clean": ["LibriTTS/test-clean"], 24 | "other": ["LibriTTS/test-other"] 25 | }, 26 | "dev": { 27 | "clean": ["LibriTTS/dev-clean"], 28 | "other": ["LibriTTS/dev-other"] 29 | }, 30 | } 31 | voxceleb_datasets = { 32 | "voxceleb1" : { 33 | "train": ["VoxCeleb1/wav"], 34 | "test": ["VoxCeleb1/test_wav"] 35 | }, 36 | "voxceleb2" : { 37 | "train": ["VoxCeleb2/dev/aac"], 38 | "test": ["VoxCeleb2/test_wav"] 39 | } 40 | } 41 | 42 | other_datasets = [ 43 | "LJSpeech-1.1", 44 | "VCTK-Corpus/wav48", 45 | ] 46 | 47 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 48 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 4 | from encoder.data_objects.speaker_verification_dataset import 
SpeakerVerificationDataLoader 5 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/random_cycler.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import random 4 | 5 | class RandomCycler: 6 | """ 7 | Creates an internal copy of a sequence and allows access to its items in a constrained random 8 | order. For a source sequence of n items and one or several consecutive queries of a total 9 | of m items, the following guarantees hold (one implies the other): 10 | - Each item will be returned between m // n and ((m - 1) // n) + 1 times. 11 | - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. 12 | """ 13 | 14 | def __init__(self, source): 15 | if len(source) == 0: 16 | raise Exception("Can't create RandomCycler from an empty collection") 17 | self.all_items = list(source) 18 | self.next_items = [] 19 | 20 | def sample(self, count: int): 21 | shuffle = lambda l: random.sample(l, len(l)) 22 | 23 | out = [] 24 | while count > 0: 25 | if count >= len(self.all_items): 26 | out.extend(shuffle(list(self.all_items))) 27 | count -= len(self.all_items) 28 | continue 29 | n = min(count, len(self.next_items)) 30 | out.extend(self.next_items[:n]) 31 | count -= n 32 | self.next_items = self.next_items[n:] 33 | if len(self.next_items) == 0: 34 | self.next_items = shuffle(list(self.all_items)) 35 | return out 36 | 37 | def __next__(self): 38 | return self.sample(1)[0] 39 | 40 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.random_cycler import RandomCycler 4 | from encoder.data_objects.utterance import Utterance 5 | from pathlib import Path 6 | 7 | # Contains the set of utterances of a single speaker 8 | class Speaker: 9 | def __init__(self, root: Path): 10 | self.root = root 11 | self.name = root.name 12 | self.utterances = None 13 | self.utterance_cycler = None 14 | 15 | def _load_utterances(self): 16 | with self.root.joinpath("_sources.txt").open("r") as sources_file: 17 | sources = [l.split(",") for l in sources_file] 18 | sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} 19 | self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] 20 | self.utterance_cycler = RandomCycler(self.utterances) 21 | 22 | def random_partial(self, count, n_frames): 23 | """ 24 | Samples a batch of unique partial utterances from the disk in a way that all 25 | utterances come up at least once every two cycles and in a random order every time. 26 | 27 | :param count: The number of partial utterances to sample from the set of utterances from 28 | that speaker. Utterances are guaranteed not to be repeated if is not larger than 29 | the number of utterances available. 30 | :param n_frames: The number of frames in the partial utterance. 31 | :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 32 | frames are the frames of the partial utterances and range is the range of the partial 33 | utterance with regard to the complete utterance. 
34 | """ 35 | if self.utterances is None: 36 | self._load_utterances() 37 | 38 | utterances = self.utterance_cycler.sample(count) 39 | 40 | a = [(u,) + u.random_partial(n_frames) for u in utterances] 41 | 42 | return a 43 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker_batch.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | from typing import List 5 | from encoder.data_objects.speaker import Speaker 6 | 7 | class SpeakerBatch: 8 | def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): 9 | self.speakers = speakers 10 | self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} 11 | 12 | # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with 13 | # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) 14 | self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) 15 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/speaker_verification_dataset.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.random_cycler import RandomCycler 4 | from encoder.data_objects.speaker_batch import SpeakerBatch 5 | from encoder.data_objects.speaker import Speaker 6 | from encoder.params_data import partials_n_frames 7 | from torch.utils.data import Dataset, DataLoader 8 | from pathlib import Path 9 | 10 | # TODO: improve with a pool of speakers for data efficiency 11 | 12 | class SpeakerVerificationDataset(Dataset): 13 | def __init__(self, datasets_root: Path): 14 | self.root = datasets_root 15 | speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] 16 | if len(speaker_dirs) == 0: 17 | raise Exception("No speakers found. 
Make sure you are pointing to the directory " 18 | "containing all preprocessed speaker directories.") 19 | self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] 20 | self.speaker_cycler = RandomCycler(self.speakers) 21 | 22 | def __len__(self): 23 | return int(1e10) 24 | 25 | def __getitem__(self, index): 26 | return next(self.speaker_cycler) 27 | 28 | def get_logs(self): 29 | log_string = "" 30 | for log_fpath in self.root.glob("*.txt"): 31 | with log_fpath.open("r") as log_file: 32 | log_string += "".join(log_file.readlines()) 33 | return log_string 34 | 35 | 36 | class SpeakerVerificationDataLoader(DataLoader): 37 | def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, 38 | batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, 39 | worker_init_fn=None): 40 | self.utterances_per_speaker = utterances_per_speaker 41 | 42 | super().__init__( 43 | dataset=dataset, 44 | batch_size=speakers_per_batch, 45 | shuffle=False, 46 | sampler=sampler, 47 | batch_sampler=batch_sampler, 48 | num_workers=num_workers, 49 | collate_fn=self.collate, 50 | pin_memory=pin_memory, 51 | drop_last=False, 52 | timeout=timeout, 53 | worker_init_fn=worker_init_fn 54 | ) 55 | 56 | def collate(self, speakers): 57 | return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) 58 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/data_objects/utterance.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | 5 | 6 | class Utterance: 7 | def __init__(self, frames_fpath, wave_fpath): 8 | self.frames_fpath = frames_fpath 9 | self.wave_fpath = wave_fpath 10 | 11 | def get_frames(self): 12 | return np.load(self.frames_fpath) 13 | 14 | def random_partial(self, n_frames): 15 | """ 16 | Crops the frames into a partial utterance of n_frames 17 | 18 | :param n_frames: The number of frames of the partial utterance 19 | :return: the partial utterance frames and a tuple indicating the start and end of the 20 | partial utterance in the complete utterance. 21 | """ 22 | frames = self.get_frames() 23 | if frames.shape[0] == n_frames: 24 | start = 0 25 | else: 26 | start = np.random.randint(0, frames.shape[0] - n_frames) 27 | end = start + n_frames 28 | return frames[start:end], (start, end) -------------------------------------------------------------------------------- /speaker_encoder/encoder/inference.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.params_data import * 4 | from encoder.model import SpeakerEncoder 5 | from encoder.audio import preprocess_wav, preprocess_wav_batch 6 | from matplotlib import cm 7 | from encoder import audio 8 | from pathlib import Path 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import torch 12 | 13 | _model = None # type: SpeakerEncoder 14 | _device = None # type: torch.device 15 | 16 | 17 | def load_model(weights_fpath: Path, device="cpu"): 18 | """ 19 | Loads the model in memory. If this function is not explicitely called, it will be run on the 20 | first call to embed_frames() with the default weights file. 21 | 22 | :param weights_fpath: the path to saved model weights. 23 | :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). 
The 24 | model will be loaded and will run on this device. Outputs will however always be on the cpu. 25 | If None, will default to your GPU if it"s available, otherwise your CPU. 26 | """ 27 | # TODO: I think the slow loading of the encoder might have something to do with the device it 28 | # was saved on. Worth investigating. 29 | global _model, _device 30 | if device is None: 31 | _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | elif isinstance(device, str): 33 | _device = torch.device(device) 34 | _model = SpeakerEncoder(_device, torch.device("cpu")) 35 | checkpoint = torch.load(weights_fpath, map_location="cpu") 36 | _model.load_state_dict(checkpoint["model_state"]) 37 | _model.eval() 38 | print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) 39 | 40 | 41 | def is_loaded(): 42 | return _model is not None 43 | 44 | 45 | def embed_frames_batch(frames, use_torch=False): 46 | if _model is None: 47 | raise Exception("Model was not loaded. Call load_model() before inference.") 48 | 49 | if not use_torch: 50 | frames = torch.from_numpy(frames) 51 | frames = frames.to(_device) 52 | print(frames.shape) 53 | embeds = _model.forward(frames) 54 | print(embeds.shape) 55 | if not use_torch: 56 | embeds = embeds.detach().cpu().numpy() 57 | return embeds 58 | 59 | 60 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, 61 | min_pad_coverage=0.75, overlap=0.5): 62 | """ 63 | Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 64 | partial utterances of each. Both the waveform and the mel 65 | spectrogram slices are returned, so as to make each partial utterance waveform correspond to 66 | its spectrogram. This function assumes that the mel spectrogram parameters used are those 67 | defined in params_data.py. 68 | 69 | The returned ranges may be indexing further than the length of the waveform. It is 70 | recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 71 | 72 | :param n_samples: the number of samples in the waveform 73 | :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 74 | utterance 75 | :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 76 | enough frames. If at least of are present, 77 | then the last partial utterance will be considered, as if we padded the audio. Otherwise, 78 | it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 79 | utterance, this parameter is ignored so that the function always returns at least 1 slice. 80 | :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 81 | utterances are entirely disjoint. 82 | :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 83 | respectively the waveform and the mel spectrogram with these slices to obtain the partial 84 | utterances. 
85 | """ 86 | assert 0 <= overlap < 1 87 | assert 0 < min_pad_coverage <= 1 88 | 89 | samples_per_frame = int((sampling_rate * mel_window_step / 1000)) 90 | n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) 91 | frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) 92 | 93 | # Compute the slices 94 | wav_slices, mel_slices = [], [] 95 | steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) 96 | for i in range(0, steps, frame_step): 97 | mel_range = np.array([i, i + partial_utterance_n_frames]) 98 | wav_range = mel_range * samples_per_frame 99 | mel_slices.append(slice(*mel_range)) 100 | wav_slices.append(slice(*wav_range)) 101 | 102 | # Evaluate whether extra padding is warranted or not 103 | last_wav_range = wav_slices[-1] 104 | coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) 105 | if coverage < min_pad_coverage and len(mel_slices) > 1: 106 | mel_slices = mel_slices[:-1] 107 | wav_slices = wav_slices[:-1] 108 | 109 | return wav_slices, mel_slices 110 | 111 | 112 | def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): 113 | """ 114 | Computes an embedding for a single utterance. 115 | 116 | # TODO: handle multiple wavs to benefit from batching on GPU 117 | :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 118 | :param using_partials: if True, then the utterance is split in partial utterances of 119 | partial_utterance_n_frames frames and the utterance embedding is computed from their 120 | normalized average. If False, the embedding is instead computed from feeding the entire 121 | spectrogram to the network. 122 | :param return_partials: if True, the partial embeddings will also be returned along with the 123 | wav slices that correspond to the partial embeddings. 124 | :param kwargs: additional arguments to compute_partial_slices() 125 | :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 126 | return_partials is True, the partial embeddings as a numpy array of float32 of shape 127 | (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 128 | returned. If using_partials is simultaneously set to False, both these values will be None 129 | instead.
130 | """ 131 | # Process the entire utterance if not using partials 132 | if not using_partials: 133 | frames = audio.wav_to_mel_spectrogram(wav) 134 | embed = embed_frames_batch(frames[None, ...])[0] 135 | if return_partials: 136 | return embed, None, None 137 | return embed 138 | 139 | # Compute where to split the utterance into partials and pad if necessary 140 | wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) 141 | max_wave_length = wave_slices[-1].stop 142 | if max_wave_length >= len(wav): 143 | wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") 144 | 145 | # Split the utterance into partials 146 | frames = audio.wav_to_mel_spectrogram(wav) 147 | frames_batch = np.array([frames[s] for s in mel_slices]) 148 | partial_embeds = embed_frames_batch(frames_batch) 149 | 150 | # Compute the utterance embedding from the partial embeddings 151 | raw_embed = np.mean(partial_embeds, axis=0) 152 | embed = raw_embed / np.linalg.norm(raw_embed, 2) 153 | 154 | if return_partials: 155 | return embed, partial_embeds, wave_slices 156 | return embed 157 | 158 | 159 | def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs): 160 | # This torch version is designed to cope with a batch of same lengths wavs 161 | if not using_partials: 162 | print(wavs.shape) 163 | frames = audio.wav_to_mel_spectrogram_batch(wavs) 164 | embeds = embed_frames_batch(frames) 165 | if return_partials: 166 | return embeds, None, None 167 | return embeds 168 | 169 | wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs) 170 | max_wave_length = wave_slices[-1].stop 171 | if max_wave_length >= wavs.shape[-1]: 172 | wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]), 173 | dtype=wavs.dtype, device=wavs.device)], 1) 174 | 175 | frames = audio.wav_to_mel_spectrogram_batch(wavs) 176 | frames_batch = [] 177 | for i in range(len(frames)): 178 | frames_batch += [frames[i][s] for s in mel_slices] 179 | frames_batch = torch.stack(frames_batch, 0) 180 | partial_embeds = embed_frames_batch(frames_batch, use_torch=True) 181 | partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1) 182 | 183 | raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False) 184 | embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True) 185 | 186 | if return_partials: 187 | return embeds, partial_embeds, wave_slices 188 | return embeds 189 | 190 | 191 | def embed_speaker(wavs, **kwargs): 192 | raise NotImplemented() 193 | 194 | 195 | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): 196 | if ax is None: 197 | ax = plt.gca() 198 | 199 | if shape is None: 200 | height = int(np.sqrt(len(embed))) 201 | shape = (height, -1) 202 | embed = embed.reshape(shape) 203 | 204 | cmap = cm.get_cmap() 205 | mappable = ax.imshow(embed, cmap=cmap) 206 | cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) 207 | cbar.set_clim(*color_range) 208 | 209 | ax.set_xticks([]), ax.set_yticks([]) 210 | ax.set_title(title) 211 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.params_model import * 4 | from encoder.params_data import * 5 | from scipy.interpolate import interp1d 6 | from sklearn.metrics import roc_curve 7 | from torch.nn.utils import 
clip_grad_norm_ 8 | from scipy.optimize import brentq 9 | from torch import nn 10 | import numpy as np 11 | import torch 12 | 13 | 14 | class SpeakerEncoder(nn.Module): 15 | def __init__(self, device, loss_device): 16 | super().__init__() 17 | self.loss_device = loss_device 18 | 19 | # Network defition 20 | self.lstm = nn.LSTM(input_size=mel_n_channels, 21 | hidden_size=model_hidden_size, 22 | num_layers=model_num_layers, 23 | batch_first=True).to(device) 24 | self.linear = nn.Linear(in_features=model_hidden_size, 25 | out_features=model_embedding_size).to(device) 26 | self.relu = torch.nn.ReLU().to(device) 27 | 28 | # Cosine similarity scaling (with fixed initial parameter values) 29 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 30 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 31 | 32 | # Loss 33 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 34 | 35 | def do_gradient_ops(self): 36 | # Gradient scale 37 | self.similarity_weight.grad *= 0.01 38 | self.similarity_bias.grad *= 0.01 39 | 40 | # Gradient clipping 41 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 42 | 43 | def forward(self, utterances, hidden_init=None): 44 | """ 45 | Computes the embeddings of a batch of utterance spectrograms. 46 | 47 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 48 | (batch_size, n_frames, n_channels) 49 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 50 | batch_size, hidden_size). Will default to a tensor of zeros if None. 51 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 52 | """ 53 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 54 | # and the final cell state. 55 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 56 | 57 | # We take only the hidden state of the last layer 58 | embeds_raw = self.relu(self.linear(hidden[-1])) 59 | 60 | # L2-normalize it 61 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 62 | 63 | return embeds 64 | 65 | def similarity_matrix(self, embeds): 66 | """ 67 | Computes the similarity matrix according the section 2.1 of GE2E. 68 | 69 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 70 | utterances_per_speaker, embedding_size) 71 | :return: the similarity matrix as a tensor of shape (speakers_per_batch, 72 | utterances_per_speaker, speakers_per_batch) 73 | """ 74 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 75 | 76 | # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation 77 | centroids_incl = torch.mean(embeds, dim=1, keepdim=True) 78 | centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) 79 | 80 | # Exclusive centroids (1 per utterance) 81 | centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) 82 | centroids_excl /= (utterances_per_speaker - 1) 83 | centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) 84 | 85 | # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot 86 | # product of these vectors (which is just an element-wise multiplication reduced by a sum). 87 | # We vectorize the computation for efficiency. 
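        # In GE2E terms (section 2.1), the loop below fills S[j, i, k] = cos(e_ji, c_k) for k != j
        # using the inclusive centroid of speaker k, and S[j, i, j] = cos(e_ji, c_j^(-i)) using the
        # exclusive centroid that leaves utterance i out; the learned scale (similarity_weight)
        # and bias (similarity_bias) are applied to the whole matrix after the loop.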
88 | sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, 89 | speakers_per_batch).to(self.loss_device) 90 | mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) 91 | for j in range(speakers_per_batch): 92 | mask = np.where(mask_matrix[j])[0] 93 | sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) 94 | sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) 95 | 96 | ## Even more vectorized version (slower maybe because of transpose) 97 | # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker 98 | # ).to(self.loss_device) 99 | # eye = np.eye(speakers_per_batch, dtype=np.int) 100 | # mask = np.where(1 - eye) 101 | # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) 102 | # mask = np.where(eye) 103 | # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) 104 | # sim_matrix2 = sim_matrix2.transpose(1, 2) 105 | 106 | sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias 107 | return sim_matrix 108 | 109 | def loss(self, embeds): 110 | """ 111 | Computes the softmax loss according the section 2.1 of GE2E. 112 | 113 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 114 | utterances_per_speaker, embedding_size) 115 | :return: the loss and the EER for this batch of embeddings. 116 | """ 117 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 118 | 119 | # Loss 120 | sim_matrix = self.similarity_matrix(embeds) 121 | sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 122 | speakers_per_batch)) 123 | ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) 124 | target = torch.from_numpy(ground_truth).long().to(self.loss_device) 125 | loss = self.loss_fn(sim_matrix, target) 126 | 127 | # EER (not backpropagated) 128 | with torch.no_grad(): 129 | inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] 130 | labels = np.array([inv_argmax(i) for i in ground_truth]) 131 | preds = sim_matrix.detach().cpu().numpy() 132 | 133 | # Snippet from https://yangcha.github.io/EER-ROC/ 134 | fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) 135 | eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 136 | 137 | return loss, eer -------------------------------------------------------------------------------- /speaker_encoder/encoder/params_data.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | ## Mel-filterbank 4 | mel_window_length = 25 # In milliseconds 5 | mel_window_step = 10 # In milliseconds 6 | mel_n_channels = 40 7 | 8 | 9 | ## Audio 10 | sampling_rate = 16000 11 | # Number of spectrogram frames in a partial utterance 12 | partials_n_frames = 160 # 1600 ms 13 | # Number of spectrogram frames at inference 14 | inference_n_frames = 80 # 800 ms 15 | 16 | 17 | ## Voice Activation Detection 18 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 19 | # This sets the granularity of the VAD. Should not need to be changed. 20 | vad_window_length = 30 # In milliseconds 21 | # Number of frames to average together when performing the moving average smoothing. 22 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 23 | vad_moving_average_width = 8 24 | # Maximum number of consecutive silent frames a segment can have. 
25 | vad_max_silence_length = 6 26 | 27 | 28 | ## Audio volume normalization 29 | audio_norm_target_dBFS = -30 30 | 31 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/params_model.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | ## Model parameters 4 | model_hidden_size = 256 5 | model_embedding_size = 256 6 | model_num_layers = 3 7 | 8 | 9 | ## Training parameters 10 | learning_rate_init = 1e-4 11 | speakers_per_batch = 64 12 | utterances_per_speaker = 10 13 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/preprocess.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from multiprocess.pool import ThreadPool 4 | from encoder.params_data import * 5 | from encoder.config import librispeech_datasets, anglophone_nationalites 6 | from datetime import datetime 7 | from encoder import audio 8 | from pathlib import Path 9 | from tqdm import tqdm 10 | import numpy as np 11 | 12 | 13 | class DatasetLog: 14 | """ 15 | Registers metadata about the dataset in a text file. 16 | """ 17 | def __init__(self, root, name): 18 | self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") 19 | self.sample_data = dict() 20 | 21 | start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 22 | self.write_line("Creating dataset %s on %s" % (name, start_time)) 23 | self.write_line("-----") 24 | self._log_params() 25 | 26 | def _log_params(self): 27 | from encoder import params_data 28 | self.write_line("Parameter values:") 29 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 30 | value = getattr(params_data, param_name) 31 | self.write_line("\t%s: %s" % (param_name, value)) 32 | self.write_line("-----") 33 | 34 | def write_line(self, line): 35 | self.text_file.write("%s\n" % line) 36 | 37 | def add_sample(self, **kwargs): 38 | for param_name, value in kwargs.items(): 39 | if not param_name in self.sample_data: 40 | self.sample_data[param_name] = [] 41 | self.sample_data[param_name].append(value) 42 | 43 | def finalize(self): 44 | self.write_line("Statistics:") 45 | for param_name, values in self.sample_data.items(): 46 | self.write_line("\t%s:" % param_name) 47 | self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) 48 | self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) 49 | self.write_line("-----") 50 | end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 51 | self.write_line("Finished on %s" % end_time) 52 | self.text_file.close() 53 | 54 | 55 | def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): 56 | dataset_root = datasets_root.joinpath(dataset_name) 57 | if not dataset_root.exists(): 58 | print("Couldn\'t find %s, skipping this dataset." % dataset_root) 59 | return None, None 60 | return dataset_root, DatasetLog(out_dir, dataset_name) 61 | 62 | 63 | def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, 64 | skip_existing, logger): 65 | print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) 66 | 67 | # Function to preprocess utterances for one speaker 68 | def preprocess_speaker(speaker_dir: Path): 69 | # Give a name to the speaker that includes its dataset 70 | speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) 71 | 72 | # Create an output directory with that name, as well as a txt file containing a 73 | # reference to each source file. 74 | speaker_out_dir = out_dir.joinpath(speaker_name) 75 | speaker_out_dir.mkdir(exist_ok=True) 76 | sources_fpath = speaker_out_dir.joinpath("_sources.txt") 77 | 78 | # There's a possibility that the preprocessing was interrupted earlier, check if 79 | # there already is a sources file. 80 | if sources_fpath.exists(): 81 | try: 82 | with sources_fpath.open("r") as sources_file: 83 | existing_fnames = {line.split(",")[0] for line in sources_file} 84 | except: 85 | existing_fnames = {} 86 | else: 87 | existing_fnames = {} 88 | 89 | # Gather all audio files for that speaker recursively 90 | sources_file = sources_fpath.open("a" if skip_existing else "w") 91 | for in_fpath in speaker_dir.glob("**/*.%s" % extension): 92 | # Check if the target output file already exists 93 | out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) 94 | out_fname = out_fname.replace(".%s" % extension, ".npy") 95 | if skip_existing and out_fname in existing_fnames: 96 | continue 97 | 98 | # Load and preprocess the waveform 99 | wav = audio.preprocess_wav(in_fpath) 100 | if len(wav) == 0: 101 | continue 102 | 103 | # Create the mel spectrogram, discard those that are too short 104 | frames = audio.wav_to_mel_spectrogram(wav) 105 | if len(frames) < partials_n_frames: 106 | continue 107 | 108 | out_fpath = speaker_out_dir.joinpath(out_fname) 109 | np.save(out_fpath, frames) 110 | logger.add_sample(duration=len(wav) / sampling_rate) 111 | sources_file.write("%s,%s\n" % (out_fname, in_fpath)) 112 | 113 | sources_file.close() 114 | 115 | # Process the utterances for each speaker 116 | with ThreadPool(8) as pool: 117 | list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), 118 | unit="speakers")) 119 | logger.finalize() 120 | print("Done preprocessing %s.\n" % dataset_name) 121 | 122 | 123 | def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): 124 | for dataset_name in librispeech_datasets["train"]["other"]: 125 | # Initialize the preprocessing 126 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 127 | if not dataset_root: 128 | return 129 | 130 | # Preprocess all speakers 131 | speaker_dirs = list(dataset_root.glob("*")) 132 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", 133 | skip_existing, logger) 134 | 135 | 136 | def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): 137 | # Initialize the preprocessing 138 | dataset_name = "VoxCeleb1" 139 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 140 | if not dataset_root: 141 | return 142 | 143 | # Get the contents of the meta file 144 | with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: 145 | metadata = [line.split("\t") for line in metafile][1:] 146 | 147 | # Select the ID and the nationality, filter out non-anglophone speakers 148 | nationalities = {line[0]: line[3] for line in metadata} 149 | keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if 150 | nationality.lower() in anglophone_nationalites] 151 | 
print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % 152 | (len(keep_speaker_ids), len(nationalities))) 153 | 154 | # Get the speaker directories for anglophone speakers only 155 | speaker_dirs = dataset_root.joinpath("wav").glob("*") 156 | speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if 157 | speaker_dir.name in keep_speaker_ids] 158 | print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % 159 | (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) 160 | 161 | # Preprocess all speakers 162 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", 163 | skip_existing, logger) 164 | 165 | 166 | def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): 167 | # Initialize the preprocessing 168 | dataset_name = "VoxCeleb2" 169 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 170 | if not dataset_root: 171 | return 172 | 173 | # Get the speaker directories 174 | # Preprocess all speakers 175 | speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) 176 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", 177 | skip_existing, logger) 178 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/train.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.visualizations import Visualizations 4 | from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset 5 | from encoder.params_model import * 6 | from encoder.model import SpeakerEncoder 7 | from utils.profiler import Profiler 8 | from pathlib import Path 9 | import torch 10 | 11 | def sync(device: torch.device): 12 | # FIXME 13 | return 14 | # For correct profiling (cuda operations are async) 15 | if device.type == "cuda": 16 | torch.cuda.synchronize(device) 17 | 18 | def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, 19 | backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, 20 | no_visdom: bool): 21 | # Create a dataset and a dataloader 22 | dataset = SpeakerVerificationDataset(clean_data_root) 23 | loader = SpeakerVerificationDataLoader( 24 | dataset, 25 | speakers_per_batch, 26 | utterances_per_speaker, 27 | num_workers=8, 28 | ) 29 | 30 | # Setup the device on which to run the forward pass and the loss. These can be different, 31 | # because the forward pass is faster on the GPU whereas the loss is often (depending on your 32 | # hyperparameters) faster on the CPU. 33 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | # FIXME: currently, the gradient is None if loss_device is cuda 35 | loss_device = torch.device("cpu") 36 | 37 | # Create the model and the optimizer 38 | model = SpeakerEncoder(device, loss_device) 39 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) 40 | init_step = 1 41 | 42 | # Configure file path for the model 43 | state_fpath = models_dir.joinpath(run_id + ".pt") 44 | backup_dir = models_dir.joinpath(run_id + "_backups") 45 | 46 | # Load any existing model 47 | if not force_restart: 48 | if state_fpath.exists(): 49 | print("Found existing model \"%s\", loading it and resuming training." 
% run_id) 50 | checkpoint = torch.load(state_fpath) 51 | init_step = checkpoint["step"] 52 | model.load_state_dict(checkpoint["model_state"]) 53 | optimizer.load_state_dict(checkpoint["optimizer_state"]) 54 | optimizer.param_groups[0]["lr"] = learning_rate_init 55 | else: 56 | print("No model \"%s\" found, starting training from scratch." % run_id) 57 | else: 58 | print("Starting the training from scratch.") 59 | model.train() 60 | 61 | # Initialize the visualization environment 62 | vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) 63 | vis.log_dataset(dataset) 64 | vis.log_params() 65 | device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") 66 | vis.log_implementation({"Device": device_name}) 67 | 68 | # Training loop 69 | profiler = Profiler(summarize_every=10, disabled=False) 70 | for step, speaker_batch in enumerate(loader, init_step): 71 | profiler.tick("Blocking, waiting for batch (threaded)") 72 | 73 | # Forward pass 74 | inputs = torch.from_numpy(speaker_batch.data).to(device) 75 | sync(device) 76 | profiler.tick("Data to %s" % device) 77 | embeds = model(inputs) 78 | sync(device) 79 | profiler.tick("Forward pass") 80 | embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) 81 | loss, eer = model.loss(embeds_loss) 82 | sync(loss_device) 83 | profiler.tick("Loss") 84 | 85 | # Backward pass 86 | model.zero_grad() 87 | loss.backward() 88 | profiler.tick("Backward pass") 89 | model.do_gradient_ops() 90 | optimizer.step() 91 | profiler.tick("Parameter update") 92 | 93 | # Update visualizations 94 | # learning_rate = optimizer.param_groups[0]["lr"] 95 | vis.update(loss.item(), eer, step) 96 | 97 | # Draw projections and save them to the backup folder 98 | if umap_every != 0 and step % umap_every == 0: 99 | print("Drawing and saving projections (step %d)" % step) 100 | backup_dir.mkdir(exist_ok=True) 101 | projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) 102 | embeds = embeds.detach().cpu().numpy() 103 | vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) 104 | vis.save() 105 | 106 | # Overwrite the latest version of the model 107 | if save_every != 0 and step % save_every == 0: 108 | print("Saving the model (step %d)" % step) 109 | torch.save({ 110 | "step": step + 1, 111 | "model_state": model.state_dict(), 112 | "optimizer_state": optimizer.state_dict(), 113 | }, state_fpath) 114 | 115 | # Make a backup 116 | if backup_every != 0 and step % backup_every == 0: 117 | print("Making a backup (step %d)" % step) 118 | backup_dir.mkdir(exist_ok=True) 119 | backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) 120 | torch.save({ 121 | "step": step + 1, 122 | "model_state": model.state_dict(), 123 | "optimizer_state": optimizer.state_dict(), 124 | }, backup_fpath) 125 | 126 | profiler.tick("Extras (visualizations, saving)") 127 | -------------------------------------------------------------------------------- /speaker_encoder/encoder/visualizations.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 4 | from datetime import datetime 5 | from time import perf_counter as timer 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | # import webbrowser 9 | import visdom 10 | import umap 11 | 12 | colormap = np.array([ 13 | 
[76, 255, 0], 14 | [0, 127, 70], 15 | [255, 0, 0], 16 | [255, 217, 38], 17 | [0, 135, 255], 18 | [165, 0, 165], 19 | [255, 167, 255], 20 | [0, 255, 255], 21 | [255, 96, 38], 22 | [142, 76, 0], 23 | [33, 0, 127], 24 | [0, 0, 0], 25 | [183, 183, 183], 26 | ], dtype=float) / 255 27 | 28 | 29 | class Visualizations: 30 | def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): 31 | # Tracking data 32 | self.last_update_timestamp = timer() 33 | self.update_every = update_every 34 | self.step_times = [] 35 | self.losses = [] 36 | self.eers = [] 37 | print("Updating the visualizations every %d steps." % update_every) 38 | 
39 | # If visdom is disabled TODO: use a better paradigm for that 40 | self.disabled = disabled 41 | if self.disabled: 42 | return 43 | 44 | # Set the environment name 45 | now = str(datetime.now().strftime("%d-%m %Hh%M")) 46 | if env_name is None: 47 | self.env_name = now 48 | else: 49 | self.env_name = "%s (%s)" % (env_name, now) 50 | 51 | # Connect to visdom and open the corresponding window in the browser 52 | try: 53 | self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) 54 | except ConnectionError: 55 | raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " 56 | "start it.") 57 | # webbrowser.open("http://localhost:8097/env/" + self.env_name) 58 | 
59 | # Create the windows 60 | self.loss_win = None 61 | self.eer_win = None 62 | # self.lr_win = None 63 | self.implementation_win = None 64 | self.projection_win = None 65 | self.implementation_string = "" 66 | 
67 | def log_params(self): 68 | if self.disabled: 69 | return 70 | from encoder import params_data 71 | from encoder import params_model 72 | param_string = "Model parameters:<br>" 73 | for param_name in (p for p in dir(params_model) if not p.startswith("__")): 74 | value = getattr(params_model, param_name) 75 | param_string += "\t%s: %s<br>" % (param_name, value) 76 | param_string += "Data parameters:<br>" 77 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 78 | value = getattr(params_data, param_name) 79 | param_string += "\t%s: %s<br>" % (param_name, value) 80 | self.vis.text(param_string, opts={"title": "Parameters"}) 81 | 
82 | def log_dataset(self, dataset: SpeakerVerificationDataset): 83 | if self.disabled: 84 | return 85 | dataset_string = "" 86 | dataset_string += "Speakers: %s\n" % len(dataset.speakers) 87 | dataset_string += "\n" + dataset.get_logs() 88 | dataset_string = dataset_string.replace("\n", "<br>") 89 | self.vis.text(dataset_string, opts={"title": "Dataset"}) 90 | 
91 | def log_implementation(self, params): 92 | if self.disabled: 93 | return 94 | implementation_string = "" 95 | for param, value in params.items(): 96 | implementation_string += "%s: %s\n" % (param, value) 97 | implementation_string = implementation_string.replace("\n", "<br>
") 98 | self.implementation_string = implementation_string 99 | self.implementation_win = self.vis.text( 100 | implementation_string, 101 | opts={"title": "Training implementation"} 102 | ) 103 | 104 | def update(self, loss, eer, step): 105 | # Update the tracking data 106 | now = timer() 107 | self.step_times.append(1000 * (now - self.last_update_timestamp)) 108 | self.last_update_timestamp = now 109 | self.losses.append(loss) 110 | self.eers.append(eer) 111 | print(".", end="") 112 | 113 | # Update the plots every steps 114 | if step % self.update_every != 0: 115 | return 116 | time_string = "Step time: mean: %5dms std: %5dms" % \ 117 | (int(np.mean(self.step_times)), int(np.std(self.step_times))) 118 | print("\nStep %6d Loss: %.4f EER: %.4f %s" % 119 | (step, np.mean(self.losses), np.mean(self.eers), time_string)) 120 | if not self.disabled: 121 | self.loss_win = self.vis.line( 122 | [np.mean(self.losses)], 123 | [step], 124 | win=self.loss_win, 125 | update="append" if self.loss_win else None, 126 | opts=dict( 127 | legend=["Avg. loss"], 128 | xlabel="Step", 129 | ylabel="Loss", 130 | title="Loss", 131 | ) 132 | ) 133 | self.eer_win = self.vis.line( 134 | [np.mean(self.eers)], 135 | [step], 136 | win=self.eer_win, 137 | update="append" if self.eer_win else None, 138 | opts=dict( 139 | legend=["Avg. EER"], 140 | xlabel="Step", 141 | ylabel="EER", 142 | title="Equal error rate" 143 | ) 144 | ) 145 | if self.implementation_win is not None: 146 | self.vis.text( 147 | self.implementation_string + ("%s" % time_string), 148 | win=self.implementation_win, 149 | opts={"title": "Training implementation"}, 150 | ) 151 | 152 | # Reset the tracking 153 | self.losses.clear() 154 | self.eers.clear() 155 | self.step_times.clear() 156 | 157 | def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, 158 | max_speakers=10): 159 | max_speakers = min(max_speakers, len(colormap)) 160 | embeds = embeds[:max_speakers * utterances_per_speaker] 161 | 162 | n_speakers = len(embeds) // utterances_per_speaker 163 | ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) 164 | colors = [colormap[i] for i in ground_truth] 165 | 166 | reducer = umap.UMAP() 167 | projected = reducer.fit_transform(embeds) 168 | plt.scatter(projected[:, 0], projected[:, 1], c=colors) 169 | plt.gca().set_aspect("equal", "datalim") 170 | plt.title("UMAP projection (step %d)" % step) 171 | if not self.disabled: 172 | self.projection_win = self.vis.matplot(plt, win=self.projection_win) 173 | if out_fpath is not None: 174 | plt.savefig(out_fpath) 175 | plt.clf() 176 | 177 | def save(self): 178 | if not self.disabled: 179 | self.vis.save([self.env_name]) 180 | -------------------------------------------------------------------------------- /speaker_encoder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | -------------------------------------------------------------------------------- /speaker_encoder/utils/argutils.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from pathlib import Path 4 | import numpy as np 5 | import argparse 6 | 7 | _type_priorities = [ # In decreasing order 8 | Path, 9 | str, 10 | int, 11 | float, 12 | bool, 13 | ] 14 | 15 | def _priority(o): 16 | p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) 17 | if p is not None: 
18 | return p 19 | p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None) 20 | if p is not None: 21 | return p 22 | return len(_type_priorities) 23 | 24 | def print_args(args: argparse.Namespace, parser=None): 25 | args = vars(args) 26 | if parser is None: 27 | priorities = list(map(_priority, args.values())) 28 | else: 29 | all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] 30 | priority = lambda p: all_params.index(p) if p in all_params else len(all_params) 31 | priorities = list(map(priority, args.keys())) 32 | 33 | pad = max(map(len, args.keys())) + 3 34 | indices = np.lexsort((list(args.keys()), priorities)) 35 | items = list(args.items()) 36 | 37 | print("Arguments:") 38 | for i in indices: 39 | param, value = items[i] 40 | print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) 41 | print("") 42 | -------------------------------------------------------------------------------- /speaker_encoder/utils/logmmse.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | import numpy as np 4 | import math 5 | from scipy.special import expn 6 | from collections import namedtuple 7 | 8 | NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") 9 | 10 | 11 | def profile_noise(noise, sampling_rate, window_size=0): 12 | """ 13 | Creates a profile of the noise in a given waveform. 14 | 15 | :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. 16 | :param sampling_rate: the sampling rate of the audio 17 | :param window_size: the size of the window the logmmse algorithm operates on. A default value 18 | will be picked if left as 0. 19 | :return: a NoiseProfile object 20 | """ 21 | noise, dtype = to_float(noise) 22 | noise += np.finfo(np.float64).eps 23 | 24 | if window_size == 0: 25 | window_size = int(math.floor(0.02 * sampling_rate)) 26 | 27 | if window_size % 2 == 1: 28 | window_size = window_size + 1 29 | 30 | perc = 50 31 | len1 = int(math.floor(window_size * perc / 100)) 32 | len2 = int(window_size - len1) 33 | 34 | win = np.hanning(window_size) 35 | win = win * len2 / np.sum(win) 36 | n_fft = 2 * window_size 37 | 38 | noise_mean = np.zeros(n_fft) 39 | n_frames = len(noise) // window_size 40 | for j in range(0, window_size * n_frames, window_size): 41 | noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) 42 | noise_mu2 = (noise_mean / n_frames) ** 2 43 | 44 | return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) 45 | 46 | 47 | def denoise(wav, noise_profile: NoiseProfile, eta=0.15): 48 | """ 49 | Cleans the noise from a speech waveform given a noise profile. The waveform must have the 50 | same sampling rate as the one used to create the noise profile. 51 | 52 | :param wav: a speech waveform as a numpy array of floats or ints. 53 | :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of 54 | the same) waveform. 55 | :param eta: voice threshold for noise update. While the voice activation detection value is 56 | below this threshold, the noise profile will be continuously updated throughout the audio. 57 | Set to 0 to disable updating the noise profile. 58 | :return: the clean wav as a numpy array of floats or ints of the same length. 
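Typical usage (a minimal sketch using only the two functions defined in this module): profile = profile_noise(noise_only_wav, sampling_rate) followed by clean_wav = denoise(noisy_wav, profile), where both waveforms share the sampling rate the profile was built from.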
59 | """ 60 | wav, dtype = to_float(wav) 61 | wav += np.finfo(np.float64).eps 62 | p = noise_profile 63 | 64 | nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) 65 | x_final = np.zeros(nframes * p.len2) 66 | 67 | aa = 0.98 68 | mu = 0.98 69 | ksi_min = 10 ** (-25 / 10) 70 | 71 | x_old = np.zeros(p.len1) 72 | xk_prev = np.zeros(p.len1) 73 | noise_mu2 = p.noise_mu2 74 | for k in range(0, nframes * p.len2, p.len2): 75 | insign = p.win * wav[k:k + p.window_size] 76 | 77 | spec = np.fft.fft(insign, p.n_fft, axis=0) 78 | sig = np.absolute(spec) 79 | sig2 = sig ** 2 80 | 81 | gammak = np.minimum(sig2 / noise_mu2, 40) 82 | 83 | if xk_prev.all() == 0: 84 | ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 85 | else: 86 | ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 87 | ksi = np.maximum(ksi_min, ksi) 88 | 89 | log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) 90 | vad_decision = np.sum(log_sigma_k) / p.window_size 91 | if vad_decision < eta: 92 | noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 93 | 94 | a = ksi / (1 + ksi) 95 | vk = a * gammak 96 | ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 97 | hw = a * np.exp(ei_vk) 98 | sig = sig * hw 99 | xk_prev = sig ** 2 100 | xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) 101 | xi_w = np.real(xi_w) 102 | 103 | x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] 104 | x_old = xi_w[p.len1:p.window_size] 105 | 106 | output = from_float(x_final, dtype) 107 | output = np.pad(output, (0, len(wav) - len(output)), mode="constant") 108 | return output 109 | 110 | 111 | ## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that 112 | ## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of 113 | ## webrctvad 114 | # def vad(wav, sampling_rate, eta=0.15, window_size=0): 115 | # """ 116 | # TODO: fix doc 117 | # Creates a profile of the noise in a given waveform. 118 | # 119 | # :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. 120 | # :param sampling_rate: the sampling rate of the audio 121 | # :param window_size: the size of the window the logmmse algorithm operates on. A default value 122 | # will be picked if left as 0. 123 | # :param eta: voice threshold for noise update. While the voice activation detection value is 124 | # below this threshold, the noise profile will be continuously updated throughout the audio. 125 | # Set to 0 to disable updating the noise profile. 
126 | # """ 127 | # wav, dtype = to_float(wav) 128 | # wav += np.finfo(np.float64).eps 129 | # 130 | # if window_size == 0: 131 | # window_size = int(math.floor(0.02 * sampling_rate)) 132 | # 133 | # if window_size % 2 == 1: 134 | # window_size = window_size + 1 135 | # 136 | # perc = 50 137 | # len1 = int(math.floor(window_size * perc / 100)) 138 | # len2 = int(window_size - len1) 139 | # 140 | # win = np.hanning(window_size) 141 | # win = win * len2 / np.sum(win) 142 | # n_fft = 2 * window_size 143 | # 144 | # wav_mean = np.zeros(n_fft) 145 | # n_frames = len(wav) // window_size 146 | # for j in range(0, window_size * n_frames, window_size): 147 | # wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) 148 | # noise_mu2 = (wav_mean / n_frames) ** 2 149 | # 150 | # wav, dtype = to_float(wav) 151 | # wav += np.finfo(np.float64).eps 152 | # 153 | # nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) 154 | # vad = np.zeros(nframes * len2, dtype=np.bool) 155 | # 156 | # aa = 0.98 157 | # mu = 0.98 158 | # ksi_min = 10 ** (-25 / 10) 159 | # 160 | # xk_prev = np.zeros(len1) 161 | # noise_mu2 = noise_mu2 162 | # for k in range(0, nframes * len2, len2): 163 | # insign = win * wav[k:k + window_size] 164 | # 165 | # spec = np.fft.fft(insign, n_fft, axis=0) 166 | # sig = np.absolute(spec) 167 | # sig2 = sig ** 2 168 | # 169 | # gammak = np.minimum(sig2 / noise_mu2, 40) 170 | # 171 | # if xk_prev.all() == 0: 172 | # ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 173 | # else: 174 | # ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 175 | # ksi = np.maximum(ksi_min, ksi) 176 | # 177 | # log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) 178 | # vad_decision = np.sum(log_sigma_k) / window_size 179 | # if vad_decision < eta: 180 | # noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 181 | # print(vad_decision) 182 | # 183 | # a = ksi / (1 + ksi) 184 | # vk = a * gammak 185 | # ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 186 | # hw = a * np.exp(ei_vk) 187 | # sig = sig * hw 188 | # xk_prev = sig ** 2 189 | # 190 | # vad[k:k + len2] = vad_decision >= eta 191 | # 192 | # vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") 193 | # return vad 194 | 195 | 196 | def to_float(_input): 197 | if _input.dtype == np.float64: 198 | return _input, _input.dtype 199 | elif _input.dtype == np.float32: 200 | return _input.astype(np.float64), _input.dtype 201 | elif _input.dtype == np.uint8: 202 | return (_input - 128) / 128., _input.dtype 203 | elif _input.dtype == np.int16: 204 | return _input / 32768., _input.dtype 205 | elif _input.dtype == np.int32: 206 | return _input / 2147483648., _input.dtype 207 | raise ValueError('Unsupported wave file format') 208 | 209 | 210 | def from_float(_input, dtype): 211 | if dtype == np.float64: 212 | return _input, np.float64 213 | elif dtype == np.float32: 214 | return _input.astype(np.float32) 215 | elif dtype == np.uint8: 216 | return ((_input * 128) + 128).astype(np.uint8) 217 | elif dtype == np.int16: 218 | return (_input * 32768).astype(np.int16) 219 | elif dtype == np.int32: 220 | print(_input) 221 | return (_input * 2147483648).astype(np.int32) 222 | raise ValueError('Unsupported wave file format') 223 | -------------------------------------------------------------------------------- /speaker_encoder/utils/profiler.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """ 2 | 3 | from time import 
perf_counter as timer 4 | from collections import OrderedDict 5 | import numpy as np 6 | 7 | 8 | class Profiler: 9 | def __init__(self, summarize_every=5, disabled=False): 10 | self.last_tick = timer() 11 | self.logs = OrderedDict() 12 | self.summarize_every = summarize_every 13 | self.disabled = disabled 14 | 15 | def tick(self, name): 16 | if self.disabled: 17 | return 18 | 19 | # Log the time needed to execute that function 20 | if not name in self.logs: 21 | self.logs[name] = [] 22 | if len(self.logs[name]) >= self.summarize_every: 23 | self.summarize() 24 | self.purge_logs() 25 | self.logs[name].append(timer() - self.last_tick) 26 | 27 | self.reset_timer() 28 | 29 | def purge_logs(self): 30 | for name in self.logs: 31 | self.logs[name].clear() 32 | 33 | def reset_timer(self): 34 | self.last_tick = timer() 35 | 36 | def summarize(self): 37 | n = max(map(len, self.logs.values())) 38 | assert n == self.summarize_every 39 | print("\nAverage execution time over %d steps:" % n) 40 | 41 | name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] 42 | pad = max(map(len, name_msgs)) 43 | for name_msg, deltas in zip(name_msgs, self.logs.values()): 44 | print(" %s mean: %4.0fms std: %4.0fms" % 45 | (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) 46 | print("", flush=True) 47 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 2 | # This program is free software; you can redistribute it and/or modify 3 | # it under the terms of the MIT License. 4 | # This program is distributed in the hope that it will be useful, 5 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 6 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 7 | # MIT License for more details. 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy.io import wavfile 12 | 13 | 14 | def save_plot(tensor, savepath): 15 | plt.style.use('default') 16 | fig, ax = plt.subplots(figsize=(12, 3)) 17 | im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') 18 | plt.colorbar(im, ax=ax) 19 | plt.tight_layout() 20 | fig.canvas.draw() 21 | plt.savefig(savepath) 22 | plt.close() 23 | 24 | 25 | def save_audio(file_path, sampling_rate, audio): 26 | audio = np.clip(audio.detach().cpu().squeeze().numpy(), -0.999, 0.999) 27 | wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) 28 | -------------------------------------------------------------------------------- /var.env: -------------------------------------------------------------------------------- 1 | 2 | MODEL_VERSION = "1" 3 | TRITON_MODEL_NAME = "vc_pipeline_python" 4 | 5 | INPUT_NAME = "INPUT" 6 | OUTPUT_NAME= "OUTPUT" 7 | 8 | 9 | PYTHONUNBUFFERED=TRUE 10 | 11 | --------------------------------------------------------------------------------