├── .gitignore ├── LICENSE ├── README.md ├── egs ├── TEMPLATE │ └── a2o_vc │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── f0.yaml │ │ └── taco2_ar.yaml │ │ ├── local │ │ ├── data_prep.sh │ │ ├── evaluate.py │ │ └── vocoder_download.sh │ │ ├── path.sh │ │ ├── run.sh │ │ └── utils ├── arctic │ └── a2o_vc │ │ ├── cmd.sh │ │ ├── conf │ │ ├── config.yaml │ │ ├── diffusion.yaml │ │ ├── f0.yaml │ │ └── taco2_ar.yaml │ │ ├── local │ │ ├── data_download.sh │ │ ├── data_prep.sh │ │ ├── evaluate.py │ │ └── pretrained_model_download.sh │ │ ├── path.sh │ │ ├── run.sh │ │ └── utils └── vcc2020 │ └── a2o_vc │ ├── README.md │ ├── cmd.sh │ ├── conf │ ├── f0.yaml │ └── taco2_ar.yaml │ ├── local │ ├── data_download.sh │ ├── data_prep.sh │ ├── evaluate.py │ ├── lists │ │ ├── E_dev_list.txt │ │ ├── E_train_list.txt │ │ ├── F_dev_list.txt │ │ ├── F_train_list.txt │ │ ├── G_dev_list.txt │ │ ├── G_train_list.txt │ │ ├── M_dev_list.txt │ │ ├── M_train_list.txt │ │ ├── custom_eval_wenchin.yaml │ │ ├── eval_list.txt │ │ └── ref_list.txt │ └── vocoder_download.sh │ ├── path.sh │ ├── run.sh │ └── utils ├── pyproject.toml ├── s3prl_vc ├── __init__.py ├── bin │ ├── __init__.py │ ├── compute_statistics.py │ ├── create_histogram.py │ ├── decode.py │ ├── decode_downstream.py │ ├── extract_spemb.py │ ├── extract_upstream.py │ └── train.py ├── datasets │ ├── __init__.py │ └── datasets.py ├── evaluate │ ├── __init__.py │ ├── asr.py │ ├── asv.py │ └── dtw_based.py ├── layers │ ├── __init__.py │ └── utils.py ├── losses │ ├── __init__.py │ ├── l1_loss.py │ └── l2_loss.py ├── models │ ├── Taco2_AR.py │ ├── __init__.py │ ├── diffsinger │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── denoiser.py │ │ └── diffusion.py │ └── diffusion.py ├── schedulers │ ├── __init__.py │ └── schedulers.py ├── transform │ ├── __init__.py │ ├── f0.py │ └── spectrogram.py ├── upstream │ ├── __init__.py │ ├── interface.py │ ├── ppg_sxliu │ │ ├── __init__.py │ │ ├── e2e_asr_common.py │ │ ├── encoder │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── conformer_encoder.py │ │ │ ├── convolution.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── layer_norm.py │ │ │ ├── multi_layer_conv.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── repeat.py │ │ │ ├── subsampling.py │ │ │ ├── swish.py │ │ │ └── vgg.py │ │ ├── encoders.py │ │ ├── frontend.py │ │ ├── log_mel.py │ │ ├── model.py │ │ ├── nets_utils.py │ │ ├── ppg_sxliu.py │ │ ├── stft.py │ │ └── utterance_mvn.py │ └── whisper.py ├── utils │ ├── __init__.py │ ├── data.py │ ├── download.py │ ├── plot.py │ ├── signal.py │ ├── speaker_embedding_resemblyzer.py │ ├── speaker_embedding_wespeaker.py │ └── utils.py └── vocoder │ ├── __init__.py │ ├── griffin_lim.py │ └── vocoder.py ├── setup.cfg ├── tools └── Makefile └── utils ├── combine_data.sh ├── download_from_google_drive.sh ├── make_subset_data.sh ├── parse_options.sh ├── run.pl ├── split_data.sh └── split_scp.pl /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python 
script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | exp/ 132 | downloads/ 133 | data/ 134 | *.done 135 | *.wav 136 | *.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Wen-Chin Huang (unilight) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3PRL-VC: A Voice Conversion Toolkit based on S3PRL 2 | 3 | Paper (ICASSP2023) [![arXiv](https://img.shields.io/badge/arXiv-2110.06280-b31b1b.svg)](https://arxiv.org/abs/2110.06280) 4 | Paper (IEEE Journal of Selected Topics in Signal Processing) [![arXiv](https://img.shields.io/badge/arXiv-2207.04356-b31b1b.svg)](https://arxiv.org/abs/2207.04356) 5 | Original codebase on S3PRL [![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/s3prl/s3prl/tree/master/s3prl/downstream/a2o-vc-vcc2020) 6 | **NEW!** HuggingFace Spaces Demo [![Open In Spaces](https://camo.githubusercontent.com/00380c35e60d6b04be65d3d94a58332be5cc93779f630bcdfc18ab9a3a7d3388/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f25463025394625413425393725323048756767696e67253230466163652d5370616365732d626c7565)](https://huggingface.co/spaces/unilight/s3prl-vc-vcc2020) 7 | 8 | ## Introduction and motivation 9 | 10 | [S3PRL](https://github.com/s3prl/s3prl) stands for "Self-Supervised Speech/Sound Pre-training and Representation Learning Toolkit". It is a toolkit for benchmarking self-supervised speech representation (S3R) models using a collection of so-called "downstream" tasks. S3PRL-VC was originally built under S3PRL, which implements voice conversion (VC) as one of the downstream tasks. However, as S3PRL keeps growing, it is getting harder to integrate the various VC recipes into the main S3PRL repository. Therefore, this repository aims to isolate the VC downstream task from S3PRL and (hopefully) become an independently maintained toolkit. 11 | 12 | ## What can this repo do? 13 | 14 | This repo aims to provide **a platform for frame-based recognition-synthesis voice conversion**. 15 | 16 | ### Pipeline 17 | 18 | Given a source speech utterance, we first use a recognizer (or upstream) to extract intermediate representations, which are then mapped to the acoustic feature space (log mel spectrograms) with a synthesizer (or downstream). Finally, a waveform synthesizer (often called a vocoder) converts the acoustic features into a waveform. 19 | 20 | ### Supported upstream 21 | 22 | Currently, the supported S3R upstreams depend entirely on the official [S3PRL](https://s3prl.github.io/s3prl/tutorial/upstream_collection.html) repository. In addition, we also provide two PPG models: `ppg_sxliu` uses the ASR model provided by [Songxiang Liu's ppg-vc repo](https://github.com/liusongxiang/ppg-vc), and `ppg_whisper` uses the [OpenAI Whisper ASR model](https://github.com/openai/whisper). Note that in my experiments, the Whisper model yields very poor results, but I don't know why. I would appreciate it if someone could figure out the reason. 23 | 24 | ## Installation 25 | 26 | ### 1. (Recommended) Editable installation with virtualenv 27 | 28 | This repo is designed for research purposes, so it is recommended to install it in this fashion. 29 | 30 | ``` 31 | git clone https://github.com/unilight/s3prl-vc.git 32 | cd s3prl-vc/tools 33 | make 34 | ``` 35 | 36 | ### 2. pip 37 | 38 | For my other research projects (which might be made public in the future), I needed this repo as a standalone toolkit.
So I also made command-line entry points, which can be installed like this: 39 | 40 | ``` 41 | pip install s3prl-vc 42 | ``` 43 | 44 | ## Complete training, decoding and benchmarking 45 | 46 | As in many speech processing repositories ([ESPNet](https://github.com/espnet/espnet), [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN), etc.), our recipes are organized in Kaldi style. They can be found in the `egs` folder. Please check each recipe for detailed usage. 47 | 48 | ## Citation 49 | 50 | ``` 51 | @inproceedings{huang2021s3prl, 52 | title={S3PRL-VC: Open-source Voice Conversion Framework with Self-supervised Speech Representations}, 53 | author={Huang, Wen-Chin and Yang, Shu-Wen and Hayashi, Tomoki and Lee, Hung-Yi and Watanabe, Shinji and Toda, Tomoki}, 54 | booktitle={Proc. ICASSP}, 55 | year={2022} 56 | } 57 | @ARTICLE{s3prl-vc-journal, 58 | author={Huang, Wen-Chin and Yang, Shu-Wen and Hayashi, Tomoki and Toda, Tomoki}, 59 | journal={IEEE Journal of Selected Topics in Signal Processing}, 60 | title={{A Comparative Study of Self-Supervised Speech Representation Based Voice Conversion}}, 61 | year={2022}, 62 | volume={16}, 63 | number={6}, 64 | pages={1308-1318}, 65 | } 66 | ``` 67 | 68 | ## Acknowledgements 69 | 70 | This repo is greatly inspired by the following repos. Or I should say, many code snippets are taken directly from them. 71 | 72 | - [ESPNet](https://github.com/espnet/espnet) 73 | - [S3PRL](https://github.com/s3prl/s3prl) 74 | - [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN/) 75 | - [NNSVS](https://github.com/nnsvs/nnsvs/) 76 | - [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger) 77 | -------------------------------------------------------------------------------- /egs/TEMPLATE/a2o_vc/README.md: -------------------------------------------------------------------------------- 1 | # Template for any-to-one VC 2 | 3 | This is a template recipe for training any-to-one VC models using your custom dataset. Several preparation steps are needed. When modifying them, keep in mind that they were written for the VCC2020 dataset. You may delete unnecessary code or add more to your liking. 4 | 5 | ## Preparation 6 | 7 | The following steps are NEEDED: 8 | 9 | - Prepare your dataset and put it somewhere. There is no requirement on the directory structure, as long as `local/data_prep.sh` can generate the file lists used for training and decoding (conversion). 10 | - Modify `conf/taco2_ar.yaml`: adjust fields such as the sampling rate, frame shift, or custom vocoder to your preference. 11 | - Modify `local/data_prep.sh`: this script needs to generate files containing space-separated lines with the format `<id> <wav_path>`, according to the directory structure of your custom dataset (see the sketch at the end of this section). 12 | 13 | The following steps are OPTIONAL: 14 | 15 | - Train your own vocoder. You can use the `hifigan_vctk+vcc2020` vocoder first and see if you are satisfied with the quality. If not, please open an issue and I can guide you to train your own model. 16 | - `conf/f0.yaml` and `local/evaluate.py`: these files are for evaluation, which is optional depending on your application. Note that each evaluation metric has different requirements. For example, MCD, F0RMSE, F0CORR, and DUR need parallel data; CER and WER need transcriptions. If you have trouble modifying these files for your custom dataset, please open an issue and I will try to help you.
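For reference, here is a minimal sketch of what a custom `local/data_prep.sh` could look like. This is only an illustration under assumed conventions: the dataset layout (a flat directory of wav files), the output file names, and the dev-set size are placeholders, not the actual interface of this recipe, so adapt the paths and naming to your own dataset and to how `run.sh` expects the lists.

```
#!/usr/bin/env bash
# Hypothetical minimal data_prep.sh: writes "<id> <wav_path>" lists for a
# train/dev split. All paths and output file names below are assumptions.

db_root=$1    # directory containing the wav files, e.g. /path/to/corpus/spk1
trgspk=$2     # target speaker name
data_dir=$3   # output directory, e.g. data
num_dev=10    # number of utterances held out for the dev set

mkdir -p "${data_dir}"

# Collect "<id> <wav_path>" lines; the id is the file name without extension.
find "${db_root}" -name "*.wav" | sort | \
    awk '{ n = split($0, a, "/"); id = a[n]; sub(/\.wav$/, "", id); print id, $0 }' \
    > "${data_dir}/${trgspk}_all.list"

# Simple split: the last ${num_dev} utterances become the dev set.
total=$(wc -l < "${data_dir}/${trgspk}_all.list")
head -n $((total - num_dev)) "${data_dir}/${trgspk}_all.list" > "${data_dir}/${trgspk}_train.list"
tail -n "${num_dev}" "${data_dir}/${trgspk}_all.list" > "${data_dir}/${trgspk}_dev.list"

echo "Successfully prepared file lists for ${trgspk}."
```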
17 | 18 | ## Training 19 | 20 | Run the following command: 21 | 22 | ``` 23 | ./run.sh --stage -1 --stop_stage 2 --upstream <upstream> --trgspk <trgspk> 24 | ``` 25 | 26 | Four stages are executed: 27 | - Stage -1: Pretrained model download. The `hifigan_vctk+vcc2020` vocoder will be downloaded (by default to `downloads/`). 28 | - Stage 0: Data preparation. File lists are generated in `data/` by default. Each file contains space-separated lines with the format `<id> <wav_path>`. These files are used for training and decoding (conversion). 29 | - Stage 1: Statistics calculation. The statistics of the mel spectrogram used for normalization are calculated using the training set of the target speaker. The calculation log and the statistics h5 file are saved in `data/` by default. 30 | - Stage 2: Main training script. By default, `exp/<trgspk>_<upstream>_<tag>` is used to save the training log, saved checkpoints, and intermediate samples for debugging (saved in `predictions/`). 31 | 32 | Modifiable arguments: 33 | - `--trgspk`: depending on your dataset, this can be conveniently used to train several A2O VC models. 34 | - `--upstream`: in addition to the various upstreams provided by [S3PRL](https://s3prl.github.io/s3prl/tutorial/upstream_collection.html), we also provide two PPG models: `ppg_sxliu` uses the ASR model provided by [Songxiang Liu's ppg-vc repo](https://github.com/liusongxiang/ppg-vc), and `ppg_whisper` uses the [OpenAI Whisper ASR model](https://github.com/openai/whisper). Note that in my experiments, the Whisper model yields very poor results, but I don't know why. I would appreciate it if someone could figure out the reason. 35 | - `--tag`: if a tag is specified, results from stage 2 will be saved in `exp/<trgspk>_<upstream>_<tag>`. 36 | 37 | ## Decoding and evaluation 38 | 39 | ``` 40 | ./run.sh --stage 3 --stop_stage 4 --upstream <upstream> --trgspk <trgspk> --checkpoint <checkpoint_path> 41 | ``` 42 | 43 | Generated files from both stages 3 and 4 are saved in `results/checkpoint-XXXXXsteps`. 44 | 45 | - Stage 3 is the decoding stage; a log file is also generated. Mel spectrogram visualizations can be viewed in `plot_mel/`, and the generated waveform files are saved in `wav/`. 46 | - Stage 4 is the evaluation stage using `local/evaluate.py`. MCD, F0RMSE, F0CORR, DUR, CER, and WER are calculated. Detailed results are saved in `evaluation.log`. -------------------------------------------------------------------------------- /egs/TEMPLATE/a2o_vc/cmd.sh: -------------------------------------------------------------------------------- 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> 3 | # e.g. 4 | # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB 5 | # 6 | # Options: 7 | # --time