├── .gitignore ├── LICENSE ├── README.md ├── egs ├── TEMPLATE │ └── a2o_vc │ │ ├── README.md │ │ ├── cmd.sh │ │ ├── conf │ │ ├── f0.yaml │ │ └── taco2_ar.yaml │ │ ├── local │ │ ├── data_prep.sh │ │ ├── evaluate.py │ │ └── vocoder_download.sh │ │ ├── path.sh │ │ ├── run.sh │ │ └── utils ├── arctic │ └── a2o_vc │ │ ├── cmd.sh │ │ ├── conf │ │ ├── config.yaml │ │ ├── diffusion.yaml │ │ ├── f0.yaml │ │ └── taco2_ar.yaml │ │ ├── local │ │ ├── data_download.sh │ │ ├── data_prep.sh │ │ ├── evaluate.py │ │ └── pretrained_model_download.sh │ │ ├── path.sh │ │ ├── run.sh │ │ └── utils └── vcc2020 │ └── a2o_vc │ ├── README.md │ ├── cmd.sh │ ├── conf │ ├── f0.yaml │ └── taco2_ar.yaml │ ├── local │ ├── data_download.sh │ ├── data_prep.sh │ ├── evaluate.py │ ├── lists │ │ ├── E_dev_list.txt │ │ ├── E_train_list.txt │ │ ├── F_dev_list.txt │ │ ├── F_train_list.txt │ │ ├── G_dev_list.txt │ │ ├── G_train_list.txt │ │ ├── M_dev_list.txt │ │ ├── M_train_list.txt │ │ ├── custom_eval_wenchin.yaml │ │ ├── eval_list.txt │ │ └── ref_list.txt │ └── vocoder_download.sh │ ├── path.sh │ ├── run.sh │ └── utils ├── pyproject.toml ├── s3prl_vc ├── __init__.py ├── bin │ ├── __init__.py │ ├── compute_statistics.py │ ├── create_histogram.py │ ├── decode.py │ ├── decode_downstream.py │ ├── extract_spemb.py │ ├── extract_upstream.py │ └── train.py ├── datasets │ ├── __init__.py │ └── datasets.py ├── evaluate │ ├── __init__.py │ ├── asr.py │ ├── asv.py │ └── dtw_based.py ├── layers │ ├── __init__.py │ └── utils.py ├── losses │ ├── __init__.py │ ├── l1_loss.py │ └── l2_loss.py ├── models │ ├── Taco2_AR.py │ ├── __init__.py │ ├── diffsinger │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── denoiser.py │ │ └── diffusion.py │ └── diffusion.py ├── schedulers │ ├── __init__.py │ └── schedulers.py ├── transform │ ├── __init__.py │ ├── f0.py │ └── spectrogram.py ├── upstream │ ├── __init__.py │ ├── interface.py │ ├── ppg_sxliu │ │ ├── __init__.py │ │ ├── e2e_asr_common.py │ │ ├── encoder │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── conformer_encoder.py │ │ │ ├── convolution.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── layer_norm.py │ │ │ ├── multi_layer_conv.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── repeat.py │ │ │ ├── subsampling.py │ │ │ ├── swish.py │ │ │ └── vgg.py │ │ ├── encoders.py │ │ ├── frontend.py │ │ ├── log_mel.py │ │ ├── model.py │ │ ├── nets_utils.py │ │ ├── ppg_sxliu.py │ │ ├── stft.py │ │ └── utterance_mvn.py │ └── whisper.py ├── utils │ ├── __init__.py │ ├── data.py │ ├── download.py │ ├── plot.py │ ├── signal.py │ ├── speaker_embedding_resemblyzer.py │ ├── speaker_embedding_wespeaker.py │ └── utils.py └── vocoder │ ├── __init__.py │ ├── griffin_lim.py │ └── vocoder.py ├── setup.cfg ├── tools └── Makefile └── utils ├── combine_data.sh ├── download_from_google_drive.sh ├── make_subset_data.sh ├── parse_options.sh ├── run.pl ├── split_data.sh └── split_scp.pl /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python 
script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | exp/ 132 | downloads/ 133 | data/ 134 | *.done 135 | *.wav 136 | *.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Wen-Chin Huang (unilight) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3PRL-VC: A Voice Conversion Toolkit based on S3PRL 2 | 3 | Paper (ICASSP2023) [![arXiv](https://img.shields.io/badge/arXiv-2110.06280-b31b1b.svg)](https://arxiv.org/abs/2110.06280) 4 | Paper (IEEE Journal of Selected Topics in Signal Processing) [![arXiv](https://img.shields.io/badge/arXiv-2207.04356-b31b1b.svg)](https://arxiv.org/abs/2207.04356) 5 | Original codebase on S3PRL [![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/s3prl/s3prl/tree/master/s3prl/downstream/a2o-vc-vcc2020) 6 | **NEW!** HuggingFace Spaces Demo [![Open In Spaces](https://camo.githubusercontent.com/00380c35e60d6b04be65d3d94a58332be5cc93779f630bcdfc18ab9a3a7d3388/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f25463025394625413425393725323048756767696e67253230466163652d5370616365732d626c7565)](https://huggingface.co/spaces/unilight/s3prl-vc-vcc2020) 7 | 8 | ## Introduction and motivation 9 | 10 | [S3PRL](https://github.com/s3prl/s3prl) stands for "Self-Supervised Speech/Sound Pre-training and Representation Learning Toolkit". It is a toolkit for benchmarking self-supervised speech representation (S3R) models using a collection of so-called "downstream" tasks. S3PRL-VC was originally built under S3PRL, which implements voice conversion (VC) as one of the downstream tasks. However, as S3PRL keeps growing, it is getting harder to integrate the various VC recipes into the main S3PRL repository. Therefore, this repository aims to isolate the VC downstream task from S3PRL and (hopefully) become an independently maintained toolkit. 11 | 12 | ## What can this repo do? 13 | 14 | This repo aims to provide **a platform for frame-based recognition-synthesis voice conversion**. 15 | 16 | ### Pipeline 17 | 18 | Given a source speech utterance, we first use a recognizer (or upstream) to extract intermediate representations, which are then mapped to the acoustic feature space (log mel spectrograms) with a synthesizer (or downstream). Finally, a waveform synthesizer (often called a vocoder) converts the acoustic features into a waveform. 19 | 20 | ### Supported upstream 21 | 22 | Currently, the supported S3R upstreams depend entirely on the official [S3PRL](https://s3prl.github.io/s3prl/tutorial/upstream_collection.html) repository. In addition, we also provide two PPG models: `ppg_sxliu` uses the ASR model provided by [Songxiang Liu's ppg-vc repo](https://github.com/liusongxiang/ppg-vc), and `ppg_whisper` uses the [OpenAI Whisper ASR model](https://github.com/openai/whisper). Note that in my experiments, the Whisper model yields very poor results, but I don't know why. I would appreciate it if someone could figure out the reason. 23 | 24 | ## Installation 25 | 26 | ### 1. (Recommended) Editable installation with virtualenv 27 | 28 | This repo is designed for research purposes, so it is recommended to install it in this fashion. 29 | 30 | ``` 31 | git clone https://github.com/unilight/s3prl-vc.git 32 | cd s3prl-vc/tools 33 | make 34 | ``` 35 | 36 | ### 2. pip 37 | 38 | For my other research projects (which might be made public in the future), I needed this repo as a standalone toolkit.
So I also made command-line entry points, which can be installed like this: 39 | 40 | ``` 41 | pip install s3prl-vc 42 | ``` 43 | 44 | ## Complete training, decoding and benchmarking 45 | 46 | As in many speech processing repositories ([ESPNet](https://github.com/espnet/espnet), [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN), etc.), our recipes are organized in Kaldi style. They can be found in the `egs` folder. Please check each recipe for detailed usage. 47 | 48 | ## Citation 49 | 50 | ``` 51 | @inproceedings{huang2021s3prl, 52 | title={S3PRL-VC: Open-source Voice Conversion Framework with Self-supervised Speech Representations}, 53 | author={Huang, Wen-Chin and Yang, Shu-Wen and Hayashi, Tomoki and Lee, Hung-Yi and Watanabe, Shinji and Toda, Tomoki}, 54 | booktitle={Proc. ICASSP}, 55 | year={2022} 56 | } 57 | @ARTICLE{s3prl-vc-journal, 58 | author={Huang, Wen-Chin and Yang, Shu-Wen and Hayashi, Tomoki and Toda, Tomoki}, 59 | journal={IEEE Journal of Selected Topics in Signal Processing}, 60 | title={{A Comparative Study of Self-Supervised Speech Representation Based Voice Conversion}}, 61 | year={2022}, 62 | volume={16}, 63 | number={6}, 64 | pages={1308-1318}, 65 | } 66 | ``` 67 | 68 | ## Acknowledgements 69 | 70 | This repo is greatly inspired by the following repos. Or I should say, many code snippets are taken directly from them. 71 | 72 | - [ESPNet](https://github.com/espnet/espnet) 73 | - [S3PRL](https://github.com/s3prl/s3prl) 74 | - [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN/) 75 | - [NNSVS](https://github.com/nnsvs/nnsvs/) 76 | - [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger) 77 | -------------------------------------------------------------------------------- /egs/TEMPLATE/a2o_vc/README.md: -------------------------------------------------------------------------------- 1 | # Template for any-to-one VC 2 | 3 | This is a template recipe for training any-to-one VC models using your custom dataset. Several preparation steps are needed. When modifying them, keep in mind that they were written for the VCC2020 dataset. You may delete unnecessary code or add more to your liking. 4 | 5 | ## Preparation 6 | 7 | The following steps are NEEDED: 8 | 9 | - Prepare your dataset and put it somewhere. There is no requirement on the directory structure, as long as `local/data_prep.sh` can generate the file lists used for training and decoding (conversion). 10 | - Modify `conf/taco2_ar.yaml`: adjust fields such as the sampling rate, frame shift, or custom vocoder to your preference. 11 | - Modify `local/data_prep.sh`: this script needs to generate files containing space-separated lines with the format `<id> <wav_path>`, according to the directory structure of your custom dataset (see the sketch at the end of this section). 12 | 13 | The following steps are OPTIONAL: 14 | 15 | - Train your own vocoder. You can use the `hifigan_vctk+vcc2020` vocoder first and see if you are satisfied with the quality. If not, please open an issue and I can guide you to train your own model. 16 | - `conf/f0.yaml` and `local/evaluate.py`: these files are for evaluation, which is optional depending on your application. Note that each evaluation metric has different requirements. For example, MCD, F0RMSE, F0CORR, and DUR need parallel data; CER and WER need transcriptions. If you have trouble modifying these files for your custom dataset, please open an issue and I will try to help you.
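For reference, here is a minimal sketch of what a custom `local/data_prep.sh` could look like. This is only an illustration under assumed conventions: the dataset layout (a flat directory of wav files), the output file names, and the dev-set size are placeholders, not the actual interface of this recipe, so adapt the paths and naming to your own dataset and to how `run.sh` expects the lists.

```
#!/usr/bin/env bash
# Hypothetical minimal data_prep.sh: writes "<id> <wav_path>" lists for a
# train/dev split. All paths and output file names below are assumptions.

db_root=$1    # directory containing the wav files, e.g. /path/to/corpus/spk1
trgspk=$2     # target speaker name
data_dir=$3   # output directory, e.g. data
num_dev=10    # number of utterances held out for the dev set

mkdir -p "${data_dir}"

# Collect "<id> <wav_path>" lines; the id is the file name without extension.
find "${db_root}" -name "*.wav" | sort | \
    awk '{ n = split($0, a, "/"); id = a[n]; sub(/\.wav$/, "", id); print id, $0 }' \
    > "${data_dir}/${trgspk}_all.list"

# Simple split: the last ${num_dev} utterances become the dev set.
total=$(wc -l < "${data_dir}/${trgspk}_all.list")
head -n $((total - num_dev)) "${data_dir}/${trgspk}_all.list" > "${data_dir}/${trgspk}_train.list"
tail -n "${num_dev}" "${data_dir}/${trgspk}_all.list" > "${data_dir}/${trgspk}_dev.list"

echo "Successfully prepared file lists for ${trgspk}."
```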
17 | 18 | ## Training 19 | 20 | Run the following command: 21 | 22 | ``` 23 | ./run.sh --stage -1 --stop_stage 2 --upstream <upstream> --trgspk <trgspk> 24 | ``` 25 | 26 | Four stages are executed: 27 | - Stage -1: Pretrained model download. The `hifigan_vctk+vcc2020` vocoder will be downloaded (by default to `downloads/`). 28 | - Stage 0: Data preparation. File lists are generated in `data/` by default. Each file contains space-separated lines with the format `<id> <wav_path>`. These files are used for training and decoding (conversion). 29 | - Stage 1: Statistics calculation. The statistics of the mel spectrogram used for normalization are calculated using the training set of the target speaker. The calculation log and the statistics h5 file are saved in `data/` by default. 30 | - Stage 2: Main training script. By default, `exp/<trgspk>_<upstream>_<tag>` is used to save the training log, saved checkpoints, and intermediate samples for debugging (saved in `predictions/`). 31 | 32 | Modifiable arguments: 33 | - `--trgspk`: depending on your dataset, this can be conveniently used to train several A2O VC models. 34 | - `--upstream`: in addition to the various upstreams provided by [S3PRL](https://s3prl.github.io/s3prl/tutorial/upstream_collection.html), we also provide two PPG models: `ppg_sxliu` uses the ASR model provided by [Songxiang Liu's ppg-vc repo](https://github.com/liusongxiang/ppg-vc), and `ppg_whisper` uses the [OpenAI Whisper ASR model](https://github.com/openai/whisper). Note that in my experiments, the Whisper model yields very poor results, but I don't know why. I would appreciate it if someone could figure out the reason. 35 | - `--tag`: if a tag is specified, results from stage 2 will be saved in `exp/<trgspk>_<upstream>_<tag>`. 36 | 37 | ## Decoding and evaluation 38 | 39 | ``` 40 | ./run.sh --stage 3 --stop_stage 4 --upstream <upstream> --trgspk <trgspk> --checkpoint <checkpoint_path> 41 | ``` 42 | 43 | Generated files from both stages 3 and 4 are saved in `results/checkpoint-XXXXXsteps`. 44 | 45 | - Stage 3 is the decoding stage; a log file is also generated. Mel spectrogram visualizations can be viewed in `plot_mel/`, and the generated waveform files are saved in `wav/`. 46 | - Stage 4 is the evaluation stage using `local/evaluate.py`. MCD, F0RMSE, F0CORR, DUR, CER, and WER are calculated. Detailed results are saved in `evaluation.log`. -------------------------------------------------------------------------------- /egs/TEMPLATE/a2o_vc/cmd.sh: -------------------------------------------------------------------------------- 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> 3 | # e.g. 4 | # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB 5 | # 6 | # Options: 7 | # --time