├── .gitignore ├── LICENSE ├── README.md ├── egs ├── libritts-r-clean+nus-48e_all │ └── data │ │ └── scp │ │ ├── libritts_r_clean+nus-48e_dev.list │ │ ├── libritts_r_clean+nus-48e_dev.scp │ │ ├── libritts_r_clean+nus-48e_eval.list │ │ ├── libritts_r_clean+nus-48e_eval.scp │ │ ├── libritts_r_clean+nus-48e_train_no_dev.list │ │ └── libritts_r_clean+nus-48e_train_no_dev.scp └── namine_ritsu │ └── data │ └── scp │ ├── namine_ritsu_all.scp │ ├── namine_ritsu_dev.scp │ ├── namine_ritsu_eval.scp │ └── namine_ritsu_train_no_dev.scp ├── setup.py └── sifigan ├── __init__.py ├── bin ├── __init__.py ├── anasyn.py ├── compute_statistics.py ├── config │ ├── __init__.py │ ├── anasyn.yaml │ ├── compute_statistics.yaml │ ├── data │ │ ├── libritts-r-clean+nus-48e.yaml │ │ └── namine_ritsu.yaml │ ├── decode.yaml │ ├── discriminator │ │ ├── hifigan.yaml │ │ └── univnet.yaml │ ├── extract_features.yaml │ ├── generator │ │ ├── hifigan.yaml │ │ ├── sifigan.direct.yaml │ │ └── sifigan.yaml │ ├── param_count.yaml │ ├── train.yaml │ └── train │ │ ├── hifigan.yaml │ │ ├── sifigan.yaml │ │ └── sifigan_1000k.yaml ├── decode.py ├── extract_features.py ├── param_count.py └── train.py ├── datasets ├── __init__.py └── audio_feat_dataset.py ├── layers ├── __init__.py ├── cheaptrick.py ├── residual_block.py └── snake.py ├── losses ├── __init__.py ├── adv.py ├── mel.py └── reg.py ├── models ├── __init__.py ├── discriminator.py └── generator.py └── utils ├── __init__.py ├── features.py ├── index.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .eggs 3 | *.egg-info 4 | *.log 5 | *.out 6 | *.wav 7 | *.h5 8 | .venv 9 | 10 | egs/*/exp -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Reo YONEYAMA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Source-Filter HiFi-GAN (SiFi-GAN) 2 | 3 | This repo provides official PyTorch implementation of [SiFi-GAN](https://arxiv.org/abs/2210.15533), a fast and pitch controllable high-fidelity neural vocoder.
4 | For more information, please see our [DEMO](https://chomeyama.github.io/SiFiGAN-Demo/). 5 | 6 | ## Environment setup 7 | 8 | ```bash 9 | $ cd SiFiGAN 10 | $ pip install -e . 11 | ``` 12 | 13 | Please refer to the [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) repo for more details. 14 | 15 | ## Folder architecture 16 | - **egs**: 17 | The folder for projects. 18 | - **egs/namine_ritsu**: 19 | The folder of the [Namine Ritsu](https://www.youtube.com/watch?v=pKeo9IE_L1I) project example. 20 | - **sifigan**: 21 | The folder containing the source code. 22 | 23 | The dataset preparation of the Namine Ritsu database is based on [NNSVS](https://github.com/nnsvs/nnsvs/). 24 | Please refer to it for the procedure and details. 25 | 26 | ## Run 27 | 28 | In this repo, hyperparameters are managed using [Hydra](https://hydra.cc/docs/intro/).<br> 29 | Hydra provides an easy way to dynamically create a hierarchical configuration by composition and to override it through config files and the command line.
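Any value in the composed configuration can be overridden from the command line in the same `key=value` style used throughout this README. For example (the overridden keys below are only illustrations; see the YAML files under `sifigan/bin/config` for everything that can be set):

```bash
# Compose a config from the generator/discriminator/train/data groups,
# then override individual values with dotted key=value pairs.
$ sifigan-train generator=sifigan discriminator=univnet train=sifigan data=namine_ritsu \
    out_dir=exp/debug data.batch_size=8 train.train_max_steps=600000
```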
30 | 31 | ### Dataset preparation 32 | 33 | Prepare your dataset and create scp files listing the paths to each audio file in your dataset (e.g., `egs/namine_ritsu/data/scp/namine_ritsu.scp`).<br>
34 | List files containing the paths to the extracted features are created automatically in the next step (e.g., `egs/namine_ritsu/data/scp/namine_ritsu.list`).<br>
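An scp file is a plain-text list with one audio path per line, relative to the project directory. For example, the first few entries of an evaluation scp file look like this:

```
data/wav/namine_ritsu_ARROW_seg0.wav
data/wav/namine_ritsu_ARROW_seg1.wav
data/wav/namine_ritsu_ARROW_seg2.wav
```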
35 | Note that scp/list files for training/validation/evaluation are needed. 36 | 37 | ### Preprocessing 38 | 39 | ```bash 40 | # Move to the project directory 41 | $ cd egs/namine_ritsu 42 | 43 | # Extract acoustic features (F0, mel-cepstrum, etc.) 44 | # You can customize parameters according to sifigan/bin/config/extract_features.yaml 45 | $ sifigan-extract-features audio=data/scp/namine_ritsu_all.scp 46 | 47 | # Compute statistics of the training data 48 | $ sifigan-compute-statistics feats=data/scp/namine_ritsu_train_no_dev.list stats=data/stats/namine_ritsu_train_no_dev.joblib 49 | ``` 50 | 51 | ### Training 52 | 53 | ```bash 54 | # Train a model, customizing the hyperparameters as you like 55 | $ sifigan-train generator=sifigan discriminator=univnet train=sifigan data=namine_ritsu out_dir=exp/sifigan 56 | ``` 57 | 58 | ### Inference 59 | 60 | ```bash 61 | # Decode with several F0 scaling factors 62 | $ sifigan-decode generator=sifigan data=namine_ritsu out_dir=exp/sifigan checkpoint_steps=400000 f0_factors=[0.5,1.0,2.0] 63 | ``` 64 | 65 | ### Analysis-Synthesis 66 | 67 | ```bash 68 | # WORLD analysis + neural vocoder synthesis 69 | $ sifigan-anasyn generator=sifigan in_dir=your_own_input_wav_dir out_dir=your_own_output_wav_dir stats=pretrained_sifigan/namine_ritsu_train_no_dev.joblib checkpoint_path=pretrained_sifigan/checkpoint-400000steps.pkl f0_factors=[1.0] 70 | ``` 71 | 72 | ### Pretrained model 73 | 74 | ~~I provide a pretrained SiFiGAN model [HERE](https://www.dropbox.com/s/akofngycxxz1dg5/pretrained_sifigan.tar.gz?dl=0), trained on the Namine Ritsu corpus in the same training manner as described in the paper. 75 | You can download it and place it in a directory of your own. Then set the appropriate path to the pretrained model, and the command should work.~~ 76 | 77 | 78 | ~~However, since the Namine Ritsu corpus includes a single female Japanese singer, the model may not work well, especially for male singers. 79 | I am planning to publish another pretrained model trained on a larger dataset including many speakers.~~ 80 | 81 | Because it was trained with code that predates several bug fixes, the release of the model trained on the Namine Ritsu database has been canceled. Instead, a model trained on the following large-scale dataset is available. 82 | 83 | A pretrained model on 24 kHz speech + singing datasets is available [HERE](https://drive.google.com/file/d/1uzqTeumvkPQpfdK_D4U41MDL5-s-hs0l/view?usp=sharing). We used train-clean-100 and train-clean-360 of [LibriTTS-R](https://google.github.io/df-conformer/librittsr/), and [NUS-48E](https://www.smcnus.org/wp-content/uploads/2013/09/05-Pub-NUS-48E.pdf), for training. 84 | Two speakers in NUS-48E, ADIZ and JLEE, were excluded from the training data for evaluation. Also, the wav data of NUS-48E were divided into clips of approximately one second each before the feature extraction step.
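For reference, the one-second clipping can be reproduced with a few lines of Python. This is only a sketch of the idea (file paths are illustrative), not the exact script used to prepare the released model:

```python
# Split a wav file into consecutive clips of roughly one second each.
import soundfile as sf

x, sr = sf.read("input.wav")
clip_len = sr  # one second worth of samples
for i in range(0, len(x), clip_len):
    sf.write(f"input_seg{i // clip_len}.wav", x[i : i + clip_len], sr)
```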
85 | 86 | The feature preprocessing and training commands are as follows: 87 | ```bash 88 | sifigan-extract-features audio=data/scp/libritts_r_clean+nus-48e_train_no_dev.scp minf0=60 maxf0=1000 89 | sifigan-extract-features audio=data/scp/libritts_r_clean+nus-48e_dev.scp minf0=60 maxf0=1000 90 | sifigan-extract-features audio=data/scp/libritts_r_clean+nus-48e_eval.scp minf0=60 maxf0=1000 91 | 92 | sifigan-compute-statistics feats=data/scp/libritts_r_clean+nus-48e_train_no_dev.list stats=data/stats/libritts_r_clean+nus-48e_train_no_dev.joblib 93 | 94 | sifigan-train out_dir=test/sifigan generator=sifigan data=libritts-r-clean+nus-48e train=sifigan_1000k 95 | ``` 96 | 97 | ### Monitor training progress 98 | 99 | ```bash 100 | $ tensorboard --logdir exp 101 | ``` 102 | 103 | ## Citation 104 | If you find the code helpful, please cite the following article. 105 | 106 | ``` 107 | @INPROCEEDINGS{10095298, 108 | author={Yoneyama, Reo and Wu, Yi-Chiao and Toda, Tomoki}, 109 | booktitle={ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 110 | title={{Source-Filter HiFi-GAN: Fast and Pitch Controllable High-Fidelity Neural Vocoder}}, 111 | year={2023}, 112 | volume={}, 113 | number={}, 114 | pages={1-5}, 115 | doi={10.1109/ICASSP49357.2023.10095298} 116 | } 117 | ``` 118 | 119 | ## Authors 120 | 121 | Development: 122 | [Reo Yoneyama](https://chomeyama.github.io/Profile/) @ Nagoya University, Japan<br>
123 | E-mail: `yoneyama.reo@g.sp.m.is.nagoya-u.ac.jp` 124 | 125 | Advisors:
126 | [Yi-Chiao Wu](https://bigpon.github.io/) @ Meta Reality Labs Research, USA
127 | E-mail: `yichiaowu@fb.com`
128 | [Tomoki Toda](https://sites.google.com/site/tomokitoda/) @ Nagoya University, Japan
129 | E-mail: `tomoki@icts.nagoya-u.ac.jp` 130 | -------------------------------------------------------------------------------- /egs/namine_ritsu/data/scp/namine_ritsu_dev.scp: -------------------------------------------------------------------------------- 1 | data/wav/namine_ritsu_BRD_seg0.wav 2 | data/wav/namine_ritsu_BRD_seg1.wav 3 | data/wav/namine_ritsu_BRD_seg10.wav 4 | data/wav/namine_ritsu_BRD_seg11.wav 5 | data/wav/namine_ritsu_BRD_seg12.wav 6 | data/wav/namine_ritsu_BRD_seg13.wav 7 | data/wav/namine_ritsu_BRD_seg14.wav 8 | data/wav/namine_ritsu_BRD_seg15.wav 9 | data/wav/namine_ritsu_BRD_seg16.wav 10 | data/wav/namine_ritsu_BRD_seg2.wav 11 | data/wav/namine_ritsu_BRD_seg3.wav 12 | data/wav/namine_ritsu_BRD_seg4.wav 13 | data/wav/namine_ritsu_BRD_seg5.wav 14 | data/wav/namine_ritsu_BRD_seg6.wav 15 | data/wav/namine_ritsu_BRD_seg7.wav 16 | data/wav/namine_ritsu_BRD_seg8.wav 17 | data/wav/namine_ritsu_BRD_seg9.wav 18 | data/wav/namine_ritsu_Baptism_seg0.wav 19 | data/wav/namine_ritsu_Baptism_seg1.wav 20 | data/wav/namine_ritsu_Baptism_seg10.wav 21 | data/wav/namine_ritsu_Baptism_seg11.wav 22 | data/wav/namine_ritsu_Baptism_seg12.wav 23 | data/wav/namine_ritsu_Baptism_seg13.wav 24 | data/wav/namine_ritsu_Baptism_seg14.wav 25 | data/wav/namine_ritsu_Baptism_seg15.wav 26 | data/wav/namine_ritsu_Baptism_seg16.wav 27 | data/wav/namine_ritsu_Baptism_seg17.wav 28 | data/wav/namine_ritsu_Baptism_seg18.wav 29 | data/wav/namine_ritsu_Baptism_seg19.wav 30 | data/wav/namine_ritsu_Baptism_seg2.wav 31 | data/wav/namine_ritsu_Baptism_seg3.wav 32 | data/wav/namine_ritsu_Baptism_seg4.wav 33 | data/wav/namine_ritsu_Baptism_seg5.wav 34 | data/wav/namine_ritsu_Baptism_seg6.wav 35 | data/wav/namine_ritsu_Baptism_seg7.wav 36 | data/wav/namine_ritsu_Baptism_seg8.wav 37 | data/wav/namine_ritsu_Baptism_seg9.wav 38 | data/wav/namine_ritsu_COZMIC_HEART_seg0.wav 39 | data/wav/namine_ritsu_COZMIC_HEART_seg1.wav 40 | data/wav/namine_ritsu_COZMIC_HEART_seg10.wav 41 | data/wav/namine_ritsu_COZMIC_HEART_seg11.wav 42 | data/wav/namine_ritsu_COZMIC_HEART_seg12.wav 43 | data/wav/namine_ritsu_COZMIC_HEART_seg13.wav 44 | data/wav/namine_ritsu_COZMIC_HEART_seg14.wav 45 | data/wav/namine_ritsu_COZMIC_HEART_seg15.wav 46 | data/wav/namine_ritsu_COZMIC_HEART_seg16.wav 47 | data/wav/namine_ritsu_COZMIC_HEART_seg17.wav 48 | data/wav/namine_ritsu_COZMIC_HEART_seg18.wav 49 | data/wav/namine_ritsu_COZMIC_HEART_seg19.wav 50 | data/wav/namine_ritsu_COZMIC_HEART_seg2.wav 51 | data/wav/namine_ritsu_COZMIC_HEART_seg20.wav 52 | data/wav/namine_ritsu_COZMIC_HEART_seg21.wav 53 | data/wav/namine_ritsu_COZMIC_HEART_seg3.wav 54 | data/wav/namine_ritsu_COZMIC_HEART_seg4.wav 55 | data/wav/namine_ritsu_COZMIC_HEART_seg5.wav 56 | data/wav/namine_ritsu_COZMIC_HEART_seg6.wav 57 | data/wav/namine_ritsu_COZMIC_HEART_seg7.wav 58 | data/wav/namine_ritsu_COZMIC_HEART_seg8.wav 59 | data/wav/namine_ritsu_COZMIC_HEART_seg9.wav 60 | data/wav/namine_ritsu_Choir_seg0.wav 61 | data/wav/namine_ritsu_Choir_seg1.wav 62 | data/wav/namine_ritsu_Choir_seg10.wav 63 | data/wav/namine_ritsu_Choir_seg11.wav 64 | data/wav/namine_ritsu_Choir_seg12.wav 65 | data/wav/namine_ritsu_Choir_seg13.wav 66 | data/wav/namine_ritsu_Choir_seg14.wav 67 | data/wav/namine_ritsu_Choir_seg15.wav 68 | data/wav/namine_ritsu_Choir_seg16.wav 69 | data/wav/namine_ritsu_Choir_seg17.wav 70 | data/wav/namine_ritsu_Choir_seg18.wav 71 | data/wav/namine_ritsu_Choir_seg2.wav 72 | data/wav/namine_ritsu_Choir_seg3.wav 73 | data/wav/namine_ritsu_Choir_seg4.wav 74 | 
data/wav/namine_ritsu_Choir_seg5.wav 75 | data/wav/namine_ritsu_Choir_seg6.wav 76 | data/wav/namine_ritsu_Choir_seg7.wav 77 | data/wav/namine_ritsu_Choir_seg8.wav 78 | data/wav/namine_ritsu_Choir_seg9.wav 79 | data/wav/namine_ritsu_Creuzer_seg0.wav 80 | data/wav/namine_ritsu_Creuzer_seg1.wav 81 | data/wav/namine_ritsu_Creuzer_seg10.wav 82 | data/wav/namine_ritsu_Creuzer_seg11.wav 83 | data/wav/namine_ritsu_Creuzer_seg12.wav 84 | data/wav/namine_ritsu_Creuzer_seg2.wav 85 | data/wav/namine_ritsu_Creuzer_seg3.wav 86 | data/wav/namine_ritsu_Creuzer_seg4.wav 87 | data/wav/namine_ritsu_Creuzer_seg5.wav 88 | data/wav/namine_ritsu_Creuzer_seg6.wav 89 | data/wav/namine_ritsu_Creuzer_seg7.wav 90 | data/wav/namine_ritsu_Creuzer_seg8.wav 91 | data/wav/namine_ritsu_Creuzer_seg9.wav 92 | -------------------------------------------------------------------------------- /egs/namine_ritsu/data/scp/namine_ritsu_eval.scp: -------------------------------------------------------------------------------- 1 | data/wav/namine_ritsu_1st_color_seg0.wav 2 | data/wav/namine_ritsu_1st_color_seg1.wav 3 | data/wav/namine_ritsu_1st_color_seg10.wav 4 | data/wav/namine_ritsu_1st_color_seg11.wav 5 | data/wav/namine_ritsu_1st_color_seg12.wav 6 | data/wav/namine_ritsu_1st_color_seg13.wav 7 | data/wav/namine_ritsu_1st_color_seg14.wav 8 | data/wav/namine_ritsu_1st_color_seg15.wav 9 | data/wav/namine_ritsu_1st_color_seg16.wav 10 | data/wav/namine_ritsu_1st_color_seg17.wav 11 | data/wav/namine_ritsu_1st_color_seg18.wav 12 | data/wav/namine_ritsu_1st_color_seg2.wav 13 | data/wav/namine_ritsu_1st_color_seg3.wav 14 | data/wav/namine_ritsu_1st_color_seg4.wav 15 | data/wav/namine_ritsu_1st_color_seg5.wav 16 | data/wav/namine_ritsu_1st_color_seg6.wav 17 | data/wav/namine_ritsu_1st_color_seg7.wav 18 | data/wav/namine_ritsu_1st_color_seg8.wav 19 | data/wav/namine_ritsu_1st_color_seg9.wav 20 | data/wav/namine_ritsu_ARROW_seg0.wav 21 | data/wav/namine_ritsu_ARROW_seg1.wav 22 | data/wav/namine_ritsu_ARROW_seg10.wav 23 | data/wav/namine_ritsu_ARROW_seg11.wav 24 | data/wav/namine_ritsu_ARROW_seg12.wav 25 | data/wav/namine_ritsu_ARROW_seg13.wav 26 | data/wav/namine_ritsu_ARROW_seg2.wav 27 | data/wav/namine_ritsu_ARROW_seg3.wav 28 | data/wav/namine_ritsu_ARROW_seg4.wav 29 | data/wav/namine_ritsu_ARROW_seg5.wav 30 | data/wav/namine_ritsu_ARROW_seg6.wav 31 | data/wav/namine_ritsu_ARROW_seg7.wav 32 | data/wav/namine_ritsu_ARROW_seg8.wav 33 | data/wav/namine_ritsu_ARROW_seg9.wav 34 | data/wav/namine_ritsu_BC_seg0.wav 35 | data/wav/namine_ritsu_BC_seg1.wav 36 | data/wav/namine_ritsu_BC_seg10.wav 37 | data/wav/namine_ritsu_BC_seg11.wav 38 | data/wav/namine_ritsu_BC_seg12.wav 39 | data/wav/namine_ritsu_BC_seg13.wav 40 | data/wav/namine_ritsu_BC_seg14.wav 41 | data/wav/namine_ritsu_BC_seg15.wav 42 | data/wav/namine_ritsu_BC_seg16.wav 43 | data/wav/namine_ritsu_BC_seg17.wav 44 | data/wav/namine_ritsu_BC_seg18.wav 45 | data/wav/namine_ritsu_BC_seg19.wav 46 | data/wav/namine_ritsu_BC_seg2.wav 47 | data/wav/namine_ritsu_BC_seg20.wav 48 | data/wav/namine_ritsu_BC_seg21.wav 49 | data/wav/namine_ritsu_BC_seg3.wav 50 | data/wav/namine_ritsu_BC_seg4.wav 51 | data/wav/namine_ritsu_BC_seg5.wav 52 | data/wav/namine_ritsu_BC_seg6.wav 53 | data/wav/namine_ritsu_BC_seg7.wav 54 | data/wav/namine_ritsu_BC_seg8.wav 55 | data/wav/namine_ritsu_BC_seg9.wav 56 | data/wav/namine_ritsu_Closetoyou_seg0.wav 57 | data/wav/namine_ritsu_Closetoyou_seg1.wav 58 | data/wav/namine_ritsu_Closetoyou_seg10.wav 59 | data/wav/namine_ritsu_Closetoyou_seg11.wav 60 | 
data/wav/namine_ritsu_Closetoyou_seg12.wav 61 | data/wav/namine_ritsu_Closetoyou_seg13.wav 62 | data/wav/namine_ritsu_Closetoyou_seg14.wav 63 | data/wav/namine_ritsu_Closetoyou_seg15.wav 64 | data/wav/namine_ritsu_Closetoyou_seg16.wav 65 | data/wav/namine_ritsu_Closetoyou_seg17.wav 66 | data/wav/namine_ritsu_Closetoyou_seg18.wav 67 | data/wav/namine_ritsu_Closetoyou_seg19.wav 68 | data/wav/namine_ritsu_Closetoyou_seg2.wav 69 | data/wav/namine_ritsu_Closetoyou_seg20.wav 70 | data/wav/namine_ritsu_Closetoyou_seg21.wav 71 | data/wav/namine_ritsu_Closetoyou_seg22.wav 72 | data/wav/namine_ritsu_Closetoyou_seg23.wav 73 | data/wav/namine_ritsu_Closetoyou_seg24.wav 74 | data/wav/namine_ritsu_Closetoyou_seg25.wav 75 | data/wav/namine_ritsu_Closetoyou_seg26.wav 76 | data/wav/namine_ritsu_Closetoyou_seg27.wav 77 | data/wav/namine_ritsu_Closetoyou_seg3.wav 78 | data/wav/namine_ritsu_Closetoyou_seg4.wav 79 | data/wav/namine_ritsu_Closetoyou_seg5.wav 80 | data/wav/namine_ritsu_Closetoyou_seg6.wav 81 | data/wav/namine_ritsu_Closetoyou_seg7.wav 82 | data/wav/namine_ritsu_Closetoyou_seg8.wav 83 | data/wav/namine_ritsu_Closetoyou_seg9.wav 84 | data/wav/namine_ritsu_ERROR_seg0.wav 85 | data/wav/namine_ritsu_ERROR_seg1.wav 86 | data/wav/namine_ritsu_ERROR_seg10.wav 87 | data/wav/namine_ritsu_ERROR_seg11.wav 88 | data/wav/namine_ritsu_ERROR_seg12.wav 89 | data/wav/namine_ritsu_ERROR_seg13.wav 90 | data/wav/namine_ritsu_ERROR_seg14.wav 91 | data/wav/namine_ritsu_ERROR_seg15.wav 92 | data/wav/namine_ritsu_ERROR_seg16.wav 93 | data/wav/namine_ritsu_ERROR_seg17.wav 94 | data/wav/namine_ritsu_ERROR_seg18.wav 95 | data/wav/namine_ritsu_ERROR_seg19.wav 96 | data/wav/namine_ritsu_ERROR_seg2.wav 97 | data/wav/namine_ritsu_ERROR_seg20.wav 98 | data/wav/namine_ritsu_ERROR_seg21.wav 99 | data/wav/namine_ritsu_ERROR_seg22.wav 100 | data/wav/namine_ritsu_ERROR_seg23.wav 101 | data/wav/namine_ritsu_ERROR_seg24.wav 102 | data/wav/namine_ritsu_ERROR_seg3.wav 103 | data/wav/namine_ritsu_ERROR_seg4.wav 104 | data/wav/namine_ritsu_ERROR_seg5.wav 105 | data/wav/namine_ritsu_ERROR_seg6.wav 106 | data/wav/namine_ritsu_ERROR_seg7.wav 107 | data/wav/namine_ritsu_ERROR_seg8.wav 108 | data/wav/namine_ritsu_ERROR_seg9.wav 109 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Setup Source-Filter HiFiGAN Library.""" 4 | 5 | import os 6 | import sys 7 | from distutils.version import LooseVersion 8 | 9 | import pip 10 | from setuptools import find_packages, setup 11 | 12 | if LooseVersion(sys.version) < LooseVersion("3.8"): 13 | raise RuntimeError( 14 | "sifigan requires Python>=3.8, " "but your Python is {}".format(sys.version) 15 | ) 16 | if LooseVersion(pip.__version__) < LooseVersion("21.0.0"): 17 | raise RuntimeError( 18 | "pip>=21.0.0 is required, but your pip is {}. 
" 19 | 'Try again after "pip install -U pip"'.format(pip.__version__) 20 | ) 21 | 22 | requirements = { 23 | "install": [ 24 | "wheel", 25 | "torch>=1.9.0", 26 | "torchaudio>=0.8.1", 27 | "setuptools>=38.5.1", 28 | "librosa>=0.8.0", 29 | "soundfile>=0.10.2", 30 | "tensorboardX>=2.2", 31 | "matplotlib>=3.1.0", 32 | "PyYAML>=3.12", 33 | "tqdm>=4.26.1", 34 | "h5py>=2.10.0", 35 | "pyworld>=0.2.12", 36 | "sprocket-vc", 37 | "protobuf<=3.19.0", 38 | "hydra-core>=1.2", 39 | ], 40 | "setup": [ 41 | "numpy", 42 | "pytest-runner", 43 | ], 44 | } 45 | entry_points = { 46 | "console_scripts": [ 47 | "sifigan-extract-features=sifigan.bin.extract_features:main", 48 | "sifigan-compute-statistics=sifigan.bin.compute_statistics:main", 49 | "sifigan-train=sifigan.bin.train:main", 50 | "sifigan-decode=sifigan.bin.decode:main", 51 | "sifigan-anasyn=sifigan.bin.anasyn:main", 52 | "sifigan-param-count=sifigan.bin.param_count:main", 53 | ] 54 | } 55 | 56 | install_requires = requirements["install"] 57 | setup_requires = requirements["setup"] 58 | extras_require = { 59 | k: v for k, v in requirements.items() if k not in ["install", "setup"] 60 | } 61 | 62 | dirname = os.path.dirname(__file__) 63 | setup( 64 | name="sifigan", 65 | version="0.1", 66 | url="http://github.com/chomeyama/SourceFilterHiFiGAN", 67 | author="Reo Yoneyama", 68 | author_email="yoneyama.reo@g.sp.m.is.nagoya-u.ac.jp", 69 | description="Source-Filter HiFiGAN implementation", 70 | long_description_content_type="text/markdown", 71 | long_description=open(os.path.join(dirname, "README.md"), encoding="utf-8").read(), 72 | license="MIT License", 73 | packages=find_packages(include=["sifigan*"]), 74 | install_requires=install_requires, 75 | setup_requires=setup_requires, 76 | extras_require=extras_require, 77 | entry_points=entry_points, 78 | classifiers=[ 79 | "Programming Language :: Python :: 3.9.5", 80 | "Intended Audience :: Science/Research", 81 | "Operating System :: POSIX :: Linux", 82 | "License :: OSI Approved :: MIT License", 83 | "Topic :: Software Development :: Libraries :: Python Modules", 84 | ], 85 | ) 86 | -------------------------------------------------------------------------------- /sifigan/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __version__ = "0.0.1" 4 | -------------------------------------------------------------------------------- /sifigan/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/SiFiGAN/cdb20daeb4da4a126077819091706ee90522c7a8/sifigan/bin/__init__.py -------------------------------------------------------------------------------- /sifigan/bin/anasyn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Analysis-synthesis script. 
7 | 8 | Analysis: WORLD vocoder 9 | Synthesis: Pre-trained neural vocoder 10 | 11 | """ 12 | 13 | # A logger for this file 14 | import copy 15 | import os 16 | from logging import getLogger 17 | 18 | import hydra 19 | import librosa 20 | import numpy as np 21 | import pysptk 22 | import pyworld as pw 23 | import soundfile as sf 24 | import torch 25 | from hydra.utils import instantiate, to_absolute_path 26 | from joblib import load 27 | from omegaconf import DictConfig 28 | from scipy.interpolate import interp1d 29 | from sifigan.utils.features import SignalGenerator, dilated_factor 30 | 31 | logger = getLogger(__name__) 32 | 33 | # All-pass-filter coefficients {key -> sampling rate : value -> coefficient} 34 | ALPHA = { 35 | 8000: 0.312, 36 | 12000: 0.369, 37 | 16000: 0.410, 38 | 22050: 0.455, 39 | 24000: 0.466, 40 | 32000: 0.504, 41 | 44100: 0.544, 42 | 48000: 0.554, 43 | } 44 | 45 | 46 | def convert_continuos_f0(f0): 47 | # get uv information as binary 48 | uv = np.float32(f0 != 0) 49 | # get start and end of f0 50 | if (f0 == 0).all(): 51 | logger.warning("all of the f0 values are 0.") 52 | return uv, f0 53 | start_f0 = f0[f0 != 0][0] 54 | end_f0 = f0[f0 != 0][-1] 55 | # padding start and end of f0 sequence 56 | cont_f0 = copy.deepcopy(f0) 57 | start_idx = np.where(cont_f0 == start_f0)[0][0] 58 | end_idx = np.where(cont_f0 == end_f0)[0][-1] 59 | cont_f0[:start_idx] = start_f0 60 | cont_f0[end_idx:] = end_f0 61 | # get non-zero frame index 62 | nz_frames = np.where(cont_f0 != 0)[0] 63 | # perform linear interpolation 64 | f = interp1d(nz_frames, cont_f0[nz_frames]) 65 | cont_f0 = f(np.arange(0, cont_f0.shape[0])) 66 | 67 | return uv, cont_f0 68 | 69 | 70 | @torch.no_grad() 71 | @hydra.main(version_base=None, config_path="config", config_name="anasyn") 72 | def main(config: DictConfig) -> None: 73 | """Run analysis-synthesis process.""" 74 | 75 | np.random.seed(config.seed) 76 | torch.manual_seed(config.seed) 77 | torch.cuda.manual_seed(config.seed) 78 | os.environ["PYTHONHASHSEED"] = str(config.seed) 79 | 80 | # set device 81 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 82 | logger.info(f"Synthesize on {device}.") 83 | 84 | # load pre-trained model from checkpoint file 85 | model = instantiate(config.generator) 86 | state_dict = torch.load(to_absolute_path(config.checkpoint_path), map_location="cpu") 87 | model.load_state_dict(state_dict["model"]["generator"]) 88 | logger.info(f"Loaded model parameters from {config.checkpoint_path}.") 89 | model.remove_weight_norm() 90 | model.eval().to(device) 91 | 92 | # get scaler 93 | scaler = load(config.stats) 94 | 95 | # get data processor 96 | signal_generator = SignalGenerator( 97 | sample_rate=config.sample_rate, 98 | hop_size=int(config.sample_rate * config.frame_period * 0.001), 99 | sine_amp=config.sine_amp, 100 | noise_amp=config.noise_amp, 101 | signal_types=config.signal_types, 102 | ) 103 | 104 | # create output directory 105 | os.makedirs(config.out_dir, exist_ok=True) 106 | 107 | # loop all wav files in in_dir 108 | for wav_file in os.listdir(config.in_dir): 109 | logger.info(f"Start processing {wav_file}") 110 | if os.path.splitext(wav_file)[1] != ".wav": 111 | continue 112 | wav_path = os.path.join(config.in_dir, wav_file) 113 | 114 | # WORLD analysis 115 | x, sr = sf.read(to_absolute_path(wav_path)) 116 | if sr != config.sample_rate: 117 | x = librosa.resample(x, orig_sr=sr, target_sr=config.sample_rate) 118 | f0_, t = pw.harvest( 119 | x, 120 | config.sample_rate, 121 | f0_floor=config.f0_floor, 122
| f0_ceil=config.f0_ceil, 123 | frame_period=config.frame_period, 124 | ) 125 | sp = pw.cheaptrick(x, f0_, t, config.sample_rate) 126 | ap = pw.d4c(x, f0_, t, config.sample_rate) 127 | mcep = pysptk.sp2mc(sp, order=config.mcep_dim, alpha=ALPHA[config.sample_rate]) 128 | mcap = pysptk.sp2mc(ap, order=config.mcap_dim, alpha=ALPHA[config.sample_rate]) 129 | bap = pw.code_aperiodicity(ap, config.sample_rate) 130 | 131 | # prepare f0 related features 132 | uv, cf0_ = convert_continuos_f0(f0_) 133 | uv = uv[:, np.newaxis] # (T, 1) 134 | f0_ = f0_[:, np.newaxis] # (T, 1) 135 | cf0_ = cf0_[:, np.newaxis] # (T, 1) 136 | 137 | for f0_factor in config.f0_factors: 138 | 139 | f0 = f0_ * f0_factor 140 | cf0 = cf0_ * f0_factor 141 | 142 | # prepare input acoustic features 143 | c = [] 144 | for feat_type in config.aux_feats: 145 | if feat_type == "f0": 146 | c += [scaler[feat_type].transform(f0)] 147 | elif feat_type == "cf0": 148 | c += [scaler[feat_type].transform(cf0)] 149 | elif feat_type == "uv": 150 | c += [scaler[feat_type].transform(uv)] 151 | elif feat_type == "mcep": 152 | c += [scaler[feat_type].transform(mcep)] 153 | elif feat_type == "mcap": 154 | c += [scaler[feat_type].transform(mcap)] 155 | elif feat_type == "bap": 156 | c += [scaler[feat_type].transform(bap)] 157 | c = np.concatenate(c, axis=1) 158 | 159 | # prepare dense factors 160 | dfs = [] 161 | for df, us in zip( 162 | config.dense_factors, 163 | np.cumprod(config.generator.upsample_scales), 164 | ): 165 | dfs += [ 166 | np.repeat(dilated_factor(cf0, config.sample_rate, df), us) 167 | if config.df_f0_type == "cf0" 168 | else np.repeat(dilated_factor(f0, config.sample_rate, df), us) 169 | ] 170 | 171 | # convert to torch tensors 172 | f0 = torch.FloatTensor(f0).view(1, 1, -1).to(device) 173 | cf0 = torch.FloatTensor(cf0).view(1, 1, -1).to(device) 174 | c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device) 175 | dfs = [torch.FloatTensor(np.array(df)).view(1, 1, -1).to(device) for df in dfs] 176 | 177 | # generate input signals 178 | if config.sine_f0_type == "cf0": 179 | in_signal = signal_generator(cf0) 180 | elif config.sine_f0_type == "f0": 181 | in_signal = signal_generator(f0) 182 | 183 | # synthesize with the neural vocoder 184 | y = model(in_signal, c, dfs)[0] 185 | 186 | # save output signal as PCM 16 bit wav file 187 | out_path = os.path.join(config.out_dir, wav_file).replace(".wav", f"_{f0_factor:.2f}.wav") 188 | sf.write( 189 | to_absolute_path(out_path), 190 | y.view(-1).cpu().numpy(), 191 | config.sample_rate, 192 | "PCM_16", 193 | ) 194 | 195 | 196 | if __name__ == "__main__": 197 | main() 198 | -------------------------------------------------------------------------------- /sifigan/bin/compute_statistics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Feature statistics computing script. 
7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | 12 | """ 13 | 14 | import os 15 | from logging import getLogger 16 | 17 | import hydra 18 | import numpy as np 19 | from hydra.utils import to_absolute_path 20 | from joblib import dump, load 21 | from omegaconf import DictConfig, OmegaConf 22 | from sifigan.utils import read_hdf5, read_txt 23 | from sklearn.preprocessing import StandardScaler 24 | 25 | # A logger for this file 26 | logger = getLogger(__name__) 27 | 28 | 29 | def calc_stats(file_list, config): 30 | """Calculate statistics. 31 | 32 | Args: 33 | file_list (list): File list. 34 | config (dict): Dictionary of config. 35 | 36 | """ 37 | # define scalers 38 | scaler = load(config.stats) if os.path.isfile(config.stats) else {} 39 | for feat_type in config.feat_types: 40 | scaler[feat_type] = StandardScaler() 41 | 42 | # process all of the data 43 | for i, filename in enumerate(file_list): 44 | logger.info(f"now processing {filename} ({i + 1}/{len(file_list)})") 45 | for feat_type in config.feat_types: 46 | if feat_type == "f0": 47 | f0 = read_hdf5(to_absolute_path(filename), "/f0") 48 | feat = np.expand_dims(f0[f0 > 0], axis=-1) 49 | elif feat_type == "lcf0": 50 | cf0 = read_hdf5(to_absolute_path(filename), "/cf0") 51 | feat = np.log(cf0) 52 | else: 53 | feat = read_hdf5(to_absolute_path(filename), f"/{feat_type}") 54 | if feat.shape[0] == 0: 55 | logger.warning(f"feat length is 0 {filename}/{feat_type}") 56 | continue 57 | scaler[feat_type].partial_fit(feat) 58 | 59 | if not os.path.exists(os.path.dirname(config.stats)): 60 | os.makedirs(os.path.dirname(config.stats)) 61 | dump(scaler, to_absolute_path(config.stats)) 62 | logger.info(f"Successfully saved statistics to {config.stats}.") 63 | 64 | 65 | @hydra.main(version_base=None, config_path="config", config_name="compute_statistics") 66 | def main(config: DictConfig): 67 | # show argument 68 | logger.info(OmegaConf.to_yaml(config)) 69 | 70 | # read file list 71 | file_list = read_txt(to_absolute_path(config.feats)) 72 | logger.info(f"number of utterances = {len(file_list)}") 73 | 74 | # calculate statistics 75 | calc_stats(file_list, config) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 |
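The resulting statistics file is a joblib-serialized dict mapping each feature type to a fitted scikit-learn `StandardScaler`. A minimal sketch of how downstream scripts such as `anasyn.py` consume it (the stats path and feature shape below are illustrative):

```python
# Load the statistics dict and normalize one feature sequence with it.
import numpy as np
from joblib import load

scaler = load("data/stats/namine_ritsu_train_no_dev.joblib")
mcep = np.random.randn(100, 40)  # dummy (n_frames, n_dims) mel-cepstrum
mcep_norm = scaler["mcep"].transform(mcep)  # per-dimension zero-mean/unit-variance
```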
-------------------------------------------------------------------------------- /sifigan/bin/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/SiFiGAN/cdb20daeb4da4a126077819091706ee90522c7a8/sifigan/bin/config/__init__.py -------------------------------------------------------------------------------- /sifigan/bin/config/anasyn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - generator: sifigan 6 | 7 | hydra: 8 | run: 9 | dir: ./ 10 | output_subdir: null 11 | job_logging: 12 | formatters: 13 | simple: 14 | format: '[%(asctime)s][%(levelname)s][%(module)s | %(lineno)s] %(message)s' 15 | disable_existing_loggers: false 16 | 17 | in_dir: # Path to the directory containing the wav files you want to process. 18 | out_dir: # Path to directory to save the synthesized wavs. 19 | stats: # Path to statistics file. 20 | checkpoint_path: # Path to pre-trained model. 21 | f0_factors: [1.00] # F0 scaling factors. 22 | seed: 100 # Seed number for random numbers. 23 | 24 | # The same parameters as used in training should be set below. 25 | sample_rate: 24000 # Sampling rate. 26 | frame_period: 5 # Frameshift in ms. 27 | f0_floor: 100 # Minimum F0 for WORLD F0 analysis. 28 | f0_ceil: 840 # Maximum F0 for WORLD F0 analysis. 29 | mcep_dim: 39 # Number of dimensions of mel-generalized cepstrum. 30 | mcap_dim: 19 # Number of dimensions of mel-cepstral aperiodicity. 31 | aux_feats: ["mcep", "bap"] # Input acoustic features. 32 | dense_factors: [0.5, 1, 4, 8] # Dense factor in PDCNNs. 33 | df_f0_type: "cf0" # F0 type for dilation factor ("f0" or "cf0"). 34 | sine_amp: 0.1 # Sine amplitude. 35 | noise_amp: 0.003 # Noise amplitude. 36 | sine_f0_type: "cf0" # F0 type for sine signal ("f0" or "cf0"). 37 | signal_types: ["sine"] # List of input signal types. -------------------------------------------------------------------------------- /sifigan/bin/config/compute_statistics.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | hydra: 4 | run: 5 | dir: ./ 6 | output_subdir: null 7 | job_logging: 8 | formatters: 9 | simple: 10 | format: '[%(asctime)s][%(levelname)s][%(module)s | %(lineno)s] %(message)s' 11 | disable_existing_loggers: false 12 | 13 | feats: data/scp/namine_ritsu_train_no_dev.list # List file of input features. 14 | stats: data/stats/namine_ritsu_train_no_dev.joblib # Path to file to output statistics. 15 | feat_types: ['f0', 'cf0', 'uv', 'mcep', 'bap'] # Feature types. 16 | -------------------------------------------------------------------------------- /sifigan/bin/config/data/libritts-r-clean+nus-48e.yaml: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | train_audio: data/scp/libritts_r_clean+nus-48e_train_no_dev.scp # List file of training audio files. 3 | train_feat: data/scp/libritts_r_clean+nus-48e_train_no_dev.list # List file of training feature files. 4 | valid_audio: data/scp/libritts_r_clean+nus-48e_dev.scp # List file of validation audio files. 5 | valid_feat: data/scp/libritts_r_clean+nus-48e_dev.list # List file of validation feature files. 6 | eval_feat: data/scp/libritts_r_clean+nus-48e_eval.list # List file of evaluation feature files for decoding. 7 | stats: data/stats/libritts_r_clean+nus-48e_train_no_dev.joblib # Path to the file of statistics. 8 | allow_cache: false # Whether to allow cache in dataset. If true, it requires extra CPU memory. 9 | 10 | # Feature settings 11 | sample_rate: 24000 # Sampling rate. 12 | hop_size: 120 # Hop size. 13 | dense_factors: [0.5, 1, 4, 8] # Dense factor in PDCNNs. 14 | sine_amp: 0.1 # Sine amplitude. 15 | noise_amp: 0.003 # Noise amplitude. 16 | signal_types: ["sine"] # List of input signal types for generator. 17 | sine_f0_type: "cf0" # F0 type for sine signal ("f0" or "cf0"). 18 | df_f0_type: "cf0" # F0 type for dilation factor ("f0" or "cf0"). 19 | aux_feats: ["mcep", "bap"] # Auxiliary features. 20 | # "uv": V/UV binary. 21 | # "f0": discrete F0. 22 | # "mcep": mel-cepstral envelope. 23 | # "cf0": continuous F0. 24 | # "mcap": mel-cepstral aperiodicity. 25 | # "bap": coded aperiodicity. 26 | # "logmsp": log mel-spectrogram. 27 | 28 | # Collater setting 29 | batch_max_length: 8400 # Length of each audio clip in a batch. Make sure it is divisible by hop_size (8400 samples = 70 frames at hop_size 120).
30 | 31 | # Data loader setting 32 | batch_size: 16 # Batch size. 33 | num_workers: 1 # Number of workers in PyTorch DataLoader. 34 | pin_memory: true # Whether to pin memory in PyTorch DataLoader. 35 | 36 | # Other setting 37 | remove_short_samples: true # Whether to remove samples whose length is less than batch_max_length. 38 | -------------------------------------------------------------------------------- /sifigan/bin/config/data/namine_ritsu.yaml: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | train_audio: data/scp/namine_ritsu_train_no_dev.scp # List file of training audio files. 3 | train_feat: data/scp/namine_ritsu_train_no_dev.list # List file of training feature files. 4 | valid_audio: data/scp/namine_ritsu_dev.scp # List file of validation audio files. 5 | valid_feat: data/scp/namine_ritsu_dev.list # List file of validation feature files. 6 | eval_feat: data/scp/namine_ritsu_eval.list # List file of evaluation feature files for decoding. 7 | stats: data/stats/namine_ritsu_train_no_dev.joblib # Path to the file of statistics. 8 | allow_cache: false # Whether to allow cache in dataset. If true, it requires extra CPU memory. 9 | 10 | # Feature settings 11 | sample_rate: 24000 # Sampling rate. 12 | hop_size: 120 # Hop size. 13 | dense_factors: [0.5, 1, 4, 8] # Dense factor in PDCNNs. 14 | sine_amp: 0.1 # Sine amplitude. 15 | noise_amp: 0.003 # Noise amplitude. 16 | signal_types: ["sine"] # List of input signal types for generator. 17 | sine_f0_type: "cf0" # F0 type for sine signal ("f0" or "cf0"). 18 | df_f0_type: "cf0" # F0 type for dilation factor ("f0" or "cf0"). 19 | aux_feats: ["mcep", "bap"] # Auxiliary features. 20 | # "uv": V/UV binary. 21 | # "f0": discrete F0. 22 | # "mcep": mel-cepstral envelope. 23 | # "cf0": continuous F0. 24 | # "mcap": mel-cepstral aperiodicity. 25 | # "bap": coded aperiodicity. 26 | # "logmsp": log mel-spectrogram. 27 | 28 | # Collater setting 29 | batch_max_length: 8400 # Length of each audio clip in a batch. Make sure it is divisible by hop_size. 30 | 31 | # Data loader setting 32 | batch_size: 16 # Batch size. 33 | num_workers: 1 # Number of workers in PyTorch DataLoader. 34 | pin_memory: true # Whether to pin memory in PyTorch DataLoader. 35 | 36 | # Other setting 37 | remove_short_samples: true # Whether to remove samples whose length is less than batch_max_length. 38 | -------------------------------------------------------------------------------- /sifigan/bin/config/decode.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - generator: sifigan 6 | - data: namine_ritsu 7 | 8 | hydra: 9 | run: 10 | dir: ./ 11 | output_subdir: null 12 | job_logging: 13 | formatters: 14 | simple: 15 | format: '[%(asctime)s][%(levelname)s][%(module)s | %(lineno)s] %(message)s' 16 | disable_existing_loggers: false 17 | 18 | out_dir: # Directory to output decoding results. 19 | checkpoint_path: # Path to the checkpoint of the pre-trained model. 20 | checkpoint_steps: 400000 # Training steps of the checkpoint to load when checkpoint_path is not given. 21 | seed: 100 # Seed number for random numbers. 22 | save_source: false # Whether to save source excitation signals. 23 | f0_factors: [1.00] # F0 scaling factors.
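Because an F0 scaling factor of 2^(n/12) corresponds to a pitch shift of n semitones, passing a list of factors decodes each utterance at several pitches in a single run; the command below (taken from the README example) produces one octave down, unshifted, and one octave up:

```bash
# One octave down / unshifted / one octave up in a single decoding run.
sifigan-decode generator=sifigan data=namine_ritsu out_dir=exp/sifigan \
    checkpoint_steps=400000 f0_factors=[0.5,1.0,2.0]
```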
-------------------------------------------------------------------------------- /sifigan/bin/config/discriminator/hifigan.yaml: -------------------------------------------------------------------------------- 1 | _target_: sifigan.models.HiFiGANMultiScaleMultiPeriodDiscriminator 2 | scales: 3 # Number of multi-scale discriminators. 3 | scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator. 4 | scale_downsample_pooling_params: # Params for down-sampling CNNs. 5 | kernel_size: 4 # Pooling kernel size. 6 | stride: 2 # Pooling stride. 7 | padding: 2 # Padding size. 8 | scale_discriminator_params: # Params for HiFiGAN scale discriminator. 9 | in_channels: 1 # Number of input channels. 10 | out_channels: 1 # Number of output channels. 11 | kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. 12 | channels: 128 # Initial number of channels. 13 | max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. 14 | max_groups: 16 # Maximum number of groups in downsampling conv layers. 15 | bias: true # Whether to use bias parameters. 16 | downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. 17 | nonlinear_activation: "LeakyReLU" # Nonlinear activation. 18 | nonlinear_activation_params: # Nonlinear activation parameters. 19 | negative_slope: 0.1 20 | follow_official_norm: true # Whether to follow the official norm setting. 21 | periods: [2, 3, 5, 7, 11] # List of periods for multi-period discriminator. 22 | period_discriminator_params: # Params for HiFiGAN period discriminator. 23 | in_channels: 1 # Number of input channels. 24 | out_channels: 1 # Number of output channels. 25 | kernel_sizes: [5, 3] # List of kernel sizes. 26 | channels: 32 # Initial number of channels. 27 | downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. 28 | max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. 29 | bias: true # Whether to use bias parameter in conv layer. 30 | nonlinear_activation: "LeakyReLU" # Nonlinear activation. 31 | nonlinear_activation_params: # Nonlinear activation parameters. 32 | negative_slope: 0.1 33 | use_weight_norm: true # Whether to apply weight normalization. 34 | use_spectral_norm: false # Whether to apply spectral normalization. 35 |
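Each discriminator config is a self-contained Hydra node whose `_target_` names the class to instantiate. A quick sketch for inspecting one outside of training (the snippet only counts parameters; the bundled `sifigan-param-count` entry point plays a similar role for generators):

```python
# Build the discriminator described by a YAML config and count its parameters.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("sifigan/bin/config/discriminator/hifigan.yaml")
model = instantiate(cfg)
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters")
```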
-------------------------------------------------------------------------------- /sifigan/bin/config/discriminator/univnet.yaml: -------------------------------------------------------------------------------- 1 | _target_: sifigan.models.UnivNetMultiResolutionMultiPeriodDiscriminator 2 | fft_sizes: [1024, 2048, 512] # FFT sizes for each spectral discriminator. 3 | hop_sizes: [120, 240, 50] # Hop sizes for each spectral discriminator. 4 | win_lengths: [600, 1200, 240] # Window lengths for each spectral discriminator. 5 | window: "hann_window" # Name of window function. 6 | spectral_discriminator_params: # Params for UnivNet spectral discriminator. 7 | channels: 32 # Number of channels for conv layer. 8 | kernel_sizes: # List of kernel sizes in down-sampling CNNs. 9 | - [3, 9] 10 | - [3, 9] 11 | - [3, 9] 12 | - [3, 9] 13 | - [3, 3] 14 | - [3, 3] 15 | strides: # List of stride sizes in down-sampling CNNs. 16 | - [1, 1] 17 | - [1, 2] 18 | - [1, 2] 19 | - [1, 2] 20 | - [1, 1] 21 | - [1, 1] 22 | bias: true # Whether to add bias parameter in convolution layers. 23 | nonlinear_activation: "LeakyReLU" # Nonlinear activation. 24 | nonlinear_activation_params: # Nonlinear activation parameters. 25 | negative_slope: 0.2 26 | periods: [2, 3, 5, 7, 11] # List of periods for multi-period discriminator. 27 | period_discriminator_params: # Params for HiFiGAN period discriminator. 28 | in_channels: 1 # Number of input channels. 29 | out_channels: 1 # Number of output channels. 30 | kernel_sizes: [5, 3] # List of kernel sizes. 31 | channels: 32 # Initial number of channels. 32 | downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. 33 | max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. 34 | bias: true # Whether to use bias parameter in conv layer. 35 | nonlinear_activation: "LeakyReLU" # Nonlinear activation. 36 | nonlinear_activation_params: # Nonlinear activation parameters. 37 | negative_slope: 0.1 38 | use_weight_norm: true # Whether to apply weight normalization. 39 | use_spectral_norm: false # Whether to apply spectral normalization. 40 | -------------------------------------------------------------------------------- /sifigan/bin/config/extract_features.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | hydra: 4 | run: 5 | dir: ./ 6 | output_subdir: null 7 | job_logging: 8 | formatters: 9 | simple: 10 | format: '[%(asctime)s][%(levelname)s][%(module)s | %(lineno)s] %(message)s' 11 | disable_existing_loggers: false 12 | 13 | audio: data/scp/namine_ritsu_all.scp # List file of input wav files. 14 | in_dir: wav # Directory of input wav files. 15 | out_dir: hdf5 # Directory to save extracted feature files. 16 | feature_format: h5 # Feature format. 17 | sample_rate: 24000 # Sampling rate. 18 | spkinfo: data/spk_info.yaml # YAML format speaker information. 19 | spkidx: -4 # Speaker index of the split path. 20 | inv: true # If false, wav is restored from acoustic features. 21 | 22 | # Audio preprocess setting. 23 | highpass_cutoff: 70 # Cut-off-frequency for low-cut-filter. 24 | pow_th: # Threshold of power. 25 | 26 | # Mel-spectrogram extraction setting. 27 | fft_size: 1024 # FFT size. 28 | hop_size: 120 # Hop size. 29 | win_length: 1024 # Window length. 30 | # If set to null, it will be same as fft_size. 31 | window: hann # Window function. 32 | num_mels: 80 # Number of mel basis. 33 | fmin: 0 # Minimum frequency in mel basis calculation. 34 | fmax: null # Maximum frequency in mel basis calculation. 35 | 36 | # WORLD feature extraction setting. 37 | minf0: 100 # Minimum F0 value. 38 | maxf0: 840 # Maximum F0 value. 39 | shiftms: 5 # Frameshift in ms. 40 | mcep_dim: 39 # Number of dimensions of mel-generalized cepstrum. 41 | mcap_dim: 19 # Number of dimensions of mel-cepstral aperiodicity. 42 |
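Extracted features are written per utterance as HDF5 files (`feature_format: h5` above), with one dataset per feature type; other scripts in this repo read them with keys such as `/f0`, `/cf0`, `/uv`, `/mcep`, and `/bap`. A minimal sketch for inspecting one file (the path is illustrative):

```python
# List every dataset stored in one extracted feature file.
import h5py

with h5py.File("hdf5/namine_ritsu_ARROW_seg0.h5", "r") as f:
    for key in f.keys():
        print(key, f[key].shape)
```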
-------------------------------------------------------------------------------- /sifigan/bin/config/generator/hifigan.yaml: -------------------------------------------------------------------------------- 1 | _target_: sifigan.models.HiFiGANGenerator 2 | in_channels: 45 # Number of input channels. 3 | out_channels: 1 # Number of output channels. 4 | channels: 512 # Number of initial channels. 5 | kernel_size: 7 # Kernel size of initial and final conv layers. 6 | upsample_scales: [5, 4, 3, 2] # Upsampling scales. 7 | upsample_kernel_sizes: [10, 8, 6, 4] # Kernel size for upsampling layers. 8 | qp_resblock_kernel_size: 3 # Kernel size for quasi-periodic residual blocks. 9 | qp_resblock_dilations: # Dilations for quasi-periodic residual blocks. 10 | - [1] 11 | - [1, 2] 12 | - [1, 2, 4] 13 | - [1, 2, 4, 8] 14 | qp_use_additional_convs: true # Whether to use additional conv layers. 15 | resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. 16 | resblock_dilations: # Dilations for residual blocks. 17 | - [1, 3, 5] 18 | - [1, 3, 5] 19 | - [1, 3, 5] 20 | use_additional_convs: true # Whether to use additional conv layer in residual blocks. 21 | use_sine_embs: false # Whether to use sine embeddings. 22 | use_qp_resblocks: false # Whether to use quasi-periodic residual blocks. 23 | bias: true # Whether to use bias parameter in conv. 24 | nonlinear_activation: "LeakyReLU" # Nonlinear activation type. 25 | nonlinear_activation_params: # Nonlinear activation parameters. 26 | negative_slope: 0.1 27 | use_weight_norm: true # Whether to apply weight normalization. 28 | -------------------------------------------------------------------------------- /sifigan/bin/config/generator/sifigan.direct.yaml: -------------------------------------------------------------------------------- 1 | _target_: sifigan.models.SiFiGANDirectGenerator 2 | in_channels: 43 # Number of input channels. 3 | out_channels: 1 # Number of output channels. 4 | channels: 512 # Number of initial channels. 5 | kernel_size: 7 # Kernel size of initial and final conv layers. 6 | upsample_scales: [5, 4, 3, 2] # Upsampling scales. 7 | upsample_kernel_sizes: [10, 8, 6, 4] # Kernel size for upsampling layers. 8 | source_network_params: # Parameters for source-network. 9 | resblock_kernel_size: 3 # Kernel size for adaptive residual blocks. 10 | resblock_dilations: # Dilations for adaptive residual blocks. 11 | - [1] 12 | - [1, 2] 13 | - [1, 2, 4] 14 | - [1, 2, 4, 8] 15 | use_additional_convs: true # Whether to use additional conv layers. 16 | filter_network_params: # Parameters for filter-network. 17 | resblock_kernel_sizes: [3, 5, 7] # Kernel size for residual blocks. 18 | resblock_dilations: # Dilations for residual blocks. 19 | - [1, 3, 5] 20 | - [1, 3, 5] 21 | - [1, 3, 5] 22 | use_additional_convs: false # Whether to use additional conv layers. 23 | share_upsamples: false # Whether to share up-sampling transposed CNNs. 24 | bias: true # Whether to use bias parameter in conv. 25 | nonlinear_activation: "LeakyReLU" # Nonlinear activation type. 26 | nonlinear_activation_params: # Nonlinear activation parameters. 27 | negative_slope: 0.1 28 | use_weight_norm: true # Whether to apply weight normalization. 29 | -------------------------------------------------------------------------------- /sifigan/bin/config/generator/sifigan.yaml: -------------------------------------------------------------------------------- 1 | _target_: sifigan.models.SiFiGANGenerator 2 | in_channels: 43 # Number of input channels. 3 | out_channels: 1 # Number of output channels. 4 | channels: 512 # Number of initial channels. 5 | kernel_size: 7 # Kernel size of initial and final conv layers. 6 | upsample_scales: [5, 4, 3, 2] # Upsampling scales. 7 | upsample_kernel_sizes: [10, 8, 6, 4] # Kernel size for upsampling layers. 8 | source_network_params: # Parameters for source-network. 9 | resblock_kernel_size: 3 # Kernel size for adaptive residual blocks. 10 | resblock_dilations: # Dilations for adaptive residual blocks. 11 | - [1] 12 | - [1, 2] 13 | - [1, 2, 4] 14 | - [1, 2, 4, 8] 15 | use_additional_convs: true # Whether to use additional conv layers. 16 | filter_network_params: # Parameters for filter-network. 17 | resblock_kernel_sizes: [3, 5, 7] # Kernel size for residual blocks. 18 | resblock_dilations: # Dilations for residual blocks.
19 | - [1, 3, 5] 20 | - [1, 3, 5] 21 | - [1, 3, 5] 22 | use_additional_convs: false # Whether to use additional conv layers. 23 | share_upsamples: false # Whether to share up-sampling transposed CNNs. 24 | share_downsamples: false # Whether to share down-sampling CNNs. 25 | bias: true # Whether to use bias parameter in conv. 26 | nonlinear_activation: "LeakyReLU" # Nonlinear activation type. 27 | nonlinear_activation_params: # Nonlinear activation parameters. 28 | negative_slope: 0.1 29 | use_weight_norm: true # Whether to apply weight normalization. 30 | -------------------------------------------------------------------------------- /sifigan/bin/config/param_count.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - generator: sifigan 6 | 7 | hydra: 8 | run: 9 | dir: ./ 10 | output_subdir: null 11 | job_logging: 12 | formatters: 13 | simple: 14 | format: '[%(asctime)s][%(levelname)s][%(module)s | %(lineno)s] %(message)s' 15 | disable_existing_loggers: false -------------------------------------------------------------------------------- /sifigan/bin/config/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - generator: sifigan 6 | - discriminator: univnet 7 | - train: sifigan 8 | - data: namine_ritsu 9 | 10 | hydra: 11 | run: 12 | dir: ./ 13 | output_subdir: null 14 | job_logging: 15 | formatters: 16 | simple: 17 | format: '[%(asctime)s][%(levelname)s][%(module)s | %(lineno)s] %(message)s' 18 | disable_existing_loggers: false 19 | 20 | out_dir: # Directory to output training results. 21 | seed: 12345 # Seed number for random numbers. 22 |
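The `lambda_*` coefficients in the train configs below weight the individual loss terms; judging from their names, the generator objective is presumably the usual weighted sum (the exact combination is implemented in `sifigan/bin/train.py`, whose content is not shown in this listing):

```
L_G = lambda_mel * L_mel + lambda_reg * L_reg + lambda_adv * L_adv + lambda_fm * L_fm
```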
-------------------------------------------------------------------------------- /sifigan/bin/config/train/hifigan.yaml: -------------------------------------------------------------------------------- 1 | # Interval setting 2 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 3 | train_max_steps: 400000 # Maximum number of training steps. 4 | save_interval_steps: 100000 # Interval steps to save checkpoint. 5 | eval_interval_steps: 2000 # Interval steps to evaluate the network. 6 | log_interval_steps: 2000 # Interval steps to record the training log. 7 | resume: # Epoch to resume training. 8 | 9 | # Loss balancing coefficients. 10 | lambda_mel: 45.0 11 | lambda_reg: 0.0 12 | lambda_adv: 1.0 13 | lambda_fm: 2.0 14 | 15 | # Mel-spectral loss setting 16 | mel_loss: 17 | _target_: sifigan.losses.MelSpectralLoss 18 | fft_size: 1024 # FFT size for STFT-based loss. 19 | hop_size: 256 # Hop size for STFT-based loss. 20 | win_length: 1024 # Window length for STFT-based loss. 21 | window: hann_window # Window function for STFT-based loss. 22 | sample_rate: 24000 # Sampling rate. 23 | n_mels: 80 # Number of bins of mel-filter-bank. 24 | fmin: 0 # Minimum frequency of mel-filter-bank. 25 | fmax: null # Maximum frequency of mel-filter-bank. 26 | # If null, it will be fs / 2. 27 | 28 | # Adversarial loss setting 29 | adv_loss: 30 | _target_: sifigan.losses.AdversarialLoss 31 | average_by_discriminators: false # Whether to average loss by #discriminators. 32 | loss_type: mse 33 | 34 | # Feature matching loss setting 35 | fm_loss: 36 | _target_: sifigan.losses.FeatureMatchLoss 37 | average_by_layers: false # Whether to average loss by #layers in each discriminator. 38 | 39 | # Optimizer and scheduler setting 40 | generator_optimizer: 41 | _target_: torch.optim.Adam 42 | lr: 2.0e-4 43 | betas: [0.5, 0.9] 44 | weight_decay: 0.0 45 | generator_scheduler: 46 | _target_: torch.optim.lr_scheduler.MultiStepLR 47 | gamma: 0.5 48 | milestones: 49 | - 200000 50 | - 400000 51 | - 600000 52 | - 800000 53 | generator_grad_norm: 10 54 | discriminator_optimizer: 55 | _target_: torch.optim.Adam 56 | lr: 2.0e-4 57 | betas: [0.5, 0.9] 58 | weight_decay: 0.0 59 | discriminator_scheduler: 60 | _target_: torch.optim.lr_scheduler.MultiStepLR 61 | gamma: 0.5 62 | milestones: 63 | - 200000 64 | - 400000 65 | - 600000 66 | - 800000 67 | discriminator_grad_norm: 10 68 | -------------------------------------------------------------------------------- /sifigan/bin/config/train/sifigan.yaml: -------------------------------------------------------------------------------- 1 | # Interval setting 2 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 3 | train_max_steps: 500000 # Maximum number of training steps. 4 | save_interval_steps: 100000 # Interval steps to save checkpoint. 5 | eval_interval_steps: 2000 # Interval steps to evaluate the network. 6 | log_interval_steps: 2000 # Interval steps to record the training log. 7 | resume: # Epoch to resume training. 8 | 9 | # Loss balancing coefficients. 10 | lambda_mel: 45.0 11 | lambda_reg: 1.0 12 | lambda_adv: 1.0 13 | lambda_fm: 2.0 14 | 15 | # Mel-spectral loss setting 16 | mel_loss: 17 | _target_: sifigan.losses.MelSpectralLoss 18 | fft_size: 1024 # FFT size for STFT-based loss. 19 | hop_size: 256 # Hop size for STFT-based loss. 20 | win_length: 1024 # Window length for STFT-based loss. 21 | window: hann_window # Window function for STFT-based loss. 22 | sample_rate: 24000 # Sampling rate. 23 | n_mels: 80 # Number of bins of mel-filter-bank. 24 | fmin: 0 # Minimum frequency of mel-filter-bank. 25 | fmax: null # Maximum frequency of mel-filter-bank. 26 | # If null, it will be fs / 2. 27 | 28 | # Source regularization loss setting 29 | reg_loss: 30 | _target_: sifigan.losses.ResidualLoss 31 | sample_rate: 24000 # Sampling rate. 32 | fft_size: 2048 # FFT size. 33 | hop_size: 120 # Hop size. 34 | f0_floor: 100 # Minimum F0. 35 | f0_ceil: 840 # Maximum F0. 36 | n_mels: 80 # Number of mel-filter-bank bins. 37 | fmin: 0 # Minimum frequency of mel-filter-bank. 38 | fmax: null # Maximum frequency of mel-filter-bank. 39 | power: false # Whether to use power or magnitude spectrogram. 40 | elim_0th: true # Whether to exclude 0th components of cepstrums in 41 | # CheapTrick estimation. If set to true, source-network 42 | # is forced to estimate the power of the output signal. 43 | 44 | # Adversarial loss setting 45 | adv_loss: 46 | _target_: sifigan.losses.AdversarialLoss 47 | average_by_discriminators: false # Whether to average loss by #discriminators. 48 | loss_type: mse 49 | 50 | # Feature matching loss setting 51 | fm_loss: 52 | _target_: sifigan.losses.FeatureMatchLoss 53 | average_by_layers: false # Whether to average loss by #layers in each discriminator.
54 | 55 | # Optimizer and scheduler setting 56 | generator_optimizer: 57 | _target_: torch.optim.Adam 58 | lr: 2.0e-4 59 | betas: [0.5, 0.9] 60 | weight_decay: 0.0 61 | generator_scheduler: 62 | _target_: torch.optim.lr_scheduler.MultiStepLR 63 | gamma: 0.5 64 | milestones: 65 | - 100000 66 | - 200000 67 | - 300000 68 | - 400000 69 | generator_grad_norm: 10 70 | discriminator_optimizer: 71 | _target_: torch.optim.Adam 72 | lr: 2.0e-4 73 | betas: [0.5, 0.9] 74 | weight_decay: 0.0 75 | discriminator_scheduler: 76 | _target_: torch.optim.lr_scheduler.MultiStepLR 77 | gamma: 0.5 78 | milestones: 79 | - 100000 80 | - 200000 81 | - 300000 82 | - 400000 83 | discriminator_grad_norm: 10 84 | -------------------------------------------------------------------------------- /sifigan/bin/config/train/sifigan_1000k.yaml: -------------------------------------------------------------------------------- 1 | # Interval setting 2 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 3 | train_max_steps: 1000000 # Maximum number of training steps. 4 | save_interval_steps: 100000 # Interval steps to save checkpoint. 5 | eval_interval_steps: 2000 # Interval steps to evaluate the network. 6 | log_interval_steps: 2000 # Interval steps to record the training log. 7 | resume: # Epoch to resume training. 8 | 9 | # Loss balancing coefficients. 10 | lambda_mel: 45.0 11 | lambda_reg: 1.0 12 | lambda_adv: 1.0 13 | lambda_fm: 2.0 14 | 15 | # Mel-spectral loss setting 16 | mel_loss: 17 | _target_: sifigan.losses.MelSpectralLoss 18 | fft_size: 1024 # FFT size for STFT-based loss. 19 | hop_size: 256 # Hop size for STFT-based loss. 20 | win_length: 1024 # Window length for STFT-based loss. 21 | window: hann_window # Window function for STFT-based loss. 22 | sample_rate: 24000 # Sampling rate. 23 | n_mels: 80 # Number of bins of mel-filter-bank. 24 | fmin: 0 # Minimum frequency of mel-filter-bank. 25 | fmax: null # Maximum frequency of mel-filter-bank. 26 | # If null, it will be fs / 2. 27 | 28 | # Source regularization loss setting 29 | reg_loss: 30 | _target_: sifigan.losses.ResidualLoss 31 | sample_rate: 24000 # Sampling rate. 32 | fft_size: 2048 # FFT size. 33 | hop_size: 120 # Hop size. 34 | f0_floor: 100 # Minimum F0. 35 | f0_ceil: 1000 # Maximum F0. 36 | n_mels: 80 # Number of mel-filter-bank bins. 37 | fmin: 0 # Minimum frequency of mel-filter-bank. 38 | fmax: null # Maximum frequency of mel-filter-bank. 39 | power: false # Whether to use power or magnitude spectrogram. 40 | elim_0th: true # Whether to exclude 0th components of cepstrums in 41 | # CheapTrick estimation. If set to true, source-network 42 | # is forced to estimate the power of the output signal. 43 | 44 | # Adversarial loss setting 45 | adv_loss: 46 | _target_: sifigan.losses.AdversarialLoss 47 | average_by_discriminators: false # Whether to average loss by #discriminators. 48 | loss_type: mse 49 | 50 | # Feature matching loss setting 51 | fm_loss: 52 | _target_: sifigan.losses.FeatureMatchLoss 53 | average_by_layers: false # Whether to average loss by #layers in each discriminator.
54 | 55 | # Optimizer and scheduler setting 56 | generator_optimizer: 57 | _target_: torch.optim.Adam 58 | lr: 2.0e-4 59 | betas: [0.5, 0.9] 60 | weight_decay: 0.0 61 | generator_scheduler: 62 | _target_: torch.optim.lr_scheduler.MultiStepLR 63 | gamma: 0.5 64 | milestones: 65 | - 200000 66 | - 400000 67 | - 600000 68 | - 800000 69 | generator_grad_norm: 10 70 | discriminator_optimizer: 71 | _target_: torch.optim.Adam 72 | lr: 2.0e-4 73 | betas: [0.5, 0.9] 74 | weight_decay: 0.0 75 | discriminator_scheduler: 76 | _target_: torch.optim.lr_scheduler.MultiStepLR 77 | gamma: 0.5 78 | milestones: 79 | - 200000 80 | - 400000 81 | - 600000 82 | - 800000 83 | discriminator_grad_norm: 10 84 | -------------------------------------------------------------------------------- /sifigan/bin/decode.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Decoding Script for Source-Filter HiFiGAN. 7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | 12 | """ 13 | 14 | import os 15 | from logging import getLogger 16 | from time import time 17 | 18 | import hydra 19 | import numpy as np 20 | import soundfile as sf 21 | import torch 22 | from hydra.utils import to_absolute_path 23 | from omegaconf import DictConfig 24 | from sifigan.datasets import FeatDataset 25 | from sifigan.utils import SignalGenerator, dilated_factor 26 | from tqdm import tqdm 27 | 28 | # A logger for this file 29 | logger = getLogger(__name__) 30 | 31 | 32 | @hydra.main(version_base=None, config_path="config", config_name="decode") 33 | def main(config: DictConfig) -> None: 34 | """Run decoding process.""" 35 | 36 | np.random.seed(config.seed) 37 | torch.manual_seed(config.seed) 38 | torch.cuda.manual_seed(config.seed) 39 | os.environ["PYTHONHASHSEED"] = str(config.seed) 40 | 41 | # set device 42 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 43 | logger.info(f"Decode on {device}.") 44 | 45 | # load pre-trained model from checkpoint file 46 | if config.checkpoint_path is None: 47 | checkpoint_path = os.path.join( 48 | config.out_dir, 49 | "checkpoints", 50 | f"checkpoint-{config.checkpoint_steps}steps.pkl", 51 | ) 52 | else: 53 | checkpoint_path = config.checkpoint_path 54 | state_dict = torch.load(to_absolute_path(checkpoint_path), map_location="cpu") 55 | logger.info(f"Loaded model parameters from {checkpoint_path}.") 56 | model = hydra.utils.instantiate(config.generator) 57 | model.load_state_dict(state_dict["model"]["generator"]) 58 | model.remove_weight_norm() 59 | model.eval().to(device) 60 | 61 | # check directory existence 62 | out_dir = to_absolute_path(os.path.join(config.out_dir, "wav", str(config.checkpoint_steps))) 63 | os.makedirs(out_dir, exist_ok=True) 64 | 65 | total_rtf = 0.0 66 | for f0_factor in config.f0_factors: 67 | dataset = FeatDataset( 68 | stats=to_absolute_path(config.data.stats), 69 | feat_list=config.data.eval_feat, 70 | return_filename=True, 71 | sample_rate=config.data.sample_rate, 72 | hop_size=config.data.hop_size, 73 | aux_feats=config.data.aux_feats, 74 | f0_factor=f0_factor, 75 | ) 76 | logger.info(f"The number of features to be decoded = {len(dataset)}.") 77 | 78 | signal_generator = SignalGenerator( 79 | sample_rate=config.data.sample_rate, 80 | hop_size=config.data.hop_size, 81 | sine_amp=config.data.sine_amp, 82 | 
noise_amp=config.data.noise_amp, 83 | signal_types=config.data.signal_types, 84 | ) 85 | 86 | with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar: 87 | for idx, (feat_path, c, f0, cf0) in enumerate(pbar, 1): 88 | # create dense factors 89 | dfs = [] 90 | for df, us in zip( 91 | config.data.dense_factors, 92 | np.cumprod(config.generator.upsample_scales), 93 | ): 94 | dfs += [ 95 | np.repeat(dilated_factor(cf0, config.data.sample_rate, df), us) 96 | if config.data.df_f0_type == "cf0" 97 | else np.repeat(dilated_factor(f0, config.data.sample_rate, df), us) 98 | ] 99 | c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device) 100 | f0 = torch.FloatTensor(f0).view(1, 1, -1).to(device) 101 | cf0 = torch.FloatTensor(cf0).view(1, 1, -1).to(device) 102 | dfs = [torch.FloatTensor(np.array(df)).view(1, 1, -1).to(device) for df in dfs] 103 | if config.data.sine_f0_type == "cf0": 104 | in_signal = signal_generator(cf0) 105 | elif config.data.sine_f0_type == "f0": 106 | in_signal = signal_generator(f0) 107 | 108 | # perform decoding 109 | start = time() 110 | outs = model(in_signal, c, dfs) 111 | y = outs[0] 112 | rtf = (time() - start) / (y.size(-1) / config.data.sample_rate) 113 | pbar.set_postfix({"RTF": rtf}) 114 | total_rtf += rtf 115 | 116 | # save output signal as PCM 16 bit wav file 117 | utt_id = os.path.splitext(os.path.basename(feat_path))[0] 118 | save_path = os.path.join(out_dir, f"{utt_id}_f{f0_factor:.2f}.wav") 119 | y = y.view(-1).cpu().numpy() 120 | sf.write(save_path, y, config.data.sample_rate, "PCM_16") 121 | 122 | # save source signal as PCM 16 bit wav file 123 | if config.save_source: 124 | save_path = save_path.replace(".wav", "_s.wav") 125 | s = outs[1].view(-1).cpu().numpy() 126 | s = s / np.max(np.abs(s)) # normalize 127 | sf.write(save_path, s, config.data.sample_rate, "PCM_16") 128 | 129 | # report average RTF 130 | logger.info(f"Finished generation of {idx} utterances (RTF = {total_rtf / idx:.4f}).") 131 | 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /sifigan/bin/extract_features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Feature extraction script. 
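Extracts WORLD-based features (U/V flag, discrete F0, continuous F0, mel-cepstrum, and band aperiodicity) from audio files and saves them to per-utterance HDF5 files.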
7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | - https://github.com/k2kobayashi/sprocket 12 | 13 | """ 14 | 15 | import copy 16 | import multiprocessing as mp 17 | import os 18 | import sys 19 | from logging import getLogger 20 | 21 | import hydra 22 | import librosa 23 | import numpy as np 24 | import pysptk 25 | import pyworld 26 | import soundfile as sf 27 | import yaml 28 | from hydra.utils import to_absolute_path 29 | from omegaconf import DictConfig, OmegaConf 30 | from scipy.interpolate import interp1d 31 | from scipy.signal import firwin, lfilter 32 | from sifigan.utils import read_txt, write_hdf5 33 | 34 | # A logger for this file 35 | logger = getLogger(__name__) 36 | 37 | 38 | # All-pass-filter coefficients {key -> sampling rate : value -> coefficient} 39 | ALPHA = { 40 | 8000: 0.312, 41 | 12000: 0.369, 42 | 16000: 0.410, 43 | 22050: 0.455, 44 | 24000: 0.466, 45 | 32000: 0.504, 46 | 44100: 0.544, 47 | 48000: 0.554, 48 | } 49 | 50 | 51 | def path_create(wav_list, in_dir, out_dir, extname): 52 | for wav_name in wav_list: 53 | path_replace(wav_name, in_dir, out_dir, extname=extname) 54 | 55 | 56 | def path_replace(filepath, inputpath, outputpath, extname=None): 57 | if extname is not None: 58 | filepath = f"{os.path.splitext(filepath)[0]}.{extname}" 59 | filepath = filepath.replace(inputpath, outputpath) 60 | os.makedirs(os.path.dirname(filepath), exist_ok=True) 61 | return filepath 62 | 63 | 64 | def spk_division(file_list, config, spkinfo, split="/"): 65 | """Divide list into speaker-dependent list 66 | 67 | Args: 68 | file_list (list): Waveform list 69 | config (dict): Config 70 | spkinfo (dict): Dictionary of 71 | speaker-dependent f0 range and power threshold 72 | split: Path split string 73 | 74 | Return: 75 | (list): List of divided file lists 76 | (list): List of speaker-dependent configs 77 | 78 | """ 79 | file_lists, configs, tempf = [], [], [] 80 | prespk = None 81 | for file in file_list: 82 | spk = file.split(split)[config.spkidx] 83 | if spk != prespk: 84 | if tempf: 85 | file_lists.append(tempf) 86 | tempf = [] 87 | prespk = spk 88 | tempc = copy.deepcopy(config) 89 | if spk in spkinfo: 90 | tempc["minf0"] = spkinfo[spk]["f0_min"] 91 | tempc["maxf0"] = spkinfo[spk]["f0_max"] 92 | # tempc["pow_th"] = spkinfo[spk]["pow_th"] 93 | else: 94 | msg = f"Since {spk} is not in spkinfo dict, " 95 | msg += "default f0 range and power threshold are used." 
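# NOTE: spkinfo is expected to map speaker names to F0 search ranges,
# e.g. (hypothetical entry):
#   SPEAKER_NAME:
#     f0_min: 70
#     f0_max: 340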
96 | logger.info(msg) 97 | tempc["minf0"] = 70 98 | tempc["maxf0"] = 300 99 | # tempc["pow_th"] = -20 100 | configs.append(tempc) 101 | tempf.append(file) 102 | file_lists.append(tempf) 103 | 104 | return file_lists, configs 105 | 106 | 107 | def aux_list_create(wav_list_file, config): 108 | """Create list of auxiliary acoustic features 109 | 110 | Args: 111 | wav_list_file (str): Filename of wav list 112 | config (dict): Config 113 | 114 | """ 115 | aux_list_file = wav_list_file.replace(".scp", ".list") 116 | wav_files = read_txt(wav_list_file) 117 | with open(aux_list_file, "w") as f: 118 | for wav_name in wav_files: 119 | feat_name = path_replace( 120 | wav_name, 121 | config.in_dir, 122 | config.out_dir, 123 | extname=config.feature_format, 124 | ) 125 | f.write(f"{feat_name}\n") 126 | 127 | 128 | def low_cut_filter(x, fs, cutoff=70): 129 | """Low cut filter 130 | 131 | Args: 132 | x (ndarray): Waveform sequence 133 | fs (int): Sampling frequency 134 | cutoff (float): Cutoff frequency of low cut filter 135 | 136 | Return: 137 | (ndarray): Low cut filtered waveform sequence 138 | 139 | """ 140 | nyquist = fs // 2 141 | norm_cutoff = cutoff / nyquist 142 | fil = firwin(255, norm_cutoff, pass_zero=False) 143 | lcf_x = lfilter(fil, 1, x) 144 | 145 | return lcf_x 146 | 147 | 148 | def low_pass_filter(x, fs, cutoff=70, padding=True): 149 | """Low pass filter 150 | 151 | Args: 152 | x (ndarray): Waveform sequence 153 | fs (int): Sampling frequency 154 | cutoff (float): Cutoff frequency of low pass filter 155 | 156 | Return: 157 | (ndarray): Low pass filtered waveform sequence 158 | 159 | """ 160 | nyquist = fs // 2 161 | norm_cutoff = cutoff / nyquist 162 | numtaps = 255 163 | fil = firwin(numtaps, norm_cutoff) 164 | x_pad = np.pad(x, (numtaps, numtaps), "edge") 165 | lpf_x = lfilter(fil, 1, x_pad) 166 | lpf_x = lpf_x[numtaps + numtaps // 2 : -numtaps // 2] 167 | 168 | return lpf_x 169 | 170 | 171 | def convert_continuos_f0(f0): 172 | """Convert F0 to continuous F0 173 | 174 | Args: 175 | f0 (ndarray): original f0 sequence with the shape (T) 176 | 177 | Return: 178 | (ndarray): U/V binary sequence (T), (ndarray): continuous f0 (T), and (bool): True if any voiced frames exist 179 | 180 | """ 181 | # get uv information as binary 182 | uv = np.float32(f0 != 0) 183 | # get start and end of f0 184 | if (f0 == 0).all(): 185 | logger.warning("All of the F0 values are 0.") 186 | return uv, f0, False 187 | start_f0 = f0[f0 != 0][0] 188 | end_f0 = f0[f0 != 0][-1] 189 | # padding start and end of f0 sequence 190 | cf0 = copy.deepcopy(f0) 191 | start_idx = np.where(cf0 == start_f0)[0][0] 192 | end_idx = np.where(cf0 == end_f0)[0][-1] 193 | cf0[:start_idx] = start_f0 194 | cf0[end_idx:] = end_f0 195 | # get non-zero frame index 196 | nz_frames = np.where(cf0 != 0)[0] 197 | # perform linear interpolation 198 | f = interp1d(nz_frames, cf0[nz_frames]) 199 | cf0 = f(np.arange(0, cf0.shape[0])) 200 | 201 | return uv, cf0, True 202 | 203 | 204 | def melfilterbank( 205 | audio, 206 | sample_rate, 207 | fft_size=1024, 208 | hop_size=256, 209 | win_length=None, 210 | window="hann", 211 | num_mels=80, 212 | fmin=None, 213 | fmax=None, 214 | ): 215 | """Extract linear mel filterbank feature. 216 | 217 | Args: 218 | audio (ndarray): Audio signal (T,). 219 | sample_rate (int): Sampling rate. 220 | fft_size (int): FFT size. 221 | hop_size (int): Hop size. 222 | win_length (int): Window length. If set to None, it will be the same as fft_size. 223 | window (str): Window function type. 224 | num_mels (int): Number of mel basis. 225 | fmin (int): Minimum frequency in mel basis calculation.
226 | fmax (int): Maximum frequency in mel basis calculation. 227 | 228 | Returns: 229 | ndarray: Linear mel filterbank feature (#frames, num_mels). 230 | 231 | """ 232 | # get amplitude spectrogram 233 | x_stft = librosa.stft( 234 | audio, 235 | n_fft=fft_size, 236 | hop_length=hop_size, 237 | win_length=win_length, 238 | window=window, 239 | pad_mode="reflect", 240 | ) 241 | spc = np.abs(x_stft).T # (#frames, #bins) 242 | 243 | # get mel basis 244 | fmin = 0 if fmin is None else fmin 245 | fmax = sample_rate / 2 if fmax is None else fmax 246 | mel_basis = librosa.filters.mel( 247 | sr=sample_rate, 248 | n_fft=fft_size, 249 | n_mels=num_mels, 250 | fmin=fmin, 251 | fmax=fmax, 252 | ) 253 | 254 | return np.dot(spc, mel_basis.T) 255 | 256 | 257 | def world_feature_extract(queue, wav_list, config): 258 | """WORLD feature extraction 259 | 260 | Args: 261 | queue (multiprocessing.Queue): the queue to store the file name of utterance 262 | wav_list (list): list of the wav files 263 | config (dict): feature extraction config 264 | 265 | """ 266 | # extraction 267 | for i, wav_name in enumerate(wav_list): 268 | logger.info(f"now processing {wav_name} ({i + 1}/{len(wav_list)})") 269 | 270 | # load wavfile 271 | x, fs = sf.read(to_absolute_path(wav_name)) 272 | x = np.array(x, dtype=np.float64) 273 | 274 | # check sampling frequency 275 | if not fs == config.sample_rate: 276 | logger.warning( 277 | f"Sampling frequency of {wav_name} does not match the config. " 278 | + "Resampling before feature extraction." 279 | ) 280 | x = librosa.resample(x, orig_sr=fs, target_sr=config.sample_rate) 281 | 282 | # apply low-cut-filter 283 | if config.highpass_cutoff > 0: 284 | if (x == 0).all(): 285 | logger.warning(f"Skipped {wav_name} because all sample values are zero.") 286 | continue 287 | x = low_cut_filter(x, config.sample_rate, cutoff=config.highpass_cutoff) 288 | 289 | # extract WORLD features 290 | f0, t = pyworld.harvest( 291 | x, 292 | fs=config.sample_rate, 293 | f0_floor=config.minf0, 294 | f0_ceil=config.maxf0, 295 | frame_period=config.shiftms, 296 | ) 297 | env = pyworld.cheaptrick( 298 | x, 299 | f0, 300 | t, 301 | fs=config.sample_rate, 302 | fft_size=config.fft_size, 303 | ) 304 | ap = pyworld.d4c( 305 | x, 306 | f0, 307 | t, 308 | fs=config.sample_rate, 309 | fft_size=config.fft_size, 310 | ) 311 | uv, cf0, has_voiced = convert_continuos_f0(f0) 312 | if has_voiced: 313 | lpf_fs = int(config.sample_rate / config.hop_size) 314 | cf0_lpf = low_pass_filter(cf0, lpf_fs, cutoff=20) 315 | next_cutoff = 70 316 | while not (cf0_lpf >= 0).all(): 317 | cf0_lpf = low_pass_filter(cf0, lpf_fs, cutoff=next_cutoff) 318 | next_cutoff *= 2 319 | else: 320 | cf0_lpf = cf0 321 | logger.warning(f"All of the F0 values are 0 in {wav_name}.") 322 | mcep = pysptk.sp2mc(env, order=config.mcep_dim, alpha=ALPHA[config.sample_rate]) 323 | bap = pyworld.code_aperiodicity(ap, config.sample_rate) 324 | 325 | # adjust shapes 326 | minlen = min(uv.shape[0], mcep.shape[0]) 327 | uv = np.expand_dims(uv[:minlen], axis=-1) 328 | f0 = np.expand_dims(f0[:minlen], axis=-1) 329 | cf0_lpf = np.expand_dims(cf0_lpf[:minlen], axis=-1) 330 | mcep = mcep[:minlen] 331 | bap = bap[:minlen] 332 | 333 | # save features 334 | feat_name = path_replace( 335 | wav_name, 336 | config.in_dir, 337 | config.out_dir, 338 | extname=config.feature_format, 339 | ) 340 | logger.info(f"{to_absolute_path(feat_name)}") 341 | write_hdf5(to_absolute_path(feat_name), "/uv", uv) 342 | write_hdf5(to_absolute_path(feat_name), "/f0", f0) 343 | write_hdf5(to_absolute_path(feat_name), "/cf0", cf0_lpf) 344 |
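# The HDF5 layout written here (/uv, /f0, /cf0, /mcep, /bap) is what the
# dataset classes in sifigan/datasets read back via read_hdf5().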
write_hdf5(to_absolute_path(feat_name), "/mcep", mcep) 345 | write_hdf5(to_absolute_path(feat_name), "/bap", bap) 346 | 347 | queue.put("Finish") 348 | 349 | 350 | @hydra.main(version_base=None, config_path="config", config_name="extract_features") 351 | def main(config: DictConfig): 352 | # show argument 353 | logger.info(OmegaConf.to_yaml(config)) 354 | 355 | # read list 356 | file_list = read_txt(to_absolute_path(config.audio)) 357 | logger.info(f"number of utterances = {len(file_list)}") 358 | 359 | # list division 360 | if config.spkinfo and os.path.exists(to_absolute_path(config.spkinfo)): 361 | # load speaker info 362 | with open(to_absolute_path(config.spkinfo), "r") as f: 363 | spkinfo = yaml.safe_load(f) 364 | logger.info(f"Spkinfo {config.spkinfo} is used.") 365 | # divide into each spk list 366 | file_lists, configs = spk_division(file_list, config, spkinfo) 367 | else: 368 | logger.info( 369 | f"Since spkinfo {config.spkinfo} does not exist, the default f0 range and power threshold are used." 370 | ) 371 | file_lists = np.array_split(file_list, 10) 372 | file_lists = [f_list.tolist() for f_list in file_lists] 373 | configs = [config] * len(file_lists) 374 | 375 | # set mode (world_feature_extract is currently the only extraction mode in this script) 376 | if config.inv: 377 | target_fn = world_feature_extract 378 | # create auxiliary feature list 379 | aux_list_create(to_absolute_path(config.audio), config) 380 | # create folder 381 | path_create(file_list, config.in_dir, config.out_dir, config.feature_format) 382 | 383 | # multi processing 384 | processes = [] 385 | queue = mp.Queue() 386 | for f, _config in zip(file_lists, configs): 387 | p = mp.Process( 388 | target=target_fn, 389 | args=(queue, f, _config), 390 | ) 391 | p.start() 392 | processes.append(p) 393 | 394 | # wait for all processes 395 | for p in processes: 396 | p.join() 397 | 398 | 399 | if __name__ == "__main__": 400 | main() 401 | -------------------------------------------------------------------------------- /sifigan/bin/param_count.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Count number of parameters in Generator.""" 7 | 8 | from logging import getLogger 9 | 10 | import hydra 11 | from omegaconf import DictConfig 12 | 13 | # A logger for this file 14 | logger = getLogger(__name__) 15 | 16 | 17 | @hydra.main(version_base=None, config_path="config", config_name="param_count") 18 | def main(config: DictConfig) -> None: 19 | """Count number of model parameters.""" 20 | 21 | model = hydra.utils.instantiate(config.generator) 22 | model.remove_weight_norm() 23 | 24 | params = 0 25 | for p in model.parameters(): 26 | if p.requires_grad: 27 | params += p.numel() 28 | 29 | logger.info(f"Number of parameters of {model.__class__.__name__}: {params}") 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /sifigan/bin/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Training Script for Source-Filter HiFiGAN.
7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | 11 | """ 12 | 13 | import os 14 | import sys 15 | from collections import defaultdict 16 | from logging import getLogger 17 | 18 | import hydra 19 | import librosa.display 20 | import matplotlib 21 | import numpy as np 22 | import torch 23 | from hydra.utils import to_absolute_path 24 | from omegaconf import DictConfig, OmegaConf 25 | from tensorboardX import SummaryWriter 26 | from torch.utils.data import DataLoader 27 | from tqdm import tqdm 28 | 29 | import sifigan 30 | import sifigan.models 31 | from sifigan.datasets import AudioFeatDataset 32 | from sifigan.utils import dilated_factor 33 | from sifigan.utils.features import SignalGenerator 34 | 35 | # set to avoid matplotlib error in CLI environment 36 | matplotlib.use("Agg") 37 | 38 | 39 | # A logger for this file 40 | logger = getLogger(__name__) 41 | 42 | 43 | class Trainer(object): 44 | """Customized trainer module for Source-Filter HiFiGAN training.""" 45 | 46 | def __init__( 47 | self, 48 | config, 49 | steps, 50 | epochs, 51 | data_loader, 52 | model, 53 | criterion, 54 | optimizer, 55 | scheduler, 56 | device=torch.device("cpu"), 57 | ): 58 | """Initialize trainer. 59 | 60 | Args: 61 | config (dict): Config dict loaded from yaml format configuration file. 62 | steps (int): Initial global steps. 63 | epochs (int): Initial global epochs. 64 | data_loader (dict): Dict of data loaders. It must contain "train" and "valid" loaders. 65 | model (dict): Dict of models. It must contain "generator" and "discriminator" models. 66 | criterion (dict): Dict of criteria. It must contain "mel", "adv", "fm", and "reg" criteria. 67 | optimizer (dict): Dict of optimizers. It must contain "generator" and "discriminator" optimizers. 68 | scheduler (dict): Dict of schedulers. It must contain "generator" and "discriminator" schedulers. 69 | device (torch.device): PyTorch device instance. 70 | 71 | """ 72 | self.config = config 73 | self.steps = steps 74 | self.epochs = epochs 75 | self.data_loader = data_loader 76 | self.model = model 77 | self.criterion = criterion 78 | self.optimizer = optimizer 79 | self.scheduler = scheduler 80 | self.device = device 81 | self.finish_train = False 82 | self.writer = SummaryWriter(config.out_dir) 83 | self.total_train_loss = defaultdict(float) 84 | self.total_eval_loss = defaultdict(float) 85 | 86 | def run(self): 87 | """Run training.""" 88 | self.tqdm = tqdm( 89 | initial=self.steps, total=self.config.train.train_max_steps, desc="[train]" 90 | ) 91 | while True: 92 | # train one epoch 93 | self._train_epoch() 94 | 95 | # check whether training is finished 96 | if self.finish_train: 97 | break 98 | 99 | self.tqdm.close() 100 | logger.info("Finished training.") 101 | 102 | def save_checkpoint(self, checkpoint_path): 103 | """Save checkpoint. 104 | 105 | Args: 106 | checkpoint_path (str): Checkpoint path to be saved.
107 | 108 | """ 109 | state_dict = { 110 | "optimizer": { 111 | "generator": self.optimizer["generator"].state_dict(), 112 | "discriminator": self.optimizer["discriminator"].state_dict(), 113 | }, 114 | "scheduler": { 115 | "generator": self.scheduler["generator"].state_dict(), 116 | "discriminator": self.scheduler["discriminator"].state_dict(), 117 | }, 118 | "steps": self.steps, 119 | "epochs": self.epochs, 120 | } 121 | state_dict["model"] = { 122 | "generator": self.model["generator"].state_dict(), 123 | "discriminator": self.model["discriminator"].state_dict(), 124 | } 125 | 126 | if not os.path.exists(os.path.dirname(checkpoint_path)): 127 | os.makedirs(os.path.dirname(checkpoint_path)) 128 | torch.save(state_dict, checkpoint_path) 129 | 130 | def load_checkpoint(self, checkpoint_path, load_only_params=False): 131 | """Load checkpoint. 132 | 133 | Args: 134 | checkpoint_path (str): Checkpoint path to be loaded. 135 | load_only_params (bool): Whether to load only model parameters. 136 | 137 | """ 138 | state_dict = torch.load(checkpoint_path, map_location="cpu") 139 | self.model["generator"].load_state_dict(state_dict["model"]["generator"]) 140 | self.model["discriminator"].load_state_dict( 141 | state_dict["model"]["discriminator"] 142 | ) 143 | if not load_only_params: 144 | self.steps = state_dict["steps"] 145 | self.epochs = state_dict["epochs"] 146 | self.optimizer["generator"].load_state_dict( 147 | state_dict["optimizer"]["generator"] 148 | ) 149 | self.optimizer["discriminator"].load_state_dict( 150 | state_dict["optimizer"]["discriminator"] 151 | ) 152 | self.scheduler["generator"].load_state_dict( 153 | state_dict["scheduler"]["generator"] 154 | ) 155 | self.scheduler["discriminator"].load_state_dict( 156 | state_dict["scheduler"]["discriminator"] 157 | ) 158 | 159 | def _train_step(self, batch): 160 | """Train model one step.""" 161 | # parse batch 162 | x, d, y = batch 163 | x = tuple([x.to(self.device) for x in x]) 164 | d = tuple([d.to(self.device) for d in d]) 165 | z, c, f0 = x 166 | y = y.to(self.device) 167 | 168 | # generator forward 169 | outs = self.model["generator"](z, c, d) 170 | y_ = outs[0] 171 | 172 | # calculate spectral loss 173 | mel_loss = self.criterion["mel"](y_, y) 174 | gen_loss = self.config.train.lambda_mel * mel_loss 175 | self.total_train_loss["train/mel_loss"] += mel_loss.item() 176 | 177 | # calculate source regularization loss 178 | if self.config.train.lambda_reg > 0: 179 | s = outs[1] 180 | if isinstance(self.criterion["reg"], sifigan.losses.ResidualLoss): 181 | reg_loss = self.criterion["reg"](s, y, f0) 182 | gen_loss += self.config.train.lambda_reg * reg_loss 183 | self.total_train_loss["train/reg_loss"] += reg_loss.item() 184 | else: 185 | reg_loss = self.criterion["reg"](s, f0) 186 | gen_loss += self.config.train.lambda_reg * reg_loss 187 | self.total_train_loss["train/reg_loss"] += reg_loss.item() 188 | 189 | # calculate discriminator related losses 190 | if self.steps > self.config.train.discriminator_train_start_steps: 191 | # calculate feature matching loss 192 | if self.config.train.lambda_fm > 0: 193 | p_fake, fmaps_fake = self.model["discriminator"](y_, return_fmaps=True) 194 | with torch.no_grad(): 195 | p_real, fmaps_real = self.model["discriminator"]( 196 | y, return_fmaps=True 197 | ) 198 | fm_loss = self.criterion["fm"](fmaps_fake, fmaps_real) 199 | gen_loss += self.config.train.lambda_fm * fm_loss 200 | self.total_train_loss["train/fm_loss"] += fm_loss.item() 201 | else: 202 | p_fake = self.model["discriminator"](y_) 203 | # 
calculate adversarial loss 204 | adv_loss = self.criterion["adv"](p_fake) 205 | gen_loss += self.config.train.lambda_adv * adv_loss 206 | self.total_train_loss["train/adv_loss"] += adv_loss.item() 207 | 208 | self.total_train_loss["train/generator_loss"] += gen_loss.item() 209 | 210 | # update generator 211 | self.optimizer["generator"].zero_grad() 212 | gen_loss.backward() 213 | if self.config.train.generator_grad_norm > 0: 214 | torch.nn.utils.clip_grad_norm_( 215 | self.model["generator"].parameters(), 216 | self.config.train.generator_grad_norm, 217 | ) 218 | self.optimizer["generator"].step() 219 | self.scheduler["generator"].step() 220 | 221 | # discriminator 222 | if self.steps > self.config.train.discriminator_train_start_steps: 223 | # re-compute y_ 224 | with torch.no_grad(): 225 | y_ = self.model["generator"](z, c, d)[0] 226 | # calculate discriminator loss 227 | p_fake = self.model["discriminator"](y_.detach()) 228 | p_real = self.model["discriminator"](y) 229 | # NOTE: the first argument must be the fake samples 230 | fake_loss, real_loss = self.criterion["adv"](p_fake, p_real) 231 | dis_loss = fake_loss + real_loss 232 | self.total_train_loss["train/fake_loss"] += fake_loss.item() 233 | self.total_train_loss["train/real_loss"] += real_loss.item() 234 | self.total_train_loss["train/discriminator_loss"] += dis_loss.item() 235 | 236 | # update discriminator 237 | self.optimizer["discriminator"].zero_grad() 238 | dis_loss.backward() 239 | if self.config.train.discriminator_grad_norm > 0: 240 | torch.nn.utils.clip_grad_norm_( 241 | self.model["discriminator"].parameters(), 242 | self.config.train.discriminator_grad_norm, 243 | ) 244 | self.optimizer["discriminator"].step() 245 | self.scheduler["discriminator"].step() 246 | 247 | # update counts 248 | self.steps += 1 249 | self.tqdm.update(1) 250 | self._check_train_finish() 251 | 252 | def _train_epoch(self): 253 | """Train model one epoch.""" 254 | for train_steps_per_epoch, batch in enumerate(self.data_loader["train"], 1): 255 | # train one step 256 | self._train_step(batch) 257 | 258 | # check interval 259 | self._check_log_interval() 260 | self._check_eval_interval() 261 | self._check_save_interval() 262 | 263 | # check whether training is finished 264 | if self.finish_train: 265 | return 266 | 267 | # update 268 | self.epochs += 1 269 | self.train_steps_per_epoch = train_steps_per_epoch 270 | logger.info( 271 | f"(Steps: {self.steps}) Finished {self.epochs} epoch training " 272 | f"({self.train_steps_per_epoch} steps per epoch)."
273 | ) 274 | 275 | @torch.no_grad() 276 | def _eval_step(self, batch): 277 | """Evaluate model one step.""" 278 | # parse batch 279 | x, d, y = batch 280 | x = tuple([x.to(self.device) for x in x]) 281 | d = tuple([d.to(self.device) for d in d]) 282 | z, c, f0 = x 283 | y = y.to(self.device) 284 | 285 | # generator forward 286 | outs = self.model["generator"](z, c, d) 287 | y_ = outs[0] 288 | 289 | # calculate spectral loss 290 | mel_loss = self.criterion["mel"](y_, y) 291 | gen_loss = self.config.train.lambda_mel * mel_loss 292 | self.total_eval_loss["eval/mel_loss"] += mel_loss.item() 293 | 294 | # calculate source regularization loss for sifigan-based models 295 | if self.config.train.lambda_reg > 0: 296 | s = outs[1] 297 | if isinstance( 298 | self.criterion["reg"], 299 | sifigan.losses.ResidualLoss, 300 | ): 301 | reg_loss = self.criterion["reg"](s, y, f0) 302 | gen_loss += self.config.train.lambda_reg * reg_loss 303 | self.total_eval_loss["eval/reg_loss"] += reg_loss.item() 304 | else: 305 | reg_loss = self.criterion["reg"](s, f0) 306 | gen_loss += self.config.train.lambda_reg * reg_loss 307 | self.total_eval_loss["eval/reg_loss"] += reg_loss.item() 308 | 309 | # calculate discriminator related losses 310 | if self.steps > self.config.train.discriminator_train_start_steps: 311 | # calculate feature matching loss 312 | if self.config.train.lambda_fm > 0: 313 | p_fake, fmaps_fake = self.model["discriminator"](y_, return_fmaps=True) 314 | p_real, fmaps_real = self.model["discriminator"](y, return_fmaps=True) 315 | fm_loss = self.criterion["fm"](fmaps_fake, fmaps_real) 316 | gen_loss += self.config.train.lambda_fm * fm_loss 317 | self.total_eval_loss["eval/fm_loss"] += fm_loss.item() 318 | else: 319 | p_fake = self.model["discriminator"](y_) 320 | # calculate adversarial loss 321 | adv_loss = self.criterion["adv"](p_fake) 322 | gen_loss += self.config.train.lambda_adv * adv_loss 323 | self.total_eval_loss["eval/adv_loss"] += adv_loss.item() 324 | 325 | self.total_eval_loss["eval/generator_loss"] += gen_loss.item() 326 | 327 | # discriminator 328 | if self.steps > self.config.train.discriminator_train_start_steps: 329 | # calculate discriminator loss 330 | p_real = self.model["discriminator"](y) 331 | # NOTE: the first argument must be the fake samples 332 | fake_loss, real_loss = self.criterion["adv"](p_fake, p_real) 333 | dis_loss = fake_loss + real_loss 334 | self.total_eval_loss["eval/fake_loss"] += fake_loss.item() 335 | self.total_eval_loss["eval/real_loss"] += real_loss.item() 336 | self.total_eval_loss["eval/discriminator_loss"] += dis_loss.item() 337 | 338 | def _eval_epoch(self): 339 | """Evaluate model one epoch.""" 340 | logger.info(f"(Steps: {self.steps}) Start evaluation.") 341 | # change mode 342 | for key in self.model.keys(): 343 | self.model[key].eval() 344 | 345 | # calculate loss for each batch 346 | for eval_steps_per_epoch, batch in enumerate( 347 | tqdm(self.data_loader["valid"], desc="[eval]"), 1 348 | ): 349 | # eval one step 350 | self._eval_step(batch) 351 | 352 | # save intermediate result 353 | if eval_steps_per_epoch == 1: 354 | self._generate_and_save_intermediate_result(batch) 355 | if eval_steps_per_epoch == 3: 356 | break 357 | 358 | logger.info( 359 | f"(Steps: {self.steps}) Finished evaluation " 360 | f"({eval_steps_per_epoch} steps per epoch)."
361 | ) 362 | 363 | # average loss 364 | for key in self.total_eval_loss.keys(): 365 | self.total_eval_loss[key] /= eval_steps_per_epoch 366 | logger.info( 367 | f"(Steps: {self.steps}) {key} = {self.total_eval_loss[key]:.4f}." 368 | ) 369 | 370 | # record 371 | self._write_to_tensorboard(self.total_eval_loss) 372 | 373 | # reset 374 | self.total_eval_loss = defaultdict(float) 375 | 376 | # restore mode 377 | for key in self.model.keys(): 378 | self.model[key].train() 379 | 380 | @torch.no_grad() 381 | def _generate_and_save_intermediate_result(self, batch): 382 | """Generate and save intermediate result.""" 383 | # delayed import to avoid matplotlib backend errors 384 | import matplotlib.pyplot as plt 385 | 386 | x, d, y = batch 387 | # use only the first sample 388 | x = [x[:1].to(self.device) for x in x] 389 | d = [d[:1].to(self.device) for d in d] 390 | y = y[:1].to(self.device) 391 | z, c, _ = x 392 | 393 | # generator forward 394 | outs = self.model["generator"](z, c, d) 395 | 396 | len50ms = int(self.config.data.sample_rate * 0.05) 397 | start = np.random.randint(0, self.config.data.batch_max_length - len50ms) 398 | end = start + len50ms 399 | 400 | for audio, name in zip((y,) + outs, ["real", "fake", "source"]): 401 | if audio is not None: 402 | audio = audio.view(-1).cpu().numpy() 403 | 404 | # plot spectrogram 405 | fig = plt.figure(figsize=(8, 6)) 406 | spectrogram = np.abs( 407 | librosa.stft( 408 | y=audio, 409 | n_fft=1024, 410 | hop_length=128, 411 | win_length=1024, 412 | window="hann", 413 | ) 414 | ) 415 | spectrogram_db = librosa.amplitude_to_db(spectrogram, ref=np.max) 416 | librosa.display.specshow( 417 | spectrogram_db, 418 | sr=self.config.data.sample_rate, 419 | y_axis="linear", 420 | x_axis="time", 421 | hop_length=128, 422 | ) 423 | self.writer.add_figure(f"spectrogram/{name}", fig, self.steps) 424 | plt.clf() 425 | plt.close() 426 | 427 | # plot full waveform 428 | fig = plt.figure(figsize=(6, 3)) 429 | plt.plot(audio, linewidth=1) 430 | self.writer.add_figure(f"waveform/{name}", fig, self.steps) 431 | plt.clf() 432 | plt.close() 433 | 434 | # plot short term waveform 435 | fig = plt.figure(figsize=(6, 3)) 436 | plt.plot(audio[start:end], linewidth=1) 437 | self.writer.add_figure(f"short_waveform/{name}", fig, self.steps) 438 | plt.clf() 439 | plt.close() 440 | 441 | # save as wavfile 442 | self.writer.add_audio( 443 | f"audio_{name}.wav", 444 | audio, 445 | self.steps, 446 | self.config.data.sample_rate, 447 | ) 448 | 449 | def _write_to_tensorboard(self, loss): 450 | """Write to tensorboard.""" 451 | for key, value in loss.items(): 452 | self.writer.add_scalar(key, value, self.steps) 453 | 454 | def _check_save_interval(self): 455 | if self.steps % self.config.train.save_interval_steps == 0: 456 | self.save_checkpoint( 457 | os.path.join( 458 | self.config.out_dir, 459 | "checkpoints", 460 | f"checkpoint-{self.steps}steps.pkl", 461 | ) 462 | ) 463 | logger.info(f"Successfully saved checkpoint @ {self.steps} steps.") 464 | 465 | def _check_eval_interval(self): 466 | if self.steps % self.config.train.eval_interval_steps == 0: 467 | self._eval_epoch() 468 | 469 | def _check_log_interval(self): 470 | if self.steps % self.config.train.log_interval_steps == 0: 471 | for key in self.total_train_loss.keys(): 472 | self.total_train_loss[key] /= self.config.train.log_interval_steps 473 | logger.info( 474 | f"(Steps: {self.steps}) {key} = {self.total_train_loss[key]:.4f}."
475 | ) 476 | self._write_to_tensorboard(self.total_train_loss) 477 | 478 | # reset 479 | self.total_train_loss = defaultdict(float) 480 | 481 | def _check_train_finish(self): 482 | if self.steps >= self.config.train.train_max_steps: 483 | self.finish_train = True 484 | 485 | 486 | class Collater(object): 487 | """Customized collater for PyTorch DataLoader in training.""" 488 | 489 | def __init__( 490 | self, 491 | batch_max_length=12000, 492 | sample_rate=24000, 493 | hop_size=120, 494 | sine_amp=0.1, 495 | noise_amp=0.003, 496 | sine_f0_type="cf0", 497 | signal_types=["sine", "noise"], 498 | df_f0_type="cf0", 499 | dense_factors=[0.5, 1, 4, 8], 500 | upsample_scales=[5, 4, 3, 2], 501 | ): 502 | """Initialize customized collater for PyTorch DataLoader. 503 | 504 | Args: 505 | batch_max_length (int): Maximum number of waveform samples per batch item. 506 | sample_rate (int): Sampling rate. 507 | hop_size (int): Hop size of auxiliary features. 508 | sine_amp (float): Amplitude of sine signal. 509 | noise_amp (float): Amplitude of random noise signal. 510 | sine_f0_type (str): F0 type for generating sine signal. 511 | signal_types (list): List of types for input signals. df_f0_type (str): F0 type used to compute the dilated factors. dense_factors (list): Dense factors for the dilated factor sequences. upsample_scales (list): Upsampling scales of the generator. 512 | 513 | """ 514 | if batch_max_length % hop_size != 0: 515 | batch_max_length += -(batch_max_length % hop_size) 516 | self.batch_max_length = batch_max_length 517 | self.batch_max_frames = batch_max_length // hop_size 518 | self.sample_rate = sample_rate 519 | self.hop_size = hop_size 520 | self.sine_f0_type = sine_f0_type 521 | self.signal_generator = SignalGenerator( 522 | sample_rate=sample_rate, 523 | hop_size=hop_size, 524 | sine_amp=sine_amp, 525 | noise_amp=noise_amp, 526 | signal_types=signal_types, 527 | ) 528 | self.df_f0_type = df_f0_type 529 | self.dense_factors = dense_factors 530 | self.prod_upsample_scales = np.cumprod(upsample_scales) 531 | self.df_sample_rates = [ 532 | sample_rate / hop_size * s for s in self.prod_upsample_scales 533 | ] 534 | 535 | def __call__(self, batch): 536 | """Convert into batch tensors. 537 | 538 | Args: 539 | batch (list): list of tuple of the pair of audio and features. 540 | 541 | Returns: 542 | Tuple: Gaussian noise (and sine) batch (B, D, T), 543 | auxiliary feature batch (B, C, T'), 544 | and F0 sequence batch (B, 1, T'). 545 | List: Dilated factor batches [(B, 1, T''), ...]. 546 | Tensor: Target signal batch (B, 1, T).
547 | 548 | """ 549 | # time resolution check 550 | y_batch, c_batch, f0_batch, cf0_batch = [], [], [], [] 551 | dfs_batch = [[] for _ in range(len(self.dense_factors))] 552 | for idx in range(len(batch)): 553 | x, c, f0, cf0 = batch[idx] 554 | if len(c) > self.batch_max_frames: 555 | # randomly pick up a segment of batch_max_length samples 556 | start_frame = np.random.randint(0, len(c) - self.batch_max_frames) 557 | start_step = start_frame * self.hop_size 558 | y = x[start_step : start_step + self.batch_max_length] 559 | c = c[start_frame : start_frame + self.batch_max_frames] 560 | f0 = f0[start_frame : start_frame + self.batch_max_frames] 561 | cf0 = cf0[start_frame : start_frame + self.batch_max_frames] 562 | dfs = [] 563 | for df, us in zip(self.dense_factors, self.prod_upsample_scales): 564 | dfs += [ 565 | np.repeat(dilated_factor(cf0, self.sample_rate, df), us) 566 | if self.df_f0_type == "cf0" 567 | else np.repeat(dilated_factor(f0, self.sample_rate, df), us) 568 | ] 569 | self._check_length(y, c, f0, cf0, dfs) 570 | else: 571 | logger.warning(f"Removed short sample from batch (length={len(x)}).") 572 | continue 573 | y_batch += [y.astype(np.float32).reshape(-1, 1)] # [(T, 1), ...] 574 | c_batch += [c.astype(np.float32)] # [(T', D), ...] 575 | for i in range(len(self.dense_factors)): 576 | dfs_batch[i] += [ 577 | dfs[i].astype(np.float32).reshape(-1, 1) 578 | ] # [(T', 1), ...] 579 | f0_batch += [f0.astype(np.float32).reshape(-1, 1)] # [(T', 1), ...] 580 | cf0_batch += [cf0.astype(np.float32).reshape(-1, 1)] # [(T', 1), ...] 581 | 582 | # convert each batch to tensor, assume that each item in batch has the same length 583 | y_batch = torch.FloatTensor(np.array(y_batch)).transpose(2, 1) # (B, 1, T) 584 | c_batch = torch.FloatTensor(np.array(c_batch)).transpose(2, 1) # (B, D, T') 585 | for i in range(len(self.dense_factors)): 586 | dfs_batch[i] = torch.FloatTensor(np.array(dfs_batch[i])).transpose( 587 | 2, 1 588 | ) # (B, 1, T') 589 | f0_batch = torch.FloatTensor(np.array(f0_batch)).transpose(2, 1) # (B, 1, T') 590 | cf0_batch = torch.FloatTensor(np.array(cf0_batch)).transpose(2, 1) # (B, 1, T') 591 | 592 | # make input signal batch tensor 593 | if self.sine_f0_type == "cf0": 594 | in_batch = self.signal_generator(cf0_batch) 595 | elif self.sine_f0_type == "f0": 596 | in_batch = self.signal_generator(f0_batch) 597 | 598 | return (in_batch, c_batch, f0_batch), dfs_batch, y_batch 599 | 600 | def _check_length(self, x, c, f0, cf0, dfs): 601 | """Assert the audio and feature lengths are correctly adjusted for upsampling.""" 602 | assert len(x) == len(c) * self.hop_size 603 | assert len(x) == len(f0) * self.hop_size 604 | assert len(x) == len(cf0) * self.hop_size 605 | for i in range(len(self.dense_factors)): 606 | assert len(x) * self.df_sample_rates[i] == len(dfs[i]) * self.sample_rate 607 | 608 | 609 | @hydra.main(version_base=None, config_path="config", config_name="train") 610 | def main(config: DictConfig) -> None: 611 | """Run training process.""" 612 | 613 | if not torch.cuda.is_available(): 614 | print("CPU") 615 | device = torch.device("cpu") 616 | else: 617 | print("GPU") 618 | device = torch.device("cuda") 619 | # effective when using fixed size inputs 620 | # see https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936 621 | torch.backends.cudnn.benchmark = True 622 | 623 | # fix seed 624 | np.random.seed(config.seed) 625 | torch.manual_seed(config.seed) 626 | torch.cuda.manual_seed(config.seed) 627 | os.environ["PYTHONHASHSEED"] = str(config.seed)
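# NOTE: torch.backends.cudnn.benchmark = True (set above) favors throughput
# over reproducibility, so fixed seeds alone do not guarantee bit-identical runs.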
628 | 629 | # check directory existence 630 | if not os.path.exists(config.out_dir): 631 | os.makedirs(config.out_dir) 632 | 633 | # write config to yaml file 634 | with open(os.path.join(config.out_dir, "config.yaml"), "w") as f: 635 | f.write(OmegaConf.to_yaml(config)) 636 | logger.info(OmegaConf.to_yaml(config)) 637 | 638 | # get dataset 639 | if config.data.remove_short_samples: 640 | feat_length_threshold = config.data.batch_max_length // config.data.hop_size 641 | else: 642 | feat_length_threshold = None 643 | 644 | train_dataset = AudioFeatDataset( 645 | stats=to_absolute_path(config.data.stats), 646 | audio_list=to_absolute_path(config.data.train_audio), 647 | feat_list=to_absolute_path(config.data.train_feat), 648 | feat_length_threshold=feat_length_threshold, 649 | allow_cache=config.data.allow_cache, 650 | sample_rate=config.data.sample_rate, 651 | hop_size=config.data.hop_size, 652 | aux_feats=config.data.aux_feats, 653 | ) 654 | logger.info(f"The number of training files = {len(train_dataset)}.") 655 | 656 | valid_dataset = AudioFeatDataset( 657 | stats=to_absolute_path(config.data.stats), 658 | audio_list=to_absolute_path(config.data.valid_audio), 659 | feat_list=to_absolute_path(config.data.valid_feat), 660 | feat_length_threshold=feat_length_threshold, 661 | allow_cache=config.data.allow_cache, 662 | sample_rate=config.data.sample_rate, 663 | hop_size=config.data.hop_size, 664 | aux_feats=config.data.aux_feats, 665 | ) 666 | logger.info(f"The number of validation files = {len(valid_dataset)}.") 667 | 668 | dataset = {"train": train_dataset, "valid": valid_dataset} 669 | 670 | # get data loader 671 | collater = Collater( 672 | batch_max_length=config.data.batch_max_length, 673 | sample_rate=config.data.sample_rate, 674 | hop_size=config.data.hop_size, 675 | sine_amp=config.data.sine_amp, 676 | noise_amp=config.data.noise_amp, 677 | sine_f0_type=config.data.sine_f0_type, 678 | signal_types=config.data.signal_types, 679 | df_f0_type=config.data.df_f0_type, 680 | dense_factors=config.data.dense_factors, 681 | upsample_scales=config.generator.upsample_scales, 682 | ) 683 | train_sampler, valid_sampler = None, None 684 | data_loader = { 685 | "train": DataLoader( 686 | dataset=dataset["train"], 687 | shuffle=True, 688 | collate_fn=collater, 689 | batch_size=config.data.batch_size, 690 | num_workers=config.data.num_workers, 691 | sampler=train_sampler, 692 | pin_memory=config.data.pin_memory, 693 | ), 694 | "valid": DataLoader( 695 | dataset=dataset["valid"], 696 | shuffle=True, 697 | collate_fn=collater, 698 | batch_size=config.data.batch_size, 699 | num_workers=config.data.num_workers, 700 | sampler=valid_sampler, 701 | pin_memory=config.data.pin_memory, 702 | ), 703 | } 704 | 705 | # define models and optimizers 706 | model = { 707 | "generator": hydra.utils.instantiate(config.generator).to(device), 708 | "discriminator": hydra.utils.instantiate(config.discriminator).to(device), 709 | } 710 | 711 | # define training criteria 712 | criterion = { 713 | "mel": hydra.utils.instantiate(config.train.mel_loss).to(device), 714 | "adv": hydra.utils.instantiate(config.train.adv_loss).to(device), 715 | } 716 | if config.train.lambda_fm > 0: 717 | criterion["fm"] = hydra.utils.instantiate(config.train.fm_loss).to(device) 718 | if config.train.lambda_reg > 0: 719 | criterion["reg"] = hydra.utils.instantiate(config.train.reg_loss).to(device) 720 | 721 | # define optimizers and schedulers 722 | optimizer = { 723 | "generator": hydra.utils.instantiate( 724 | config.train.generator_optimizer, 
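# Hydra instantiates the class named by _target_ in the config
# (torch.optim.Adam here) and forwards `params` to its constructor.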
params=model["generator"].parameters() 725 | ), 726 | "discriminator": hydra.utils.instantiate( 727 | config.train.discriminator_optimizer, 728 | params=model["discriminator"].parameters(), 729 | ), 730 | } 731 | scheduler = { 732 | "generator": hydra.utils.instantiate( 733 | config.train.generator_scheduler, optimizer=optimizer["generator"] 734 | ), 735 | "discriminator": hydra.utils.instantiate( 736 | config.train.discriminator_scheduler, optimizer=optimizer["discriminator"] 737 | ), 738 | } 739 | 740 | # define trainer 741 | trainer = Trainer( 742 | config=config, 743 | steps=0, 744 | epochs=0, 745 | data_loader=data_loader, 746 | model=model, 747 | criterion=criterion, 748 | optimizer=optimizer, 749 | scheduler=scheduler, 750 | device=device, 751 | ) 752 | 753 | # load trained parameters from checkpoint 754 | if config.train.resume: 755 | resume = os.path.join( 756 | config.out_dir, "checkpoints", f"checkpoint-{config.train.resume}steps.pkl" 757 | ) 758 | if os.path.exists(resume): 759 | trainer.load_checkpoint(resume) 760 | logger.info(f"Successfully resumed from {resume}.") 761 | else: 762 | logger.info(f"Failed to resume from {resume}.") 763 | sys.exit(0) 764 | else: 765 | logger.info("Start a new training process.") 766 | 767 | # run training loop 768 | try: 769 | trainer.run() 770 | except KeyboardInterrupt: 771 | trainer.save_checkpoint( 772 | os.path.join( 773 | config.out_dir, "checkpoints", f"checkpoint-{trainer.steps}steps.pkl" 774 | ) 775 | ) 776 | logger.info(f"Successfully saved checkpoint @ {trainer.steps}steps.") 777 | 778 | 779 | if __name__ == "__main__": 780 | main() 781 | -------------------------------------------------------------------------------- /sifigan/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from sifigan.datasets.audio_feat_dataset import * # NOQA 2 | -------------------------------------------------------------------------------- /sifigan/datasets/audio_feat_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Dataset modules. 7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | 12 | """ 13 | 14 | from logging import getLogger 15 | from multiprocessing import Manager 16 | 17 | import librosa 18 | import numpy as np 19 | import soundfile as sf 20 | from hydra.utils import to_absolute_path 21 | from joblib import load 22 | from sifigan.utils import check_filename, read_hdf5, read_txt, validate_length 23 | from torch.utils.data import Dataset 24 | 25 | # A logger for this file 26 | logger = getLogger(__name__) 27 | 28 | 29 | class AudioFeatDataset(Dataset): 30 | """PyTorch compatible audio and acoustic feat. dataset.""" 31 | 32 | def __init__( 33 | self, 34 | stats, 35 | audio_list, 36 | feat_list, 37 | audio_length_threshold=None, 38 | feat_length_threshold=None, 39 | return_filename=False, 40 | allow_cache=False, 41 | sample_rate=24000, 42 | hop_size=120, 43 | aux_feats=["mcep", "bap"], 44 | ): 45 | """Initialize dataset. 46 | 47 | Args: 48 | stats (str): Filename of the statistic hdf5 file. 49 | audio_list (str): Filename of the list of audio files. 50 | feat_list (str): Filename of the list of feature files. 51 | audio_length_threshold (int): Threshold to remove short audio files. 
52 | feat_length_threshold (int): Threshold to remove short feature files. 53 | return_filename (bool): Whether to return the filename with arrays. 54 | allow_cache (bool): Whether to allow cache of the loaded files. 55 | sample_rate (int): Sampling frequency. 56 | hop_size (int): Hop size of acoustic features. 57 | aux_feats (list): Types of auxiliary features. 58 | 59 | """ 60 | # load audio and feature files & check filename 61 | audio_files = read_txt(to_absolute_path(audio_list)) 62 | feat_files = read_txt(to_absolute_path(feat_list)) 63 | assert check_filename(audio_files, feat_files) 64 | 65 | # filter by threshold 66 | if audio_length_threshold is not None: 67 | audio_lengths = [sf.read(to_absolute_path(f))[0].shape[0] for f in audio_files] 68 | idxs = [ 69 | idx 70 | for idx in range(len(audio_files)) 71 | if audio_lengths[idx] > audio_length_threshold 72 | ] 73 | if len(audio_files) != len(idxs): 74 | logger.warning( 75 | f"Some files are filtered by audio length threshold " 76 | f"({len(audio_files)} -> {len(idxs)})." 77 | ) 78 | audio_files = [audio_files[idx] for idx in idxs] 79 | feat_files = [feat_files[idx] for idx in idxs] 80 | if feat_length_threshold is not None: 81 | f0_lengths = [ 82 | read_hdf5(to_absolute_path(f), "/f0").shape[0] for f in feat_files 83 | ] 84 | idxs = [ 85 | idx 86 | for idx in range(len(feat_files)) 87 | if f0_lengths[idx] > feat_length_threshold 88 | ] 89 | if len(feat_files) != len(idxs): 90 | logger.warning( 91 | f"Some files are filtered by feature length threshold " 92 | f"({len(feat_files)} -> {len(idxs)})." 93 | ) 94 | audio_files = [audio_files[idx] for idx in idxs] 95 | feat_files = [feat_files[idx] for idx in idxs] 96 | 97 | # assert the number of files 98 | assert len(audio_files) != 0, f"{audio_list} is empty." 99 | assert len(audio_files) == len( 100 | feat_files 101 | ), f"Numbers of audio and feature files are different ({len(audio_files)} vs {len(feat_files)})." 102 | 103 | self.audio_files = audio_files 104 | self.feat_files = feat_files 105 | self.return_filename = return_filename 106 | self.allow_cache = allow_cache 107 | self.sample_rate = sample_rate 108 | self.hop_size = hop_size 109 | self.aux_feats = aux_feats 110 | logger.info(f"Feature type : {self.aux_feats}") 111 | 112 | if allow_cache: 113 | # NOTE(kan-bayashi): Manager is needed to share memory in dataloader with num_workers > 0 114 | self.manager = Manager() 115 | self.caches = self.manager.list() 116 | self.caches += [() for _ in range(len(audio_files))] 117 | 118 | # define feature pre-processing function 119 | self.scaler = load(stats) 120 | 121 | def __getitem__(self, idx): 122 | """Get specified idx items. 123 | 124 | Args: 125 | idx (int): Index of the item. 126 | 127 | Returns: 128 | str: Utterance id (only in return_filename = True). 129 | ndarray: Audio signal (T,). 130 | ndarray: Auxiliary features (T', C). 131 | ndarray: F0 sequence (T', 1). 132 | ndarray: Continuous F0 sequence (T', 1). 133 | 134 | """ 135 | if self.allow_cache and len(self.caches[idx]) != 0: 136 | return self.caches[idx] 137 | # load audio and features 138 | audio, sr = sf.read(to_absolute_path(self.audio_files[idx])) 139 | if sr != self.sample_rate: 140 | logger.warning( 141 | f"Resampling {self.audio_files[idx]} incurs extra computational cost. " 142 | + "It is recommended to prepare audio files with the desired sampling rate in advance."
143 | ) 144 | audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate) 145 | # audio & feature pre-processing 146 | audio = audio.astype(np.float32) 147 | 148 | # get auxiliary features 149 | aux_feats = [] 150 | for feat_type in self.aux_feats: 151 | aux_feat = read_hdf5( 152 | to_absolute_path(self.feat_files[idx]), f"/{feat_type}" 153 | ) 154 | aux_feat = self.scaler[f"{feat_type}"].transform(aux_feat) 155 | aux_feats += [aux_feat] 156 | aux_feats = np.concatenate(aux_feats, axis=1) 157 | 158 | # get f0 sequences 159 | f0 = read_hdf5(to_absolute_path(self.feat_files[idx]), "/f0") # discrete F0 160 | cf0 = read_hdf5(to_absolute_path(self.feat_files[idx]), "/cf0") # continuous F0 161 | 162 | # adjust length 163 | aux_feats, f0, cf0, audio = validate_length( 164 | (aux_feats, f0, cf0), (audio,), self.hop_size 165 | ) 166 | 167 | if self.return_filename: 168 | items = self.feat_files[idx], audio, aux_feats, f0, cf0 169 | else: 170 | items = audio, aux_feats, f0, cf0 171 | 172 | if self.allow_cache: 173 | self.caches[idx] = items 174 | 175 | return items 176 | 177 | def __len__(self): 178 | """Return dataset length. 179 | 180 | Returns: 181 | int: The length of dataset. 182 | 183 | """ 184 | return len(self.audio_files) 185 | 186 | 187 | class FeatDataset(Dataset): 188 | """PyTorch compatible acoustic feature dataset.""" 189 | 190 | def __init__( 191 | self, 192 | stats, 193 | feat_list, 194 | feat_length_threshold=None, 195 | return_filename=False, 196 | allow_cache=False, 197 | sample_rate=24000, 198 | hop_size=120, 199 | aux_feats=["mcep", "bap"], 200 | f0_factor=1.0, 201 | ): 202 | """Initialize dataset. 203 | 204 | Args: 205 | stats (str): Filename of the statistic hdf5 file. 206 | feat_list (str): Filename of the list of feature files. 207 | feat_length_threshold (int): Threshold to remove short feature files. 208 | return_filename (bool): Whether to return the utterance id with arrays. 209 | allow_cache (bool): Whether to allow cache of the loaded files. 210 | sample_rate (int): Sampling frequency. 211 | hop_size (int): Hop size of acoustic features. 212 | aux_feats (list): Types of auxiliary features. 213 | f0_factor (float): Ratio of scaled f0. 214 | 215 | """ 216 | # load feat. files 217 | feat_files = read_txt(to_absolute_path(feat_list)) 218 | 219 | # filter by threshold 220 | if feat_length_threshold is not None: 221 | f0_lengths = [ 222 | read_hdf5(to_absolute_path(f), "/f0").shape[0] for f in feat_files 223 | ] 224 | idxs = [ 225 | idx 226 | for idx in range(len(feat_files)) 227 | if f0_lengths[idx] > feat_length_threshold 228 | ] 229 | if len(feat_files) != len(idxs): 230 | logger.warning( 231 | f"Some files are filtered by feature length threshold " 232 | f"({len(feat_files)} -> {len(idxs)})." 233 | ) 234 | feat_files = [feat_files[idx] for idx in idxs] 235 | 236 | # assert the number of files 237 | assert len(feat_files) != 0, f"{feat_list} is empty."
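# NOTE: f0_factor scales both the F0-derived conditioning features and the
# F0 sequences returned for the sine source, e.g. f0_factor=2.0 decodes one
# octave up (see the f0 scaling in __getitem__ below).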
238 | 239 | self.feat_files = feat_files 240 | self.return_filename = return_filename 241 | self.allow_cache = allow_cache 242 | self.sample_rate = sample_rate 243 | self.hop_size = hop_size 244 | self.aux_feats = aux_feats 245 | self.f0_factor = f0_factor 246 | logger.info(f"Feature type : {self.aux_feats}") 247 | 248 | if allow_cache: 249 | # NOTE(kan-bayashi): Manager is needed to share memory in dataloader with num_workers > 0 250 | self.manager = Manager() 251 | self.caches = self.manager.list() 252 | self.caches += [() for _ in range(len(feat_files))] 253 | 254 | # define feature pre-processing function 255 | self.scaler = load(stats) 256 | 257 | def __getitem__(self, idx): 258 | """Get specified idx items. 259 | 260 | Args: 261 | idx (int): Index of the item. 262 | 263 | Returns: 264 | str: Utterance id (only in return_filename = True). 265 | ndarray: Auxiliary feature (T', C). 266 | ndarray: F0 sequence (T', 1). 267 | ndarray: Continuous F0 sequence (T', 1). 268 | 269 | """ 270 | if self.allow_cache and len(self.caches[idx]) != 0: 271 | return self.caches[idx] 272 | 273 | # get auxiliary features 274 | aux_feats = [] 275 | for feat_type in self.aux_feats: 276 | aux_feat = read_hdf5( 277 | to_absolute_path(self.feat_files[idx]), f"/{feat_type}" 278 | ) 279 | if feat_type in ["f0", "cf0"]: # f0 scaling 280 | aux_feat *= self.f0_factor 281 | aux_feat = self.scaler[f"{feat_type}"].transform(aux_feat) 282 | aux_feats += [aux_feat] 283 | aux_feats = np.concatenate(aux_feats, axis=1) 284 | 285 | # get f0 sequences 286 | f0 = read_hdf5(to_absolute_path(self.feat_files[idx]), "/f0") # discrete F0 287 | cf0 = read_hdf5(to_absolute_path(self.feat_files[idx]), "/cf0") # continuous F0 288 | 289 | # adjust length 290 | aux_feats, f0, cf0 = validate_length((aux_feats, f0, cf0)) 291 | 292 | # f0 scaling 293 | f0 *= self.f0_factor 294 | cf0 *= self.f0_factor 295 | 296 | if self.return_filename: 297 | items = self.feat_files[idx], aux_feats, f0, cf0 298 | else: 299 | items = aux_feats, f0, cf0 300 | 301 | if self.allow_cache: 302 | self.caches[idx] = items 303 | 304 | return items 305 | 306 | def __len__(self): 307 | """Return dataset length. 308 | 309 | Returns: 310 | int: The length of dataset. 311 | 312 | """ 313 | return len(self.feat_files) 314 | -------------------------------------------------------------------------------- /sifigan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from sifigan.layers.cheaptrick import * # NOQA 2 | from sifigan.layers.snake import * # NOQA 3 | from sifigan.layers.residual_block import * # NOQA 4 | -------------------------------------------------------------------------------- /sifigan/layers/cheaptrick.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Spectral envelope estimation module based on CheapTrick. 7 | 8 | References: 9 | - https://www.sciencedirect.com/science/article/pii/S0167639314000697 10 | - https://github.com/mmorise/World 11 | 12 | """ 13 | 14 | import math 15 | 16 | import torch 17 | import torch.fft 18 | import torch.nn as nn 19 | 20 | 21 | class AdaptiveWindowing(nn.Module): 22 | """CheapTrick F0 adaptive windowing module.""" 23 | 24 | def __init__( 25 | self, 26 | sample_rate, 27 | hop_size, 28 | fft_size, 29 | f0_floor, 30 | f0_ceil, 31 | ): 32 | """Initialize AdaptiveWindowing module.
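Precomputes an F0-adaptive Hanning-like window for every integer F0 in [f0_floor, f0_ceil]; each window spans three fundamental periods (half length = 1.5 * sample_rate / f0), following CheapTrick.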
33 | 
34 |         Args:
35 |             sample_rate (int): Sampling rate.
36 |             hop_size (int): Hop size.
37 |             fft_size (int): FFT size.
38 |             f0_floor (int): Minimum value of F0.
39 |             f0_ceil (int): Maximum value of F0.
40 | 
41 |         """
42 |         super(AdaptiveWindowing, self).__init__()
43 | 
44 |         self.sample_rate = sample_rate
45 |         self.hop_size = hop_size
46 |         self.fft_size = fft_size
47 |         self.register_buffer("window", torch.zeros((f0_ceil + 1, fft_size)))
48 |         self.zero_padding = nn.ConstantPad2d((fft_size // 2, fft_size // 2, 0, 0), 0)
49 | 
50 |         # Pre-calculation of the window functions
51 |         for f0 in range(f0_floor, f0_ceil + 1):
52 |             half_win_len = round(1.5 * self.sample_rate / f0)
53 |             base_index = torch.arange(
54 |                 -half_win_len, half_win_len + 1, dtype=torch.int64
55 |             )
56 |             position = base_index / 1.5 / self.sample_rate
57 |             left = fft_size // 2 - half_win_len
58 |             right = fft_size // 2 + half_win_len + 1
59 |             window = torch.zeros(fft_size)
60 |             window[left:right] = 0.5 * torch.cos(math.pi * position * f0) + 0.5
61 |             average = torch.sum(window * window).pow(0.5)
62 |             self.window[f0] = window / average
63 | 
64 |     def forward(self, x, f, power=False):
65 |         """Calculate forward propagation.
66 | 
67 |         Args:
68 |             x (Tensor): Waveform (B, T).
69 |             f (Tensor): F0 sequence (B, T').
70 |             power (bool): Whether to use power or magnitude.
71 | 
72 |         Returns:
73 |             Tensor: Power or magnitude spectrogram (B, T', bin_size).
74 | 
75 |         """
76 |         # Get the matrix of window functions corresponding to F0
77 |         x = self.zero_padding(x).unfold(1, self.fft_size, self.hop_size)
78 |         windows = self.window[f]
79 | 
80 |         # Adaptive windowing and calculate power spectrogram.
81 |         # At inference time, use x instead of x[:, :-1, :].
82 |         x = torch.abs(torch.fft.rfft(x[:, :-1, :] * windows))
83 |         x = x.pow(2) if power else x
84 | 
85 |         return x
86 | 
87 | 
88 | class AdaptiveLiftering(nn.Module):
89 |     """CheapTrick F0 adaptive liftering module."""
90 | 
91 |     def __init__(
92 |         self,
93 |         sample_rate,
94 |         fft_size,
95 |         f0_floor,
96 |         f0_ceil,
97 |         q1=-0.15,
98 |     ):
99 |         """Initialize AdaptiveLiftering module.
100 | 
101 |         Args:
102 |             sample_rate (int): Sampling rate.
103 |             fft_size (int): FFT size.
104 |             f0_floor (int): Minimum value of F0.
105 |             f0_ceil (int): Maximum value of F0.
106 |             q1 (float): Parameter to remove effect of adjacent harmonics.
107 | 
108 |         """
109 |         super(AdaptiveLiftering, self).__init__()
110 | 
111 |         self.sample_rate = sample_rate
112 |         self.bin_size = fft_size // 2 + 1
113 |         self.q1 = q1
114 |         self.q0 = 1.0 - 2.0 * q1
115 |         self.register_buffer(
116 |             "smoothing_lifter", torch.zeros((f0_ceil + 1, self.bin_size))
117 |         )
118 |         self.register_buffer(
119 |             "compensation_lifter", torch.zeros((f0_ceil + 1, self.bin_size))
120 |         )
121 | 
122 |         # Pre-calculation of the smoothing lifters and compensation lifters
123 |         for f0 in range(f0_floor, f0_ceil + 1):
124 |             smoothing_lifter = torch.zeros(self.bin_size)
125 |             compensation_lifter = torch.zeros(self.bin_size)
126 |             quefrency = torch.arange(1, self.bin_size) / sample_rate
127 |             smoothing_lifter[0] = 1.0
128 |             smoothing_lifter[1:] = torch.sin(math.pi * f0 * quefrency) / (
129 |                 math.pi * f0 * quefrency
130 |             )
131 |             compensation_lifter[0] = self.q0 + 2.0 * self.q1
132 |             compensation_lifter[1:] = self.q0 + 2.0 * self.q1 * torch.cos(
133 |                 2.0 * math.pi * f0 * quefrency
134 |             )
135 |             self.smoothing_lifter[f0] = smoothing_lifter
136 |             self.compensation_lifter[f0] = compensation_lifter
137 | 
138 |     def forward(self, x, f, elim_0th=False):
139 |         """Calculate forward propagation.
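
        The cepstrum of the input spectrogram is multiplied by an F0-dependent
        smoothing lifter, sin(pi * f0 * q) / (pi * f0 * q), and a compensation
        lifter, q0 + 2 * q1 * cos(2 * pi * f0 * q), where q denotes quefrency.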
140 | 
141 |         Args:
142 |             x (Tensor): Power or magnitude spectrogram (B, T', bin_size).
143 |             f (Tensor): F0 sequence (B, T').
144 |             elim_0th (bool): Whether to eliminate the 0th cepstrum component.
145 | 
146 |         Returns:
147 |             Tensor: Estimated spectral envelope (B, T', bin_size).
148 | 
149 |         """
150 |         # Setting the smoothing lifter and compensation lifter
151 |         smoothing_lifter = self.smoothing_lifter[f]
152 |         compensation_lifter = self.compensation_lifter[f]
153 | 
154 |         # Calculating cepstrum
155 |         tmp = torch.cat((x, torch.flip(x[:, :, 1:-1], [2])), dim=2)
156 |         cepstrum = torch.fft.rfft(torch.log(torch.clamp(tmp, min=1e-7))).real
157 | 
158 |         # Set the 0th cepstrum to 0
159 |         if elim_0th:
160 |             cepstrum[..., 0] = 0
161 | 
162 |         # Liftering cepstrum with the lifters
163 |         liftered_cepstrum = cepstrum * smoothing_lifter * compensation_lifter
164 | 
165 |         # Return the result to the spectral domain
166 |         x = torch.fft.irfft(liftered_cepstrum)[:, :, : self.bin_size]
167 | 
168 |         return x
169 | 
170 | 
171 | class CheapTrick(nn.Module):
172 |     """CheapTrick based spectral envelope estimation module."""
173 | 
174 |     def __init__(
175 |         self,
176 |         sample_rate,
177 |         hop_size,
178 |         fft_size,
179 |         f0_floor=70,
180 |         f0_ceil=340,
181 |         uv_threshold=0,
182 |         q1=-0.15,
183 |     ):
184 |         """Initialize CheapTrick module.
185 | 
186 |         Args:
187 |             sample_rate (int): Sampling rate.
188 |             hop_size (int): Hop size.
189 |             fft_size (int): FFT size.
190 |             f0_floor (int): Minimum value of F0.
191 |             f0_ceil (int): Maximum value of F0.
192 |             uv_threshold (float): V/UV determining threshold.
193 |             q1 (float): Parameter to remove effect of adjacent harmonics.
194 | 
195 |         """
196 |         super(CheapTrick, self).__init__()
197 | 
198 |         # fft_size must be larger than 3.0 * sample_rate / f0_floor
199 |         assert fft_size > 3.0 * sample_rate / f0_floor
200 |         self.f0_floor = f0_floor
201 |         self.f0_ceil = f0_ceil
202 |         self.uv_threshold = uv_threshold
203 | 
204 |         self.ada_wind = AdaptiveWindowing(
205 |             sample_rate,
206 |             hop_size,
207 |             fft_size,
208 |             f0_floor,
209 |             f0_ceil,
210 |         )
211 |         self.ada_lift = AdaptiveLiftering(
212 |             sample_rate,
213 |             fft_size,
214 |             f0_floor,
215 |             f0_ceil,
216 |             q1,
217 |         )
218 | 
219 |     def forward(self, x, f, power=False, elim_0th=False):
220 |         """Calculate forward propagation.
221 | 
222 |         Args:
223 |             x (Tensor): Waveform (B, T).
224 |             f (Tensor): F0 sequence (B, T').
225 |             power (bool): Whether to use power or magnitude spectrogram.
226 |             elim_0th (bool): Whether to eliminate the 0th cepstrum component.
227 | 
228 |         Returns:
229 |             Tensor: Estimated spectral envelope (B, T', bin_size).
230 | 
231 |         """
232 |         # Step 0: Round F0 values to integers.
233 |         voiced = (f > self.uv_threshold) * torch.ones_like(f)
234 |         f = voiced * f + (1.0 - voiced) * self.f0_ceil
235 |         f = torch.round(torch.clamp(f, min=self.f0_floor, max=self.f0_ceil)).to(
236 |             torch.int64
237 |         )
238 | 
239 |         # Step 1: Adaptive windowing and calculation of power or magnitude spectrogram.
240 |         x = self.ada_wind(x, f, power)
241 | 
242 |         # Step 2: Smoothing (log axis) and spectral recovery on the cepstrum domain.
243 |         x = self.ada_lift(x, f, elim_0th)
244 | 
245 |         return x
246 | 
--------------------------------------------------------------------------------
/sifigan/layers/residual_block.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2022 Reo Yoneyama (Nagoya University)
4 | # MIT License (https://opensource.org/licenses/MIT)
5 | 
6 | """Residual block modules.
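
Example (an illustrative sketch, assuming the default arguments below):

    >>> import torch
    >>> from sifigan.layers import ResidualBlock
    >>> block = ResidualBlock(kernel_size=3, channels=512, dilations=(1, 3, 5))
    >>> x = torch.randn(2, 512, 100)  # (B, channels, T)
    >>> y = block(x)  # shape is preserved: (2, 512, 100)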
7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/bigpon/QPPWG 11 | - https://github.com/r9y9/wavenet_vocoder 12 | 13 | """ 14 | 15 | from logging import getLogger 16 | 17 | import torch 18 | import torch.nn as nn 19 | from sifigan.layers import Snake 20 | from sifigan.utils import index_initial, pd_indexing 21 | 22 | # A logger for this file 23 | logger = getLogger(__name__) 24 | 25 | 26 | class Conv1d(nn.Conv1d): 27 | """Conv1d module with customized initialization.""" 28 | 29 | def __init__(self, *args, **kwargs): 30 | """Initialize Conv1d module.""" 31 | super(Conv1d, self).__init__(*args, **kwargs) 32 | 33 | def reset_parameters(self): 34 | """Reset parameters.""" 35 | nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 36 | if self.bias is not None: 37 | nn.init.constant_(self.bias, 0.0) 38 | 39 | 40 | class Conv1d1x1(Conv1d): 41 | """1x1 Conv1d with customized initialization.""" 42 | 43 | def __init__(self, in_channels, out_channels, bias=True): 44 | """Initialize 1x1 Conv1d module.""" 45 | super(Conv1d1x1, self).__init__( 46 | in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias 47 | ) 48 | 49 | 50 | class Conv2d(nn.Conv2d): 51 | """Conv2d module with customized initialization.""" 52 | 53 | def __init__(self, *args, **kwargs): 54 | """Initialize Conv2d module.""" 55 | super(Conv2d, self).__init__(*args, **kwargs) 56 | 57 | def reset_parameters(self): 58 | """Reset parameters.""" 59 | nn.init.kaiming_normal_(self.weight, mode="fan_out", nonlinearity="relu") 60 | if self.bias is not None: 61 | nn.init.constant_(self.bias, 0.0) 62 | 63 | 64 | class Conv2d1x1(Conv2d): 65 | """1x1 Conv2d with customized initialization.""" 66 | 67 | def __init__(self, in_channels, out_channels, bias=True): 68 | """Initialize 1x1 Conv2d module.""" 69 | super(Conv2d1x1, self).__init__( 70 | in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias 71 | ) 72 | 73 | 74 | class ResidualBlock(nn.Module): 75 | """Residual block module in HiFiGAN.""" 76 | 77 | def __init__( 78 | self, 79 | kernel_size=3, 80 | channels=512, 81 | dilations=(1, 3, 5), 82 | bias=True, 83 | use_additional_convs=True, 84 | nonlinear_activation="LeakyReLU", 85 | nonlinear_activation_params={"negative_slope": 0.1}, 86 | ): 87 | """Initialize ResidualBlock module. 88 | 89 | Args: 90 | kernel_size (int): Kernel size of dilation convolution layer. 91 | channels (int): Number of channels for convolution layer. 92 | dilations (List[int]): List of dilation factors. 93 | use_additional_convs (bool): Whether to use additional convolution layers. 94 | bias (bool): Whether to add bias parameter in convolution layers. 95 | nonlinear_activation (str): Activation function module name. 96 | nonlinear_activation_params (dict): Hyperparameters for activation function. 97 | 98 | """ 99 | super().__init__() 100 | self.use_additional_convs = use_additional_convs 101 | self.convs1 = nn.ModuleList() 102 | if use_additional_convs: 103 | self.convs2 = nn.ModuleList() 104 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 
105 |         for dilation in dilations:
106 |             if nonlinear_activation == "Snake":
107 |                 nonlinear = Snake(channels, **nonlinear_activation_params)
108 |             else:
109 |                 nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
110 |             self.convs1 += [
111 |                 nn.Sequential(
112 |                     nonlinear,
113 |                     nn.Conv1d(
114 |                         channels,
115 |                         channels,
116 |                         kernel_size,
117 |                         dilation=dilation,
118 |                         bias=bias,
119 |                         padding=(kernel_size - 1) // 2 * dilation,
120 |                     ),
121 |                 )
122 |             ]
123 |             if use_additional_convs:
124 |                 if nonlinear_activation == "Snake":
125 |                     nonlinear = Snake(channels, **nonlinear_activation_params)
126 |                 else:
127 |                     nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
128 |                 self.convs2 += [
129 |                     nn.Sequential(
130 |                         nonlinear,
131 |                         nn.Conv1d(
132 |                             channels,
133 |                             channels,
134 |                             kernel_size,
135 |                             dilation=1,
136 |                             bias=bias,
137 |                             padding=(kernel_size - 1) // 2,
138 |                         ),
139 |                     )
140 |                 ]
141 | 
142 |     def forward(self, x):
143 |         """Calculate forward propagation.
144 | 
145 |         Args:
146 |             x (Tensor): Input tensor (B, channels, T).
147 | 
148 |         Returns:
149 |             Tensor: Output tensor (B, channels, T).
150 | 
151 |         """
152 |         for idx in range(len(self.convs1)):
153 |             xt = self.convs1[idx](x)
154 |             if self.use_additional_convs:
155 |                 xt = self.convs2[idx](xt)
156 |             x = xt + x
157 |         return x
158 | 
159 | 
160 | class AdaptiveResidualBlock(nn.Module):
161 |     """Residual block module with pitch-dependent dilated convolutions."""
162 | 
163 |     def __init__(
164 |         self,
165 |         kernel_size=3,
166 |         channels=512,
167 |         dilations=(1, 2, 4),
168 |         bias=True,
169 |         use_additional_convs=True,
170 |         nonlinear_activation="LeakyReLU",
171 |         nonlinear_activation_params={"negative_slope": 0.1},
172 |     ):
173 |         """Initialize AdaptiveResidualBlock module.
174 | 
175 |         Args:
176 |             kernel_size (int): Kernel size of dilation convolution layer.
177 |             channels (int): Number of channels for convolution layer.
178 |             bias (bool): Whether to add bias parameter in convolution layers.
179 |             nonlinear_activation (str): Activation function module name.
180 |             nonlinear_activation_params (dict): Hyperparameters for activation function.
181 | 
182 |         """
183 |         super().__init__()
184 |         self.use_additional_convs = use_additional_convs
185 |         assert kernel_size == 3, "Currently only kernel_size = 3 is supported."
186 | self.channels = channels 187 | self.dilations = dilations 188 | self.nonlinears = nn.ModuleList() 189 | self.convsC = nn.ModuleList() 190 | self.convsP = nn.ModuleList() 191 | self.convsF = nn.ModuleList() 192 | if use_additional_convs: 193 | self.convsA = nn.ModuleList() 194 | for _ in dilations: 195 | if nonlinear_activation == "Snake": 196 | self.nonlinears += [Snake(channels, **nonlinear_activation_params)] 197 | else: 198 | self.nonlinears += [getattr(nn, nonlinear_activation)(**nonlinear_activation_params)] 199 | self.convsC += [ 200 | Conv1d1x1( 201 | channels, 202 | channels, 203 | bias=bias, 204 | ), 205 | ] 206 | self.convsP += [ 207 | Conv1d1x1( 208 | channels, 209 | channels, 210 | bias=bias, 211 | ), 212 | ] 213 | self.convsF += [ 214 | Conv1d1x1( 215 | channels, 216 | channels, 217 | bias=bias, 218 | ), 219 | ] 220 | if use_additional_convs: 221 | if nonlinear_activation == "Snake": 222 | nonlinear = Snake(channels, **nonlinear_activation_params) 223 | else: 224 | nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params) 225 | self.convsA += [ 226 | nn.Sequential( 227 | nonlinear, 228 | nn.Conv1d( 229 | channels, 230 | channels, 231 | kernel_size, 232 | dilation=1, 233 | bias=bias, 234 | padding=(kernel_size - 1) // 2, 235 | ), 236 | ) 237 | ] 238 | 239 | def forward(self, x, d): 240 | """Calculate forward propagation. 241 | 242 | Args: 243 | x (Tensor): Input tensor (B, channels, T). 244 | d (Tensor): Input pitch-dependent dilated factors (B, 1, T). 245 | 246 | Returns: 247 | Tensor: Output tensor (B, channels, T). 248 | 249 | """ 250 | batch_index, ch_index = index_initial(x.size(0), self.channels) 251 | for i, dilation in enumerate(self.dilations): 252 | xt = self.nonlinears[i](x) 253 | xP, xF = pd_indexing(xt, d, dilation, batch_index, ch_index) 254 | xt = self.convsC[i](xt) + self.convsP[i](xP) + self.convsF[i](xF) 255 | if self.use_additional_convs: 256 | xt = self.convsA[i](xt) 257 | x = xt + x 258 | return x 259 | -------------------------------------------------------------------------------- /sifigan/layers/snake.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Snake Activation Function Module. 7 | 8 | References: 9 | - Neural Networks Fail to Learn Periodic Functions and How to Fix It 10 | https://arxiv.org/pdf/2006.08195.pdf 11 | - BigVGAN: A Universal Neural Vocoder with Large-Scale Training 12 | https://arxiv.org/pdf/2206.04658.pdf 13 | 14 | """ 15 | 16 | import torch 17 | import torch.nn as nn 18 | 19 | 20 | class Snake(nn.Module): 21 | """Snake activation function module.""" 22 | 23 | def __init__(self, channels, init=50): 24 | """Initialize Snake module. 25 | 26 | Args: 27 | channels (int): Number of feature channels. 28 | init (float): Initial value of the learnable parameter alpha. 29 | According to the original paper, 5 ~ 50 would be 30 | suitable for periodic data (i.e. voices). 31 | 32 | """ 33 | super(Snake, self).__init__() 34 | alpha = init * torch.ones(1, channels, 1) 35 | self.alpha = nn.Parameter(alpha) 36 | 37 | def forward(self, x): 38 | """Calculate forward propagation. 39 | 40 | Args: 41 | x (Tensor): Input noise signal (B, channels, T). 42 | 43 | Returns: 44 | Tensor: Output tensor (B, channels, T). 
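
        Note:
            This computes snake(x) = x + sin(alpha * x)**2 / alpha element-wise,
            with a per-channel learnable alpha (see the references above).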
45 | 
46 |         """
47 |         return x + torch.sin(self.alpha * x) ** 2 / self.alpha
48 | 
--------------------------------------------------------------------------------
/sifigan/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from sifigan.losses.adv import *  # NOQA
2 | from sifigan.losses.mel import *  # NOQA
3 | from sifigan.losses.reg import *  # NOQA
--------------------------------------------------------------------------------
/sifigan/losses/adv.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2022 Reo Yoneyama (Nagoya University)
4 | # MIT License (https://opensource.org/licenses/MIT)
5 | 
6 | """Adversarial loss modules.
7 | 
8 | References:
9 |     - https://github.com/kan-bayashi/ParallelWaveGAN
10 | 
11 | """
12 | 
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | 
17 | 
18 | class AdversarialLoss(nn.Module):
19 |     """Adversarial loss module."""
20 | 
21 |     def __init__(
22 |         self,
23 |         average_by_discriminators=False,
24 |         loss_type="mse",
25 |     ):
26 |         """Initialize AdversarialLoss module."""
27 |         super(AdversarialLoss, self).__init__()
28 |         self.average_by_discriminators = average_by_discriminators
29 | 
30 |         assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported."
31 |         if loss_type == "mse":
32 |             self.adv_criterion = self._mse_adv_loss
33 |             self.fake_criterion = self._mse_fake_loss
34 |             self.real_criterion = self._mse_real_loss
35 |         else:
36 |             self.adv_criterion = self._hinge_adv_loss
37 |             self.fake_criterion = self._hinge_fake_loss
38 |             self.real_criterion = self._hinge_real_loss
39 | 
40 |     def forward(self, p_fakes, p_reals=None):
41 |         """Calculate generator/discriminator adversarial loss.
42 | 
43 |         Args:
44 |             p_fakes (list): List of
45 |                 discriminator outputs calculated from generator outputs.
46 |             p_reals (list): List of
47 |                 discriminator outputs calculated from groundtruth.
48 | 
49 |         Returns:
50 |             Tensor: Generator adversarial loss value (when p_reals is None).
51 |             Tensor: Discriminator fake loss value (otherwise).
52 |             Tensor: Discriminator real loss value (otherwise).
53 | 
54 |         """
55 |         # generator adversarial loss
56 |         if p_reals is None:
57 |             adv_loss = 0.0
58 |             for p_fake in p_fakes:
59 |                 adv_loss += self.adv_criterion(p_fake)
60 | 
61 |             if self.average_by_discriminators:
62 |                 adv_loss /= len(p_fakes)
63 | 
64 |             return adv_loss
65 | 
66 |         # discriminator adversarial loss
67 |         else:
68 |             fake_loss = 0.0
69 |             real_loss = 0.0
70 |             for p_fake, p_real in zip(p_fakes, p_reals):
71 |                 fake_loss += self.fake_criterion(p_fake)
72 |                 real_loss += self.real_criterion(p_real)
73 | 
74 |             if self.average_by_discriminators:
75 |                 fake_loss /= len(p_fakes)
76 |                 real_loss /= len(p_reals)
77 | 
78 |             return fake_loss, real_loss
79 | 
80 |     def _mse_adv_loss(self, x):
81 |         return F.mse_loss(x, x.new_ones(x.size()))
82 | 
83 |     def _mse_real_loss(self, x):
84 |         return F.mse_loss(x, x.new_ones(x.size()))
85 | 
86 |     def _mse_fake_loss(self, x):
87 |         return F.mse_loss(x, x.new_zeros(x.size()))
88 | 
89 |     def _hinge_adv_loss(self, x):
90 |         return -x.mean()
91 | 
92 |     def _hinge_real_loss(self, x):
93 |         return -torch.mean(torch.min(x - 1, x.new_zeros(x.size())))
94 | 
95 |     def _hinge_fake_loss(self, x):
96 |         return -torch.mean(torch.min(-x - 1, x.new_zeros(x.size())))
97 | 
98 | 
99 | class FeatureMatchLoss(nn.Module):
100 |     """Feature matching loss module."""
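    # Usage sketch (illustrative, not part of this file): the feature matching
    # loss is typically paired with the adversarial loss when updating the
    # generator, e.g.
    #
    #   adv = AdversarialLoss(loss_type="mse")
    #   fm = FeatureMatchLoss(average_by_layers=True)
    #   outs, fmaps_fake = discriminator(y_hat, return_fmaps=True)
    #   _, fmaps_real = discriminator(y, return_fmaps=True)
    #   loss = adv(outs) + fm(fmaps_fake, fmaps_real)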
101 | 
102 |     def __init__(
103 |         self,
104 |         average_by_layers=False,
105 |     ):
106 |         """Initialize FeatureMatchLoss module."""
107 |         super(FeatureMatchLoss, self).__init__()
108 |         self.average_by_layers = average_by_layers
109 | 
110 |     def forward(self, fmaps_fake, fmaps_real):
111 |         """Calculate forward propagation.
112 | 
113 |         Args:
114 |             fmaps_fake (list): List of discriminator feature maps
115 |                 calculated from generator outputs.
116 |             fmaps_real (list): List of discriminator feature maps
117 |                 calculated from groundtruth.
118 | 
119 |         Returns:
120 |             Tensor: Feature matching loss value.
121 | 
122 |         """
123 |         fm_loss = 0.0
124 |         for feat_fake, feat_real in zip(fmaps_fake, fmaps_real):
125 |             fm_loss += F.l1_loss(feat_fake, feat_real.detach())
126 | 
127 |         if self.average_by_layers:
128 |             fm_loss /= len(fmaps_fake)
129 | 
130 |         return fm_loss
--------------------------------------------------------------------------------
/sifigan/losses/mel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2022 Reo Yoneyama (Nagoya University)
4 | # MIT License (https://opensource.org/licenses/MIT)
5 | 
6 | """STFT-based loss modules.
7 | 
8 | References:
9 |     - https://github.com/kan-bayashi/ParallelWaveGAN
10 | 
11 | """
12 | 
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | from librosa.filters import mel as librosa_mel
17 | 
18 | 
19 | def stft(
20 |     x, fft_size, hop_size, win_length, window, center=True, onesided=True, power=False
21 | ):
22 |     """Perform STFT and convert to magnitude spectrogram.
23 | 
24 |     Args:
25 |         x (Tensor): Input signal tensor (B, T).
26 |         fft_size (int): FFT size.
27 |         hop_size (int): Hop size.
28 |         win_length (int): Window length.
29 |         window (Tensor): Window function tensor.
30 | 
31 |     Returns:
32 |         Tensor: Magnitude (or power) spectrogram (B, #frames, fft_size // 2 + 1).
33 | 
34 |     """
35 |     x_stft = torch.stft(
36 |         x,
37 |         fft_size,
38 |         hop_size,
39 |         win_length,
40 |         window,
41 |         center=center,
42 |         onesided=onesided,
43 |         return_complex=False,
44 |     )
45 |     real = x_stft[..., 0]
46 |     imag = x_stft[..., 1]
47 | 
48 |     if power:
49 |         return torch.clamp(real**2 + imag**2, min=1e-7).transpose(2, 1)
50 |     else:
51 |         return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)
52 | 
53 | 
54 | class MelSpectralLoss(nn.Module):
55 |     """Mel-spectral L1 loss module."""
56 | 
57 |     def __init__(
58 |         self,
59 |         fft_size=1024,
60 |         hop_size=120,
61 |         win_length=1024,
62 |         window="hann_window",
63 |         sample_rate=24000,
64 |         n_mels=80,
65 |         fmin=0,
66 |         fmax=None,
67 |     ):
68 |         """Initialize MelSpectralLoss loss.
69 | 
70 |         Args:
71 |             fft_size (int): FFT points.
72 |             hop_size (int): Hop size.
73 |             win_length (Optional[int]): Window length.
74 |             window (str): Window type.
75 |             sample_rate (int): Sampling rate.
76 |             n_mels (int): Number of Mel basis.
77 |             fmin (Optional[int]): Minimum frequency of mel-filter-bank.
78 |             fmax (Optional[int]): Maximum frequency of mel-filter-bank.
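
        Example (an illustrative sketch with the default 24 kHz settings):

            >>> criterion = MelSpectralLoss()
            >>> x = torch.randn(2, 1, 24000)  # generated waveform (B, 1, T)
            >>> y = torch.randn(2, 1, 24000)  # reference waveform (B, 1, T)
            >>> loss = criterion(x, y)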
79 | 
80 |         """
81 |         super().__init__()
82 |         self.fft_size = fft_size
83 |         self.hop_size = hop_size
84 |         self.win_length = win_length if win_length is not None else fft_size
85 |         self.register_buffer("window", getattr(torch, window)(self.win_length))
86 |         self.sample_rate = sample_rate
87 |         self.n_mels = n_mels
88 |         self.fmin = fmin
89 |         self.fmax = fmax if fmax is not None else sample_rate / 2
90 |         melmat = librosa_mel(
91 |             sr=sample_rate, n_fft=fft_size, n_mels=n_mels, fmin=fmin, fmax=self.fmax
92 |         ).T
93 |         self.register_buffer("melmat", torch.from_numpy(melmat).float())
94 | 
95 |     def forward(self, x, y):
96 |         """Calculate Mel-spectral L1 loss.
97 | 
98 |         Args:
99 |             x (Tensor): Generated waveform tensor (B, 1, T).
100 |             y (Tensor): Groundtruth waveform tensor (B, 1, T).
101 | 
102 |         Returns:
103 |             Tensor: Mel-spectral L1 loss value.
104 | 
105 |         """
106 |         x = x.squeeze(1)
107 |         y = y.squeeze(1)
108 |         x_mag = stft(x, self.fft_size, self.hop_size, self.win_length, self.window)
109 |         y_mag = stft(y, self.fft_size, self.hop_size, self.win_length, self.window)
110 |         x_log_mel = torch.log(torch.clamp(torch.matmul(x_mag, self.melmat), min=1e-7))
111 |         y_log_mel = torch.log(torch.clamp(torch.matmul(y_mag, self.melmat), min=1e-7))
112 |         mel_loss = F.l1_loss(x_log_mel, y_log_mel)
113 | 
114 |         return mel_loss
115 | 
--------------------------------------------------------------------------------
/sifigan/losses/reg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2022 Reo Yoneyama (Nagoya University)
4 | # MIT License (https://opensource.org/licenses/MIT)
5 | 
6 | """Source regularization loss modules."""
7 | 
8 | import sifigan.losses
9 | import torch
10 | import torch.nn as nn
11 | import torch.nn.functional as F
12 | from librosa.filters import mel as librosa_mel
13 | from sifigan.layers import CheapTrick
14 | 
15 | 
16 | class FlattenLoss(nn.Module):
17 |     """The regularization loss of uSFGAN."""
18 | 
19 |     def __init__(
20 |         self,
21 |         sample_rate=24000,
22 |         hop_size=120,
23 |         fft_size=2048,
24 |         f0_floor=70,
25 |         f0_ceil=340,
26 |         power=False,
27 |         elim_0th=False,
28 |         l2_norm=False,
29 |     ):
30 |         """Initialize spectral envelope regularization loss module.
31 | 
32 |         Args:
33 |             sample_rate (int): Sampling rate.
34 |             hop_size (int): Hop size.
35 |             fft_size (int): FFT size.
36 |             f0_floor (int): Minimum F0 value.
37 |             f0_ceil (int): Maximum F0 value.
38 |             power (bool): Whether to use power or magnitude spectrogram.
39 |             elim_0th (bool): Whether to exclude 0th cepstrum in CheapTrick.
40 |                 If set to true, power is estimated by source-network.
41 |             l2_norm (bool): Whether to regularize the spectral envelopes with L2 norm.
42 | 
43 |         """
44 |         super(FlattenLoss, self).__init__()
45 |         self.hop_size = hop_size
46 |         self.power = power
47 |         self.elim_0th = elim_0th
48 |         self.cheaptrick = CheapTrick(
49 |             sample_rate=sample_rate,
50 |             hop_size=hop_size,
51 |             fft_size=fft_size,
52 |             f0_floor=f0_floor,
53 |             f0_ceil=f0_ceil,
54 |         )
55 |         if l2_norm:
56 |             self.loss = nn.MSELoss()
57 |         else:
58 |             self.loss = nn.L1Loss()
59 | 
60 |     def forward(self, s, f):
61 |         """Calculate forward propagation.
62 | 
63 |         Args:
64 |             s (Tensor): Predicted source excitation signal (B, 1, T).
65 |             f (Tensor): F0 sequence (B, 1, T // hop_size).
66 | 
67 |         Returns:
68 |             loss (Tensor): Loss value.
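
        Note:
            The loss pushes the CheapTrick log spectral envelope of the
            excitation toward zero, i.e. it regularizes the source signal
            to be spectrally flat.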
69 | 70 | """ 71 | s, f = s.squeeze(1), f.squeeze(1) 72 | e = self.cheaptrick.forward(s, f, self.power, self.elim_0th) 73 | loss = self.loss(e, e.new_zeros(e.size())) 74 | return loss 75 | 76 | 77 | class ResidualLoss(nn.Module): 78 | """The regularization loss of hn-uSFGAN.""" 79 | 80 | def __init__( 81 | self, 82 | sample_rate=24000, 83 | fft_size=2048, 84 | hop_size=120, 85 | f0_floor=100, 86 | f0_ceil=840, 87 | n_mels=80, 88 | fmin=0, 89 | fmax=None, 90 | power=False, 91 | elim_0th=True, 92 | ): 93 | """Initialize ResidualLoss module. 94 | 95 | Args: 96 | sample_rate (int): Sampling rate. 97 | fft_size (int): FFT size. 98 | hop_size (int): Hop size. 99 | f0_floor (int): Minimum F0 value. 100 | f0_ceil (int): Maximum F0 value. 101 | n_mels (int): Number of Mel basis. 102 | fmin (int): Minimum frequency for Mel. 103 | fmax (int): Maximum frequency for Mel. 104 | power (bool): Whether to use power or magnitude spectrogram. 105 | elim_0th (bool): Whether to exclude 0th cepstrum in CheapTrick. 106 | If set to true, power is estimated by source-network. 107 | 108 | """ 109 | super(ResidualLoss, self).__init__() 110 | self.sample_rate = sample_rate 111 | self.fft_size = fft_size 112 | self.hop_size = hop_size 113 | self.cheaptrick = CheapTrick( 114 | sample_rate=sample_rate, 115 | hop_size=hop_size, 116 | fft_size=fft_size, 117 | f0_floor=f0_floor, 118 | f0_ceil=f0_ceil, 119 | ) 120 | self.win_length = fft_size 121 | self.register_buffer("window", torch.hann_window(self.win_length)) 122 | 123 | # define mel-filter-bank 124 | self.n_mels = n_mels 125 | self.fmin = fmin 126 | self.fmax = fmax if fmax is not None else sample_rate / 2 127 | melmat = librosa_mel(sr=sample_rate, n_fft=fft_size, n_mels=n_mels, fmin=fmin, fmax=self.fmax).T 128 | self.register_buffer("melmat", torch.from_numpy(melmat).float()) 129 | 130 | self.power = power 131 | self.elim_0th = elim_0th 132 | 133 | def forward(self, s, y, f): 134 | """Calculate forward propagation. 135 | 136 | Args: 137 | s (Tensor): Predicted source excitation signal (B, 1, T). 138 | y (Tensor): Ground truth signal (B, 1, T). 139 | f (Tensor): F0 sequence (B, 1, T // hop_size). 140 | 141 | Returns: 142 | Tensor: Loss value. 
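
        Note:
            The regression target is built by removing the CheapTrick spectral
            envelope from the ground truth spectrum, t = exp(log|Y| - log E),
            optionally rescaled to preserve the mean power of |Y| when
            elim_0th is enabled; the loss is the L1 distance between the
            log-mel spectra of the predicted source and this target.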
143 | 144 | """ 145 | s, y, f = s.squeeze(1), y.squeeze(1), f.squeeze(1) 146 | 147 | with torch.no_grad(): 148 | # calculate log power (or magnitude) spectrograms 149 | e = self.cheaptrick.forward(y, f, self.power, self.elim_0th) 150 | y = sifigan.losses.stft( 151 | y, 152 | self.fft_size, 153 | self.hop_size, 154 | self.win_length, 155 | self.window, 156 | power=self.power, 157 | ) 158 | # adjust length, (B, T', C) 159 | minlen = min(e.size(1), y.size(1)) 160 | e, y = e[:, :minlen, :], y[:, :minlen, :] 161 | 162 | # calculate mean power (or magnitude) of y 163 | if self.elim_0th: 164 | y_mean = y.mean(dim=-1, keepdim=True) 165 | 166 | # calculate target of output source signal 167 | y = torch.log(torch.clamp(y, min=1e-7)) 168 | t = (y - e).exp() 169 | if self.elim_0th: 170 | t_mean = t.mean(dim=-1, keepdim=True) 171 | t = y_mean / t_mean * t 172 | 173 | # apply mel-filter-bank and log 174 | t = torch.matmul(t, self.melmat) 175 | t = torch.log(torch.clamp(t, min=1e-7)) 176 | 177 | # calculate power (or magnitude) spectrogram 178 | s = sifigan.losses.stft( 179 | s, 180 | self.fft_size, 181 | self.hop_size, 182 | self.win_length, 183 | self.window, 184 | power=self.power, 185 | ) 186 | # adjust length, (B, T', C) 187 | minlen = min(minlen, s.size(1)) 188 | s, t = s[:, :minlen, :], t[:, :minlen, :] 189 | 190 | # apply mel-filter-bank and log 191 | s = torch.matmul(s, self.melmat) 192 | s = torch.log(torch.clamp(s, min=1e-7)) 193 | 194 | loss = F.l1_loss(s, t.detach()) 195 | 196 | return loss 197 | -------------------------------------------------------------------------------- /sifigan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from sifigan.models.discriminator import * # NOQA 2 | from sifigan.models.generator import * # NOQA 3 | -------------------------------------------------------------------------------- /sifigan/models/discriminator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2022 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Discriminator modules. 7 | 8 | References: 9 | - https://github.com/kan-bayashi/ParallelWaveGAN 10 | - https://github.com/jik876/hifi-gan 11 | - UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation 12 | https://www.isca-speech.org/archive/interspeech_2021/jang21_interspeech.html 13 | 14 | """ 15 | 16 | import copy 17 | from logging import getLogger 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | from torchaudio.functional import spectrogram 23 | 24 | # A logger for this file 25 | logger = getLogger(__name__) 26 | 27 | 28 | class HiFiGANPeriodDiscriminator(nn.Module): 29 | """HiFiGAN period discriminator module.""" 30 | 31 | def __init__( 32 | self, 33 | in_channels=1, 34 | out_channels=1, 35 | period=3, 36 | kernel_sizes=[5, 3], 37 | channels=32, 38 | downsample_scales=[3, 3, 3, 3, 1], 39 | max_downsample_channels=1024, 40 | bias=True, 41 | nonlinear_activation="LeakyReLU", 42 | nonlinear_activation_params={"negative_slope": 0.1}, 43 | use_weight_norm=True, 44 | use_spectral_norm=False, 45 | ): 46 | """Initialize HiFiGANPeriodDiscriminator module. 47 | 48 | Args: 49 | in_channels (int): Number of input channels. 50 | out_channels (int): Number of output channels. 51 | period (int): Period. 
52 |             kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
53 |             channels (int): Number of initial channels.
54 |             downsample_scales (list): List of downsampling scales.
55 |             max_downsample_channels (int): Number of maximum downsampling channels.
56 |             bias (bool): Whether to add bias parameter in convolution layers.
57 |             nonlinear_activation (str): Activation function module name.
58 |             nonlinear_activation_params (dict): Hyperparameters for activation function.
59 |             use_weight_norm (bool): Whether to use weight norm.
60 |                 If set to true, it will be applied to all of the conv layers.
61 |             use_spectral_norm (bool): Whether to use spectral norm.
62 |                 If set to true, it will be applied to all of the conv layers.
63 | 
64 |         """
65 |         super().__init__()
66 |         assert len(kernel_sizes) == 2
67 |         assert kernel_sizes[0] % 2 == 1, "Kernel size must be odd number."
68 |         assert kernel_sizes[1] % 2 == 1, "Kernel size must be odd number."
69 | 
70 |         self.period = period
71 |         self.convs = nn.ModuleList()
72 |         in_chs = in_channels
73 |         out_chs = channels
74 |         for downsample_scale in downsample_scales:
75 |             self.convs += [
76 |                 nn.Sequential(
77 |                     nn.Conv2d(
78 |                         in_chs,
79 |                         out_chs,
80 |                         (kernel_sizes[0], 1),
81 |                         (downsample_scale, 1),
82 |                         padding=((kernel_sizes[0] - 1) // 2, 0),
83 |                         bias=bias,
84 |                     ),
85 |                     getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
86 |                 )
87 |             ]
88 |             in_chs = out_chs
89 |             # NOTE(kan-bayashi): Use downsample_scale + 1?
90 |             out_chs = min(out_chs * 4, max_downsample_channels)
91 |         self.output_conv = nn.Conv2d(
92 |             out_chs,
93 |             out_channels,
94 |             (kernel_sizes[1] - 1, 1),
95 |             1,
96 |             padding=((kernel_sizes[1] - 1) // 2, 0),
97 |             bias=bias,
98 |         )
99 | 
100 |         if use_weight_norm and use_spectral_norm:
101 |             raise ValueError("Either use use_weight_norm or use_spectral_norm.")
102 | 
103 |         # apply weight norm
104 |         if use_weight_norm:
105 |             self.apply_weight_norm()
106 | 
107 |         # apply spectral norm
108 |         if use_spectral_norm:
109 |             self.apply_spectral_norm()
110 | 
111 |     def forward(self, x, return_fmaps=False):
112 |         """Calculate forward propagation.
113 | 
114 |         Args:
115 |             x (Tensor): Input tensor (B, in_channels, T).
116 |             return_fmaps (bool): Whether to return feature maps.
117 | 
118 |         Returns:
119 |             Tensor: Output tensor (and a list of feature maps if return_fmaps=True).
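
        Note:
            The input waveform is reflection-padded to a multiple of the
            period and reshaped to (B, C, T / period, period) before the
            2D convolutions.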
120 | 121 | """ 122 | # transform 1d to 2d -> (B, C, T/P, P) 123 | b, c, t = x.shape 124 | if t % self.period != 0: 125 | n_pad = self.period - (t % self.period) 126 | x = F.pad(x, (0, n_pad), "reflect") 127 | t += n_pad 128 | x = x.view(b, c, t // self.period, self.period) 129 | 130 | # forward conv 131 | fmap = [] 132 | for f in self.convs: 133 | x = f(x) 134 | if return_fmaps: 135 | fmap.append(x) 136 | x = self.output_conv(x) 137 | out = torch.flatten(x, 1, -1) 138 | 139 | if return_fmaps: 140 | return out, fmap 141 | else: 142 | return out 143 | 144 | def apply_weight_norm(self): 145 | """Apply weight normalization module from all of the layers.""" 146 | 147 | def _apply_weight_norm(m): 148 | if isinstance(m, nn.Conv2d): 149 | nn.utils.weight_norm(m) 150 | logger.debug(f"Weight norm is applied to {m}.") 151 | 152 | self.apply(_apply_weight_norm) 153 | 154 | def apply_spectral_norm(self): 155 | """Apply spectral normalization module from all of the layers.""" 156 | 157 | def _apply_spectral_norm(m): 158 | if isinstance(m, nn.Conv2d): 159 | nn.utils.spectral_norm(m) 160 | logger.debug(f"Spectral norm is applied to {m}.") 161 | 162 | self.apply(_apply_spectral_norm) 163 | 164 | 165 | class HiFiGANMultiPeriodDiscriminator(nn.Module): 166 | """HiFiGAN multi-period discriminator module.""" 167 | 168 | def __init__( 169 | self, 170 | periods=[2, 3, 5, 7, 11], 171 | discriminator_params={ 172 | "in_channels": 1, 173 | "out_channels": 1, 174 | "kernel_sizes": [5, 3], 175 | "channels": 32, 176 | "downsample_scales": [3, 3, 3, 3, 1], 177 | "max_downsample_channels": 1024, 178 | "bias": True, 179 | "nonlinear_activation": "LeakyReLU", 180 | "nonlinear_activation_params": {"negative_slope": 0.1}, 181 | "use_weight_norm": True, 182 | "use_spectral_norm": False, 183 | }, 184 | ): 185 | """Initialize HiFiGANMultiPeriodDiscriminator module. 186 | 187 | Args: 188 | periods (list): List of periods. 189 | discriminator_params (dict): Parameters for hifi-gan period discriminator module. 190 | The period parameter will be overwritten. 191 | 192 | """ 193 | super().__init__() 194 | self.discriminators = nn.ModuleList() 195 | for period in periods: 196 | params = copy.deepcopy(discriminator_params) 197 | params["period"] = period 198 | self.discriminators += [HiFiGANPeriodDiscriminator(**params)] 199 | 200 | def forward(self, x, return_fmaps=False): 201 | """Calculate forward propagation. 202 | 203 | Args: 204 | x (Tensor): Input noise signal (B, 1, T). 205 | return_fmaps (bool): Whether to return feature maps. 206 | 207 | Returns: 208 | List: List of list of each discriminator outputs, which consists of each layer output tensors. 209 | 210 | """ 211 | outs, fmaps = [], [] 212 | for f in self.discriminators: 213 | if return_fmaps: 214 | out, fmap = f(x, return_fmaps) 215 | fmaps.extend(fmap) 216 | else: 217 | out = f(x) 218 | outs.append(out) 219 | 220 | if return_fmaps: 221 | return outs, fmaps 222 | else: 223 | return outs 224 | 225 | 226 | class HiFiGANScaleDiscriminator(nn.Module): 227 | """HiFi-GAN scale discriminator module.""" 228 | 229 | def __init__( 230 | self, 231 | in_channels=1, 232 | out_channels=1, 233 | kernel_sizes=[15, 41, 5, 3], 234 | channels=128, 235 | max_downsample_channels=1024, 236 | max_groups=16, 237 | bias=True, 238 | downsample_scales=[2, 2, 4, 4, 1], 239 | nonlinear_activation="LeakyReLU", 240 | nonlinear_activation_params={"negative_slope": 0.1}, 241 | use_weight_norm=True, 242 | use_spectral_norm=False, 243 | ): 244 | """Initilize HiFiGAN scale discriminator module. 
245 | 246 | Args: 247 | in_channels (int): Number of input channels. 248 | out_channels (int): Number of output channels. 249 | kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, 250 | and the second is for downsampling part, and the remaining two are for output layers. 251 | channels (int): Initial number of channels for conv layer. 252 | max_downsample_channels (int): Maximum number of channels for downsampling layers. 253 | bias (bool): Whether to add bias parameter in convolution layers. 254 | downsample_scales (list): List of downsampling scales. 255 | nonlinear_activation (str): Activation function module name. 256 | nonlinear_activation_params (dict): Hyperparameters for activation function. 257 | use_weight_norm (bool): Whether to use weight norm. 258 | If set to true, it will be applied to all of the conv layers. 259 | use_spectral_norm (bool): Whether to use spectral norm. 260 | If set to true, it will be applied to all of the conv layers. 261 | 262 | """ 263 | super().__init__() 264 | self.layers = nn.ModuleList() 265 | 266 | # check kernel size is valid 267 | assert len(kernel_sizes) == 4 268 | for ks in kernel_sizes: 269 | assert ks % 2 == 1 270 | 271 | # add first layer 272 | self.layers += [ 273 | nn.Sequential( 274 | nn.Conv1d( 275 | in_channels, 276 | channels, 277 | # NOTE(kan-bayashi): Use always the same kernel size 278 | kernel_sizes[0], 279 | bias=bias, 280 | padding=(kernel_sizes[0] - 1) // 2, 281 | ), 282 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 283 | ) 284 | ] 285 | 286 | # add downsample layers 287 | in_chs = channels 288 | out_chs = channels 289 | # NOTE(kan-bayashi): Remove hard coding? 290 | groups = 4 291 | for downsample_scale in downsample_scales: 292 | self.layers += [ 293 | nn.Sequential( 294 | nn.Conv1d( 295 | in_chs, 296 | out_chs, 297 | kernel_size=kernel_sizes[1], 298 | stride=downsample_scale, 299 | padding=(kernel_sizes[1] - 1) // 2, 300 | groups=groups, 301 | bias=bias, 302 | ), 303 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 304 | ) 305 | ] 306 | in_chs = out_chs 307 | # NOTE(kan-bayashi): Remove hard coding? 308 | out_chs = min(in_chs * 2, max_downsample_channels) 309 | # NOTE(kan-bayashi): Remove hard coding? 310 | groups = min(groups * 4, max_groups) 311 | 312 | # add final layers 313 | out_chs = min(in_chs * 2, max_downsample_channels) 314 | self.layers += [ 315 | nn.Sequential( 316 | nn.Conv1d( 317 | in_chs, 318 | out_chs, 319 | kernel_size=kernel_sizes[2], 320 | stride=1, 321 | padding=(kernel_sizes[2] - 1) // 2, 322 | bias=bias, 323 | ), 324 | getattr(nn, nonlinear_activation)(**nonlinear_activation_params), 325 | ) 326 | ] 327 | self.last_layer = nn.Conv1d( 328 | out_chs, 329 | out_channels, 330 | kernel_size=kernel_sizes[3], 331 | stride=1, 332 | padding=(kernel_sizes[3] - 1) // 2, 333 | bias=bias, 334 | ) 335 | 336 | if use_weight_norm and use_spectral_norm: 337 | raise ValueError("Either use use_weight_norm or use_spectral_norm.") 338 | 339 | # apply weight norm 340 | if use_weight_norm: 341 | self.apply_weight_norm() 342 | 343 | # apply spectral norm 344 | if use_spectral_norm: 345 | self.apply_spectral_norm() 346 | 347 | def forward(self, x, return_fmaps=False): 348 | """Calculate forward propagation. 349 | 350 | Args: 351 | x (Tensor): Input noise signal (B, 1, T). 352 | return_fmaps (bool): Whether to return feature maps. 353 | 354 | Returns: 355 | List: List of output tensors of each layer. 
356 | 357 | """ 358 | fmap = [] 359 | for f in self.layers: 360 | x = f(x) 361 | if return_fmaps: 362 | fmap.append(x) 363 | out = self.last_layer(x) 364 | 365 | if return_fmaps: 366 | return out, fmap 367 | else: 368 | return out 369 | 370 | def apply_weight_norm(self): 371 | """Apply weight normalization module from all of the layers.""" 372 | 373 | def _apply_weight_norm(m): 374 | if isinstance(m, nn.Conv2d): 375 | nn.utils.weight_norm(m) 376 | logger.debug(f"Weight norm is applied to {m}.") 377 | 378 | self.apply(_apply_weight_norm) 379 | 380 | def apply_spectral_norm(self): 381 | """Apply spectral normalization module from all of the layers.""" 382 | 383 | def _apply_spectral_norm(m): 384 | if isinstance(m, nn.Conv2d): 385 | nn.utils.spectral_norm(m) 386 | logger.debug(f"Spectral norm is applied to {m}.") 387 | 388 | self.apply(_apply_spectral_norm) 389 | 390 | 391 | class HiFiGANMultiScaleDiscriminator(nn.Module): 392 | """HiFi-GAN multi-scale discriminator module.""" 393 | 394 | def __init__( 395 | self, 396 | scales=3, 397 | downsample_pooling="AvgPool1d", 398 | # follow the official implementation setting 399 | downsample_pooling_params={ 400 | "kernel_size": 4, 401 | "stride": 2, 402 | "padding": 2, 403 | }, 404 | discriminator_params={ 405 | "in_channels": 1, 406 | "out_channels": 1, 407 | "kernel_sizes": [15, 41, 5, 3], 408 | "channels": 128, 409 | "max_downsample_channels": 1024, 410 | "max_groups": 16, 411 | "bias": True, 412 | "downsample_scales": [2, 2, 4, 4, 1], 413 | "nonlinear_activation": "LeakyReLU", 414 | "nonlinear_activation_params": {"negative_slope": 0.1}, 415 | }, 416 | follow_official_norm=False, 417 | ): 418 | """Initilize HiFiGAN multi-scale discriminator module. 419 | 420 | Args: 421 | scales (int): Number of multi-scales. 422 | downsample_pooling (str): Pooling module name for downsampling of the inputs. 423 | downsample_pooling_params (dict): Parameters for the above pooling module. 424 | discriminator_params (dict): Parameters for hifi-gan scale discriminator module. 425 | follow_official_norm (bool): Whether to follow the norm setting of the official 426 | implementaion. The first discriminator uses spectral norm and the other 427 | discriminators use weight norm. 428 | 429 | """ 430 | super().__init__() 431 | self.discriminators = nn.ModuleList() 432 | 433 | # add discriminators 434 | for i in range(scales): 435 | params = copy.deepcopy(discriminator_params) 436 | if follow_official_norm: 437 | if i == 0: 438 | params["use_weight_norm"] = False 439 | params["use_spectral_norm"] = True 440 | else: 441 | params["use_weight_norm"] = True 442 | params["use_spectral_norm"] = False 443 | self.discriminators += [HiFiGANScaleDiscriminator(**params)] 444 | self.pooling = getattr(nn, downsample_pooling)(**downsample_pooling_params) 445 | 446 | def forward(self, x, return_fmaps=False): 447 | """Calculate forward propagation. 448 | 449 | Args: 450 | x (Tensor): Input noise signal (B, 1, T). 451 | return_fmaps (bool): Whether to return feature maps. 452 | 453 | Returns: 454 | List: List of list of each discriminator outputs, which consists of each layer output tensors. 
455 | 456 | """ 457 | outs, fmaps = [], [] 458 | for f in self.discriminators: 459 | if return_fmaps: 460 | out, fmap = f(x, return_fmaps) 461 | fmaps.extend(fmap) 462 | else: 463 | out = f(x) 464 | outs.append(out) 465 | x = self.pooling(x) 466 | 467 | if return_fmaps: 468 | return outs, fmaps 469 | else: 470 | return outs 471 | 472 | 473 | class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Module): 474 | """HiFi-GAN multi-scale + multi-period discriminator module.""" 475 | 476 | def __init__( 477 | self, 478 | # Multi-scale discriminator related 479 | scales=3, 480 | scale_downsample_pooling="AvgPool1d", 481 | scale_downsample_pooling_params={ 482 | "kernel_size": 4, 483 | "stride": 2, 484 | "padding": 2, 485 | }, 486 | scale_discriminator_params={ 487 | "in_channels": 1, 488 | "out_channels": 1, 489 | "kernel_sizes": [15, 41, 5, 3], 490 | "channels": 128, 491 | "max_downsample_channels": 1024, 492 | "max_groups": 16, 493 | "bias": True, 494 | "downsample_scales": [2, 2, 4, 4, 1], 495 | "nonlinear_activation": "LeakyReLU", 496 | "nonlinear_activation_params": {"negative_slope": 0.1}, 497 | }, 498 | follow_official_norm=True, 499 | # Multi-period discriminator related 500 | periods=[2, 3, 5, 7, 11], 501 | period_discriminator_params={ 502 | "in_channels": 1, 503 | "out_channels": 1, 504 | "kernel_sizes": [5, 3], 505 | "channels": 32, 506 | "downsample_scales": [3, 3, 3, 3, 1], 507 | "max_downsample_channels": 1024, 508 | "bias": True, 509 | "nonlinear_activation": "LeakyReLU", 510 | "nonlinear_activation_params": {"negative_slope": 0.1}, 511 | "use_weight_norm": True, 512 | "use_spectral_norm": False, 513 | }, 514 | ): 515 | """Initilize HiFiGAN multi-scale + multi-period discriminator module. 516 | 517 | Args: 518 | scales (int): Number of multi-scales. 519 | scale_downsample_pooling (str): Pooling module name for downsampling of the inputs. 520 | scale_downsample_pooling_params (dict): Parameters for the above pooling module. 521 | scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module. 522 | follow_official_norm (bool): Whether to follow the norm setting of the official 523 | implementaion. The first discriminator uses spectral norm and the other 524 | discriminators use weight norm. 525 | periods (list): List of periods. 526 | period_discriminator_params (dict): Parameters for hifi-gan period discriminator module. 527 | The period parameter will be overwritten. 528 | 529 | """ 530 | super().__init__() 531 | self.msd = HiFiGANMultiScaleDiscriminator( 532 | scales=scales, 533 | downsample_pooling=scale_downsample_pooling, 534 | downsample_pooling_params=scale_downsample_pooling_params, 535 | discriminator_params=scale_discriminator_params, 536 | follow_official_norm=follow_official_norm, 537 | ) 538 | self.mpd = HiFiGANMultiPeriodDiscriminator( 539 | periods=periods, 540 | discriminator_params=period_discriminator_params, 541 | ) 542 | 543 | def forward(self, x, return_fmaps=False): 544 | """Calculate forward propagation. 545 | 546 | Args: 547 | x (Tensor): Input noise signal (B, 1, T). 548 | return_fmaps (bool): Whether to return feature maps. 549 | 550 | Returns: 551 | List: List of list of each discriminator outputs, 552 | which consists of each layer output tensors. 553 | Multi scale and multi period ones are concatenated. 
554 | 
555 |         """
556 |         if return_fmaps:
557 |             msd_outs, msd_fmaps = self.msd(x, return_fmaps)
558 |             mpd_outs, mpd_fmaps = self.mpd(x, return_fmaps)
559 |             outs = msd_outs + mpd_outs
560 |             fmaps = msd_fmaps + mpd_fmaps
561 |             return outs, fmaps
562 |         else:
563 |             msd_outs = self.msd(x)
564 |             mpd_outs = self.mpd(x)
565 |             outs = msd_outs + mpd_outs
566 |             return outs
567 | 
568 | 
569 | class UnivNetSpectralDiscriminator(nn.Module):
570 |     """UnivNet spectral discriminator module."""
571 | 
572 |     def __init__(
573 |         self,
574 |         fft_size,
575 |         hop_size,
576 |         win_length,
577 |         window="hann_window",
578 |         kernel_sizes=[(3, 9), (3, 9), (3, 9), (3, 9), (3, 3), (3, 3)],
579 |         strides=[(1, 1), (1, 2), (1, 2), (1, 2), (1, 1), (1, 1)],
580 |         channels=32,
581 |         bias=True,
582 |         nonlinear_activation="LeakyReLU",
583 |         nonlinear_activation_params={"negative_slope": 0.2},
584 |         use_weight_norm=True,
585 |     ):
586 |         """Initialize UnivNetSpectralDiscriminator module.
587 | 
588 |         Args:
589 |             fft_size (int): FFT size.
590 |             hop_size (int): Hop size.
591 |             win_length (int): Window length.
592 |             window (str): Name of window function.
593 |             kernel_sizes (list): List of kernel sizes in down-sampling CNNs.
594 |             strides (list): List of stride sizes in down-sampling CNNs.
595 |             channels (int): Number of channels for conv layer.
596 |             bias (bool): Whether to add bias parameter in convolution layers.
597 |             nonlinear_activation (str): Activation function module name.
598 |             nonlinear_activation_params (dict): Hyperparameters for activation function.
599 |             use_weight_norm (bool): Whether to use weight norm.
600 |                 If set to true, it will be applied to all of the conv layers.
601 | 
602 |         """
603 |         super().__init__()
604 | 
605 |         self.fft_size = fft_size
606 |         self.hop_size = hop_size
607 |         self.win_length = win_length
608 |         self.register_buffer("window", getattr(torch, window)(win_length))
609 | 
610 |         self.layers = nn.ModuleList()
611 | 
612 |         # check kernel size is valid
613 |         assert len(kernel_sizes) == len(strides)
614 | 
615 |         # add first layer
616 |         self.layers += [
617 |             nn.Sequential(
618 |                 nn.Conv2d(
619 |                     1,
620 |                     channels,
621 |                     kernel_sizes[0],
622 |                     stride=strides[0],
623 |                     bias=bias,
624 |                 ),
625 |                 getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
626 |             )
627 |         ]
628 | 
629 |         for i in range(1, len(kernel_sizes) - 2):
630 |             self.layers += [
631 |                 nn.Sequential(
632 |                     nn.Conv2d(
633 |                         channels,
634 |                         channels,
635 |                         kernel_size=kernel_sizes[i],
636 |                         stride=strides[i],
637 |                         bias=bias,
638 |                     ),
639 |                     getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
640 |                 )
641 |             ]
642 | 
643 |         # add final layers
644 |         self.layers += [
645 |             nn.Sequential(
646 |                 nn.Conv2d(
647 |                     channels,
648 |                     channels,
649 |                     kernel_size=kernel_sizes[-2],
650 |                     stride=strides[-2],
651 |                     bias=bias,
652 |                 ),
653 |                 getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
654 |             )
655 |         ]
656 |         self.layers += [
657 |             nn.Conv2d(
658 |                 channels,
659 |                 1,
660 |                 kernel_size=kernel_sizes[-1],
661 |                 stride=strides[-1],
662 |                 bias=bias,
663 |             )
664 |         ]
665 | 
666 |         # apply weight norm
667 |         if use_weight_norm:
668 |             self.apply_weight_norm()
669 | 
670 |     def forward(self, x, return_fmaps=False):
671 |         """Calculate forward propagation.
672 | 
673 |         Args:
674 |             x (Tensor): Input signal (B, 1, T).
675 |             return_fmaps (bool): Whether to return feature maps.
676 | 
677 |         Returns:
678 |             Tensor: Output tensor (and a list of feature maps if return_fmaps=True).
679 | 680 | """ 681 | x = spectrogram( 682 | x, 683 | pad=self.win_length // 2, 684 | window=self.window, 685 | n_fft=self.fft_size, 686 | hop_length=self.hop_size, 687 | win_length=self.win_length, 688 | power=1.0, 689 | normalized=False, 690 | ).transpose(-1, -2) 691 | 692 | fmap = [] 693 | for f in self.layers: 694 | x = f(x) 695 | if return_fmaps: 696 | fmap.append(x) 697 | 698 | if return_fmaps: 699 | return x, fmap 700 | else: 701 | return x 702 | 703 | def apply_weight_norm(self): 704 | """Apply weight normalization module from all of the layers.""" 705 | 706 | def _apply_weight_norm(m): 707 | if isinstance(m, nn.Conv2d): 708 | nn.utils.weight_norm(m) 709 | logger.debug(f"Weight norm is applied to {m}.") 710 | 711 | self.apply(_apply_weight_norm) 712 | 713 | 714 | class UnivNetMultiResolutionSpectralDiscriminator(nn.Module): 715 | """UnivNet multi-resolution spectral discriminator module.""" 716 | 717 | def __init__( 718 | self, 719 | fft_sizes=[1024, 2048, 512], 720 | hop_sizes=[120, 240, 50], 721 | win_lengths=[600, 1200, 240], 722 | window="hann_window", 723 | discriminator_params={ 724 | "channels": 32, 725 | "kernel_sizes": [(3, 9), (3, 9), (3, 9), (3, 9), (3, 3), (3, 3)], 726 | "strides": [(1, 1), (1, 2), (1, 2), (1, 2), (1, 1), (1, 1)], 727 | "bias": True, 728 | "nonlinear_activation": "LeakyReLU", 729 | "nonlinear_activation_params": {"negative_slope": 0.2}, 730 | }, 731 | ): 732 | """Initilize UnivNetMultiResolutionSpectralDiscriminator module. 733 | 734 | Args: 735 | fft_sizes (list): FFT sizes for each spectral discriminator. 736 | hop_sizes (list): Hop sizes for each spectral discriminator. 737 | win_lengths (list): Window lengths for each spectral discriminator. 738 | window (stt): Name of window function. 739 | discriminator_params (dict): Parameters for univ-net spectral discriminator module. 740 | 741 | """ 742 | super().__init__() 743 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 744 | self.discriminators = nn.ModuleList() 745 | 746 | # add discriminators 747 | for i in range(len(fft_sizes)): 748 | params = copy.deepcopy(discriminator_params) 749 | self.discriminators += [ 750 | UnivNetSpectralDiscriminator( 751 | fft_size=fft_sizes[i], 752 | hop_size=hop_sizes[i], 753 | win_length=win_lengths[i], 754 | window=window, 755 | **params, 756 | ) 757 | ] 758 | 759 | def forward(self, x, return_fmaps=False): 760 | """Calculate forward propagation. 761 | 762 | Args: 763 | x (Tensor): Input noise signal (B, 1, T). 764 | return_fmaps (bool): Whether to return feature maps. 765 | 766 | Returns: 767 | List: List of list of each discriminator outputs, which consists of each layer output tensors. 
768 | 
769 |         """
770 |         outs, fmaps = [], []
771 |         for f in self.discriminators:
772 |             if return_fmaps:
773 |                 out, fmap = f(x, return_fmaps)
774 |                 fmaps.extend(fmap)
775 |             else:
776 |                 out = f(x)
777 |             outs.append(out)
778 | 
779 |         if return_fmaps:
780 |             return outs, fmaps
781 |         else:
782 |             return outs
783 | 
784 | 
785 | class UnivNetMultiResolutionMultiPeriodDiscriminator(nn.Module):
786 |     """UnivNet multi-resolution + multi-period discriminator module."""
787 | 
788 |     def __init__(
789 |         self,
790 |         # Multi-resolution discriminator related
791 |         fft_sizes=[1024, 2048, 512],
792 |         hop_sizes=[120, 240, 50],
793 |         win_lengths=[600, 1200, 240],
794 |         window="hann_window",
795 |         spectral_discriminator_params={
796 |             "channels": 32,
797 |             "kernel_sizes": [(3, 9), (3, 9), (3, 9), (3, 9), (3, 3), (3, 3)],
798 |             "strides": [(1, 1), (1, 2), (1, 2), (1, 2), (1, 1), (1, 1)],
799 |             "bias": True,
800 |             "nonlinear_activation": "LeakyReLU",
801 |             "nonlinear_activation_params": {"negative_slope": 0.2},
802 |         },
803 |         # Multi-period discriminator related
804 |         periods=[2, 3, 5, 7, 11],
805 |         period_discriminator_params={
806 |             "in_channels": 1,
807 |             "out_channels": 1,
808 |             "kernel_sizes": [5, 3],
809 |             "channels": 32,
810 |             "downsample_scales": [3, 3, 3, 3, 1],
811 |             "max_downsample_channels": 1024,
812 |             "bias": True,
813 |             "nonlinear_activation": "LeakyReLU",
814 |             "nonlinear_activation_params": {"negative_slope": 0.1},
815 |             "use_weight_norm": True,
816 |             "use_spectral_norm": False,
817 |         },
818 |     ):
819 |         """Initialize UnivNetMultiResolutionMultiPeriodDiscriminator module.
820 | 
821 |         Args:
822 |             fft_sizes (list): FFT sizes for each spectral discriminator.
823 |             hop_sizes (list): Hop sizes for each spectral discriminator.
824 |             win_lengths (list): Window lengths for each spectral discriminator.
825 |             window (str): Name of window function.
826 |             spectral_discriminator_params (dict): Parameters for UnivNet spectral discriminator module.
827 |             periods (list): List of periods.
828 |             period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
829 |                 The period parameter will be overwritten.
830 | 
831 |         """
832 |         super().__init__()
833 |         self.mrd = UnivNetMultiResolutionSpectralDiscriminator(
834 |             fft_sizes=fft_sizes,
835 |             hop_sizes=hop_sizes,
836 |             win_lengths=win_lengths,
837 |             window=window,
838 |             discriminator_params=spectral_discriminator_params,
839 |         )
840 |         self.mpd = HiFiGANMultiPeriodDiscriminator(
841 |             periods=periods,
842 |             discriminator_params=period_discriminator_params,
843 |         )
844 | 
845 |     def forward(self, x, return_fmaps=False):
846 |         """Calculate forward propagation.
847 | 
848 |         Args:
849 |             x (Tensor): Input signal (B, 1, T).
850 |             return_fmaps (bool): Whether to return feature maps.
851 | 
852 |         Returns:
853 |             List: List of each discriminator's outputs,
854 |                 which consist of each layer's output tensors.
855 |                 Multi-resolution and multi-period ones are concatenated.
856 | 
857 |         """
858 |         if return_fmaps:
859 |             mrd_outs, mrd_fmaps = self.mrd(x, return_fmaps)
860 |             mpd_outs, mpd_fmaps = self.mpd(x, return_fmaps)
861 |             outs = mrd_outs + mpd_outs
862 |             fmaps = mrd_fmaps + mpd_fmaps
863 | 
864 |             return outs, fmaps
865 |         else:
866 |             mrd_outs = self.mrd(x)
867 |             mpd_outs = self.mpd(x)
868 |             outs = mrd_outs + mpd_outs
869 | 
870 |             return outs
871 | 
--------------------------------------------------------------------------------
/sifigan/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from sifigan.utils.features import *  # NOQA
2 | from sifigan.utils.index import *  # NOQA
3 | from sifigan.utils.utils import *  # NOQA
--------------------------------------------------------------------------------
/sifigan/utils/features.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2022 Reo Yoneyama (Nagoya University)
4 | # MIT License (https://opensource.org/licenses/MIT)
5 | 
6 | """Feature-related functions.
7 | 
8 | References:
9 |     - https://github.com/bigpon/QPPWG
10 |     - https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts
11 | 
12 | """
13 | 
14 | import sys
15 | from logging import getLogger
16 | 
17 | import numpy as np
18 | import torch
19 | from torch.nn.functional import interpolate
20 | 
21 | # A logger for this file
22 | logger = getLogger(__name__)
23 | 
24 | 
25 | def validate_length(xs, ys=None, hop_size=None):
26 |     """Validate and adjust lengths.
27 | 
28 |     Args:
29 |         xs (tuple): Tuple of feature arrays.
30 |         ys (tuple): Tuple of audio arrays.
31 |         hop_size (int): Upsampling factor.
32 | 
33 |     Returns:
34 |         list: Length-adjusted features (followed by audios if ys is given).
35 | 
36 |     """
37 |     min_len_x = min([x.shape[0] for x in xs])
38 |     if ys is not None:
39 |         min_len_y = min([y.shape[0] for y in ys])
40 |         if min_len_y < min_len_x * hop_size:
41 |             min_len_x = min_len_y // hop_size
42 |         if min_len_y > min_len_x * hop_size:
43 |             min_len_y = min_len_x * hop_size
44 |         ys = [y[:min_len_y] for y in ys]
45 |     xs = [x[:min_len_x] for x in xs]
46 | 
47 |     return xs + ys if ys is not None else xs
48 | 
49 | 
50 | def dilated_factor(batch_f0, fs, dense_factor):
51 |     """Pitch-dependent dilated factor.
52 | 
53 |     Args:
54 |         batch_f0 (ndarray): The F0 sequence (T).
55 |         fs (int): Sampling rate.
56 |         dense_factor (int): The number of taps in one cycle.
57 | 
58 |     Returns:
59 |         dilated_factors (ndarray):
60 |             Float array of the pitch-dependent dilated factors (T).
61 | 
62 |     """
63 |     batch_f0[batch_f0 == 0] = fs / dense_factor
64 |     dilated_factors = np.ones(batch_f0.shape) * fs / dense_factor / batch_f0
65 |     assert np.all(dilated_factors > 0)
66 | 
67 |     return dilated_factors
68 | 
69 | 
70 | class SignalGenerator:
71 |     """Input signal generator module."""
72 | 
73 |     def __init__(
74 |         self,
75 |         sample_rate=24000,
76 |         hop_size=120,
77 |         sine_amp=0.1,
78 |         noise_amp=0.003,
79 |         signal_types=["sine", "noise"],
80 |     ):
81 |         """Initialize SignalGenerator module.
82 | 
83 |         Args:
84 |             sample_rate (int): Sampling rate.
85 |             hop_size (int): Hop size of input F0.
86 |             sine_amp (float): Sine amplitude for NSF-based sine generation.
87 |             noise_amp (float): Noise amplitude for NSF-based sine generation.
88 |             signal_types (list): List of input signal types for generator.
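
        Example (an illustrative sketch with the default settings):

            >>> generator = SignalGenerator(sample_rate=24000, hop_size=120)
            >>> f0 = 220.0 * torch.ones(1, 1, 100)  # frame-level F0 (B, 1, T')
            >>> signals = generator(f0)  # sine + noise channels: (1, 2, 12000)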
89 | 
90 |         """
91 |         self.sample_rate = sample_rate
92 |         self.hop_size = hop_size
93 |         self.signal_types = signal_types
94 |         self.sine_amp = sine_amp
95 |         self.noise_amp = noise_amp
96 | 
97 |         for signal_type in signal_types:
98 |             if signal_type not in ["noise", "sine", "sines", "uv"]:
99 |                 logger.error(f"{signal_type} is not a supported type for generator input.")
100 |                 sys.exit(1)
101 |         logger.info(f"Use {signal_types} for generator input signals.")
102 | 
103 |     @torch.no_grad()
104 |     def __call__(self, f0):
105 |         signals = []
106 |         for typ in self.signal_types:
107 |             if "noise" == typ:
108 |                 signals.append(self.random_noise(f0))
109 |             if "sine" == typ:
110 |                 signals.append(self.sinusoid(f0))
111 |             if "sines" == typ:
112 |                 signals.append(self.sinusoids(f0))
113 |             if "uv" == typ:
114 |                 signals.append(self.vuv_binary(f0))
115 | 
116 |         input_batch = signals[0]
117 |         for signal in signals[1:]:
118 |             input_batch = torch.cat([input_batch, signal], dim=1)
119 | 
120 |         return input_batch
121 | 
122 |     @torch.no_grad()
123 |     def random_noise(self, f0):
124 |         """Calculate noise signals.
125 | 
126 |         Args:
127 |             f0 (Tensor): F0 tensor (B, 1, T // hop_size).
128 | 
129 |         Returns:
130 |             Tensor: Gaussian noise signals (B, 1, T).
131 | 
132 |         """
133 |         B, _, T = f0.size()
134 |         noise = torch.randn((B, 1, T * self.hop_size), device=f0.device)
135 | 
136 |         return noise
137 | 
138 |     @torch.no_grad()
139 |     def sinusoid(self, f0):
140 |         """Calculate sine signals.
141 | 
142 |         Args:
143 |             f0 (Tensor): F0 tensor (B, 1, T // hop_size).
144 | 
145 |         Returns:
146 |             Tensor: Sines generated following NSF (B, 1, T).
147 | 
148 |         """
149 |         B, _, T = f0.size()
150 |         vuv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size)
151 |         radius = (interpolate(f0.to(torch.float64), T * self.hop_size) / self.sample_rate) % 1
152 |         sine = vuv * torch.sin(torch.cumsum(radius, dim=2) * 2 * np.pi) * self.sine_amp
153 |         if self.noise_amp > 0:
154 |             noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0
155 |             noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) * noise_amp
156 |             sine = sine + noise
157 | 
158 |         return sine
159 | 
160 |     @torch.no_grad()
161 |     def sinusoids(self, f0):
162 |         """Calculate harmonic sine signals.
163 | 
164 |         Args:
165 |             f0 (Tensor): F0 tensor (B, 1, T // hop_size).
166 | 
167 |         Returns:
168 |             Tensor: Sines generated following NSF (B, 1, T).
169 | 
170 |         """
171 |         B, _, T = f0.size()
172 |         vuv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size)
173 |         f0 = interpolate(f0.to(torch.float64), T * self.hop_size)
174 |         sines = torch.zeros_like(f0, device=f0.device)
175 |         harmonics = 5  # currently only a fixed number of harmonics is supported
176 |         for i in range(harmonics):
177 |             radius = (f0 * (i + 1) / self.sample_rate) % 1
178 |             sines += torch.sin(torch.cumsum(radius, dim=2) * 2 * np.pi)
179 |         sines = self.sine_amp * sines * vuv / harmonics
180 |         if self.noise_amp > 0:
181 |             noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0
182 |             noise = torch.randn((B, 1, T * self.hop_size), device=f0.device) * noise_amp
183 |             sines = sines + noise
184 | 
185 |         return sines
186 | 
187 |     @torch.no_grad()
188 |     def vuv_binary(self, f0):
189 |         """Calculate V/UV binary sequences.
190 | 
191 |         Args:
192 |             f0 (Tensor): F0 tensor (B, 1, T // hop_size).
193 | 
194 |         Returns:
195 |             Tensor: V/UV binary sequences (B, 1, T).
196 | 
197 |         """
198 |         _, _, T = f0.size()
199 |         vuv = interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size)
200 | 
201 |         return vuv
202 | 
--------------------------------------------------------------------------------
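A minimal sketch of how these feature utilities fit together (hypothetical values, not part of the repository; note that `dilated_factor` modifies its F0 argument in place, hence the copy):

```python
import numpy as np
import torch

from sifigan.utils.features import SignalGenerator, dilated_factor, validate_length

fs, hop_size = 24000, 120
f0 = np.full(100, 220.0)                     # 100 voiced frames at 220 Hz
audio = np.random.randn(100 * hop_size + 7)  # slightly over-long dummy waveform

# Trim frame-level features and the waveform to consistent lengths.
f0, audio = validate_length([f0], [audio], hop_size)

# Per-frame pitch-dependent dilation factors (pass a copy; modified in place).
d = dilated_factor(f0.copy(), fs, dense_factor=4)

# Sample-level sine + noise input signals for the generator, shape (B, 2, T * hop_size).
signal_generator = SignalGenerator(sample_rate=fs, hop_size=hop_size, signal_types=["sine", "noise"])
in_signal = signal_generator(torch.from_numpy(f0).view(1, 1, -1).float())
```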
/sifigan/utils/index.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University)
4 | # MIT License (https://opensource.org/licenses/MIT)
5 | 
6 | """Indexing-related functions."""
7 | 
8 | import torch
9 | 
10 | 
11 | def pd_indexing(x, d, dilation, batch_index, ch_index):
12 |     """Pitch-dependent indexing of past and future samples.
13 | 
14 |     Args:
15 |         x (Tensor): Input feature map (B, C, T).
16 |         d (Tensor): Input pitch-dependent dilated factors (B, 1, T).
17 |         dilation (Int): Dilation size.
18 |         batch_index (Tensor): Batch index (unused; recomputed below on x's device).
19 |         ch_index (Tensor): Channel index (unused; recomputed below on x's device).
20 | 
21 |     Returns:
22 |         Tensor: Past output tensor (B, out_channels, T).
23 |         Tensor: Future output tensor (B, out_channels, T).
24 | 
25 |     """
26 |     B, C, T = x.size()
27 |     batch_index = torch.arange(0, B, dtype=torch.long, device=x.device).reshape(B, 1, 1)
28 |     ch_index = torch.arange(0, C, dtype=torch.long, device=x.device).reshape(1, C, 1)
29 |     dilations = torch.clamp((d * dilation).long(), min=1)
30 | 
31 |     # get past index (assume reflect padding)
32 |     idx_base = torch.arange(0, T, dtype=torch.long, device=x.device).reshape(1, 1, T)
33 |     idxP = (idx_base - dilations).abs() % T
34 |     idxP = (batch_index, ch_index, idxP)
35 | 
36 |     # get future index (assume reflect padding)
37 |     idxF = idx_base + dilations
38 |     overflowed = idxF >= T
39 |     idxF[overflowed] = -(idxF[overflowed] % T)
40 |     idxF = (batch_index, ch_index, idxF)
41 | 
42 |     return x[idxP], x[idxF]
43 | 
44 | 
45 | def index_initial(n_batch, n_ch, tensor=True):
46 |     """Tensor batch and channel index initialization.
47 | 
48 |     Args:
49 |         n_batch (Int): Number of batches.
50 |         n_ch (Int): Number of channels.
51 |         tensor (bool): Return tensors if True, otherwise nested lists.
52 | 
53 |     Returns:
54 |         Tensor: Batch index.
55 |         Tensor: Channel index.
56 | 
57 |     """
58 |     batch_index = []
59 |     for i in range(n_batch):
60 |         batch_index.append([[i]] * n_ch)
61 |     ch_index = []
62 |     for i in range(n_ch):
63 |         ch_index += [[i]]
64 |     ch_index = [ch_index] * n_batch
65 | 
66 |     if tensor:
67 |         batch_index = torch.tensor(batch_index)
68 |         ch_index = torch.tensor(ch_index)
69 |         if torch.cuda.is_available():
70 |             batch_index = batch_index.cuda()
71 |             ch_index = ch_index.cuda()
72 |     return batch_index, ch_index
73 | 
--------------------------------------------------------------------------------
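A hypothetical sketch of `pd_indexing` (not part of the repository; shapes are illustrative, and since the function recomputes the index tensors internally, the `index_initial` outputs mainly preserve the original API):

```python
import torch

from sifigan.utils.index import index_initial, pd_indexing

B, C, T = 2, 16, 100
x = torch.randn(B, C, T)           # feature map
d = torch.rand(B, 1, T) * 2 + 0.5  # pitch-dependent dilated factors

batch_index, ch_index = index_initial(B, C)
past, future = pd_indexing(x, d, dilation=2, batch_index=batch_index, ch_index=ch_index)
assert past.shape == future.shape == (B, C, T)
```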
/sifigan/utils/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University)
4 | # based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University)
5 | # (https://github.com/kan-bayashi/ParallelWaveGAN)
6 | # MIT License (https://opensource.org/licenses/MIT)
7 | 
8 | """Utility functions."""
9 | 
10 | import os
11 | import sys
12 | from logging import getLogger
13 | 
14 | import h5py
15 | import numpy as np
16 | 
17 | # A logger for this file
18 | logger = getLogger(__name__)
19 | 
20 | 
21 | def read_hdf5(hdf5_name, hdf5_path):
22 |     """Read hdf5 dataset.
23 | 
24 |     Args:
25 |         hdf5_name (str): Filename of hdf5 file.
26 |         hdf5_path (str): Dataset name in hdf5 file.
27 | 
28 |     Returns:
29 |         any: Dataset values.
30 | 
31 |     """
32 |     if not os.path.exists(hdf5_name):
33 |         logger.error(f"There is no such hdf5 file ({hdf5_name}).")
34 |         sys.exit(1)
35 | 
36 |     hdf5_file = h5py.File(hdf5_name, "r")
37 | 
38 |     if hdf5_path not in hdf5_file:
39 |         logger.error(f"There is no such dataset in the hdf5 file ({hdf5_path}).")
40 |         sys.exit(1)
41 | 
42 |     hdf5_data = hdf5_file[hdf5_path][()]
43 |     hdf5_file.close()
44 | 
45 |     return hdf5_data
46 | 
47 | 
48 | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
49 |     """Write dataset to hdf5.
50 | 
51 |     Args:
52 |         hdf5_name (str): Hdf5 dataset filename.
53 |         hdf5_path (str): Dataset path in hdf5.
54 |         write_data (ndarray): Data to write.
55 |         is_overwrite (bool): Whether to overwrite dataset.
56 | 
57 |     """
58 |     # convert to numpy array
59 |     write_data = np.array(write_data)
60 | 
61 |     # check folder existence
62 |     folder_name, _ = os.path.split(hdf5_name)
63 |     if not os.path.exists(folder_name) and len(folder_name) != 0:
64 |         os.makedirs(folder_name)
65 | 
66 |     # check hdf5 existence
67 |     if os.path.exists(hdf5_name):
68 |         # if already exists, open with r+ mode
69 |         hdf5_file = h5py.File(hdf5_name, "r+")
70 |         # check dataset existence
71 |         if hdf5_path in hdf5_file:
72 |             if is_overwrite:
73 |                 hdf5_file.__delitem__(hdf5_path)
74 |             else:
75 |                 logger.error(
76 |                     "Dataset in hdf5 file already exists. "
77 |                     "If you want to overwrite, please set is_overwrite = True."
78 |                 )
79 |                 hdf5_file.close()
80 |                 sys.exit(1)
81 |     else:
82 |         # if not exists, open with w mode
83 |         hdf5_file = h5py.File(hdf5_name, "w")
84 | 
85 |     # write data to hdf5
86 |     hdf5_file.create_dataset(hdf5_path, data=write_data)
87 |     hdf5_file.flush()
88 |     hdf5_file.close()
89 | 
90 | 
91 | def check_hdf5(hdf5_name, hdf5_path):
92 |     """Check hdf5 dataset existence.
93 | 
94 |     Args:
95 |         hdf5_name (str): Filename of hdf5 file.
96 |         hdf5_path (str): Dataset name in hdf5 file.
97 | 
98 |     Returns:
99 |         (bool): True if the dataset exists, False otherwise.
100 | 
101 |     """
102 |     if not os.path.exists(hdf5_name):
103 |         return False
104 |     else:
105 |         with h5py.File(hdf5_name, "r") as f:
106 |             if hdf5_path in f:
107 |                 return True
108 |             else:
109 |                 return False
110 | 
111 | 
112 | def read_txt(file_list):
113 |     """Read a .txt file list.
114 | 
115 |     Arg:
116 |         file_list (str): Filename of the txt file list.
117 | 
118 |     Returns:
119 |         (list): List of the read lines.
120 | 
121 |     """
122 |     with open(file_list, "r") as f:
123 |         filenames = f.readlines()
124 |     return [filename.replace("\n", "") for filename in filenames]
125 | 
126 | 
127 | def check_filename(list1, list2):
128 |     """Check whether the filenames of two lists match.
129 | 
130 |     Args:
131 |         list1 (list): File list 1.
132 |         list2 (list): File list 2.
133 | 
134 |     Returns:
135 |         (bool): True if matched, False otherwise.
136 | 
137 |     """
138 | 
139 |     def _filename(x):
140 |         return os.path.basename(x).split(".")[0]
141 | 
142 |     list1 = list(map(_filename, list1))
143 |     list2 = list(map(_filename, list2))
144 | 
145 |     return list1 == list2
146 | 
--------------------------------------------------------------------------------
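A small round-trip sketch of the hdf5 helpers (hypothetical file and dataset names, not part of the repository):

```python
import numpy as np

from sifigan.utils import check_hdf5, read_hdf5, write_hdf5

feats = np.random.randn(100, 80).astype(np.float32)

write_hdf5("dummy.h5", "/mcep", feats)   # creates the file and dataset
assert check_hdf5("dummy.h5", "/mcep")   # the dataset now exists
restored = read_hdf5("dummy.h5", "/mcep")
assert np.allclose(feats, restored)      # lossless round trip
```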