├── .gitignore ├── LICENSE ├── README.md ├── egs ├── arctic │ ├── conf │ │ └── arctic.uSFGAN_60.yaml │ ├── data │ │ ├── pow_f0_dict.yaml │ │ └── scp │ │ │ ├── arctic_eval_16kHz.scp │ │ │ ├── arctic_train_16kHz.scp │ │ │ └── arctic_valid_16kHz.scp │ ├── exp │ │ └── usfgan_arctic_train_16kHz_uSFGAN_60 │ │ │ ├── checkpoint-400000steps.pkl │ │ │ └── samples │ │ │ ├── bdl_arctic_b0474.wav │ │ │ ├── bdl_arctic_b0474_f0.50.wav │ │ │ ├── bdl_arctic_b0474_f0.50_src.wav │ │ │ ├── bdl_arctic_b0474_f2.00.wav │ │ │ ├── bdl_arctic_b0474_f2.00_src.wav │ │ │ ├── bdl_arctic_b0474_src.wav │ │ │ ├── clb_arctic_b0475.wav │ │ │ ├── clb_arctic_b0475_f0.50.wav │ │ │ ├── clb_arctic_b0475_f0.50_src.wav │ │ │ ├── clb_arctic_b0475_f2.00.wav │ │ │ ├── clb_arctic_b0475_f2.00_src.wav │ │ │ └── clb_arctic_b0475_src.wav │ ├── run.py │ └── run.sh ├── parse_options.sh └── vcc18 │ ├── conf │ └── vcc18.uSFGAN_60.yaml │ ├── data │ ├── pow_f0_dict.yml │ └── scp │ │ ├── vcc18_eval_22kHz.scp │ │ ├── vcc18_train_22kHz.scp │ │ └── vcc18_valid_22kHz.scp │ ├── run.py │ └── run.sh ├── setup.cfg ├── setup.py ├── tools └── Makefile └── usfgan ├── __init__.py ├── bin ├── __init__.py ├── compute_statistics.py ├── decode.py ├── preprocess.py └── train.py ├── datasets ├── __init__.py └── audio_feat_dataset.py ├── distributed ├── __init__.py └── launch.py ├── layers ├── __init__.py ├── filter_network.py ├── residual_block.py ├── source_network.py └── upsample.py ├── losses ├── __init__.py ├── cheaptrick.py ├── source_loss.py └── stft_loss.py ├── models ├── __init__.py └── usfgan.py ├── optimizers ├── __init__.py └── radam.py └── utils ├── __init__.py ├── features.py ├── filters.py ├── index.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | egs/arctic/exp/*/wav 2 | egs/arctic/exp/*/predictions 3 | egs/arctic/data/wav 4 | egs/arctic/data/hdf5 5 | egs/arctic/data/stats 6 | egs/arctic/data/scp/*.list 7 | egs/vcc18/exp/*/wav 8 | egs/vcc18/exp/*/predictions 9 | egs/vcc18/data/wav 10 | egs/vcc18/data/hdf5 11 | egs/vcc18/data/stats 12 | egs/vcc18/data/scp/*.list 13 | egs/vasc/exp/*/wav 14 | egs/vasc/exp/*/predictions 15 | egs/vasc/data/wav 16 | egs/vasc/data/hdf5 17 | egs/vasc/data/stats 18 | egs/vasc/data/scp/*.list 19 | venv 20 | usfgan.egg-info 21 | __pycache__ 22 | settings.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021 Reo Yoneyama 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Unified Source-Filter GAN (uSFGAN)
3 | 
4 | A new PyTorch implementation of uSFGAN, together with an improved model, Harmonic-plus-Noise uSFGAN, is available [here](https://github.com/chomeyama/HN-UnifiedSourceFilterGAN).
5 | 
6 | This is the official PyTorch implementation of [uSFGAN](https://arxiv.org/abs/2104.04668), a unified source-filter network based on the factorization of [QPPWG](https://github.com/bigpon/QPPWG) by Yi-Chiao Wu @ Nagoya University ([@bigpon](https://github.com/bigpon)).
7 | 
8 |

9 | [Figure: uSFGAN architecture]
10 |

11 | 
12 | In this repo, we provide an example of training and testing uSFGAN as a vocoder for [WORLD](https://doi.org/10.1587/transinf.2015EDP7457) acoustic features.
13 | More details can be found on our [Demo](https://chomeyama.github.io/UnifiedSourceFilterGAN-Demo/) page.
14 | 
15 | ## Requirements
16 | 
17 | This repository is tested on Ubuntu 20.04 with a TITAN RTX 3090 GPU.
18 | 
19 | - Python 3.8+
20 | - CUDA 11.0
21 | - cuDNN 7+
22 | - PyTorch 1.7.1+
23 | 
24 | 
25 | ## Environment setup
26 | 
27 | ```bash
28 | $ cd UnifiedSourceFilterGAN
29 | $ pip install -e .
30 | ```
31 | 
32 | Please refer to the [PWG](https://github.com/kan-bayashi/ParallelWaveGAN) repo for more details.
33 | 
34 | ## Folder architecture
35 | - **egs**:
36 | The folder for projects.
37 | - **egs/vcc18**:
38 | The folder of the VCC2018 project.
39 | - **egs/vcc18/exp**:
40 | The folder for trained models.
41 | - **egs/vcc18/conf**:
42 | The folder for configs.
43 | - **egs/vcc18/data**:
44 | The folder for corpus-related files (wav, feature, list, ...).
45 | - **usfgan**:
46 | The folder of the source code.
47 | 
48 | Projects on the [CMU-ARCTIC](http://www.festvox.org/cmu_arctic/) corpus are also available:
49 | - Check **egs/arctic/***
50 | - Dataset separation is based on the [official NSF implementation](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
51 | 
52 | ## Run
53 | 
54 | ### Corpus and path setup
55 | 
56 | - Modify the corresponding CUDA paths in `egs/vcc18/run.py`.
57 | - Download the [Voice Conversion Challenge 2018](https://datashare.is.ed.ac.uk/handle/10283/3061) (VCC2018) corpus to run the uSFGAN example.
58 | 
59 | ```bash
60 | $ cd egs/vcc18
61 | # Download the training and validation corpus
62 | $ wget -o train.log -O data/wav/train.zip https://datashare.is.ed.ac.uk/bitstream/handle/10283/3061/vcc2018_database_training.zip
63 | # Download the evaluation corpus
64 | $ wget -o eval.log -O data/wav/eval.zip https://datashare.is.ed.ac.uk/bitstream/handle/10283/3061/vcc2018_database_evaluation.zip
65 | # Unzip the corpus
66 | $ unzip data/wav/train.zip -d data/wav/
67 | $ unzip data/wav/eval.zip -d data/wav/
68 | ```
69 | 
70 | - **Training wav list**: `data/scp/vcc18_train_22kHz.scp`.
71 | - **Validation wav list**: `data/scp/vcc18_valid_22kHz.scp`.
72 | - **Testing wav list**: `data/scp/vcc18_eval_22kHz.scp`.
73 | 
74 | ### Preprocessing
75 | 
76 | ```bash
77 | # Extract WORLD acoustic features and compute statistics of the training and testing data
78 | $ bash run.sh --stage 0 --conf uSFGAN_60
79 | ```
80 | 
81 | - WORLD-related settings can be changed in `egs/vcc18/conf/vcc18.uSFGAN_60.yaml`.
82 | - If you want to use another corpus, please create a corresponding config and a file with power thresholds and F0 ranges like `egs/vcc18/data/pow_f0_dict.yml`.
83 | - More details about feature extraction can be found in the [QPNet](https://github.com/bigpon/QPNet) repo.
84 | - The lists of auxiliary features will be generated automatically.
85 | - **Training aux list**: `data/scp/vcc18_train_22kHz.list`.
86 | - **Validation aux list**: `data/scp/vcc18_valid_22kHz.list`.
87 | - **Testing aux list**: `data/scp/vcc18_eval_22kHz.list`.
88 | 
89 | 
90 | ### uSFGAN training
91 | 
92 | ```bash
93 | # Train a uSFGAN model with the 'uSFGAN_60' config and the 'vcc18_train_22kHz' and 'vcc18_valid_22kHz' sets.
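# (A sketch of resuming, derived from run.sh/run.py: append --resume NUM to
#  continue from a saved checkpoint; e.g., --resume 200000 loads
#  exp/usfgan_vcc18_train_22kHz_uSFGAN_60/checkpoint-200000steps.pkl, assuming
#  that checkpoint was already saved by an earlier run.)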
94 | $ bash run.sh --gpu 0 --stage 1 --conf uSFGAN_60 \
95 | --trainset vcc18_train_22kHz --validset vcc18_valid_22kHz
96 | ```
97 | 
98 | - The GPU ID can be set by --gpu GPU_ID (default: 0)
99 | - The model architecture can be set by --conf CONFIG (default: uSFGAN_60)
100 | - Training can be resumed from a saved checkpoint by --resume NUM (default: None)
101 | 
102 | 
103 | ### uSFGAN testing
104 | 
105 | ```bash
106 | # uSFGAN decoding w/ natural acoustic features
107 | $ bash run.sh --gpu 0 --stage 2 --conf uSFGAN_60 \
108 | --iter 400000 --trainset vcc18_train_22kHz --evalset vcc18_eval_22kHz
109 | # uSFGAN decoding w/ scaled F0 (e.g., halved F0)
110 | $ bash run.sh --gpu 0 --stage 3 --conf uSFGAN_60 --scaled 0.50 \
111 | --iter 400000 --trainset vcc18_train_22kHz --evalset vcc18_eval_22kHz
112 | ```
113 | 
114 | ### Monitor training progress
115 | 
116 | ```bash
117 | $ tensorboard --logdir exp
118 | ```
119 | 
120 | - The training time of uSFGAN_60 with a TITAN RTX 3090 is around 6 days.
121 | 
122 | ## Citation
123 | If you find the code helpful, please cite the following article.
124 | 
125 | ```
126 | @inproceedings{yoneyama21_interspeech,
127 | author={Reo Yoneyama and Yi-Chiao Wu and Tomoki Toda},
128 | title={{Unified Source-Filter GAN: Unified Source-Filter Network Based On Factorization of Quasi-Periodic Parallel WaveGAN}},
129 | year={2021},
130 | booktitle={Proc. Interspeech 2021},
131 | pages={2187--2191},
132 | doi={10.21437/Interspeech.2021-517}
133 | }
134 | ```
135 | 
136 | ## Authors
137 | 
138 | Development:
139 | Reo Yoneyama @ Nagoya University ([@chomeyama](https://github.com/chomeyama))
140 | E-mail: `yoneyama.reo@g.sp.m.is.nagoya-u.ac.jp` 141 | 142 | Advisor: 143 | Yi-Chiao Wu @ Nagoya University ([@bigpon](https://github.com/bigpon))
144 | E-mail: `yichiao.wu@g.sp.m.is.nagoya-u.ac.jp` 145 | 146 | Tomoki Toda @ Nagoya University
146 | E-mail: `tomoki@icts.nagoya-u.ac.jp`
147 | 
--------------------------------------------------------------------------------
/egs/arctic/conf/arctic.uSFGAN_60.yaml:
--------------------------------------------------------------------------------
1 | # This is the hyperparameter configuration file for uSFGAN.
2 | # Please make sure this is adjusted for the CMU-ARCTIC corpus. If you want to
3 | # apply it to another dataset, you might need to carefully change some parameters.
4 | # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V.
5 | 
6 | ###########################################################
7 | #              FEATURE EXTRACTION SETTING                 #
8 | ###########################################################
9 | feat_type: "world" # Feature type
10 | sampling_rate: 16000 # Sampling rate.
11 | fft_size: 1024 # FFT size.
12 | hop_size: 80 # Hop size.
13 | shiftms: 5 # Frame shift (ms)
14 | mcep_dim: 25 # Mcep dim
15 | mcep_alpha: 0.42 # Mcep alpha
16 | highpass_cutoff: 0 # Cutoff frequency of preprocessing highpass filter
17 | uv_dim_idx: 0 # u/v dimension index
18 | f0_dim_idx: 1 # f0 dimension index
19 | mcep_dim_start: 2 # First dimension index of mcep
20 | mcep_dim_end: 28 # Last dimension index of mcep
21 | ap_dim_start: 29 # First dimension index of coded ap
22 | ap_dim_end: 29 # Last dimension index of coded ap
23 | stats_shift: 1 # Skip U/V in calculating statistics
24 | f0_cont: False # Continuous F0 or discrete F0
25 | # In the original uSFGAN paper, continuous F0 is used,
26 | # but discrete F0 is better for reducing V/UV errors.
27 | minf0: 70 # Minimum f0
28 | maxf0: 270 # Maximum f0
29 | pow_th: -20 # Power threshold
30 | mean_path: "/world/mean"
31 | scale_path: "/world/scale"
32 | 
33 | ###########################################################
34 | #         GENERATOR NETWORK ARCHITECTURE SETTING          #
35 | ###########################################################
36 | generator_type: "USFGANGenerator" # Generator type.
37 | generator_params:
38 | sampling_rate: 16000 # Sampling rate.
39 | hop_size: 80 # Hop size.
40 | in_channels: 1 # Number of input channels.
41 | out_channels: 1 # Number of output channels.
42 | blockFs: [0, 30] # Number of fixed residual blocks.
43 | cycleFs: [0, 3] # Number of fixed dilation cycles.
44 | blockAs: [30, 0] # Number of adaptive residual blocks.
45 | cycleAs: [6, 0] # Number of adaptive dilation cycles.
46 | cascade_modes: [1, 1] # Network cascaded mode (0: fix->adaptive; 1: adaptive->fix).
47 | residual_channels: 64 # Number of channels in residual conv.
48 | gate_channels: 128 # Number of channels in gated conv.
49 | skip_channels: 64 # Number of channels in skip conv.
50 | aux_channels: 29 # Number of channels for auxiliary feature conv.
51 | aux_context_window: 2 # Context window size for auxiliary feature.
52 | # If set to 2, the previous 2 and future 2 frames will be considered.
53 | upsample_params: # Upsampling network parameters.
54 | upsample_scales: [4, 2, 5, 2] # Upsampling scales. Their product must equal hop_size (4 * 2 * 5 * 2 = 80).
55 | 
56 | ###########################################################
57 | #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
58 | ###########################################################
59 | discriminator_type: "PWGDiscriminator"
60 | discriminator_params:
61 | in_channels: 1 # Number of input channels.
62 | out_channels: 1 # Number of output channels.
63 | kernel_size: 3 # Kernel size of conv layers.
64 | layers: 10 # Number of conv layers.
65 | conv_channels: 64 # Number of channels in conv layers.
66 | bias: true # Whether to use bias parameter in conv.
67 | use_weight_norm: true # Whether to use weight norm.
68 | # If set to true, it will be applied to all of the conv layers.
69 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
70 | nonlinear_activation_params: # Nonlinear function parameters
71 | negative_slope: 0.2 # Alpha in LeakyReLU.
72 | 
73 | ###########################################################
74 | #                   STFT LOSS SETTING                     #
75 | ###########################################################
76 | stft_loss_params:
77 | fft_sizes: [512, 128, 2048] # List of FFT sizes for STFT-based loss.
78 | hop_sizes: [80, 40, 640] # List of hop sizes for STFT-based loss.
79 | win_lengths: [320, 80, 1920] # List of window lengths for STFT-based loss.
80 | window: "hann_window" # Window function for STFT-based loss.
81 | 
82 | ###########################################################
83 | #                  SOURCE LOSS SETTING                    #
84 | ###########################################################
85 | source_loss_params: # Spectral envelope regularization loss
86 | sampling_rate: 16000 # Sampling rate.
87 | hop_size: 80 # Hop size.
88 | fft_size: 1024 # FFT size.
89 | f0_floor: 70 # Minimum F0 value.
90 | f0_ceil: 270 # Maximum F0 value.
91 | # 'fft_size' must be larger than (3.0 * 'sampling_rate' / 'f0_floor'); here 3.0 * 16000 / 70 ≈ 686, so fft_size = 1024 satisfies the constraint.
92 | 
93 | ###########################################################
94 | #                ADVERSARIAL LOSS SETTING                 #
95 | ###########################################################
96 | lambda_source: 0.5 # Loss balancing coefficient.
97 | # In the original uSFGAN paper, this weight is set to 1.0.
98 | lambda_adv: 4.0 # Loss balancing coefficient.
99 | 
100 | ###########################################################
101 | #                  DATA LOADER SETTING                    #
102 | ###########################################################
103 | batch_size: 8 # Batch size.
104 | batch_max_steps: 12000 # Length of each audio in batch. Make sure it is divisible by hop_size.
105 | pin_memory: true # Whether to pin memory in PyTorch DataLoader.
106 | num_workers: 2 # Number of workers in PyTorch DataLoader.
107 | remove_short_samples: true # Whether to remove samples shorter than batch_max_steps.
108 | allow_cache: true # Whether to allow cache in dataset. If true, it requires CPU memory.
109 | 
110 | ###########################################################
111 | #             OPTIMIZER & SCHEDULER SETTING               #
112 | ###########################################################
113 | generator_optimizer_params:
114 | lr: 0.0001 # Generator's learning rate.
115 | eps: 1.0e-6 # Generator's epsilon.
116 | weight_decay: 0.0 # Generator's weight decay coefficient.
117 | generator_scheduler_params:
118 | step_size: 200000 # Generator's scheduler step size.
119 | gamma: 0.5 # Generator's scheduler gamma.
120 | # At each step size, lr will be multiplied by this parameter.
121 | generator_grad_norm: 10 # Generator's gradient norm.
122 | discriminator_optimizer_params:
123 | lr: 0.00005 # Discriminator's learning rate.
124 | eps: 1.0e-6 # Discriminator's epsilon.
125 | weight_decay: 0.0 # Discriminator's weight decay coefficient.
126 | discriminator_scheduler_params:
127 | step_size: 200000 # Discriminator's scheduler step size.
128 | gamma: 0.5 # Discriminator's scheduler gamma.
129 | # At each step size, lr will be multiplied by this parameter.
130 | discriminator_grad_norm: 1 # Discriminator's gradient norm.
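# A worked example of the schedule above (a sketch derived from these values):
# the generator lr is 1.0e-4 until step 200000, then 1.0e-4 * 0.5 = 5.0e-5;
# the discriminator lr is 5.0e-5, halved to 2.5e-5 at step 200000 (the
# discriminator only starts updating at discriminator_train_start_steps below).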
131 | 132 | ########################################################### 133 | # INTERVAL SETTING # 134 | ########################################################### 135 | discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. 136 | train_max_steps: 400000 # Number of training steps. 137 | save_interval_steps: 5000 # Interval steps to save checkpoint. 138 | eval_interval_steps: 1000 # Interval steps to evaluate the network. 139 | log_interval_steps: 100 # Interval steps to record the training log. 140 | 141 | ########################################################### 142 | # OTHER SETTING # 143 | ########################################################### 144 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 145 | -------------------------------------------------------------------------------- /egs/arctic/data/pow_f0_dict.yaml: -------------------------------------------------------------------------------- 1 | bdl: 2 | f0_min: 75 3 | f0_max: 170 4 | pow_th: -20 5 | clb: 6 | f0_min: 130 7 | f0_max: 270 8 | pow_th: -20 9 | rms: 10 | f0_min: 70 11 | f0_max: 140 12 | pow_th: -20 13 | slt: 14 | f0_min: 130 15 | f0_max: 230 16 | pow_th: -20 -------------------------------------------------------------------------------- /egs/arctic/data/scp/arctic_eval_16kHz.scp: -------------------------------------------------------------------------------- 1 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0474.wav 2 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0475.wav 3 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0476.wav 4 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0477.wav 5 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0478.wav 6 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0479.wav 7 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0480.wav 8 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0481.wav 9 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0482.wav 10 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0483.wav 11 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0484.wav 12 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0485.wav 13 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0486.wav 14 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0487.wav 15 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0488.wav 16 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0489.wav 17 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0490.wav 18 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0491.wav 19 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0492.wav 20 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0493.wav 21 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0494.wav 22 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0495.wav 23 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0496.wav 24 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0497.wav 25 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0498.wav 26 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0499.wav 27 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0500.wav 28 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0501.wav 29 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0502.wav 30 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0503.wav 31 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0504.wav 32 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0505.wav 33 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0506.wav 34 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0507.wav 35 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0508.wav 36 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0509.wav 37 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0510.wav 38 | 
data/wav/arctic_evaluation/bdl/bdl_arctic_b0511.wav 39 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0512.wav 40 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0513.wav 41 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0514.wav 42 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0515.wav 43 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0516.wav 44 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0517.wav 45 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0518.wav 46 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0519.wav 47 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0520.wav 48 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0521.wav 49 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0522.wav 50 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0523.wav 51 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0524.wav 52 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0525.wav 53 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0526.wav 54 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0527.wav 55 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0528.wav 56 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0529.wav 57 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0530.wav 58 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0531.wav 59 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0532.wav 60 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0533.wav 61 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0534.wav 62 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0535.wav 63 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0536.wav 64 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0537.wav 65 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0538.wav 66 | data/wav/arctic_evaluation/bdl/bdl_arctic_b0539.wav 67 | data/wav/arctic_evaluation/clb/clb_arctic_b0474.wav 68 | data/wav/arctic_evaluation/clb/clb_arctic_b0475.wav 69 | data/wav/arctic_evaluation/clb/clb_arctic_b0476.wav 70 | data/wav/arctic_evaluation/clb/clb_arctic_b0477.wav 71 | data/wav/arctic_evaluation/clb/clb_arctic_b0478.wav 72 | data/wav/arctic_evaluation/clb/clb_arctic_b0479.wav 73 | data/wav/arctic_evaluation/clb/clb_arctic_b0480.wav 74 | data/wav/arctic_evaluation/clb/clb_arctic_b0481.wav 75 | data/wav/arctic_evaluation/clb/clb_arctic_b0482.wav 76 | data/wav/arctic_evaluation/clb/clb_arctic_b0483.wav 77 | data/wav/arctic_evaluation/clb/clb_arctic_b0484.wav 78 | data/wav/arctic_evaluation/clb/clb_arctic_b0485.wav 79 | data/wav/arctic_evaluation/clb/clb_arctic_b0486.wav 80 | data/wav/arctic_evaluation/clb/clb_arctic_b0487.wav 81 | data/wav/arctic_evaluation/clb/clb_arctic_b0488.wav 82 | data/wav/arctic_evaluation/clb/clb_arctic_b0489.wav 83 | data/wav/arctic_evaluation/clb/clb_arctic_b0490.wav 84 | data/wav/arctic_evaluation/clb/clb_arctic_b0491.wav 85 | data/wav/arctic_evaluation/clb/clb_arctic_b0492.wav 86 | data/wav/arctic_evaluation/clb/clb_arctic_b0493.wav 87 | data/wav/arctic_evaluation/clb/clb_arctic_b0494.wav 88 | data/wav/arctic_evaluation/clb/clb_arctic_b0495.wav 89 | data/wav/arctic_evaluation/clb/clb_arctic_b0496.wav 90 | data/wav/arctic_evaluation/clb/clb_arctic_b0497.wav 91 | data/wav/arctic_evaluation/clb/clb_arctic_b0498.wav 92 | data/wav/arctic_evaluation/clb/clb_arctic_b0499.wav 93 | data/wav/arctic_evaluation/clb/clb_arctic_b0500.wav 94 | data/wav/arctic_evaluation/clb/clb_arctic_b0501.wav 95 | data/wav/arctic_evaluation/clb/clb_arctic_b0502.wav 96 | data/wav/arctic_evaluation/clb/clb_arctic_b0503.wav 97 | data/wav/arctic_evaluation/clb/clb_arctic_b0504.wav 98 | data/wav/arctic_evaluation/clb/clb_arctic_b0505.wav 99 | data/wav/arctic_evaluation/clb/clb_arctic_b0506.wav 100 | 
data/wav/arctic_evaluation/clb/clb_arctic_b0507.wav 101 | data/wav/arctic_evaluation/clb/clb_arctic_b0508.wav 102 | data/wav/arctic_evaluation/clb/clb_arctic_b0509.wav 103 | data/wav/arctic_evaluation/clb/clb_arctic_b0510.wav 104 | data/wav/arctic_evaluation/clb/clb_arctic_b0511.wav 105 | data/wav/arctic_evaluation/clb/clb_arctic_b0512.wav 106 | data/wav/arctic_evaluation/clb/clb_arctic_b0513.wav 107 | data/wav/arctic_evaluation/clb/clb_arctic_b0514.wav 108 | data/wav/arctic_evaluation/clb/clb_arctic_b0515.wav 109 | data/wav/arctic_evaluation/clb/clb_arctic_b0516.wav 110 | data/wav/arctic_evaluation/clb/clb_arctic_b0517.wav 111 | data/wav/arctic_evaluation/clb/clb_arctic_b0518.wav 112 | data/wav/arctic_evaluation/clb/clb_arctic_b0519.wav 113 | data/wav/arctic_evaluation/clb/clb_arctic_b0520.wav 114 | data/wav/arctic_evaluation/clb/clb_arctic_b0521.wav 115 | data/wav/arctic_evaluation/clb/clb_arctic_b0522.wav 116 | data/wav/arctic_evaluation/clb/clb_arctic_b0523.wav 117 | data/wav/arctic_evaluation/clb/clb_arctic_b0524.wav 118 | data/wav/arctic_evaluation/clb/clb_arctic_b0525.wav 119 | data/wav/arctic_evaluation/clb/clb_arctic_b0526.wav 120 | data/wav/arctic_evaluation/clb/clb_arctic_b0527.wav 121 | data/wav/arctic_evaluation/clb/clb_arctic_b0528.wav 122 | data/wav/arctic_evaluation/clb/clb_arctic_b0529.wav 123 | data/wav/arctic_evaluation/clb/clb_arctic_b0530.wav 124 | data/wav/arctic_evaluation/clb/clb_arctic_b0531.wav 125 | data/wav/arctic_evaluation/clb/clb_arctic_b0532.wav 126 | data/wav/arctic_evaluation/clb/clb_arctic_b0533.wav 127 | data/wav/arctic_evaluation/clb/clb_arctic_b0534.wav 128 | data/wav/arctic_evaluation/clb/clb_arctic_b0535.wav 129 | data/wav/arctic_evaluation/clb/clb_arctic_b0536.wav 130 | data/wav/arctic_evaluation/clb/clb_arctic_b0537.wav 131 | data/wav/arctic_evaluation/clb/clb_arctic_b0538.wav 132 | data/wav/arctic_evaluation/clb/clb_arctic_b0539.wav 133 | data/wav/arctic_evaluation/rms/rms_arctic_b0474.wav 134 | data/wav/arctic_evaluation/rms/rms_arctic_b0475.wav 135 | data/wav/arctic_evaluation/rms/rms_arctic_b0476.wav 136 | data/wav/arctic_evaluation/rms/rms_arctic_b0477.wav 137 | data/wav/arctic_evaluation/rms/rms_arctic_b0478.wav 138 | data/wav/arctic_evaluation/rms/rms_arctic_b0479.wav 139 | data/wav/arctic_evaluation/rms/rms_arctic_b0480.wav 140 | data/wav/arctic_evaluation/rms/rms_arctic_b0481.wav 141 | data/wav/arctic_evaluation/rms/rms_arctic_b0482.wav 142 | data/wav/arctic_evaluation/rms/rms_arctic_b0483.wav 143 | data/wav/arctic_evaluation/rms/rms_arctic_b0484.wav 144 | data/wav/arctic_evaluation/rms/rms_arctic_b0485.wav 145 | data/wav/arctic_evaluation/rms/rms_arctic_b0486.wav 146 | data/wav/arctic_evaluation/rms/rms_arctic_b0487.wav 147 | data/wav/arctic_evaluation/rms/rms_arctic_b0488.wav 148 | data/wav/arctic_evaluation/rms/rms_arctic_b0489.wav 149 | data/wav/arctic_evaluation/rms/rms_arctic_b0490.wav 150 | data/wav/arctic_evaluation/rms/rms_arctic_b0491.wav 151 | data/wav/arctic_evaluation/rms/rms_arctic_b0492.wav 152 | data/wav/arctic_evaluation/rms/rms_arctic_b0493.wav 153 | data/wav/arctic_evaluation/rms/rms_arctic_b0494.wav 154 | data/wav/arctic_evaluation/rms/rms_arctic_b0495.wav 155 | data/wav/arctic_evaluation/rms/rms_arctic_b0496.wav 156 | data/wav/arctic_evaluation/rms/rms_arctic_b0497.wav 157 | data/wav/arctic_evaluation/rms/rms_arctic_b0498.wav 158 | data/wav/arctic_evaluation/rms/rms_arctic_b0499.wav 159 | data/wav/arctic_evaluation/rms/rms_arctic_b0500.wav 160 | data/wav/arctic_evaluation/rms/rms_arctic_b0501.wav 161 | 
data/wav/arctic_evaluation/rms/rms_arctic_b0502.wav 162 | data/wav/arctic_evaluation/rms/rms_arctic_b0503.wav 163 | data/wav/arctic_evaluation/rms/rms_arctic_b0504.wav 164 | data/wav/arctic_evaluation/rms/rms_arctic_b0505.wav 165 | data/wav/arctic_evaluation/rms/rms_arctic_b0506.wav 166 | data/wav/arctic_evaluation/rms/rms_arctic_b0507.wav 167 | data/wav/arctic_evaluation/rms/rms_arctic_b0508.wav 168 | data/wav/arctic_evaluation/rms/rms_arctic_b0509.wav 169 | data/wav/arctic_evaluation/rms/rms_arctic_b0510.wav 170 | data/wav/arctic_evaluation/rms/rms_arctic_b0511.wav 171 | data/wav/arctic_evaluation/rms/rms_arctic_b0512.wav 172 | data/wav/arctic_evaluation/rms/rms_arctic_b0513.wav 173 | data/wav/arctic_evaluation/rms/rms_arctic_b0514.wav 174 | data/wav/arctic_evaluation/rms/rms_arctic_b0515.wav 175 | data/wav/arctic_evaluation/rms/rms_arctic_b0516.wav 176 | data/wav/arctic_evaluation/rms/rms_arctic_b0517.wav 177 | data/wav/arctic_evaluation/rms/rms_arctic_b0518.wav 178 | data/wav/arctic_evaluation/rms/rms_arctic_b0519.wav 179 | data/wav/arctic_evaluation/rms/rms_arctic_b0520.wav 180 | data/wav/arctic_evaluation/rms/rms_arctic_b0521.wav 181 | data/wav/arctic_evaluation/rms/rms_arctic_b0522.wav 182 | data/wav/arctic_evaluation/rms/rms_arctic_b0523.wav 183 | data/wav/arctic_evaluation/rms/rms_arctic_b0524.wav 184 | data/wav/arctic_evaluation/rms/rms_arctic_b0525.wav 185 | data/wav/arctic_evaluation/rms/rms_arctic_b0526.wav 186 | data/wav/arctic_evaluation/rms/rms_arctic_b0527.wav 187 | data/wav/arctic_evaluation/rms/rms_arctic_b0528.wav 188 | data/wav/arctic_evaluation/rms/rms_arctic_b0529.wav 189 | data/wav/arctic_evaluation/rms/rms_arctic_b0530.wav 190 | data/wav/arctic_evaluation/rms/rms_arctic_b0531.wav 191 | data/wav/arctic_evaluation/rms/rms_arctic_b0532.wav 192 | data/wav/arctic_evaluation/rms/rms_arctic_b0533.wav 193 | data/wav/arctic_evaluation/rms/rms_arctic_b0534.wav 194 | data/wav/arctic_evaluation/rms/rms_arctic_b0535.wav 195 | data/wav/arctic_evaluation/rms/rms_arctic_b0536.wav 196 | data/wav/arctic_evaluation/rms/rms_arctic_b0537.wav 197 | data/wav/arctic_evaluation/rms/rms_arctic_b0538.wav 198 | data/wav/arctic_evaluation/rms/rms_arctic_b0539.wav 199 | data/wav/arctic_evaluation/slt/slt_arctic_b0474.wav 200 | data/wav/arctic_evaluation/slt/slt_arctic_b0475.wav 201 | data/wav/arctic_evaluation/slt/slt_arctic_b0476.wav 202 | data/wav/arctic_evaluation/slt/slt_arctic_b0477.wav 203 | data/wav/arctic_evaluation/slt/slt_arctic_b0478.wav 204 | data/wav/arctic_evaluation/slt/slt_arctic_b0479.wav 205 | data/wav/arctic_evaluation/slt/slt_arctic_b0480.wav 206 | data/wav/arctic_evaluation/slt/slt_arctic_b0481.wav 207 | data/wav/arctic_evaluation/slt/slt_arctic_b0482.wav 208 | data/wav/arctic_evaluation/slt/slt_arctic_b0483.wav 209 | data/wav/arctic_evaluation/slt/slt_arctic_b0484.wav 210 | data/wav/arctic_evaluation/slt/slt_arctic_b0485.wav 211 | data/wav/arctic_evaluation/slt/slt_arctic_b0486.wav 212 | data/wav/arctic_evaluation/slt/slt_arctic_b0487.wav 213 | data/wav/arctic_evaluation/slt/slt_arctic_b0488.wav 214 | data/wav/arctic_evaluation/slt/slt_arctic_b0489.wav 215 | data/wav/arctic_evaluation/slt/slt_arctic_b0490.wav 216 | data/wav/arctic_evaluation/slt/slt_arctic_b0491.wav 217 | data/wav/arctic_evaluation/slt/slt_arctic_b0492.wav 218 | data/wav/arctic_evaluation/slt/slt_arctic_b0493.wav 219 | data/wav/arctic_evaluation/slt/slt_arctic_b0494.wav 220 | data/wav/arctic_evaluation/slt/slt_arctic_b0495.wav 221 | data/wav/arctic_evaluation/slt/slt_arctic_b0496.wav 222 | 
data/wav/arctic_evaluation/slt/slt_arctic_b0497.wav 223 | data/wav/arctic_evaluation/slt/slt_arctic_b0498.wav 224 | data/wav/arctic_evaluation/slt/slt_arctic_b0499.wav 225 | data/wav/arctic_evaluation/slt/slt_arctic_b0500.wav 226 | data/wav/arctic_evaluation/slt/slt_arctic_b0501.wav 227 | data/wav/arctic_evaluation/slt/slt_arctic_b0502.wav 228 | data/wav/arctic_evaluation/slt/slt_arctic_b0503.wav 229 | data/wav/arctic_evaluation/slt/slt_arctic_b0504.wav 230 | data/wav/arctic_evaluation/slt/slt_arctic_b0505.wav 231 | data/wav/arctic_evaluation/slt/slt_arctic_b0506.wav 232 | data/wav/arctic_evaluation/slt/slt_arctic_b0507.wav 233 | data/wav/arctic_evaluation/slt/slt_arctic_b0508.wav 234 | data/wav/arctic_evaluation/slt/slt_arctic_b0509.wav 235 | data/wav/arctic_evaluation/slt/slt_arctic_b0510.wav 236 | data/wav/arctic_evaluation/slt/slt_arctic_b0511.wav 237 | data/wav/arctic_evaluation/slt/slt_arctic_b0512.wav 238 | data/wav/arctic_evaluation/slt/slt_arctic_b0513.wav 239 | data/wav/arctic_evaluation/slt/slt_arctic_b0514.wav 240 | data/wav/arctic_evaluation/slt/slt_arctic_b0515.wav 241 | data/wav/arctic_evaluation/slt/slt_arctic_b0516.wav 242 | data/wav/arctic_evaluation/slt/slt_arctic_b0517.wav 243 | data/wav/arctic_evaluation/slt/slt_arctic_b0518.wav 244 | data/wav/arctic_evaluation/slt/slt_arctic_b0519.wav 245 | data/wav/arctic_evaluation/slt/slt_arctic_b0520.wav 246 | data/wav/arctic_evaluation/slt/slt_arctic_b0521.wav 247 | data/wav/arctic_evaluation/slt/slt_arctic_b0522.wav 248 | data/wav/arctic_evaluation/slt/slt_arctic_b0523.wav 249 | data/wav/arctic_evaluation/slt/slt_arctic_b0524.wav 250 | data/wav/arctic_evaluation/slt/slt_arctic_b0525.wav 251 | data/wav/arctic_evaluation/slt/slt_arctic_b0526.wav 252 | data/wav/arctic_evaluation/slt/slt_arctic_b0527.wav 253 | data/wav/arctic_evaluation/slt/slt_arctic_b0528.wav 254 | data/wav/arctic_evaluation/slt/slt_arctic_b0529.wav 255 | data/wav/arctic_evaluation/slt/slt_arctic_b0530.wav 256 | data/wav/arctic_evaluation/slt/slt_arctic_b0531.wav 257 | data/wav/arctic_evaluation/slt/slt_arctic_b0532.wav 258 | data/wav/arctic_evaluation/slt/slt_arctic_b0533.wav 259 | data/wav/arctic_evaluation/slt/slt_arctic_b0534.wav 260 | data/wav/arctic_evaluation/slt/slt_arctic_b0535.wav 261 | data/wav/arctic_evaluation/slt/slt_arctic_b0536.wav 262 | data/wav/arctic_evaluation/slt/slt_arctic_b0537.wav 263 | data/wav/arctic_evaluation/slt/slt_arctic_b0538.wav 264 | data/wav/arctic_evaluation/slt/slt_arctic_b0539.wav 265 | -------------------------------------------------------------------------------- /egs/arctic/data/scp/arctic_valid_16kHz.scp: -------------------------------------------------------------------------------- 1 | data/wav/arctic_training/bdl/bdl_arctic_b0408.wav 2 | data/wav/arctic_training/bdl/bdl_arctic_b0409.wav 3 | data/wav/arctic_training/bdl/bdl_arctic_b0410.wav 4 | data/wav/arctic_training/bdl/bdl_arctic_b0411.wav 5 | data/wav/arctic_training/bdl/bdl_arctic_b0412.wav 6 | data/wav/arctic_training/bdl/bdl_arctic_b0413.wav 7 | data/wav/arctic_training/bdl/bdl_arctic_b0414.wav 8 | data/wav/arctic_training/bdl/bdl_arctic_b0415.wav 9 | data/wav/arctic_training/bdl/bdl_arctic_b0416.wav 10 | data/wav/arctic_training/bdl/bdl_arctic_b0417.wav 11 | data/wav/arctic_training/bdl/bdl_arctic_b0418.wav 12 | data/wav/arctic_training/bdl/bdl_arctic_b0419.wav 13 | data/wav/arctic_training/bdl/bdl_arctic_b0420.wav 14 | data/wav/arctic_training/bdl/bdl_arctic_b0421.wav 15 | data/wav/arctic_training/bdl/bdl_arctic_b0422.wav 16 | 
data/wav/arctic_training/bdl/bdl_arctic_b0423.wav 17 | data/wav/arctic_training/bdl/bdl_arctic_b0424.wav 18 | data/wav/arctic_training/bdl/bdl_arctic_b0425.wav 19 | data/wav/arctic_training/bdl/bdl_arctic_b0426.wav 20 | data/wav/arctic_training/bdl/bdl_arctic_b0427.wav 21 | data/wav/arctic_training/bdl/bdl_arctic_b0428.wav 22 | data/wav/arctic_training/bdl/bdl_arctic_b0429.wav 23 | data/wav/arctic_training/bdl/bdl_arctic_b0430.wav 24 | data/wav/arctic_training/bdl/bdl_arctic_b0431.wav 25 | data/wav/arctic_training/bdl/bdl_arctic_b0432.wav 26 | data/wav/arctic_training/bdl/bdl_arctic_b0433.wav 27 | data/wav/arctic_training/bdl/bdl_arctic_b0434.wav 28 | data/wav/arctic_training/bdl/bdl_arctic_b0435.wav 29 | data/wav/arctic_training/bdl/bdl_arctic_b0436.wav 30 | data/wav/arctic_training/bdl/bdl_arctic_b0437.wav 31 | data/wav/arctic_training/bdl/bdl_arctic_b0438.wav 32 | data/wav/arctic_training/bdl/bdl_arctic_b0439.wav 33 | data/wav/arctic_training/bdl/bdl_arctic_b0440.wav 34 | data/wav/arctic_training/bdl/bdl_arctic_b0441.wav 35 | data/wav/arctic_training/bdl/bdl_arctic_b0442.wav 36 | data/wav/arctic_training/bdl/bdl_arctic_b0443.wav 37 | data/wav/arctic_training/bdl/bdl_arctic_b0444.wav 38 | data/wav/arctic_training/bdl/bdl_arctic_b0445.wav 39 | data/wav/arctic_training/bdl/bdl_arctic_b0446.wav 40 | data/wav/arctic_training/bdl/bdl_arctic_b0447.wav 41 | data/wav/arctic_training/bdl/bdl_arctic_b0448.wav 42 | data/wav/arctic_training/bdl/bdl_arctic_b0449.wav 43 | data/wav/arctic_training/bdl/bdl_arctic_b0450.wav 44 | data/wav/arctic_training/bdl/bdl_arctic_b0451.wav 45 | data/wav/arctic_training/bdl/bdl_arctic_b0452.wav 46 | data/wav/arctic_training/bdl/bdl_arctic_b0453.wav 47 | data/wav/arctic_training/bdl/bdl_arctic_b0454.wav 48 | data/wav/arctic_training/bdl/bdl_arctic_b0455.wav 49 | data/wav/arctic_training/bdl/bdl_arctic_b0456.wav 50 | data/wav/arctic_training/bdl/bdl_arctic_b0457.wav 51 | data/wav/arctic_training/bdl/bdl_arctic_b0458.wav 52 | data/wav/arctic_training/bdl/bdl_arctic_b0459.wav 53 | data/wav/arctic_training/bdl/bdl_arctic_b0460.wav 54 | data/wav/arctic_training/bdl/bdl_arctic_b0461.wav 55 | data/wav/arctic_training/bdl/bdl_arctic_b0462.wav 56 | data/wav/arctic_training/bdl/bdl_arctic_b0463.wav 57 | data/wav/arctic_training/bdl/bdl_arctic_b0464.wav 58 | data/wav/arctic_training/bdl/bdl_arctic_b0465.wav 59 | data/wav/arctic_training/bdl/bdl_arctic_b0466.wav 60 | data/wav/arctic_training/bdl/bdl_arctic_b0467.wav 61 | data/wav/arctic_training/bdl/bdl_arctic_b0468.wav 62 | data/wav/arctic_training/bdl/bdl_arctic_b0469.wav 63 | data/wav/arctic_training/bdl/bdl_arctic_b0470.wav 64 | data/wav/arctic_training/bdl/bdl_arctic_b0471.wav 65 | data/wav/arctic_training/bdl/bdl_arctic_b0472.wav 66 | data/wav/arctic_training/bdl/bdl_arctic_b0473.wav 67 | data/wav/arctic_training/clb/clb_arctic_b0408.wav 68 | data/wav/arctic_training/clb/clb_arctic_b0409.wav 69 | data/wav/arctic_training/clb/clb_arctic_b0410.wav 70 | data/wav/arctic_training/clb/clb_arctic_b0411.wav 71 | data/wav/arctic_training/clb/clb_arctic_b0412.wav 72 | data/wav/arctic_training/clb/clb_arctic_b0413.wav 73 | data/wav/arctic_training/clb/clb_arctic_b0414.wav 74 | data/wav/arctic_training/clb/clb_arctic_b0415.wav 75 | data/wav/arctic_training/clb/clb_arctic_b0416.wav 76 | data/wav/arctic_training/clb/clb_arctic_b0417.wav 77 | data/wav/arctic_training/clb/clb_arctic_b0418.wav 78 | data/wav/arctic_training/clb/clb_arctic_b0419.wav 79 | data/wav/arctic_training/clb/clb_arctic_b0420.wav 80 | 
data/wav/arctic_training/clb/clb_arctic_b0421.wav 81 | data/wav/arctic_training/clb/clb_arctic_b0422.wav 82 | data/wav/arctic_training/clb/clb_arctic_b0423.wav 83 | data/wav/arctic_training/clb/clb_arctic_b0424.wav 84 | data/wav/arctic_training/clb/clb_arctic_b0425.wav 85 | data/wav/arctic_training/clb/clb_arctic_b0426.wav 86 | data/wav/arctic_training/clb/clb_arctic_b0427.wav 87 | data/wav/arctic_training/clb/clb_arctic_b0428.wav 88 | data/wav/arctic_training/clb/clb_arctic_b0429.wav 89 | data/wav/arctic_training/clb/clb_arctic_b0430.wav 90 | data/wav/arctic_training/clb/clb_arctic_b0431.wav 91 | data/wav/arctic_training/clb/clb_arctic_b0432.wav 92 | data/wav/arctic_training/clb/clb_arctic_b0433.wav 93 | data/wav/arctic_training/clb/clb_arctic_b0434.wav 94 | data/wav/arctic_training/clb/clb_arctic_b0435.wav 95 | data/wav/arctic_training/clb/clb_arctic_b0436.wav 96 | data/wav/arctic_training/clb/clb_arctic_b0437.wav 97 | data/wav/arctic_training/clb/clb_arctic_b0438.wav 98 | data/wav/arctic_training/clb/clb_arctic_b0439.wav 99 | data/wav/arctic_training/clb/clb_arctic_b0440.wav 100 | data/wav/arctic_training/clb/clb_arctic_b0441.wav 101 | data/wav/arctic_training/clb/clb_arctic_b0442.wav 102 | data/wav/arctic_training/clb/clb_arctic_b0443.wav 103 | data/wav/arctic_training/clb/clb_arctic_b0444.wav 104 | data/wav/arctic_training/clb/clb_arctic_b0445.wav 105 | data/wav/arctic_training/clb/clb_arctic_b0446.wav 106 | data/wav/arctic_training/clb/clb_arctic_b0447.wav 107 | data/wav/arctic_training/clb/clb_arctic_b0448.wav 108 | data/wav/arctic_training/clb/clb_arctic_b0449.wav 109 | data/wav/arctic_training/clb/clb_arctic_b0450.wav 110 | data/wav/arctic_training/clb/clb_arctic_b0451.wav 111 | data/wav/arctic_training/clb/clb_arctic_b0452.wav 112 | data/wav/arctic_training/clb/clb_arctic_b0453.wav 113 | data/wav/arctic_training/clb/clb_arctic_b0454.wav 114 | data/wav/arctic_training/clb/clb_arctic_b0455.wav 115 | data/wav/arctic_training/clb/clb_arctic_b0456.wav 116 | data/wav/arctic_training/clb/clb_arctic_b0457.wav 117 | data/wav/arctic_training/clb/clb_arctic_b0458.wav 118 | data/wav/arctic_training/clb/clb_arctic_b0459.wav 119 | data/wav/arctic_training/clb/clb_arctic_b0460.wav 120 | data/wav/arctic_training/clb/clb_arctic_b0461.wav 121 | data/wav/arctic_training/clb/clb_arctic_b0462.wav 122 | data/wav/arctic_training/clb/clb_arctic_b0463.wav 123 | data/wav/arctic_training/clb/clb_arctic_b0464.wav 124 | data/wav/arctic_training/clb/clb_arctic_b0465.wav 125 | data/wav/arctic_training/clb/clb_arctic_b0466.wav 126 | data/wav/arctic_training/clb/clb_arctic_b0467.wav 127 | data/wav/arctic_training/clb/clb_arctic_b0468.wav 128 | data/wav/arctic_training/clb/clb_arctic_b0469.wav 129 | data/wav/arctic_training/clb/clb_arctic_b0470.wav 130 | data/wav/arctic_training/clb/clb_arctic_b0471.wav 131 | data/wav/arctic_training/clb/clb_arctic_b0472.wav 132 | data/wav/arctic_training/clb/clb_arctic_b0473.wav 133 | data/wav/arctic_training/rms/rms_arctic_b0408.wav 134 | data/wav/arctic_training/rms/rms_arctic_b0409.wav 135 | data/wav/arctic_training/rms/rms_arctic_b0410.wav 136 | data/wav/arctic_training/rms/rms_arctic_b0411.wav 137 | data/wav/arctic_training/rms/rms_arctic_b0412.wav 138 | data/wav/arctic_training/rms/rms_arctic_b0413.wav 139 | data/wav/arctic_training/rms/rms_arctic_b0414.wav 140 | data/wav/arctic_training/rms/rms_arctic_b0415.wav 141 | data/wav/arctic_training/rms/rms_arctic_b0416.wav 142 | data/wav/arctic_training/rms/rms_arctic_b0417.wav 143 | 
data/wav/arctic_training/rms/rms_arctic_b0418.wav 144 | data/wav/arctic_training/rms/rms_arctic_b0419.wav 145 | data/wav/arctic_training/rms/rms_arctic_b0420.wav 146 | data/wav/arctic_training/rms/rms_arctic_b0421.wav 147 | data/wav/arctic_training/rms/rms_arctic_b0422.wav 148 | data/wav/arctic_training/rms/rms_arctic_b0423.wav 149 | data/wav/arctic_training/rms/rms_arctic_b0424.wav 150 | data/wav/arctic_training/rms/rms_arctic_b0425.wav 151 | data/wav/arctic_training/rms/rms_arctic_b0426.wav 152 | data/wav/arctic_training/rms/rms_arctic_b0427.wav 153 | data/wav/arctic_training/rms/rms_arctic_b0428.wav 154 | data/wav/arctic_training/rms/rms_arctic_b0429.wav 155 | data/wav/arctic_training/rms/rms_arctic_b0430.wav 156 | data/wav/arctic_training/rms/rms_arctic_b0431.wav 157 | data/wav/arctic_training/rms/rms_arctic_b0432.wav 158 | data/wav/arctic_training/rms/rms_arctic_b0433.wav 159 | data/wav/arctic_training/rms/rms_arctic_b0434.wav 160 | data/wav/arctic_training/rms/rms_arctic_b0435.wav 161 | data/wav/arctic_training/rms/rms_arctic_b0436.wav 162 | data/wav/arctic_training/rms/rms_arctic_b0437.wav 163 | data/wav/arctic_training/rms/rms_arctic_b0438.wav 164 | data/wav/arctic_training/rms/rms_arctic_b0439.wav 165 | data/wav/arctic_training/rms/rms_arctic_b0440.wav 166 | data/wav/arctic_training/rms/rms_arctic_b0441.wav 167 | data/wav/arctic_training/rms/rms_arctic_b0442.wav 168 | data/wav/arctic_training/rms/rms_arctic_b0443.wav 169 | data/wav/arctic_training/rms/rms_arctic_b0444.wav 170 | data/wav/arctic_training/rms/rms_arctic_b0445.wav 171 | data/wav/arctic_training/rms/rms_arctic_b0446.wav 172 | data/wav/arctic_training/rms/rms_arctic_b0447.wav 173 | data/wav/arctic_training/rms/rms_arctic_b0448.wav 174 | data/wav/arctic_training/rms/rms_arctic_b0449.wav 175 | data/wav/arctic_training/rms/rms_arctic_b0450.wav 176 | data/wav/arctic_training/rms/rms_arctic_b0451.wav 177 | data/wav/arctic_training/rms/rms_arctic_b0452.wav 178 | data/wav/arctic_training/rms/rms_arctic_b0453.wav 179 | data/wav/arctic_training/rms/rms_arctic_b0454.wav 180 | data/wav/arctic_training/rms/rms_arctic_b0455.wav 181 | data/wav/arctic_training/rms/rms_arctic_b0456.wav 182 | data/wav/arctic_training/rms/rms_arctic_b0457.wav 183 | data/wav/arctic_training/rms/rms_arctic_b0458.wav 184 | data/wav/arctic_training/rms/rms_arctic_b0459.wav 185 | data/wav/arctic_training/rms/rms_arctic_b0460.wav 186 | data/wav/arctic_training/rms/rms_arctic_b0461.wav 187 | data/wav/arctic_training/rms/rms_arctic_b0462.wav 188 | data/wav/arctic_training/rms/rms_arctic_b0463.wav 189 | data/wav/arctic_training/rms/rms_arctic_b0464.wav 190 | data/wav/arctic_training/rms/rms_arctic_b0465.wav 191 | data/wav/arctic_training/rms/rms_arctic_b0466.wav 192 | data/wav/arctic_training/rms/rms_arctic_b0467.wav 193 | data/wav/arctic_training/rms/rms_arctic_b0468.wav 194 | data/wav/arctic_training/rms/rms_arctic_b0469.wav 195 | data/wav/arctic_training/rms/rms_arctic_b0470.wav 196 | data/wav/arctic_training/rms/rms_arctic_b0471.wav 197 | data/wav/arctic_training/rms/rms_arctic_b0472.wav 198 | data/wav/arctic_training/rms/rms_arctic_b0473.wav 199 | data/wav/arctic_training/slt/slt_arctic_b0408.wav 200 | data/wav/arctic_training/slt/slt_arctic_b0409.wav 201 | data/wav/arctic_training/slt/slt_arctic_b0410.wav 202 | data/wav/arctic_training/slt/slt_arctic_b0411.wav 203 | data/wav/arctic_training/slt/slt_arctic_b0412.wav 204 | data/wav/arctic_training/slt/slt_arctic_b0413.wav 205 | data/wav/arctic_training/slt/slt_arctic_b0414.wav 206 | 
data/wav/arctic_training/slt/slt_arctic_b0415.wav 207 | data/wav/arctic_training/slt/slt_arctic_b0416.wav 208 | data/wav/arctic_training/slt/slt_arctic_b0417.wav 209 | data/wav/arctic_training/slt/slt_arctic_b0418.wav 210 | data/wav/arctic_training/slt/slt_arctic_b0419.wav 211 | data/wav/arctic_training/slt/slt_arctic_b0420.wav 212 | data/wav/arctic_training/slt/slt_arctic_b0421.wav 213 | data/wav/arctic_training/slt/slt_arctic_b0422.wav 214 | data/wav/arctic_training/slt/slt_arctic_b0423.wav 215 | data/wav/arctic_training/slt/slt_arctic_b0424.wav 216 | data/wav/arctic_training/slt/slt_arctic_b0425.wav 217 | data/wav/arctic_training/slt/slt_arctic_b0426.wav 218 | data/wav/arctic_training/slt/slt_arctic_b0427.wav 219 | data/wav/arctic_training/slt/slt_arctic_b0428.wav 220 | data/wav/arctic_training/slt/slt_arctic_b0429.wav 221 | data/wav/arctic_training/slt/slt_arctic_b0430.wav 222 | data/wav/arctic_training/slt/slt_arctic_b0431.wav 223 | data/wav/arctic_training/slt/slt_arctic_b0432.wav 224 | data/wav/arctic_training/slt/slt_arctic_b0433.wav 225 | data/wav/arctic_training/slt/slt_arctic_b0434.wav 226 | data/wav/arctic_training/slt/slt_arctic_b0435.wav 227 | data/wav/arctic_training/slt/slt_arctic_b0436.wav 228 | data/wav/arctic_training/slt/slt_arctic_b0437.wav 229 | data/wav/arctic_training/slt/slt_arctic_b0438.wav 230 | data/wav/arctic_training/slt/slt_arctic_b0439.wav 231 | data/wav/arctic_training/slt/slt_arctic_b0440.wav 232 | data/wav/arctic_training/slt/slt_arctic_b0441.wav 233 | data/wav/arctic_training/slt/slt_arctic_b0442.wav 234 | data/wav/arctic_training/slt/slt_arctic_b0443.wav 235 | data/wav/arctic_training/slt/slt_arctic_b0444.wav 236 | data/wav/arctic_training/slt/slt_arctic_b0445.wav 237 | data/wav/arctic_training/slt/slt_arctic_b0446.wav 238 | data/wav/arctic_training/slt/slt_arctic_b0447.wav 239 | data/wav/arctic_training/slt/slt_arctic_b0448.wav 240 | data/wav/arctic_training/slt/slt_arctic_b0449.wav 241 | data/wav/arctic_training/slt/slt_arctic_b0450.wav 242 | data/wav/arctic_training/slt/slt_arctic_b0451.wav 243 | data/wav/arctic_training/slt/slt_arctic_b0452.wav 244 | data/wav/arctic_training/slt/slt_arctic_b0453.wav 245 | data/wav/arctic_training/slt/slt_arctic_b0454.wav 246 | data/wav/arctic_training/slt/slt_arctic_b0455.wav 247 | data/wav/arctic_training/slt/slt_arctic_b0456.wav 248 | data/wav/arctic_training/slt/slt_arctic_b0457.wav 249 | data/wav/arctic_training/slt/slt_arctic_b0458.wav 250 | data/wav/arctic_training/slt/slt_arctic_b0459.wav 251 | data/wav/arctic_training/slt/slt_arctic_b0460.wav 252 | data/wav/arctic_training/slt/slt_arctic_b0461.wav 253 | data/wav/arctic_training/slt/slt_arctic_b0462.wav 254 | data/wav/arctic_training/slt/slt_arctic_b0463.wav 255 | data/wav/arctic_training/slt/slt_arctic_b0464.wav 256 | data/wav/arctic_training/slt/slt_arctic_b0465.wav 257 | data/wav/arctic_training/slt/slt_arctic_b0466.wav 258 | data/wav/arctic_training/slt/slt_arctic_b0467.wav 259 | data/wav/arctic_training/slt/slt_arctic_b0468.wav 260 | data/wav/arctic_training/slt/slt_arctic_b0469.wav 261 | data/wav/arctic_training/slt/slt_arctic_b0470.wav 262 | data/wav/arctic_training/slt/slt_arctic_b0471.wav 263 | data/wav/arctic_training/slt/slt_arctic_b0472.wav 264 | data/wav/arctic_training/slt/slt_arctic_b0473.wav 265 | -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/checkpoint-400000steps.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/checkpoint-400000steps.pkl -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474.wav -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f0.50.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f0.50.wav -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f0.50_src.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f0.50_src.wav -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f2.00.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f2.00.wav -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f2.00_src.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_f2.00_src.wav -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_src.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/bdl_arctic_b0474_src.wav -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475.wav -------------------------------------------------------------------------------- /egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f0.50.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f0.50.wav
--------------------------------------------------------------------------------
/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f0.50_src.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f0.50_src.wav
--------------------------------------------------------------------------------
/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f2.00.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f2.00.wav
--------------------------------------------------------------------------------
/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f2.00_src.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_f2.00_src.wav
--------------------------------------------------------------------------------
/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_src.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/egs/arctic/exp/usfgan_arctic_train_16kHz_uSFGAN_60/samples/clb_arctic_b0475_src.wav
--------------------------------------------------------------------------------
/egs/arctic/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | # This code is modified from https://github.com/bigpon/QPPWG.
5 | 
6 | """PyTorch uSFGAN script
7 | Usage: run.py [-h] [-g GPUID]
8 | [-f FACTOR] [-C CONFIG]
9 | [-T TRAIN] [-V VALID] [-E EVAL]
10 | [-R RESUME] [-I ITER]
11 | [-0] [-1] [-2]
12 | 
13 | Options:
14 | -h, --help Show the help
15 | -g GPUID GPU device ID
16 | -f FACTOR F0 scaling factor
17 | -C CONFIG Name of config version
18 | -T TRAIN Training set
19 | -V VALID Validation set
20 | -E EVAL Evaluation set
21 | -R RESUME Iteration number of the model to resume
22 | -I ITER Iteration number of the model to test
23 | -0, --step0 Execute step0 (Feature extraction)
24 | -1, --step1 Execute step1 (uSFGAN training)
25 | -2, --step2 Execute step2 (uSFGAN decoding)
26 | 
27 | """
28 | import os
29 | from docopt import docopt
30 | 
31 | 
32 | # PATH INITIALIZATION
33 | def _path_initial(pathlist):
34 | for pathdir in pathlist:
35 | if not os.path.exists(pathdir):
36 | os.makedirs(pathdir)
37 | 
38 | 
39 | # PATH CHECK
40 | def _path_check(pathlist):
41 | for pathdir in pathlist:
42 | if not os.path.exists(pathdir):
43 | raise FileNotFoundError("%s doesn't exist!!" % pathdir)
44 | 
45 | 
46 | # PATH & PARAMETER SETTINGS
47 | LIBRARY_DIR = "/usr/local/cuda-11.0/lib64"
48 | CUDA_DIR = "/usr/local/cuda-11.0"
49 | PRJ_ROOT = "../.."
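# NOTE: LIBRARY_DIR and CUDA_DIR above are machine-specific; as the README's
# corpus/path setup step says, point them at the local CUDA installation
# (the cuda-11.0 paths are just the locations used by the authors).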
50 | SEED = 1
51 | DECODE_SEED = 100
52 | 
53 | # MAIN
54 | if __name__ == "__main__":
55 | args = docopt(__doc__)
56 | print(args)
57 | # step control
58 | execute_steps = [args["--step{}".format(step_index)] for step_index in range(0, 3)]
59 | if not any(execute_steps):
60 | raise RuntimeError("Please specify steps with options")
61 | # environment setting
62 | os.environ['LD_LIBRARY_PATH'] += ":" + LIBRARY_DIR
63 | os.environ['CUDA_HOME'] = CUDA_DIR
64 | os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
65 | if args['-g'] is not None:
66 | os.environ['CUDA_VISIBLE_DEVICES'] = args['-g']
67 | else:
68 | os.environ['CUDA_VISIBLE_DEVICES'] = "0"
69 | # path setting
70 | network = "usfgan"
71 | entry_fe = "usfgan-preprocess"
72 | entry_stats = "usfgan-compute-statistics"
73 | entry_train = "usfgan-train"
74 | entry_decode = "usfgan-decode"
75 | train_version = "arctic_train_16kHz" # training
76 | valid_version = "arctic_valid_16kHz" # validation
77 | eval_version = "arctic_eval_16kHz" # evaluation
78 | config_version = "uSFGAN_60" # config
79 | model_iters = "400000" # iteration of testing model
80 | f0_factor = "1.00" # scaling factor of f0
81 | if args['-f'] is not None:
82 | f0_factor = args['-f']
83 | if args['-T'] is not None:
84 | train_version = args['-T']
85 | if args['-V'] is not None:
86 | valid_version = args['-V']
87 | if args['-E'] is not None:
88 | eval_version = args['-E']
89 | if args['-C'] is not None:
90 | config_version = args['-C']
91 | if args['-I'] is not None:
92 | model_iters = args['-I']
93 | model_version = "%s_%s" % (network, train_version) # model name
94 | spkinfo = "data/pow_f0_dict.yaml"
95 | config = "conf/arctic.%s.yaml" % (config_version)
96 | stats = "data/stats/%s.joblib" % (train_version)
97 | outdir = "exp/%s_%s" % (model_version, config_version)
98 | train_wav = "data/scp/%s.scp" % (train_version)
99 | valid_wav = "data/scp/%s.scp" % (valid_version)
100 | eval_wav = "data/scp/%s.scp" % (eval_version)
101 | train_aux = "data/scp/%s.list" % (train_version)
102 | valid_aux = "data/scp/%s.list" % (valid_version)
103 | eval_aux = "data/scp/%s.list" % (eval_version)
104 | _path_check([config])
105 | 
106 | # FEATURE EXTRACTION
107 | if execute_steps[0]:
108 | inverse = True # If False, wav is restored from acoustic features
109 | split = "/" # Path split string
110 | spkidx = -2 # Speaker index of the split path
111 | # feature extraction
112 | for wav in [train_wav, valid_wav, eval_wav]:
113 | _path_check([wav])
114 | cmd = entry_fe + \
115 | " --audio " + wav + \
116 | " --indir " + "wav" + \
117 | " --outdir " + "hdf5" + \
118 | " --config " + config + \
119 | " --spkinfo " + spkinfo + \
120 | " --split " + split + \
121 | " --spkidx " + str(spkidx) + \
122 | " --inv " + str(inverse) + \
123 | " --verbose 1 "
124 | os.system(cmd)
125 | # calculate statistics
126 | _path_check([train_aux])
127 | cmd = entry_stats + \
128 | " --feats " + train_aux + \
129 | " --config " + config + \
130 | " --stats " + stats
131 | os.system(cmd)
132 | 
133 | # NETWORK TRAINING
134 | if execute_steps[1]:
135 | # resume setting
136 | if args['-R'] is not None:
137 | resume = "%s/checkpoint-%ssteps.pkl" % (outdir, args['-R'])
138 | else:
139 | resume = "None"
140 | # training
141 | cmd = entry_train + \
142 | " --train_audio " + train_wav + \
143 | " --train_feat " + train_aux + \
144 | " --valid_audio " + valid_wav + \
145 | " --valid_feat " + valid_aux + \
146 | " --stats " + stats + \
147 | " --outdir " + outdir + \
148 | " --config " + config + \
149 | " --resume " + resume + \
150 | " --seed 
" + str(SEED) + \ 151 | " --verbose 1 " 152 | os.system(cmd) 153 | 154 | # EVALUATION (ANALYSIS-SYNTHESIS) 155 | if execute_steps[2]: 156 | # path setting 157 | indir = "data/hdf5/" # input path of features 158 | outdir_eval = "%s/wav/%s/" % (outdir, model_iters) # wav output path 159 | # check trained model 160 | checkpoint = "%s/checkpoint-%ssteps.pkl" % (outdir, model_iters) 161 | _path_check([checkpoint]) 162 | 163 | # speech decoding 164 | cmd = entry_decode + \ 165 | " --eval_feat " + eval_aux + \ 166 | " --stats " + stats + \ 167 | " --indir " + indir + \ 168 | " --outdir " + outdir_eval + \ 169 | " --checkpoint " + checkpoint + \ 170 | " --config " + config + \ 171 | " --seed " + str(DECODE_SEED) + \ 172 | " --f0_factor " + f0_factor + \ 173 | " --verbose 1 " 174 | os.system(cmd) 175 | -------------------------------------------------------------------------------- /egs/arctic/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script is modified from https://github.com/bigpon/QPPWG. 5 | 6 | trainset=arctic_train_16kHz # training set 7 | validset=arctic_valid_16kHz # validation set 8 | evalset=arctic_eval_16kHz # evaluation set 9 | gpu=0 # gpu id 10 | conf=uSFGAN_60 # name of config 11 | resume=None # number of iteration of resume model 12 | iter=400000 # number of iteration of testing model 13 | scaled=0.50 # scaled ratio of f0 14 | stage= # running stage (0-3) 15 | # stage 0: Preprocessing 16 | # stage 1: uSFGAN training 17 | # stage 2: uSFGAN decoding (analysis-synthesis) 18 | # stage 3: uSFGAN decoding (scaled F0) 19 | . ../parse_options.sh || exit 1; 20 | 21 | export LD_LIBRARY_PATH='' 22 | export CUDA_HOME='' 23 | export CUDA_DEVICE_ORDER='' 24 | 25 | # Preprocessing 26 | if echo ${stage} | grep -q 0; then 27 | echo "Preprocessing." 28 | python run.py -C ${conf} -T ${trainset} -V ${validset} -E ${evalset} -0 29 | fi 30 | 31 | # uSFGAN training 32 | if echo ${stage} | grep -q 1; then 33 | echo "uSFGAN training." 34 | python run.py -g ${gpu} -C ${conf} \ 35 | -T ${trainset} -V ${validset} -R ${resume} -1 36 | fi 37 | 38 | # uSFGAN decoding w/ natural acoustic features 39 | if echo ${stage} | grep -q 2; then 40 | echo "uSFGAN decoding (natural)." 41 | python run.py -g ${gpu} -C ${conf} \ 42 | -T ${trainset} -E ${evalset} -I ${iter} -2 43 | fi 44 | 45 | # uSFGAN decoding w/ scaled F0 46 | if echo ${stage} | grep -q 3; then 47 | echo "uSFGAN decoding ( ${scaled} x F0)." 48 | python run.py -g ${gpu} -C ${conf} -f ${scaled}\ 49 | -T ${trainset} -E ${evalset} -I ${iter} -2 50 | fi -------------------------------------------------------------------------------- /egs/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); 4 | # Arnab Ghoshal, Karel Vesely 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 
16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | # Parse command-line options. 21 | # To be sourced by another script (as in ". parse_options.sh"). 22 | # Option format is: --option-name arg 23 | # and shell variable "option_name" gets set to value "arg." 24 | # The exception is --help, which takes no arguments, but prints the 25 | # $help_message variable (if defined). 26 | 27 | 28 | ### 29 | ### The --config file options have lower priority to command line 30 | ### options, so we need to import them first... 31 | ### 32 | 33 | # Now import all the configs specified by command-line, in left-to-right order 34 | for ((argpos=1; argpos<$#; argpos++)); do 35 | if [ "${!argpos}" == "--config" ]; then 36 | argpos_plus1=$((argpos+1)) 37 | config=${!argpos_plus1} 38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 39 | . $config # source the config file. 40 | fi 41 | done 42 | 43 | 44 | ### 45 | ### Now we process the command line options 46 | ### 47 | while true; do 48 | [ -z "${1:-}" ] && break; # break if there are no arguments 49 | case "$1" in 50 | # If the enclosing script is called with --help option, print the help 51 | # message and exit. Scripts should put help messages in $help_message 52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; 53 | else printf "$help_message\n" 1>&2 ; fi; 54 | exit 0 ;; 55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 56 | exit 1 ;; 57 | # If the first command-line argument begins with "--" (e.g. --foo-bar), 58 | # then work out the variable name as $name, which will equal "foo_bar". 59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 60 | # Next we test whether the variable in question is undefned-- if so it's 61 | # an invalid option and we die. Note: $0 evaluates to the name of the 62 | # enclosing script. 63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar 64 | # is undefined. We then have to wrap this test inside "eval" because 65 | # foo_bar is itself inside a variable ($name). 66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; 67 | 68 | oldval="`eval echo \\$$name`"; 69 | # Work out whether we seem to be expecting a Boolean argument. 70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | eval $name=\"$2\"; 79 | 80 | # Check that Boolean-valued arguments are really Boolean. 81 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 82 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 83 | exit 1; 84 | fi 85 | shift 2; 86 | ;; 87 | *) break; 88 | esac 89 | done 90 | 91 | 92 | # Check for an empty argument to the --cmd option, which can easily occur as a 93 | # result of scripting errors. 94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 95 | 96 | 97 | true; # so this script returns exit code 0. 98 | -------------------------------------------------------------------------------- /egs/vcc18/conf/vcc18.uSFGAN_60.yaml: -------------------------------------------------------------------------------- 1 | # This is the hyperparameter configuration file for Parallel WaveGAN. 2 | # Please make sure this is adjusted for the vcc2018. 
If you want to 3 | # apply it to another dataset, you might need to carefully change some parameters. 4 | # This configuration requires 12 GB GPU memory and takes ~3 days on TITAN V. 5 | 6 | ########################################################### 7 | # FEATURE EXTRACTION SETTING # 8 | ########################################################### 9 | feat_type: "world" # Feature type 10 | sampling_rate: 22050 # Sampling rate. 11 | fft_size: 1024 # FFT size. 12 | hop_size: 110 # Hop size. 13 | shiftms: 5 # Frame shift (ms) 14 | mcep_dim: 34 # Mcep dim 15 | mcep_alpha: 0.455 # Mcep alpha 16 | highpass_cutoff: 70 # Cutoff frequency of preprocessing highpass filter 17 | uv_dim_idx: 0 # u/v dimension index 18 | f0_dim_idx: 1 # f0 dimension index 19 | ap_dim_start: 37 # First dimension index of coded ap 20 | ap_dim_end: 39 # Last dimension index of coded ap 21 | mcep_dim_start: 2 # First dimension index of mcep 22 | mcep_dim_end: 37 # Last dimension index of mcep 23 | stats_shift: 1 # Skip U/V in calculating statistics 24 | f0_cont: False # Continuous F0 or discrete F0 25 | # In the original uSFGAN paper, continuous F0 is used, 26 | # but discrete F0 is better for reducing V/UV errors. 27 | minf0: 45 # Minimum f0 28 | maxf0: 450 # Maximum f0 29 | pow_th: -20 # Power threshold 30 | mean_path: "/world/mean" 31 | scale_path: "/world/scale" 32 | 33 | ########################################################### 34 | # GENERATOR NETWORK ARCHITECTURE SETTING # 35 | ########################################################### 36 | generator_type: "USFGANGenerator" # Generator type. 37 | generator_params: 38 | sampling_rate: 22050 # Sampling rate. 39 | hop_size: 110 # Hop size. 40 | in_channels: 1 # Number of input channels. 41 | out_channels: 1 # Number of output channels. 42 | blockFs: [0, 30] # Number of fixed residual blocks. 43 | cycleFs: [0, 3] # Number of fixed dilation cycles. 44 | blockAs: [30, 0] # Number of adaptive residual blocks. 45 | cycleAs: [6, 0] # Number of adaptive dilation cycles. 46 | cascade_modes: [1, 1] # Network cascaded mode (0: fix->adaptive; 1: adaptive->fix). 47 | residual_channels: 64 # Number of channels in residual conv. 48 | gate_channels: 128 # Number of channels in gated conv. 49 | skip_channels: 64 # Number of channels in skip conv. 50 | aux_channels: 39 # Number of channels for auxiliary feature conv. 51 | aux_context_window: 2 # Context window size for auxiliary feature. 52 | # If set to 2, previous 2 and future 2 frames will be considered. 53 | upsample_params: # Upsampling network parameters. 54 | upsample_scales: [5, 2, 11, 1] # Upsampling scales. Product of these must be the same as hop size. 55 | 56 | ########################################################### 57 | # DISCRIMINATOR NETWORK ARCHITECTURE SETTING # 58 | ########################################################### 59 | discriminator_type: "PWGDiscriminator" 60 | discriminator_params: 61 | in_channels: 1 # Number of input channels. 62 | out_channels: 1 # Number of output channels. 63 | kernel_size: 3 # Kernel size of conv layers. 64 | layers: 10 # Number of conv layers. 65 | conv_channels: 64 # Number of channels in conv layers. 66 | bias: true # Whether to use bias parameter in conv. 67 | use_weight_norm: true # Whether to use weight norm. 68 | # If set to true, it will be applied to all of the conv layers. 69 | nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv. 70 | nonlinear_activation_params: # Nonlinear function parameters 71 | negative_slope: 0.2 # Alpha in LeakyReLU. 
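# Sanity check: the generator's upsample_scales above multiply to 5 * 2 * 11 * 1 = 110, which matches hop_size as required.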
72 | 73 | ########################################################### 74 | # STFT LOSS SETTING # 75 | ########################################################### 76 | stft_loss_params: 77 | fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. 78 | hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss 79 | win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. 80 | window: "hann_window" # Window function for STFT-based loss 81 | 82 | ########################################################### 83 | # SOURCE LOSS SETTING # 84 | ########################################################### 85 | source_loss_params: 86 | sampling_rate: 22050 # Sampling rate. 87 | hop_size: 110 # Hop size. 88 | fft_size: 2048 # FFT size. 89 | f0_floor: 45 # Minimum F0 value. 90 | f0_ceil: 450 # Maximum F0 value. 91 | # 'fft_size' must be larger than (3.0 * 'sampling_rate' / 'f0_floor') 92 | 93 | ########################################################### 94 | # ADVERSARIAL LOSS SETTING # 95 | ########################################################### 96 | lambda_source: 1.0 # Loss balancing coefficient. 97 | lambda_adv: 4.0 # Loss balancing coefficient. 98 | 99 | ########################################################### 100 | # DATA LOADER SETTING # 101 | ########################################################### 102 | batch_size: 6 # Batch size. 103 | batch_max_steps: 22550 # Length of each audio in batch. Make sure dividable by hop_size. 104 | pin_memory: true # Whether to pin memory in Pytorch DataLoader. 105 | num_workers: 2 # Number of workers in Pytorch DataLoader. 106 | remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. 107 | allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. 108 | 109 | ########################################################### 110 | # OPTIMIZER & SCHEDULER SETTING # 111 | ########################################################### 112 | generator_optimizer_params: 113 | lr: 0.0001 # Generator's learning rate. 114 | eps: 1.0e-6 # Generator's epsilon. 115 | weight_decay: 0.0 # Generator's weight decay coefficient. 116 | generator_scheduler_params: 117 | step_size: 200000 # Generator's scheduler step size. 118 | gamma: 0.5 # Generator's scheduler gamma. 119 | # At each step size, lr will be multiplied by this parameter. 120 | generator_grad_norm: 10 # Generator's gradient norm. 121 | discriminator_optimizer_params: 122 | lr: 0.00005 # Discriminator's learning rate. 123 | eps: 1.0e-6 # Discriminator's epsilon. 124 | weight_decay: 0.0 # Discriminator's weight decay coefficient. 125 | discriminator_scheduler_params: 126 | step_size: 200000 # Discriminator's scheduler step size. 127 | gamma: 0.5 # Discriminator's scheduler gamma. 128 | # At each step size, lr will be multiplied by this parameter. 129 | discriminator_grad_norm: 1 # Discriminator's gradient norm. 130 | 131 | ########################################################### 132 | # INTERVAL SETTING # 133 | ########################################################### 134 | discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. 135 | train_max_steps: 400000 # Number of training steps. 136 | save_interval_steps: 5000 # Interval steps to save checkpoint. 137 | eval_interval_steps: 1000 # Interval steps to evaluate the network. 138 | log_interval_steps: 100 # Interval steps to record the training log. 
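# Sanity checks: batch_max_steps is divisible by hop_size (22550 / 110 = 205 frames), and the source-loss fft_size constraint above holds (3.0 * 22050 / 45 = 1470 < 2048).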
139 | 140 | ########################################################### 141 | # OTHER SETTING # 142 | ########################################################### 143 | num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. -------------------------------------------------------------------------------- /egs/vcc18/data/pow_f0_dict.yml: -------------------------------------------------------------------------------- 1 | VCC2SF1: 2 | f0_max: 450 3 | f0_min: 100 4 | pow_th: -31 5 | VCC2SF2: 6 | f0_max: 350 7 | f0_min: 110 8 | pow_th: -31 9 | VCC2SF3: 10 | f0_max: 340 11 | f0_min: 110 12 | pow_th: -38 13 | VCC2SF4: 14 | f0_max: 330 15 | f0_min: 120 16 | pow_th: -34 17 | VCC2SM1: 18 | f0_max: 200 19 | f0_min: 50 20 | pow_th: -31 21 | VCC2SM2: 22 | f0_max: 300 23 | f0_min: 70 24 | pow_th: -40 25 | VCC2SM3: 26 | f0_max: 220 27 | f0_min: 45 28 | pow_th: -35 29 | VCC2SM4: 30 | f0_max: 260 31 | f0_min: 45 32 | pow_th: -32 33 | VCC2TF1: 34 | f0_max: 350 35 | f0_min: 140 36 | pow_th: -45 37 | VCC2TF2: 38 | f0_max: 400 39 | f0_min: 100 40 | pow_th: -30 41 | VCC2TM1: 42 | f0_max: 200 43 | f0_min: 60 44 | pow_th: -23 45 | VCC2TM2: 46 | f0_max: 280 47 | f0_min: 50 48 | pow_th: -31 49 | bdl: 50 | f0_max: 240 51 | f0_min: 50 52 | pow_th: -30 53 | slt: 54 | f0_max: 340 55 | f0_min: 130 56 | pow_th: -28 57 | -------------------------------------------------------------------------------- /egs/vcc18/data/scp/vcc18_eval_22kHz.scp: -------------------------------------------------------------------------------- 1 | data/wav/vcc2018_evaluation/VCC2SF3/30001.wav 2 | data/wav/vcc2018_evaluation/VCC2SF3/30002.wav 3 | data/wav/vcc2018_evaluation/VCC2SF3/30003.wav 4 | data/wav/vcc2018_evaluation/VCC2SF3/30004.wav 5 | data/wav/vcc2018_evaluation/VCC2SF3/30005.wav 6 | data/wav/vcc2018_evaluation/VCC2SF3/30006.wav 7 | data/wav/vcc2018_evaluation/VCC2SF3/30007.wav 8 | data/wav/vcc2018_evaluation/VCC2SF3/30008.wav 9 | data/wav/vcc2018_evaluation/VCC2SF3/30009.wav 10 | data/wav/vcc2018_evaluation/VCC2SF3/30010.wav 11 | data/wav/vcc2018_evaluation/VCC2SF3/30011.wav 12 | data/wav/vcc2018_evaluation/VCC2SF3/30012.wav 13 | data/wav/vcc2018_evaluation/VCC2SF3/30013.wav 14 | data/wav/vcc2018_evaluation/VCC2SF3/30014.wav 15 | data/wav/vcc2018_evaluation/VCC2SF3/30015.wav 16 | data/wav/vcc2018_evaluation/VCC2SF3/30016.wav 17 | data/wav/vcc2018_evaluation/VCC2SF3/30017.wav 18 | data/wav/vcc2018_evaluation/VCC2SF3/30018.wav 19 | data/wav/vcc2018_evaluation/VCC2SF3/30019.wav 20 | data/wav/vcc2018_evaluation/VCC2SF3/30020.wav 21 | data/wav/vcc2018_evaluation/VCC2SF3/30021.wav 22 | data/wav/vcc2018_evaluation/VCC2SF3/30022.wav 23 | data/wav/vcc2018_evaluation/VCC2SF3/30023.wav 24 | data/wav/vcc2018_evaluation/VCC2SF3/30024.wav 25 | data/wav/vcc2018_evaluation/VCC2SF3/30025.wav 26 | data/wav/vcc2018_evaluation/VCC2SF3/30026.wav 27 | data/wav/vcc2018_evaluation/VCC2SF3/30027.wav 28 | data/wav/vcc2018_evaluation/VCC2SF3/30028.wav 29 | data/wav/vcc2018_evaluation/VCC2SF3/30029.wav 30 | data/wav/vcc2018_evaluation/VCC2SF3/30030.wav 31 | data/wav/vcc2018_evaluation/VCC2SF3/30031.wav 32 | data/wav/vcc2018_evaluation/VCC2SF3/30032.wav 33 | data/wav/vcc2018_evaluation/VCC2SF3/30033.wav 34 | data/wav/vcc2018_evaluation/VCC2SF3/30034.wav 35 | data/wav/vcc2018_evaluation/VCC2SF3/30035.wav 36 | data/wav/vcc2018_evaluation/VCC2SF4/30001.wav 37 | data/wav/vcc2018_evaluation/VCC2SF4/30002.wav 38 | data/wav/vcc2018_evaluation/VCC2SF4/30003.wav 39 | data/wav/vcc2018_evaluation/VCC2SF4/30004.wav 40 | 
data/wav/vcc2018_evaluation/VCC2SF4/30005.wav 41 | data/wav/vcc2018_evaluation/VCC2SF4/30006.wav 42 | data/wav/vcc2018_evaluation/VCC2SF4/30007.wav 43 | data/wav/vcc2018_evaluation/VCC2SF4/30008.wav 44 | data/wav/vcc2018_evaluation/VCC2SF4/30009.wav 45 | data/wav/vcc2018_evaluation/VCC2SF4/30010.wav 46 | data/wav/vcc2018_evaluation/VCC2SF4/30011.wav 47 | data/wav/vcc2018_evaluation/VCC2SF4/30012.wav 48 | data/wav/vcc2018_evaluation/VCC2SF4/30013.wav 49 | data/wav/vcc2018_evaluation/VCC2SF4/30014.wav 50 | data/wav/vcc2018_evaluation/VCC2SF4/30015.wav 51 | data/wav/vcc2018_evaluation/VCC2SF4/30016.wav 52 | data/wav/vcc2018_evaluation/VCC2SF4/30017.wav 53 | data/wav/vcc2018_evaluation/VCC2SF4/30018.wav 54 | data/wav/vcc2018_evaluation/VCC2SF4/30019.wav 55 | data/wav/vcc2018_evaluation/VCC2SF4/30020.wav 56 | data/wav/vcc2018_evaluation/VCC2SF4/30021.wav 57 | data/wav/vcc2018_evaluation/VCC2SF4/30022.wav 58 | data/wav/vcc2018_evaluation/VCC2SF4/30023.wav 59 | data/wav/vcc2018_evaluation/VCC2SF4/30024.wav 60 | data/wav/vcc2018_evaluation/VCC2SF4/30025.wav 61 | data/wav/vcc2018_evaluation/VCC2SF4/30026.wav 62 | data/wav/vcc2018_evaluation/VCC2SF4/30027.wav 63 | data/wav/vcc2018_evaluation/VCC2SF4/30028.wav 64 | data/wav/vcc2018_evaluation/VCC2SF4/30029.wav 65 | data/wav/vcc2018_evaluation/VCC2SF4/30030.wav 66 | data/wav/vcc2018_evaluation/VCC2SF4/30031.wav 67 | data/wav/vcc2018_evaluation/VCC2SF4/30032.wav 68 | data/wav/vcc2018_evaluation/VCC2SF4/30033.wav 69 | data/wav/vcc2018_evaluation/VCC2SF4/30034.wav 70 | data/wav/vcc2018_evaluation/VCC2SF4/30035.wav 71 | data/wav/vcc2018_evaluation/VCC2SM3/30001.wav 72 | data/wav/vcc2018_evaluation/VCC2SM3/30002.wav 73 | data/wav/vcc2018_evaluation/VCC2SM3/30003.wav 74 | data/wav/vcc2018_evaluation/VCC2SM3/30004.wav 75 | data/wav/vcc2018_evaluation/VCC2SM3/30005.wav 76 | data/wav/vcc2018_evaluation/VCC2SM3/30006.wav 77 | data/wav/vcc2018_evaluation/VCC2SM3/30007.wav 78 | data/wav/vcc2018_evaluation/VCC2SM3/30008.wav 79 | data/wav/vcc2018_evaluation/VCC2SM3/30009.wav 80 | data/wav/vcc2018_evaluation/VCC2SM3/30010.wav 81 | data/wav/vcc2018_evaluation/VCC2SM3/30011.wav 82 | data/wav/vcc2018_evaluation/VCC2SM3/30012.wav 83 | data/wav/vcc2018_evaluation/VCC2SM3/30013.wav 84 | data/wav/vcc2018_evaluation/VCC2SM3/30014.wav 85 | data/wav/vcc2018_evaluation/VCC2SM3/30015.wav 86 | data/wav/vcc2018_evaluation/VCC2SM3/30016.wav 87 | data/wav/vcc2018_evaluation/VCC2SM3/30017.wav 88 | data/wav/vcc2018_evaluation/VCC2SM3/30018.wav 89 | data/wav/vcc2018_evaluation/VCC2SM3/30019.wav 90 | data/wav/vcc2018_evaluation/VCC2SM3/30020.wav 91 | data/wav/vcc2018_evaluation/VCC2SM3/30021.wav 92 | data/wav/vcc2018_evaluation/VCC2SM3/30022.wav 93 | data/wav/vcc2018_evaluation/VCC2SM3/30023.wav 94 | data/wav/vcc2018_evaluation/VCC2SM3/30024.wav 95 | data/wav/vcc2018_evaluation/VCC2SM3/30025.wav 96 | data/wav/vcc2018_evaluation/VCC2SM3/30026.wav 97 | data/wav/vcc2018_evaluation/VCC2SM3/30027.wav 98 | data/wav/vcc2018_evaluation/VCC2SM3/30028.wav 99 | data/wav/vcc2018_evaluation/VCC2SM3/30029.wav 100 | data/wav/vcc2018_evaluation/VCC2SM3/30030.wav 101 | data/wav/vcc2018_evaluation/VCC2SM3/30031.wav 102 | data/wav/vcc2018_evaluation/VCC2SM3/30032.wav 103 | data/wav/vcc2018_evaluation/VCC2SM3/30033.wav 104 | data/wav/vcc2018_evaluation/VCC2SM3/30034.wav 105 | data/wav/vcc2018_evaluation/VCC2SM3/30035.wav 106 | data/wav/vcc2018_evaluation/VCC2SM4/30001.wav 107 | data/wav/vcc2018_evaluation/VCC2SM4/30002.wav 108 | data/wav/vcc2018_evaluation/VCC2SM4/30003.wav 109 | 
data/wav/vcc2018_evaluation/VCC2SM4/30004.wav 110 | data/wav/vcc2018_evaluation/VCC2SM4/30005.wav 111 | data/wav/vcc2018_evaluation/VCC2SM4/30006.wav 112 | data/wav/vcc2018_evaluation/VCC2SM4/30007.wav 113 | data/wav/vcc2018_evaluation/VCC2SM4/30008.wav 114 | data/wav/vcc2018_evaluation/VCC2SM4/30009.wav 115 | data/wav/vcc2018_evaluation/VCC2SM4/30010.wav 116 | data/wav/vcc2018_evaluation/VCC2SM4/30011.wav 117 | data/wav/vcc2018_evaluation/VCC2SM4/30012.wav 118 | data/wav/vcc2018_evaluation/VCC2SM4/30013.wav 119 | data/wav/vcc2018_evaluation/VCC2SM4/30014.wav 120 | data/wav/vcc2018_evaluation/VCC2SM4/30015.wav 121 | data/wav/vcc2018_evaluation/VCC2SM4/30016.wav 122 | data/wav/vcc2018_evaluation/VCC2SM4/30017.wav 123 | data/wav/vcc2018_evaluation/VCC2SM4/30018.wav 124 | data/wav/vcc2018_evaluation/VCC2SM4/30019.wav 125 | data/wav/vcc2018_evaluation/VCC2SM4/30020.wav 126 | data/wav/vcc2018_evaluation/VCC2SM4/30021.wav 127 | data/wav/vcc2018_evaluation/VCC2SM4/30022.wav 128 | data/wav/vcc2018_evaluation/VCC2SM4/30023.wav 129 | data/wav/vcc2018_evaluation/VCC2SM4/30024.wav 130 | data/wav/vcc2018_evaluation/VCC2SM4/30025.wav 131 | data/wav/vcc2018_evaluation/VCC2SM4/30026.wav 132 | data/wav/vcc2018_evaluation/VCC2SM4/30027.wav 133 | data/wav/vcc2018_evaluation/VCC2SM4/30028.wav 134 | data/wav/vcc2018_evaluation/VCC2SM4/30029.wav 135 | data/wav/vcc2018_evaluation/VCC2SM4/30030.wav 136 | data/wav/vcc2018_evaluation/VCC2SM4/30031.wav 137 | data/wav/vcc2018_evaluation/VCC2SM4/30032.wav 138 | data/wav/vcc2018_evaluation/VCC2SM4/30033.wav 139 | data/wav/vcc2018_evaluation/VCC2SM4/30034.wav 140 | data/wav/vcc2018_evaluation/VCC2SM4/30035.wav 141 | -------------------------------------------------------------------------------- /egs/vcc18/data/scp/vcc18_valid_22kHz.scp: -------------------------------------------------------------------------------- 1 | data/wav/vcc2018_training/VCC2SF1/10001.wav 2 | data/wav/vcc2018_training/VCC2SF1/10002.wav 3 | data/wav/vcc2018_training/VCC2SF1/10003.wav 4 | data/wav/vcc2018_training/VCC2SF1/10004.wav 5 | data/wav/vcc2018_training/VCC2SF1/10005.wav 6 | data/wav/vcc2018_training/VCC2SF1/10006.wav 7 | data/wav/vcc2018_training/VCC2SF1/10007.wav 8 | data/wav/vcc2018_training/VCC2SF1/10008.wav 9 | data/wav/vcc2018_training/VCC2SF1/10009.wav 10 | data/wav/vcc2018_training/VCC2SF1/10010.wav 11 | data/wav/vcc2018_training/VCC2SF2/10001.wav 12 | data/wav/vcc2018_training/VCC2SF2/10002.wav 13 | data/wav/vcc2018_training/VCC2SF2/10003.wav 14 | data/wav/vcc2018_training/VCC2SF2/10004.wav 15 | data/wav/vcc2018_training/VCC2SF2/10005.wav 16 | data/wav/vcc2018_training/VCC2SF2/10006.wav 17 | data/wav/vcc2018_training/VCC2SF2/10007.wav 18 | data/wav/vcc2018_training/VCC2SF2/10008.wav 19 | data/wav/vcc2018_training/VCC2SF2/10009.wav 20 | data/wav/vcc2018_training/VCC2SF2/10010.wav 21 | data/wav/vcc2018_training/VCC2SF3/20001.wav 22 | data/wav/vcc2018_training/VCC2SF3/20002.wav 23 | data/wav/vcc2018_training/VCC2SF3/20003.wav 24 | data/wav/vcc2018_training/VCC2SF3/20004.wav 25 | data/wav/vcc2018_training/VCC2SF3/20005.wav 26 | data/wav/vcc2018_training/VCC2SF3/20006.wav 27 | data/wav/vcc2018_training/VCC2SF3/20007.wav 28 | data/wav/vcc2018_training/VCC2SF3/20008.wav 29 | data/wav/vcc2018_training/VCC2SF3/20009.wav 30 | data/wav/vcc2018_training/VCC2SF3/20010.wav 31 | data/wav/vcc2018_training/VCC2SF4/20001.wav 32 | data/wav/vcc2018_training/VCC2SF4/20002.wav 33 | data/wav/vcc2018_training/VCC2SF4/20003.wav 34 | data/wav/vcc2018_training/VCC2SF4/20004.wav 35 | 
data/wav/vcc2018_training/VCC2SF4/20005.wav 36 | data/wav/vcc2018_training/VCC2SF4/20006.wav 37 | data/wav/vcc2018_training/VCC2SF4/20007.wav 38 | data/wav/vcc2018_training/VCC2SF4/20008.wav 39 | data/wav/vcc2018_training/VCC2SF4/20009.wav 40 | data/wav/vcc2018_training/VCC2SF4/20010.wav 41 | data/wav/vcc2018_training/VCC2SM1/10001.wav 42 | data/wav/vcc2018_training/VCC2SM1/10002.wav 43 | data/wav/vcc2018_training/VCC2SM1/10003.wav 44 | data/wav/vcc2018_training/VCC2SM1/10004.wav 45 | data/wav/vcc2018_training/VCC2SM1/10005.wav 46 | data/wav/vcc2018_training/VCC2SM1/10006.wav 47 | data/wav/vcc2018_training/VCC2SM1/10007.wav 48 | data/wav/vcc2018_training/VCC2SM1/10008.wav 49 | data/wav/vcc2018_training/VCC2SM1/10009.wav 50 | data/wav/vcc2018_training/VCC2SM1/10010.wav 51 | data/wav/vcc2018_training/VCC2SM2/10001.wav 52 | data/wav/vcc2018_training/VCC2SM2/10002.wav 53 | data/wav/vcc2018_training/VCC2SM2/10003.wav 54 | data/wav/vcc2018_training/VCC2SM2/10004.wav 55 | data/wav/vcc2018_training/VCC2SM2/10005.wav 56 | data/wav/vcc2018_training/VCC2SM2/10006.wav 57 | data/wav/vcc2018_training/VCC2SM2/10007.wav 58 | data/wav/vcc2018_training/VCC2SM2/10008.wav 59 | data/wav/vcc2018_training/VCC2SM2/10009.wav 60 | data/wav/vcc2018_training/VCC2SM2/10010.wav 61 | data/wav/vcc2018_training/VCC2SM3/20001.wav 62 | data/wav/vcc2018_training/VCC2SM3/20002.wav 63 | data/wav/vcc2018_training/VCC2SM3/20003.wav 64 | data/wav/vcc2018_training/VCC2SM3/20004.wav 65 | data/wav/vcc2018_training/VCC2SM3/20005.wav 66 | data/wav/vcc2018_training/VCC2SM3/20006.wav 67 | data/wav/vcc2018_training/VCC2SM3/20007.wav 68 | data/wav/vcc2018_training/VCC2SM3/20008.wav 69 | data/wav/vcc2018_training/VCC2SM3/20009.wav 70 | data/wav/vcc2018_training/VCC2SM3/20010.wav 71 | data/wav/vcc2018_training/VCC2SM4/20001.wav 72 | data/wav/vcc2018_training/VCC2SM4/20002.wav 73 | data/wav/vcc2018_training/VCC2SM4/20003.wav 74 | data/wav/vcc2018_training/VCC2SM4/20004.wav 75 | data/wav/vcc2018_training/VCC2SM4/20005.wav 76 | data/wav/vcc2018_training/VCC2SM4/20006.wav 77 | data/wav/vcc2018_training/VCC2SM4/20007.wav 78 | data/wav/vcc2018_training/VCC2SM4/20008.wav 79 | data/wav/vcc2018_training/VCC2SM4/20009.wav 80 | data/wav/vcc2018_training/VCC2SM4/20010.wav 81 | data/wav/vcc2018_training/VCC2TF1/10001.wav 82 | data/wav/vcc2018_training/VCC2TF1/10002.wav 83 | data/wav/vcc2018_training/VCC2TF1/10003.wav 84 | data/wav/vcc2018_training/VCC2TF1/10004.wav 85 | data/wav/vcc2018_training/VCC2TF1/10005.wav 86 | data/wav/vcc2018_training/VCC2TF1/10006.wav 87 | data/wav/vcc2018_training/VCC2TF1/10007.wav 88 | data/wav/vcc2018_training/VCC2TF1/10008.wav 89 | data/wav/vcc2018_training/VCC2TF1/10009.wav 90 | data/wav/vcc2018_training/VCC2TF1/10010.wav 91 | data/wav/vcc2018_training/VCC2TF2/10001.wav 92 | data/wav/vcc2018_training/VCC2TF2/10002.wav 93 | data/wav/vcc2018_training/VCC2TF2/10003.wav 94 | data/wav/vcc2018_training/VCC2TF2/10004.wav 95 | data/wav/vcc2018_training/VCC2TF2/10005.wav 96 | data/wav/vcc2018_training/VCC2TF2/10006.wav 97 | data/wav/vcc2018_training/VCC2TF2/10007.wav 98 | data/wav/vcc2018_training/VCC2TF2/10008.wav 99 | data/wav/vcc2018_training/VCC2TF2/10009.wav 100 | data/wav/vcc2018_training/VCC2TF2/10010.wav 101 | data/wav/vcc2018_training/VCC2TM1/10001.wav 102 | data/wav/vcc2018_training/VCC2TM1/10002.wav 103 | data/wav/vcc2018_training/VCC2TM1/10003.wav 104 | data/wav/vcc2018_training/VCC2TM1/10004.wav 105 | data/wav/vcc2018_training/VCC2TM1/10005.wav 106 | data/wav/vcc2018_training/VCC2TM1/10006.wav 107 | 
data/wav/vcc2018_training/VCC2TM1/10007.wav 108 | data/wav/vcc2018_training/VCC2TM1/10008.wav 109 | data/wav/vcc2018_training/VCC2TM1/10009.wav 110 | data/wav/vcc2018_training/VCC2TM1/10010.wav 111 | data/wav/vcc2018_training/VCC2TM2/10001.wav 112 | data/wav/vcc2018_training/VCC2TM2/10002.wav 113 | data/wav/vcc2018_training/VCC2TM2/10003.wav 114 | data/wav/vcc2018_training/VCC2TM2/10004.wav 115 | data/wav/vcc2018_training/VCC2TM2/10005.wav 116 | data/wav/vcc2018_training/VCC2TM2/10006.wav 117 | data/wav/vcc2018_training/VCC2TM2/10007.wav 118 | data/wav/vcc2018_training/VCC2TM2/10008.wav 119 | data/wav/vcc2018_training/VCC2TM2/10009.wav 120 | data/wav/vcc2018_training/VCC2TM2/10010.wav 121 | -------------------------------------------------------------------------------- /egs/vcc18/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This code is modified from https://github.com/bigpon/QPPWG. 5 | 6 | """PyTorch uSFGAN script 7 | Usage: run.py [-h] [-g GPUID] 8 | [-f FACTOR] [-C CONFIG] 9 | [-T TRAIN] [-V VALID] [-E EVAL] 10 | [-R RESUME] [-I ITER] 11 | [-0] [-1] [-2] 12 | 13 | Options: 14 | -h, --help Show the help 15 | -g GPUID GPU device ID 16 | -f FACTOR F0 scaled factor 17 | -C CONFIG Name of config version 18 | -T TRAIN Training set 19 | -V VALID Validation set 20 | -E EVAL Evaluation set 21 | -R RESUME Number of iteration to resume model 22 | -I ITER Number of iteration of testing model 23 | -0, --step0 Execute step0 (Feature extraction) 24 | -1, --step1 Execute step1 (uSFGAN training) 25 | -2, --step2 Execute step2 (uSFGAN decoding) 26 | 27 | """ 28 | import os 29 | from docopt import docopt 30 | 31 | 32 | # PATH INITIALIZATION 33 | def _path_initial(pathlist): 34 | for pathdir in pathlist: 35 | if not os.path.exists(pathdir): 36 | os.makedirs(pathdir) 37 | 38 | 39 | # PATH CHECK 40 | def _path_check(pathlist): 41 | for pathdir in pathlist: 42 | if not os.path.exists(pathdir): 43 | raise FileNotFoundError("%s doesn't exist!!" % pathdir) 44 | 45 | 46 | # PATH & PARAMETER SETTINGS 47 | LIBRARY_DIR = "/usr/local/cuda-11.0/lib64" 48 | CUDA_DIR = "/usr/local/cuda-11.0" 49 | PRJ_ROOT = "../.."
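# Example invocations (illustrative; they mirror the stages in egs/vcc18/run.sh):
#   python run.py -C uSFGAN_40 -T vcc18_train_22kHz -V vcc18_valid_22kHz -E vcc18_eval_22kHz -0  # preprocessing
#   python run.py -g 0 -C uSFGAN_40 -T vcc18_train_22kHz -V vcc18_valid_22kHz -1                 # training
#   python run.py -g 0 -C uSFGAN_40 -T vcc18_train_22kHz -E vcc18_eval_22kHz -I 400000 -2        # decoding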
50 | SEED = 1 51 | DECODE_SEED = 100 52 | 53 | # MAIN 54 | if __name__ == "__main__": 55 | args = docopt(__doc__) 56 | print(args) 57 | # step control 58 | execute_steps = [args["--step{}".format(step_index)] for step_index in range(0, 3)] 59 | if not any(execute_steps): 60 | raise ValueError("Please specify steps with options") 61 | # environment setting 62 | os.environ['LD_LIBRARY_PATH'] = os.environ.get('LD_LIBRARY_PATH', '') + ":" + LIBRARY_DIR 63 | os.environ['CUDA_HOME'] = CUDA_DIR 64 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 65 | if args['-g'] is not None: 66 | os.environ['CUDA_VISIBLE_DEVICES'] = args['-g'] 67 | else: 68 | os.environ['CUDA_VISIBLE_DEVICES'] = "0" 69 | # path setting 70 | network = "usfgan" 71 | entry_fe = "usfgan-preprocess" 72 | entry_stats = "usfgan-compute-statistics" 73 | entry_train = "usfgan-train" 74 | entry_decode = "usfgan-decode" 75 | train_version = "vcc18_train_22kHz" # training 76 | valid_version = "vcc18_valid_22kHz" # validation 77 | eval_version = "vcc18_eval_22kHz" # evaluation 78 | config_version = "uSFGAN_40" # config 79 | model_iters = "400000" # iteration of testing model 80 | f0_factor = "1.00" # scaled factor of f0 81 | if args['-f'] is not None: 82 | f0_factor = args['-f'] 83 | if args['-T'] is not None: 84 | train_version = args['-T'] 85 | if args['-V'] is not None: 86 | valid_version = args['-V'] 87 | if args['-E'] is not None: 88 | eval_version = args['-E'] 89 | if args['-C'] is not None: 90 | config_version = args['-C'] 91 | if args['-I'] is not None: 92 | model_iters = args['-I'] 93 | model_version = "%s_%s" % (network, train_version) # model name 94 | spkinfo = "data/pow_f0_dict.yml" 95 | config = "conf/vcc18.%s.yaml" % (config_version) 96 | stats = "data/stats/%s.joblib" % (train_version) 97 | outdir = "exp/%s_%s" % (model_version, config_version) 98 | train_wav = "data/scp/%s.scp" % (train_version) 99 | valid_wav = "data/scp/%s.scp" % (valid_version) 100 | eval_wav = "data/scp/%s.scp" % (eval_version) 101 | train_aux = "data/scp/%s.list" % (train_version) 102 | valid_aux = "data/scp/%s.list" % (valid_version) 103 | eval_aux = "data/scp/%s.list" % (eval_version) 104 | _path_check([config]) 105 | 106 | # FEATURE EXTRACTION 107 | if execute_steps[0]: 108 | inverse = True # If False, wav is restored from acoustic features 109 | split = "/" # Path split string 110 | spkidx = -2 # Speaker index of the split path 111 | # feature extraction 112 | for wav in [train_wav, valid_wav, eval_wav]: 113 | _path_check([wav]) 114 | cmd = entry_fe + \ 115 | " --audio " + wav + \ 116 | " --indir " + "wav" + \ 117 | " --outdir " + "hdf5" + \ 118 | " --config " + config + \ 119 | " --spkinfo " + spkinfo + \ 120 | " --split " + split + \ 121 | " --spkidx " + str(spkidx) + \ 122 | " --inv " + str(inverse) + \ 123 | " --verbose 1 " 124 | os.system(cmd) 125 | # calculate statistics 126 | _path_check([train_aux]) 127 | cmd = entry_stats + \ 128 | " --feats " + train_aux + \ 129 | " --config " + config + \ 130 | " --stats " + stats 131 | os.system(cmd) 132 | 133 | # NETWORK TRAINING 134 | if execute_steps[1]: 135 | # resume setting 136 | if args['-R'] is not None: 137 | resume = "%s/checkpoint-%ssteps.pkl" % (outdir, args['-R']) 138 | else: 139 | resume = "None" 140 | # training 141 | cmd = entry_train + \ 142 | " --train_audio " + train_wav + \ 143 | " --train_feat " + train_aux + \ 144 | " --valid_audio " + valid_wav + \ 145 | " --valid_feat " + valid_aux + \ 146 | " --stats " + stats + \ 147 | " --outdir " + outdir + \ 148 | " --config " + config + \ 149 | " --resume " + resume + \ 150 | " --seed " + 
str(SEED) + \ 151 | " --verbose 1 " 152 | os.system(cmd) 153 | 154 | # EVALUATION (ANALYSIS-SYNTHESIS) 155 | if execute_steps[2]: 156 | # path setting 157 | indir = "data/hdf5/" # input path of features 158 | outdir_eval = "%s/wav/%s/" % (outdir, model_iters) # wav output path 159 | # check trained model 160 | checkpoint = "%s/checkpoint-%ssteps.pkl" % (outdir, model_iters) 161 | _path_check([checkpoint]) 162 | 163 | # speech decoding 164 | cmd = entry_decode + \ 165 | " --eval_feat " + eval_aux + \ 166 | " --stats " + stats + \ 167 | " --indir " + indir + \ 168 | " --outdir " + outdir_eval + \ 169 | " --checkpoint " + checkpoint + \ 170 | " --config " + config + \ 171 | " --seed " + str(DECODE_SEED) + \ 172 | " --f0_factor " + f0_factor + \ 173 | " --verbose 1 " 174 | os.system(cmd) 175 | -------------------------------------------------------------------------------- /egs/vcc18/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script is modified from https://github.com/bigpon/QPPWG. 5 | 6 | trainset=vcc18_train_22kHz # training set 7 | validset=vcc18_valid_22kHz # validation set 8 | evalset=vcc18_eval_22kHz # evaluation set 9 | gpu=0 # gpu id 10 | conf=uSFGAN_40 # name of config 11 | resume=None # number of iteration of resume model 12 | iter=400000 # number of iteration of testing model 13 | scaled=0.50 # scaled ratio of f0 14 | stage= # running stage (0-3) 15 | # stage 0: Preprocessing 16 | # stage 1: uSFGAN training 17 | # stage 2: uSFGAN decoding (analysis-synthesis) 18 | # stage 3: uSFGAN decoding (scaled F0) 19 | . ../parse_options.sh || exit 1; 20 | 21 | export LD_LIBRARY_PATH='' 22 | export CUDA_HOME='' 23 | export CUDA_DEVICE_ORDER='' 24 | 25 | # Preprocessing 26 | if echo ${stage} | grep -q 0; then 27 | echo "Preprocessing." 28 | python run.py -C ${conf} -T ${trainset} -V ${validset} -E ${evalset} -0 29 | fi 30 | 31 | # uSFGAN training 32 | if echo ${stage} | grep -q 1; then 33 | echo "uSFGAN training." 34 | python run.py -g ${gpu} -C ${conf} \ 35 | -T ${trainset} -V ${validset} -R ${resume} -1 36 | fi 37 | 38 | # uSFGAN decoding w/ natural acoustic features 39 | if echo ${stage} | grep -q 2; then 40 | echo "uSFGAN decoding (natural)." 41 | python run.py -g ${gpu} -C ${conf} \ 42 | -T ${trainset} -E ${evalset} -I ${iter} -2 43 | fi 44 | 45 | # uSFGAN decoding w/ scaled F0 46 | if echo ${stage} | grep -q 3; then 47 | echo "uSFGAN decoding ( ${scaled} x F0)." 
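# NOTE: stage 3 intentionally reuses run.py step -2; the -f ${scaled} option below rescales F0 at decode time.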
48 | python run.py -g ${gpu} -C ${conf} -f ${scaled}\ 49 | -T ${trainset} -E ${evalset} -I ${iter} -2 50 | fi -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --verbose --durations=0 6 | testpaths = test 7 | 8 | [flake8] 9 | ignore = E731, F841, H102, W504, H238, D104, E221 10 | # 120 is a workaround, 79 is good 11 | max-line-length = 120 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Setup uSFGAN library.""" 5 | 6 | import os 7 | import pip 8 | import sys 9 | 10 | from distutils.version import LooseVersion 11 | from setuptools import find_packages 12 | from setuptools import setup 13 | 14 | if LooseVersion(sys.version) < LooseVersion("3.8"): 15 | raise RuntimeError( 16 | "usfgan requires Python>=3.8, " 17 | "but your Python is {}".format(sys.version)) 18 | if LooseVersion(pip.__version__) < LooseVersion("20.0.0"): 19 | raise RuntimeError( 20 | "pip>=20.0.0 is required, but your pip is {}. " 21 | "Try again after \"pip install -U pip\"".format(pip.__version__)) 22 | 23 | requirements = { 24 | "install": [ 25 | "torch>=1.7.1", 26 | "setuptools>=38.5.1", 27 | "librosa>=0.8.0", 28 | "soundfile>=0.10.2", 29 | "tensorboardX>=1.8", 30 | "matplotlib>=3.1.0", 31 | "PyYAML>=3.12", 32 | "tqdm>=4.26.1", 33 | "kaldiio>=2.14.1", 34 | "h5py>=2.10.0", 35 | "pyworld>=0.2.12", 36 | "docopt", 37 | "sprocket-vc", 38 | ], 39 | "setup": [ 40 | "numpy", 41 | "pytest-runner", 42 | ], 43 | "test": [ 44 | "pytest>=3.3.0", 45 | "hacking>=1.1.0", 46 | "flake8>=3.7.8", 47 | "flake8-docstrings>=1.3.1", 48 | ] 49 | } 50 | entry_points = { 51 | "console_scripts": [ 52 | "usfgan-preprocess=usfgan.bin.preprocess:main", 53 | "usfgan-compute-statistics=usfgan.bin.compute_statistics:main", 54 | "usfgan-train=usfgan.bin.train:main", 55 | "usfgan-decode=usfgan.bin.decode:main", 56 | ] 57 | } 58 | 59 | install_requires = requirements["install"] 60 | setup_requires = requirements["setup"] 61 | tests_require = requirements["test"] 62 | extras_require = {k: v for k, v in requirements.items() 63 | if k not in ["install", "setup"]} 64 | 65 | dirname = os.path.dirname(__file__) 66 | setup(name="usfgan", 67 | version="0.1", 68 | url="http://github.com/chomeyama/UnifiedSourceFilterGAN", 69 | author="Reo Yoneyama", 70 | author_email="yoneyama.reo@g.sp.m.is.nagoya-u.ac.jp", 71 | description="Unified Source-Filter GAN implementation", 72 | long_description_content_type="text/markdown", 73 | long_description=open(os.path.join(dirname, "README.md"), encoding="utf-8").read(), 74 | license="MIT License", 75 | packages=find_packages(include=["usfgan*"]), 76 | install_requires=install_requires, 77 | setup_requires=setup_requires, 78 | tests_require=tests_require, 79 | extras_require=extras_require, 80 | entry_points=entry_points, 81 | classifiers=[ 82 | "Programming Language :: Python :: 3.8.6", 83 | "Intended Audience :: Science/Research", 84 | "Operating System :: POSIX :: Linux", 85 | "License :: OSI Approved :: MIT License", 86 | "Topic :: Software Development :: Libraries :: Python Modules"], 87 | ) 88 | -------------------------------------------------------------------------------- /tools/Makefile: 
-------------------------------------------------------------------------------- 1 | PYTHON:= python3.8 2 | CUDA_VERSION:= 11.0 3 | PYTORCH_VERSION:= 1.7.1 4 | DOT:= . 5 | .PHONY: all clean 6 | 7 | all: virtualenv 8 | 9 | virtualenv: 10 | test -d venv || virtualenv -p $(PYTHON) venv 11 | . venv/bin/activate; pip install torch==$(PYTORCH_VERSION) \ 12 | -f https://download.pytorch.org/whl/cu$(subst $(DOT),,$(CUDA_VERSION))/torch_stable.html 13 | . venv/bin/activate; cd ../; pip install -e . 14 | . venv/bin/activate; cd ../; pip install -e .[test] 15 | touch venv/bin/activate 16 | 17 | apex: virtualenv 18 | git clone https://github.com/NVIDIA/apex.git 19 | . venv/bin/activate; cd apex; \ 20 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 21 | 22 | clean: 23 | rm -fr venv apex 24 | find -iname "*.pyc" -delete 25 | -------------------------------------------------------------------------------- /usfgan/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __version__ = "0.0.1.post1" 4 | -------------------------------------------------------------------------------- /usfgan/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/usfgan/bin/__init__.py -------------------------------------------------------------------------------- /usfgan/bin/compute_statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Reo Yoneyama (Nagoya University) 5 | # based on a Quasi-Periodic Parallel WaveGAN script by Yi-Chiao Wu (Nagoya University) 6 | # (https://github.com/bigpon/QPPWG) 7 | # and also based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University) 8 | # (https://github.com/kan-bayashi/ParallelWaveGAN) 9 | # MIT License (https://opensource.org/licenses/MIT) 10 | 11 | import argparse 12 | import logging 13 | import yaml 14 | 15 | from sklearn.preprocessing import StandardScaler 16 | from joblib import dump 17 | from usfgan.utils import (read_txt, read_hdf5) 18 | 19 | 20 | def calc_stats(file_list, config, shift=1): 21 | """Calculate statistics 22 | Args: 23 | file_list (list): File list. 24 | config (dict): Dictionary of config. 25 | shift (int): Shift of feature dimension. 
26 | """ 27 | scaler = StandardScaler() 28 | 29 | # process over all of data 30 | for i, filename in enumerate(file_list): 31 | logging.info("now processing %s (%d/%d)" % (filename, i + 1, len(file_list))) 32 | feat = read_hdf5(filename, "/%s" % config['feat_type']) 33 | scaler.partial_fit(feat[:, shift:]) 34 | 35 | dump(scaler, config['stats']) 36 | 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser() 40 | 41 | parser.add_argument("--feats", required=True, type=str, 42 | help="name of the list of hdf5 files") 43 | parser.add_argument("--config", required=True, type=str, 44 | help="yaml format configuration file") 45 | parser.add_argument("--stats", required=True, type=str, 46 | help="filename of stats") 47 | parser.add_argument("--verbose", default=1, type=int, 48 | help="log message level") 49 | 50 | args = parser.parse_args() 51 | 52 | # set log level 53 | if args.verbose == 1: 54 | logging.basicConfig(level=logging.INFO, 55 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 56 | datefmt='%m/%d/%Y %I:%M:%S') 57 | elif args.verbose > 1: 58 | logging.basicConfig(level=logging.DEBUG, 59 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 60 | datefmt='%m/%d/%Y %I:%M:%S') 61 | else: 62 | logging.basicConfig(level=logging.WARN, 63 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 64 | datefmt='%m/%d/%Y %I:%M:%S') 65 | logging.warn("logging is disabled.") 66 | 67 | # show argument 68 | for key, value in vars(args).items(): 69 | logging.info(f"{key} = {value}") 70 | 71 | # read file list 72 | file_list = read_txt(args.feats) 73 | logging.info(f"number of utterances = {len(file_list)}") 74 | 75 | # load config and speaker info 76 | with open(args.config, "r") as f: 77 | config = yaml.safe_load(f) 78 | config.update(vars(args)) 79 | 80 | # calculate statistics 81 | shift = config.get("stats_shift", 1) 82 | # for world feature, 83 | # the first dimesion (u/v) is usually 84 | # skipped in calculating statistics. 
85 | logging.info(f"stats shift dimesion: {shift}") 86 | calc_stats(file_list, config, shift) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() -------------------------------------------------------------------------------- /usfgan/bin/decode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Reo Yoneyama (Nagoya University) 5 | # based on a Quasi-Periodic Parallel WaveGAN script by Yi-Chiao Wu (Nagoya University) 6 | # (https://github.com/bigpon/QPPWG) 7 | # and also based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University) 8 | # (https://github.com/kan-bayashi/ParallelWaveGAN) 9 | # MIT License (https://opensource.org/licenses/MIT) 10 | 11 | """Decode with trained USFGAN Generator.""" 12 | 13 | import argparse 14 | import logging 15 | import os 16 | import time 17 | import numpy as np 18 | import soundfile as sf 19 | import torch 20 | import yaml 21 | 22 | from tqdm import tqdm 23 | import usfgan.models 24 | from usfgan.datasets import FeatDataset 25 | from usfgan.utils import read_hdf5 26 | 27 | 28 | def main(): 29 | """Run decoding process.""" 30 | parser = argparse.ArgumentParser( 31 | description="Decode dumped features with trained Quasi-Periodic Parallel WaveGAN Generator " 32 | "(See detail in usfgan/bin/decode.py).") 33 | parser.add_argument("--eval_feat", required=True, type=str, 34 | help="list of evaluation aux feat files") 35 | parser.add_argument("--stats", required=True, type=str, 36 | help="hdf5 file including statistics") 37 | parser.add_argument("--indir", required=True, type=str, 38 | help="directory of input feature files") 39 | parser.add_argument("--outdir", type=str, required=True, 40 | help="directory to output generated speech.") 41 | parser.add_argument("--checkpoint", type=str, required=True, 42 | help="checkpoint file to be loaded.") 43 | parser.add_argument("--config", default=None, type=str, 44 | help="yaml format configuration file. if not explicitly provided, " 45 | "it will be searched in the checkpoint directory. (default=None)") 46 | parser.add_argument("--verbose", type=int, default=1, 47 | help="logging level. higher is more logging. 
(default=1)") 48 | parser.add_argument("--seed", default=100, type=int, 49 | help="seed number") 50 | parser.add_argument("--f0_factor", default=1.0, type=float, 51 | help="f0 scaled factor") 52 | args = parser.parse_args() 53 | 54 | # set logger 55 | if args.verbose > 1: 56 | logging.basicConfig( 57 | level=logging.DEBUG, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") 58 | elif args.verbose > 0: 59 | logging.basicConfig( 60 | level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") 61 | else: 62 | logging.basicConfig( 63 | level=logging.WARN, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") 64 | logging.warning("Skip DEBUG/INFO messages") 65 | 66 | # fix seed 67 | np.random.seed(args.seed) 68 | torch.manual_seed(args.seed) 69 | torch.cuda.manual_seed(args.seed) 70 | os.environ['PYTHONHASHSEED'] = str(args.seed) 71 | 72 | # check directory existence 73 | if not os.path.isdir(os.path.dirname(args.outdir)): 74 | os.makedirs(os.path.dirname(args.outdir)) 75 | 76 | # load config 77 | if args.config is None: 78 | dirname = os.path.dirname(args.checkpoint) 79 | args.config = os.path.join(dirname, "config.yml") 80 | with open(args.config) as f: 81 | config = yaml.load(f, Loader=yaml.Loader) 82 | config.update(vars(args)) 83 | 84 | # get dataset 85 | feat_load_fn = lambda x: read_hdf5(x, config.get("feat_type", "world")) 86 | f0_factor = args.f0_factor 87 | dataset = FeatDataset( 88 | stats=args.stats, 89 | feat_list=args.eval_feat, 90 | feat_load_fn=feat_load_fn, 91 | return_filename=True, 92 | hop_size=config["hop_size"], 93 | dense_factor=config.get("dense_factor", 4), 94 | f0_threshold=config.get("f0_threshold", 0), 95 | f0_cont=config.get("f0_cont", False), 96 | f0_dim_idx=config.get("f0_dim_idx", 1), 97 | uv_dim_idx=config.get("uv_dim_idx", 0), 98 | mean_path=config.get("mean_path", "/world/mean"), 99 | scale_path=config.get("scale_path", "/world/scale"), 100 | f0_factor=f0_factor, 101 | fs=config.get("sampling_rate", 22050), 102 | shift=config.get("stats_shift", 1), 103 | ) 104 | logging.info(f"The number of features to be decoded = {len(dataset)}.") 105 | 106 | # setup 107 | if torch.cuda.is_available(): 108 | device = torch.device("cuda") 109 | else: 110 | device = torch.device("cpu") 111 | model_class = getattr( 112 | usfgan.models, 113 | config.get("generator_type", "uSFGANGenerator")) 114 | model = model_class(**config["generator_params"]) 115 | model.load_state_dict( 116 | torch.load(args.checkpoint, map_location="cpu")["model"]["generator"]) 117 | logging.info(f"Loaded model parameters from {args.checkpoint}.") 118 | model.remove_weight_norm() 119 | model = model.eval().to(device) 120 | input_type = config.get("input_type", "noise") 121 | pad_fn = torch.nn.ReplicationPad1d( 122 | config["generator_params"].get("aux_context_window", 0)) 123 | 124 | # start generation 125 | total_rtf = 0.0 126 | with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar: 127 | for idx, (feat_path, f0, c, d) in enumerate(pbar, 1): 128 | # setup input 129 | x = () 130 | if input_type == "noise": 131 | z = torch.randn(1, 1, len(c) * config["hop_size"]).to(device) 132 | x += (z,) 133 | else: 134 | raise NotImplementedError("Currently only 'noise' input is supported ") 135 | f0 = torch.FloatTensor(f0).unsqueeze(0).transpose(2, 1).to(device) 136 | c = pad_fn(torch.FloatTensor(c).unsqueeze(0).transpose(2, 1)).to(device) 137 | d = torch.FloatTensor(d).view(1, 1, -1).to(device) 138 | x += (f0, c, d) 139 | 140 | # generate 
141 | start = time.time() 142 | y, s, f0 = model(*x) 143 | y = y.view(-1).cpu().numpy() 144 | rtf = (time.time() - start) / (len(y) / config["sampling_rate"]) 145 | pbar.set_postfix({"RTF": rtf}) 146 | total_rtf += rtf 147 | 148 | # save output signal as PCM 16 bit wav file 149 | feat_path = os.path.splitext(feat_path)[0] 150 | feat_path = feat_path.replace(args.indir, args.outdir) 151 | if f0_factor == 1.0: # unchanged 152 | wav_filename = f"{feat_path}.wav" 153 | else: # scaled f0 154 | wav_filename = f"{feat_path}_f{f0_factor:.2f}.wav" 155 | if not os.path.exists(os.path.dirname(wav_filename)): 156 | os.makedirs(os.path.dirname(wav_filename)) 157 | sf.write(wav_filename, y, config.get("sampling_rate", 22050), "PCM_16") 158 | 159 | # save source signal as PCM 16 bit wav file 160 | s = s.view(-1).cpu().numpy() 161 | s = s / max(abs(s)) * 0.5 162 | feat_path = os.path.splitext(feat_path)[0] 163 | feat_path = feat_path.replace(args.indir, args.outdir) 164 | if f0_factor == 1.0: # unchanged 165 | wav_filename = f"{feat_path}_src.wav" 166 | else: # scaled f0 167 | wav_filename = f"{feat_path}_f{f0_factor:.2f}_src.wav" 168 | if not os.path.exists(os.path.dirname(wav_filename)): 169 | os.makedirs(os.path.dirname(wav_filename)) 170 | sf.write(wav_filename, s, config.get("sampling_rate", 22050), "PCM_16") 171 | 172 | # report average RTF 173 | logging.info(f"Finished generation of {idx} utterances (RTF = {total_rtf / idx:.03f}).") 174 | 175 | 176 | if __name__ == "__main__": 177 | main() -------------------------------------------------------------------------------- /usfgan/bin/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2021 Reo Yoneyama (Nagoya University) 5 | # based on a Quasi-Periodic Parallel WaveGAN script by Yi-Chiao Wu (Nagoya University) 6 | # (https://github.com/bigpon/QPPWG) 7 | # and also based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University) 8 | # (https://github.com/kan-bayashi/ParallelWaveGAN) 9 | # and also based on sprocket-vc script by Kazuhiro Kobayashi (Nagoya University) 10 | # (https://github.com/k2kobayashi/sprocket) 11 | # MIT License (https://opensource.org/licenses/MIT) 12 | 13 | from __future__ import division 14 | 15 | import argparse 16 | import logging 17 | import multiprocessing as mp 18 | import os 19 | import sys 20 | import copy 21 | import yaml 22 | import pyworld 23 | import librosa 24 | import numpy as np 25 | import soundfile as sf 26 | 27 | from distutils.util import strtobool 28 | from scipy.interpolate import interp1d 29 | from scipy.io import wavfile 30 | from scipy.signal import firwin 31 | from scipy.signal import lfilter 32 | from sprocket.speech import FeatureExtractor 33 | from sprocket.speech import Synthesizer 34 | from usfgan.utils import (read_txt, read_hdf5, write_hdf5, check_hdf5) 35 | 36 | 37 | def _get_arguments(): 38 | parser = argparse.ArgumentParser( 39 | description="making feature file argsurations.") 40 | # setting 41 | parser.add_argument("--audio", required=True, 42 | type=str, help="List of input wav files") 43 | parser.add_argument("--indir", required=True, 44 | type=str, help="Directory of input feature files") 45 | parser.add_argument("--outdir", required=True, 46 | type=str, help="Directory to save generated samples") 47 | parser.add_argument("--config", required=True, 48 | type=str, help="YAML format configuration file") 49 | parser.add_argument("--spkinfo", default="None", 50 | 
type=str, help="YAML format speaker information") 51 | parser.add_argument("--feature_format", default="h5", 52 | type=str, help="Feature format") 53 | parser.add_argument("--split", default="/", 54 | type=str, help="Path split string") 55 | parser.add_argument("--spkidx", default=-2, 56 | type=int, help="Speaker index of the split path") 57 | # flags setting 58 | parser.add_argument("--save_f0", default=True, 59 | type=strtobool, help="If set True, features f0 will be saved") 60 | parser.add_argument("--save_ap", default=False, 61 | type=strtobool, help="If set True, features ap will be saved") 62 | parser.add_argument("--save_spc", default=False, 63 | type=strtobool, help="If set True, features spc will be saved") 64 | parser.add_argument("--save_npow", default=True, 65 | type=strtobool, help="If set True, features npow will be saved") 66 | # other setting 67 | parser.add_argument('--inv', default=True, 68 | type=strtobool, help="If False, wav is restored from acoustic features") 69 | parser.add_argument("--verbose", default=1, 70 | type=int, help="Log message level") 71 | 72 | return parser.parse_args() 73 | 74 | 75 | def path_create(wav_list, indir, outdir, extname): 76 | for wav_name in wav_list: 77 | path_replace(wav_name, indir, outdir, extname=extname) 78 | 79 | 80 | def path_replace(filepath, inputpath, outputpath, extname=None): 81 | filepath = filepath.replace(inputpath, outputpath) 82 | if not os.path.exists(os.path.dirname(filepath)): 83 | os.makedirs(os.path.dirname(filepath)) 84 | if extname is not None: 85 | filepath = '%s.%s' % (filepath.split('.')[0], extname) 86 | return filepath 87 | 88 | 89 | def spk_division(file_list, config, spkinfo, split="/", spkidx=-2): 90 | """Divide list into speaker-dependent list 91 | Args: 92 | file_list (list): Waveform list 93 | config (dict): Config 94 | spkinfo (dict): Dictionary of 95 | speaker-dependent f0 range and power threshold 96 | split: Path split string 97 | spkidx: Speaker index of the split path 98 | Return: 99 | (list): List of divided file lists 100 | (list): List of speaker-dependent configs 101 | """ 102 | file_lists, configs, tempf = [], [], [] 103 | prespk = None 104 | for file in file_list: 105 | spk = file.split(split)[spkidx] 106 | if spk != prespk: 107 | if tempf: 108 | file_lists.append(tempf) 109 | tempf = [] 110 | prespk = spk 111 | tempc = copy.deepcopy(config) 112 | if spk in spkinfo: 113 | tempc['minf0'] = spkinfo[spk]['f0_min'] 114 | tempc['maxf0'] = spkinfo[spk]['f0_max'] 115 | tempc['pow_th'] = spkinfo[spk]['pow_th'] 116 | else: 117 | msg = "Since %s is not in spkinfo dict, " % spk 118 | msg += "default f0 range and power threshold are used." 
119 | logging.info(msg) 120 | tempc['minf0'] = 40 121 | tempc['maxf0'] = 800 122 | tempc['pow_th'] = -20 123 | configs.append(tempc) 124 | tempf.append(file) 125 | file_lists.append(tempf) 126 | 127 | return file_lists, configs 128 | 129 | 130 | def aux_list_create(wav_list_file, config): 131 | """Create list of auxiliary acoustic features 132 | Args: 133 | wav_list_file (str): Filename of wav list 134 | config (dict): Config 135 | """ 136 | aux_list_file = wav_list_file.replace(".scp", ".list") 137 | wav_files = read_txt(wav_list_file) 138 | with open(aux_list_file, "w") as f: 139 | for wav_name in wav_files: 140 | feat_name = path_replace(wav_name, 141 | config['indir'], config['outdir'], 142 | extname=config['feature_format']) 143 | f.write("%s\n" % feat_name) 144 | 145 | 146 | def low_cut_filter(x, fs, cutoff=70): 147 | """Low cut filter 148 | Args: 149 | x (ndarray): Waveform sequence 150 | fs (int): Sampling frequency 151 | cutoff (float): Cutoff frequency of low cut filter 152 | Return: 153 | (ndarray): Low cut filtered waveform sequence 154 | """ 155 | nyquist = fs // 2 156 | norm_cutoff = cutoff / nyquist 157 | fil = firwin(255, norm_cutoff, pass_zero=False) 158 | lcf_x = lfilter(fil, 1, x) 159 | 160 | return lcf_x 161 | 162 | 163 | def low_pass_filter(x, fs, cutoff=70, padding=True): 164 | """Low pass filter 165 | Args: 166 | x (ndarray): Waveform sequence 167 | fs (int): Sampling frequency 168 | cutoff (float): Cutoff frequency of low pass filter 169 | Return: 170 | (ndarray): Low pass filtered waveform sequence 171 | """ 172 | nyquist = fs // 2 173 | norm_cutoff = cutoff / nyquist 174 | numtaps = 255 175 | fil = firwin(numtaps, norm_cutoff) 176 | x_pad = np.pad(x, (numtaps, numtaps), 'edge') 177 | lpf_x = lfilter(fil, 1, x_pad) 178 | lpf_x = lpf_x[numtaps + numtaps // 2: -numtaps // 2] 179 | 180 | return lpf_x 181 | 182 | 183 | # WORLD features 184 | def convert_continuos_f0(f0): 185 | """Convert F0 to continuous F0 186 | Args: 187 | f0 (ndarray): original f0 sequence with the shape (T) 188 | Return: 189 | (ndarray): continuous f0 with the shape (T) 190 | """ 191 | # get uv information as binary 192 | uv = np.float32(f0 != 0) 193 | # get start and end of f0 194 | if (f0 == 0).all(): 195 | logging.warn("all of the f0 values are 0.") 196 | return uv, f0 197 | start_f0 = f0[f0 != 0][0] 198 | end_f0 = f0[f0 != 0][-1] 199 | # padding start and end of f0 sequence 200 | cont_f0 = copy.deepcopy(f0) 201 | start_idx = np.where(cont_f0 == start_f0)[0][0] 202 | end_idx = np.where(cont_f0 == end_f0)[0][-1] 203 | cont_f0[:start_idx] = start_f0 204 | cont_f0[end_idx:] = end_f0 205 | # get non-zero frame index 206 | nz_frames = np.where(cont_f0 != 0)[0] 207 | # perform linear interpolation 208 | f = interp1d(nz_frames, cont_f0[nz_frames]) 209 | cont_f0 = f(np.arange(0, cont_f0.shape[0])) 210 | 211 | return uv, cont_f0 212 | 213 | 214 | def world_speech_synthesis(queue, wav_list, config): 215 | """WORLD speech synthesis 216 | Args: 217 | queue (multiprocessing.Queue): the queue to store the file name of utterance 218 | wav_list (list): list of the wav files 219 | config (dict): feature extraction config 220 | """ 221 | # define synthesizer 222 | synthesizer = Synthesizer(fs=config['sampling_rate'], 223 | fftl=config['fft_size'], 224 | shiftms=config['shiftms']) 225 | # synthesis 226 | for i, wav_name in enumerate(wav_list): 227 | logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list))) 228 | 229 | # load acoustic features 230 | feat_name = path_replace(wav_name, 
config['indir'], 231 | config['outdir'], extname=config['feature_format']) 232 | if check_hdf5(feat_name, "/world"): 233 | h = read_hdf5(feat_name, "/world") 234 | else: 235 | logging.error("%s does not exist." % (feat_name)) 236 | sys.exit(1) 237 | if check_hdf5(feat_name, "/f0"): 238 | f0 = read_hdf5(feat_name, "/f0") 239 | else: 240 | uv = h[:, config['uv_dim_idx']].copy(order='C') 241 | f0 = h[:, config['f0_dim_idx']].copy(order='C') # cont_f0_lpf 242 | fz_idx = np.where(uv == 0.0) 243 | f0[fz_idx] = 0.0 244 | if check_hdf5(feat_name, "/ap"): 245 | ap = read_hdf5(feat_name, "/ap") 246 | else: 247 | codeap = h[:, config['ap_dim_start']:config['ap_dim_end']].copy(order='C') 248 | ap = pyworld.decode_aperiodicity(codeap, config['sampling_rate'], config['fft_size']) 249 | mcep = h[:, config['mcep_dim_start']:config['mcep_dim_end']].copy(order='C') 250 | 251 | # waveform synthesis 252 | wav = synthesizer.synthesis(f0, 253 | mcep, 254 | ap, 255 | alpha=config['mcep_alpha']) 256 | wav = np.int16(np.clip(wav, -32768, 32767))  # clip before casting to avoid int16 overflow 257 | 258 | # save restored wav 259 | restored_name = path_replace(wav_name, "wav", "world", extname="wav") 260 | wavfile.write(restored_name, config['sampling_rate'], wav) 261 | 262 | queue.put('Finish') 263 | 264 | 265 | def world_feature_extract(queue, wav_list, config): 266 | """WORLD feature extraction 267 | Args: 268 | queue (multiprocessing.Queue): the queue to store the file name of utterance 269 | wav_list (list): list of the wav files 270 | config (dict): feature extraction config 271 | """ 272 | # define feature extractor 273 | feature_extractor = FeatureExtractor( 274 | analyzer="world", 275 | fs=config['sampling_rate'], 276 | shiftms=config['shiftms'], 277 | minf0=config['minf0'], 278 | maxf0=config['maxf0'], 279 | fftl=config['fft_size']) 280 | # extraction 281 | for i, wav_name in enumerate(wav_list): 282 | logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list))) 283 | 284 | # load wavfile and apply low cut filter 285 | fs, x = wavfile.read(wav_name) 286 | x = np.array(x, dtype=np.float32) 287 | if config['highpass_cutoff'] != 0: 288 | x = low_cut_filter(x, fs, cutoff=config['highpass_cutoff']) 289 | 290 | # check sampling frequency 291 | if not fs == config['sampling_rate']: 292 | logging.error("sampling frequency of %s does not match."
% wav_name) 293 | sys.exit(1) 294 | 295 | # extract features 296 | f0, spc, ap = feature_extractor.analyze(x) 297 | codeap = feature_extractor.codeap() 298 | mcep = feature_extractor.mcep(dim=config['mcep_dim'], alpha=config['mcep_alpha']) 299 | npow = feature_extractor.npow() 300 | uv, cont_f0 = convert_continuos_f0(f0) 301 | lpf_fs = int(1.0 / (config['shiftms'] * 0.001)) 302 | cont_f0_lpf = low_pass_filter(cont_f0, lpf_fs, cutoff=20) 303 | next_cutoff = 70 304 | while not (cont_f0_lpf >= 0).all(): 305 | cont_f0_lpf = low_pass_filter(cont_f0, lpf_fs, cutoff=next_cutoff) 306 | next_cutoff *= 2 307 | 308 | # concatenate 309 | cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1) 310 | uv = np.expand_dims(uv, axis=-1) 311 | if config['f0_cont']: 312 | feats = np.concatenate([uv, cont_f0_lpf, mcep, codeap], axis=1) 313 | else: 314 | feats = np.concatenate([uv, f0[:, np.newaxis], mcep, codeap], axis=1) 315 | 316 | # save feature 317 | feat_name = path_replace(wav_name, config['indir'], 318 | config['outdir'], extname=config['feature_format']) 319 | write_hdf5(feat_name, "/%s" % (config["feat_type"]), feats) 320 | if config['save_f0']: 321 | write_hdf5(feat_name, "/f0", f0) 322 | if config['save_ap']: 323 | write_hdf5(feat_name, "/ap", ap) 324 | if config['save_spc']: 325 | write_hdf5(feat_name, "/spc", spc) 326 | if config['save_npow']: 327 | write_hdf5(feat_name, "/npow", npow) 328 | 329 | queue.put('Finish') 330 | 331 | 332 | # Mel-spec and f0 features 333 | def logmelfilterbank(audio, 334 | sampling_rate, 335 | fft_size=1024, 336 | hop_size=256, 337 | win_length=None, 338 | window="hann", 339 | num_mels=80, 340 | fmin=None, 341 | fmax=None, 342 | eps=1e-10, 343 | ): 344 | """Extract log-Mel filterbank feature. 345 | Args: 346 | audio (ndarray): Audio signal (T,). 347 | sampling_rate (int): Sampling rate. 348 | fft_size (int): FFT size. 349 | hop_size (int): Hop size. 350 | win_length (int): Window length. If set to None, it will be the same as fft_size. 351 | window (str): Window function type. 352 | num_mels (int): Number of mel basis. 353 | fmin (int): Minimum frequency in mel basis calculation. 354 | fmax (int): Maximum frequency in mel basis calculation. 355 | eps (float): Epsilon value to avoid inf in log calculation. 356 | Returns: 357 | ndarray: Log Mel filterbank feature (#frames, num_mels).
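    Example:
        A minimal usage sketch (the wav path is hypothetical; any mono float waveform works):

        >>> import soundfile as sf
        >>> audio, sr = sf.read("sample.wav")
        >>> mel = logmelfilterbank(audio, sr, fft_size=1024, hop_size=256)
        >>> mel.shape  # -> (#frames, 80)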
358 | """ 359 | # get amplitude spectrogram 360 | x_stft = librosa.stft(audio, n_fft=fft_size, hop_length=hop_size, 361 | win_length=win_length, window=window, pad_mode="reflect") 362 | spc = np.abs(x_stft).T # (#frames, #bins) 363 | 364 | # get mel basis 365 | fmin = 0 if fmin is None else fmin 366 | fmax = sampling_rate / 2 if fmax is None else fmax 367 | mel_basis = librosa.filters.mel(sampling_rate, fft_size, num_mels, fmin, fmax) 368 | 369 | return np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) 370 | 371 | 372 | def melf0_feature_extract(queue, wav_list, config): 373 | """Mel-spc w/ F0 feature extraction 374 | Args: 375 | queue (multiprocessing.Queue): the queue to store the file name of utterance 376 | wav_list (list): list of the wav files 377 | config (dict): feature extraction config 378 | """ 379 | # define f0 feature extractor 380 | feature_extractor = FeatureExtractor( 381 | analyzer="world", 382 | fs=config['sampling_rate'], 383 | shiftms=config['shiftms'], 384 | minf0=config['minf0'], 385 | maxf0=config['maxf0'], 386 | fftl=config['fft_size']) 387 | # extraction 388 | for i, wav_name in enumerate(wav_list): 389 | logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list))) 390 | 391 | # load wavfile 392 | (x, fs) = sf.read(wav_name) 393 | 394 | # check sampling frequency 395 | if not fs == config['sampling_rate']: 396 | logging.error("sampling frequency is not matched.") 397 | sys.exit(1) 398 | 399 | # extract f0 and uv features 400 | f0, _, _ = feature_extractor.analyze(x) 401 | uv, cont_f0 = convert_continuos_f0(f0) 402 | lpf_fs = int(1.0 / (config['shiftms'] * 0.001)) 403 | cont_f0_lpf = low_pass_filter(cont_f0, lpf_fs, cutoff=20) 404 | next_cutoff = 70 405 | while not (cont_f0_lpf >= [0]).all(): 406 | cont_f0_lpf = low_pass_filter(cont_f0, lpf_fs, cutoff=next_cutoff) 407 | next_cutoff *= 2 408 | 409 | # extract mel-spc feature 410 | mel = logmelfilterbank(x, fs, 411 | fft_size=config["fft_size"], 412 | hop_size=config["hop_size"], 413 | win_length=config["win_length"], 414 | window=config["window"], 415 | num_mels=config["num_mels"], 416 | fmin=config["fmin"], 417 | fmax=config["fmax"]) 418 | 419 | # concatenate 420 | cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1) 421 | uv = np.expand_dims(uv, axis=-1) 422 | minlen = min(uv.shape[0], mel.shape[0]) 423 | feats = np.concatenate([uv[:minlen, :], cont_f0_lpf[:minlen, :], 424 | mel.astype(np.float32)[:minlen, :]], axis=1) 425 | if config['f0_cont']: 426 | feats = np.concatenate([uv[:minlen, :], cont_f0_lpf[:minlen, :], 427 | mel.astype(np.float32)[:minlen, :]], axis=1) 428 | else: 429 | feats = np.concatenate([uv[:minlen, :], f0[:minlen, np.newaxis], 430 | mel.astype(np.float32)[:minlen, :]], axis=1) 431 | 432 | # save feature 433 | feat_name = path_replace(wav_name, config['indir'], 434 | config['outdir'], extname=config['feature_format']) 435 | write_hdf5(feat_name, "/%s" % (config["feat_type"]), feats) 436 | if config['save_f0']: 437 | write_hdf5(feat_name, "/f0", f0) 438 | 439 | queue.put('Finish') 440 | 441 | 442 | def main(): 443 | # parser arguments 444 | args = _get_arguments() 445 | # set log level 446 | if args.verbose == 1: 447 | logging.basicConfig(level=logging.INFO, 448 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 449 | datefmt='%m/%d/%Y %I:%M:%S') 450 | elif args.verbose > 1: 451 | logging.basicConfig(level=logging.DEBUG, 452 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 453 | datefmt='%m/%d/%Y %I:%M:%S') 454 | else: 455 | 
logging.basicConfig(level=logging.WARN, 456 | format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s', 457 | datefmt='%m/%d/%Y %I:%M:%S') 458 | logging.warning("logging is disabled.") 459 | 460 | # show argument 461 | for key, value in vars(args).items(): 462 | logging.info("%s = %s" % (key, str(value))) 463 | 464 | # read list 465 | file_list = read_txt(args.audio) 466 | logging.info("number of utterances = %d" % len(file_list)) 467 | 468 | # load config 469 | with open(args.config, "r") as f: 470 | config = yaml.safe_load(f) 471 | config.update(vars(args)) 472 | 473 | # list division 474 | if os.path.exists(args.spkinfo): 475 | # load speaker info 476 | with open(args.spkinfo, "r") as f: 477 | spkinfo = yaml.safe_load(f) 478 | # divide into each spk list 479 | file_lists, configs = spk_division(file_list, config, spkinfo) 480 | else: 481 | msg = "Since spkinfo %s does not exist, " % args.spkinfo 482 | msg += "default f0 range and power threshold are used." 483 | logging.info(msg) 484 | file_lists = np.array_split(file_list, 10) 485 | file_lists = [f_list.tolist() for f_list in file_lists] 486 | config['minf0'] = 70 487 | config['maxf0'] = 270 488 | config['pow_th'] = -20 489 | configs = [config] * len(file_lists) 490 | 491 | # set mode 492 | if config['feat_type'] == "world": 493 | if args.inv: 494 | target_fn = world_feature_extract 495 | # create auxiliary feature list 496 | aux_list_create(args.audio, config) 497 | # create folder 498 | path_create(file_list, config['indir'], 499 | config['outdir'], config['feature_format']) 500 | else: 501 | target_fn = world_speech_synthesis 502 | # create folder 503 | path_create(file_list, "wav", "world", "wav") 504 | elif config['feat_type'][:6] == "melf0h": 505 | if args.inv: 506 | target_fn = melf0_feature_extract 507 | # create auxiliary feature list 508 | aux_list_create(args.audio, config) 509 | # create folder 510 | path_create(file_list, config['indir'], 511 | config['outdir'], config['feature_format']) 512 | else: 513 | raise NotImplementedError("Currently, only mel-spec extraction is supported.") 514 | else: 515 | raise NotImplementedError("Currently, only 'world' and 'melf0hxxx' are supported.") 516 | 517 | # multi processing 518 | processes = [] 519 | queue = mp.Queue() 520 | for f, config in zip(file_lists, configs): 521 | p = mp.Process(target=target_fn, args=(queue, f, config,)) 522 | p.start() 523 | processes.append(p) 524 | 525 | # wait for all processes 526 | for p in processes: 527 | p.join() 528 | 529 | 530 | if __name__ == "__main__": 531 | main() 532 | -------------------------------------------------------------------------------- /usfgan/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from usfgan.datasets.audio_feat_dataset import * # NOQA 2 | -------------------------------------------------------------------------------- /usfgan/datasets/audio_feat_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Reo Yoneyama (Nagoya University) 4 | # based on a Quasi-Periodic Parallel WaveGAN script by Yi-Chiao Wu (Nagoya University) 5 | # (https://github.com/bigpon/QPPWG) 6 | # and also based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University) 7 | # (https://github.com/kan-bayashi/ParallelWaveGAN) 8 | # MIT License (https://opensource.org/licenses/MIT) 9 | 10 | """Dataset modules.""" 11 | 12 | import logging 13 | import numpy as np 14 | 15 | from 
multiprocessing import Manager 16 | from torch.utils.data import Dataset 17 | from usfgan.utils import read_hdf5, read_txt, check_filename 18 | from usfgan.utils import validate_length, batch_f0, dilated_factor 19 | from joblib import load 20 | 21 | import soundfile as sf 22 | 23 | 24 | class AudioFeatDataset(Dataset): 25 | """PyTorch compatible audio and acoustic feat. dataset.""" 26 | 27 | def __init__(self, 28 | stats, 29 | audio_list, 30 | feat_list, 31 | audio_load_fn=sf.read, 32 | feat_load_fn=lambda x: read_hdf5(x, "world"), 33 | audio_length_threshold=None, 34 | feat_length_threshold=None, 35 | return_filename=False, 36 | allow_cache=False, 37 | hop_size=110, 38 | dense_factor=4, 39 | f0_threshold=0, 40 | f0_cont=False, 41 | f0_dim_idx=1, 42 | uv_dim_idx=0, 43 | mean_path="/world/mean", 44 | scale_path="/world/scale", 45 | shift=1, 46 | ): 47 | """Initialize dataset. 48 | 49 | Args: 50 | stats (str): Filename of the statistic hdf5 file. 51 | audio_list (str): Filename of the list of audio files. 52 | feat_list (str): Filename of the list of feature files. 53 | audio_load_fn (func): Function to load audio file. 54 | feat_load_fn (func): Function to load feature file. 55 | audio_length_threshold (int): Threshold to remove short audio files. 56 | feat_length_threshold (int): Threshold to remove short feature files. 57 | return_filename (bool): Whether to return the filename with arrays. 58 | allow_cache (bool): Whether to allow cache of the loaded files. 59 | hop_size (int): Hop size of the acoustic features. 60 | dense_factor (int): Number of taps in one cycle. 61 | f0_threshold (float): Lower bound of pitch. 62 | f0_cont (bool): Whether to get dilated factor by continuous f0. 63 | f0_dim_idx (int): Dimension index of f0. (if set -1, all dilated factors will be 1) 64 | uv_dim_idx (int): Dimension index of U/V. 65 | mean_path (str): The data path (channel) of the mean in the statistic hdf5 file. 66 | scale_path (str): The data path (channel) of the scale in the statistic hdf5 file. 67 | shift (int): Shift of the feature dimension. 68 | 69 | """ 70 | # load audio and feature files & check filename 71 | audio_files = read_txt(audio_list) 72 | feat_files = read_txt(feat_list) 73 | assert check_filename(audio_files, feat_files) 74 | 75 | # filter by threshold 76 | if audio_length_threshold is not None: 77 | audio_lengths = [audio_load_fn(f)[0].shape[0] for f in audio_files]  # audio_load_fn returns (data, fs) 78 | idxs = [idx for idx in range(len(audio_files)) if audio_lengths[idx] > audio_length_threshold] 79 | if len(audio_files) != len(idxs): 80 | logging.warning(f"Some files are filtered by audio length threshold " 81 | f"({len(audio_files)} -> {len(idxs)}).") 82 | audio_files = [audio_files[idx] for idx in idxs] 83 | feat_files = [feat_files[idx] for idx in idxs] 84 | if feat_length_threshold is not None: 85 | mel_lengths = [feat_load_fn(f).shape[0] for f in feat_files] 86 | idxs = [idx for idx in range(len(feat_files)) if mel_lengths[idx] > feat_length_threshold] 87 | if len(feat_files) != len(idxs): 88 | logging.warning(f"Some files are filtered by mel length threshold " 89 | f"({len(feat_files)} -> {len(idxs)}).") 90 | audio_files = [audio_files[idx] for idx in idxs] 91 | feat_files = [feat_files[idx] for idx in idxs] 92 | 93 | # assert the number of files 94 | assert len(audio_files) != 0, f"{audio_list} is empty." 95 | assert len(audio_files) == len(feat_files), \ 96 | f"The numbers of audio and mel files are different ({len(audio_files)} vs {len(feat_files)})."
97 | 98 | self.audio_files = audio_files 99 | self.audio_load_fn = audio_load_fn 100 | self.feat_load_fn = feat_load_fn 101 | self.feat_files = feat_files 102 | self.return_filename = return_filename 103 | self.allow_cache = allow_cache 104 | self.hop_size = hop_size 105 | self.f0_threshold = f0_threshold 106 | self.dense_factor = dense_factor 107 | self.f0_cont = f0_cont 108 | self.f0_dim_idx = f0_dim_idx 109 | self.uv_dim_idx = uv_dim_idx 110 | self.shift = shift 111 | 112 | if allow_cache: 113 | # NOTE(kan-bayashi): Manager is needed to share memory in dataloader with num_workers > 0 114 | self.manager = Manager() 115 | self.caches = self.manager.list() 116 | self.caches += [() for _ in range(len(audio_files))] 117 | 118 | # define feature pre-processing function 119 | scaler = load(stats) 120 | self.feat_transform = lambda x: scaler.transform(x) 121 | 122 | def __getitem__(self, idx): 123 | """Get specified idx items. 124 | 125 | Args: 126 | idx (int): Index of the item. 127 | 128 | Returns: 129 | str: Utterance id (only in return_filename = True). 130 | ndarray: Audio signal (T,). 131 | ndarray: F0 sequence (T', 1). ndarray: Feature (T', C). 132 | ndarray: Dilated factor (T, 1). 133 | 134 | """ 135 | if self.allow_cache and len(self.caches[idx]) != 0: 136 | return self.caches[idx] 137 | 138 | audio, fs = self.audio_load_fn(self.audio_files[idx]) 139 | feat = self.feat_load_fn(self.feat_files[idx]) 140 | audio, feat = validate_length(audio, feat, self.hop_size) 141 | # get dilated factor sequence 142 | f0 = batch_f0(feat, self.f0_threshold, self.f0_cont, 143 | self.f0_dim_idx, self.uv_dim_idx) 144 | df = dilated_factor(f0, fs, self.dense_factor) 145 | df = df.repeat(self.hop_size, axis=0) 146 | # audio & feature pre-processing 147 | audio = audio.astype(np.float32) 148 | f0 = np.copy(feat[:, self.shift:self.shift+1]) 149 | feat[:, self.shift:] = self.feat_transform(feat[:, self.shift:]) 150 | 151 | if self.return_filename: 152 | items = self.feat_files[idx], audio, f0, feat, df 153 | else: 154 | items = audio, f0, feat, df 155 | 156 | if self.allow_cache: 157 | self.caches[idx] = items 158 | 159 | return items 160 | 161 | def __len__(self): 162 | """Return dataset length. 163 | 164 | Returns: 165 | int: The length of dataset. 166 | 167 | """ 168 | return len(self.audio_files) 169 | 170 | 171 | class FeatDataset(Dataset): 172 | """PyTorch compatible mel dataset.""" 173 | 174 | def __init__(self, 175 | stats, 176 | feat_list, 177 | feat_length_threshold=None, 178 | feat_load_fn=lambda x: read_hdf5(x, "world"), 179 | return_filename=False, 180 | allow_cache=False, 181 | hop_size=110, 182 | dense_factor=4, 183 | f0_threshold=0, 184 | f0_cont=False, 185 | f0_dim_idx=1, 186 | uv_dim_idx=0, 187 | mean_path="/world/mean", 188 | scale_path="/world/scale", 189 | f0_factor=1.0, 190 | fs=22050, 191 | shift=1, 192 | ): 193 | """Initialize dataset. 194 | 195 | Args: 196 | stats (str): Filename of the statistic hdf5 file. 197 | feat_list (str): Filename of the list of feature files. 198 | feat_load_fn (func): Function to load feature file. 199 | feat_length_threshold (int): Threshold to remove short feature files. 200 | return_filename (bool): Whether to return the utterance id with arrays. 201 | allow_cache (bool): Whether to allow cache of the loaded files. 202 | hop_size (int): Hop size of the acoustic features. 203 | dense_factor (int): Number of taps in one cycle. 204 | f0_threshold (float): Lower bound of pitch. 205 | f0_cont (bool): Whether to get dilated factor by continuous f0. 206 | f0_dim_idx (int): Dimension index of f0.
(if set -1, all dilated factors will be 1) 207 | uv_dim_idx (int): Dimension index of U/V. 208 | mean_path (str): The data path (channel) of the mean in the statistic hdf5 file. 209 | scale_path (str): The data path (channel) of the scale in the statistic hdf5 file. 210 | f0_factor (float): Ratio of the scaled f0. 211 | fs (int): The sampling rate of the audio. 212 | shift (int): Shift of the feature dimension. 213 | 214 | """ 215 | # load feat. files 216 | feat_files = read_txt(feat_list) 217 | 218 | # filter by threshold 219 | if feat_length_threshold is not None: 220 | mel_lengths = [feat_load_fn(f).shape[0] for f in feat_files] 221 | idxs = [idx for idx in range(len(feat_files)) if mel_lengths[idx] > feat_length_threshold] 222 | if len(feat_files) != len(idxs): 223 | logging.warning(f"Some files are filtered by mel length threshold " 224 | f"({len(feat_files)} -> {len(idxs)}).") 225 | feat_files = [feat_files[idx] for idx in idxs] 226 | 227 | # assert the number of files 228 | assert len(feat_files) != 0, f"{feat_list} is empty." 229 | 230 | self.feat_files = feat_files 231 | self.feat_load_fn = feat_load_fn 232 | self.return_filename = return_filename 233 | self.allow_cache = allow_cache 234 | self.hop_size = hop_size 235 | self.dense_factor = dense_factor 236 | self.f0_threshold = f0_threshold 237 | self.f0_cont = f0_cont 238 | self.f0_factor = f0_factor 239 | self.f0_dim_idx = f0_dim_idx 240 | self.uv_dim_idx = uv_dim_idx 241 | self.fs = fs 242 | self.shift = shift 243 | 244 | if allow_cache: 245 | # NOTE(kan-bayashi): Manager is needed to share memory in dataloader with num_workers > 0 246 | self.manager = Manager() 247 | self.caches = self.manager.list() 248 | self.caches += [() for _ in range(len(feat_files))] 249 | 250 | # define feature pre-processing function 251 | scaler = load(stats) 252 | self.feat_transform = lambda x: scaler.transform(x) 253 | 254 | def __getitem__(self, idx): 255 | """Get specified idx items. 256 | 257 | Args: 258 | idx (int): Index of the item. 259 | 260 | Returns: 261 | str: Utterance id (only in return_filename = True). 262 | ndarray: F0 sequence (T', 1). ndarray: Feature (T', C). ndarray: Dilated factor (T, 1). 263 | 264 | """ 265 | if self.allow_cache and len(self.caches[idx]) != 0: 266 | return self.caches[idx] 267 | 268 | feat = self.feat_load_fn(self.feat_files[idx]) 269 | # f0 scaled 270 | if self.f0_factor != 1.0: 271 | feat[:, self.f0_dim_idx] *= self.f0_factor 272 | # get dilated factor sequence 273 | f0 = batch_f0(feat, self.f0_threshold, self.f0_cont, 274 | self.f0_dim_idx, self.uv_dim_idx) 275 | df = dilated_factor(f0, self.fs, self.dense_factor) 276 | df = df.repeat(self.hop_size, axis=0) 277 | # feature pre-processing 278 | f0 = np.copy(feat[:, self.shift:self.shift+1]) 279 | feat[:, self.shift:] = self.feat_transform(feat[:, self.shift:]) 280 | 281 | if self.return_filename: 282 | items = self.feat_files[idx], f0, feat, df 283 | else: 284 | items = f0, feat, df 285 | 286 | if self.allow_cache: 287 | self.caches[idx] = items 288 | 289 | return items 290 | 291 | def __len__(self): 292 | """Return dataset length. 293 | 294 | Returns: 295 | int: The length of dataset.
296 | 297 | """ 298 | return len(self.feat_files) 299 | -------------------------------------------------------------------------------- /usfgan/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/UnifiedSourceFilterGAN/9665a10c8171c2fd89ebdfae05b32d42beb97389/usfgan/distributed/__init__.py -------------------------------------------------------------------------------- /usfgan/distributed/launch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Distributed process launcher. 5 | 6 | This code is modified from https://github.com/pytorch/pytorch/blob/v1.3.0/torch/distributed/launch.py. 7 | 8 | """ 9 | import os 10 | import subprocess 11 | import sys 12 | 13 | from argparse import ArgumentParser 14 | from argparse import REMAINDER 15 | 16 | 17 | def parse_args(): 18 | """Parse arguments.""" 19 | parser = ArgumentParser(description="PyTorch distributed training launch " 20 | "helper utilty that will spawn up " 21 | "multiple distributed processes") 22 | 23 | # Optional arguments for the launch helper 24 | parser.add_argument("--nnodes", type=int, default=1, 25 | help="The number of nodes to use for distributed " 26 | "training") 27 | parser.add_argument("--node_rank", type=int, default=0, 28 | help="The rank of the node for multi-node distributed " 29 | "training") 30 | parser.add_argument("--nproc_per_node", type=int, default=1, 31 | help="The number of processes to launch on each node, " 32 | "for GPU training, this is recommended to be set " 33 | "to the number of GPUs in your system so that " 34 | "each process can be bound to a single GPU.") 35 | parser.add_argument("--master_addr", default="127.0.0.1", type=str, 36 | help="Master node (rank 0)'s address, should be either " 37 | "the IP address or the hostname of node 0, for " 38 | "single node multi-proc training, the " 39 | "--master_addr can simply be 127.0.0.1") 40 | parser.add_argument("--master_port", default=29500, type=int, 41 | help="Master node (rank 0)'s free port that needs to " 42 | "be used for communciation during distributed " 43 | "training") 44 | parser.add_argument("--use_env", default=False, action="store_true", 45 | help="Use environment variable to pass " 46 | "'local rank'. For legacy reasons, the default value is False. 
" 47 | "If set to True, the script will not pass " 48 | "--local_rank as argument, and will instead set LOCAL_RANK.") 49 | parser.add_argument("-m", "--module", default=False, action="store_true", 50 | help="Changes each process to interpret the launch script " 51 | "as a python module, executing with the same behavior as" 52 | "'python -m'.") 53 | parser.add_argument("-c", "--command", default=False, action="store_true", 54 | help="Changes each process to interpret the launch script " 55 | "as a command.") 56 | 57 | # positional 58 | parser.add_argument("training_script", type=str, 59 | help="The full path to the single GPU training " 60 | "program/script/command to be launched in parallel, " 61 | "followed by all the arguments for the " 62 | "training script") 63 | 64 | # rest from the training program 65 | parser.add_argument('training_script_args', nargs=REMAINDER) 66 | return parser.parse_args() 67 | 68 | 69 | def main(): 70 | """Launch distributed processes.""" 71 | args = parse_args() 72 | 73 | # world size in terms of number of processes 74 | dist_world_size = args.nproc_per_node * args.nnodes 75 | 76 | # set PyTorch distributed related environmental variables 77 | current_env = os.environ.copy() 78 | current_env["MASTER_ADDR"] = args.master_addr 79 | current_env["MASTER_PORT"] = str(args.master_port) 80 | current_env["WORLD_SIZE"] = str(dist_world_size) 81 | 82 | processes = [] 83 | 84 | if 'OMP_NUM_THREADS' not in os.environ and args.nproc_per_node > 1: 85 | current_env["OMP_NUM_THREADS"] = str(1) 86 | print("*****************************************\n" 87 | "Setting OMP_NUM_THREADS environment variable for each process " 88 | "to be {} in default, to avoid your system being overloaded, " 89 | "please further tune the variable for optimal performance in " 90 | "your application as needed. 
\n" 91 | "*****************************************".format(current_env["OMP_NUM_THREADS"])) 92 | 93 | for local_rank in range(0, args.nproc_per_node): 94 | # each process's rank 95 | dist_rank = args.nproc_per_node * args.node_rank + local_rank 96 | current_env["RANK"] = str(dist_rank) 97 | current_env["LOCAL_RANK"] = str(local_rank) 98 | 99 | # spawn the processes 100 | if args.command: 101 | cmd = [args.training_script] 102 | else: 103 | cmd = [sys.executable, "-u"] 104 | if args.module: 105 | cmd.append("-m") 106 | cmd.append(args.training_script) 107 | 108 | if not args.use_env: 109 | cmd.append("--local_rank={}".format(local_rank)) 110 | 111 | cmd.extend(args.training_script_args) 112 | 113 | process = subprocess.Popen(cmd, env=current_env) 114 | processes.append(process) 115 | 116 | for process in processes: 117 | process.wait() 118 | if process.returncode != 0: 119 | raise subprocess.CalledProcessError( 120 | returncode=process.returncode, cmd=cmd) 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /usfgan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from usfgan.layers.residual_block import * # NOQA 2 | from usfgan.layers.upsample import * # NOQA 3 | from usfgan.layers.source_network import * # NOQA 4 | from usfgan.layers.filter_network import * # NOQA 5 | -------------------------------------------------------------------------------- /usfgan/layers/filter_network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Reo Yoneyama (Nagoya University) 4 | 5 | """Filter Network module.""" 6 | 7 | import math 8 | 9 | import torch 10 | 11 | from usfgan.layers import Conv1d 12 | from usfgan.layers import Conv1d1x1 13 | from usfgan.layers import FixedBlock 14 | from usfgan.layers import AdaptiveBlock 15 | from usfgan.utils import pd_indexing 16 | 17 | 18 | class FilterNetwork(torch.nn.Module): 19 | 20 | def __init__(self, 21 | in_channels, 22 | out_channels, 23 | blockF, 24 | cycleF, 25 | blockA, 26 | cycleA, 27 | cascade_mode, 28 | residual_channels, 29 | gate_channels, 30 | skip_channels, 31 | aux_channels): 32 | super(FilterNetwork, self).__init__() 33 | 34 | # convert source signal to hidden representation 35 | self.conv_first = Conv1d1x1(in_channels, residual_channels, bias=True) 36 | 37 | # check the number of blocks and cycles 38 | cycleA = max(cycleA, 1) 39 | cycleF = max(cycleF, 1) 40 | assert blockF % cycleF == 0 41 | blockF_per_cycle = blockF // cycleF 42 | assert blockA % cycleA == 0 43 | self.blockA_per_cycle = blockA // cycleA 44 | 45 | # define fixed residual blocks 46 | fixed_blocks = torch.nn.ModuleList() 47 | for block in range(blockF): 48 | dilation = 2 ** (block % blockF_per_cycle) 49 | conv = FixedBlock( 50 | residual_channels=residual_channels, 51 | gate_channels=gate_channels, 52 | skip_channels=skip_channels, 53 | aux_channels=aux_channels, 54 | dilation=dilation, 55 | bias=True, 56 | ) 57 | fixed_blocks += [conv] 58 | 59 | # define adaptive residual blocks 60 | adaptive_blocks = torch.nn.ModuleList() 61 | for block in range(blockA): 62 | conv = AdaptiveBlock( 63 | residual_channels=residual_channels, 64 | gate_channels=gate_channels, 65 | skip_channels=skip_channels, 66 | aux_channels=aux_channels, 67 | bias=True, 68 | ) 69 | adaptive_blocks += [conv] 70 | 71 | # define cascaded structure 72 | if cascade_mode == 0: # 
73 | self.conv_dilated = fixed_blocks.extend(adaptive_blocks) 74 | self.block_modes = [False] * blockF + [True] * blockA 75 | elif cascade_mode == 1: # adaptive->fixed 76 | self.conv_dilated = adaptive_blocks.extend(fixed_blocks) 77 | self.block_modes = [True] * blockA + [False] * blockF 78 | else: 79 | logging.error("Cascaded mode %d is not supported!" % (cascade_mode)) 80 | sys.exit(1) 81 | 82 | # convert hidden representation to output signal 83 | self.conv_last = torch.nn.ModuleList([ 84 | torch.nn.ReLU(inplace=True), 85 | Conv1d1x1(skip_channels, skip_channels, bias=True), 86 | torch.nn.ReLU(inplace=True), 87 | Conv1d1x1(skip_channels, out_channels, bias=True), 88 | ]) 89 | 90 | def forward(self, x, c, d, batch_index, ch_index): 91 | 92 | # encode to hidden representation 93 | x = self.conv_first(x) 94 | 95 | skips = 0 96 | blockA_idx = 0 97 | for f, mode in zip(self.conv_dilated, self.block_modes): 98 | if mode: # adaptive block 99 | dilation = 2 ** (blockA_idx % self.blockA_per_cycle) 100 | xP, xF = pd_indexing(x, d, dilation, batch_index, ch_index) 101 | x, h = f(x, xP, xF, c) 102 | blockA_idx += 1 103 | else: # fixed block 104 | x, h = f(x, c) 105 | skips += h 106 | skips *= math.sqrt(1.0 / len(self.conv_dilated)) 107 | 108 | # apply final layers 109 | x = skips 110 | for f in self.conv_last: 111 | x = f(x) 112 | 113 | return x -------------------------------------------------------------------------------- /usfgan/layers/residual_block.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University) 4 | # based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University) 5 | # (https://github.com/kan-bayashi/ParallelWaveGAN) 6 | # and also based on a WaveNet script by Ryuichi Yamamoto (Line) 7 | # (https://github.com/r9y9/wavenet_vocoder) 8 | # MIT License (https://opensource.org/licenses/MIT) 9 | 10 | """Quasi-Periodic Residual block module.""" 11 | 12 | import math 13 | 14 | import torch 15 | 16 | 17 | class Conv1d(torch.nn.Conv1d): 18 | """Conv1d module with customized initialization.""" 19 | 20 | def __init__(self, *args, **kwargs): 21 | """Initialize Conv1d module.""" 22 | super(Conv1d, self).__init__(*args, **kwargs) 23 | 24 | def reset_parameters(self): 25 | """Reset parameters.""" 26 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 27 | if self.bias is not None: 28 | torch.nn.init.constant_(self.bias, 0.0) 29 | 30 | 31 | class Conv1d1x1(Conv1d): 32 | """1x1 Conv1d with customized initialization.""" 33 | 34 | def __init__(self, in_channels, out_channels, bias): 35 | """Initialize 1x1 Conv1d module.""" 36 | super(Conv1d1x1, self).__init__(in_channels, out_channels, 37 | kernel_size=1, padding=0, 38 | dilation=1, bias=bias) 39 | 40 | 41 | class FixedBlock(torch.nn.Module): 42 | """Fixed block module in QPPWG.""" 43 | 44 | def __init__(self, 45 | residual_channels=64, 46 | gate_channels=128, 47 | skip_channels=64, 48 | aux_channels=80, 49 | dilation=1, 50 | bias=True, 51 | ): 52 | """Initialize Fixed ResidualBlock module. 53 | Args: 54 | residual_channels (int): Number of channels for residual connection. gate_channels (int): Number of channels for the gated convolution. 55 | skip_channels (int): Number of channels for skip connection. 56 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 57 | dilation (int): Dilation size. 58 | bias (bool): Whether to add bias parameter in convolution layers.
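        Example:
            A minimal shape-check sketch (all sizes here are illustrative, not values from the recipes):

            >>> block = FixedBlock(residual_channels=64, gate_channels=128,
            ...                    skip_channels=64, aux_channels=80, dilation=2)
            >>> x = torch.randn(1, 64, 100)
            >>> c = torch.randn(1, 80, 100)
            >>> x, s = block(x, c)  # both outputs are (1, 64, 100)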
59 | """ 60 | super(FixedBlock, self).__init__() 61 | kernel_size = 3 # fixed kernel size 62 | padding = (kernel_size - 1) // 2 * dilation 63 | 64 | # dilation conv 65 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 66 | padding=padding, dilation=dilation, bias=bias) 67 | 68 | # local conditioning 69 | if aux_channels > 0: 70 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 71 | else: 72 | self.conv1x1_aux = None 73 | 74 | # conv output is split into two groups 75 | gate_out_channels = gate_channels // 2 76 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 77 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 78 | 79 | def forward(self, x, c): 80 | """Calculate forward propagation. 81 | Args: 82 | x (Tensor): Input tensor (B, residual_channels, T). 83 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 84 | Returns: 85 | Tensor: Output tensor for residual connection (B, residual_channels, T). 86 | Tensor: Output tensor for skip connection (B, skip_channels, T). 87 | """ 88 | residual = x 89 | x = self.conv(x) 90 | 91 | # split into two part for gated activation 92 | splitdim = 1 93 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 94 | 95 | # local conditioning 96 | if c is not None: 97 | assert self.conv1x1_aux is not None 98 | c = self.conv1x1_aux(c) 99 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 100 | xa, xb = xa + ca, xb + cb 101 | 102 | x = torch.tanh(xa) * torch.sigmoid(xb) 103 | 104 | # for skip connection 105 | s = self.conv1x1_skip(x) 106 | 107 | # for residual connection 108 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 109 | 110 | return x, s 111 | 112 | 113 | class AdaptiveBlock(torch.nn.Module): 114 | """Adaptive block module in QPPWG.""" 115 | 116 | def __init__(self, 117 | residual_channels=64, 118 | gate_channels=128, 119 | skip_channels=64, 120 | aux_channels=80, 121 | bias=True, 122 | ): 123 | """Initialize Adaptive ResidualBlock module. 124 | Args: 125 | residual_channels (int): Number of channels for residual connection. 126 | skip_channels (int): Number of channels for skip connection. 127 | aux_channels (int): Local conditioning channels i.e. auxiliary input dimension. 128 | bias (bool): Whether to add bias parameter in convolution layers. 129 | """ 130 | super(AdaptiveBlock, self).__init__() 131 | 132 | # pitch-dependent dilation conv 133 | self.convP = Conv1d1x1(residual_channels, gate_channels, bias=bias) # past 134 | self.convC = Conv1d1x1(residual_channels, gate_channels, bias=bias) # current 135 | self.convF = Conv1d1x1(residual_channels, gate_channels, bias=bias) # future 136 | 137 | # local conditioning 138 | if aux_channels > 0: 139 | self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False) 140 | else: 141 | self.conv1x1_aux = None 142 | 143 | # conv output is split into two groups 144 | gate_out_channels = gate_channels // 2 145 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias) 146 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias) 147 | 148 | def forward(self, xC, xP, xF, c): 149 | """Calculate forward propagation. 150 | Args: 151 | xC (Tensor): Current input tensor (B, residual_channels, T). 152 | xP (Tensor): Past input tensor (B, residual_channels, T). 153 | xF (Tensor): Future input tensor (B, residual_channels, T). 154 | c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T). 
155 | Returns: 156 | Tensor: Output tensor for residual connection (B, residual_channels, T). 157 | Tensor: Output tensor for skip connection (B, skip_channels, T). 158 | """ 159 | residual = xC 160 | x = self.convC(xC) + self.convP(xP) + self.convF(xF) 161 | 162 | # split into two parts for gated activation 163 | splitdim = 1 164 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 165 | 166 | # local conditioning 167 | if c is not None: 168 | assert self.conv1x1_aux is not None 169 | c = self.conv1x1_aux(c) 170 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 171 | xa, xb = xa + ca, xb + cb 172 | 173 | x = torch.tanh(xa) * torch.sigmoid(xb) 174 | 175 | # for skip connection 176 | s = self.conv1x1_skip(x) 177 | 178 | # for residual connection 179 | x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5) 180 | 181 | return x, s -------------------------------------------------------------------------------- /usfgan/layers/source_network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Reo Yoneyama (Nagoya University) 4 | 5 | """Source Network module.""" 6 | 7 | import math 8 | import logging 9 | import sys  # needed by the cascade_mode error path below 10 | import torch 11 | 12 | from usfgan.layers import Conv1d1x1 13 | from usfgan.layers import FixedBlock 14 | from usfgan.layers import AdaptiveBlock 15 | from usfgan.utils import pd_indexing 16 | 17 | 18 | class SourceNetwork(torch.nn.Module): 19 | 20 | def __init__(self, 21 | sampling_rate, 22 | in_channels, 23 | out_channels, 24 | blockF, 25 | cycleF, 26 | blockA, 27 | cycleA, 28 | cascade_mode, 29 | residual_channels, 30 | gate_channels, 31 | skip_channels, 32 | aux_channels): 33 | super(SourceNetwork, self).__init__() 34 | 35 | self.sampling_rate = sampling_rate 36 | 37 | # convert sine signal to hidden representation 38 | self.conv_first = Conv1d1x1(in_channels * 2, residual_channels, bias=True) 39 | 40 | # check the number of blocks and cycles 41 | cycleA = max(cycleA, 1) 42 | cycleF = max(cycleF, 1) 43 | assert blockF % cycleF == 0 44 | blockF_per_cycle = blockF // cycleF 45 | assert blockA % cycleA == 0 46 | self.blockA_per_cycle = blockA // cycleA 47 | 48 | # define fixed residual blocks 49 | fixed_blocks = torch.nn.ModuleList() 50 | for block in range(blockF): 51 | dilation = 2 ** (block % blockF_per_cycle) 52 | conv = FixedBlock( 53 | residual_channels=residual_channels, 54 | gate_channels=gate_channels, 55 | skip_channels=skip_channels, 56 | aux_channels=aux_channels, 57 | dilation=dilation, 58 | bias=True, 59 | ) 60 | fixed_blocks += [conv] 61 | 62 | # define adaptive residual blocks 63 | adaptive_blocks = torch.nn.ModuleList() 64 | for block in range(blockA): 65 | conv = AdaptiveBlock( 66 | residual_channels=residual_channels, 67 | gate_channels=gate_channels, 68 | skip_channels=skip_channels, 69 | aux_channels=aux_channels, 70 | bias=True, 71 | ) 72 | adaptive_blocks += [conv] 73 | 74 | # define cascaded structure 75 | if cascade_mode == 0: # fixed->adaptive 76 | self.conv_dilated = fixed_blocks.extend(adaptive_blocks) 77 | self.block_modes = [False] * blockF + [True] * blockA 78 | elif cascade_mode == 1: # adaptive->fixed 79 | self.conv_dilated = adaptive_blocks.extend(fixed_blocks) 80 | self.block_modes = [True] * blockA + [False] * blockF 81 | else: 82 | logging.error("Cascaded mode %d is not supported!" % (cascade_mode))
83 | sys.exit(1) 84 | 85 | # convert hidden representation to harmonic signal 86 | self.conv_last = torch.nn.ModuleList([ 87 | torch.nn.ReLU(inplace=True), 88 | Conv1d1x1(skip_channels, skip_channels, bias=True), 89 | torch.nn.ReLU(inplace=True), 90 | Conv1d1x1(skip_channels, out_channels, bias=True), 91 | ]) 92 | 93 | def forward(self, x, f0, c, d, batch_index, ch_index): 94 | 95 | with torch.no_grad(): 96 | uv = (f0 > 0) * torch.ones_like(f0) 97 | rad_values = (f0 / self.sampling_rate) % 1 98 | v = uv * torch.sin(torch.cumsum(rad_values, dim=2) * 2 * math.pi) 99 | 100 | # encode to hidden representation 101 | x = torch.cat((x, v), dim=1) 102 | x = self.conv_first(x) 103 | 104 | skips = 0 105 | blockA_idx = 0 106 | for f, mode in zip(self.conv_dilated, self.block_modes): 107 | if mode: # adaptive block 108 | dilation = 2 ** (blockA_idx % self.blockA_per_cycle) 109 | xP, xF = pd_indexing(x, d, dilation, batch_index, ch_index) 110 | x, h = f(x, xP, xF, c) 111 | blockA_idx += 1 112 | else: # fixed block 113 | x, h = f(x, c) 114 | skips += h 115 | skips *= math.sqrt(1.0 / len(self.conv_dilated)) 116 | 117 | # apply final layers 118 | x = skips 119 | for f in self.conv_last: 120 | x = f(x) 121 | 122 | return x -------------------------------------------------------------------------------- /usfgan/layers/upsample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Upsampling module. 4 | 5 | This code is modified from https://github.com/r9y9/wavenet_vocoder. 6 | 7 | """ 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | from usfgan.layers import Conv1d 14 | 15 | 16 | class Stretch2d(torch.nn.Module): 17 | """Stretch2d module.""" 18 | 19 | def __init__(self, x_scale, y_scale, mode="nearest"): 20 | """Initialize Stretch2d module. 21 | Args: 22 | x_scale (int): X scaling factor (Time axis in spectrogram). 23 | y_scale (int): Y scaling factor (Frequency axis in spectrogram). 24 | mode (str): Interpolation mode. 25 | """ 26 | super(Stretch2d, self).__init__() 27 | self.x_scale = x_scale 28 | self.y_scale = y_scale 29 | self.mode = mode 30 | 31 | def forward(self, x): 32 | """Calculate forward propagation. 33 | Args: 34 | x (Tensor): Input tensor (B, C, F, T). 35 | Returns: 36 | Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale). 37 | """ 38 | return F.interpolate( 39 | x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) 40 | 41 | 42 | class Conv2d(torch.nn.Conv2d): 43 | """Conv2d module with customized initialization.""" 44 | 45 | def __init__(self, *args, **kwargs): 46 | """Initialize Conv2d module.""" 47 | super(Conv2d, self).__init__(*args, **kwargs) 48 | 49 | def reset_parameters(self): 50 | """Reset parameters.""" 51 | self.weight.data.fill_(1. / np.prod(self.kernel_size)) 52 | if self.bias is not None: 53 | torch.nn.init.constant_(self.bias, 0.0) 54 | 55 | 56 | class UpsampleNetwork(torch.nn.Module): 57 | """Upsampling network module.""" 58 | 59 | def __init__(self, 60 | upsample_scales, 61 | nonlinear_activation=None, 62 | nonlinear_activation_params={}, 63 | interpolate_mode="nearest", 64 | freq_axis_kernel_size=1, 65 | use_causal_conv=False, 66 | ): 67 | """Initialize upsampling network module. 68 | Args: 69 | upsample_scales (list): List of upsampling scales. 70 | nonlinear_activation (str): Activation function name. 71 | nonlinear_activation_params (dict): Arguments for specified activation function.
72 | interpolate_mode (str): Interpolation mode. 73 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. use_causal_conv (bool): Whether to use causal structure. 74 | """ 75 | super(UpsampleNetwork, self).__init__() 76 | self.use_causal_conv = use_causal_conv 77 | self.up_layers = torch.nn.ModuleList() 78 | for scale in upsample_scales: 79 | # interpolation layer 80 | stretch = Stretch2d(scale, 1, interpolate_mode) 81 | self.up_layers += [stretch] 82 | 83 | # conv layer 84 | assert (freq_axis_kernel_size - 1) % 2 == 0, "Even-numbered freq axis kernel sizes are not supported." 85 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2 86 | kernel_size = (freq_axis_kernel_size, scale * 2 + 1) 87 | if use_causal_conv: 88 | padding = (freq_axis_padding, scale * 2) 89 | else: 90 | padding = (freq_axis_padding, scale) 91 | conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) 92 | self.up_layers += [conv] 93 | 94 | # nonlinear 95 | if nonlinear_activation is not None: 96 | nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) 97 | self.up_layers += [nonlinear] 98 | 99 | def forward(self, c): 100 | """Calculate forward propagation. 101 | Args: 102 | c : Input tensor (B, C, T). 103 | Returns: 104 | Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales). 105 | """ 106 | c = c.unsqueeze(1) # (B, 1, C, T) 107 | for f in self.up_layers: 108 | if self.use_causal_conv and isinstance(f, Conv2d): 109 | c = f(c)[..., :c.size(-1)] 110 | else: 111 | c = f(c) 112 | return c.squeeze(1) # (B, C, T') 113 | 114 | 115 | class ConvInUpsampleNetwork(torch.nn.Module): 116 | """Convolution + upsampling network module.""" 117 | 118 | def __init__(self, 119 | upsample_scales, 120 | nonlinear_activation=None, 121 | nonlinear_activation_params={}, 122 | interpolate_mode="nearest", 123 | freq_axis_kernel_size=1, 124 | aux_channels=80, 125 | aux_context_window=0, 126 | use_causal_conv=False 127 | ): 128 | """Initialize convolution + upsampling network module. 129 | Args: 130 | upsample_scales (list): List of upsampling scales. 131 | nonlinear_activation (str): Activation function name. 132 | nonlinear_activation_params (dict): Arguments for specified activation function. 133 | interpolate_mode (str): Interpolation mode. 134 | freq_axis_kernel_size (int): Kernel size in the direction of frequency axis. 135 | aux_channels (int): Number of channels of pre-convolutional layer. 136 | aux_context_window (int): Context window size of the pre-convolutional layer. 137 | use_causal_conv (bool): Whether to use causal structure. 138 | """ 139 | super(ConvInUpsampleNetwork, self).__init__() 140 | self.aux_context_window = aux_context_window 141 | self.use_causal_conv = use_causal_conv and aux_context_window > 0 142 | # To capture wide-context information in conditional features 143 | kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 144 | # NOTE(kan-bayashi): Here do not use padding because the input is already padded 145 | self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) 146 | self.upsample = UpsampleNetwork( 147 | upsample_scales=upsample_scales, 148 | nonlinear_activation=nonlinear_activation, 149 | nonlinear_activation_params=nonlinear_activation_params, 150 | interpolate_mode=interpolate_mode, 151 | freq_axis_kernel_size=freq_axis_kernel_size, 152 | use_causal_conv=use_causal_conv, 153 | ) 154 | 155 | def forward(self, c): 156 | """Calculate forward propagation. 157 | Args: 158 | c : Input tensor (B, C, T').
159 | Returns: 160 | Tensor: Upsampled tensor (B, C, T), 161 | where T = (T' - aux_context_window * 2) * prod(upsample_scales). 162 | Note: 163 | The length of inputs considers the context window size. 164 | """ 165 | c_ = self.conv_in(c) 166 | c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ 167 | return self.upsample(c) -------------------------------------------------------------------------------- /usfgan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from usfgan.losses.cheaptrick import * # NOQA 2 | from usfgan.losses.stft_loss import * # NOQA 3 | from usfgan.losses.source_loss import * # NOQA -------------------------------------------------------------------------------- /usfgan/losses/cheaptrick.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Spectral envelope estimation module based on the idea of CheapTrick. 7 | Please see https://www.sciencedirect.com/science/article/pii/S0167639314000697 for details.""" 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.fft 14 | 15 | 16 | class AdaptiveWindowing(nn.Module): 17 | 18 | def __init__(self, 19 | sampling_rate, 20 | hop_size, 21 | fft_size, 22 | f0_floor, 23 | f0_ceil, 24 | ): 25 | """Initialize AdaptiveWindowing module. 26 | Args: 27 | sampling_rate (int): Sampling rate. 28 | hop_size (int): Hop size. 29 | fft_size (int): FFT size. 30 | f0_floor (int): Minimum value of F0. 31 | f0_ceil (int): Maximum value of F0. 32 | """ 33 | super(AdaptiveWindowing, self).__init__() 34 | 35 | self.sampling_rate = sampling_rate 36 | self.hop_size = hop_size 37 | self.fft_size = fft_size 38 | self.window = torch.zeros((f0_ceil+1, fft_size)).cuda() 39 | self.zero_padding = nn.ConstantPad2d((fft_size // 2, fft_size // 2, 0, 0), 0) 40 | 41 | # Pre-calculation of the window functions 42 | for f0 in range(f0_floor, f0_ceil + 1): 43 | half_win_len = round(1.5 * self.sampling_rate / f0) 44 | base_index = torch.arange(-half_win_len, half_win_len + 1, dtype=torch.int64) 45 | position = base_index / 1.5 / self.sampling_rate 46 | left = fft_size // 2 - half_win_len 47 | right = fft_size // 2 + half_win_len + 1 48 | window = torch.zeros(fft_size) 49 | window[left: right] = 0.5 * torch.cos(math.pi * position * f0) + 0.5 50 | average = torch.sum(window * window).pow(0.5) 51 | self.window[f0] = (window / average) 52 | 53 | def forward(self, x, f): 54 | """Calculate forward propagation. 55 | Args: 56 | x (Tensor): Waveform (B, T). 57 | f (Tensor): F0 sequence (B, T'). 58 | Returns: 59 | Tensor: Power spectrogram (B, T', fft_size // 2 + 1). 60 | """ 61 | # Get the matrix of window functions corresponding to F0 62 | x = self.zero_padding(x).unfold(1, self.fft_size, self.hop_size) 63 | windows = self.window[f] 64 | # Adaptive windowing and calculate power spectrogram. 65 | # In test, change x[:, : -1, :] to x. 66 | x = torch.abs(torch.fft.rfft(x[:, : -1, :] * windows)).pow(2) 67 | 68 | return x 69 | 70 | 71 | class AdaptiveLiftering(nn.Module): 72 | 73 | def __init__(self, 74 | sampling_rate, 75 | fft_size, 76 | f0_floor, 77 | f0_ceil, 78 | q1=-0.15, 79 | ): 80 | """Initialize AdaptiveLiftering module. 81 | Args: 82 | sampling_rate (int): Sampling rate. 83 | fft_size (int): FFT size. 84 | f0_floor (int): Minimum value of F0.
85 | f0_ceil (int): Maximum value of F0. 86 | q1 (float): Parameter to remove effect of adjacent harmonics. 87 | """ 88 | super(AdaptiveLiftering, self).__init__() 89 | 90 | self.sampling_rate = sampling_rate 91 | self.bin_size = fft_size // 2 + 1 92 | self.q1 = q1 93 | self.q0 = 1.0 - 2.0 * q1 94 | self.smoothing_lifter = torch.zeros((f0_ceil+1, self.bin_size)).cuda() 95 | self.compensation_lifter = torch.zeros((f0_ceil+1, self.bin_size)).cuda() 96 | 97 | # Pre-calculation of the smoothing lifters and compensation lifters 98 | for f0 in range(f0_floor, f0_ceil + 1): 99 | smoothing_lifter = torch.zeros(self.bin_size) 100 | compensation_lifter = torch.zeros(self.bin_size) 101 | quefrency = torch.arange(1, self.bin_size) / sampling_rate 102 | smoothing_lifter[0] = 1.0 103 | smoothing_lifter[1:] = torch.sin(math.pi * f0 * quefrency) / (math.pi * f0 * quefrency) 104 | compensation_lifter[0] = self.q0 + 2.0 * self.q1 105 | compensation_lifter[1:] = self.q0 + 2.0 * self.q1 * torch.cos(2.0 * math.pi * f0 * quefrency) 106 | self.smoothing_lifter[f0] = smoothing_lifter 107 | self.compensation_lifter[f0] = compensation_lifter 108 | 109 | def forward(self, x, f): 110 | """Calculate forward propagation. 111 | Args: 112 | x (Tensor): Power spectrogram (B, T', bin_size). 113 | f (Tensor): F0 sequence (B, T'). 114 | Returns: 115 | Tensor: Estimated spectral envelope (B, T', bin_size). 116 | """ 117 | # Setting the smoothing lifter and compensation lifter 118 | smoothing_lifter = self.smoothing_lifter[f] 119 | compensation_lifter = self.compensation_lifter[f] 120 | # Calculating cepstrum 121 | tmp = torch.cat((x, torch.flip(x[:, :, 1:-1], [2])), dim=2) 122 | cepstrum = torch.fft.rfft( 123 | torch.log(torch.clamp(tmp, min=1e-7)) 124 | ).real 125 | # Liftering cepstrum with the lifters 126 | liftered_cepstrum = cepstrum * smoothing_lifter * compensation_lifter 127 | # Return the result to the spectral domain 128 | x = torch.fft.irfft(liftered_cepstrum)[:, :, : self.bin_size] 129 | 130 | return x 131 | 132 | 133 | class CheapTrick(nn.Module): 134 | 135 | def __init__(self, 136 | sampling_rate, 137 | hop_size, 138 | fft_size, 139 | f0_floor, 140 | f0_ceil, 141 | uv_threshold=0, 142 | q1=-0.15, 143 | ): 144 | """Initialize CheapTrick module. 145 | Args: 146 | sampling_rate (int): Sampling rate. 147 | hop_size (int): Hop size. 148 | fft_size (int): FFT size. 149 | f0_floor (int): Minimum value of F0. 150 | f0_ceil (int): Maximum value of F0. 151 | uv_threshold (float): V/UV determining threshold. 152 | q1 (float): Parameter to remove effect of adjacent harmonics. 153 | """ 154 | super(CheapTrick, self).__init__() 155 | 156 | # fft_size must be larger than 3.0 * sampling_rate / f0_floor 157 | assert fft_size > 3.0 * sampling_rate / f0_floor 158 | self.f0_floor = f0_floor 159 | self.f0_ceil = f0_ceil 160 | self.uv_threshold = uv_threshold 161 | 162 | self.ada_wind = AdaptiveWindowing( 163 | sampling_rate, 164 | hop_size, 165 | fft_size, 166 | f0_floor, 167 | f0_ceil, 168 | ) 169 | self.ada_lift = AdaptiveLiftering( 170 | sampling_rate, 171 | fft_size, 172 | f0_floor, 173 | f0_ceil, 174 | q1, 175 | ) 176 | 177 | def forward(self, x, f): 178 | """Calculate forward propagation. 179 | Args: 180 | x (Tensor): Waveform (B, T). 181 | f (Tensor): F0 sequence (B, T'). 182 | Returns: 183 | Tensor: Estimated spectral envelope (B, T', bin_size). 184 | """ 185 | # Step0: Round F0 values to integers.
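        # Unvoiced frames (f0 <= uv_threshold) are temporarily mapped to f0_ceil
        # so that every frame can index the precomputed window/lifter tables;
        # clamping then keeps all indices inside [f0_floor, f0_ceil].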
186 | voiced = (f > self.uv_threshold) * torch.ones_like(f) 187 | f = voiced * f + (1 - voiced) * self.f0_ceil 188 | f = torch.round( 189 | torch.clamp(f, min=self.f0_floor, max=self.f0_ceil) 190 | ).to(torch.int64) 191 | # Step1: Adaptive windowing and calculate power spectrogram. 192 | x = self.ada_wind(x, f) 193 | # Step2: Smoothing (log axis) and spectral recovery on the cepstrum domain. 194 | x = self.ada_lift(x, f) 195 | 196 | return x 197 | 198 | 199 | if __name__ == "__main__": 200 | """Test of spectral envelope extraction.""" 201 | import numpy as np 202 | import pyworld as pw 203 | import soundfile as sf 204 | import librosa.display 205 | import matplotlib.pyplot as plt 206 | 207 | config = { 208 | 'sampling_rate': 16000, 209 | 'hop_size': 80, 210 | 'fft_size': 1024, 211 | 'f0_floor': 50, 212 | 'f0_ceil': 500 213 | } 214 | 215 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 216 | cheaptrick = CheapTrick(**config) 217 | cheaptrick.to(device) 218 | 219 | file_name = "../../egs/arctic/data/wav/arctic_evaluation/bdl/bdl_arctic_b0474.wav" 220 | x, sr = sf.read(file_name) 221 | x = x[:config['sampling_rate']] 222 | _f0, t = pw.dio(x, config['sampling_rate'], frame_period=config['hop_size'] * 1000 / config['sampling_rate']) 223 | f0 = pw.stonemask(x, _f0, t, config['sampling_rate']) 224 | ap = pw.d4c(x, f0, t, config['sampling_rate']) 225 | 226 | x = torch.from_numpy(np.array(x[np.newaxis, :])).clone().to(device) 227 | f0 = torch.from_numpy(np.array(f0[np.newaxis, :])).clone().to(device) 228 | sp = torch.exp(cheaptrick.forward(x, f0)) 229 | sp = sp.to('cpu').numpy().copy()[0] 230 | f0 = f0.to('cpu').numpy().copy()[0] 231 | 232 | # confirm whether the signal is resynthesized properly 233 | y = pw.synthesize(f0, sp, ap, config['sampling_rate'], config['hop_size'] * 1000 / config['sampling_rate']) 234 | save_name = 'resynthesized.wav' 235 | sf.write(save_name, y, config['sampling_rate']) 236 | 237 | # confirm whether reasonable spectral envelopes are extracted 238 | sp_db = librosa.power_to_db(sp) 239 | librosa.display.specshow(data=sp_db.T, sr=config['sampling_rate'], 240 | hop_length=config['hop_size'], y_axis='linear', x_axis='time') 241 | plt.colorbar(format="%+2.f dB") 242 | save_name = 'spectrogram.png' 243 | plt.savefig(save_name) 244 | -------------------------------------------------------------------------------- /usfgan/losses/source_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Reo Yoneyama (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Source Excitation Spectral Envelope Regularization Loss module.""" 7 | 8 | import numpy as np 9 | import pyworld 10 | 11 | import torch 12 | import torch.fft 13 | import torch.nn as nn 14 | 15 | from usfgan.losses import CheapTrick 16 | 17 | 18 | class SourceLoss(torch.nn.Module): 19 | 20 | def __init__(self, 21 | sampling_rate, 22 | hop_size, 23 | fft_size, 24 | f0_floor, 25 | f0_ceil, 26 | uv_threshold=0, 27 | q1=-0.15): 28 | """Initialize source loss module. 29 | Args: 30 | sampling_rate (int): Sampling rate. 31 | hop_size (int): Hop size. 32 | fft_size (int): FFT size. 33 | f0_floor (int): Minimum F0 value. 34 | f0_ceil (int): Maximum F0 value. 35 | uv_threshold (float): V/UV determining threshold. 36 | q1 (float): Parameter to remove effect of adjacent harmonics.
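        Example:
            A minimal sketch (the values are illustrative; a CUDA device is required
            because the internal CheapTrick tables are allocated with .cuda()):

            >>> criterion = SourceLoss(16000, 80, 1024, 50, 500)
            >>> s = torch.randn(1, 16000).cuda()
            >>> f0 = torch.full((1, 200), 120.0).cuda()
            >>> loss = criterion(s, f0)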
37 | """ 38 | super(SourceLoss, self).__init__() 39 | 40 | self.cheaptrick = CheapTrick(sampling_rate=sampling_rate, 41 | hop_size=hop_size, 42 | fft_size=fft_size, 43 | f0_floor=f0_floor, 44 | f0_ceil=f0_ceil, 45 | uv_threshold=uv_threshold, 46 | q1=q1) 47 | self.loss = nn.MSELoss() 48 | 49 | def forward(self, s, f): 50 | """Calculate forward propagation. 51 | Args: 52 | s (Tensor): Predicted source signal (B, T). 53 | f (Tensor): Extracted F0 sequence (B, T'). 54 | Returns: 55 | source_loss (Tensor): Source loss value. 56 | """ 57 | e = self.cheaptrick.forward(s, f) 58 | source_loss = self.loss(e, e.new_zeros(e.size())) 59 | 60 | return source_loss 61 | -------------------------------------------------------------------------------- /usfgan/losses/stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Reo Yoneyama (Nagoya University) 4 | # based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University) 5 | # (https://github.com/kan-bayashi/ParallelWaveGAN) 6 | # MIT License (https://opensource.org/licenses/MIT) 7 | 8 | """STFT-based Loss modules.""" 9 | 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | 14 | class STFTLoss(torch.nn.Module): 15 | """STFT loss module.""" 16 | 17 | def __init__(self, fft_size, hop_size, win_length, window): 18 | """Initialize STFT loss module.""" 19 | super(STFTLoss, self).__init__() 20 | self.fft_size = fft_size 21 | self.hop_size = hop_size 22 | self.win_length = win_length 23 | self.window = getattr(torch, window)(win_length).cuda() 24 | self.mse_loss = torch.nn.MSELoss() 25 | 26 | def forward(self, x, y): 27 | """Calculate forward propagation. 28 | Args: 29 | x (Tensor): Predicted signal (B, T). 30 | y (Tensor): Groundtruth signal (B, T). 31 | Returns: 32 | Tensor: Logarithmic power STFT loss value. 33 | """ 34 | 35 | x_stft = torch.stft(x, self.fft_size, self.hop_size, self.win_length, 36 | window=self.window, onesided=True, pad_mode="constant") 37 | y_stft = torch.stft(y, self.fft_size, self.hop_size, self.win_length, 38 | window=self.window, onesided=True, pad_mode="constant") 39 | 40 | x_log_pow = torch.log(torch.clamp(torch.norm(x_stft, 2, -1).pow(2), min=1e-7)) 41 | y_log_pow = torch.log(torch.clamp(torch.norm(y_stft, 2, -1).pow(2), min=1e-7)) 42 | 43 | stft_loss = self.mse_loss(x_log_pow, y_log_pow) 44 | 45 | return stft_loss 46 | 47 | 48 | class MultiResolutionSTFTLoss(torch.nn.Module): 49 | 50 | def __init__(self, 51 | fft_sizes=[512, 128, 2048], 52 | hop_sizes=[80, 40, 640], 53 | win_lengths=[320, 80, 1920], 54 | window="hann_window"): 55 | """Initialize source loss module. 56 | Args: 57 | fft_sizes (int): FFT size. 58 | hop_sizes (int): Hop size. 59 | win_lengths (int): Window length. 60 | window (str): Window function type. 
61 | """ 62 | super(MultiResolutionSTFTLoss, self).__init__() 63 | 64 | self.fft_sizes = fft_sizes 65 | self.hop_sizes = hop_sizes 66 | self.win_lengths = win_lengths 67 | self.window = window 68 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 69 | 70 | self.stft_losses = torch.nn.ModuleList() 71 | for fs, hs, wl in zip(fft_sizes, hop_sizes, win_lengths): 72 | self.stft_losses += [STFTLoss(fs, hs, wl, window)] 73 | 74 | def forward(self, x, y): 75 | 76 | stft_loss = 0.0 77 | # multi resolution stft loss 78 | for f in self.stft_losses: 79 | l = f(x, y) 80 | stft_loss += l 81 | stft_loss /= len(self.stft_losses) 82 | 83 | return stft_loss -------------------------------------------------------------------------------- /usfgan/models/__init__.py: -------------------------------------------------------------------------------- 1 | from usfgan.models.usfgan import * # NOQA -------------------------------------------------------------------------------- /usfgan/models/usfgan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Reo Yoneyama (Nagoya University) 4 | # based on a Quasi-Periodic Parallel WaveGAN script by Yi-Chiao Wu (Nagoya University) 5 | # (https://github.com/bigpon/QPPWG) 6 | # and also based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University) 7 | # (https://github.com/kan-bayashi/ParallelWaveGAN) 8 | # MIT License (https://opensource.org/licenses/MIT) 9 | 10 | """Unified Source-Filter GAN Modules.""" 11 | 12 | import sys 13 | import logging 14 | import math 15 | 16 | import torch 17 | 18 | from usfgan.layers import Conv1d 19 | from usfgan.layers import Conv1d1x1 20 | from usfgan.layers import FixedBlock 21 | from usfgan.layers import AdaptiveBlock 22 | from usfgan.layers import upsample 23 | from usfgan.layers import SourceNetwork 24 | from usfgan.layers import FilterNetwork 25 | from usfgan.utils import pd_indexing, index_initial 26 | 27 | 28 | class USFGANGenerator(torch.nn.Module): 29 | """uSFGAN Generator module.""" 30 | 31 | def __init__(self, 32 | sampling_rate, 33 | hop_size, 34 | in_channels, 35 | out_channels, 36 | blockFs, 37 | cycleFs, 38 | blockAs, 39 | cycleAs, 40 | cascade_modes, 41 | residual_channels=64, 42 | gate_channels=128, 43 | skip_channels=64, 44 | aux_channels=80, 45 | aux_context_window=2, 46 | upsample_params={"upsample_scales": [4, 2, 5, 2]}): 47 | """Initialize uSFGAN Generator module. 48 | 49 | Args: 50 | in_channels (int): Number of input channels. 51 | out_channels (int): Number of output channels. 52 | kernel_size (int): Kernel size of dilated convolution. 53 | residual_channels (int): Number of channels in residual conv. 54 | gate_channels (int): Number of channels in gated conv. 55 | skip_channels (int): Number of channels in skip conv. 56 | aux_channels (int): Number of channels for auxiliary feature conv. 57 | aux_context_window (int): Context window size for auxiliary feature. 58 | dropout (float): Dropout rate. 0.0 means no dropout applied. 59 | bias (bool): Whether to use bias parameter in conv layer. 60 | use_weight_norm (bool): Whether to use weight norm. 61 | If set to true, it will be applied to all of the conv layers. 62 | use_causal_conv (bool): Whether to use causal structure. 63 | upsample_conditional_features (bool): Whether to use upsampling network. 64 | upsample_net (str): Upsampling network architecture. 65 | upsample_params (dict): Upsampling network parameters. 
/usfgan/models/usfgan.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2021 Reo Yoneyama (Nagoya University)
4 | # based on a Quasi-Periodic Parallel WaveGAN script by Yi-Chiao Wu (Nagoya University)
5 | # (https://github.com/bigpon/QPPWG)
6 | # and also based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University)
7 | # (https://github.com/kan-bayashi/ParallelWaveGAN)
8 | # MIT License (https://opensource.org/licenses/MIT)
9 | 
10 | """Unified Source-Filter GAN Modules."""
11 | 
12 | import sys
13 | import logging
14 | import math
15 | 
16 | import torch
17 | 
18 | from usfgan.layers import Conv1d
19 | from usfgan.layers import Conv1d1x1
20 | from usfgan.layers import FixedBlock
21 | from usfgan.layers import AdaptiveBlock
22 | from usfgan.layers import upsample
23 | from usfgan.layers import SourceNetwork
24 | from usfgan.layers import FilterNetwork
25 | from usfgan.utils import pd_indexing, index_initial
26 | 
27 | 
28 | class USFGANGenerator(torch.nn.Module):
29 |     """uSFGAN Generator module."""
30 | 
31 |     def __init__(self,
32 |                  sampling_rate,
33 |                  hop_size,
34 |                  in_channels,
35 |                  out_channels,
36 |                  blockFs,
37 |                  cycleFs,
38 |                  blockAs,
39 |                  cycleAs,
40 |                  cascade_modes,
41 |                  residual_channels=64,
42 |                  gate_channels=128,
43 |                  skip_channels=64,
44 |                  aux_channels=80,
45 |                  aux_context_window=2,
46 |                  upsample_params={"upsample_scales": [4, 2, 5, 2]}):
47 |         """Initialize uSFGAN Generator module.
48 | 
49 |         Args:
50 |             sampling_rate (int): Sampling rate of the generated waveform.
51 |             hop_size (int): Hop size of the auxiliary features.
52 |             in_channels (int): Number of input channels.
53 |             out_channels (int): Number of output channels.
54 |             blockFs (list): Numbers of fixed blocks (source network: index 0, filter network: index 1).
55 |             cycleFs (list): Numbers of fixed cycles for the two networks.
56 |             blockAs (list): Numbers of adaptive blocks for the two networks.
57 |             cycleAs (list): Numbers of adaptive cycles for the two networks.
58 |             cascade_modes (list): Cascade modes of the source and filter networks.
59 |             residual_channels (int): Number of channels in residual conv.
60 |             gate_channels (int): Number of channels in gated conv.
61 |             skip_channels (int): Number of channels in skip conv.
62 |             aux_channels (int): Number of channels for auxiliary feature conv.
63 |             aux_context_window (int): Context window size for auxiliary feature.
64 |             upsample_params (dict): Upsampling network parameters.
65 | 
66 | 
67 |         """
68 |         super(USFGANGenerator, self).__init__()
69 | 
70 |         torch.manual_seed(1)
71 |         self.in_channels = in_channels
72 |         self.out_channels = out_channels
73 |         self.residual_channels = residual_channels
74 |         self.aux_channels = aux_channels
75 |         self.aux_context_window = aux_context_window
76 | 
77 |         # define upsampling networks
78 |         self.upsample_net_f0 = torch.nn.Upsample(scale_factor=hop_size)
79 |         # copy before updating to avoid mutating the shared default argument
80 |         upsample_params = dict(upsample_params, aux_channels=aux_channels,
81 |                                aux_context_window=aux_context_window)
82 | 
83 |         self.upsample_net = getattr(upsample, "ConvInUpsampleNetwork")(**upsample_params)
84 | 
85 |         self.source_network = SourceNetwork(sampling_rate,
86 |                                             in_channels,
87 |                                             out_channels,
88 |                                             blockFs[0],
89 |                                             cycleFs[0],
90 |                                             blockAs[0],
91 |                                             cycleAs[0],
92 |                                             cascade_modes[0],
93 |                                             residual_channels,
94 |                                             gate_channels,
95 |                                             skip_channels,
96 |                                             aux_channels,)
97 | 
98 |         self.filter_network = FilterNetwork(in_channels,
99 |                                             out_channels,
100 |                                             blockFs[1],
101 |                                             cycleFs[1],
102 |                                             blockAs[1],
103 |                                             cycleAs[1],
104 |                                             cascade_modes[1],
105 |                                             residual_channels,
106 |                                             gate_channels,
107 |                                             skip_channels,
108 |                                             aux_channels,)
109 | 
110 |     def forward(self, x, f, c, d):
111 |         """Calculate forward propagation.
112 | 
113 |         Args:
114 |             x (Tensor): Input noise signal (B, 1, T).
115 |             f (Tensor): F0 (B, C, T').
116 |             c (Tensor): Local conditioning auxiliary features (B, C, T').
117 |             d (Tensor): Input pitch-dependent dilated factors (B, 1, T).
118 | 
119 |         Returns:
120 |             Tuple[Tensor, Tensor, Tensor]: Output waveform (B, out_channels, T),
121 |                 source excitation signal, and the input F0.
122 |         """
123 |         # index initialization
124 |         batch_index, ch_index = index_initial(x.size(0), self.residual_channels)
125 | 
126 |         # perform upsampling
127 |         f_ = self.upsample_net_f0(f)
128 |         assert f_.size(-1) == x.size(-1)
129 |         c = self.upsample_net(c)
130 |         assert c.size(-1) == x.size(-1)
131 | 
132 |         # generate source signals
133 |         s = self.source_network(x, f_, c, d, batch_index, ch_index)
134 | 
135 |         # spectral filter
136 |         x = self.filter_network(s, c, d, batch_index, ch_index)
137 | 
138 |         return x, s, f
139 | 
140 |     def remove_weight_norm(self):
141 |         """Remove weight normalization from all of the layers."""
142 |         def _remove_weight_norm(m):
143 |             try:
144 |                 logging.debug(f"Weight norm is removed from {m}.")
145 |                 torch.nn.utils.remove_weight_norm(m)
146 |             except ValueError:  # this module didn't have weight norm
147 |                 return
148 | 
149 |         self.apply(_remove_weight_norm)
150 | 
151 |     def apply_weight_norm(self):
152 |         """Apply weight normalization to all of the conv layers."""
153 |         def _apply_weight_norm(m):
154 |             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
155 |                 torch.nn.utils.weight_norm(m)
156 |                 logging.debug(f"Weight norm is applied to {m}.")
157 | 
158 |         self.apply(_apply_weight_norm)
159 | 
160 | 
161 | class PWGDiscriminator(torch.nn.Module):
162 |     """Parallel WaveGAN Discriminator module."""
163 | 
164 |     def __init__(self,
165 |                  in_channels=1,
166 |                  out_channels=1,
167 |                  kernel_size=3,
168 |                  layers=10,
169 |                  conv_channels=64,
170 |                  dilation_factor=1,
171 |                  nonlinear_activation="LeakyReLU",
172 |                  nonlinear_activation_params={"negative_slope": 0.2},
173 |                  bias=True,
174 |                  use_weight_norm=True,
175 |                  ):
176 |         """Initialize Parallel WaveGAN Discriminator module.
177 |         Args:
178 |             in_channels (int): Number of input channels.
179 |             out_channels (int): Number of output channels.
180 |             kernel_size (int): Kernel size of conv layers.
181 |             layers (int): Number of conv layers.
182 |             conv_channels (int): Number of channels in conv layers.
183 |             dilation_factor (int): Dilation factor. For example, if dilation_factor = 2,
184 |                 the dilation will be 2, 4, 8, ..., and so on.
185 |             nonlinear_activation (str): Nonlinear function after each conv.
186 |             nonlinear_activation_params (dict): Nonlinear function parameters.
187 |             bias (bool): Whether to use bias parameter in conv.
188 |             use_weight_norm (bool): Whether to use weight norm.
189 |                 If set to true, it will be applied to all of the conv layers.
190 |         """
191 |         super(PWGDiscriminator, self).__init__()
192 |         assert (kernel_size - 1) % 2 == 0, "Even kernel sizes are not supported."
193 |         assert dilation_factor > 0, "Dilation factor must be > 0."
194 |         self.conv_layers = torch.nn.ModuleList()
195 |         conv_in_channels = in_channels
196 |         for i in range(layers - 1):
197 |             if i == 0:
198 |                 dilation = 1
199 |             else:
200 |                 dilation = i if dilation_factor == 1 else dilation_factor ** i
201 |                 conv_in_channels = conv_channels
202 |             padding = (kernel_size - 1) // 2 * dilation
203 |             conv_layer = [
204 |                 Conv1d(conv_in_channels, conv_channels,
205 |                        kernel_size=kernel_size, padding=padding,
206 |                        dilation=dilation, bias=bias),
207 |                 getattr(torch.nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params)
208 |             ]
209 |             self.conv_layers += conv_layer
210 |         padding = (kernel_size - 1) // 2
211 |         conv_last_layer = Conv1d(
212 |             conv_in_channels, out_channels,
213 |             kernel_size=kernel_size, padding=padding, bias=bias)
214 |         self.conv_layers += [conv_last_layer]
215 | 
216 |         # apply weight norm
217 |         if use_weight_norm:
218 |             self.apply_weight_norm()
219 | 
220 |     def forward(self, x):
221 |         """Calculate forward propagation.
222 |         Args:
223 |             x (Tensor): Input waveform signal (B, 1, T).
224 |         Returns:
225 |             Tensor: Output tensor (B, 1, T).
226 |         """
227 |         for f in self.conv_layers:
228 |             x = f(x)
229 |         return x
230 | 
231 |     def apply_weight_norm(self):
232 |         """Apply weight normalization to all of the conv layers."""
233 |         def _apply_weight_norm(m):
234 |             if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
235 |                 torch.nn.utils.weight_norm(m)
236 |                 logging.debug(f"Weight norm is applied to {m}.")
237 | 
238 |         self.apply(_apply_weight_norm)
239 | 
240 |     def remove_weight_norm(self):
241 |         """Remove weight normalization from all of the layers."""
242 |         def _remove_weight_norm(m):
243 |             try:
244 |                 logging.debug(f"Weight norm is removed from {m}.")
245 |                 torch.nn.utils.remove_weight_norm(m)
246 |             except ValueError:  # this module didn't have weight norm
247 |                 return
248 | 
249 |         self.apply(_remove_weight_norm)
250 | 
--------------------------------------------------------------------------------
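
A shape-level sketch for the two modules above (not from the repository). The generator is normally built from the experiment YAML by train.py / decode.py, so only the discriminator, whose defaults are self-contained, is instantiated here; the commented checkpoint layout follows the ParallelWaveGAN convention and is an assumption.

```python
import torch

disc = PWGDiscriminator()     # defaults: 10 layers, 64 channels, kernel size 3
y = torch.randn(2, 1, 16000)  # (B, 1, T) waveform batch
scores = disc(y)              # -> (B, 1, T) per-sample scores

# generator = USFGANGenerator(**config["generator_params"])  # hypothetical config dict
# generator.load_state_dict(
#     torch.load("checkpoint-400000steps.pkl", map_location="cpu")["model"]["generator"])
# generator.remove_weight_norm()  # fold weight norm before inference
# generator.eval()
```
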
/usfgan/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from torch.optim import * # NOQA
2 | from usfgan.optimizers.radam import * # NOQA
3 | 
--------------------------------------------------------------------------------
/usfgan/optimizers/radam.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """RAdam optimizer.
4 | This code is derived from https://github.com/LiyuanLucasLiu/RAdam.
5 | """
6 | 
7 | import math
8 | import torch
9 | 
10 | from torch.optim.optimizer import Optimizer
11 | 
12 | 
13 | class RAdam(Optimizer):
14 |     """Rectified Adam optimizer."""
15 | 
16 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
17 |         """Initialize RAdam optimizer."""
18 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
19 |         self.buffer = [[None, None, None] for ind in range(10)]
20 |         super(RAdam, self).__init__(params, defaults)
21 | 
22 |     def __setstate__(self, state):
23 |         """Set state."""
24 |         super(RAdam, self).__setstate__(state)
25 | 
26 |     def step(self, closure=None):
27 |         """Run one step."""
28 |         loss = None
29 |         if closure is not None:
30 |             loss = closure()
31 | 
32 |         for group in self.param_groups:
33 | 
34 |             for p in group['params']:
35 |                 if p.grad is None:
36 |                     continue
37 |                 grad = p.grad.data.float()
38 |                 if grad.is_sparse:
39 |                     raise RuntimeError('RAdam does not support sparse gradients')
40 | 
41 |                 p_data_fp32 = p.data.float()
42 | 
43 |                 state = self.state[p]
44 | 
45 |                 if len(state) == 0:
46 |                     state['step'] = 0
47 |                     state['exp_avg'] = torch.zeros_like(p_data_fp32)
48 |                     state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
49 |                 else:
50 |                     state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
51 |                     state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
52 | 
53 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
54 |                 beta1, beta2 = group['betas']
55 | 
56 |                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
57 |                 exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
58 | 
59 |                 state['step'] += 1
60 |                 buffered = self.buffer[int(state['step'] % 10)]
61 |                 if state['step'] == buffered[0]:
62 |                     N_sma, step_size = buffered[1], buffered[2]
63 |                 else:
64 |                     buffered[0] = state['step']
65 |                     beta2_t = beta2 ** state['step']
66 |                     N_sma_max = 2 / (1 - beta2) - 1
67 |                     N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
68 |                     buffered[1] = N_sma
69 | 
70 |                     # more conservative since it's an approximated value
71 |                     if N_sma >= 5:
72 |                         step_size = math.sqrt(
73 |                             (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])  # NOQA
74 |                     else:
75 |                         step_size = 1.0 / (1 - beta1 ** state['step'])
76 |                     buffered[2] = step_size
77 | 
78 |                 if group['weight_decay'] != 0:
79 |                     p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
80 | 
81 |                 # more conservative since it's an approximated value
82 |                 if N_sma >= 5:
83 |                     denom = exp_avg_sq.sqrt().add_(group['eps'])
84 |                     p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
85 |                 else:
86 |                     p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
87 | 
88 |                 p.data.copy_(p_data_fp32)
89 | 
90 |         return loss
--------------------------------------------------------------------------------
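
optimizers/__init__.py above re-exports `torch.optim` together with this `RAdam`, so a training config can select an optimizer by class name. A direct usage sketch with a toy module (hyperparameters are placeholders):

```python
import torch
import torch.nn as nn

model = nn.Conv1d(1, 1, kernel_size=3, padding=1)
optimizer = RAdam(model.parameters(), lr=1e-4)

loss = model(torch.randn(8, 1, 100)).pow(2).mean()
loss.backward()
# The rectified update kicks in once the variance estimate is tractable
# (N_sma >= 5); earlier steps fall back to an unadapted update.
optimizer.step()
optimizer.zero_grad()
```
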
5 | """ 6 | 7 | import math 8 | import torch 9 | 10 | from torch.optim.optimizer import Optimizer 11 | 12 | 13 | class RAdam(Optimizer): 14 | """Rectified Adam optimizer.""" 15 | 16 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 17 | """Initilize RAdam optimizer.""" 18 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 19 | self.buffer = [[None, None, None] for ind in range(10)] 20 | super(RAdam, self).__init__(params, defaults) 21 | 22 | def __setstate__(self, state): 23 | """Set state.""" 24 | super(RAdam, self).__setstate__(state) 25 | 26 | def step(self, closure=None): 27 | """Run one step.""" 28 | loss = None 29 | if closure is not None: 30 | loss = closure() 31 | 32 | for group in self.param_groups: 33 | 34 | for p in group['params']: 35 | if p.grad is None: 36 | continue 37 | grad = p.grad.data.float() 38 | if grad.is_sparse: 39 | raise RuntimeError('RAdam does not support sparse gradients') 40 | 41 | p_data_fp32 = p.data.float() 42 | 43 | state = self.state[p] 44 | 45 | if len(state) == 0: 46 | state['step'] = 0 47 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 48 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 49 | else: 50 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 51 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 52 | 53 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 54 | beta1, beta2 = group['betas'] 55 | 56 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 57 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 58 | 59 | state['step'] += 1 60 | buffered = self.buffer[int(state['step'] % 10)] 61 | if state['step'] == buffered[0]: 62 | N_sma, step_size = buffered[1], buffered[2] 63 | else: 64 | buffered[0] = state['step'] 65 | beta2_t = beta2 ** state['step'] 66 | N_sma_max = 2 / (1 - beta2) - 1 67 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 68 | buffered[1] = N_sma 69 | 70 | # more conservative since it's an approximated value 71 | if N_sma >= 5: 72 | step_size = math.sqrt( 73 | (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA 74 | else: 75 | step_size = 1.0 / (1 - beta1 ** state['step']) 76 | buffered[2] = step_size 77 | 78 | if group['weight_decay'] != 0: 79 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 80 | 81 | # more conservative since it's an approximated value 82 | if N_sma >= 5: 83 | denom = exp_avg_sq.sqrt().add_(group['eps']) 84 | p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) 85 | else: 86 | p_data_fp32.add_(-step_size * group['lr'], exp_avg) 87 | 88 | p.data.copy_(p_data_fp32) 89 | 90 | return loss -------------------------------------------------------------------------------- /usfgan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from usfgan.utils.utils import * # NOQA 2 | from usfgan.utils.filters import * # NOQA 3 | from usfgan.utils.features import * # NOQA 4 | from usfgan.utils.index import * # NOQA 5 | -------------------------------------------------------------------------------- /usfgan/utils/features.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Feature-related functions.""" 7 | 8 | import numpy as np 9 | 10 | 11 | def validate_length(x, y, 
hop_size=None): 12 | """ Validate length 13 | Args: 14 | x (ndarray): numpy array with x.shape[0] = len_x 15 | y (ndarray): numpy array with y.shape[0] = len_y 16 | hop_size (int): upsampling factor 17 | Returns: 18 | (ndarray): length adjusted x with same length y 19 | (ndarray): length adjusted y with same length x 20 | """ 21 | if hop_size is None: 22 | if x.shape[0] < y.shape[0]: 23 | y = y[:x.shape[0]] 24 | if x.shape[0] > y.shape[0]: 25 | x = x[:y.shape[0]] 26 | assert len(x) == len(y) 27 | else: 28 | if x.shape[0] > y.shape[0] * hop_size: 29 | x = x[:y.shape[0] * hop_size] 30 | if x.shape[0] < y.shape[0] * hop_size: 31 | mod_y = y.shape[0] * hop_size - x.shape[0] 32 | mod_y_frame = mod_y // hop_size + 1 33 | y = y[:-mod_y_frame] 34 | x = x[:y.shape[0] * hop_size] 35 | assert len(x) == len(y) * hop_size 36 | 37 | return x, y 38 | 39 | 40 | def batch_f0(h, f0_threshold=0, f0_cont=True, f0_idx=1, uv_idx=0): 41 | """ load f0 42 | Args: 43 | h (ndarray): the auxiliary acoustic features (T x D) 44 | f0_threshold (float): the lower bound of pitch 45 | f0_cont (bool): True: return continuous f0; False return discrete f0 46 | f0_idx: the dimension index of f0 47 | uv_idx: the dimension index of U/V 48 | Return: 49 | f0(ndarray): 50 | float array of the f0 sequence (T) 51 | """ 52 | if (f0_idx < 0) or (uv_idx < 0): 53 | f0 = np.zeros(h.shape[0]) 54 | else: 55 | f0 = h[:, f0_idx].copy(order='C') 56 | f0[f0 < f0_threshold] = f0_threshold 57 | if not f0_cont: 58 | uv = h[:, uv_idx].copy(order='C') # voice/unvoice feature 59 | f0[uv == 0] = 0 60 | 61 | return f0 62 | 63 | 64 | def dilated_factor(batch_f0, fs, dense_factor): 65 | """Pitch-dependent dilated factor 66 | Args: 67 | batch_f0 (ndarray): the f0 sequence (T) 68 | fs (int): sampling rate 69 | dense_factor (int): the number of taps in one cycle 70 | Return: 71 | dilated_factors(np array): 72 | float array of the pitch-dependent dilated factors (T) 73 | """ 74 | batch_f0[batch_f0 == 0] = fs / dense_factor 75 | dilated_factors = np.ones(batch_f0.shape) * fs 76 | dilated_factors /= batch_f0 77 | dilated_factors /= dense_factor 78 | assert np.all(dilated_factors > 0) 79 | 80 | return dilated_factors 81 | -------------------------------------------------------------------------------- /usfgan/utils/filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University) 4 | # based on a WaveNet script by Tomoki Hayashi (Nagoya University) 5 | # (https://github.com/kan-bayashi/PytorchWaveNetVocoder) 6 | # based on sprocket-vc script by Kazuhiro Kobayashi (Nagoya University) 7 | # (https://github.com/k2kobayashi/sprocket) 8 | # MIT License (https://opensource.org/licenses/MIT) 9 | 10 | """Filters.""" 11 | 12 | import numpy as np 13 | from scipy.signal import firwin 14 | from scipy.signal import lfilter 15 | 16 | NUMTAPS = 255 17 | 18 | 19 | def low_cut_filter(x, fs, cutoff=70): 20 | """ Low-cut filter 21 | Args: 22 | x (ndarray): Waveform sequence 23 | fs (int): Sampling frequency 24 | cutoff (float): Cutoff frequency of low cut filter 25 | Return: 26 | (ndarray): Low cut filtered waveform sequence 27 | """ 28 | nyquist = fs // 2 29 | norm_cutoff = cutoff / nyquist 30 | numtaps = NUMTAPS 31 | fil = firwin(numtaps, norm_cutoff, pass_zero=False) 32 | lcf_x = lfilter(fil, 1, x) 33 | 34 | return lcf_x 35 | 36 | 37 | def low_pass_filter(x, fs, cutoff=70): 38 | """ Low-pass filter 39 | Args: 40 | x (ndarray): Waveform sequence 41 | fs (int): 
Sampling frequency 42 | cutoff (float): Cutoff frequency of low pass filter 43 | Return: 44 | (ndarray): Low pass filtered waveform sequence 45 | """ 46 | nyquist = fs // 2 47 | norm_cutoff = cutoff / nyquist 48 | numtaps = NUMTAPS 49 | fil = firwin(numtaps, norm_cutoff, pass_zero=True) 50 | x_pad = np.pad(x, (numtaps, numtaps), 'edge') 51 | lpf_x = lfilter(fil, 1, x_pad) 52 | lpf_x = lpf_x[numtaps + numtaps // 2: -numtaps // 2] 53 | 54 | return lpf_x 55 | -------------------------------------------------------------------------------- /usfgan/utils/index.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University) 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Indexing-related functions.""" 7 | 8 | import torch 9 | from torch.nn import ConstantPad1d as pad1d 10 | 11 | 12 | def pd_indexing(x, d, dilation, 13 | batch_index, ch_index): 14 | """Pitch-dependent indexing of past and future samples. 15 | 16 | Args: 17 | x (Tensor): Input feature map (B, C, T). 18 | d (Tensor): Input pitch-dependent dilated factors (B, 1, T). 19 | dilation (Int): Dilation size. 20 | batch_index (Tensor): Batch index 21 | ch_index (Tensor): Channel index 22 | 23 | Returns: 24 | Tensor: Past output tensor (B, out_channels, T) 25 | Tensor: Future output tensor (B, out_channels, T) 26 | 27 | """ 28 | (_, _, batch_length) = d.size() 29 | dilations = d * dilation 30 | 31 | # get past index 32 | idxP = torch.arange(-batch_length, 0).float() 33 | if torch.cuda.is_available(): 34 | idxP = idxP.cuda() 35 | idxP = torch.add(-dilations, idxP) 36 | idxP = idxP.round().long() 37 | maxP = -((torch.min(idxP) + batch_length)) 38 | assert maxP >= 0 39 | idxP = (batch_index, ch_index, idxP) 40 | # padding past tensor 41 | xP = pad1d((maxP, 0), 0)(x) 42 | 43 | # get future index 44 | idxF = torch.arange(0, batch_length).float() 45 | if torch.cuda.is_available(): 46 | idxF = idxF.cuda() 47 | idxF = torch.add(dilations, idxF) 48 | idxF = idxF.round().long() 49 | maxF = torch.max(idxF) - (batch_length - 1) 50 | assert maxF >= 0 51 | idxF = (batch_index, ch_index, idxF) 52 | # padding future tensor 53 | xF = pad1d((0, maxF), 0)(x) 54 | 55 | return xP[idxP], xF[idxF] 56 | 57 | 58 | def index_initial(n_batch, n_ch, tensor=True): 59 | """Tensor batch and channel index initialization. 60 | 61 | Args: 62 | n_batch (Int): Number of batch. 63 | n_ch (Int): Number of channel. 
64 |         tensor (bool): Return tensor or numpy array
65 | 
66 |     Returns:
67 |         Tensor: Batch index
68 |         Tensor: Channel index
69 | 
70 |     """
71 |     batch_index = []
72 |     for i in range(n_batch):
73 |         batch_index.append([[i]] * n_ch)
74 |     ch_index = []
75 |     for i in range(n_ch):
76 |         ch_index += [[i]]
77 |     ch_index = [ch_index] * n_batch
78 | 
79 |     if tensor:
80 |         batch_index = torch.tensor(batch_index)
81 |         ch_index = torch.tensor(ch_index)
82 |         if torch.cuda.is_available():
83 |             batch_index = batch_index.cuda()
84 |             ch_index = ch_index.cuda()
85 |     return batch_index, ch_index
86 | 
--------------------------------------------------------------------------------
/usfgan/utils/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Copyright 2020 Yi-Chiao Wu (Nagoya University)
4 | # based on a Parallel WaveGAN script by Tomoki Hayashi (Nagoya University)
5 | # (https://github.com/kan-bayashi/ParallelWaveGAN)
6 | # MIT License (https://opensource.org/licenses/MIT)
7 | 
8 | """Utility functions."""
9 | 
10 | import fnmatch
11 | import logging
12 | import os
13 | import sys
14 | 
15 | import h5py
16 | import numpy as np
17 | 
18 | 
19 | def find_files(root_dir, query="*.wav", include_root_dir=True):
20 |     """Find files recursively.
21 | 
22 |     Args:
23 |         root_dir (str): Root directory to search.
24 |         query (str): Filename query to match.
25 |         include_root_dir (bool): If False, root_dir name is not included.
26 | 
27 |     Returns:
28 |         list: List of found filenames.
29 | 
30 |     """
31 |     files = []
32 |     for root, dirnames, filenames in os.walk(root_dir, followlinks=True):
33 |         for filename in fnmatch.filter(filenames, query):
34 |             files.append(os.path.join(root, filename))
35 |     if not include_root_dir:
36 |         files = [file_.replace(root_dir + "/", "") for file_ in files]
37 | 
38 |     return files
39 | 
40 | 
41 | def read_hdf5(hdf5_name, hdf5_path):
42 |     """Read hdf5 dataset.
43 | 
44 |     Args:
45 |         hdf5_name (str): Filename of hdf5 file.
46 |         hdf5_path (str): Dataset name in hdf5 file.
47 | 
48 |     Return:
49 |         any: Dataset values.
50 | 
51 |     """
52 |     if not os.path.exists(hdf5_name):
53 |         logging.error(f"There is no such hdf5 file ({hdf5_name}).")
54 |         sys.exit(1)
55 | 
56 |     hdf5_file = h5py.File(hdf5_name, "r")
57 | 
58 |     if hdf5_path not in hdf5_file:
59 |         logging.error(f"There is no such data in the hdf5 file. ({hdf5_path})")
60 |         sys.exit(1)
61 | 
62 |     hdf5_data = hdf5_file[hdf5_path][()]
63 |     hdf5_file.close()
64 | 
65 |     return hdf5_data
66 | 
67 | 
68 | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
69 |     """Write dataset to hdf5.
70 | 
71 |     Args:
72 |         hdf5_name (str): Hdf5 dataset filename.
73 |         hdf5_path (str): Dataset path in hdf5.
74 |         write_data (ndarray): Data to write.
75 |         is_overwrite (bool): Whether to overwrite dataset.
76 | 
77 |     """
78 |     # convert to numpy array
79 |     write_data = np.array(write_data)
80 | 
81 |     # check folder existence
82 |     folder_name, _ = os.path.split(hdf5_name)
83 |     if not os.path.exists(folder_name) and len(folder_name) != 0:
84 |         os.makedirs(folder_name)
85 | 
86 |     # check hdf5 existence
87 |     if os.path.exists(hdf5_name):
88 |         # if already exists, open with r+ mode
89 |         hdf5_file = h5py.File(hdf5_name, "r+")
90 |         # check dataset existence
91 |         if hdf5_path in hdf5_file:
92 |             if is_overwrite:
93 |                 logging.warning("Dataset in hdf5 file already exists. "
94 |                                 "Recreating dataset in hdf5.")
95 |                 hdf5_file.__delitem__(hdf5_path)
96 |             else:
97 |                 logging.error("Dataset in hdf5 file already exists. 
" 98 | "if you want to overwrite, please set is_overwrite = True.") 99 | hdf5_file.close() 100 | sys.exit(1) 101 | else: 102 | # if not exists, open with w mode 103 | hdf5_file = h5py.File(hdf5_name, "w") 104 | 105 | # write data to hdf5 106 | hdf5_file.create_dataset(hdf5_path, data=write_data) 107 | hdf5_file.flush() 108 | hdf5_file.close() 109 | 110 | 111 | def check_hdf5(hdf5_name, hdf5_path): 112 | """Check hdf5 file existence 113 | 114 | Args: 115 | hdf5_name (str): filename of hdf5 file 116 | hdf5_path (str): dataset name in hdf5 file 117 | 118 | Return: 119 | (bool): dataset exists then return true 120 | """ 121 | if not os.path.exists(hdf5_name): 122 | return False 123 | else: 124 | with h5py.File(hdf5_name, "r") as f: 125 | if hdf5_path in f: 126 | return True 127 | else: 128 | return False 129 | 130 | 131 | def read_txt(file_list): 132 | """Read .txt file list 133 | 134 | Arg: 135 | file_list (str): txt file filename 136 | 137 | Return: 138 | (list): list of read lines 139 | """ 140 | with open(file_list, "r") as f: 141 | filenames = f.readlines() 142 | return [filename.replace("\n", "") for filename in filenames] 143 | 144 | 145 | def check_filename(list1, list2): 146 | """Check the filenames of two list are matched 147 | 148 | Arg: 149 | list1 (list): file list 1 150 | list2 (list): file list 2 151 | 152 | Return: 153 | (bool): matched (True) or not (False) 154 | """ 155 | def _filename(x): 156 | return os.path.basename(x).split('.')[0] 157 | list1 = list(map(_filename, list1)) 158 | list2 = list(map(_filename, list2)) 159 | 160 | return list1 == list2 161 | --------------------------------------------------------------------------------