├── .gitignore ├── README.md ├── egs └── jvs │ └── data │ ├── list │ ├── all.list │ ├── dev.list │ ├── eval_high.list │ ├── eval_low.list │ ├── eval_mid.list │ └── train_no_dev.list │ ├── scp │ ├── all.scp │ ├── dev.scp │ ├── eval_high.scp │ ├── eval_low.scp │ ├── eval_mid.scp │ └── train_no_dev.scp │ └── spk_style.yaml ├── requirements.txt ├── setup.py └── wavehax ├── __init__.py ├── bin ├── __init__.py ├── compute_statistics.py ├── config │ ├── __init__.py │ ├── compute_statistics.yaml │ ├── data │ │ └── jvs.yaml │ ├── decode.yaml │ ├── discriminator │ │ └── univnet.yaml │ ├── extract_features.yaml │ ├── generator │ │ ├── cwavehax.v1.yaml │ │ ├── cwavehax.v2.yaml │ │ ├── wavehax.v1.yaml │ │ └── wavehax.v2.yaml │ ├── profile.yaml │ ├── train.yaml │ └── train │ │ └── wavehax.yaml ├── decode.py ├── extract_features.py ├── profile.py └── train.py ├── datasets ├── __init__.py └── audio_feat_dataset.py ├── discriminators ├── __init__.py └── univnet.py ├── generators ├── __init__.py └── wavehax.py ├── losses ├── __init__.py ├── adv.py └── spectral.py ├── modules ├── __init__.py ├── complex.py ├── conv.py ├── drop.py ├── norm.py ├── periodic.py ├── resblock.py ├── stft.py └── utils.py └── utils ├── __init__.py ├── features.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .venv 3 | .eggs 4 | *egg-info 5 | *.wav 6 | *.h5 7 | *.pkl 8 | *.png 9 | *.out 10 | *.log 11 | exp 12 | local 13 | events.out.tfevents.* 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Wavehax 2 | 3 | This repository provides the official PyTorch implementation of [Wavehax](https://chomeyama.github.io/wavehax-demo/), an alias-free neural vocoder that combines 2D convolutions with harmonic priors for high-fidelity and robust complex spectrogram estimation. 
4 | 5 | 6 | ## Environment Setup 7 | 8 | To set up the environment, run: 9 | ```bash 10 | $ cd wavehax 11 | $ pip install -e . 12 | ``` 13 | This will install the necessary dependencies in editable mode. 14 | 15 | 16 | ## Directory structure 17 | 18 | - **egs**: 19 | This directory contains project-specific examples and configurations. 20 | - **egs/jvs**: 21 | An example project using the [Japanese Versatile Speech (JVS) Corpus](https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus), with speaker- and style-wise fundamental frequency (F0) ranges available at [JVS Corpus F0 Range](https://github.com/chomeyama/JVSCorpusF0Range). 22 | - **wavehax**: 23 | The main source code for Wavehax. 24 | 25 | 26 | ## Run 27 | 28 | This repository uses [Hydra](https://hydra.cc/docs/intro/) for managing hyperparameters. 29 | Hydra provides an easy way to dynamically create a hierarchical configuration by composition and override it through config files and the command line. 30 | 31 | ### Dataset preparation 32 | 33 | Prepare your dataset by creating `.scp` files that define the path to each audio file (e.g., `egs/jvs/data/scp/train_no_dev.scp`). 34 | During the preprocessing step, list files for the extracted features will be automatically generated (e.g., `egs/jvs/data/list/train_no_dev.list`). 35 | Ensure that separate `.scp` and `.list` files are available for training, validation, and evaluation datasets. 36 | 37 | 38 | ### Preprocessing 39 | 40 | To extract acoustic features and prepare statistics: 41 | ```bash 42 | # Move to the project directory. 43 | $ cd egs/jvs 44 | 45 | # Extract acoustic features like F0 and mel-spectrogram. To customize hyperparameters, edit wavehax/bin/config/extract_features.yaml, or override them from the command line. 46 | $ wavehax-extract-features audio_scp=data/scp/all.scp 47 | 48 | # Compute statistics of the training data. You can adjust hyperparameters in wavehax/bin/config/compute_statistics.yaml. 
49 | $ wavehax-compute-statistics filepath_list=data/list/train_no_dev.list save_path=data/stats/train_no_dev.joblib 50 | ``` 51 | 52 | ### Training 53 | 54 | To train the vocoder model: 55 | ```bash 56 | # Start training. You can adjust hyperparameters in wavehax/bin/config/train.yaml. In the paper, the model was trained for 1000K steps to match other models, but Wavehax achieves similar performance with fewer training steps. 57 | $ wavehax-train generator=wavehax discriminator=univnet train=wavehax train.train_max_steps=500000 data=jvs out_dir=exp/wavehax 58 | ``` 59 | 60 | ### Inference 61 | 62 | To generate speech waveforms using the trained model: 63 | ```bash 64 | # Perform inference using the trained model. You can adjust hyperparameters in wavehax/bin/config/decode.yaml. 65 | $ wavehax-decode generator=wavehax data=jvs out_dir=exp/wavehax ckpt_steps=500000 66 | ``` 67 | 68 | ### Monitoring training progress 69 | 70 | You can monitor the training process using [TensorBoard](https://www.tensorflow.org/tensorboard): 71 | ```bash 72 | $ tensorboard --logdir exp 73 | ``` 74 | 75 | 76 | ### Pretrained models 77 | 78 | We plan to release models trained on several datasets.
79 | -------------------------------------------------------------------------------- /egs/jvs/data/list/dev.list: -------------------------------------------------------------------------------- 1 | data/ver1/jvs001/falset10/hdf5/BASIC5000_1635.h5 2 | data/ver1/jvs001/falset10/hdf5/VOICEACTRESS100_002.h5 3 | data/ver1/jvs001/parallel100/hdf5/VOICEACTRESS100_016.h5 4 | data/ver1/jvs001/parallel100/hdf5/VOICEACTRESS100_045.h5 5 | data/ver1/jvs001/parallel100/hdf5/VOICEACTRESS100_099.h5 6 | data/ver1/jvs002/falset10/hdf5/ONOMATOPEE300_036.h5 7 | data/ver1/jvs003/falset10/hdf5/BASIC5000_1363.h5 8 | data/ver1/jvs003/parallel100/hdf5/VOICEACTRESS100_040.h5 9 | data/ver1/jvs004/falset10/hdf5/BASIC5000_1438.h5 10 | data/ver1/jvs004/falset10/hdf5/VOICEACTRESS100_005.h5 11 | data/ver1/jvs005/falset10/hdf5/TRAVEL1000_0484.h5 12 | data/ver1/jvs005/parallel100/hdf5/VOICEACTRESS100_002.h5 13 | data/ver1/jvs005/parallel100/hdf5/VOICEACTRESS100_038.h5 14 | data/ver1/jvs006/parallel100/hdf5/VOICEACTRESS100_070.h5 15 | data/ver1/jvs006/parallel100/hdf5/VOICEACTRESS100_088.h5 16 | data/ver1/jvs009/falset10/hdf5/BASIC5000_2634.h5 17 | data/ver1/jvs009/nonpara30/hdf5/BASIC5000_0816.h5 18 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_018.h5 19 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_062.h5 20 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_082.h5 21 | data/ver1/jvs010/falset10/hdf5/VOICEACTRESS100_005.h5 22 | data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_004.h5 23 | data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_007.h5 24 | data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_032.h5 25 | data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_061.h5 26 | data/ver1/jvs014/parallel100/hdf5/VOICEACTRESS100_002.h5 27 | data/ver1/jvs014/parallel100/hdf5/VOICEACTRESS100_009.h5 28 | data/ver1/jvs014/parallel100/hdf5/VOICEACTRESS100_019.h5 29 | data/ver1/jvs014/parallel100/hdf5/VOICEACTRESS100_022.h5 30 | data/ver1/jvs014/parallel100/hdf5/VOICEACTRESS100_040.h5 31 | 
data/ver1/jvs014/parallel100/hdf5/VOICEACTRESS100_071.h5 32 | data/ver1/jvs015/falset10/hdf5/VOICEACTRESS100_001.h5 33 | data/ver1/jvs016/falset10/hdf5/BASIC5000_2676.h5 34 | data/ver1/jvs016/falset10/hdf5/VOICEACTRESS100_002.h5 35 | data/ver1/jvs017/falset10/hdf5/BASIC5000_2512.h5 36 | data/ver1/jvs019/falset10/hdf5/VOICEACTRESS100_001.h5 37 | data/ver1/jvs020/nonpara30/hdf5/BASIC5000_1431.h5 38 | data/ver1/jvs020/nonpara30/hdf5/TRAVEL1000_0867.h5 39 | data/ver1/jvs020/parallel100/hdf5/VOICEACTRESS100_030.h5 40 | data/ver1/jvs021/nonpara30/hdf5/TRAVEL1000_0496.h5 41 | data/ver1/jvs021/parallel100/hdf5/VOICEACTRESS100_006.h5 42 | data/ver1/jvs021/parallel100/hdf5/VOICEACTRESS100_012.h5 43 | data/ver1/jvs021/parallel100/hdf5/VOICEACTRESS100_030.h5 44 | data/ver1/jvs021/parallel100/hdf5/VOICEACTRESS100_045.h5 45 | data/ver1/jvs021/parallel100/hdf5/VOICEACTRESS100_100.h5 46 | data/ver1/jvs022/parallel100/hdf5/VOICEACTRESS100_036.h5 47 | data/ver1/jvs023/nonpara30/hdf5/TRAVEL1000_0029.h5 48 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_011.h5 49 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_029.h5 50 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_058.h5 51 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_082.h5 52 | data/ver1/jvs024/falset10/hdf5/VOICEACTRESS100_003.h5 53 | data/ver1/jvs024/falset10/hdf5/VOICEACTRESS100_004.h5 54 | data/ver1/jvs024/falset10/hdf5/VOICEACTRESS100_005.h5 55 | data/ver1/jvs025/falset10/hdf5/VOICEACTRESS100_001.h5 56 | data/ver1/jvs026/falset10/hdf5/TRAVEL1000_0949.h5 57 | data/ver1/jvs026/falset10/hdf5/VOICEACTRESS100_001.h5 58 | data/ver1/jvs027/falset10/hdf5/BASIC5000_1045.h5 59 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_057.h5 60 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_100.h5 61 | data/ver1/jvs029/falset10/hdf5/BASIC5000_2412.h5 62 | data/ver1/jvs029/falset10/hdf5/TRAVEL1000_0449.h5 63 | data/ver1/jvs029/falset10/hdf5/VOICEACTRESS100_004.h5 64 | data/ver1/jvs031/nonpara30/hdf5/BASIC5000_0629.h5 
65 | data/ver1/jvs031/nonpara30/hdf5/TRAVEL1000_0762.h5 66 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_014.h5 67 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_041.h5 68 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_072.h5 69 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_078.h5 70 | data/ver1/jvs032/falset10/hdf5/VOICEACTRESS100_001.h5 71 | data/ver1/jvs032/falset10/hdf5/VOICEACTRESS100_004.h5 72 | data/ver1/jvs032/parallel100/hdf5/VOICEACTRESS100_029.h5 73 | data/ver1/jvs033/falset10/hdf5/BASIC5000_0266.h5 74 | data/ver1/jvs033/falset10/hdf5/TRAVEL1000_0199.h5 75 | data/ver1/jvs034/falset10/hdf5/BASIC5000_2100.h5 76 | data/ver1/jvs034/falset10/hdf5/BASIC5000_2694.h5 77 | data/ver1/jvs034/falset10/hdf5/VOICEACTRESS100_002.h5 78 | data/ver1/jvs034/parallel100/hdf5/VOICEACTRESS100_045.h5 79 | data/ver1/jvs034/parallel100/hdf5/VOICEACTRESS100_060.h5 80 | data/ver1/jvs035/falset10/hdf5/BASIC5000_0468.h5 81 | data/ver1/jvs035/falset10/hdf5/VOICEACTRESS100_004.h5 82 | data/ver1/jvs036/falset10/hdf5/VOICEACTRESS100_005.h5 83 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_003.h5 84 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_051.h5 85 | data/ver1/jvs039/falset10/hdf5/BASIC5000_1512.h5 86 | data/ver1/jvs039/falset10/hdf5/BASIC5000_1753.h5 87 | data/ver1/jvs039/falset10/hdf5/VOICEACTRESS100_001.h5 88 | data/ver1/jvs039/falset10/hdf5/VOICEACTRESS100_002.h5 89 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_040.h5 90 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_098.h5 91 | data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_065.h5 92 | data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_089.h5 93 | data/ver1/jvs043/falset10/hdf5/VOICEACTRESS100_005.h5 94 | data/ver1/jvs044/falset10/hdf5/BASIC5000_2176.h5 95 | data/ver1/jvs044/falset10/hdf5/TRAVEL1000_0993.h5 96 | data/ver1/jvs044/nonpara30/hdf5/BASIC5000_1211.h5 97 | data/ver1/jvs044/nonpara30/hdf5/LOANWORD128_074.h5 98 | data/ver1/jvs044/parallel100/hdf5/VOICEACTRESS100_020.h5 99 | 
data/ver1/jvs044/parallel100/hdf5/VOICEACTRESS100_089.h5 100 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_020.h5 101 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_022.h5 102 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_052.h5 103 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_053.h5 104 | data/ver1/jvs046/parallel100/hdf5/VOICEACTRESS100_005.h5 105 | data/ver1/jvs046/parallel100/hdf5/VOICEACTRESS100_099.h5 106 | data/ver1/jvs047/parallel100/hdf5/VOICEACTRESS100_065.h5 107 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_004.h5 108 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_011.h5 109 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_037.h5 110 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_045.h5 111 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_099.h5 112 | data/ver1/jvs049/parallel100/hdf5/VOICEACTRESS100_023.h5 113 | data/ver1/jvs050/parallel100/hdf5/VOICEACTRESS100_056.h5 114 | data/ver1/jvs050/parallel100/hdf5/VOICEACTRESS100_073.h5 115 | data/ver1/jvs050/parallel100/hdf5/VOICEACTRESS100_077.h5 116 | data/ver1/jvs051/falset10/hdf5/VOICEACTRESS100_004.h5 117 | data/ver1/jvs052/falset10/hdf5/VOICEACTRESS100_001.h5 118 | data/ver1/jvs053/falset10/hdf5/BASIC5000_2254.h5 119 | data/ver1/jvs054/falset10/hdf5/VOICEACTRESS100_003.h5 120 | data/ver1/jvs054/nonpara30/hdf5/BASIC5000_1468.h5 121 | data/ver1/jvs054/nonpara30/hdf5/BASIC5000_2178.h5 122 | data/ver1/jvs054/nonpara30/hdf5/BASIC5000_2538.h5 123 | data/ver1/jvs054/parallel100/hdf5/VOICEACTRESS100_026.h5 124 | data/ver1/jvs054/parallel100/hdf5/VOICEACTRESS100_088.h5 125 | data/ver1/jvs054/parallel100/hdf5/VOICEACTRESS100_097.h5 126 | data/ver1/jvs055/falset10/hdf5/VOICEACTRESS100_002.h5 127 | data/ver1/jvs056/falset10/hdf5/BASIC5000_0979.h5 128 | data/ver1/jvs057/falset10/hdf5/UT-PARAPHRASE-sent261-phrase2.h5 129 | data/ver1/jvs058/falset10/hdf5/TRAVEL1000_0452.h5 130 | data/ver1/jvs058/falset10/hdf5/VOICEACTRESS100_002.h5 131 | 
data/ver1/jvs059/falset10/hdf5/VOICEACTRESS100_001.h5 132 | data/ver1/jvs061/falset10/hdf5/ONOMATOPEE300_135.h5 133 | data/ver1/jvs062/falset10/hdf5/BASIC5000_2084.h5 134 | data/ver1/jvs062/falset10/hdf5/TRAVEL1000_0730.h5 135 | data/ver1/jvs062/falset10/hdf5/VOICEACTRESS100_002.h5 136 | data/ver1/jvs062/falset10/hdf5/VOICEACTRESS100_003.h5 137 | data/ver1/jvs063/falset10/hdf5/BASIC5000_1226.h5 138 | data/ver1/jvs063/falset10/hdf5/BASIC5000_2320.h5 139 | data/ver1/jvs063/falset10/hdf5/VOICEACTRESS100_005.h5 140 | data/ver1/jvs064/falset10/hdf5/VOICEACTRESS100_001.h5 141 | data/ver1/jvs065/falset10/hdf5/VOICEACTRESS100_005.h5 142 | data/ver1/jvs066/falset10/hdf5/BASIC5000_2406.h5 143 | data/ver1/jvs067/falset10/hdf5/BASIC5000_1001.h5 144 | data/ver1/jvs068/parallel100/hdf5/VOICEACTRESS100_084.h5 145 | data/ver1/jvs070/falset10/hdf5/VOICEACTRESS100_001.h5 146 | data/ver1/jvs070/falset10/hdf5/VOICEACTRESS100_005.h5 147 | data/ver1/jvs070/parallel100/hdf5/VOICEACTRESS100_072.h5 148 | data/ver1/jvs071/parallel100/hdf5/VOICEACTRESS100_027.h5 149 | data/ver1/jvs073/falset10/hdf5/LOANWORD128_063.h5 150 | data/ver1/jvs074/parallel100/hdf5/VOICEACTRESS100_007.h5 151 | data/ver1/jvs076/falset10/hdf5/VOICEACTRESS100_002.h5 152 | data/ver1/jvs076/falset10/hdf5/VOICEACTRESS100_005.h5 153 | data/ver1/jvs076/parallel100/hdf5/VOICEACTRESS100_012.h5 154 | data/ver1/jvs076/parallel100/hdf5/VOICEACTRESS100_048.h5 155 | data/ver1/jvs077/falset10/hdf5/UT-PARAPHRASE-sent170-phrase1.h5 156 | data/ver1/jvs077/falset10/hdf5/VOICEACTRESS100_002.h5 157 | data/ver1/jvs077/parallel100/hdf5/VOICEACTRESS100_018.h5 158 | data/ver1/jvs077/parallel100/hdf5/VOICEACTRESS100_030.h5 159 | data/ver1/jvs077/parallel100/hdf5/VOICEACTRESS100_051.h5 160 | data/ver1/jvs078/falset10/hdf5/TRAVEL1000_0178.h5 161 | data/ver1/jvs078/falset10/hdf5/TRAVEL1000_0819.h5 162 | data/ver1/jvs078/falset10/hdf5/VOICEACTRESS100_004.h5 163 | data/ver1/jvs078/nonpara30/hdf5/BASIC5000_0080.h5 164 | 
data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_006.h5 165 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_063.h5 166 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_068.h5 167 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_069.h5 168 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_077.h5 169 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_097.h5 170 | data/ver1/jvs079/falset10/hdf5/VOICEACTRESS100_004.h5 171 | data/ver1/jvs081/falset10/hdf5/TRAVEL1000_0884.h5 172 | data/ver1/jvs081/falset10/hdf5/VOICEACTRESS100_002.h5 173 | data/ver1/jvs083/falset10/hdf5/TRAVEL1000_0273.h5 174 | data/ver1/jvs084/falset10/hdf5/BASIC5000_2086.h5 175 | data/ver1/jvs086/falset10/hdf5/BASIC5000_1231.h5 176 | data/ver1/jvs086/falset10/hdf5/TRAVEL1000_0050.h5 177 | data/ver1/jvs086/nonpara30/hdf5/BASIC5000_1202.h5 178 | data/ver1/jvs087/falset10/hdf5/BASIC5000_1496.h5 179 | data/ver1/jvs087/falset10/hdf5/VOICEACTRESS100_002.h5 180 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_013.h5 181 | data/ver1/jvs088/falset10/hdf5/001.h5 182 | data/ver1/jvs088/falset10/hdf5/002.h5 183 | data/ver1/jvs088/falset10/hdf5/006.h5 184 | data/ver1/jvs089/nonpara30/hdf5/ONOMATOPEE300_139.h5 185 | data/ver1/jvs089/parallel100/hdf5/VOICEACTRESS100_028.h5 186 | data/ver1/jvs089/parallel100/hdf5/VOICEACTRESS100_055.h5 187 | data/ver1/jvs091/falset10/hdf5/VOICEACTRESS100_004.h5 188 | data/ver1/jvs092/falset10/hdf5/BASIC5000_0713.h5 189 | data/ver1/jvs093/falset10/hdf5/VOICEACTRESS100_004.h5 190 | data/ver1/jvs094/falset10/hdf5/LOANWORD128_026.h5 191 | data/ver1/jvs094/falset10/hdf5/ONOMATOPEE300_287.h5 192 | data/ver1/jvs095/falset10/hdf5/BASIC5000_2297.h5 193 | data/ver1/jvs096/falset10/hdf5/BASIC5000_1475.h5 194 | data/ver1/jvs096/falset10/hdf5/VOICEACTRESS100_001.h5 195 | data/ver1/jvs096/falset10/hdf5/VOICEACTRESS100_003.h5 196 | data/ver1/jvs099/parallel100/hdf5/VOICEACTRESS100_006.h5 197 | data/ver1/jvs099/parallel100/hdf5/VOICEACTRESS100_029.h5 198 | 
data/ver1/jvs099/parallel100/hdf5/VOICEACTRESS100_076.h5 199 | data/ver1/jvs100/parallel100/hdf5/VOICEACTRESS100_057.h5 200 | data/ver1/jvs100/parallel100/hdf5/VOICEACTRESS100_100.h5 201 | -------------------------------------------------------------------------------- /egs/jvs/data/list/eval_high.list: -------------------------------------------------------------------------------- 1 | data/ver1/jvs001/falset10/hdf5/VOICEACTRESS100_003.h5 2 | data/ver1/jvs002/falset10/hdf5/LOANWORD128_007.h5 3 | data/ver1/jvs004/falset10/hdf5/BASIC5000_1802.h5 4 | data/ver1/jvs004/falset10/hdf5/TRAVEL1000_0409.h5 5 | data/ver1/jvs004/falset10/hdf5/TRAVEL1000_0840.h5 6 | data/ver1/jvs004/falset10/hdf5/VOICEACTRESS100_001.h5 7 | data/ver1/jvs009/falset10/hdf5/VOICEACTRESS100_001.h5 8 | data/ver1/jvs010/falset10/hdf5/BASIC5000_1849.h5 9 | data/ver1/jvs010/falset10/hdf5/BASIC5000_2689.h5 10 | data/ver1/jvs010/falset10/hdf5/TRAVEL1000_0708.h5 11 | data/ver1/jvs010/falset10/hdf5/VOICEACTRESS100_001.h5 12 | data/ver1/jvs010/falset10/hdf5/VOICEACTRESS100_004.h5 13 | data/ver1/jvs014/falset10/hdf5/BASIC5000_0733.h5 14 | data/ver1/jvs014/falset10/hdf5/BASIC5000_1093.h5 15 | data/ver1/jvs014/falset10/hdf5/BASIC5000_3033.h5 16 | data/ver1/jvs014/falset10/hdf5/TRAVEL1000_0936.h5 17 | data/ver1/jvs014/falset10/hdf5/VOICEACTRESS100_001.h5 18 | data/ver1/jvs014/falset10/hdf5/VOICEACTRESS100_005.h5 19 | data/ver1/jvs015/falset10/hdf5/VOICEACTRESS100_002.h5 20 | data/ver1/jvs015/falset10/hdf5/VOICEACTRESS100_004.h5 21 | data/ver1/jvs016/falset10/hdf5/LOANWORD128_084.h5 22 | data/ver1/jvs016/falset10/hdf5/VOICEACTRESS100_001.h5 23 | data/ver1/jvs016/falset10/hdf5/VOICEACTRESS100_003.h5 24 | data/ver1/jvs016/falset10/hdf5/VOICEACTRESS100_004.h5 25 | data/ver1/jvs016/falset10/hdf5/VOICEACTRESS100_005.h5 26 | data/ver1/jvs017/falset10/hdf5/VOICEACTRESS100_001.h5 27 | data/ver1/jvs017/falset10/hdf5/VOICEACTRESS100_004.h5 28 | data/ver1/jvs019/falset10/hdf5/BASIC5000_1250.h5 29 | 
data/ver1/jvs019/falset10/hdf5/ONOMATOPEE300_210.h5 30 | data/ver1/jvs019/falset10/hdf5/VOICEACTRESS100_005.h5 31 | data/ver1/jvs024/falset10/hdf5/BASIC5000_0193.h5 32 | data/ver1/jvs024/falset10/hdf5/BASIC5000_0394.h5 33 | data/ver1/jvs025/falset10/hdf5/BASIC5000_0960.h5 34 | data/ver1/jvs025/falset10/hdf5/BASIC5000_1464.h5 35 | data/ver1/jvs025/falset10/hdf5/BASIC5000_2919.h5 36 | data/ver1/jvs025/falset10/hdf5/TRAVEL1000_0392.h5 37 | data/ver1/jvs025/falset10/hdf5/TRAVEL1000_0795.h5 38 | data/ver1/jvs025/falset10/hdf5/VOICEACTRESS100_002.h5 39 | data/ver1/jvs025/falset10/hdf5/VOICEACTRESS100_003.h5 40 | data/ver1/jvs025/falset10/hdf5/VOICEACTRESS100_005.h5 41 | data/ver1/jvs026/falset10/hdf5/BASIC5000_1023.h5 42 | data/ver1/jvs026/falset10/hdf5/TRAVEL1000_0555.h5 43 | data/ver1/jvs026/falset10/hdf5/VOICEACTRESS100_002.h5 44 | data/ver1/jvs026/falset10/hdf5/VOICEACTRESS100_003.h5 45 | data/ver1/jvs027/falset10/hdf5/BASIC5000_0608.h5 46 | data/ver1/jvs027/falset10/hdf5/TRAVEL1000_0296.h5 47 | data/ver1/jvs027/falset10/hdf5/VOICEACTRESS100_001.h5 48 | data/ver1/jvs027/falset10/hdf5/VOICEACTRESS100_003.h5 49 | data/ver1/jvs027/falset10/hdf5/VOICEACTRESS100_005.h5 50 | data/ver1/jvs028/falset10/hdf5/BASIC5000_1807.h5 51 | data/ver1/jvs029/falset10/hdf5/BASIC5000_1119.h5 52 | data/ver1/jvs029/falset10/hdf5/TRAVEL1000_0378.h5 53 | data/ver1/jvs029/falset10/hdf5/VOICEACTRESS100_001.h5 54 | data/ver1/jvs030/falset10/hdf5/BASIC5000_2283.h5 55 | data/ver1/jvs030/falset10/hdf5/VOICEACTRESS100_004.h5 56 | data/ver1/jvs032/falset10/hdf5/BASIC5000_0820.h5 57 | data/ver1/jvs032/falset10/hdf5/BASIC5000_1088.h5 58 | data/ver1/jvs032/falset10/hdf5/VOICEACTRESS100_002.h5 59 | data/ver1/jvs032/falset10/hdf5/VOICEACTRESS100_003.h5 60 | data/ver1/jvs033/falset10/hdf5/BASIC5000_0786.h5 61 | data/ver1/jvs033/falset10/hdf5/ONOMATOPEE300_203.h5 62 | data/ver1/jvs035/falset10/hdf5/BASIC5000_2750.h5 63 | data/ver1/jvs035/falset10/hdf5/TRAVEL1000_0618.h5 64 | 
data/ver1/jvs035/falset10/hdf5/VOICEACTRESS100_003.h5 65 | data/ver1/jvs038/falset10/hdf5/UT-PARAPHRASE-sent037-phrase1.h5 66 | data/ver1/jvs038/falset10/hdf5/VOICEACTRESS100_005.h5 67 | data/ver1/jvs039/falset10/hdf5/BASIC5000_1471.h5 68 | data/ver1/jvs039/falset10/hdf5/BASIC5000_1783.h5 69 | data/ver1/jvs039/falset10/hdf5/VOICEACTRESS100_003.h5 70 | data/ver1/jvs039/falset10/hdf5/VOICEACTRESS100_004.h5 71 | data/ver1/jvs039/falset10/hdf5/VOICEACTRESS100_005.h5 72 | data/ver1/jvs040/falset10/hdf5/TRAVEL1000_0927.h5 73 | data/ver1/jvs043/falset10/hdf5/BASIC5000_1227.h5 74 | data/ver1/jvs043/falset10/hdf5/LOANWORD128_005.h5 75 | data/ver1/jvs044/falset10/hdf5/VOICEACTRESS100_001.h5 76 | data/ver1/jvs045/falset10/hdf5/VOICEACTRESS100_005.h5 77 | data/ver1/jvs051/falset10/hdf5/ONOMATOPEE300_123.h5 78 | data/ver1/jvs051/falset10/hdf5/VOICEACTRESS100_002.h5 79 | data/ver1/jvs051/falset10/hdf5/VOICEACTRESS100_005.h5 80 | data/ver1/jvs052/falset10/hdf5/VOICEACTRESS100_003.h5 81 | data/ver1/jvs052/falset10/hdf5/VOICEACTRESS100_005.h5 82 | data/ver1/jvs053/falset10/hdf5/UT-PARAPHRASE-sent103-phrase1.h5 83 | data/ver1/jvs053/falset10/hdf5/VOICEACTRESS100_002.h5 84 | data/ver1/jvs053/falset10/hdf5/VOICEACTRESS100_003.h5 85 | data/ver1/jvs054/falset10/hdf5/BASIC5000_2178.h5 86 | data/ver1/jvs055/falset10/hdf5/VOICEACTRESS100_004.h5 87 | data/ver1/jvs056/falset10/hdf5/BASIC5000_2730.h5 88 | data/ver1/jvs056/falset10/hdf5/VOICEACTRESS100_003.h5 89 | data/ver1/jvs056/falset10/hdf5/VOICEACTRESS100_004.h5 90 | data/ver1/jvs056/falset10/hdf5/VOICEACTRESS100_005.h5 91 | data/ver1/jvs057/falset10/hdf5/VOICEACTRESS100_003.h5 92 | data/ver1/jvs058/falset10/hdf5/BASIC5000_2426.h5 93 | data/ver1/jvs058/falset10/hdf5/VOICEACTRESS100_005.h5 94 | data/ver1/jvs059/falset10/hdf5/VOICEACTRESS100_003.h5 95 | data/ver1/jvs061/falset10/hdf5/BASIC5000_2893.h5 96 | data/ver1/jvs061/falset10/hdf5/VOICEACTRESS100_004.h5 97 | data/ver1/jvs062/falset10/hdf5/VOICEACTRESS100_005.h5 98 | 
data/ver1/jvs063/falset10/hdf5/TRAVEL1000_0104.h5 99 | data/ver1/jvs063/falset10/hdf5/VOICEACTRESS100_001.h5 100 | data/ver1/jvs064/falset10/hdf5/BASIC5000_1641.h5 101 | data/ver1/jvs065/falset10/hdf5/BASIC5000_2139.h5 102 | data/ver1/jvs065/falset10/hdf5/TRAVEL1000_0234.h5 103 | data/ver1/jvs065/falset10/hdf5/TRAVEL1000_0542.h5 104 | data/ver1/jvs065/falset10/hdf5/VOICEACTRESS100_002.h5 105 | data/ver1/jvs066/falset10/hdf5/BASIC5000_1205.h5 106 | data/ver1/jvs066/falset10/hdf5/TRAVEL1000_0861.h5 107 | data/ver1/jvs066/falset10/hdf5/VOICEACTRESS100_001.h5 108 | data/ver1/jvs066/falset10/hdf5/VOICEACTRESS100_005.h5 109 | data/ver1/jvs067/falset10/hdf5/BASIC5000_0480.h5 110 | data/ver1/jvs067/falset10/hdf5/ONOMATOPEE300_243.h5 111 | data/ver1/jvs067/falset10/hdf5/VOICEACTRESS100_002.h5 112 | data/ver1/jvs067/falset10/hdf5/VOICEACTRESS100_004.h5 113 | data/ver1/jvs068/falset10/hdf5/BASIC5000_1759.h5 114 | data/ver1/jvs068/falset10/hdf5/TRAVEL1000_0146.h5 115 | data/ver1/jvs069/falset10/hdf5/BASIC5000_0614.h5 116 | data/ver1/jvs069/falset10/hdf5/BASIC5000_2786.h5 117 | data/ver1/jvs069/falset10/hdf5/TRAVEL1000_0574.h5 118 | data/ver1/jvs069/falset10/hdf5/VOICEACTRESS100_002.h5 119 | data/ver1/jvs069/falset10/hdf5/VOICEACTRESS100_003.h5 120 | data/ver1/jvs070/falset10/hdf5/BASIC5000_0182.h5 121 | data/ver1/jvs070/falset10/hdf5/BASIC5000_1696.h5 122 | data/ver1/jvs070/falset10/hdf5/VOICEACTRESS100_002.h5 123 | data/ver1/jvs070/falset10/hdf5/VOICEACTRESS100_003.h5 124 | data/ver1/jvs070/falset10/hdf5/VOICEACTRESS100_004.h5 125 | data/ver1/jvs072/falset10/hdf5/BASIC5000_0170.h5 126 | data/ver1/jvs072/falset10/hdf5/BASIC5000_0813.h5 127 | data/ver1/jvs072/falset10/hdf5/BASIC5000_1604.h5 128 | data/ver1/jvs072/falset10/hdf5/BASIC5000_1940.h5 129 | data/ver1/jvs072/falset10/hdf5/BASIC5000_1976.h5 130 | data/ver1/jvs072/falset10/hdf5/VOICEACTRESS100_001.h5 131 | data/ver1/jvs072/falset10/hdf5/VOICEACTRESS100_002.h5 132 | data/ver1/jvs072/falset10/hdf5/VOICEACTRESS100_004.h5 
133 | data/ver1/jvs072/falset10/hdf5/VOICEACTRESS100_005.h5 134 | data/ver1/jvs073/falset10/hdf5/BASIC5000_2430.h5 135 | data/ver1/jvs073/falset10/hdf5/VOICEACTRESS100_002.h5 136 | data/ver1/jvs073/falset10/hdf5/VOICEACTRESS100_004.h5 137 | data/ver1/jvs075/falset10/hdf5/UT-PARAPHRASE-sent004-phrase1.h5 138 | data/ver1/jvs075/falset10/hdf5/VOICEACTRESS100_002.h5 139 | data/ver1/jvs076/falset10/hdf5/BASIC5000_2502.h5 140 | data/ver1/jvs076/falset10/hdf5/VOICEACTRESS100_004.h5 141 | data/ver1/jvs077/falset10/hdf5/BASIC5000_0910.h5 142 | data/ver1/jvs077/falset10/hdf5/TRAVEL1000_0291.h5 143 | data/ver1/jvs077/falset10/hdf5/VOICEACTRESS100_001.h5 144 | data/ver1/jvs077/falset10/hdf5/VOICEACTRESS100_003.h5 145 | data/ver1/jvs077/falset10/hdf5/VOICEACTRESS100_004.h5 146 | data/ver1/jvs078/falset10/hdf5/VOICEACTRESS100_001.h5 147 | data/ver1/jvs079/falset10/hdf5/BASIC5000_1162.h5 148 | data/ver1/jvs079/falset10/hdf5/BASIC5000_1973.h5 149 | data/ver1/jvs080/falset10/hdf5/TRAVEL1000_0048.h5 150 | data/ver1/jvs080/falset10/hdf5/UT-PARAPHRASE-sent003-phrase2.h5 151 | data/ver1/jvs081/falset10/hdf5/BASIC5000_2612.h5 152 | data/ver1/jvs081/falset10/hdf5/VOICEACTRESS100_001.h5 153 | data/ver1/jvs081/falset10/hdf5/VOICEACTRESS100_004.h5 154 | data/ver1/jvs082/falset10/hdf5/BASIC5000_1728.h5 155 | data/ver1/jvs082/falset10/hdf5/VOICEACTRESS100_002.h5 156 | data/ver1/jvs082/falset10/hdf5/VOICEACTRESS100_004.h5 157 | data/ver1/jvs083/falset10/hdf5/TRAVEL1000_0213.h5 158 | data/ver1/jvs083/falset10/hdf5/TRAVEL1000_0804.h5 159 | data/ver1/jvs083/falset10/hdf5/TRAVEL1000_0817.h5 160 | data/ver1/jvs083/falset10/hdf5/VOICEACTRESS100_001.h5 161 | data/ver1/jvs084/falset10/hdf5/UT-PARAPHRASE-sent056-phrase2.h5 162 | data/ver1/jvs085/falset10/hdf5/VOICEACTRESS100_001.h5 163 | data/ver1/jvs085/falset10/hdf5/VOICEACTRESS100_003.h5 164 | data/ver1/jvs086/falset10/hdf5/VOICEACTRESS100_005.h5 165 | data/ver1/jvs087/falset10/hdf5/BASIC5000_1799.h5 166 | 
data/ver1/jvs087/falset10/hdf5/BASIC5000_2717.h5 167 | data/ver1/jvs087/falset10/hdf5/VOICEACTRESS100_001.h5 168 | data/ver1/jvs088/falset10/hdf5/004.h5 169 | data/ver1/jvs088/falset10/hdf5/007.h5 170 | data/ver1/jvs088/falset10/hdf5/009.h5 171 | data/ver1/jvs090/falset10/hdf5/BASIC5000_0993.h5 172 | data/ver1/jvs090/falset10/hdf5/BASIC5000_1413.h5 173 | data/ver1/jvs090/falset10/hdf5/UT-PARAPHRASE-sent018-phrase2.h5 174 | data/ver1/jvs090/falset10/hdf5/VOICEACTRESS100_001.h5 175 | data/ver1/jvs090/falset10/hdf5/VOICEACTRESS100_003.h5 176 | data/ver1/jvs090/falset10/hdf5/VOICEACTRESS100_004.h5 177 | data/ver1/jvs090/falset10/hdf5/VOICEACTRESS100_005.h5 178 | data/ver1/jvs091/falset10/hdf5/BASIC5000_2150.h5 179 | data/ver1/jvs091/falset10/hdf5/TRAVEL1000_0246.h5 180 | data/ver1/jvs091/falset10/hdf5/TRAVEL1000_0636.h5 181 | data/ver1/jvs091/falset10/hdf5/VOICEACTRESS100_005.h5 182 | data/ver1/jvs092/falset10/hdf5/BASIC5000_0012.h5 183 | data/ver1/jvs092/falset10/hdf5/VOICEACTRESS100_002.h5 184 | data/ver1/jvs092/falset10/hdf5/VOICEACTRESS100_003.h5 185 | data/ver1/jvs093/falset10/hdf5/BASIC5000_2685.h5 186 | data/ver1/jvs094/falset10/hdf5/BASIC5000_1851.h5 187 | data/ver1/jvs094/falset10/hdf5/VOICEACTRESS100_001.h5 188 | data/ver1/jvs094/falset10/hdf5/VOICEACTRESS100_002.h5 189 | data/ver1/jvs095/falset10/hdf5/BASIC5000_0565.h5 190 | data/ver1/jvs095/falset10/hdf5/BASIC5000_2773.h5 191 | data/ver1/jvs095/falset10/hdf5/VOICEACTRESS100_002.h5 192 | data/ver1/jvs096/falset10/hdf5/TRAVEL1000_0307.h5 193 | data/ver1/jvs096/falset10/hdf5/VOICEACTRESS100_004.h5 194 | data/ver1/jvs096/falset10/hdf5/VOICEACTRESS100_005.h5 195 | data/ver1/jvs097/falset10/hdf5/BASIC5000_1562.h5 196 | data/ver1/jvs097/falset10/hdf5/BASIC5000_1805.h5 197 | data/ver1/jvs097/falset10/hdf5/BASIC5000_2238.h5 198 | data/ver1/jvs097/falset10/hdf5/VOICEACTRESS100_002.h5 199 | data/ver1/jvs097/falset10/hdf5/VOICEACTRESS100_004.h5 200 | data/ver1/jvs100/falset10/hdf5/VOICEACTRESS100_005.h5 201 | 
-------------------------------------------------------------------------------- /egs/jvs/data/list/eval_low.list: -------------------------------------------------------------------------------- 1 | data/ver1/jvs001/nonpara30/hdf5/BASIC5000_1896.h5 2 | data/ver1/jvs001/parallel100/hdf5/VOICEACTRESS100_014.h5 3 | data/ver1/jvs001/parallel100/hdf5/VOICEACTRESS100_036.h5 4 | data/ver1/jvs001/parallel100/hdf5/VOICEACTRESS100_046.h5 5 | data/ver1/jvs001/parallel100/hdf5/VOICEACTRESS100_048.h5 6 | data/ver1/jvs003/nonpara30/hdf5/BASIC5000_1388.h5 7 | data/ver1/jvs003/nonpara30/hdf5/LOANWORD128_044.h5 8 | data/ver1/jvs003/parallel100/hdf5/VOICEACTRESS100_031.h5 9 | data/ver1/jvs003/parallel100/hdf5/VOICEACTRESS100_033.h5 10 | data/ver1/jvs003/parallel100/hdf5/VOICEACTRESS100_064.h5 11 | data/ver1/jvs003/parallel100/hdf5/VOICEACTRESS100_090.h5 12 | data/ver1/jvs003/parallel100/hdf5/VOICEACTRESS100_094.h5 13 | data/ver1/jvs005/nonpara30/hdf5/BASIC5000_1733.h5 14 | data/ver1/jvs005/parallel100/hdf5/VOICEACTRESS100_028.h5 15 | data/ver1/jvs005/parallel100/hdf5/VOICEACTRESS100_036.h5 16 | data/ver1/jvs005/parallel100/hdf5/VOICEACTRESS100_059.h5 17 | data/ver1/jvs005/parallel100/hdf5/VOICEACTRESS100_066.h5 18 | data/ver1/jvs005/parallel100/hdf5/VOICEACTRESS100_093.h5 19 | data/ver1/jvs006/nonpara30/hdf5/BASIC5000_0069.h5 20 | data/ver1/jvs006/nonpara30/hdf5/BASIC5000_2121.h5 21 | data/ver1/jvs006/parallel100/hdf5/VOICEACTRESS100_006.h5 22 | data/ver1/jvs006/parallel100/hdf5/VOICEACTRESS100_096.h5 23 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_021.h5 24 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_030.h5 25 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_035.h5 26 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_065.h5 27 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_076.h5 28 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_084.h5 29 | data/ver1/jvs009/parallel100/hdf5/VOICEACTRESS100_090.h5 30 | 
data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_013.h5 31 | data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_034.h5 32 | data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_043.h5 33 | data/ver1/jvs012/parallel100/hdf5/VOICEACTRESS100_057.h5 34 | data/ver1/jvs020/nonpara30/hdf5/BASIC5000_0584.h5 35 | data/ver1/jvs020/parallel100/hdf5/VOICEACTRESS100_028.h5 36 | data/ver1/jvs021/nonpara30/hdf5/LOANWORD128_105.h5 37 | data/ver1/jvs021/parallel100/hdf5/VOICEACTRESS100_084.h5 38 | data/ver1/jvs022/parallel100/hdf5/VOICEACTRESS100_008.h5 39 | data/ver1/jvs022/parallel100/hdf5/VOICEACTRESS100_030.h5 40 | data/ver1/jvs022/parallel100/hdf5/VOICEACTRESS100_075.h5 41 | data/ver1/jvs023/nonpara30/hdf5/BASIC5000_0788.h5 42 | data/ver1/jvs023/nonpara30/hdf5/TRAVEL1000_0566.h5 43 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_002.h5 44 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_014.h5 45 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_046.h5 46 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_083.h5 47 | data/ver1/jvs023/parallel100/hdf5/VOICEACTRESS100_086.h5 48 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_012.h5 49 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_046.h5 50 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_065.h5 51 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_086.h5 52 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_090.h5 53 | data/ver1/jvs028/parallel100/hdf5/VOICEACTRESS100_094.h5 54 | data/ver1/jvs031/nonpara30/hdf5/BASIC5000_0723.h5 55 | data/ver1/jvs031/nonpara30/hdf5/BASIC5000_1774.h5 56 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_012.h5 57 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_026.h5 58 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_044.h5 59 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_079.h5 60 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_087.h5 61 | data/ver1/jvs031/parallel100/hdf5/VOICEACTRESS100_088.h5 62 | 
data/ver1/jvs032/parallel100/hdf5/VOICEACTRESS100_008.h5 63 | data/ver1/jvs032/parallel100/hdf5/VOICEACTRESS100_072.h5 64 | data/ver1/jvs033/parallel100/hdf5/VOICEACTRESS100_067.h5 65 | data/ver1/jvs034/parallel100/hdf5/VOICEACTRESS100_002.h5 66 | data/ver1/jvs034/parallel100/hdf5/VOICEACTRESS100_020.h5 67 | data/ver1/jvs034/parallel100/hdf5/VOICEACTRESS100_050.h5 68 | data/ver1/jvs034/parallel100/hdf5/VOICEACTRESS100_088.h5 69 | data/ver1/jvs037/nonpara30/hdf5/BASIC5000_0340.h5 70 | data/ver1/jvs037/nonpara30/hdf5/BASIC5000_0409.h5 71 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_004.h5 72 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_006.h5 73 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_017.h5 74 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_022.h5 75 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_025.h5 76 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_027.h5 77 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_041.h5 78 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_064.h5 79 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_076.h5 80 | data/ver1/jvs037/parallel100/hdf5/VOICEACTRESS100_100.h5 81 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_007.h5 82 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_013.h5 83 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_028.h5 84 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_033.h5 85 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_041.h5 86 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_047.h5 87 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_055.h5 88 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_059.h5 89 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_070.h5 90 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_088.h5 91 | data/ver1/jvs042/nonpara30/hdf5/BASIC5000_1065.h5 92 | data/ver1/jvs042/nonpara30/hdf5/BASIC5000_2650.h5 93 | data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_014.h5 94 | 
data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_032.h5 95 | data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_037.h5 96 | data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_079.h5 97 | data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_087.h5 98 | data/ver1/jvs042/parallel100/hdf5/VOICEACTRESS100_097.h5 99 | data/ver1/jvs044/parallel100/hdf5/VOICEACTRESS100_024.h5 100 | data/ver1/jvs044/parallel100/hdf5/VOICEACTRESS100_031.h5 101 | data/ver1/jvs044/parallel100/hdf5/VOICEACTRESS100_071.h5 102 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_027.h5 103 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_046.h5 104 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_054.h5 105 | data/ver1/jvs046/parallel100/hdf5/VOICEACTRESS100_015.h5 106 | data/ver1/jvs046/parallel100/hdf5/VOICEACTRESS100_076.h5 107 | data/ver1/jvs046/parallel100/hdf5/VOICEACTRESS100_082.h5 108 | data/ver1/jvs047/nonpara30/hdf5/BASIC5000_1971.h5 109 | data/ver1/jvs047/nonpara30/hdf5/ONOMATOPEE300_101.h5 110 | data/ver1/jvs047/nonpara30/hdf5/UT-PARAPHRASE-sent135-phrase2.h5 111 | data/ver1/jvs047/parallel100/hdf5/VOICEACTRESS100_035.h5 112 | data/ver1/jvs047/parallel100/hdf5/VOICEACTRESS100_037.h5 113 | data/ver1/jvs047/parallel100/hdf5/VOICEACTRESS100_045.h5 114 | data/ver1/jvs048/nonpara30/hdf5/BASIC5000_0721.h5 115 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_005.h5 116 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_006.h5 117 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_023.h5 118 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_050.h5 119 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_051.h5 120 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_055.h5 121 | data/ver1/jvs048/parallel100/hdf5/VOICEACTRESS100_098.h5 122 | data/ver1/jvs050/nonpara30/hdf5/TRAVEL1000_0473.h5 123 | data/ver1/jvs050/parallel100/hdf5/VOICEACTRESS100_039.h5 124 | data/ver1/jvs050/parallel100/hdf5/VOICEACTRESS100_048.h5 125 | data/ver1/jvs050/parallel100/hdf5/VOICEACTRESS100_071.h5 126 | 
data/ver1/jvs050/parallel100/hdf5/VOICEACTRESS100_096.h5 127 | data/ver1/jvs052/parallel100/hdf5/VOICEACTRESS100_018.h5 128 | data/ver1/jvs052/parallel100/hdf5/VOICEACTRESS100_025.h5 129 | data/ver1/jvs052/parallel100/hdf5/VOICEACTRESS100_050.h5 130 | data/ver1/jvs052/parallel100/hdf5/VOICEACTRESS100_077.h5 131 | data/ver1/jvs052/parallel100/hdf5/VOICEACTRESS100_097.h5 132 | data/ver1/jvs054/parallel100/hdf5/VOICEACTRESS100_010.h5 133 | data/ver1/jvs054/parallel100/hdf5/VOICEACTRESS100_043.h5 134 | data/ver1/jvs054/parallel100/hdf5/VOICEACTRESS100_099.h5 135 | data/ver1/jvs068/nonpara30/hdf5/BASIC5000_0896.h5 136 | data/ver1/jvs068/nonpara30/hdf5/BASIC5000_3071.h5 137 | data/ver1/jvs068/parallel100/hdf5/VOICEACTRESS100_009.h5 138 | data/ver1/jvs068/parallel100/hdf5/VOICEACTRESS100_023.h5 139 | data/ver1/jvs068/parallel100/hdf5/VOICEACTRESS100_072.h5 140 | data/ver1/jvs070/nonpara30/hdf5/BASIC5000_3078.h5 141 | data/ver1/jvs070/parallel100/hdf5/VOICEACTRESS100_036.h5 142 | data/ver1/jvs070/parallel100/hdf5/VOICEACTRESS100_091.h5 143 | data/ver1/jvs071/nonpara30/hdf5/BASIC5000_0883.h5 144 | data/ver1/jvs071/nonpara30/hdf5/BASIC5000_2654.h5 145 | data/ver1/jvs071/nonpara30/hdf5/TRAVEL1000_0575.h5 146 | data/ver1/jvs071/parallel100/hdf5/VOICEACTRESS100_001.h5 147 | data/ver1/jvs071/parallel100/hdf5/VOICEACTRESS100_037.h5 148 | data/ver1/jvs071/parallel100/hdf5/VOICEACTRESS100_040.h5 149 | data/ver1/jvs071/parallel100/hdf5/VOICEACTRESS100_041.h5 150 | data/ver1/jvs071/parallel100/hdf5/VOICEACTRESS100_042.h5 151 | data/ver1/jvs074/parallel100/hdf5/VOICEACTRESS100_027.h5 152 | data/ver1/jvs076/parallel100/hdf5/VOICEACTRESS100_002.h5 153 | data/ver1/jvs076/parallel100/hdf5/VOICEACTRESS100_005.h5 154 | data/ver1/jvs076/parallel100/hdf5/VOICEACTRESS100_043.h5 155 | data/ver1/jvs077/parallel100/hdf5/VOICEACTRESS100_011.h5 156 | data/ver1/jvs077/parallel100/hdf5/VOICEACTRESS100_067.h5 157 | data/ver1/jvs077/parallel100/hdf5/VOICEACTRESS100_081.h5 158 | 
data/ver1/jvs078/nonpara30/hdf5/BASIC5000_0638.h5 159 | data/ver1/jvs078/nonpara30/hdf5/BASIC5000_2375.h5 160 | data/ver1/jvs078/nonpara30/hdf5/LOANWORD128_053.h5 161 | data/ver1/jvs078/nonpara30/hdf5/UT-PARAPHRASE-sent216-phrase2.h5 162 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_027.h5 163 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_033.h5 164 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_071.h5 165 | data/ver1/jvs078/parallel100/hdf5/VOICEACTRESS100_073.h5 166 | data/ver1/jvs080/parallel100/hdf5/VOICEACTRESS100_005.h5 167 | data/ver1/jvs080/parallel100/hdf5/VOICEACTRESS100_038.h5 168 | data/ver1/jvs080/parallel100/hdf5/VOICEACTRESS100_067.h5 169 | data/ver1/jvs080/parallel100/hdf5/VOICEACTRESS100_069.h5 170 | data/ver1/jvs080/parallel100/hdf5/VOICEACTRESS100_088.h5 171 | data/ver1/jvs080/parallel100/hdf5/VOICEACTRESS100_090.h5 172 | data/ver1/jvs086/nonpara30/hdf5/BASIC5000_0804.h5 173 | data/ver1/jvs086/nonpara30/hdf5/TRAVEL1000_0509.h5 174 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_021.h5 175 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_023.h5 176 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_056.h5 177 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_057.h5 178 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_059.h5 179 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_061.h5 180 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_075.h5 181 | data/ver1/jvs086/parallel100/hdf5/VOICEACTRESS100_087.h5 182 | data/ver1/jvs087/nonpara30/hdf5/BASIC5000_1230.h5 183 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_007.h5 184 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_010.h5 185 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_021.h5 186 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_049.h5 187 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_066.h5 188 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_076.h5 189 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_092.h5 190 | 
data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_094.h5 191 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_096.h5 192 | data/ver1/jvs087/parallel100/hdf5/VOICEACTRESS100_098.h5 193 | data/ver1/jvs089/nonpara30/hdf5/TRAVEL1000_0617.h5 194 | data/ver1/jvs089/parallel100/hdf5/VOICEACTRESS100_005.h5 195 | data/ver1/jvs099/nonpara30/hdf5/BASIC5000_2567.h5 196 | data/ver1/jvs099/nonpara30/hdf5/TRAVEL1000_0090.h5 197 | data/ver1/jvs100/nonpara30/hdf5/BASIC5000_2813.h5 198 | data/ver1/jvs100/nonpara30/hdf5/TRAVEL1000_0227.h5 199 | data/ver1/jvs100/parallel100/hdf5/VOICEACTRESS100_049.h5 200 | data/ver1/jvs100/parallel100/hdf5/VOICEACTRESS100_095.h5 201 | -------------------------------------------------------------------------------- /egs/jvs/data/list/eval_mid.list: -------------------------------------------------------------------------------- 1 | data/ver1/jvs001/whisper10/hdf5/VOICEACTRESS100_002.h5 2 | data/ver1/jvs002/parallel100/hdf5/VOICEACTRESS100_048.h5 3 | data/ver1/jvs002/parallel100/hdf5/VOICEACTRESS100_097.h5 4 | data/ver1/jvs003/falset10/hdf5/VOICEACTRESS100_001.h5 5 | data/ver1/jvs003/falset10/hdf5/VOICEACTRESS100_003.h5 6 | data/ver1/jvs003/parallel100/hdf5/VOICEACTRESS100_050.h5 7 | data/ver1/jvs004/parallel100/hdf5/VOICEACTRESS100_019.h5 8 | data/ver1/jvs007/nonpara30/hdf5/BASIC5000_2968.h5 9 | data/ver1/jvs007/nonpara30/hdf5/TRAVEL1000_0842.h5 10 | data/ver1/jvs007/parallel100/hdf5/VOICEACTRESS100_036.h5 11 | data/ver1/jvs007/parallel100/hdf5/VOICEACTRESS100_075.h5 12 | data/ver1/jvs008/nonpara30/hdf5/BASIC5000_1872.h5 13 | data/ver1/jvs008/parallel100/hdf5/VOICEACTRESS100_027.h5 14 | data/ver1/jvs008/parallel100/hdf5/VOICEACTRESS100_071.h5 15 | data/ver1/jvs008/parallel100/hdf5/VOICEACTRESS100_099.h5 16 | data/ver1/jvs010/parallel100/hdf5/VOICEACTRESS100_031.h5 17 | data/ver1/jvs010/parallel100/hdf5/VOICEACTRESS100_077.h5 18 | data/ver1/jvs011/nonpara30/hdf5/BASIC5000_0093.h5 19 | data/ver1/jvs012/whisper10/hdf5/VOICEACTRESS100_002.h5 20 | 
data/ver1/jvs013/parallel100/hdf5/VOICEACTRESS100_060.h5 21 | data/ver1/jvs014/nonpara30/hdf5/BASIC5000_1255.h5 22 | data/ver1/jvs014/parallel100/hdf5/VOICEACTRESS100_077.h5 23 | data/ver1/jvs015/nonpara30/hdf5/TRAVEL1000_0737.h5 24 | data/ver1/jvs015/parallel100/hdf5/VOICEACTRESS100_044.h5 25 | data/ver1/jvs015/parallel100/hdf5/VOICEACTRESS100_095.h5 26 | data/ver1/jvs016/nonpara30/hdf5/TRAVEL1000_0630.h5 27 | data/ver1/jvs016/parallel100/hdf5/VOICEACTRESS100_026.h5 28 | data/ver1/jvs016/parallel100/hdf5/VOICEACTRESS100_045.h5 29 | data/ver1/jvs016/parallel100/hdf5/VOICEACTRESS100_068.h5 30 | data/ver1/jvs016/parallel100/hdf5/VOICEACTRESS100_082.h5 31 | data/ver1/jvs018/parallel100/hdf5/VOICEACTRESS100_021.h5 32 | data/ver1/jvs018/parallel100/hdf5/VOICEACTRESS100_025.h5 33 | data/ver1/jvs018/parallel100/hdf5/VOICEACTRESS100_032.h5 34 | data/ver1/jvs018/parallel100/hdf5/VOICEACTRESS100_069.h5 35 | data/ver1/jvs019/parallel100/hdf5/VOICEACTRESS100_020.h5 36 | data/ver1/jvs019/parallel100/hdf5/VOICEACTRESS100_050.h5 37 | data/ver1/jvs019/parallel100/hdf5/VOICEACTRESS100_075.h5 38 | data/ver1/jvs019/parallel100/hdf5/VOICEACTRESS100_076.h5 39 | data/ver1/jvs020/nonpara30/hdf5/BASIC5000_0065.h5 40 | data/ver1/jvs020/nonpara30/hdf5/BASIC5000_0079.h5 41 | data/ver1/jvs020/parallel100/hdf5/VOICEACTRESS100_046.h5 42 | data/ver1/jvs020/whisper10/hdf5/BASIC5000_0065.h5 43 | data/ver1/jvs021/falset10/hdf5/BASIC5000_2299.h5 44 | data/ver1/jvs021/falset10/hdf5/VOICEACTRESS100_003.h5 45 | data/ver1/jvs021/whisper10/hdf5/TRAVEL1000_0537.h5 46 | data/ver1/jvs024/nonpara30/hdf5/BASIC5000_0853.h5 47 | data/ver1/jvs024/parallel100/hdf5/VOICEACTRESS100_006.h5 48 | data/ver1/jvs024/parallel100/hdf5/VOICEACTRESS100_021.h5 49 | data/ver1/jvs024/whisper10/hdf5/VOICEACTRESS100_003.h5 50 | data/ver1/jvs025/nonpara30/hdf5/BASIC5000_0082.h5 51 | data/ver1/jvs025/nonpara30/hdf5/BASIC5000_0162.h5 52 | data/ver1/jvs025/parallel100/hdf5/VOICEACTRESS100_002.h5 53 | 
data/ver1/jvs026/parallel100/hdf5/VOICEACTRESS100_057.h5 54 | data/ver1/jvs029/nonpara30/hdf5/BASIC5000_2790.h5 55 | data/ver1/jvs029/parallel100/hdf5/VOICEACTRESS100_030.h5 56 | data/ver1/jvs029/parallel100/hdf5/VOICEACTRESS100_062.h5 57 | data/ver1/jvs029/parallel100/hdf5/VOICEACTRESS100_100.h5 58 | data/ver1/jvs030/parallel100/hdf5/VOICEACTRESS100_030.h5 59 | data/ver1/jvs032/parallel100/hdf5/VOICEACTRESS100_032.h5 60 | data/ver1/jvs033/parallel100/hdf5/VOICEACTRESS100_044.h5 61 | data/ver1/jvs033/parallel100/hdf5/VOICEACTRESS100_051.h5 62 | data/ver1/jvs034/nonpara30/hdf5/BASIC5000_1903.h5 63 | data/ver1/jvs034/parallel100/hdf5/VOICEACTRESS100_038.h5 64 | data/ver1/jvs034/whisper10/hdf5/UT-PARAPHRASE-sent173-phrase2.h5 65 | data/ver1/jvs035/nonpara30/hdf5/TRAVEL1000_0773.h5 66 | data/ver1/jvs036/parallel100/hdf5/VOICEACTRESS100_083.h5 67 | data/ver1/jvs038/parallel100/hdf5/VOICEACTRESS100_057.h5 68 | data/ver1/jvs038/parallel100/hdf5/VOICEACTRESS100_068.h5 69 | data/ver1/jvs039/parallel100/hdf5/VOICEACTRESS100_003.h5 70 | data/ver1/jvs039/parallel100/hdf5/VOICEACTRESS100_044.h5 71 | data/ver1/jvs039/parallel100/hdf5/VOICEACTRESS100_061.h5 72 | data/ver1/jvs040/parallel100/hdf5/VOICEACTRESS100_034.h5 73 | data/ver1/jvs040/parallel100/hdf5/VOICEACTRESS100_035.h5 74 | data/ver1/jvs040/parallel100/hdf5/VOICEACTRESS100_078.h5 75 | data/ver1/jvs040/whisper10/hdf5/TRAVEL1000_0437.h5 76 | data/ver1/jvs041/falset10/hdf5/BASIC5000_2818.h5 77 | data/ver1/jvs041/falset10/hdf5/VOICEACTRESS100_005.h5 78 | data/ver1/jvs041/parallel100/hdf5/VOICEACTRESS100_099.h5 79 | data/ver1/jvs043/parallel100/hdf5/VOICEACTRESS100_023.h5 80 | data/ver1/jvs043/parallel100/hdf5/VOICEACTRESS100_075.h5 81 | data/ver1/jvs043/parallel100/hdf5/VOICEACTRESS100_086.h5 82 | data/ver1/jvs043/parallel100/hdf5/VOICEACTRESS100_087.h5 83 | data/ver1/jvs045/parallel100/hdf5/VOICEACTRESS100_040.h5 84 | data/ver1/jvs047/falset10/hdf5/VOICEACTRESS100_002.h5 85 | 
data/ver1/jvs047/whisper10/hdf5/VOICEACTRESS100_001.h5 86 | data/ver1/jvs049/parallel100/hdf5/VOICEACTRESS100_025.h5 87 | data/ver1/jvs049/parallel100/hdf5/VOICEACTRESS100_074.h5 88 | data/ver1/jvs051/nonpara30/hdf5/BASIC5000_1857.h5 89 | data/ver1/jvs051/nonpara30/hdf5/TRAVEL1000_0917.h5 90 | data/ver1/jvs052/parallel100/hdf5/VOICEACTRESS100_003.h5 91 | data/ver1/jvs053/parallel100/hdf5/VOICEACTRESS100_016.h5 92 | data/ver1/jvs053/parallel100/hdf5/VOICEACTRESS100_070.h5 93 | data/ver1/jvs053/parallel100/hdf5/VOICEACTRESS100_091.h5 94 | data/ver1/jvs055/parallel100/hdf5/VOICEACTRESS100_009.h5 95 | data/ver1/jvs055/parallel100/hdf5/VOICEACTRESS100_024.h5 96 | data/ver1/jvs056/parallel100/hdf5/VOICEACTRESS100_008.h5 97 | data/ver1/jvs057/nonpara30/hdf5/BASIC5000_1276.h5 98 | data/ver1/jvs057/parallel100/hdf5/VOICEACTRESS100_035.h5 99 | data/ver1/jvs057/parallel100/hdf5/VOICEACTRESS100_041.h5 100 | data/ver1/jvs057/parallel100/hdf5/VOICEACTRESS100_076.h5 101 | data/ver1/jvs057/parallel100/hdf5/VOICEACTRESS100_093.h5 102 | data/ver1/jvs057/parallel100/hdf5/VOICEACTRESS100_097.h5 103 | data/ver1/jvs057/parallel100/hdf5/VOICEACTRESS100_100.h5 104 | data/ver1/jvs057/whisper10/hdf5/VOICEACTRESS100_004.h5 105 | data/ver1/jvs058/parallel100/hdf5/VOICEACTRESS100_061.h5 106 | data/ver1/jvs058/whisper10/hdf5/VOICEACTRESS100_004.h5 107 | data/ver1/jvs059/nonpara30/hdf5/BASIC5000_1588.h5 108 | data/ver1/jvs059/parallel100/hdf5/VOICEACTRESS100_020.h5 109 | data/ver1/jvs059/parallel100/hdf5/VOICEACTRESS100_024.h5 110 | data/ver1/jvs059/parallel100/hdf5/VOICEACTRESS100_074.h5 111 | data/ver1/jvs059/parallel100/hdf5/VOICEACTRESS100_090.h5 112 | data/ver1/jvs061/nonpara30/hdf5/BASIC5000_1540.h5 113 | data/ver1/jvs061/parallel100/hdf5/VOICEACTRESS100_002.h5 114 | data/ver1/jvs061/parallel100/hdf5/VOICEACTRESS100_009.h5 115 | data/ver1/jvs061/parallel100/hdf5/VOICEACTRESS100_015.h5 116 | data/ver1/jvs061/parallel100/hdf5/VOICEACTRESS100_076.h5 117 | 
data/ver1/jvs063/parallel100/hdf5/VOICEACTRESS100_097.h5 118 | data/ver1/jvs064/parallel100/hdf5/VOICEACTRESS100_039.h5 119 | data/ver1/jvs065/parallel100/hdf5/VOICEACTRESS100_021.h5 120 | data/ver1/jvs065/parallel100/hdf5/VOICEACTRESS100_066.h5 121 | data/ver1/jvs065/parallel100/hdf5/VOICEACTRESS100_094.h5 122 | data/ver1/jvs065/whisper10/hdf5/VOICEACTRESS100_003.h5 123 | data/ver1/jvs067/parallel100/hdf5/VOICEACTRESS100_084.h5 124 | data/ver1/jvs067/parallel100/hdf5/VOICEACTRESS100_085.h5 125 | data/ver1/jvs069/parallel100/hdf5/VOICEACTRESS100_001.h5 126 | data/ver1/jvs069/parallel100/hdf5/VOICEACTRESS100_012.h5 127 | data/ver1/jvs069/parallel100/hdf5/VOICEACTRESS100_045.h5 128 | data/ver1/jvs069/whisper10/hdf5/VOICEACTRESS100_001.h5 129 | data/ver1/jvs070/parallel100/hdf5/VOICEACTRESS100_013.h5 130 | data/ver1/jvs070/parallel100/hdf5/VOICEACTRESS100_051.h5 131 | data/ver1/jvs070/parallel100/hdf5/VOICEACTRESS100_063.h5 132 | data/ver1/jvs071/whisper10/hdf5/VOICEACTRESS100_005.h5 133 | data/ver1/jvs072/parallel100/hdf5/VOICEACTRESS100_009.h5 134 | data/ver1/jvs073/nonpara30/hdf5/BASIC5000_0070.h5 135 | data/ver1/jvs073/parallel100/hdf5/VOICEACTRESS100_011.h5 136 | data/ver1/jvs073/parallel100/hdf5/VOICEACTRESS100_029.h5 137 | data/ver1/jvs073/parallel100/hdf5/VOICEACTRESS100_034.h5 138 | data/ver1/jvs074/parallel100/hdf5/VOICEACTRESS100_083.h5 139 | data/ver1/jvs075/falset10/hdf5/VOICEACTRESS100_004.h5 140 | data/ver1/jvs075/parallel100/hdf5/VOICEACTRESS100_021.h5 141 | data/ver1/jvs075/parallel100/hdf5/VOICEACTRESS100_024.h5 142 | data/ver1/jvs075/parallel100/hdf5/VOICEACTRESS100_030.h5 143 | data/ver1/jvs075/parallel100/hdf5/VOICEACTRESS100_035.h5 144 | data/ver1/jvs075/parallel100/hdf5/VOICEACTRESS100_036.h5 145 | data/ver1/jvs076/nonpara30/hdf5/BASIC5000_1517.h5 146 | data/ver1/jvs076/parallel100/hdf5/VOICEACTRESS100_060.h5 147 | data/ver1/jvs077/nonpara30/hdf5/UT-PARAPHRASE-sent054-phrase2.h5 148 | data/ver1/jvs079/falset10/hdf5/VOICEACTRESS100_002.h5 149 | 
data/ver1/jvs079/parallel100/hdf5/VOICEACTRESS100_001.h5 150 | data/ver1/jvs079/parallel100/hdf5/VOICEACTRESS100_007.h5 151 | data/ver1/jvs079/whisper10/hdf5/BASIC5000_0382.h5 152 | data/ver1/jvs080/nonpara30/hdf5/BASIC5000_0860.h5 153 | data/ver1/jvs080/nonpara30/hdf5/TRAVEL1000_0241.h5 154 | data/ver1/jvs081/parallel100/hdf5/VOICEACTRESS100_021.h5 155 | data/ver1/jvs081/whisper10/hdf5/BASIC5000_1695.h5 156 | data/ver1/jvs082/parallel100/hdf5/VOICEACTRESS100_014.h5 157 | data/ver1/jvs082/parallel100/hdf5/VOICEACTRESS100_056.h5 158 | data/ver1/jvs082/parallel100/hdf5/VOICEACTRESS100_066.h5 159 | data/ver1/jvs082/parallel100/hdf5/VOICEACTRESS100_067.h5 160 | data/ver1/jvs083/nonpara30/hdf5/BASIC5000_2066.h5 161 | data/ver1/jvs083/nonpara30/hdf5/TRAVEL1000_0817.h5 162 | data/ver1/jvs083/parallel100/hdf5/VOICEACTRESS100_046.h5 163 | data/ver1/jvs083/parallel100/hdf5/VOICEACTRESS100_067.h5 164 | data/ver1/jvs084/parallel100/hdf5/VOICEACTRESS100_044.h5 165 | data/ver1/jvs084/parallel100/hdf5/VOICEACTRESS100_066.h5 166 | data/ver1/jvs084/whisper10/hdf5/BASIC5000_0591.h5 167 | data/ver1/jvs085/nonpara30/hdf5/BASIC5000_2356.h5 168 | data/ver1/jvs085/parallel100/hdf5/VOICEACTRESS100_013.h5 169 | data/ver1/jvs085/parallel100/hdf5/VOICEACTRESS100_086.h5 170 | data/ver1/jvs088/nonpara30/hdf5/BASIC5000_0545.h5 171 | data/ver1/jvs088/parallel100/hdf5/VOICEACTRESS100_057.h5 172 | data/ver1/jvs088/parallel100/hdf5/VOICEACTRESS100_087.h5 173 | data/ver1/jvs090/nonpara30/hdf5/BASIC5000_0219.h5 174 | data/ver1/jvs090/parallel100/hdf5/VOICEACTRESS100_005.h5 175 | data/ver1/jvs090/parallel100/hdf5/VOICEACTRESS100_060.h5 176 | data/ver1/jvs091/nonpara30/hdf5/BASIC5000_2247.h5 177 | data/ver1/jvs091/nonpara30/hdf5/BASIC5000_2583.h5 178 | data/ver1/jvs091/parallel100/hdf5/VOICEACTRESS100_001.h5 179 | data/ver1/jvs091/parallel100/hdf5/VOICEACTRESS100_004.h5 180 | data/ver1/jvs091/parallel100/hdf5/VOICEACTRESS100_087.h5 181 | data/ver1/jvs092/nonpara30/hdf5/ONOMATOPEE300_262.h5 182 | 
data/ver1/jvs092/parallel100/hdf5/VOICEACTRESS100_010.h5 183 | data/ver1/jvs093/nonpara30/hdf5/BASIC5000_2376.h5 184 | data/ver1/jvs093/parallel100/hdf5/VOICEACTRESS100_013.h5 185 | data/ver1/jvs093/parallel100/hdf5/VOICEACTRESS100_083.h5 186 | data/ver1/jvs094/parallel100/hdf5/VOICEACTRESS100_049.h5 187 | data/ver1/jvs095/nonpara30/hdf5/BASIC5000_1810.h5 188 | data/ver1/jvs096/parallel100/hdf5/VOICEACTRESS100_004.h5 189 | data/ver1/jvs096/parallel100/hdf5/VOICEACTRESS100_012.h5 190 | data/ver1/jvs096/parallel100/hdf5/VOICEACTRESS100_057.h5 191 | data/ver1/jvs096/parallel100/hdf5/VOICEACTRESS100_080.h5 192 | data/ver1/jvs096/parallel100/hdf5/VOICEACTRESS100_085.h5 193 | data/ver1/jvs097/parallel100/hdf5/VOICEACTRESS100_034.h5 194 | data/ver1/jvs097/parallel100/hdf5/VOICEACTRESS100_044.h5 195 | data/ver1/jvs097/parallel100/hdf5/VOICEACTRESS100_059.h5 196 | data/ver1/jvs097/whisper10/hdf5/VOICEACTRESS100_004.h5 197 | data/ver1/jvs098/parallel100/hdf5/VOICEACTRESS100_033.h5 198 | data/ver1/jvs098/whisper10/hdf5/TRAVEL1000_0220.h5 199 | data/ver1/jvs099/parallel100/hdf5/VOICEACTRESS100_077.h5 200 | data/ver1/jvs099/parallel100/hdf5/VOICEACTRESS100_078.h5 201 | -------------------------------------------------------------------------------- /egs/jvs/data/scp/dev.scp: -------------------------------------------------------------------------------- 1 | data/ver1/jvs001/falset10/wav/BASIC5000_1635.wav 2 | data/ver1/jvs001/falset10/wav/VOICEACTRESS100_002.wav 3 | data/ver1/jvs001/parallel100/wav/VOICEACTRESS100_016.wav 4 | data/ver1/jvs001/parallel100/wav/VOICEACTRESS100_045.wav 5 | data/ver1/jvs001/parallel100/wav/VOICEACTRESS100_099.wav 6 | data/ver1/jvs002/falset10/wav/ONOMATOPEE300_036.wav 7 | data/ver1/jvs003/falset10/wav/BASIC5000_1363.wav 8 | data/ver1/jvs003/parallel100/wav/VOICEACTRESS100_040.wav 9 | data/ver1/jvs004/falset10/wav/BASIC5000_1438.wav 10 | data/ver1/jvs004/falset10/wav/VOICEACTRESS100_005.wav 11 | data/ver1/jvs005/falset10/wav/TRAVEL1000_0484.wav 12 | 
data/ver1/jvs005/parallel100/wav/VOICEACTRESS100_002.wav 13 | data/ver1/jvs005/parallel100/wav/VOICEACTRESS100_038.wav 14 | data/ver1/jvs006/parallel100/wav/VOICEACTRESS100_070.wav 15 | data/ver1/jvs006/parallel100/wav/VOICEACTRESS100_088.wav 16 | data/ver1/jvs009/falset10/wav/BASIC5000_2634.wav 17 | data/ver1/jvs009/nonpara30/wav/BASIC5000_0816.wav 18 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_018.wav 19 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_062.wav 20 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_082.wav 21 | data/ver1/jvs010/falset10/wav/VOICEACTRESS100_005.wav 22 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_004.wav 23 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_007.wav 24 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_032.wav 25 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_061.wav 26 | data/ver1/jvs014/parallel100/wav/VOICEACTRESS100_002.wav 27 | data/ver1/jvs014/parallel100/wav/VOICEACTRESS100_009.wav 28 | data/ver1/jvs014/parallel100/wav/VOICEACTRESS100_019.wav 29 | data/ver1/jvs014/parallel100/wav/VOICEACTRESS100_022.wav 30 | data/ver1/jvs014/parallel100/wav/VOICEACTRESS100_040.wav 31 | data/ver1/jvs014/parallel100/wav/VOICEACTRESS100_071.wav 32 | data/ver1/jvs015/falset10/wav/VOICEACTRESS100_001.wav 33 | data/ver1/jvs016/falset10/wav/BASIC5000_2676.wav 34 | data/ver1/jvs016/falset10/wav/VOICEACTRESS100_002.wav 35 | data/ver1/jvs017/falset10/wav/BASIC5000_2512.wav 36 | data/ver1/jvs019/falset10/wav/VOICEACTRESS100_001.wav 37 | data/ver1/jvs020/nonpara30/wav/BASIC5000_1431.wav 38 | data/ver1/jvs020/nonpara30/wav/TRAVEL1000_0867.wav 39 | data/ver1/jvs020/parallel100/wav/VOICEACTRESS100_030.wav 40 | data/ver1/jvs021/nonpara30/wav/TRAVEL1000_0496.wav 41 | data/ver1/jvs021/parallel100/wav/VOICEACTRESS100_006.wav 42 | data/ver1/jvs021/parallel100/wav/VOICEACTRESS100_012.wav 43 | data/ver1/jvs021/parallel100/wav/VOICEACTRESS100_030.wav 44 | data/ver1/jvs021/parallel100/wav/VOICEACTRESS100_045.wav 45 | 
data/ver1/jvs021/parallel100/wav/VOICEACTRESS100_100.wav 46 | data/ver1/jvs022/parallel100/wav/VOICEACTRESS100_036.wav 47 | data/ver1/jvs023/nonpara30/wav/TRAVEL1000_0029.wav 48 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_011.wav 49 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_029.wav 50 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_058.wav 51 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_082.wav 52 | data/ver1/jvs024/falset10/wav/VOICEACTRESS100_003.wav 53 | data/ver1/jvs024/falset10/wav/VOICEACTRESS100_004.wav 54 | data/ver1/jvs024/falset10/wav/VOICEACTRESS100_005.wav 55 | data/ver1/jvs025/falset10/wav/VOICEACTRESS100_001.wav 56 | data/ver1/jvs026/falset10/wav/TRAVEL1000_0949.wav 57 | data/ver1/jvs026/falset10/wav/VOICEACTRESS100_001.wav 58 | data/ver1/jvs027/falset10/wav/BASIC5000_1045.wav 59 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_057.wav 60 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_100.wav 61 | data/ver1/jvs029/falset10/wav/BASIC5000_2412.wav 62 | data/ver1/jvs029/falset10/wav/TRAVEL1000_0449.wav 63 | data/ver1/jvs029/falset10/wav/VOICEACTRESS100_004.wav 64 | data/ver1/jvs031/nonpara30/wav/BASIC5000_0629.wav 65 | data/ver1/jvs031/nonpara30/wav/TRAVEL1000_0762.wav 66 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_014.wav 67 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_041.wav 68 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_072.wav 69 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_078.wav 70 | data/ver1/jvs032/falset10/wav/VOICEACTRESS100_001.wav 71 | data/ver1/jvs032/falset10/wav/VOICEACTRESS100_004.wav 72 | data/ver1/jvs032/parallel100/wav/VOICEACTRESS100_029.wav 73 | data/ver1/jvs033/falset10/wav/BASIC5000_0266.wav 74 | data/ver1/jvs033/falset10/wav/TRAVEL1000_0199.wav 75 | data/ver1/jvs034/falset10/wav/BASIC5000_2100.wav 76 | data/ver1/jvs034/falset10/wav/BASIC5000_2694.wav 77 | data/ver1/jvs034/falset10/wav/VOICEACTRESS100_002.wav 78 | data/ver1/jvs034/parallel100/wav/VOICEACTRESS100_045.wav 79 
| data/ver1/jvs034/parallel100/wav/VOICEACTRESS100_060.wav 80 | data/ver1/jvs035/falset10/wav/BASIC5000_0468.wav 81 | data/ver1/jvs035/falset10/wav/VOICEACTRESS100_004.wav 82 | data/ver1/jvs036/falset10/wav/VOICEACTRESS100_005.wav 83 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_003.wav 84 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_051.wav 85 | data/ver1/jvs039/falset10/wav/BASIC5000_1512.wav 86 | data/ver1/jvs039/falset10/wav/BASIC5000_1753.wav 87 | data/ver1/jvs039/falset10/wav/VOICEACTRESS100_001.wav 88 | data/ver1/jvs039/falset10/wav/VOICEACTRESS100_002.wav 89 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_040.wav 90 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_098.wav 91 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_065.wav 92 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_089.wav 93 | data/ver1/jvs043/falset10/wav/VOICEACTRESS100_005.wav 94 | data/ver1/jvs044/falset10/wav/BASIC5000_2176.wav 95 | data/ver1/jvs044/falset10/wav/TRAVEL1000_0993.wav 96 | data/ver1/jvs044/nonpara30/wav/BASIC5000_1211.wav 97 | data/ver1/jvs044/nonpara30/wav/LOANWORD128_074.wav 98 | data/ver1/jvs044/parallel100/wav/VOICEACTRESS100_020.wav 99 | data/ver1/jvs044/parallel100/wav/VOICEACTRESS100_089.wav 100 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_020.wav 101 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_022.wav 102 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_052.wav 103 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_053.wav 104 | data/ver1/jvs046/parallel100/wav/VOICEACTRESS100_005.wav 105 | data/ver1/jvs046/parallel100/wav/VOICEACTRESS100_099.wav 106 | data/ver1/jvs047/parallel100/wav/VOICEACTRESS100_065.wav 107 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_004.wav 108 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_011.wav 109 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_037.wav 110 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_045.wav 111 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_099.wav 112 | 
data/ver1/jvs049/parallel100/wav/VOICEACTRESS100_023.wav 113 | data/ver1/jvs050/parallel100/wav/VOICEACTRESS100_056.wav 114 | data/ver1/jvs050/parallel100/wav/VOICEACTRESS100_073.wav 115 | data/ver1/jvs050/parallel100/wav/VOICEACTRESS100_077.wav 116 | data/ver1/jvs051/falset10/wav/VOICEACTRESS100_004.wav 117 | data/ver1/jvs052/falset10/wav/VOICEACTRESS100_001.wav 118 | data/ver1/jvs053/falset10/wav/BASIC5000_2254.wav 119 | data/ver1/jvs054/falset10/wav/VOICEACTRESS100_003.wav 120 | data/ver1/jvs054/nonpara30/wav/BASIC5000_1468.wav 121 | data/ver1/jvs054/nonpara30/wav/BASIC5000_2178.wav 122 | data/ver1/jvs054/nonpara30/wav/BASIC5000_2538.wav 123 | data/ver1/jvs054/parallel100/wav/VOICEACTRESS100_026.wav 124 | data/ver1/jvs054/parallel100/wav/VOICEACTRESS100_088.wav 125 | data/ver1/jvs054/parallel100/wav/VOICEACTRESS100_097.wav 126 | data/ver1/jvs055/falset10/wav/VOICEACTRESS100_002.wav 127 | data/ver1/jvs056/falset10/wav/BASIC5000_0979.wav 128 | data/ver1/jvs057/falset10/wav/UT-PARAPHRASE-sent261-phrase2.wav 129 | data/ver1/jvs058/falset10/wav/TRAVEL1000_0452.wav 130 | data/ver1/jvs058/falset10/wav/VOICEACTRESS100_002.wav 131 | data/ver1/jvs059/falset10/wav/VOICEACTRESS100_001.wav 132 | data/ver1/jvs061/falset10/wav/ONOMATOPEE300_135.wav 133 | data/ver1/jvs062/falset10/wav/BASIC5000_2084.wav 134 | data/ver1/jvs062/falset10/wav/TRAVEL1000_0730.wav 135 | data/ver1/jvs062/falset10/wav/VOICEACTRESS100_002.wav 136 | data/ver1/jvs062/falset10/wav/VOICEACTRESS100_003.wav 137 | data/ver1/jvs063/falset10/wav/BASIC5000_1226.wav 138 | data/ver1/jvs063/falset10/wav/BASIC5000_2320.wav 139 | data/ver1/jvs063/falset10/wav/VOICEACTRESS100_005.wav 140 | data/ver1/jvs064/falset10/wav/VOICEACTRESS100_001.wav 141 | data/ver1/jvs065/falset10/wav/VOICEACTRESS100_005.wav 142 | data/ver1/jvs066/falset10/wav/BASIC5000_2406.wav 143 | data/ver1/jvs067/falset10/wav/BASIC5000_1001.wav 144 | data/ver1/jvs068/parallel100/wav/VOICEACTRESS100_084.wav 145 | 
data/ver1/jvs070/falset10/wav/VOICEACTRESS100_001.wav 146 | data/ver1/jvs070/falset10/wav/VOICEACTRESS100_005.wav 147 | data/ver1/jvs070/parallel100/wav/VOICEACTRESS100_072.wav 148 | data/ver1/jvs071/parallel100/wav/VOICEACTRESS100_027.wav 149 | data/ver1/jvs073/falset10/wav/LOANWORD128_063.wav 150 | data/ver1/jvs074/parallel100/wav/VOICEACTRESS100_007.wav 151 | data/ver1/jvs076/falset10/wav/VOICEACTRESS100_002.wav 152 | data/ver1/jvs076/falset10/wav/VOICEACTRESS100_005.wav 153 | data/ver1/jvs076/parallel100/wav/VOICEACTRESS100_012.wav 154 | data/ver1/jvs076/parallel100/wav/VOICEACTRESS100_048.wav 155 | data/ver1/jvs077/falset10/wav/UT-PARAPHRASE-sent170-phrase1.wav 156 | data/ver1/jvs077/falset10/wav/VOICEACTRESS100_002.wav 157 | data/ver1/jvs077/parallel100/wav/VOICEACTRESS100_018.wav 158 | data/ver1/jvs077/parallel100/wav/VOICEACTRESS100_030.wav 159 | data/ver1/jvs077/parallel100/wav/VOICEACTRESS100_051.wav 160 | data/ver1/jvs078/falset10/wav/TRAVEL1000_0178.wav 161 | data/ver1/jvs078/falset10/wav/TRAVEL1000_0819.wav 162 | data/ver1/jvs078/falset10/wav/VOICEACTRESS100_004.wav 163 | data/ver1/jvs078/nonpara30/wav/BASIC5000_0080.wav 164 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_006.wav 165 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_063.wav 166 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_068.wav 167 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_069.wav 168 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_077.wav 169 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_097.wav 170 | data/ver1/jvs079/falset10/wav/VOICEACTRESS100_004.wav 171 | data/ver1/jvs081/falset10/wav/TRAVEL1000_0884.wav 172 | data/ver1/jvs081/falset10/wav/VOICEACTRESS100_002.wav 173 | data/ver1/jvs083/falset10/wav/TRAVEL1000_0273.wav 174 | data/ver1/jvs084/falset10/wav/BASIC5000_2086.wav 175 | data/ver1/jvs086/falset10/wav/BASIC5000_1231.wav 176 | data/ver1/jvs086/falset10/wav/TRAVEL1000_0050.wav 177 | data/ver1/jvs086/nonpara30/wav/BASIC5000_1202.wav 178 | 
data/ver1/jvs087/falset10/wav/BASIC5000_1496.wav 179 | data/ver1/jvs087/falset10/wav/VOICEACTRESS100_002.wav 180 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_013.wav 181 | data/ver1/jvs088/falset10/wav/001.wav 182 | data/ver1/jvs088/falset10/wav/002.wav 183 | data/ver1/jvs088/falset10/wav/006.wav 184 | data/ver1/jvs089/nonpara30/wav/ONOMATOPEE300_139.wav 185 | data/ver1/jvs089/parallel100/wav/VOICEACTRESS100_028.wav 186 | data/ver1/jvs089/parallel100/wav/VOICEACTRESS100_055.wav 187 | data/ver1/jvs091/falset10/wav/VOICEACTRESS100_004.wav 188 | data/ver1/jvs092/falset10/wav/BASIC5000_0713.wav 189 | data/ver1/jvs093/falset10/wav/VOICEACTRESS100_004.wav 190 | data/ver1/jvs094/falset10/wav/LOANWORD128_026.wav 191 | data/ver1/jvs094/falset10/wav/ONOMATOPEE300_287.wav 192 | data/ver1/jvs095/falset10/wav/BASIC5000_2297.wav 193 | data/ver1/jvs096/falset10/wav/BASIC5000_1475.wav 194 | data/ver1/jvs096/falset10/wav/VOICEACTRESS100_001.wav 195 | data/ver1/jvs096/falset10/wav/VOICEACTRESS100_003.wav 196 | data/ver1/jvs099/parallel100/wav/VOICEACTRESS100_006.wav 197 | data/ver1/jvs099/parallel100/wav/VOICEACTRESS100_029.wav 198 | data/ver1/jvs099/parallel100/wav/VOICEACTRESS100_076.wav 199 | data/ver1/jvs100/parallel100/wav/VOICEACTRESS100_057.wav 200 | data/ver1/jvs100/parallel100/wav/VOICEACTRESS100_100.wav 201 | -------------------------------------------------------------------------------- /egs/jvs/data/scp/eval_high.scp: -------------------------------------------------------------------------------- 1 | data/ver1/jvs001/falset10/wav/VOICEACTRESS100_003.wav 2 | data/ver1/jvs002/falset10/wav/LOANWORD128_007.wav 3 | data/ver1/jvs004/falset10/wav/BASIC5000_1802.wav 4 | data/ver1/jvs004/falset10/wav/TRAVEL1000_0409.wav 5 | data/ver1/jvs004/falset10/wav/TRAVEL1000_0840.wav 6 | data/ver1/jvs004/falset10/wav/VOICEACTRESS100_001.wav 7 | data/ver1/jvs009/falset10/wav/VOICEACTRESS100_001.wav 8 | data/ver1/jvs010/falset10/wav/BASIC5000_1849.wav 9 | 
data/ver1/jvs010/falset10/wav/BASIC5000_2689.wav 10 | data/ver1/jvs010/falset10/wav/TRAVEL1000_0708.wav 11 | data/ver1/jvs010/falset10/wav/VOICEACTRESS100_001.wav 12 | data/ver1/jvs010/falset10/wav/VOICEACTRESS100_004.wav 13 | data/ver1/jvs014/falset10/wav/BASIC5000_0733.wav 14 | data/ver1/jvs014/falset10/wav/BASIC5000_1093.wav 15 | data/ver1/jvs014/falset10/wav/BASIC5000_3033.wav 16 | data/ver1/jvs014/falset10/wav/TRAVEL1000_0936.wav 17 | data/ver1/jvs014/falset10/wav/VOICEACTRESS100_001.wav 18 | data/ver1/jvs014/falset10/wav/VOICEACTRESS100_005.wav 19 | data/ver1/jvs015/falset10/wav/VOICEACTRESS100_002.wav 20 | data/ver1/jvs015/falset10/wav/VOICEACTRESS100_004.wav 21 | data/ver1/jvs016/falset10/wav/LOANWORD128_084.wav 22 | data/ver1/jvs016/falset10/wav/VOICEACTRESS100_001.wav 23 | data/ver1/jvs016/falset10/wav/VOICEACTRESS100_003.wav 24 | data/ver1/jvs016/falset10/wav/VOICEACTRESS100_004.wav 25 | data/ver1/jvs016/falset10/wav/VOICEACTRESS100_005.wav 26 | data/ver1/jvs017/falset10/wav/VOICEACTRESS100_001.wav 27 | data/ver1/jvs017/falset10/wav/VOICEACTRESS100_004.wav 28 | data/ver1/jvs019/falset10/wav/BASIC5000_1250.wav 29 | data/ver1/jvs019/falset10/wav/ONOMATOPEE300_210.wav 30 | data/ver1/jvs019/falset10/wav/VOICEACTRESS100_005.wav 31 | data/ver1/jvs024/falset10/wav/BASIC5000_0193.wav 32 | data/ver1/jvs024/falset10/wav/BASIC5000_0394.wav 33 | data/ver1/jvs025/falset10/wav/BASIC5000_0960.wav 34 | data/ver1/jvs025/falset10/wav/BASIC5000_1464.wav 35 | data/ver1/jvs025/falset10/wav/BASIC5000_2919.wav 36 | data/ver1/jvs025/falset10/wav/TRAVEL1000_0392.wav 37 | data/ver1/jvs025/falset10/wav/TRAVEL1000_0795.wav 38 | data/ver1/jvs025/falset10/wav/VOICEACTRESS100_002.wav 39 | data/ver1/jvs025/falset10/wav/VOICEACTRESS100_003.wav 40 | data/ver1/jvs025/falset10/wav/VOICEACTRESS100_005.wav 41 | data/ver1/jvs026/falset10/wav/BASIC5000_1023.wav 42 | data/ver1/jvs026/falset10/wav/TRAVEL1000_0555.wav 43 | data/ver1/jvs026/falset10/wav/VOICEACTRESS100_002.wav 44 | 
data/ver1/jvs026/falset10/wav/VOICEACTRESS100_003.wav 45 | data/ver1/jvs027/falset10/wav/BASIC5000_0608.wav 46 | data/ver1/jvs027/falset10/wav/TRAVEL1000_0296.wav 47 | data/ver1/jvs027/falset10/wav/VOICEACTRESS100_001.wav 48 | data/ver1/jvs027/falset10/wav/VOICEACTRESS100_003.wav 49 | data/ver1/jvs027/falset10/wav/VOICEACTRESS100_005.wav 50 | data/ver1/jvs028/falset10/wav/BASIC5000_1807.wav 51 | data/ver1/jvs029/falset10/wav/BASIC5000_1119.wav 52 | data/ver1/jvs029/falset10/wav/TRAVEL1000_0378.wav 53 | data/ver1/jvs029/falset10/wav/VOICEACTRESS100_001.wav 54 | data/ver1/jvs030/falset10/wav/BASIC5000_2283.wav 55 | data/ver1/jvs030/falset10/wav/VOICEACTRESS100_004.wav 56 | data/ver1/jvs032/falset10/wav/BASIC5000_0820.wav 57 | data/ver1/jvs032/falset10/wav/BASIC5000_1088.wav 58 | data/ver1/jvs032/falset10/wav/VOICEACTRESS100_002.wav 59 | data/ver1/jvs032/falset10/wav/VOICEACTRESS100_003.wav 60 | data/ver1/jvs033/falset10/wav/BASIC5000_0786.wav 61 | data/ver1/jvs033/falset10/wav/ONOMATOPEE300_203.wav 62 | data/ver1/jvs035/falset10/wav/BASIC5000_2750.wav 63 | data/ver1/jvs035/falset10/wav/TRAVEL1000_0618.wav 64 | data/ver1/jvs035/falset10/wav/VOICEACTRESS100_003.wav 65 | data/ver1/jvs038/falset10/wav/UT-PARAPHRASE-sent037-phrase1.wav 66 | data/ver1/jvs038/falset10/wav/VOICEACTRESS100_005.wav 67 | data/ver1/jvs039/falset10/wav/BASIC5000_1471.wav 68 | data/ver1/jvs039/falset10/wav/BASIC5000_1783.wav 69 | data/ver1/jvs039/falset10/wav/VOICEACTRESS100_003.wav 70 | data/ver1/jvs039/falset10/wav/VOICEACTRESS100_004.wav 71 | data/ver1/jvs039/falset10/wav/VOICEACTRESS100_005.wav 72 | data/ver1/jvs040/falset10/wav/TRAVEL1000_0927.wav 73 | data/ver1/jvs043/falset10/wav/BASIC5000_1227.wav 74 | data/ver1/jvs043/falset10/wav/LOANWORD128_005.wav 75 | data/ver1/jvs044/falset10/wav/VOICEACTRESS100_001.wav 76 | data/ver1/jvs045/falset10/wav/VOICEACTRESS100_005.wav 77 | data/ver1/jvs051/falset10/wav/ONOMATOPEE300_123.wav 78 | data/ver1/jvs051/falset10/wav/VOICEACTRESS100_002.wav 79 | 
data/ver1/jvs051/falset10/wav/VOICEACTRESS100_005.wav 80 | data/ver1/jvs052/falset10/wav/VOICEACTRESS100_003.wav 81 | data/ver1/jvs052/falset10/wav/VOICEACTRESS100_005.wav 82 | data/ver1/jvs053/falset10/wav/UT-PARAPHRASE-sent103-phrase1.wav 83 | data/ver1/jvs053/falset10/wav/VOICEACTRESS100_002.wav 84 | data/ver1/jvs053/falset10/wav/VOICEACTRESS100_003.wav 85 | data/ver1/jvs054/falset10/wav/BASIC5000_2178.wav 86 | data/ver1/jvs055/falset10/wav/VOICEACTRESS100_004.wav 87 | data/ver1/jvs056/falset10/wav/BASIC5000_2730.wav 88 | data/ver1/jvs056/falset10/wav/VOICEACTRESS100_003.wav 89 | data/ver1/jvs056/falset10/wav/VOICEACTRESS100_004.wav 90 | data/ver1/jvs056/falset10/wav/VOICEACTRESS100_005.wav 91 | data/ver1/jvs057/falset10/wav/VOICEACTRESS100_003.wav 92 | data/ver1/jvs058/falset10/wav/BASIC5000_2426.wav 93 | data/ver1/jvs058/falset10/wav/VOICEACTRESS100_005.wav 94 | data/ver1/jvs059/falset10/wav/VOICEACTRESS100_003.wav 95 | data/ver1/jvs061/falset10/wav/BASIC5000_2893.wav 96 | data/ver1/jvs061/falset10/wav/VOICEACTRESS100_004.wav 97 | data/ver1/jvs062/falset10/wav/VOICEACTRESS100_005.wav 98 | data/ver1/jvs063/falset10/wav/TRAVEL1000_0104.wav 99 | data/ver1/jvs063/falset10/wav/VOICEACTRESS100_001.wav 100 | data/ver1/jvs064/falset10/wav/BASIC5000_1641.wav 101 | data/ver1/jvs065/falset10/wav/BASIC5000_2139.wav 102 | data/ver1/jvs065/falset10/wav/TRAVEL1000_0234.wav 103 | data/ver1/jvs065/falset10/wav/TRAVEL1000_0542.wav 104 | data/ver1/jvs065/falset10/wav/VOICEACTRESS100_002.wav 105 | data/ver1/jvs066/falset10/wav/BASIC5000_1205.wav 106 | data/ver1/jvs066/falset10/wav/TRAVEL1000_0861.wav 107 | data/ver1/jvs066/falset10/wav/VOICEACTRESS100_001.wav 108 | data/ver1/jvs066/falset10/wav/VOICEACTRESS100_005.wav 109 | data/ver1/jvs067/falset10/wav/BASIC5000_0480.wav 110 | data/ver1/jvs067/falset10/wav/ONOMATOPEE300_243.wav 111 | data/ver1/jvs067/falset10/wav/VOICEACTRESS100_002.wav 112 | data/ver1/jvs067/falset10/wav/VOICEACTRESS100_004.wav 113 | 
data/ver1/jvs068/falset10/wav/BASIC5000_1759.wav 114 | data/ver1/jvs068/falset10/wav/TRAVEL1000_0146.wav 115 | data/ver1/jvs069/falset10/wav/BASIC5000_0614.wav 116 | data/ver1/jvs069/falset10/wav/BASIC5000_2786.wav 117 | data/ver1/jvs069/falset10/wav/TRAVEL1000_0574.wav 118 | data/ver1/jvs069/falset10/wav/VOICEACTRESS100_002.wav 119 | data/ver1/jvs069/falset10/wav/VOICEACTRESS100_003.wav 120 | data/ver1/jvs070/falset10/wav/BASIC5000_0182.wav 121 | data/ver1/jvs070/falset10/wav/BASIC5000_1696.wav 122 | data/ver1/jvs070/falset10/wav/VOICEACTRESS100_002.wav 123 | data/ver1/jvs070/falset10/wav/VOICEACTRESS100_003.wav 124 | data/ver1/jvs070/falset10/wav/VOICEACTRESS100_004.wav 125 | data/ver1/jvs072/falset10/wav/BASIC5000_0170.wav 126 | data/ver1/jvs072/falset10/wav/BASIC5000_0813.wav 127 | data/ver1/jvs072/falset10/wav/BASIC5000_1604.wav 128 | data/ver1/jvs072/falset10/wav/BASIC5000_1940.wav 129 | data/ver1/jvs072/falset10/wav/BASIC5000_1976.wav 130 | data/ver1/jvs072/falset10/wav/VOICEACTRESS100_001.wav 131 | data/ver1/jvs072/falset10/wav/VOICEACTRESS100_002.wav 132 | data/ver1/jvs072/falset10/wav/VOICEACTRESS100_004.wav 133 | data/ver1/jvs072/falset10/wav/VOICEACTRESS100_005.wav 134 | data/ver1/jvs073/falset10/wav/BASIC5000_2430.wav 135 | data/ver1/jvs073/falset10/wav/VOICEACTRESS100_002.wav 136 | data/ver1/jvs073/falset10/wav/VOICEACTRESS100_004.wav 137 | data/ver1/jvs075/falset10/wav/UT-PARAPHRASE-sent004-phrase1.wav 138 | data/ver1/jvs075/falset10/wav/VOICEACTRESS100_002.wav 139 | data/ver1/jvs076/falset10/wav/BASIC5000_2502.wav 140 | data/ver1/jvs076/falset10/wav/VOICEACTRESS100_004.wav 141 | data/ver1/jvs077/falset10/wav/BASIC5000_0910.wav 142 | data/ver1/jvs077/falset10/wav/TRAVEL1000_0291.wav 143 | data/ver1/jvs077/falset10/wav/VOICEACTRESS100_001.wav 144 | data/ver1/jvs077/falset10/wav/VOICEACTRESS100_003.wav 145 | data/ver1/jvs077/falset10/wav/VOICEACTRESS100_004.wav 146 | data/ver1/jvs078/falset10/wav/VOICEACTRESS100_001.wav 147 | 
data/ver1/jvs079/falset10/wav/BASIC5000_1162.wav 148 | data/ver1/jvs079/falset10/wav/BASIC5000_1973.wav 149 | data/ver1/jvs080/falset10/wav/TRAVEL1000_0048.wav 150 | data/ver1/jvs080/falset10/wav/UT-PARAPHRASE-sent003-phrase2.wav 151 | data/ver1/jvs081/falset10/wav/BASIC5000_2612.wav 152 | data/ver1/jvs081/falset10/wav/VOICEACTRESS100_001.wav 153 | data/ver1/jvs081/falset10/wav/VOICEACTRESS100_004.wav 154 | data/ver1/jvs082/falset10/wav/BASIC5000_1728.wav 155 | data/ver1/jvs082/falset10/wav/VOICEACTRESS100_002.wav 156 | data/ver1/jvs082/falset10/wav/VOICEACTRESS100_004.wav 157 | data/ver1/jvs083/falset10/wav/TRAVEL1000_0213.wav 158 | data/ver1/jvs083/falset10/wav/TRAVEL1000_0804.wav 159 | data/ver1/jvs083/falset10/wav/TRAVEL1000_0817.wav 160 | data/ver1/jvs083/falset10/wav/VOICEACTRESS100_001.wav 161 | data/ver1/jvs084/falset10/wav/UT-PARAPHRASE-sent056-phrase2.wav 162 | data/ver1/jvs085/falset10/wav/VOICEACTRESS100_001.wav 163 | data/ver1/jvs085/falset10/wav/VOICEACTRESS100_003.wav 164 | data/ver1/jvs086/falset10/wav/VOICEACTRESS100_005.wav 165 | data/ver1/jvs087/falset10/wav/BASIC5000_1799.wav 166 | data/ver1/jvs087/falset10/wav/BASIC5000_2717.wav 167 | data/ver1/jvs087/falset10/wav/VOICEACTRESS100_001.wav 168 | data/ver1/jvs088/falset10/wav/004.wav 169 | data/ver1/jvs088/falset10/wav/007.wav 170 | data/ver1/jvs088/falset10/wav/009.wav 171 | data/ver1/jvs090/falset10/wav/BASIC5000_0993.wav 172 | data/ver1/jvs090/falset10/wav/BASIC5000_1413.wav 173 | data/ver1/jvs090/falset10/wav/UT-PARAPHRASE-sent018-phrase2.wav 174 | data/ver1/jvs090/falset10/wav/VOICEACTRESS100_001.wav 175 | data/ver1/jvs090/falset10/wav/VOICEACTRESS100_003.wav 176 | data/ver1/jvs090/falset10/wav/VOICEACTRESS100_004.wav 177 | data/ver1/jvs090/falset10/wav/VOICEACTRESS100_005.wav 178 | data/ver1/jvs091/falset10/wav/BASIC5000_2150.wav 179 | data/ver1/jvs091/falset10/wav/TRAVEL1000_0246.wav 180 | data/ver1/jvs091/falset10/wav/TRAVEL1000_0636.wav 181 | 
data/ver1/jvs091/falset10/wav/VOICEACTRESS100_005.wav 182 | data/ver1/jvs092/falset10/wav/BASIC5000_0012.wav 183 | data/ver1/jvs092/falset10/wav/VOICEACTRESS100_002.wav 184 | data/ver1/jvs092/falset10/wav/VOICEACTRESS100_003.wav 185 | data/ver1/jvs093/falset10/wav/BASIC5000_2685.wav 186 | data/ver1/jvs094/falset10/wav/BASIC5000_1851.wav 187 | data/ver1/jvs094/falset10/wav/VOICEACTRESS100_001.wav 188 | data/ver1/jvs094/falset10/wav/VOICEACTRESS100_002.wav 189 | data/ver1/jvs095/falset10/wav/BASIC5000_0565.wav 190 | data/ver1/jvs095/falset10/wav/BASIC5000_2773.wav 191 | data/ver1/jvs095/falset10/wav/VOICEACTRESS100_002.wav 192 | data/ver1/jvs096/falset10/wav/TRAVEL1000_0307.wav 193 | data/ver1/jvs096/falset10/wav/VOICEACTRESS100_004.wav 194 | data/ver1/jvs096/falset10/wav/VOICEACTRESS100_005.wav 195 | data/ver1/jvs097/falset10/wav/BASIC5000_1562.wav 196 | data/ver1/jvs097/falset10/wav/BASIC5000_1805.wav 197 | data/ver1/jvs097/falset10/wav/BASIC5000_2238.wav 198 | data/ver1/jvs097/falset10/wav/VOICEACTRESS100_002.wav 199 | data/ver1/jvs097/falset10/wav/VOICEACTRESS100_004.wav 200 | data/ver1/jvs100/falset10/wav/VOICEACTRESS100_005.wav 201 | -------------------------------------------------------------------------------- /egs/jvs/data/scp/eval_low.scp: -------------------------------------------------------------------------------- 1 | data/ver1/jvs001/nonpara30/wav/BASIC5000_1896.wav 2 | data/ver1/jvs001/parallel100/wav/VOICEACTRESS100_014.wav 3 | data/ver1/jvs001/parallel100/wav/VOICEACTRESS100_036.wav 4 | data/ver1/jvs001/parallel100/wav/VOICEACTRESS100_046.wav 5 | data/ver1/jvs001/parallel100/wav/VOICEACTRESS100_048.wav 6 | data/ver1/jvs003/nonpara30/wav/BASIC5000_1388.wav 7 | data/ver1/jvs003/nonpara30/wav/LOANWORD128_044.wav 8 | data/ver1/jvs003/parallel100/wav/VOICEACTRESS100_031.wav 9 | data/ver1/jvs003/parallel100/wav/VOICEACTRESS100_033.wav 10 | data/ver1/jvs003/parallel100/wav/VOICEACTRESS100_064.wav 11 | 
data/ver1/jvs003/parallel100/wav/VOICEACTRESS100_090.wav 12 | data/ver1/jvs003/parallel100/wav/VOICEACTRESS100_094.wav 13 | data/ver1/jvs005/nonpara30/wav/BASIC5000_1733.wav 14 | data/ver1/jvs005/parallel100/wav/VOICEACTRESS100_028.wav 15 | data/ver1/jvs005/parallel100/wav/VOICEACTRESS100_036.wav 16 | data/ver1/jvs005/parallel100/wav/VOICEACTRESS100_059.wav 17 | data/ver1/jvs005/parallel100/wav/VOICEACTRESS100_066.wav 18 | data/ver1/jvs005/parallel100/wav/VOICEACTRESS100_093.wav 19 | data/ver1/jvs006/nonpara30/wav/BASIC5000_0069.wav 20 | data/ver1/jvs006/nonpara30/wav/BASIC5000_2121.wav 21 | data/ver1/jvs006/parallel100/wav/VOICEACTRESS100_006.wav 22 | data/ver1/jvs006/parallel100/wav/VOICEACTRESS100_096.wav 23 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_021.wav 24 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_030.wav 25 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_035.wav 26 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_065.wav 27 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_076.wav 28 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_084.wav 29 | data/ver1/jvs009/parallel100/wav/VOICEACTRESS100_090.wav 30 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_013.wav 31 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_034.wav 32 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_043.wav 33 | data/ver1/jvs012/parallel100/wav/VOICEACTRESS100_057.wav 34 | data/ver1/jvs020/nonpara30/wav/BASIC5000_0584.wav 35 | data/ver1/jvs020/parallel100/wav/VOICEACTRESS100_028.wav 36 | data/ver1/jvs021/nonpara30/wav/LOANWORD128_105.wav 37 | data/ver1/jvs021/parallel100/wav/VOICEACTRESS100_084.wav 38 | data/ver1/jvs022/parallel100/wav/VOICEACTRESS100_008.wav 39 | data/ver1/jvs022/parallel100/wav/VOICEACTRESS100_030.wav 40 | data/ver1/jvs022/parallel100/wav/VOICEACTRESS100_075.wav 41 | data/ver1/jvs023/nonpara30/wav/BASIC5000_0788.wav 42 | data/ver1/jvs023/nonpara30/wav/TRAVEL1000_0566.wav 43 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_002.wav 44 | 
data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_014.wav 45 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_046.wav 46 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_083.wav 47 | data/ver1/jvs023/parallel100/wav/VOICEACTRESS100_086.wav 48 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_012.wav 49 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_046.wav 50 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_065.wav 51 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_086.wav 52 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_090.wav 53 | data/ver1/jvs028/parallel100/wav/VOICEACTRESS100_094.wav 54 | data/ver1/jvs031/nonpara30/wav/BASIC5000_0723.wav 55 | data/ver1/jvs031/nonpara30/wav/BASIC5000_1774.wav 56 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_012.wav 57 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_026.wav 58 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_044.wav 59 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_079.wav 60 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_087.wav 61 | data/ver1/jvs031/parallel100/wav/VOICEACTRESS100_088.wav 62 | data/ver1/jvs032/parallel100/wav/VOICEACTRESS100_008.wav 63 | data/ver1/jvs032/parallel100/wav/VOICEACTRESS100_072.wav 64 | data/ver1/jvs033/parallel100/wav/VOICEACTRESS100_067.wav 65 | data/ver1/jvs034/parallel100/wav/VOICEACTRESS100_002.wav 66 | data/ver1/jvs034/parallel100/wav/VOICEACTRESS100_020.wav 67 | data/ver1/jvs034/parallel100/wav/VOICEACTRESS100_050.wav 68 | data/ver1/jvs034/parallel100/wav/VOICEACTRESS100_088.wav 69 | data/ver1/jvs037/nonpara30/wav/BASIC5000_0340.wav 70 | data/ver1/jvs037/nonpara30/wav/BASIC5000_0409.wav 71 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_004.wav 72 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_006.wav 73 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_017.wav 74 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_022.wav 75 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_025.wav 76 | 
data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_027.wav 77 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_041.wav 78 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_064.wav 79 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_076.wav 80 | data/ver1/jvs037/parallel100/wav/VOICEACTRESS100_100.wav 81 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_007.wav 82 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_013.wav 83 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_028.wav 84 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_033.wav 85 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_041.wav 86 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_047.wav 87 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_055.wav 88 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_059.wav 89 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_070.wav 90 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_088.wav 91 | data/ver1/jvs042/nonpara30/wav/BASIC5000_1065.wav 92 | data/ver1/jvs042/nonpara30/wav/BASIC5000_2650.wav 93 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_014.wav 94 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_032.wav 95 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_037.wav 96 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_079.wav 97 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_087.wav 98 | data/ver1/jvs042/parallel100/wav/VOICEACTRESS100_097.wav 99 | data/ver1/jvs044/parallel100/wav/VOICEACTRESS100_024.wav 100 | data/ver1/jvs044/parallel100/wav/VOICEACTRESS100_031.wav 101 | data/ver1/jvs044/parallel100/wav/VOICEACTRESS100_071.wav 102 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_027.wav 103 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_046.wav 104 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_054.wav 105 | data/ver1/jvs046/parallel100/wav/VOICEACTRESS100_015.wav 106 | data/ver1/jvs046/parallel100/wav/VOICEACTRESS100_076.wav 107 | data/ver1/jvs046/parallel100/wav/VOICEACTRESS100_082.wav 108 | 
data/ver1/jvs047/nonpara30/wav/BASIC5000_1971.wav 109 | data/ver1/jvs047/nonpara30/wav/ONOMATOPEE300_101.wav 110 | data/ver1/jvs047/nonpara30/wav/UT-PARAPHRASE-sent135-phrase2.wav 111 | data/ver1/jvs047/parallel100/wav/VOICEACTRESS100_035.wav 112 | data/ver1/jvs047/parallel100/wav/VOICEACTRESS100_037.wav 113 | data/ver1/jvs047/parallel100/wav/VOICEACTRESS100_045.wav 114 | data/ver1/jvs048/nonpara30/wav/BASIC5000_0721.wav 115 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_005.wav 116 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_006.wav 117 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_023.wav 118 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_050.wav 119 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_051.wav 120 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_055.wav 121 | data/ver1/jvs048/parallel100/wav/VOICEACTRESS100_098.wav 122 | data/ver1/jvs050/nonpara30/wav/TRAVEL1000_0473.wav 123 | data/ver1/jvs050/parallel100/wav/VOICEACTRESS100_039.wav 124 | data/ver1/jvs050/parallel100/wav/VOICEACTRESS100_048.wav 125 | data/ver1/jvs050/parallel100/wav/VOICEACTRESS100_071.wav 126 | data/ver1/jvs050/parallel100/wav/VOICEACTRESS100_096.wav 127 | data/ver1/jvs052/parallel100/wav/VOICEACTRESS100_018.wav 128 | data/ver1/jvs052/parallel100/wav/VOICEACTRESS100_025.wav 129 | data/ver1/jvs052/parallel100/wav/VOICEACTRESS100_050.wav 130 | data/ver1/jvs052/parallel100/wav/VOICEACTRESS100_077.wav 131 | data/ver1/jvs052/parallel100/wav/VOICEACTRESS100_097.wav 132 | data/ver1/jvs054/parallel100/wav/VOICEACTRESS100_010.wav 133 | data/ver1/jvs054/parallel100/wav/VOICEACTRESS100_043.wav 134 | data/ver1/jvs054/parallel100/wav/VOICEACTRESS100_099.wav 135 | data/ver1/jvs068/nonpara30/wav/BASIC5000_0896.wav 136 | data/ver1/jvs068/nonpara30/wav/BASIC5000_3071.wav 137 | data/ver1/jvs068/parallel100/wav/VOICEACTRESS100_009.wav 138 | data/ver1/jvs068/parallel100/wav/VOICEACTRESS100_023.wav 139 | data/ver1/jvs068/parallel100/wav/VOICEACTRESS100_072.wav 140 | 
data/ver1/jvs070/nonpara30/wav/BASIC5000_3078.wav 141 | data/ver1/jvs070/parallel100/wav/VOICEACTRESS100_036.wav 142 | data/ver1/jvs070/parallel100/wav/VOICEACTRESS100_091.wav 143 | data/ver1/jvs071/nonpara30/wav/BASIC5000_0883.wav 144 | data/ver1/jvs071/nonpara30/wav/BASIC5000_2654.wav 145 | data/ver1/jvs071/nonpara30/wav/TRAVEL1000_0575.wav 146 | data/ver1/jvs071/parallel100/wav/VOICEACTRESS100_001.wav 147 | data/ver1/jvs071/parallel100/wav/VOICEACTRESS100_037.wav 148 | data/ver1/jvs071/parallel100/wav/VOICEACTRESS100_040.wav 149 | data/ver1/jvs071/parallel100/wav/VOICEACTRESS100_041.wav 150 | data/ver1/jvs071/parallel100/wav/VOICEACTRESS100_042.wav 151 | data/ver1/jvs074/parallel100/wav/VOICEACTRESS100_027.wav 152 | data/ver1/jvs076/parallel100/wav/VOICEACTRESS100_002.wav 153 | data/ver1/jvs076/parallel100/wav/VOICEACTRESS100_005.wav 154 | data/ver1/jvs076/parallel100/wav/VOICEACTRESS100_043.wav 155 | data/ver1/jvs077/parallel100/wav/VOICEACTRESS100_011.wav 156 | data/ver1/jvs077/parallel100/wav/VOICEACTRESS100_067.wav 157 | data/ver1/jvs077/parallel100/wav/VOICEACTRESS100_081.wav 158 | data/ver1/jvs078/nonpara30/wav/BASIC5000_0638.wav 159 | data/ver1/jvs078/nonpara30/wav/BASIC5000_2375.wav 160 | data/ver1/jvs078/nonpara30/wav/LOANWORD128_053.wav 161 | data/ver1/jvs078/nonpara30/wav/UT-PARAPHRASE-sent216-phrase2.wav 162 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_027.wav 163 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_033.wav 164 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_071.wav 165 | data/ver1/jvs078/parallel100/wav/VOICEACTRESS100_073.wav 166 | data/ver1/jvs080/parallel100/wav/VOICEACTRESS100_005.wav 167 | data/ver1/jvs080/parallel100/wav/VOICEACTRESS100_038.wav 168 | data/ver1/jvs080/parallel100/wav/VOICEACTRESS100_067.wav 169 | data/ver1/jvs080/parallel100/wav/VOICEACTRESS100_069.wav 170 | data/ver1/jvs080/parallel100/wav/VOICEACTRESS100_088.wav 171 | data/ver1/jvs080/parallel100/wav/VOICEACTRESS100_090.wav 172 | 
data/ver1/jvs086/nonpara30/wav/BASIC5000_0804.wav 173 | data/ver1/jvs086/nonpara30/wav/TRAVEL1000_0509.wav 174 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_021.wav 175 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_023.wav 176 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_056.wav 177 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_057.wav 178 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_059.wav 179 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_061.wav 180 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_075.wav 181 | data/ver1/jvs086/parallel100/wav/VOICEACTRESS100_087.wav 182 | data/ver1/jvs087/nonpara30/wav/BASIC5000_1230.wav 183 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_007.wav 184 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_010.wav 185 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_021.wav 186 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_049.wav 187 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_066.wav 188 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_076.wav 189 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_092.wav 190 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_094.wav 191 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_096.wav 192 | data/ver1/jvs087/parallel100/wav/VOICEACTRESS100_098.wav 193 | data/ver1/jvs089/nonpara30/wav/TRAVEL1000_0617.wav 194 | data/ver1/jvs089/parallel100/wav/VOICEACTRESS100_005.wav 195 | data/ver1/jvs099/nonpara30/wav/BASIC5000_2567.wav 196 | data/ver1/jvs099/nonpara30/wav/TRAVEL1000_0090.wav 197 | data/ver1/jvs100/nonpara30/wav/BASIC5000_2813.wav 198 | data/ver1/jvs100/nonpara30/wav/TRAVEL1000_0227.wav 199 | data/ver1/jvs100/parallel100/wav/VOICEACTRESS100_049.wav 200 | data/ver1/jvs100/parallel100/wav/VOICEACTRESS100_095.wav 201 | -------------------------------------------------------------------------------- /egs/jvs/data/scp/eval_mid.scp: -------------------------------------------------------------------------------- 1 | 
data/ver1/jvs001/whisper10/wav/VOICEACTRESS100_002.wav 2 | data/ver1/jvs002/parallel100/wav/VOICEACTRESS100_048.wav 3 | data/ver1/jvs002/parallel100/wav/VOICEACTRESS100_097.wav 4 | data/ver1/jvs003/falset10/wav/VOICEACTRESS100_001.wav 5 | data/ver1/jvs003/falset10/wav/VOICEACTRESS100_003.wav 6 | data/ver1/jvs003/parallel100/wav/VOICEACTRESS100_050.wav 7 | data/ver1/jvs004/parallel100/wav/VOICEACTRESS100_019.wav 8 | data/ver1/jvs007/nonpara30/wav/BASIC5000_2968.wav 9 | data/ver1/jvs007/nonpara30/wav/TRAVEL1000_0842.wav 10 | data/ver1/jvs007/parallel100/wav/VOICEACTRESS100_036.wav 11 | data/ver1/jvs007/parallel100/wav/VOICEACTRESS100_075.wav 12 | data/ver1/jvs008/nonpara30/wav/BASIC5000_1872.wav 13 | data/ver1/jvs008/parallel100/wav/VOICEACTRESS100_027.wav 14 | data/ver1/jvs008/parallel100/wav/VOICEACTRESS100_071.wav 15 | data/ver1/jvs008/parallel100/wav/VOICEACTRESS100_099.wav 16 | data/ver1/jvs010/parallel100/wav/VOICEACTRESS100_031.wav 17 | data/ver1/jvs010/parallel100/wav/VOICEACTRESS100_077.wav 18 | data/ver1/jvs011/nonpara30/wav/BASIC5000_0093.wav 19 | data/ver1/jvs012/whisper10/wav/VOICEACTRESS100_002.wav 20 | data/ver1/jvs013/parallel100/wav/VOICEACTRESS100_060.wav 21 | data/ver1/jvs014/nonpara30/wav/BASIC5000_1255.wav 22 | data/ver1/jvs014/parallel100/wav/VOICEACTRESS100_077.wav 23 | data/ver1/jvs015/nonpara30/wav/TRAVEL1000_0737.wav 24 | data/ver1/jvs015/parallel100/wav/VOICEACTRESS100_044.wav 25 | data/ver1/jvs015/parallel100/wav/VOICEACTRESS100_095.wav 26 | data/ver1/jvs016/nonpara30/wav/TRAVEL1000_0630.wav 27 | data/ver1/jvs016/parallel100/wav/VOICEACTRESS100_026.wav 28 | data/ver1/jvs016/parallel100/wav/VOICEACTRESS100_045.wav 29 | data/ver1/jvs016/parallel100/wav/VOICEACTRESS100_068.wav 30 | data/ver1/jvs016/parallel100/wav/VOICEACTRESS100_082.wav 31 | data/ver1/jvs018/parallel100/wav/VOICEACTRESS100_021.wav 32 | data/ver1/jvs018/parallel100/wav/VOICEACTRESS100_025.wav 33 | data/ver1/jvs018/parallel100/wav/VOICEACTRESS100_032.wav 34 | 
data/ver1/jvs018/parallel100/wav/VOICEACTRESS100_069.wav 35 | data/ver1/jvs019/parallel100/wav/VOICEACTRESS100_020.wav 36 | data/ver1/jvs019/parallel100/wav/VOICEACTRESS100_050.wav 37 | data/ver1/jvs019/parallel100/wav/VOICEACTRESS100_075.wav 38 | data/ver1/jvs019/parallel100/wav/VOICEACTRESS100_076.wav 39 | data/ver1/jvs020/nonpara30/wav/BASIC5000_0065.wav 40 | data/ver1/jvs020/nonpara30/wav/BASIC5000_0079.wav 41 | data/ver1/jvs020/parallel100/wav/VOICEACTRESS100_046.wav 42 | data/ver1/jvs020/whisper10/wav/BASIC5000_0065.wav 43 | data/ver1/jvs021/falset10/wav/BASIC5000_2299.wav 44 | data/ver1/jvs021/falset10/wav/VOICEACTRESS100_003.wav 45 | data/ver1/jvs021/whisper10/wav/TRAVEL1000_0537.wav 46 | data/ver1/jvs024/nonpara30/wav/BASIC5000_0853.wav 47 | data/ver1/jvs024/parallel100/wav/VOICEACTRESS100_006.wav 48 | data/ver1/jvs024/parallel100/wav/VOICEACTRESS100_021.wav 49 | data/ver1/jvs024/whisper10/wav/VOICEACTRESS100_003.wav 50 | data/ver1/jvs025/nonpara30/wav/BASIC5000_0082.wav 51 | data/ver1/jvs025/nonpara30/wav/BASIC5000_0162.wav 52 | data/ver1/jvs025/parallel100/wav/VOICEACTRESS100_002.wav 53 | data/ver1/jvs026/parallel100/wav/VOICEACTRESS100_057.wav 54 | data/ver1/jvs029/nonpara30/wav/BASIC5000_2790.wav 55 | data/ver1/jvs029/parallel100/wav/VOICEACTRESS100_030.wav 56 | data/ver1/jvs029/parallel100/wav/VOICEACTRESS100_062.wav 57 | data/ver1/jvs029/parallel100/wav/VOICEACTRESS100_100.wav 58 | data/ver1/jvs030/parallel100/wav/VOICEACTRESS100_030.wav 59 | data/ver1/jvs032/parallel100/wav/VOICEACTRESS100_032.wav 60 | data/ver1/jvs033/parallel100/wav/VOICEACTRESS100_044.wav 61 | data/ver1/jvs033/parallel100/wav/VOICEACTRESS100_051.wav 62 | data/ver1/jvs034/nonpara30/wav/BASIC5000_1903.wav 63 | data/ver1/jvs034/parallel100/wav/VOICEACTRESS100_038.wav 64 | data/ver1/jvs034/whisper10/wav/UT-PARAPHRASE-sent173-phrase2.wav 65 | data/ver1/jvs035/nonpara30/wav/TRAVEL1000_0773.wav 66 | data/ver1/jvs036/parallel100/wav/VOICEACTRESS100_083.wav 67 | 
data/ver1/jvs038/parallel100/wav/VOICEACTRESS100_057.wav 68 | data/ver1/jvs038/parallel100/wav/VOICEACTRESS100_068.wav 69 | data/ver1/jvs039/parallel100/wav/VOICEACTRESS100_003.wav 70 | data/ver1/jvs039/parallel100/wav/VOICEACTRESS100_044.wav 71 | data/ver1/jvs039/parallel100/wav/VOICEACTRESS100_061.wav 72 | data/ver1/jvs040/parallel100/wav/VOICEACTRESS100_034.wav 73 | data/ver1/jvs040/parallel100/wav/VOICEACTRESS100_035.wav 74 | data/ver1/jvs040/parallel100/wav/VOICEACTRESS100_078.wav 75 | data/ver1/jvs040/whisper10/wav/TRAVEL1000_0437.wav 76 | data/ver1/jvs041/falset10/wav/BASIC5000_2818.wav 77 | data/ver1/jvs041/falset10/wav/VOICEACTRESS100_005.wav 78 | data/ver1/jvs041/parallel100/wav/VOICEACTRESS100_099.wav 79 | data/ver1/jvs043/parallel100/wav/VOICEACTRESS100_023.wav 80 | data/ver1/jvs043/parallel100/wav/VOICEACTRESS100_075.wav 81 | data/ver1/jvs043/parallel100/wav/VOICEACTRESS100_086.wav 82 | data/ver1/jvs043/parallel100/wav/VOICEACTRESS100_087.wav 83 | data/ver1/jvs045/parallel100/wav/VOICEACTRESS100_040.wav 84 | data/ver1/jvs047/falset10/wav/VOICEACTRESS100_002.wav 85 | data/ver1/jvs047/whisper10/wav/VOICEACTRESS100_001.wav 86 | data/ver1/jvs049/parallel100/wav/VOICEACTRESS100_025.wav 87 | data/ver1/jvs049/parallel100/wav/VOICEACTRESS100_074.wav 88 | data/ver1/jvs051/nonpara30/wav/BASIC5000_1857.wav 89 | data/ver1/jvs051/nonpara30/wav/TRAVEL1000_0917.wav 90 | data/ver1/jvs052/parallel100/wav/VOICEACTRESS100_003.wav 91 | data/ver1/jvs053/parallel100/wav/VOICEACTRESS100_016.wav 92 | data/ver1/jvs053/parallel100/wav/VOICEACTRESS100_070.wav 93 | data/ver1/jvs053/parallel100/wav/VOICEACTRESS100_091.wav 94 | data/ver1/jvs055/parallel100/wav/VOICEACTRESS100_009.wav 95 | data/ver1/jvs055/parallel100/wav/VOICEACTRESS100_024.wav 96 | data/ver1/jvs056/parallel100/wav/VOICEACTRESS100_008.wav 97 | data/ver1/jvs057/nonpara30/wav/BASIC5000_1276.wav 98 | data/ver1/jvs057/parallel100/wav/VOICEACTRESS100_035.wav 99 | data/ver1/jvs057/parallel100/wav/VOICEACTRESS100_041.wav 
100 | data/ver1/jvs057/parallel100/wav/VOICEACTRESS100_076.wav 101 | data/ver1/jvs057/parallel100/wav/VOICEACTRESS100_093.wav 102 | data/ver1/jvs057/parallel100/wav/VOICEACTRESS100_097.wav 103 | data/ver1/jvs057/parallel100/wav/VOICEACTRESS100_100.wav 104 | data/ver1/jvs057/whisper10/wav/VOICEACTRESS100_004.wav 105 | data/ver1/jvs058/parallel100/wav/VOICEACTRESS100_061.wav 106 | data/ver1/jvs058/whisper10/wav/VOICEACTRESS100_004.wav 107 | data/ver1/jvs059/nonpara30/wav/BASIC5000_1588.wav 108 | data/ver1/jvs059/parallel100/wav/VOICEACTRESS100_020.wav 109 | data/ver1/jvs059/parallel100/wav/VOICEACTRESS100_024.wav 110 | data/ver1/jvs059/parallel100/wav/VOICEACTRESS100_074.wav 111 | data/ver1/jvs059/parallel100/wav/VOICEACTRESS100_090.wav 112 | data/ver1/jvs061/nonpara30/wav/BASIC5000_1540.wav 113 | data/ver1/jvs061/parallel100/wav/VOICEACTRESS100_002.wav 114 | data/ver1/jvs061/parallel100/wav/VOICEACTRESS100_009.wav 115 | data/ver1/jvs061/parallel100/wav/VOICEACTRESS100_015.wav 116 | data/ver1/jvs061/parallel100/wav/VOICEACTRESS100_076.wav 117 | data/ver1/jvs063/parallel100/wav/VOICEACTRESS100_097.wav 118 | data/ver1/jvs064/parallel100/wav/VOICEACTRESS100_039.wav 119 | data/ver1/jvs065/parallel100/wav/VOICEACTRESS100_021.wav 120 | data/ver1/jvs065/parallel100/wav/VOICEACTRESS100_066.wav 121 | data/ver1/jvs065/parallel100/wav/VOICEACTRESS100_094.wav 122 | data/ver1/jvs065/whisper10/wav/VOICEACTRESS100_003.wav 123 | data/ver1/jvs067/parallel100/wav/VOICEACTRESS100_084.wav 124 | data/ver1/jvs067/parallel100/wav/VOICEACTRESS100_085.wav 125 | data/ver1/jvs069/parallel100/wav/VOICEACTRESS100_001.wav 126 | data/ver1/jvs069/parallel100/wav/VOICEACTRESS100_012.wav 127 | data/ver1/jvs069/parallel100/wav/VOICEACTRESS100_045.wav 128 | data/ver1/jvs069/whisper10/wav/VOICEACTRESS100_001.wav 129 | data/ver1/jvs070/parallel100/wav/VOICEACTRESS100_013.wav 130 | data/ver1/jvs070/parallel100/wav/VOICEACTRESS100_051.wav 131 | data/ver1/jvs070/parallel100/wav/VOICEACTRESS100_063.wav 132 | 
data/ver1/jvs071/whisper10/wav/VOICEACTRESS100_005.wav 133 | data/ver1/jvs072/parallel100/wav/VOICEACTRESS100_009.wav 134 | data/ver1/jvs073/nonpara30/wav/BASIC5000_0070.wav 135 | data/ver1/jvs073/parallel100/wav/VOICEACTRESS100_011.wav 136 | data/ver1/jvs073/parallel100/wav/VOICEACTRESS100_029.wav 137 | data/ver1/jvs073/parallel100/wav/VOICEACTRESS100_034.wav 138 | data/ver1/jvs074/parallel100/wav/VOICEACTRESS100_083.wav 139 | data/ver1/jvs075/falset10/wav/VOICEACTRESS100_004.wav 140 | data/ver1/jvs075/parallel100/wav/VOICEACTRESS100_021.wav 141 | data/ver1/jvs075/parallel100/wav/VOICEACTRESS100_024.wav 142 | data/ver1/jvs075/parallel100/wav/VOICEACTRESS100_030.wav 143 | data/ver1/jvs075/parallel100/wav/VOICEACTRESS100_035.wav 144 | data/ver1/jvs075/parallel100/wav/VOICEACTRESS100_036.wav 145 | data/ver1/jvs076/nonpara30/wav/BASIC5000_1517.wav 146 | data/ver1/jvs076/parallel100/wav/VOICEACTRESS100_060.wav 147 | data/ver1/jvs077/nonpara30/wav/UT-PARAPHRASE-sent054-phrase2.wav 148 | data/ver1/jvs079/falset10/wav/VOICEACTRESS100_002.wav 149 | data/ver1/jvs079/parallel100/wav/VOICEACTRESS100_001.wav 150 | data/ver1/jvs079/parallel100/wav/VOICEACTRESS100_007.wav 151 | data/ver1/jvs079/whisper10/wav/BASIC5000_0382.wav 152 | data/ver1/jvs080/nonpara30/wav/BASIC5000_0860.wav 153 | data/ver1/jvs080/nonpara30/wav/TRAVEL1000_0241.wav 154 | data/ver1/jvs081/parallel100/wav/VOICEACTRESS100_021.wav 155 | data/ver1/jvs081/whisper10/wav/BASIC5000_1695.wav 156 | data/ver1/jvs082/parallel100/wav/VOICEACTRESS100_014.wav 157 | data/ver1/jvs082/parallel100/wav/VOICEACTRESS100_056.wav 158 | data/ver1/jvs082/parallel100/wav/VOICEACTRESS100_066.wav 159 | data/ver1/jvs082/parallel100/wav/VOICEACTRESS100_067.wav 160 | data/ver1/jvs083/nonpara30/wav/BASIC5000_2066.wav 161 | data/ver1/jvs083/nonpara30/wav/TRAVEL1000_0817.wav 162 | data/ver1/jvs083/parallel100/wav/VOICEACTRESS100_046.wav 163 | data/ver1/jvs083/parallel100/wav/VOICEACTRESS100_067.wav 164 | 
data/ver1/jvs084/parallel100/wav/VOICEACTRESS100_044.wav 165 | data/ver1/jvs084/parallel100/wav/VOICEACTRESS100_066.wav 166 | data/ver1/jvs084/whisper10/wav/BASIC5000_0591.wav 167 | data/ver1/jvs085/nonpara30/wav/BASIC5000_2356.wav 168 | data/ver1/jvs085/parallel100/wav/VOICEACTRESS100_013.wav 169 | data/ver1/jvs085/parallel100/wav/VOICEACTRESS100_086.wav 170 | data/ver1/jvs088/nonpara30/wav/BASIC5000_0545.wav 171 | data/ver1/jvs088/parallel100/wav/VOICEACTRESS100_057.wav 172 | data/ver1/jvs088/parallel100/wav/VOICEACTRESS100_087.wav 173 | data/ver1/jvs090/nonpara30/wav/BASIC5000_0219.wav 174 | data/ver1/jvs090/parallel100/wav/VOICEACTRESS100_005.wav 175 | data/ver1/jvs090/parallel100/wav/VOICEACTRESS100_060.wav 176 | data/ver1/jvs091/nonpara30/wav/BASIC5000_2247.wav 177 | data/ver1/jvs091/nonpara30/wav/BASIC5000_2583.wav 178 | data/ver1/jvs091/parallel100/wav/VOICEACTRESS100_001.wav 179 | data/ver1/jvs091/parallel100/wav/VOICEACTRESS100_004.wav 180 | data/ver1/jvs091/parallel100/wav/VOICEACTRESS100_087.wav 181 | data/ver1/jvs092/nonpara30/wav/ONOMATOPEE300_262.wav 182 | data/ver1/jvs092/parallel100/wav/VOICEACTRESS100_010.wav 183 | data/ver1/jvs093/nonpara30/wav/BASIC5000_2376.wav 184 | data/ver1/jvs093/parallel100/wav/VOICEACTRESS100_013.wav 185 | data/ver1/jvs093/parallel100/wav/VOICEACTRESS100_083.wav 186 | data/ver1/jvs094/parallel100/wav/VOICEACTRESS100_049.wav 187 | data/ver1/jvs095/nonpara30/wav/BASIC5000_1810.wav 188 | data/ver1/jvs096/parallel100/wav/VOICEACTRESS100_004.wav 189 | data/ver1/jvs096/parallel100/wav/VOICEACTRESS100_012.wav 190 | data/ver1/jvs096/parallel100/wav/VOICEACTRESS100_057.wav 191 | data/ver1/jvs096/parallel100/wav/VOICEACTRESS100_080.wav 192 | data/ver1/jvs096/parallel100/wav/VOICEACTRESS100_085.wav 193 | data/ver1/jvs097/parallel100/wav/VOICEACTRESS100_034.wav 194 | data/ver1/jvs097/parallel100/wav/VOICEACTRESS100_044.wav 195 | data/ver1/jvs097/parallel100/wav/VOICEACTRESS100_059.wav 196 | 
data/ver1/jvs097/whisper10/wav/VOICEACTRESS100_004.wav 197 | data/ver1/jvs098/parallel100/wav/VOICEACTRESS100_033.wav 198 | data/ver1/jvs098/whisper10/wav/TRAVEL1000_0220.wav 199 | data/ver1/jvs099/parallel100/wav/VOICEACTRESS100_077.wav 200 | data/ver1/jvs099/parallel100/wav/VOICEACTRESS100_078.wav 201 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | antlr4-python3-runtime==4.9.3 2 | audioread==3.0.1 3 | certifi==2024.8.30 4 | cffi==1.17.1 5 | charset-normalizer==3.3.2 6 | coloredlogs==15.0.1 7 | contourpy==1.3.0 8 | cycler==0.12.1 9 | Cython==3.0.11 10 | decorator==5.1.1 11 | exceptiongroup==1.2.2 12 | filelock==3.16.1 13 | flatbuffers==24.3.25 14 | fonttools==4.54.1 15 | fsspec==2024.9.0 16 | future==1.0.0 17 | h5py==3.12.1 18 | huggingface-hub==0.25.1 19 | humanfriendly==10.0 20 | hydra-core==1.3.2 21 | idna==3.10 22 | importlib_resources==6.4.5 23 | iniconfig==2.0.0 24 | Jinja2==3.1.4 25 | joblib==1.4.2 26 | kiwisolver==1.4.7 27 | lazy_loader==0.4 28 | librosa==0.10.2.post1 29 | llvmlite==0.43.0 30 | MarkupSafe==2.1.5 31 | matplotlib==3.9.2 32 | mpmath==1.3.0 33 | msgpack==1.1.0 34 | networkx==3.2.1 35 | numba==0.60.0 36 | numpy==1.26.4 37 | nvidia-cublas-cu12==12.1.3.1 38 | nvidia-cuda-cupti-cu12==12.1.105 39 | nvidia-cuda-nvrtc-cu12==12.1.105 40 | nvidia-cuda-runtime-cu12==12.1.105 41 | nvidia-cudnn-cu12==9.1.0.70 42 | nvidia-cufft-cu12==11.0.2.54 43 | nvidia-curand-cu12==10.3.2.106 44 | nvidia-cusolver-cu12==11.4.5.107 45 | nvidia-cusparse-cu12==12.1.0.106 46 | nvidia-nccl-cu12==2.20.5 47 | nvidia-nvjitlink-cu12==12.6.68 48 | nvidia-nvtx-cu12==12.1.105 49 | omegaconf==2.3.0 50 | packaging==24.1 51 | pillow==10.4.0 52 | platformdirs==4.3.6 53 | pluggy==1.5.0 54 | pooch==1.8.2 55 | protobuf==5.28.2 56 | pycparser==2.22 57 | pyloudnorm==0.1.1 58 | pyparsing==3.1.4 59 | pypesq==1.2.4 60 | pysptk==1.0.1 61 | pytest==8.3.3 62 | 
pytest-runner==6.0.1 63 | python-dateutil==2.9.0.post0 64 | pyworld==0.3.4 65 | PyYAML==6.0.2 66 | regex==2024.9.11 67 | requests==2.32.3 68 | safetensors==0.4.5 69 | scikit-learn==1.5.2 70 | scipy==1.13.1 71 | six==1.16.0 72 | soundfile==0.12.1 73 | soxr==0.5.0.post1 74 | sympy==1.13.3 75 | tensorboardX==2.6.2.2 76 | threadpoolctl==3.5.0 77 | tokenizers==0.20.0 78 | tomli==2.0.2 79 | torch==1.12.1+cu113 80 | torchaudio==0.12.1+cu113 81 | torchprofile==0.0.4 82 | torchvision==0.13.1+cu113 83 | tqdm==4.66.5 84 | transformers==4.45.1 85 | triton==3.0.0 86 | typing_extensions==4.12.2 87 | urllib3==2.2.3 88 | zipp==3.20.2 89 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | from wavehax import __version__ 6 | 7 | requirements = { 8 | "install": [ 9 | "wheel", 10 | "setuptools", 11 | "protobuf", 12 | "PyYAML", 13 | "tqdm", 14 | "h5py", 15 | "librosa", 16 | "soundfile", 17 | "pyloudnorm", 18 | "pyworld", 19 | "pysptk", 20 | "matplotlib", 21 | "hydra-core>=1.2", 22 | "torch>=1.9.0", 23 | "torchaudio>=0.8.1", 24 | "torchprofile", 25 | "transformers", 26 | "tensorboardX", 27 | ], 28 | "setup": [ 29 | "numpy", 30 | ], 31 | } 32 | 33 | entry_points = { 34 | "console_scripts": [ 35 | "wavehax-extract-features=wavehax.bin.extract_features:main", 36 | "wavehax-compute-statistics=wavehax.bin.compute_statistics:main", 37 | "wavehax-profile=wavehax.bin.profile:main", 38 | "wavehax-train=wavehax.bin.train:main", 39 | "wavehax-decode=wavehax.bin.decode:main", 40 | ] 41 | } 42 | 43 | install_requires = requirements["install"] 44 | setup_requires = requirements["setup"] 45 | 46 | readme_path = os.path.join(os.path.dirname(__file__), "README.md") 47 | long_description = "" 48 | if os.path.exists(readme_path): 49 | with open(readme_path, encoding="utf-8") as f: 50 | long_description = 
f.read() 51 | 52 | setup( 53 | name="wavehax", 54 | version=__version__, 55 | author="Reo Yoneyama", 56 | author_email="yoneyama.reo@g.sp.m.is.nagoya-u.ac.jp", 57 | url="http://github.com/chomeyama/wavehax", 58 | description="Wavehax official implementation", 59 | long_description_content_type="text/markdown", 60 | long_description=long_description, 61 | packages=find_packages(), 62 | install_requires=install_requires, 63 | setup_requires=setup_requires, 64 | entry_points=entry_points, 65 | ) 66 | -------------------------------------------------------------------------------- /wavehax/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /wavehax/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chomeyama/wavehax/d8edae1d465e0287eb379c6b158b20d249fb61bd/wavehax/bin/__init__.py -------------------------------------------------------------------------------- /wavehax/bin/compute_statistics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """ 5 | Statistics computing script for feature normalization. 6 | 7 | This script computes the mean and variance of various acoustic features for normalization 8 | purposes, commonly used in neural vocoder training. It updates or creates feature-specific 9 | scalers based on the provided list of feature files. 
def compute_statistics(cfg: DictConfig):
    """
    Compute and save statistics (mean and variance) for normalization of acoustic features.

    This function processes each feature specified in the configuration, calculating
    statistics in an online fashion using `partial_fit` to handle large datasets.

    Args:
        cfg (DictConfig): Configuration containing feature names, file paths, and save directory.

    Workflow:
        - Load or initialize scalers for each feature.
        - Read feature data from the provided HDF5 files.
        - Update the scalers with the new data.
        - Save the updated scalers to a specified path.

    Raises:
        Warning: If a feature array has length 0, a warning is logged and the feature is skipped.
    """
    # Resolve the statistics file path once, so that loading an existing scaler
    # dict and saving the updated one always refer to the same file even if
    # Hydra changes the working directory. (Previously the existence check used
    # the raw relative path while saving used the absolute one.)
    save_path = to_absolute_path(cfg.save_path)

    # Load previously saved scalers if present; scalers for the requested
    # features are re-initialized, but scalers of other features are preserved.
    scaler = load(save_path) if os.path.isfile(save_path) else {}
    for feat_name in cfg.feat_names:
        scaler[feat_name] = StandardScaler()

    # Get feature paths
    feat_paths = read_txt(to_absolute_path(cfg.filepath_list))
    logger.info(f"Number of utterances = {len(feat_paths)}.")

    # Perform online calculation
    for file_path in feat_paths:
        for feat_name in cfg.feat_names:
            feat = read_hdf5(to_absolute_path(file_path), feat_name)
            # F0 statistics are computed over voiced frames only (f0 > 0).
            if feat_name == "f0":
                feat = np.expand_dims(feat[feat > 0], axis=-1)
            if feat.shape[0] == 0:
                logger.warning(f"Feature length is 0 {file_path}, {feat_name}")
                continue
            scaler[feat_name].partial_fit(feat)

    # Save the computed statistics
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    dump(scaler, save_path)
    logger.info(f"Successfully saved statistics to {cfg.save_path}.")
9 | save_path: data/stats/train_no_dev.joblib # Path to file to output statistics. 10 | feat_names: ["f0", "cf0", "lf0", "vuv", "mgc", "map", "bap", "mel"] # Feature names. 11 | -------------------------------------------------------------------------------- /wavehax/bin/config/data/jvs.yaml: -------------------------------------------------------------------------------- 1 | # Dataset settings 2 | train_audio: data/scp/train_no_dev.scp # List file of training audio files. 3 | train_feat: data/list/train_no_dev.list # List file of training feature files. 4 | valid_audio: data/scp/dev.scp # List file of validation audio files. 5 | valid_feat: data/list/dev.list # List file of validation feature files. 6 | eval_audio: data/scp/eval_mid.scp # List file of evaluation feature files for decoding. 7 | eval_feat: data/list/eval_mid.list # List file of evaluation feature files for decoding. 8 | stats: data/stats/train_no_dev.joblib # Path to the file of statistics. 9 | 10 | # Feature settings 11 | feat_names: ["mel"] # Names of auxiliary features. 12 | use_continuous_f0: false # Whether to use continuous F0. 13 | 14 | # Data loader setting 15 | batch_max_length: 7680 # Length of each audio in batch. Make sure dividable by hop_length. 16 | batch_size: 16 # Batch size 17 | num_workers: 1 # Number of workers in Pytorch DataLoader 18 | pin_memory: true # Whether to pin memory in Pytorch DataLoader 19 | 20 | # Other setting 21 | remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_length 22 | allow_cache: false # Whether to allow cache in dataset. 
If true, it requires cpu memory 23 | -------------------------------------------------------------------------------- /wavehax/bin/config/decode.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - generator: wavehax 6 | - data: jvs 7 | 8 | hydra: 9 | run: 10 | dir: ./ 11 | output_subdir: null 12 | 13 | out_dir: # Path to the directory where checkpoint files are stored and decoding outputs will be saved. 14 | tag: wav # Subdirectory name where the decoding results will be saved. 15 | 16 | ckpt_path: # Path to the checkpoint file of the pre-trained model. 17 | ckpt_steps: 1000000 # Checkpoint step of the pre-trained model to be used. 18 | 19 | num_threads: 4 # Number of CPU threads to use for decoding. 20 | 21 | f0_factors: [1.00] # F0 scaling factors. 22 | 23 | seed: 100 # Random seed. 24 | -------------------------------------------------------------------------------- /wavehax/bin/config/discriminator/univnet.yaml: -------------------------------------------------------------------------------- 1 | _target_: wavehax.discriminators.MultiResolutionMultiPeriodDiscriminator 2 | # Multi-period discriminator related 3 | periods: [2, 3, 5, 7, 11] 4 | period_discriminator_params: 5 | channels: 32 6 | kernel_sizes: [5, 3] 7 | downsample_scales: [3, 3, 3, 3, 1] 8 | max_downsample_channels: 1024 9 | use_weight_norm: true 10 | use_spectral_norm: false 11 | # Multi-resolution discriminator related 12 | fft_sizes: [1024, 2048, 512] 13 | hop_sizes: [256, 512, 128] 14 | win_lengths: [1024, 2048, 512] 15 | spectral_discriminator_params: 16 | window: "hann_window" 17 | channels: 32 18 | kernel_sizes: 19 | - [7, 5] 20 | - [5, 3] 21 | - [5, 3] 22 | - [3, 3] 23 | - [3, 3] 24 | - [3, 3] 25 | strides: 26 | - [2, 2] 27 | - [2, 1] 28 | - [2, 2] 29 | - [2, 1] 30 | - [2, 2] 31 | - [1, 1] 32 | use_weight_norm: true 33 | 
-------------------------------------------------------------------------------- /wavehax/bin/config/extract_features.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | hydra: 4 | run: 5 | dir: ./ 6 | output_subdir: null 7 | 8 | audio_scp: data/scp/all.scp # List file of input wav files. 9 | in_dir: wav # Directory of input feature files. 10 | out_dir: hdf5 # Directory to save generated samples. 11 | feature_format: h5 # Feature format. 12 | sample_rate: 24000 # Sampling frequency of audio waveforms in Hz. 13 | spk_info: data/spk_style.yaml # YAML format speaker information. 14 | spk_idx: -4 # Speaker index of the split path. 15 | 16 | # Mel-spectrogram extraction setting. 17 | n_fft: 1024 # Number of Fourier transform points (FFT size). 18 | hop_length: 240 # Hop length (frameshift) in samples. 19 | n_mels: 100 # Number of mel basis. 20 | fmin: 0 # Minimum frequency in mel basis calculation. 21 | fmax: 8000 # Maximum frequency in mel basis calculation. 22 | 23 | # WORLD feature extraction setting. 24 | f0_min: 100 # Minimum F0 value. 25 | f0_max: 1000 # Maximum F0 value. 26 | shiftms: 10 # Frameshift in ms. 27 | fft_size: 2048 # Number of Fourier transform points (FFT size). 28 | mgc_dim: 40 # Number of dimension of mel-generalized cepstrum. 29 | map_dim: 20 # Number of dimention of mel-cepstral aperiodicity. 
30 | -------------------------------------------------------------------------------- /wavehax/bin/config/generator/cwavehax.v1.yaml: -------------------------------------------------------------------------------- 1 | _target_: wavehax.generators.ComplexWavehaxGenerator 2 | in_channels: 100 3 | channels: 16 4 | mult_channels: 2 5 | kernel_size: 7 6 | num_blocks: 8 7 | n_fft: 480 8 | hop_length: 240 9 | sample_rate: 24000 10 | prior_type: "pcph" 11 | drop_prob: 0.0 12 | init_weights: false 13 | -------------------------------------------------------------------------------- /wavehax/bin/config/generator/cwavehax.v2.yaml: -------------------------------------------------------------------------------- 1 | _target_: wavehax.generators.ComplexWavehaxGenerator 2 | in_channels: 100 3 | channels: 8 4 | mult_channels: 3 5 | kernel_size: [13, 7] 6 | num_blocks: 8 7 | n_fft: 480 8 | hop_length: 240 9 | sample_rate: 24000 10 | prior_type: "pcph" 11 | drop_prob: 0.0 12 | init_weights: false 13 | -------------------------------------------------------------------------------- /wavehax/bin/config/generator/wavehax.v1.yaml: -------------------------------------------------------------------------------- 1 | _target_: wavehax.generators.WavehaxGenerator 2 | in_channels: 100 3 | channels: 32 4 | mult_channels: 2 5 | kernel_size: 7 6 | num_blocks: 8 7 | n_fft: 480 8 | hop_length: 240 9 | sample_rate: 24000 10 | prior_type: "pcph" 11 | drop_prob: 0.0 12 | use_logmag_phase: false 13 | -------------------------------------------------------------------------------- /wavehax/bin/config/generator/wavehax.v2.yaml: -------------------------------------------------------------------------------- 1 | _target_: wavehax.generators.WavehaxGenerator 2 | in_channels: 100 3 | channels: 16 4 | mult_channels: 3 5 | kernel_size: [13, 7] 6 | num_blocks: 8 7 | n_fft: 480 8 | hop_length: 240 9 | sample_rate: 24000 10 | prior_type: "pcph" 11 | drop_prob: 0.0 12 | use_logmag_phase: false 13 | 
-------------------------------------------------------------------------------- /wavehax/bin/config/profile.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - generator: wavehax 6 | 7 | hydra: 8 | run: 9 | dir: ./ 10 | output_subdir: null 11 | -------------------------------------------------------------------------------- /wavehax/bin/config/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - generator: wavehax 6 | - discriminator: univnet 7 | - train: wavehax 8 | - data: jvs 9 | 10 | hydra: 11 | run: 12 | dir: ./ 13 | output_subdir: null 14 | 15 | out_dir: # Directory to output training results. 16 | seed: 0 # Random seed. 17 | -------------------------------------------------------------------------------- /wavehax/bin/config/train/wavehax.yaml: -------------------------------------------------------------------------------- 1 | # Interval setting 2 | discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. 3 | train_max_steps: 1000000 # Number of pre-training steps. 4 | save_interval_steps: 100000 # Interval steps to save checkpoint. 5 | eval_interval_steps: 2000 # Interval steps to evaluate the network. 6 | log_interval_steps: 2000 # Interval steps to record the training log. 7 | distributed_training: false # Whether to apply ditributed training. 8 | resume: # Epoch to resume training. 9 | load_only_params: false # Whether to load only model parameters. 10 | 11 | # Loss balancing coefficients. 
12 | lambda_mel: 45.0 13 | lambda_reg: 0.0 14 | lambda_phase: 0.0 15 | lambda_adv: 1.0 16 | lambda_fm: 2.0 17 | 18 | # Mel-spectral loss setting 19 | mel_loss: 20 | _target_: wavehax.losses.MelSpectralLoss 21 | n_fft: 1024 22 | hop_length: 256 23 | sample_rate: 24000 24 | n_mels: 100 25 | 26 | # Adversarial loss setting 27 | adv_loss: 28 | _target_: wavehax.losses.AdversarialLoss 29 | average_by_discriminators: false 30 | loss_type: hinge 31 | 32 | # Feature matching loss setting 33 | fm_loss: 34 | _target_: wavehax.losses.FeatureMatchingLoss 35 | average_by_layers: false 36 | 37 | # Optimizer and scheduler setting 38 | generator_optimizer: 39 | _target_: torch.optim.AdamW 40 | lr: 2.0e-4 41 | betas: [0.8, 0.9] 42 | weight_decay: 0.0 43 | generator_scheduler: 44 | _target_: transformers.get_cosine_schedule_with_warmup 45 | num_warmup_steps: 0 46 | num_training_steps: ${train.train_max_steps} 47 | generator_grad_norm: 10 48 | discriminator_optimizer: 49 | _target_: torch.optim.AdamW 50 | lr: 2.0e-4 51 | betas: [0.8, 0.9] 52 | weight_decay: 0.0 53 | discriminator_scheduler: 54 | _target_: transformers.get_cosine_schedule_with_warmup 55 | num_warmup_steps: 0 56 | num_training_steps: ${train.train_max_steps} 57 | discriminator_grad_norm: 10 58 | -------------------------------------------------------------------------------- /wavehax/bin/decode.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """ 5 | Decoding script for GAN-based vocoders. 6 | 7 | This script performs inference for GAN-based vocoders, loading model checkpoints and feature data, 8 | and generating audio waveforms based on the provided acoustic features. 
@hydra.main(version_base=None, config_path="config", config_name="decode")
def main(cfg: DictConfig) -> None:
    """
    Run the decoding process to generate audio waveforms from acoustic features.

    This function:
    - Loads a pre-trained GAN-based vocoder model.
    - Loads and scales feature data using a StandardScaler.
    - Decodes the features to generate corresponding audio waveforms.
    - Saves the generated audio as PCM 16-bit WAV files.

    Args:
        cfg (DictConfig): Configuration object loaded via Hydra.

    Raises:
        AssertionError: If the resolved checkpoint file does not exist.
    """
    # Fix random seeds for reproducibility of the decoding run.
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)
    torch.cuda.manual_seed(cfg.seed)
    os.environ["PYTHONHASHSEED"] = str(cfg.seed)
    torch.set_num_threads(cfg.num_threads)
    logger.info(f"Number of threads: {cfg.num_threads}.")

    # Set device for computation (CPU or GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Decode on {device}.")

    # Load scaler for normalizing features
    scaler = load(to_absolute_path(cfg.data.stats))

    # Load model parameters from checkpoint; when no explicit path is given,
    # resolve it from out_dir and the configured checkpoint step count.
    if cfg.ckpt_path is None:
        ckpt_path = os.path.join(
            cfg.out_dir,
            "checkpoints",
            f"checkpoint-{cfg.ckpt_steps}steps.pkl",
        )
    else:
        ckpt_path = cfg.ckpt_path
    assert os.path.exists(ckpt_path), f"Checkpoint file {ckpt_path} does not exist!"
    logger.info(f"Load model parameters from {ckpt_path}.")
    state_dict = torch.load(to_absolute_path(ckpt_path), map_location="cpu")
    state_dict = state_dict["model"]["generator"]

    # Instantiate and prepare the generator model.
    # Weight normalization is removed for inference.
    model = instantiate(cfg.generator)
    model.load_state_dict(state_dict)
    model.apply(remove_weight_norm)
    model.eval().to(device)

    # Prepare output directory for saving generated waveforms
    out_dir = to_absolute_path(os.path.join(cfg.out_dir, cfg.tag, str(cfg.ckpt_steps)))
    logger.info(f"Save output waveforms to {out_dir}.")
    os.makedirs(out_dir, exist_ok=True)

    # Get hop length from the model
    # NOTE(review): if the model exposes neither `hop_length` nor
    # `upsample_scales`, `hop_length` stays unbound and the FeatDataset call
    # below raises NameError — confirm every generator defines one of them.
    if hasattr(model, "hop_length"):
        hop_length = model.hop_length
    elif hasattr(model, "upsample_scales"):
        hop_length = np.prod(model.upsample_scales)

    total_rtf = 0.0  # Real-time factor tracker

    # Perform inference for each F0 scaling factor
    for f0_factor in cfg.f0_factors:
        # Prepare the dataset
        dataset = FeatDataset(
            scaler=scaler,
            feat_list=cfg.data.eval_feat,
            sample_rate=model.sample_rate,
            hop_length=hop_length,
            feat_names=cfg.data.feat_names,
            use_continuous_f0=cfg.data.use_continuous_f0,
            f0_factor=f0_factor,
            return_filename=True,
        )
        logger.info(f"The number of features to be decoded = {len(dataset)}.")

        with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
            for feat_path, c, f0 in pbar:
                # Shape features to (1, dim, frames) and F0 to (1, 1, frames).
                c = torch.FloatTensor(c.T).unsqueeze(0).to(device)
                f0 = torch.FloatTensor(f0).view(1, 1, -1).to(device)

                # Perform waveform generation
                start = time()
                y = model.inference(c, f0)
                rtf = (time() - start) / (y.size(-1) / model.sample_rate)
                pbar.set_postfix({"RTF": rtf})
                total_rtf += rtf

                y = y.view(-1).cpu().numpy()

                # Save generated waveform as a WAV file.
                # NOTE(review): the "jvs" branch assumes the path layout
                # .../<speaker>/<style>/wav/<utt>.h5 — confirm against the
                # feature extraction output structure.
                utt_id = os.path.splitext(os.path.basename(feat_path))[0]
                if "jvs" in feat_path:
                    spk_id, style_id = feat_path.split("/")[-4:-2]
                    save_path = os.path.join(
                        out_dir, f"{spk_id}_{style_id}_{utt_id}.wav"
                    )
                else:
                    save_path = os.path.join(out_dir, utt_id + ".wav")
                y = np.clip(y, -1, 1)
                sf.write(save_path, y, model.sample_rate, "PCM_16")

    # Report average real-time factor
    # NOTE(review): total_rtf accumulates over ALL f0_factors, but is divided
    # by len(dataset) only once — with multiple F0 scaling factors the reported
    # average RTF is inflated by len(f0_factors). Confirm intended.
    average_rtf = total_rtf / len(dataset)
    logger.info(
        f"Finished generation of {len(dataset)} utterances (RTF = {average_rtf:.6f})."
    )
def path_create(
    audio_paths: List[str], in_dir: str, out_dir: str, extname: str
) -> None:
    """
    Create the output directory tree for the feature files of the given audios.

    Each input path is mapped to its feature-file path via `path_replace`,
    which creates the destination directory as a side effect. Nothing is
    returned; this function is called purely for that side effect.
    (Previously annotated `-> List[str]` and documented as returning the paths,
    which it never did.)

    Args:
        audio_paths (List[str]): List of input audio file paths.
        in_dir (str): Directory containing the input audio files.
        out_dir (str): Directory where the extracted features will be saved.
        extname (str): File extension for the output files (e.g., "h5" for HDF5 files).
    """
    for audio_path in audio_paths:
        path_replace(audio_path, in_dir, out_dir, extname=extname)


def path_replace(
    file_path: str, in_dir: str, out_dir: str, extname: Optional[str] = None
) -> str:
    """
    Modify the file path by replacing the input directory with the output directory,
    and optionally changing the file extension. The parent directory of the
    resulting path is created if it does not already exist.

    NOTE(review): the substitution uses `str.replace`, so EVERY occurrence of
    `in_dir` anywhere in the path is replaced — callers must choose directory
    names that do not collide with other path components.

    Args:
        file_path (str): Original file path.
        in_dir (str): Input directory to be replaced in the path.
        out_dir (str): Output directory to be inserted in the path.
        extname (str, optional): New file extension without a dot (default: None).

    Returns:
        str: New file path with the updated directory and file extension.
    """
    # Swap the extension first so an extension equal to in_dir is not rewritten.
    if extname is not None:
        file_path = f"{os.path.splitext(file_path)[0]}.{extname}"
    file_path = file_path.replace(in_dir, out_dir)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    return file_path
def feature_list_create(audio_scp: str, cfg: Dict) -> None:
    """
    Write a list file containing the feature-file path of every input audio file.

    The output list path is derived from the scp path by swapping the "scp/"
    directory component for "list/" and the ".scp" suffix for ".list". Each
    audio path is mapped to its feature path via `path_replace` (which also
    creates the destination directory as a side effect).

    Args:
        audio_scp (str): Path to the SCP file that lists input audio files.
        cfg (Dict): Configuration with input/output directories and feature format.
    """
    audio_paths = read_txt(audio_scp)
    out_list_path = audio_scp.replace("scp/", "list/").replace(".scp", ".list")
    with open(out_list_path, "w") as writer:
        for wav_path in audio_paths:
            feat_path = path_replace(
                wav_path,
                cfg.in_dir,
                cfg.out_dir,
                extname=cfg.feature_format,
            )
            writer.write(feat_path + "\n")
174 | """ 175 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 176 | 177 | # Define mel-spectrogram extractor 178 | mel_extractor = MelSpectrogram( 179 | sample_rate=cfg.sample_rate, 180 | hop_length=cfg.hop_length, 181 | n_fft=cfg.n_fft, 182 | n_mels=cfg.n_mels, 183 | fmin=cfg.fmin, 184 | fmax=cfg.fmax, 185 | ).to(device) 186 | 187 | # Read speaker information 188 | spk_info = None 189 | if cfg.spk_info and os.path.exists(to_absolute_path(cfg.spk_info)): 190 | spk_info = read_yaml(to_absolute_path(cfg.spk_info)) 191 | logger.info(f"Speaker information: {spk_info}.") 192 | 193 | # Feature extraction loop 194 | for i, audio_path in enumerate(audio_paths): 195 | # Check the existence of speaker and style 196 | f0_min, f0_max = cfg["f0_min"], cfg["f0_max"] 197 | if spk_info is not None: 198 | spk, style = audio_path.split("/")[-4:-2] 199 | if spk not in spk_info: 200 | logger.warning(f"{spk} of {audio_path} is not in {spk_info}.") 201 | elif style not in spk_info[spk]: 202 | logger.warning(f"{spk}/{style} of {audio_path} is not in {spk_info}.") 203 | else: 204 | f0_min = spk_info[spk][style]["f0_min"] 205 | f0_max = spk_info[spk][style]["f0_max"] 206 | 207 | # Load audio file (WORLD analyzer requires float64) 208 | x = read_audio(audio_path, cfg.sample_rate).astype(np.float64) 209 | 210 | # Extract F0 211 | f0, t = pyworld.harvest( 212 | x, 213 | fs=cfg.sample_rate, 214 | f0_floor=f0_min if f0_min > 0 else cfg["f0_min"], 215 | f0_ceil=f0_max if f0_max > 0 else cfg["f0_max"], 216 | frame_period=1000 * cfg.hop_length / cfg.sample_rate, 217 | ) 218 | if f0_min <= 0 or f0_max <= 0: 219 | f0 *= 0.0 220 | 221 | # Extract spectral envelope and aperiodicity 222 | env = pyworld.cheaptrick(x, f0, t, fs=cfg.sample_rate, fft_size=cfg.fft_size) 223 | ap = pyworld.d4c(x, f0, t, fs=cfg.sample_rate, fft_size=cfg.fft_size) 224 | 225 | # Convert F0 to continuous F0 and voiced/unvoiced flags 226 | cf0 = convert_to_continuous_f0(np.copy(f0)) 227 | lf0 = np.log(cf0 
+ 1.0) 228 | vuv = f0 != 0 229 | 230 | # Convert spectral envelope to mel-generalized cepstrum (MGC) 231 | mgc = pysptk.sp2mc(env, order=cfg.mgc_dim - 1, alpha=ALPHA[cfg.sample_rate]) 232 | 233 | # Convert aperiodicity to mel-generalized cepstra and coded aperiodicity (MAP and CAP) 234 | map = pysptk.sp2mc(ap, order=cfg.map_dim - 1, alpha=ALPHA[cfg.sample_rate]) 235 | bap = pyworld.code_aperiodicity(ap, cfg.sample_rate) 236 | 237 | # Extract mel-spectrogram (MEL) 238 | x = torch.tensor(x, dtype=torch.float, device=device).view(1, -1) 239 | mel = mel_extractor(x)[0].cpu().numpy().T 240 | 241 | # Prepare output dictionary 242 | features = { 243 | "f0": f0.astype(np.float32).reshape(-1, 1), 244 | "cf0": cf0.astype(np.float32).reshape(-1, 1), 245 | "lf0": lf0.astype(np.float32).reshape(-1, 1), 246 | "vuv": vuv.astype(np.float32).reshape(-1, 1), 247 | "mgc": mgc.astype(np.float32), 248 | "map": map.astype(np.float32), 249 | "bap": bap.astype(np.float32), 250 | "mel": mel.astype(np.float32), 251 | } 252 | 253 | # Save features to HDF5 254 | feat_path = to_absolute_path( 255 | path_replace(audio_path, cfg.in_dir, cfg.out_dir, extname="h5") 256 | ) 257 | for key, value in features.items(): 258 | write_hdf5(feat_path, key, value) 259 | logger.info( 260 | f"Processed {audio_path} and saved features to {feat_path} ({i + 1}/{len(audio_paths)})." 
@hydra.main(config_path="config", config_name="extract_features")
def main(cfg: DictConfig) -> None:
    """Entry point: split the audio list and extract features with parallel workers."""
    logger.info(OmegaConf.to_yaml(cfg))

    # Read the full list of audio files
    audio_files = read_txt(to_absolute_path(cfg.audio_scp))
    logger.info(f"Number of utterances = {len(audio_files)}")

    # Split the list into chunks, one per worker process
    chunks = [part.tolist() for part in np.array_split(audio_files, 4)]

    # Create the feature list file
    feature_list_create(to_absolute_path(cfg.audio_scp), cfg)

    # Create output folders
    path_create(audio_files, cfg.in_dir, cfg.out_dir, cfg.feature_format)

    # Launch one worker per chunk
    queue = mp.Queue()
    workers = []
    for chunk in chunks:
        proc = mp.Process(target=extract_acoustic_features, args=(queue, chunk, cfg))
        proc.start()
        workers.append(proc)

    # Wait until every worker has finished
    for proc in workers:
        proc.join()
9 | """ 10 | 11 | import hydra 12 | import torch 13 | from omegaconf import DictConfig 14 | from torchprofile import profile_macs 15 | 16 | from wavehax.modules import remove_weight_norm 17 | 18 | 19 | @hydra.main(version_base=None, config_path="config", config_name="profile") 20 | def main(cfg: DictConfig) -> None: 21 | """Profile model parameters and MACs.""" 22 | 23 | # Instantiate model 24 | model = hydra.utils.instantiate(cfg.generator) 25 | model.apply(remove_weight_norm) 26 | model.eval() 27 | 28 | # Generated waveform duration in seconds 29 | dur_in_sec = 1.0 30 | 31 | # Prepare dummy inputs 32 | num_frames = int(model.sample_rate / model.hop_length * dur_in_sec) 33 | cond = torch.randn(1, model.in_channels, num_frames) 34 | f0 = torch.ones(1, 1, num_frames) 35 | 36 | # Calculate MACs 37 | macs = profile_macs(model, (cond, f0)) 38 | 39 | # Calculate learnable model parameters 40 | params = 0 41 | for p in model.parameters(): 42 | if p.requires_grad: 43 | params += p.numel() 44 | 45 | print(f"Model class: {model.__class__.__name__}") 46 | print(f"Duration: {dur_in_sec} [sec]") 47 | print(f"MAC counts: {macs}") 48 | print(f"Parameters: {params}") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /wavehax/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from wavehax.datasets.audio_feat_dataset import * # NOQA 2 | -------------------------------------------------------------------------------- /wavehax/datasets/audio_feat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """ 5 | Audio and feature dataset modules. 6 | 7 | These modules provide classes to handle datasets of audio and acoustic features. 
class AudioFeatDataset(Dataset):
    """PyTorch compatible dataset for paired audio and acoustic features."""

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        audio_list: str,
        feat_list: str,
        feat_names: List[str],
        use_continuous_f0: bool,
        scaler: StandardScaler,
        audio_length_threshold: Optional[int] = None,
        feat_length_threshold: Optional[int] = None,
        return_filename: Optional[bool] = False,
        allow_cache: Optional[bool] = False,
    ) -> None:
        """
        Initialize the AudioFeatDataset.

        Args:
            sample_rate (int): Sampling frequency of the audio.
            hop_length (int): Hop size for acoustic features.
            audio_list (str): Filepath to a list of audio files.
            feat_list (str): Filepath to a list of feature files.
            feat_names (List[str]): Names of auxiliary features to load.
            use_continuous_f0 (bool): Whether to use continuous F0 values.
            scaler (StandardScaler): A fitted scaler for feature normalization.
            audio_length_threshold (int, optional): Minimum audio length to include in the dataset (default: None).
            feat_length_threshold (int, optional): Minimum feature length to include in the dataset (default: None).
            return_filename (bool, optional): Whether to return filenames with the data (default: False).
            allow_cache (bool, optional): Whether to cache loaded data in memory for faster access (default: False).
        """

        # Load audio and feature file lists & check filename correspondence
        audio_files = read_txt(to_absolute_path(audio_list))
        feat_files = read_txt(to_absolute_path(feat_list))
        assert check_filename(audio_files, feat_files)

        # Drop utterances whose waveform is shorter than the threshold
        if audio_length_threshold is not None:
            audio_lengths = [
                read_audio(to_absolute_path(f), sample_rate).shape[0]
                for f in audio_files
            ]
            idxs = [
                idx
                for idx in range(len(audio_files))
                if audio_lengths[idx] > audio_length_threshold
            ]
            if len(audio_files) != len(idxs):
                logger.warning(
                    f"Some files are filtered by audio length threshold "
                    f"({len(audio_files)} -> {len(idxs)})."
                )
            audio_files = [audio_files[idx] for idx in idxs]
            feat_files = [feat_files[idx] for idx in idxs]

        # Drop utterances whose features are shorter than the threshold;
        # the first feature in feat_names is used as the length reference
        if feat_length_threshold is not None:
            f0_lengths = [
                read_hdf5(to_absolute_path(f), feat_names[0]).shape[0]
                for f in feat_files
            ]
            idxs = [
                idx
                for idx in range(len(feat_files))
                if f0_lengths[idx] > feat_length_threshold
            ]
            if len(feat_files) != len(idxs):
                logger.warning(
                    f"Some files are filtered by mel length threshold "
                    f"({len(feat_files)} -> {len(idxs)})."
                )
            audio_files = [audio_files[idx] for idx in idxs]
            feat_files = [feat_files[idx] for idx in idxs]

        # Check the number of files
        # FIX: the message previously rendered a literal "$" ("${audio_list}"
        # inside an f-string); interpolate the list path correctly.
        assert len(audio_files) != 0, f"{audio_list} is empty."
        assert (
            len(audio_files) == len(feat_files)
        ), f"Number of audio and features files are different ({len(audio_files)} vs {len(feat_files)})."

        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.audio_files = audio_files
        self.feat_files = feat_files
        self.feat_names = feat_names
        self.f0_type = "cf0" if use_continuous_f0 else "f0"
        self.scaler = scaler
        self.return_filename = return_filename
        self.allow_cache = allow_cache

        if allow_cache:
            # NOTE(kan-bayashi): Manager is need to share memory in dataloader with num_workers > 0
            self.manager = Manager()
            self.caches = self.manager.list()
            self.caches += [() for _ in range(len(audio_files))]

    def __getitem__(self, idx: int) -> List[Any]:
        """
        Get the specified index data.

        Args:
            idx (int): Index of the item.

        Returns:
            list: [filename (optional), audio waveform, auxiliary features, F0 sequence]
        """
        # Serve from the shared cache when the item was already loaded
        if self.allow_cache and len(self.caches[idx]) != 0:
            return self.caches[idx]

        # Load audio waveform
        audio = read_audio(to_absolute_path(self.audio_files[idx]), self.sample_rate)

        # Load and normalize each auxiliary feature, then concatenate channel-wise
        feats = []
        for feat_name in self.feat_names:
            feat = read_hdf5(to_absolute_path(self.feat_files[idx]), feat_name)
            feat = self.scaler[feat_name].transform(feat)
            feats += [feat]
        feat = np.concatenate(feats, axis=1)

        # Load the F0 sequence and align lengths with the features
        f0 = read_hdf5(to_absolute_path(self.feat_files[idx]), self.f0_type)
        feat, f0 = validate_length([feat, f0])

        items = [audio, feat, f0]
        if self.return_filename:
            items = [self.feat_files[idx]] + items

        if self.allow_cache:
            self.caches[idx] = items

        return items

    def __len__(self) -> int:
        """
        Return the number of items in the dataset.

        Returns:
            int: Dataset size.
        """
        return len(self.audio_files)
174 | """ 175 | return len(self.audio_files) 176 | 177 | 178 | class FeatDataset(Dataset): 179 | """PyTorch compatible dataset for acoustic features.""" 180 | 181 | def __init__( 182 | self, 183 | sample_rate: int, 184 | hop_length: int, 185 | feat_list: str, 186 | feat_names: List[str], 187 | use_continuous_f0: bool, 188 | scaler: StandardScaler, 189 | f0_factor: Optional[float] = 1.0, 190 | return_filename: Optional[bool] = False, 191 | ) -> None: 192 | """ 193 | Initialize the FeatDataset. 194 | 195 | Args: 196 | sample_rate (int): Sampling frequency of the audio. 197 | hop_length (int): Hop size for acoustic features. 198 | feat_list (str): Filepath to a list of feature files. 199 | feat_names (List[str]): Names of auxiliary features to load. 200 | use_continuous_f0 (bool): Whether to use continuous F0 values. 201 | scaler (StandardScaler): A fitted scaler for feature normalization. 202 | f0_factor (float, optional): Scaling factor for the F0 values (default: [1.0]). 203 | return_filename (bool, optional): Whether to return filenames with the data (default: False). 204 | """ 205 | self.sample_rate = sample_rate 206 | self.hop_length = hop_length 207 | self.feat_files = read_txt(to_absolute_path(feat_list)) 208 | self.feat_names = feat_names 209 | self.f0_type = "cf0" if use_continuous_f0 else "f0" 210 | self.scaler = scaler 211 | self.f0_factor = f0_factor 212 | self.return_filename = return_filename 213 | 214 | def __getitem__(self, idx: int) -> List[Any]: 215 | """ 216 | Get the specified index data. 217 | 218 | Args: 219 | idx (int): Index of the item. 
220 | 221 | Returns: 222 | list: [filename (optional), auxiliary features, F0 sequence] 223 | """ 224 | # Get auxiliary features 225 | feats = [] 226 | for feat_name in self.feat_names: 227 | feat = read_hdf5(to_absolute_path(self.feat_files[idx]), feat_name) 228 | if feat_name in ["f0", "cf0"]: 229 | feat *= self.f0_factor 230 | feat = self.scaler[feat_name].transform(feat) 231 | feats += [feat] 232 | feat = np.concatenate(feats, axis=1) 233 | 234 | # Get F0 sequences 235 | f0 = read_hdf5(to_absolute_path(self.feat_files[idx]), self.f0_type) 236 | f0 = f0 * self.f0_factor 237 | feat, f0 = validate_length([feat, f0]) 238 | 239 | items = [feat, f0] 240 | if self.return_filename: 241 | items = [self.feat_files[idx]] + items 242 | 243 | return items 244 | 245 | def __len__(self) -> int: 246 | """ 247 | Return the number of items in the dataset. 248 | 249 | Returns: 250 | int: Dataset size. 251 | """ 252 | return len(self.feat_files) 253 | -------------------------------------------------------------------------------- /wavehax/discriminators/__init__.py: -------------------------------------------------------------------------------- 1 | from wavehax.discriminators.univnet import * # NOQA 2 | -------------------------------------------------------------------------------- /wavehax/generators/__init__.py: -------------------------------------------------------------------------------- 1 | from wavehax.generators.wavehax import * # NOQA 2 | -------------------------------------------------------------------------------- /wavehax/generators/wavehax.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """Wavehax generator modules.""" 5 | 6 | from functools import partial 7 | 8 | import torch 9 | from torch import Tensor, nn 10 | 11 | import wavehax.modules 12 | from wavehax.modules import ( 13 | STFT, 14 | ComplexConv1d, 15 
| ComplexConv2d, 16 | ComplexConvNeXtBlock2d, 17 | ComplexLayerNorm2d, 18 | ConvNeXtBlock2d, 19 | LayerNorm2d, 20 | to_log_magnitude_and_phase, 21 | to_real_imaginary, 22 | ) 23 | 24 | 25 | class WavehaxGenerator(nn.Module): 26 | """ 27 | Wavehax generator module. 28 | 29 | This module produces time-domain waveforms through complex spectrogram estimation 30 | based on the integration of 2D convolution and harmonic prior spectrograms. 31 | """ 32 | 33 | def __init__( 34 | self, 35 | in_channels: int, 36 | channels: int, 37 | mult_channels: int, 38 | kernel_size: int, 39 | num_blocks: int, 40 | n_fft: int, 41 | hop_length: int, 42 | sample_rate: int, 43 | prior_type: str, 44 | drop_prob: float = 0.0, 45 | use_layer_norm: bool = True, 46 | use_logmag_phase: bool = False, 47 | ) -> None: 48 | """ 49 | Initialize the WavehaxGenerator module. 50 | 51 | Args: 52 | in_channels (int): Number of conditioning feature channels. 53 | channels (int): Number of hidden feature channels. 54 | mult_channels (int): Channel expansion multiplier for ConvNeXt blocks. 55 | kernel_size (int): Kernel size for ConvNeXt blocks. 56 | num_blocks (int): Number of ConvNeXt residual blocks. 57 | n_fft (int): Number of Fourier transform points (FFT size). 58 | hop_length (int): Hop length (frameshift) in samples. 59 | sample_rate (int): Sampling frequency of input and output waveforms in Hz. 60 | prior_type (str): Type of prior waveforms used. 61 | drop_prob (float): Probability of dropping paths for stochastic depth (default: 0.0). 62 | use_layer_norm (bool): If True, layer normalization is used; otherwise, 63 | batch normalization is applied (default: True). 64 | use_logmag_phase (bool): Whether to use log-magnitude and phase for STFT (default: False). 
65 | """ 66 | super().__init__() 67 | self.in_channels = in_channels 68 | self.n_fft = n_fft 69 | self.n_bins = n_fft // 2 + 1 70 | self.hop_length = hop_length 71 | self.sample_rate = sample_rate 72 | self.use_logmag_phase = use_logmag_phase 73 | 74 | # Prior waveform generator 75 | self.prior_generator = partial( 76 | getattr(wavehax.modules, f"generate_{prior_type}"), 77 | hop_length=self.hop_length, 78 | sample_rate=sample_rate, 79 | ) 80 | 81 | # STFT layer 82 | self.stft = STFT(n_fft=n_fft, hop_length=hop_length) 83 | 84 | # Input projection layers 85 | n_bins = n_fft // 2 + 1 86 | self.prior_proj = nn.Conv1d( 87 | n_bins, n_bins, 7, padding=3, padding_mode="reflect" 88 | ) 89 | self.cond_proj = nn.Conv1d( 90 | in_channels, n_bins, 7, padding=3, padding_mode="reflect" 91 | ) 92 | 93 | # Input normalization and projection layers 94 | self.input_proj = nn.Conv2d(5, channels, 1, bias=False) 95 | self.input_norm = LayerNorm2d(channels) 96 | 97 | # ConvNeXt-based residual blocks 98 | self.blocks = nn.ModuleList() 99 | for _ in range(num_blocks): 100 | block = ConvNeXtBlock2d( 101 | channels, 102 | mult_channels, 103 | kernel_size, 104 | drop_prob=drop_prob, 105 | use_layer_norm=use_layer_norm, 106 | layer_scale_init_value=1 / num_blocks, 107 | ) 108 | self.blocks += [block] 109 | 110 | # Output normalization and projection layers 111 | self.output_norm = LayerNorm2d(channels) 112 | self.output_proj = nn.Conv2d(channels, 2, 1) 113 | 114 | self.apply(self.init_weights) 115 | 116 | def init_weights(self, m) -> None: 117 | """ 118 | Initialize weights of the module. 119 | 120 | Args: 121 | m (Any): Module to initialize. 122 | """ 123 | if isinstance(m, (nn.Conv1d, nn.Conv2d)): 124 | nn.init.trunc_normal_(m.weight, std=0.02) 125 | if m.bias is not None: 126 | nn.init.constant_(m.bias, 0.0) 127 | 128 | def forward(self, cond: Tensor, f0: Tensor) -> Tensor: 129 | """ 130 | Calculate forward propagation. 
131 | 132 | Args: 133 | cond (Tensor): Conditioning features with shape (batch, in_channels, frames). 134 | f0 (Tensor): F0 sequences with shape (batch, 1, frames). 135 | 136 | Returns: 137 | Tensor: Generated waveforms with shape (batch, 1, frames * hop_length). 138 | Tensor: Generated prior waveforms with shape (batch, 1, frames * hop_length). 139 | """ 140 | # Generate prior waveform and compute spectrogram 141 | with torch.no_grad(): 142 | prior = self.prior_generator(f0) 143 | real, imag = self.stft(prior) 144 | if self.use_logmag_phase: 145 | prior1, prior2 = to_log_magnitude_and_phase(real, imag) 146 | else: 147 | prior1, prior2 = real, imag 148 | 149 | # Apply input projection 150 | prior1_proj = self.prior_proj(prior1) 151 | prior2_proj = self.prior_proj(prior2) 152 | cond = self.cond_proj(cond) 153 | 154 | # Convert to 2d representation 155 | x = torch.stack([prior1, prior2, prior1_proj, prior2_proj, cond], dim=1) 156 | x = self.input_proj(x) 157 | x = self.input_norm(x) 158 | 159 | # Apply residual blocks 160 | for f in self.blocks: 161 | x = f(x) 162 | 163 | # Apply output projection 164 | x = self.output_norm(x) 165 | x = self.output_proj(x) 166 | 167 | # Apply iSTFT followed by overlap and add 168 | if self.use_logmag_phase: 169 | real, imag = to_real_imaginary(x[:, 0], x[:, 1]) 170 | else: 171 | real, imag = x[:, 0], x[:, 1] 172 | x = self.stft.inverse(real, imag) 173 | 174 | return x, prior 175 | 176 | @torch.inference_mode() 177 | def inference(self, cond: Tensor, f0: Tensor) -> Tensor: 178 | return self(cond, f0)[0] 179 | 180 | 181 | class ComplexWavehaxGenerator(nn.Module): 182 | """ 183 | Complex-valued Wavehax generator module. 184 | 185 | This class examines whether incorporating the algebraic structure of complex numbers enhances performance. 
186 | Although this complex-valued version slightly increases computational cost compared to the standard Wavehax with 187 | an almost equivalent number of channels, consistent performance improvement was not observed. 188 | This code is shared in the hope that it may be useful for further research or for developing more advanced methods. 189 | """ 190 | 191 | def __init__( 192 | self, 193 | in_channels: int, 194 | channels: int, 195 | mult_channels: int, 196 | kernel_size: int, 197 | num_blocks: int, 198 | n_fft: int, 199 | hop_length: int, 200 | sample_rate: int, 201 | prior_type: str, 202 | drop_prob: float = 0.0, 203 | use_layer_norm: bool = True, 204 | init_weights: bool = False, 205 | ) -> None: 206 | """ 207 | Initialize the ComplexWavehaxGenerator module. 208 | 209 | Args: 210 | in_channels (int): Number of conditioning feature channels. 211 | channels (int): Number of hidden feature channels. 212 | Note that both real and imaginary parts will retain this number of channels. 213 | mult_channels (int): Channel expansion multiplier for ConvNeXt blocks. 214 | kernel_size (int): Kernel size for ConvNeXt blocks. 215 | num_blocks (int): Number of ConvNeXt residual blocks. 216 | n_fft (int): Number of Fourier transform points (FFT size). 217 | hop_length (int): Hop length (frameshift) in samples. 218 | sample_rate (int): Sampling frequency of input and output waveforms in Hz. 219 | prior_type (str): Type of prior waveforms used. 220 | drop_prob (float): Probability of dropping paths for stochastic depth (default: 0.0). 221 | use_layer_norm (bool): If True, layer normalization is used; otherwise, 222 | batch normalization is applied (default: True). 223 | init_weights (bool): If True, apply the weight initialization of the standard Wavehax, 224 | instead of the weight initialization designed for complex-valued weights (default: False). 
225 | """ 226 | super().__init__() 227 | self.in_channels = in_channels 228 | self.n_fft = n_fft 229 | self.n_bins = n_fft // 2 + 1 230 | self.hop_length = hop_length 231 | self.sample_rate = sample_rate 232 | 233 | # Prior waveform generator 234 | self.prior_generator = partial( 235 | getattr(wavehax.modules, f"generate_{prior_type}"), 236 | hop_length=self.hop_length, 237 | sample_rate=sample_rate, 238 | ) 239 | 240 | # STFT layer 241 | self.stft = STFT(n_fft=n_fft, hop_length=hop_length) 242 | 243 | # Input projection layers 244 | n_bins = n_fft // 2 + 1 245 | self.prior_proj = ComplexConv1d( 246 | n_bins, n_bins, 7, padding=3, padding_mode="reflect" 247 | ) 248 | self.cond_proj = ComplexConv1d( 249 | in_channels, n_bins, 7, padding=3, padding_mode="reflect" 250 | ) 251 | 252 | # Input normalization and projection layers 253 | self.input_proj = ComplexConv2d(3, channels, 1, bias=False) 254 | self.input_norm = ComplexLayerNorm2d(channels) 255 | 256 | # ConvNeXt-based residual blocks 257 | self.blocks = nn.ModuleList() 258 | for _ in range(num_blocks): 259 | block = ComplexConvNeXtBlock2d( 260 | channels, 261 | mult_channels, 262 | kernel_size, 263 | drop_prob=drop_prob, 264 | use_layer_norm=use_layer_norm, 265 | layer_scale_init_value=1 / num_blocks, 266 | ) 267 | self.blocks += [block] 268 | 269 | # Output normalization and projection layers 270 | self.output_norm = ComplexLayerNorm2d(channels) 271 | self.output_proj = ComplexConv2d(channels, 1, 1) 272 | 273 | # Apply the standard Wavehax weight initialization, which tends to produce better results. 274 | if init_weights: 275 | self.apply(self.init_weights) 276 | 277 | def init_weights(self, m) -> None: 278 | """ 279 | Initialize weights of the module. 280 | 281 | Args: 282 | m (Any): Module to initialize. 
283 | """ 284 | if isinstance(m, (nn.Conv1d, nn.Conv2d)): 285 | nn.init.trunc_normal_(m.weight, std=0.02) 286 | if m.bias is not None: 287 | nn.init.constant_(m.bias, 0.0) 288 | 289 | def forward(self, cond: Tensor, f0: Tensor) -> Tensor: 290 | """ 291 | Calculate forward propagation. 292 | 293 | Args: 294 | cond (Tensor): Conditioning features with shape (batch, in_channels, frames). 295 | f0 (Tensor): F0 sequences with shape (batch, 1, frames). 296 | 297 | Returns: 298 | Tensor: Generated waveforms with shape (batch, 1, frames * hop_length). 299 | Tensor: Generated prior waveforms with shape (batch, 1, frames * hop_length). 300 | """ 301 | # Generate prior waveform and compute spectrogram 302 | with torch.no_grad(): 303 | prior = self.prior_generator(f0) 304 | real, imag = self.stft(prior) 305 | 306 | # Apply input projection 307 | real_proj, imag_proj = self.prior_proj(real, imag) 308 | cond_real, cond_imag = self.cond_proj(cond, cond) 309 | 310 | # Convert to 2d representation 311 | real = torch.stack([real, real_proj, cond_real], dim=1) 312 | imag = torch.stack([imag, imag_proj, cond_imag], dim=1) 313 | real, imag = self.input_proj(real, imag) 314 | real, imag = self.input_norm(real, imag) 315 | 316 | # Apply residual blocks 317 | for f in self.blocks: 318 | real, imag = f(real, imag) 319 | 320 | # Apply output projection 321 | real, imag = self.output_norm(real, imag) 322 | real, imag = self.output_proj(real, imag) 323 | 324 | # Apply iSTFT followed by overlap and add 325 | real, imag = real.squeeze(1), imag.squeeze(1) 326 | x = self.stft.inverse(real, imag) 327 | 328 | return x, prior 329 | 330 | @torch.inference_mode() 331 | def inference(self, cond: Tensor, f0: Tensor) -> Tensor: 332 | return self(cond, f0)[0] 333 | -------------------------------------------------------------------------------- /wavehax/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from wavehax.losses.adv import * # NOQA 2 | from 
class AdversarialLoss(nn.Module):
    """Module for calculating adversarial loss in GANs."""

    def __init__(
        self,
        average_by_discriminators: Optional[bool] = False,
        loss_type: Optional[str] = "mse",
    ) -> None:
        """
        Initialize the AdversarialLoss module.

        Args:
            average_by_discriminators (bool, optional): If True, the loss is averaged over the number of discriminators (default: False).
            loss_type (str, optional): Type of GAN loss to use, either "mse" or "hinge",
                case-insensitive (default: "mse").
        """
        super().__init__()
        # FIX: normalize the case once. Previously only the assertion was
        # case-insensitive, so e.g. loss_type="MSE" passed validation but the
        # dispatch below silently selected the hinge criteria.
        loss_type = loss_type.lower()
        assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported."
        self.average_by_discriminators = average_by_discriminators

        if loss_type == "mse":
            self.adv_criterion = self._mse_adv_loss
            self.fake_criterion = self._mse_fake_loss
            self.real_criterion = self._mse_real_loss
        else:
            self.adv_criterion = self._hinge_adv_loss
            self.fake_criterion = self._hinge_fake_loss
            self.real_criterion = self._hinge_real_loss

    def forward(
        self, p_fakes: List[Tensor], p_reals: Optional[List[Tensor]] = None
    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
        """
        Calculate adversarial loss for both generator and discriminator.

        Args:
            p_fakes (List[Tensor]): List of discriminator outputs from the generated data.
            p_reals (List[Tensor], optional): List of discriminator outputs from real data.
                If not provided, only generator loss is computed (default: None).

        Returns:
            Tensor: Generator adversarial loss.
            If p_reals is provided:
                Tuple[Tensor, Tensor]: Fake and real discriminator loss values.
        """
        # Generator adversarial loss
        if p_reals is None:
            adv_loss = 0.0
            for p_fake in p_fakes:
                adv_loss += self.adv_criterion(p_fake)

            if self.average_by_discriminators:
                adv_loss /= len(p_fakes)

            return adv_loss

        # Discriminator adversarial loss
        else:
            fake_loss, real_loss = 0.0, 0.0
            for p_fake, p_real in zip(p_fakes, p_reals):
                fake_loss += self.fake_criterion(p_fake)
                real_loss += self.real_criterion(p_real)

            if self.average_by_discriminators:
                fake_loss /= len(p_fakes)
                real_loss /= len(p_reals)

            return fake_loss, real_loss

    def _mse_adv_loss(self, x: Tensor) -> Tensor:
        """Calculate MSE loss for generator."""
        return F.mse_loss(x, x.new_ones(x.size()))

    def _mse_real_loss(self, x: Tensor) -> Tensor:
        """Calculate MSE loss for real samples."""
        return F.mse_loss(x, x.new_ones(x.size()))

    def _mse_fake_loss(self, x: Tensor) -> Tensor:
        """Calculate MSE loss for fake samples."""
        return F.mse_loss(x, x.new_zeros(x.size()))

    def _hinge_adv_loss(self, x: Tensor) -> Tensor:
        """Calculate hinge loss for generator."""
        return -x.mean()

    def _hinge_real_loss(self, x: Tensor) -> Tensor:
        """Calculate hinge loss for real samples."""
        return -torch.mean(torch.min(x - 1, x.new_zeros(x.size())))

    def _hinge_fake_loss(self, x: Tensor) -> Tensor:
        """Calculate hinge loss for fake samples."""
        return -torch.mean(torch.min(-x - 1, x.new_zeros(x.size())))
class FeatureMatchingLoss(nn.Module):
    """Module for feature matching loss in GANs, comparing latent features."""

    def __init__(self, average_by_layers: Optional[bool] = False) -> None:
        """
        Initialize the FeatureMatchingLoss module.

        Args:
            average_by_layers (bool, optional): If True, the loss is averaged over
                the number of layers (default: False).
        """
        super().__init__()
        self.average_by_layers = average_by_layers

    def forward(self, fmaps_fake: List[Tensor], fmaps_real: List[Tensor]) -> Tensor:
        """
        Calculate feature matching loss.

        Args:
            fmaps_fake (List[Tensor]): List of discriminator's latent features from generated data.
            fmaps_real (List[Tensor]): List of discriminator's latent features from real data.

        Returns:
            Tensor: The feature matching loss value.
        """
        assert len(fmaps_fake) == len(fmaps_real)

        # Accumulate the per-layer L1 distance; real features are detached so
        # gradients flow only through the generated branch.
        fm_loss = 0.0
        for fake, real in zip(fmaps_fake, fmaps_real):
            fm_loss = fm_loss + F.l1_loss(fake, real.detach())

        if self.average_by_layers:
            fm_loss = fm_loss / len(fmaps_fake)

        return fm_loss
class MelSpectralLoss(nn.Module):
    """Module for calculating L1 loss on mel-spectrograms."""

    def __init__(
        self,
        sample_rate: int,
        hop_length: int,
        n_fft: int,
        n_mels: int,
        window: Optional[str] = "hann_window",
        fmin: Optional[float] = 0,
        fmax: Optional[float] = None,
    ) -> None:
        """
        Initialize the MelSpectralLoss module.

        Args:
            sample_rate (int): Sampling frequency of input waveforms.
            hop_length (int): Hop length (frameshift) in samples.
            n_fft (int): Number of Fourier transform points (FFT size).
            n_mels (int): Number of mel basis.
            window (str, optional): Name of the window function (default: "hann_window").
            fmin (float, optional): Minimum frequency for mel-filter bank (default: 0).
            fmax (float, optional): Maximum frequency for mel-filter bank (default: None).
        """
        super().__init__()
        self.mel_extractor = MelSpectrogram(
            sample_rate=sample_rate,
            hop_length=hop_length,
            n_fft=n_fft,
            n_mels=n_mels,
            window=window,
            fmin=fmin,
            fmax=fmax,
        )

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        """
        Calculate the L1 loss between mel-spectrograms of the generated and target waveforms.

        Args:
            x (Tensor): Generated audio waveform with shape (batch, samples) or (batch, 1, samples).
            y (Tensor): Target audio waveform with shape (batch, samples) or (batch, 1, samples).

        Returns:
            Tensor: Mel-spectral L1 loss value.
        """
        x_log_mel = self.mel_extractor(x)
        y_log_mel = self.mel_extractor(y)
        loss = F.l1_loss(x_log_mel, y_log_mel)

        return loss
58 | """ 59 | x_log_mel = self.mel_extractor(x) 60 | y_log_mel = self.mel_extractor(y) 61 | loss = F.l1_loss(x_log_mel, y_log_mel) 62 | 63 | return loss 64 | -------------------------------------------------------------------------------- /wavehax/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from wavehax.modules.utils import * # NOQA # isort: skip 2 | from wavehax.modules.drop import * # NOQA # isort: skip 3 | from wavehax.modules.conv import * # NOQA # isort: skip 4 | from wavehax.modules.norm import * # NOQA # isort: skip 5 | from wavehax.modules.complex import * # NOQA # isort: skip 6 | from wavehax.modules.stft import * # NOQA # isort: skipxs 7 | from wavehax.modules.periodic import * # NOQA # isort: skip 8 | from wavehax.modules.resblock import * # NOQA # isort: skip 9 | -------------------------------------------------------------------------------- /wavehax/modules/conv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """ 5 | Pitch-dependent dilated convolutional neural networks (PDCNNs). 6 | 7 | This module implements pitch-dependent dilated convolutions for both 1D and 2D convolutions, 8 | where the dilation factor depends on the input's fundamental frequencies (F0). 9 | 10 | References: 11 | - https://github.com/bigpon/QPPWG 12 | """ 13 | 14 | from typing import List, Optional, Tuple, Union 15 | 16 | import torch 17 | import torch.nn as nn 18 | from torch import Tensor 19 | 20 | 21 | def pd_indexing1d(x: Tensor, d: Tensor, dilation: int) -> Tuple[Tensor, Tensor, Tensor]: 22 | """ 23 | Perform pitch-dependent indexing for temporal sequences. 24 | 25 | This function applies pitch-dependent dilation to the input tensor `x`, retrieving past, 26 | current (center), and future elements based on pitch-dependent dilation factors. 
def pd_indexing1d(x: Tensor, d: Tensor, dilation: int) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Perform pitch-dependent indexing for temporal sequences.

    For every time step, gathers the sample located one pitch-dependent offset
    behind (past), the sample itself (center), and the sample one offset ahead
    (future). Out-of-range indices are folded back as if the sequence were
    reflect-padded.

    Args:
        x (Tensor): Input tensor with shape (batch, channels, length).
        d (Tensor): Pitch-dependent dilation factors with shape (batch, 1, length).
        dilation (int): Dilation factor to apply to the temporal axis.

    Returns:
        Tuple[Tensor, Tensor, Tensor]: Past, center, and future tensors,
            each with shape (batch, channels, length).
    """
    batch, channels, length = x.size()
    batch_idx = torch.arange(batch, dtype=torch.long, device=x.device).reshape(batch, 1, 1)
    chan_idx = torch.arange(channels, dtype=torch.long, device=x.device).reshape(1, channels, 1)
    # Per-sample integer offsets; clamped so every position moves at least one step.
    offsets = torch.clamp((d * dilation).long(), min=1)

    time_idx = torch.arange(length, dtype=torch.long, device=x.device).reshape(1, 1, length)

    # Past indices, reflected at the left boundary.
    past_idx = (time_idx - offsets).abs() % length

    # Future indices, reflected at the right boundary via negative indexing.
    future_idx = time_idx + offsets
    out_of_range = future_idx >= length
    future_idx[out_of_range] = -(future_idx[out_of_range] % length) - 1

    past = x[batch_idx, chan_idx, past_idx]
    future = x[batch_idx, chan_idx, future_idx]
    return past, x, future


class AdaptiveConv1d(nn.Module):
    """
    Pitch-dependent dilated 1d convolutional neural network module.

    Implements a 1D convolution whose effective dilation follows the per-frame
    fundamental frequency (F0): independent 1x1 convolutions are applied to the
    pitch-indexed (past/center/future) views of the input and summed.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        dilation: Optional[int] = 1,
        bias: Optional[bool] = True,
    ) -> None:
        """
        Initialize the AdaptiveConv1d module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Kernel size of the convolution (must be 1 or 3).
            dilation (int, optional): Dilation factor for the convolution (default: 1).
            bias (bool, optional): Whether to include a bias term in the convolution (default: True).
        """
        super().__init__()
        assert kernel_size in [1, 3]
        self.kernel_size = kernel_size
        self.dilation = dilation
        # One pointwise convolution per tap; only the first carries a bias so
        # the summed output effectively has a single bias term.
        self.convs = nn.ModuleList(
            [
                nn.Conv1d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    bias=bias if i == 0 else False,
                )
                for i in range(kernel_size)
            ]
        )

    def forward(self, x: Tensor, d: Tensor) -> Tensor:
        """
        Calculate forward propagation.

        Args:
            x (Tensor): Input tensor with shape (batch, in_channels, length).
            d (Tensor): Pitch-dependent dilation factors with shape (batch, 1, length).

        Returns:
            Tensor: Output tensor with shape (batch, out_channels, length).
        """
        # NOTE(review): with kernel_size == 1, zip truncation pairs the single
        # conv with the past-indexed view rather than the center -- confirm intended.
        taps = pd_indexing1d(x, d, self.dilation)
        result = 0.0
        for tap, conv in zip(taps, self.convs):
            result = result + conv(tap)
        return result
135 | """ 136 | B, C, H, W = x.size() 137 | batch_index = torch.arange(0, B, dtype=torch.long, device=x.device).reshape( 138 | B, 1, 1, 1 139 | ) 140 | ch_index = torch.arange(0, C, dtype=torch.long, device=x.device).reshape(1, C, 1, 1) 141 | freq_index = torch.arange(0, H, dtype=torch.long, device=x.device).reshape( 142 | 1, 1, H, 1 143 | ) 144 | frame_index = torch.arange(0, W, dtype=torch.long, device=x.device).reshape( 145 | 1, 1, 1, W 146 | ) 147 | dilations_h = torch.clamp( 148 | (dh * dilation[0]).unsqueeze(2).expand(-1, -1, H, -1).long(), min=1 149 | ) 150 | dilations_w = torch.clamp( 151 | (dw * dilation[1]).unsqueeze(2).expand(-1, -1, -1, W).long(), min=1 152 | ) 153 | 154 | idx_base = torch.arange(0, H, dtype=torch.long, device=x.device).reshape(1, 1, H, 1) 155 | # Get lower index (assume reflect padding) 156 | idxD = (idx_base - dilations_h).abs() % H 157 | # Get upper index (overflowed kernels are applied to the central elements) 158 | idxU = idx_base + dilations_h 159 | overflowed = idxU >= H 160 | idxU[overflowed] = idxU[overflowed] - dilations_h[overflowed] 161 | row_indexes = [idxD, freq_index, idxU] 162 | 163 | idx_base = torch.arange(0, W, dtype=torch.long, device=x.device).reshape(1, 1, 1, W) 164 | # Get left (past) index (assume reflect padding) 165 | idxL = (idx_base - dilations_w).abs() % W 166 | # Get right (future) index (assume reflect padding) 167 | idxR = idx_base + dilations_w 168 | overflowed = idxR >= W 169 | idxR[overflowed] = -(idxR[overflowed] % W) - 1 170 | col_indexes = [idxL, frame_index, idxR] 171 | 172 | xs = [] 173 | for row_index in row_indexes: 174 | for col_index in col_indexes: 175 | index = (batch_index, ch_index, row_index, col_index) 176 | xs += [x[index]] 177 | 178 | return xs 179 | 180 | 181 | class AdaptiveConv2d(nn.Module): 182 | """ 183 | Pitch-dependent dilated 2D convolutional neural network module. 
class AdaptiveConv2d(nn.Module):
    """
    Pitch-dependent dilated 2D convolutional neural network module.

    This module performs 2D convolution with pitch-dependent dilation, adjusting
    the dilation factors for both time and frequency dimensions.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        dilation: Optional[Union[int, Tuple[int, int]]] = 1,
        bias: Optional[bool] = True,
    ) -> None:
        """
        Initialize AdaptiveConv2d module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (Union[int, Tuple[int, int]]): Kernel size for the convolution
                (each axis must be 1 or 3).
            dilation (Union[int, Tuple[int, int]], optional): Dilation factors for both time and frequency dimensions (default: 1).
            bias (bool, optional): Whether to include a bias term in the convolution (default: True).
        """
        super().__init__()
        assert kernel_size in [1, 3, (1, 1), (1, 3), (3, 1), (3, 3)]
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.kernel_size = kernel_size
        self.dilation = (dilation, dilation) if isinstance(dilation, int) else dilation
        # NOTE(review): 3 * 3 = 9 pointwise convolutions are always created,
        # matching the nine views returned by pd_indexing2d, even when
        # kernel_size is smaller than (3, 3) -- confirm this is intended.
        # Only the first convolution carries a bias so the sum has one bias term.
        self.convs = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    bias=bias if i == 0 else False,
                )
                for i in range(3 * 3)
            ]
        )

    def forward(self, x: Tensor, dh: Tensor, dw: Tensor) -> Tensor:
        """
        Calculate forward propagation.

        Args:
            x (Tensor): Input tensor with shape (batch, channels, bins, frames).
            dh (Tensor): Pitch-dependent dilation factors in the frequency dimension (batch, 1, frames).
            dw (Tensor): Pitch-dependent dilation factors in the time dimension (batch, 1, frames).

        Returns:
            Tensor: Output tensor with shape (batch, out_channels, bins, frames).
        """
        # Sum the per-tap 1x1 convolutions over the nine pitch-indexed views.
        out = 0.0
        xs = pd_indexing2d(x, dh, dw, self.dilation)
        for x, f in zip(xs, self.convs):
            out = out + f(x)
        return out
28 | """ 29 | if drop_prob == 0.0 or not training: 30 | return x 31 | keep_prob = 1 - drop_prob 32 | shape = (x.shape[0],) + (1,) * ( 33 | x.ndim - 1 34 | ) # work with diff dim tensors, not just 2D ConvNets 35 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 36 | if keep_prob > 0.0 and scale_by_keep: 37 | random_tensor.div_(keep_prob) 38 | return x * random_tensor 39 | 40 | 41 | class DropPath(nn.Module): 42 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 43 | 44 | def __init__( 45 | self, drop_prob: Optional[float] = 0.0, scale_by_keep: Optional[bool] = True 46 | ) -> None: 47 | super().__init__() 48 | self.drop_prob = drop_prob 49 | self.scale_by_keep = scale_by_keep 50 | 51 | def forward(self, x: Tensor) -> Tensor: 52 | return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) 53 | 54 | def extra_repr(self) -> str: 55 | return f"drop_prob={round(self.drop_prob,3):0.3f}" 56 | -------------------------------------------------------------------------------- /wavehax/modules/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """Normalization modules.""" 5 | 6 | from typing import Optional, Tuple 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch import Tensor 11 | 12 | 13 | class NormLayer(nn.Module): 14 | def __init__( 15 | self, channels: int, eps: Optional[float] = 1e-6, affine: Optional[bool] = True 16 | ) -> None: 17 | """ 18 | Initialize the NormLayer module. 19 | 20 | Args: 21 | channels (int): Number of input features. 22 | eps (float, optional): A small constant added to the denominator for numerical stability (default: 1e-6). 23 | affine (bool, optional): If True, this module has learnable affine parameters (default: True). 
24 | """ 25 | super().__init__() 26 | self.channels = channels 27 | self.eps = eps 28 | self.affine = affine 29 | 30 | if self.affine: 31 | self.gamma = nn.Parameter(torch.ones(channels)) 32 | self.beta = nn.Parameter(torch.zeros(channels)) 33 | 34 | def normalize( 35 | self, 36 | x: Tensor, 37 | dim: int, 38 | mean: Optional[Tensor] = None, 39 | var: Optional[Tensor] = None, 40 | ) -> Tuple[Tensor, Tensor, Tensor]: 41 | """ 42 | Apply normalization to the input tensor. 43 | 44 | Args: 45 | x (Tensor): Input tensor with shape (batch, channels, ...). 46 | dim (int): Dimensions along which statistics are calculated. 47 | mean (Tensor, optional): Mean tensor (default: None). 48 | var (Tensor, optional): Variance tensor (default: None). 49 | 50 | Returns: 51 | Tuple[Tensor, Tensor, Tensor]: Normalized tensor and statistics. 52 | """ 53 | # Calculate the mean along dimensions to be reduced 54 | if mean is None: 55 | mean = x.mean(dim, keepdim=True) 56 | 57 | # Centerize the input tensor 58 | x = x - mean 59 | 60 | # Calculate the variance 61 | if var is None: 62 | var = (x**2).mean(dim=dim, keepdim=True) 63 | 64 | # Normalize 65 | x = x / torch.sqrt(var + self.eps) 66 | 67 | if self.affine: 68 | shape = [1, self.channels] + [1] * (x.ndim - 2) 69 | x = self.gamma.view(*shape) * x + self.beta.view(*shape) 70 | 71 | return x, mean, var 72 | 73 | 74 | class LayerNorm1d(NormLayer): 75 | def __init__( 76 | self, channels: int, eps: Optional[float] = 1e-6, affine: Optional[bool] = True 77 | ) -> None: 78 | """ 79 | Initialize the LayerNorm1d module. 80 | 81 | Args: 82 | channels (int): Number of input features. 83 | eps (float, optional): A small constant added to the denominator for numerical stability (default: 1e-6). 84 | affine (bool, optional): If True, this module has learnable affine parameters (default: True). 
85 | """ 86 | super().__init__(channels, eps, affine) 87 | self.reduced_dim = [1, 2] 88 | 89 | def forward(self, x: Tensor) -> Tensor: 90 | """ 91 | Apply layer normalization to the input tensor. 92 | 93 | Args: 94 | x (Tensor): Input tensor with shape (batch, channels, height, width). 95 | 96 | Returns: 97 | Tensor: Normalized tensor. 98 | """ 99 | x, *_ = self.normalize(x, dim=self.reduced_dim) 100 | return x 101 | 102 | 103 | class BatchNorm1d(NormLayer): 104 | def __init__( 105 | self, 106 | channels: int, 107 | eps: Optional[float] = 1e-6, 108 | affine: Optional[bool] = True, 109 | momentum: Optional[float] = 0.1, 110 | track_running_stats: Optional[bool] = True, 111 | ) -> None: 112 | """ 113 | Initialize the BatchNorm1d module. 114 | 115 | Args: 116 | channels (int): Number of input features. 117 | eps (float, optional): A small constant added to the denominator for numerical stability (default: 1e-6). 118 | affine (bool, optional): If True, this module has learnable affine parameters (default: True). 119 | momentum (float, optional): The value used for the running_mean and running_var computation. 120 | Can be set to None for cumulative moving average, i.e. simple average (default: None). 121 | track_running_stats (bool, optional): If True, tracks running mean and variance during training. 122 | """ 123 | super().__init__(channels, eps, affine) 124 | self.momentum = momentum 125 | self.track_running_stats = track_running_stats 126 | if track_running_stats: 127 | self.register_buffer( 128 | "num_batches_tracked", torch.tensor(0, dtype=torch.long) 129 | ) 130 | self.register_buffer("running_mean", torch.zeros(1, channels, 1)) 131 | self.register_buffer("running_var", torch.ones(1, channels, 1)) 132 | self.reduced_dim = [0, 2] 133 | 134 | def forward(self, x: Tensor) -> Tensor: 135 | """ 136 | Apply batch normalization to the input tensor. 137 | 138 | Args: 139 | x (Tensor): Input tensor with shape (batch, channels, height, width). 
140 | 141 | Returns: 142 | Tensor: Normalized tensor. 143 | """ 144 | # Get the running statistics if needed 145 | if (not self.training) and self.track_running_stats: 146 | mean = self.running_mean 147 | var = self.running_var 148 | else: 149 | mean = var = None 150 | 151 | x, mean, var = self.normalize(x, dim=self.reduced_dim, mean=mean, var=var) 152 | 153 | # Update the running statistics 154 | if self.training and self.track_running_stats: 155 | with torch.no_grad(): 156 | # Update the number of tracked samples 157 | self.num_batches_tracked += 1 158 | 159 | # Get the weight for cumulative or exponential moving average 160 | if self.momentum is None: 161 | exponential_average_factor = 1.0 / float(self.num_batches_tracked) 162 | else: 163 | exponential_average_factor = self.momentum 164 | 165 | # Update the running mean and covariance matrix 166 | self.running_mean = ( 167 | exponential_average_factor * mean 168 | + (1 - exponential_average_factor) * self.running_mean 169 | ) 170 | n = x.numel() / x.size(1) 171 | self.running_var = ( 172 | exponential_average_factor * var * n / (n - 1) 173 | + (1 - exponential_average_factor) * self.running_var 174 | ) 175 | 176 | return x 177 | 178 | 179 | class LayerNorm2d(NormLayer): 180 | def __init__( 181 | self, channels: int, eps: Optional[float] = 1e-6, affine: Optional[bool] = True 182 | ) -> None: 183 | """ 184 | Initialize the LayerNorm2d module. 185 | 186 | Args: 187 | channels (int): Number of input features. 188 | eps (float, optional): A small constant added to the denominator for numerical stability (default: 1e-6). 189 | affine (bool, optional): If True, this module has learnable affine parameters (default: True). 190 | """ 191 | super().__init__(channels, eps, affine) 192 | self.reduced_dim = [1, 2, 3] 193 | 194 | def forward(self, x: Tensor) -> Tensor: 195 | """ 196 | Apply normalization to the input tensor. 197 | 198 | Args: 199 | x (Tensor): Input tensor with shape (batch, channels, height, width). 
class BatchNorm2d(BatchNorm1d):
    def __init__(
        self,
        channels: int,
        eps: Optional[float] = 1e-6,
        affine: Optional[bool] = True,
        momentum: Optional[float] = 0.1,
        track_running_stats: Optional[bool] = True,
    ) -> None:
        """
        Initialize the BatchNorm2d module.

        Args:
            channels (int): Number of input features.
            eps (float, optional): A small constant added to the denominator for numerical stability (default: 1e-6).
            affine (bool, optional): If True, this module has learnable affine parameters (default: True).
            momentum (float, optional): The value used for the running_mean and running_var computation.
                Can be set to None for cumulative moving average, i.e. simple average (default: 0.1).
            track_running_stats (bool, optional): If True, tracks running mean and variance during training.
        """
        # Disable stat tracking in the parent so it does not register 3D running
        # buffers that would immediately be re-registered with 4D shapes below;
        # momentum is forwarded so the parent stores the requested value.
        super().__init__(channels, eps, affine, momentum, track_running_stats=False)
        self.track_running_stats = track_running_stats
        if track_running_stats:
            self.register_buffer(
                "num_batches_tracked", torch.tensor(0, dtype=torch.long)
            )
            # Running statistics broadcastable against (batch, channels, height, width).
            self.register_buffer("running_mean", torch.zeros(1, channels, 1, 1))
            self.register_buffer("running_var", torch.ones(1, channels, 1, 1))
        # Normalize over batch and both spatial axes, keeping the channel axis.
        self.reduced_dim = [0, 2, 3]

    def forward(self, x: Tensor) -> Tensor:
        """
        Apply batch normalization to the input tensor.

        Args:
            x (Tensor): Input tensor with shape (batch, channels, height, width).

        Returns:
            Tensor: Normalized tensor.
        """
        # The parent implementation is dimension-agnostic given reduced_dim.
        return super().forward(x)
def generate_sine(
    f0: Tensor,
    hop_length: int,
    sample_rate: int,
    noise_amplitude: Optional[float] = 0.03,
    random_init_phase: Optional[bool] = True,
    *args,
    **kwargs,
) -> Tensor:
    """
    Generate sine waveforms based on F0 sequences.

    Voiced regions carry a sine at the (upsampled) F0 plus additive Gaussian
    noise; unvoiced regions contain only the noise component.

    Args:
        f0 (Tensor): F0 sequences with shape (batch, 1, frames).
        hop_length (int): Hop length of the F0 sequence.
        sample_rate (int): Sampling frequency of the waveform in Hz.
        noise_amplitude (float, optional): Amplitude of the noise component added to the waveform (default: 0.03).
        random_init_phase (bool, optional): Whether to initialize phases randomly (default: True).

    Returns:
        Tensor: Generated sine waveform with shape (batch, 1, frames * hop_length).
    """
    device = f0.device
    # Upsample the frame-rate F0 contour to sample rate.
    f0 = F.interpolate(f0, f0.size(2) * hop_length)
    voiced_mask = f0 > 0

    # Per-sample phase increments in cycles; float64 limits cumsum drift.
    increments = f0.to(torch.float64) / sample_rate
    if random_init_phase:
        # NOTE(review): a single (1, 1) draw is broadcast over the whole batch,
        # so all items share the same initial phase -- confirm intended.
        increments[..., 0] += torch.rand((1, 1), device=device)

    phase = 2.0 * np.pi * torch.cumsum(increments, dim=-1)
    sine = torch.sin(phase).to(torch.float32)
    noise = torch.randn(sine.size(), device=device)
    return voiced_mask * sine + noise_amplitude * noise
98 | """ 99 | batch, _, frames = f0.size() 100 | device = f0.device 101 | noise = noise_amplitude * torch.randn( 102 | (batch, 1, frames * hop_length), device=device 103 | ) 104 | 105 | if torch.all(f0 == 0.0): 106 | return noise 107 | 108 | # Oversampling and low-pass filtering to reduce aliasing 109 | f0 = f0.repeat_interleave(hop_length * oversampling, dim=2) 110 | radious = f0.to(torch.float64) / (sample_rate * oversampling) 111 | if random_init_phase: 112 | radious[..., 0] += torch.rand((1, 1), device=device) 113 | 114 | theta = 2.0 * torch.pi * torch.cumsum(radious, dim=2) 115 | phase = torch.remainder(theta, 2.0 * torch.pi) 116 | saw = phase / torch.pi - 1.0 117 | vuv = f0 > 0 118 | saw = vuv * saw 119 | 120 | if oversampling > 1: 121 | saw = resample( 122 | saw, 123 | orig_freq=sample_rate * oversampling, 124 | new_freq=sample_rate, 125 | lowpass_filter_width=lowpass_filter_width, 126 | ) 127 | saw = saw.to(torch.float32) + noise 128 | 129 | return saw 130 | 131 | 132 | def generate_pcph( 133 | f0: Tensor, 134 | hop_length: int, 135 | sample_rate: int, 136 | noise_amplitude: Optional[float] = 0.01, 137 | random_init_phase: Optional[bool] = True, 138 | power_factor: Optional[float] = 0.1, 139 | max_frequency: Optional[float] = None, 140 | *args, 141 | **kwargs, 142 | ) -> Tensor: 143 | """ 144 | Generate pseudo-constant-power harmonic waveforms based on input F0 sequences. 145 | The spectral envelope of harmonics is designed to have flat spectral envelopes. 146 | 147 | Args: 148 | f0 (Tensor): F0 sequences with shape (batch, 1, frames). 149 | hop_length (int): Hop length of the F0 sequence. 150 | sample_rate (int): Sampling frequency of the waveform in Hz. 151 | noise_amplitude (float, optional): Amplitude of the noise component (default: 0.01). 152 | random_init_phase (bool, optional): Whether to initialize phases randomly (default: True). 153 | power_factor (float, optional): Factor to control the power of harmonics (default: 0.1). 
def generate_pcph(
    f0: Tensor,
    hop_length: int,
    sample_rate: int,
    noise_amplitude: Optional[float] = 0.01,
    random_init_phase: Optional[bool] = True,
    power_factor: Optional[float] = 0.1,
    max_frequency: Optional[float] = None,
    *args,
    **kwargs,
) -> Tensor:
    """
    Generate pseudo-constant-power harmonic waveforms based on input F0 sequences.
    The spectral envelope of harmonics is designed to have flat spectral envelopes.

    Voiced frames are synthesized as a sum of sinusoids at integer multiples of
    F0 (up to Nyquist), scaled so the total harmonic power stays approximately
    constant across frames, plus a small Gaussian noise floor.

    Args:
        f0 (Tensor): F0 sequences with shape (batch, 1, frames).
        hop_length (int): Hop length of the F0 sequence.
        sample_rate (int): Sampling frequency of the waveform in Hz.
        noise_amplitude (float, optional): Amplitude of the noise component (default: 0.01).
        random_init_phase (bool, optional): Whether to initialize phases randomly (default: True).
        power_factor (float, optional): Factor to control the power of harmonics (default: 0.1).
        max_frequency (float, optional): Maximum frequency to define the number of harmonics (default: None).

    Returns:
        Tensor: Generated harmonic waveform with shape (batch, 1, frames * hop_length).
    """
    batch, _, frames = f0.size()
    device = f0.device
    noise = noise_amplitude * torch.randn(
        (batch, 1, frames * hop_length), device=device
    )
    # Fully unvoiced input: return only the noise floor.
    if torch.all(f0 == 0.0):
        return noise

    vuv = f0 > 0
    min_f0_value = torch.min(f0[f0 > 0]).item()
    max_frequency = max_frequency if max_frequency is not None else sample_rate / 2
    # Largest harmonic index that can occur anywhere in the batch.
    max_n_harmonics = int(max_frequency / min_f0_value)
    # Per-frame harmonic counts. NOTE(review): derived from the Nyquist
    # frequency rather than max_frequency -- confirm intended.
    n_harmonics = torch.ones_like(f0, dtype=torch.float)
    n_harmonics[vuv] = sample_rate / 2.0 / f0[vuv]

    indices = torch.arange(1, max_n_harmonics + 1, device=device).reshape(1, -1, 1)
    harmonic_f0 = f0 * indices

    # Zero out harmonics that would exceed the Nyquist frequency.
    harmonic_mask = harmonic_f0 <= (sample_rate / 2.0)
    harmonic_mask = torch.repeat_interleave(harmonic_mask, hop_length, dim=2)

    # Frame-level gain keeping the summed harmonic power approximately constant.
    harmonic_amplitude = vuv * power_factor * torch.sqrt(2.0 / n_harmonics)
    harmonic_amplitude = torch.repeat_interleave(harmonic_amplitude, hop_length, dim=2)

    # Accumulate phase in float64 cycles, then expand to each harmonic.
    f0 = torch.repeat_interleave(f0, hop_length, dim=2)
    radious = f0.to(torch.float64) / sample_rate
    if random_init_phase:
        radious[..., 0] += torch.rand((1, 1), device=device)
    radious = torch.cumsum(radious, dim=2)
    harmonic_phase = 2.0 * torch.pi * radious * indices
    harmonics = torch.sin(harmonic_phase).to(torch.float32)

    # Mask, sum over harmonics, and apply the frame-level gain.
    harmonics = harmonic_mask * harmonics
    harmonics = harmonic_amplitude * torch.sum(harmonics, dim=1, keepdim=True)

    return harmonics + noise
def generate_pcph_linear_decay(
    f0: Tensor,
    hop_length: int,
    sample_rate: int,
    noise_amplitude: Optional[float] = 0.01,
    random_init_phase: Optional[bool] = True,
    power_factor: Optional[float] = 0.1,
    max_frequency: Optional[float] = None,
    *args,
    **kwargs,
) -> Tensor:
    """
    Generate pseudo-constant-power harmonic waveforms based on input F0 sequences.
    The spectral envelope of harmonics is designed to linearly decay in each time frame.

    Each harmonic k receives amplitude proportional to (slope * k * F0 + intercept),
    i.e. a line that reaches zero at max_frequency, and the per-frame gain is
    chosen so the summed harmonic power stays approximately constant.

    Args:
        f0 (Tensor): F0 sequences with shape (batch, 1, frames).
        hop_length (int): Hop length of the F0 sequence.
        sample_rate (int): Sampling frequency of the waveform in Hz.
        noise_amplitude (float, optional): Amplitude of the noise component (default: 0.01).
        random_init_phase (bool, optional): Whether to initialize phases randomly (default: True).
        power_factor (float, optional): Factor to control the power of harmonics (default: 0.1).
        max_frequency (float, optional): Maximum frequency to define the number of harmonics (default: None).

    Returns:
        Tensor: Generated harmonic waveform with shape (batch, 1, frames * hop_length).
    """
    batch, _, frames = f0.size()
    device = f0.device
    noise = noise_amplitude * torch.randn(
        (batch, 1, frames * hop_length), device=device
    )
    # Fully unvoiced input: return only the noise floor.
    if torch.all(f0 == 0.0):
        return noise

    vuv = f0 > 0
    min_f0_value = torch.min(f0[f0 > 0]).item()
    max_frequency = max_frequency if max_frequency is not None else sample_rate / 2
    # Largest harmonic index that can occur anywhere in the batch.
    max_n_harmonics = int(max_frequency / min_f0_value)
    n_harmonics = torch.ones_like(f0, dtype=torch.float)
    n_harmonics[vuv] = max_frequency / f0[vuv]
    indices = torch.arange(1, max_n_harmonics + 1, device=device).reshape(1, -1, 1)
    harmonic_f0 = f0 * indices

    # Compute harmonic mask
    harmonic_mask = harmonic_f0 <= (max_frequency)
    indices_with_mask = harmonic_mask * indices

    # Compute frame and indice level coefficients
    # Linear envelope a(f) = slope * f + intercept, hitting zero at max_frequency.
    slope = -1.0 / max_frequency
    intercept = 1.0
    num_harmonics_normalized = (
        torch.sum(indices_with_mask, dim=1, keepdim=True) / power_factor
    )
    num_harmonics_squared_normalized = (
        torch.sum(indices_with_mask**2, dim=1, keepdim=True) / power_factor
    )
    # Normalizer derived from sum_k a(k * f0)^2 so the total harmonic power per
    # frame is approximately 2 * power_factor.
    amplitude_factor = torch.sqrt(
        2.0
        * power_factor
        / (
            slope**2 * f0**2 * num_harmonics_squared_normalized
            + 2.0 * slope * intercept * f0 * num_harmonics_normalized
            + intercept**2 * n_harmonics / power_factor
        )
    )
    harmonic_amplitude = (
        vuv * harmonic_mask * amplitude_factor * (slope * harmonic_f0 + intercept)
    )
    harmonic_amplitude = torch.repeat_interleave(harmonic_amplitude, hop_length, dim=2)

    # Generate sinusoids
    # Phase is accumulated in float64 cycles to limit cumsum drift.
    f0 = torch.repeat_interleave(f0, hop_length, dim=2)
    radious = f0.to(torch.float64) / sample_rate
    if random_init_phase:
        # NOTE(review): one (1, 1) draw broadcast across the batch -- all items
        # share the same initial phase; confirm intended.
        radious[..., 0] += torch.rand((1, 1), device=device)
    radious = torch.cumsum(radious, dim=2)
    harmonic_phase = 2.0 * torch.pi * radious * indices
    harmonics = torch.sin(harmonic_phase).to(torch.float32)

    # Multiply coefficients to each sinusoids
    harmonics = harmonic_amplitude * harmonics
    harmonics = torch.sum(harmonics, dim=1, keepdim=True)

    return harmonics + noise
54 | layer_scale_init_value (float, optional): Initial value for the learnable layer scale parameter. 55 | If None, no scaling is applied (default: None). 56 | """ 57 | super().__init__() 58 | if isinstance(kernel_size, int): 59 | kernel_size = (kernel_size, kernel_size) 60 | assert kernel_size[0] % 2 == 1, "Kernel size must be odd number." 61 | assert kernel_size[1] % 2 == 1, "Kernel size must be odd number." 62 | self.dwconv = nn.Conv2d( 63 | channels, 64 | channels, 65 | kernel_size, 66 | padding=(kernel_size[0] // 2, kernel_size[1] // 2), 67 | groups=channels, 68 | bias=False, 69 | padding_mode="reflect", 70 | ) 71 | if use_layer_norm: 72 | self.norm = LayerNorm2d(channels) 73 | else: 74 | self.norm = BatchNorm2d(channels) 75 | self.pwconv1 = nn.Conv2d(channels, channels * mult_channels, 1) 76 | self.nonlinear = nn.GELU() 77 | self.pwconv2 = nn.Conv2d(channels * mult_channels, channels, 1) 78 | self.gamma = ( 79 | nn.Parameter( 80 | layer_scale_init_value * torch.ones(1, channels, 1, 1), 81 | requires_grad=True, 82 | ) 83 | if layer_scale_init_value is not None 84 | else None 85 | ) 86 | self.drop_path = DropPath(drop_prob) 87 | 88 | def forward(self, x: Tensor) -> Tensor: 89 | """ 90 | Calculate forward propagation. 91 | 92 | Args: 93 | x (Tensor): Input tensor with shape (batch, channels, height, width). 94 | 95 | Returns: 96 | Tensor: Output tensor of the same shape (batch, channels, height, width). 
97 | """ 98 | residual = x 99 | x = self.dwconv(x) 100 | x = self.norm(x) 101 | x = self.pwconv1(x) 102 | x = self.nonlinear(x) 103 | x = self.pwconv2(x) 104 | if self.gamma is not None: 105 | x = self.gamma * x 106 | x = residual + self.drop_path(x) 107 | return x 108 | 109 | 110 | class ComplexConvNeXtBlock2d(nn.Module): 111 | """Complex-valued 2D residual block based on ConvNeXt architecture.""" 112 | 113 | def __init__( 114 | self, 115 | channels: int, 116 | mult_channels: int, 117 | kernel_size: int, 118 | drop_prob: float = 0.0, 119 | use_layer_norm: bool = True, 120 | layer_scale_init_value: float = None, 121 | ) -> None: 122 | """ 123 | Initialize the ComplexConvNeXtBlock2d module. 124 | 125 | Args: 126 | channels (int): Number of input and output channels for the block. 127 | mult_channels (int): Channel expansion factor used in pointwise convolutions. 128 | kernel_size (int): Size of the depthwise convolution kernel. 129 | drop_prob (float, optional): Probability of dropping paths for stochastic depth (default: 0.0). 130 | use_layer_norm (bool, optional): If True, layer normalization is used; otherwise, 131 | batch normalization is applied (default: True). 132 | layer_scale_init_value (float, optional): Initial value for the learnable layer scale parameter. 133 | If None, no scaling is applied (default: None). 134 | """ 135 | super().__init__() 136 | if isinstance(kernel_size, int): 137 | kernel_size = (kernel_size, kernel_size) 138 | assert kernel_size[0] % 2 == 1, "Kernel size must be odd number." 139 | assert kernel_size[1] % 2 == 1, "Kernel size must be odd number." 
140 | 141 | self.dwconv = ComplexConv2d( 142 | channels, 143 | channels, 144 | kernel_size, 145 | padding=(kernel_size[0] // 2, kernel_size[1] // 2), 146 | groups=channels, 147 | bias=False, 148 | padding_mode="reflect", 149 | ) 150 | if use_layer_norm: 151 | self.norm = ComplexLayerNorm2d(channels) 152 | else: 153 | self.norm = ComplexBatchNorm2d(channels) 154 | self.pwconv1 = ComplexConv2d(channels, channels * mult_channels, 1) 155 | self.nonlinear = ComplexActivation(nn.GELU()) 156 | self.pwconv2 = ComplexConv2d(channels * mult_channels, channels, 1) 157 | self.gamma = ( 158 | nn.Parameter( 159 | layer_scale_init_value * torch.ones(1, channels, 1, 1), 160 | requires_grad=True, 161 | ) 162 | if layer_scale_init_value is not None 163 | else None 164 | ) 165 | self.drop_path = DropPath(drop_prob) 166 | 167 | def forward(self, real: Tensor, imag: Tensor) -> Tuple[Tensor, Tensor]: 168 | """ 169 | Calculate forward propagation. 170 | 171 | Args: 172 | real (Tensor): Input real part tensor with shape (batch, channels, height, width). 173 | imag (Tensor): Input imaginary part tensor with shape (batch, channels, height, width). 174 | 175 | Returns: 176 | Tuple[Tensor, Tensor]: Complex tensor with shape (batch, channels, height, width). 
177 | """ 178 | residual_real, residual_imag = real, imag 179 | real, imag = self.dwconv(real, imag) 180 | real, imag = self.norm(real, imag) 181 | real, imag = self.pwconv1(real, imag) 182 | real, imag = self.nonlinear(real, imag) 183 | real, imag = self.pwconv2(real, imag) 184 | z = torch.stack([real, imag], dim=0) 185 | if self.gamma is not None: 186 | z = self.gamma * z 187 | z = self.drop_path(z) 188 | real = z[0] + residual_real 189 | imag = z[1] + residual_imag 190 | return real, imag 191 | -------------------------------------------------------------------------------- /wavehax/modules/stft.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """Modules related to short-time Fourier transform (STFT).""" 5 | 6 | from typing import Optional, Tuple 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from librosa.filters import mel as librosa_filters_mel 12 | from torch import Tensor 13 | 14 | 15 | def to_log_magnitude_and_phase( 16 | real: Tensor, imag: Tensor, clip_value: Optional[float] = 1e-10 17 | ) -> Tuple[Tensor, Tensor]: 18 | """ 19 | Convert real and imaginary components of a complex signal to log-magnitude and phase. 20 | 21 | Args: 22 | real (Tensor): Real part of the complex signal. 23 | imag (Tensor): Imaginary part of the complex signal. 24 | clip_value (float, optional): Minimum value for magnitude to avoid log of zero (default: 1e-10). 25 | 26 | Returns: 27 | Tuple[Tensor, Tensor]: Log-magnitude and phase of the input complex signal. 
28 | """ 29 | magnitude = torch.sqrt(torch.clamp(real**2 + imag**2, min=clip_value)) 30 | log_magnitude = torch.log(magnitude) 31 | phase = torch.atan2(imag, real) 32 | return log_magnitude, phase 33 | 34 | 35 | def to_real_imaginary( 36 | log_magnitude: Tensor, phase: Tensor, clip_value: Optional[float] = 1e2 37 | ) -> Tuple[Tensor, Tensor]: 38 | """ 39 | Convert log-magnitude and implicit phase wrapping back to real and imaginary components of a complex signal. 40 | 41 | Args: 42 | log_magnitude (Tensor): Log-magnitude of the complex signal. 43 | phase (Tensor): Implicit phase wrapping spectra as in Vocos. 44 | clip_value (float, optional): Maximum allowed value for magnitude after exponentiation (default: 1e2). 45 | 46 | Returns: 47 | Tuple[Tensor, Tensor]: Real and imaginary components of the complex signal. 48 | 49 | References: 50 | - https://arxiv.org/abs/2306.00814 51 | - https://github.com/gemelo-ai/vocos 52 | """ 53 | magnitude = torch.clip(torch.exp(log_magnitude), max=clip_value) 54 | real, imag = magnitude * torch.cos(phase), magnitude * torch.sin(phase) 55 | return real, imag 56 | 57 | 58 | class STFT(nn.Module): 59 | """ 60 | Short-Time Fourier Transform (STFT) module. 61 | 62 | References: 63 | - https://github.com/gemelo-ai/vocos 64 | - https://github.com/echocatzh/torch-mfcc 65 | """ 66 | 67 | def __init__( 68 | self, n_fft: int, hop_length: int, window: Optional[str] = "hann_window" 69 | ) -> None: 70 | """ 71 | Initialize the STFT module. 72 | 73 | Args: 74 | n_fft (int): Number of Fourier transform points (FFT size). 75 | hop_length (int): Hop length (frameshift) in samples. 76 | window (str, optional): Name of the window function (default: "hann_window"). 
77 | """ 78 | super().__init__() 79 | self.n_fft = n_fft 80 | self.n_bins = n_fft // 2 + 1 81 | self.hop_length = hop_length 82 | 83 | # Create the window function and its squared values for normalization 84 | window = getattr(torch, window)(self.n_fft).reshape(1, n_fft, 1) 85 | self.register_buffer("window", window.reshape(1, n_fft, 1)) 86 | window_envelope = window.square() 87 | self.register_buffer("window_envelope", window_envelope.reshape(1, n_fft, 1)) 88 | 89 | # Create the kernel for enframe operation (sliding windows) 90 | enframe_kernel = torch.eye(self.n_fft).unsqueeze(1) 91 | self.register_buffer("enframe_kernel", enframe_kernel) 92 | 93 | def forward(self, x: Tensor, norm: Optional[str] = None) -> Tuple[Tensor, Tensor]: 94 | """ 95 | Perform the forward Short-Time Fourier Transform (STFT) on the input waveform. 96 | 97 | Args: 98 | x (Tensor): Input waveform with shape (batch, samples) or (batch, 1, samples). 99 | norm (str, optional): Normalization mode for the FFT (default: None). 100 | 101 | Returns: 102 | Tuple[Tensor, Tensor]: Real and imaginary parts of the STFT result. 103 | """ 104 | # Apply zero-padding to the input signal 105 | pad = self.n_fft - self.hop_length 106 | pad_left = pad // 2 107 | x = F.pad(x, (pad_left, pad - pad_left)) 108 | 109 | # Enframe the padded waveform (sliding windows) 110 | x = x.unsqueeze(1) if x.dim() == 2 else x 111 | x = F.conv1d(x, self.enframe_kernel, stride=self.hop_length) 112 | 113 | # Perform the forward real-valued DFT on each frame 114 | x = x * self.window 115 | x_stft = torch.fft.rfft(x, dim=1, norm=norm) 116 | real, imag = x_stft.real, x_stft.imag 117 | 118 | return real, imag 119 | 120 | def inverse(self, real: Tensor, imag: Tensor, norm: Optional[str] = None) -> Tensor: 121 | """ 122 | Perform the inverse Short-Time Fourier Transform (iSTFT) to reconstruct the waveform from the complex spectrogram. 
123 | 124 | Args: 125 | real (Tensor): Real part of the complex spectrogram with shape (batch, n_bins, frames). 126 | imag (Tensor): Imaginary part of the complex spectrogram with shape (batch, n_bins, frames). 127 | norm (str, optional): Normalization mode for the inverse FFT (default: None). 128 | 129 | Returns: 130 | Tensor: Reconstructed waveform with shape (batch, 1, samples). 131 | """ 132 | # Validate shape and dimensionality 133 | assert real.shape == imag.shape and real.ndim == 3 134 | 135 | # Ensure the input represents a one-sided spectrogram 136 | assert real.size(1) == self.n_bins 137 | 138 | frames = real.shape[2] 139 | samples = frames * self.hop_length 140 | 141 | # Inverse RDFT and apply windowing, followed by overlap-add 142 | x = torch.fft.irfft(torch.complex(real, imag), dim=1, norm=norm) 143 | x = x * self.window 144 | x = F.conv_transpose1d(x, self.enframe_kernel, stride=self.hop_length) 145 | 146 | # Compute window envelope for normalization 147 | window_envelope = F.conv_transpose1d( 148 | self.window_envelope.repeat(1, 1, frames), 149 | self.enframe_kernel, 150 | stride=self.hop_length, 151 | ) 152 | 153 | # Remove padding 154 | pad = (self.n_fft - self.hop_length) // 2 155 | x = x[..., pad : samples + pad] 156 | window_envelope = window_envelope[..., pad : samples + pad] 157 | 158 | # Normalize the output by the window envelope 159 | assert (window_envelope > 1e-11).all() 160 | x = x / window_envelope 161 | 162 | return x 163 | 164 | 165 | class MelSpectrogram(nn.Module): 166 | """A module to compute a mel-spectrogram from waveforms.""" 167 | 168 | def __init__( 169 | self, 170 | sample_rate: int, 171 | hop_length: int, 172 | n_fft: int, 173 | n_mels: int, 174 | window: Optional[str] = "hann_window", 175 | fmin: Optional[float] = 0, 176 | fmax: Optional[float] = None, 177 | ) -> None: 178 | """ 179 | Initialize the MelSpectrogram module. 180 | 181 | Args: 182 | sample_rate (int): Sampling frequency of input waveforms. 
183 | hop_length (int): Hop length (frameshift) in samples. 184 | n_fft (int): Number of Fourier transform points (FFT size). 185 | n_mels (int): Number of mel basis. 186 | window (str, optional): Name of the window function (default: "hann_window). 187 | fmin (float, optional): Minimum frequency for mel-filter bank (default: 0). 188 | fmax (float, optional): Maximum frequency for mel-filter bank (default: None). 189 | """ 190 | super().__init__() 191 | self.n_mels = n_mels 192 | self.stft = STFT(n_fft, hop_length, window) 193 | mel_basis = librosa_filters_mel( 194 | sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax 195 | ) # (n_mels, n_bins) 196 | self.register_buffer("mel_basis", torch.from_numpy(mel_basis)) 197 | 198 | def forward( 199 | self, 200 | audio: Tensor, 201 | log_scale: Optional[bool] = True, 202 | eps: Optional[float] = 1e-5, 203 | ) -> Tensor: 204 | """ 205 | Compute mel-spectrogram from the input waveforms. 206 | 207 | Args: 208 | audio (Tensor): Input waveforms with shape (batch, samples) or (batch, 1, samples). 209 | log_scale (bool, optional): Whether to return the log-magnitude of the mel-spectrogram (default: True). 210 | eps (float, optional): Small value to avoid numerical instability in log calculation (default: 1e-5). 211 | 212 | Returns: 213 | Tensor: Mel-spectrogram with shape (batch, n_mels, frames). 214 | """ 215 | real, imag = self.stft(audio) 216 | mel = torch.matmul(self.mel_basis, torch.complex(real, imag).abs()) 217 | mel = torch.log(torch.clamp(mel, min=eps)) if log_scale else mel 218 | return mel 219 | 220 | 221 | def griffin_lim(spectrogram: Tensor, stft: STFT, n_iter: int) -> Tensor: 222 | """ 223 | Perform the Griffin-Lim algorithm for phase recovery from a magnitude spectrogram. 224 | 225 | Args: 226 | spectrogram (Tensor): Input complex spectrogram tensor with shape (batch, bins, frames). 227 | stft (STFT): STFT object with the configuration used for the spectrogram. 
228 | n_iter (int): Number of iterations for phase recovery. 229 | 230 | Returns: 231 | Tensor: Recovered waveform tensor with shape (batch, bins, frames). 232 | """ 233 | magnitude = spectrogram.abs() 234 | phase = spectrogram.angle() 235 | for _ in range(n_iter): 236 | inverse = stft.inverse(magnitude * torch.exp(1.0j * phase)) 237 | phase = stft(inverse).angle() 238 | return phase 239 | -------------------------------------------------------------------------------- /wavehax/modules/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Reo Yoneyama (Nagoya University) 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | 4 | """Utility functions.""" 5 | 6 | from typing import Any 7 | 8 | import torch 9 | from torch import nn 10 | 11 | # Check the PyTorch version 12 | if torch.__version__ >= "2.1.0": 13 | from torch.nn.utils.parametrizations import weight_norm as torch_weight_norm 14 | else: 15 | from torch.nn.utils import weight_norm as torch_weight_norm 16 | 17 | 18 | def weight_norm(m: Any) -> None: 19 | """ 20 | Apply weight normalization to the given module if it is a supported layer type. 21 | 22 | Args: 23 | m (Any): Module to apply weight normalization to. 24 | """ 25 | if isinstance( 26 | m, 27 | (nn.Linear, nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d), 28 | ): 29 | torch_weight_norm(m) 30 | 31 | 32 | def spectral_norm(m: Any) -> None: 33 | """ 34 | Apply spectral normalization to the given module if it is a supported layer type. 35 | 36 | Args: 37 | m (Any): Module to apply spectral normalization to. 38 | """ 39 | if isinstance( 40 | m, 41 | (nn.Linear, nn.Conv1d, nn.ConvTranspose1d, nn.Conv2d, nn.ConvTranspose2d), 42 | ): 43 | nn.utils.spectral_norm(m) 44 | 45 | 46 | def remove_weight_norm(m: Any) -> None: 47 | """ 48 | Remove weight normalization from the given module if it has weight normalization applied. 
def remove_weight_norm(m: Any) -> None:
    """
    Remove weight normalization from the given module if it has weight normalization applied.

    Handles both the legacy hook-based weight norm and the parametrization-based
    variant used by PyTorch >= 2.1; the legacy remover raises ValueError on
    parametrized modules, which previously made this function a silent no-op there.

    Args:
        m (Any): Module to remove weight normalization from.
    """
    try:
        from torch.nn.utils import parametrize

        if parametrize.is_parametrized(m, "weight"):
            # Keep the current (normalized) weight values as a plain Parameter
            parametrize.remove_parametrizations(m, "weight")
            return
    except ImportError:
        pass
    try:
        nn.utils.remove_weight_norm(m)
    except ValueError:  # this module didn't have weight norm
        return


# --------------------------------------------------------------------------------
# /wavehax/utils/features.py:
# --------------------------------------------------------------------------------
# Copyright 2024 Reo Yoneyama (Nagoya University)
# MIT License (https://opensource.org/licenses/MIT)

"""
Functions for audio and feature processing.

References:
    - https://github.com/bigpon/QPPWG
"""

from logging import getLogger
from typing import Optional

import numpy as np
from numpy import ndarray
from scipy.interpolate import interp1d

# A logger for this file
logger = getLogger(__name__)


def normalize_loudness(
    audio: ndarray, sample_rate: int, target_db: Optional[float] = -24.0
) -> ndarray:
    """
    Normalizes the loudness of an input monaural audio signal.

    Args:
        audio (ndarray): Input audio waveform.
        sample_rate (int): Sampling frequency of the audio.
        target_db (float, optional): Target loudness in decibels (default: -24.0).

    Returns:
        ndarray: Loudness-normalized audio waveform.
    """
    # Deferred import: pyloudnorm is only needed here, so the rest of this
    # module stays usable when it is not installed.
    import pyloudnorm as pyln

    meter = pyln.Meter(sample_rate)
    loudness = meter.integrated_loudness(audio)
    normed_audio = pyln.normalize.loudness(audio, loudness, target_db)
    return normed_audio


def fill_zeros_with_neighbors(arr: ndarray) -> ndarray:
    """
    Replaces zero values in the input array with the nearest non-zero values
    from neighboring indices.

    A forward pass propagates the last non-zero value to the right, then a
    backward pass fills any remaining leading zeros from the right.

    Args:
        arr (ndarray): Input array.

    Returns:
        ndarray: Array with zero values replaced by neighboring non-zero values.
    """
    new_arr = arr.copy()
    for i in range(1, len(arr)):
        if new_arr[i] == 0:
            new_arr[i] = new_arr[i - 1]
    for i in range(len(arr) - 1, 0, -1):
        if new_arr[i - 1] == 0:
            new_arr[i - 1] = new_arr[i]
    return new_arr


def convert_to_continuous_f0(f0: ndarray) -> ndarray:
    """
    Converts an F0 sequence with intermittent zero (unvoiced) values into a
    continuous F0 array by linearly interpolating over non-zero values.

    Args:
        f0 (ndarray): Input F0 array with zero and non-zero values.

    Returns:
        ndarray: Continuous F0 array (an all-zero input is returned unchanged).
    """
    # Nothing to interpolate if every frame is unvoiced
    if f0.sum() == 0:
        return f0

    # Pad the head and tail of the sequence with the first/last voiced values.
    # NOTE: the original code first computed start/end values with `f0 != -1`
    # and immediately overwrote them with the `f0 != 0` versions; those dead
    # statements were removed.
    start_f0 = f0[f0 != 0][0]
    end_f0 = f0[f0 != 0][-1]
    start_idx = np.where(f0 == start_f0)[0][0]
    end_idx = np.where(f0 == end_f0)[0][-1]
    cf0 = f0.copy()
    cf0[:start_idx] = start_f0
    cf0[end_idx:] = end_f0

    # Get non-zero frame index
    nonzero_idxs = np.where(cf0 != 0)[0]

    # Perform linear interpolation over the remaining unvoiced gaps
    interp_fn = interp1d(nonzero_idxs, cf0[nonzero_idxs])
    cf0 = interp_fn(np.arange(0, cf0.shape[0]))

    return cf0
35 | """ 36 | module_path, class_name = module_class.rsplit(".", 1) 37 | module = __import__(module_path, fromlist=[class_name]) 38 | return getattr(module, class_name) 39 | 40 | 41 | def read_yaml(file_path: str) -> Dict: 42 | """ 43 | Reads a YAML configuration file and returns its contents as a dictionary. 44 | 45 | Args: 46 | file_path (str): Path to the YAML file. 47 | 48 | Returns: 49 | Dict: Parsed contents of the YAML file. 50 | """ 51 | with open(file_path) as file: 52 | config = yaml.safe_load(file) 53 | return config 54 | 55 | 56 | def read_audio(file_path: str, sample_rate: int) -> ndarray: 57 | """ 58 | Reads an audio file, resamples it to the target sampling frequency if necessary, 59 | and returns the audio waveform as a numpy array. 60 | 61 | Args: 62 | file_path (str): Path to the audio file. 63 | sample_rate (int): Desired sampling frequency for resampling. 64 | 65 | Returns: 66 | ndarray: Audio waveform array. 67 | """ 68 | audio, sr = sf.read(file_path, dtype="float32") 69 | 70 | assert ( 71 | np.abs(audio).max() <= 1.0 72 | ), f"{file_path} seems to be different from 16 bit PCM." 73 | 74 | if len(audio.shape) != 1: 75 | logger.warning(f"{file_path} seems to be multi-channel signal {audio.shape}.") 76 | audio = audio.mean(axis=-1) 77 | 78 | if sr != sample_rate: 79 | logger.warning(f"Resample {file_path} from {sr} Hz to {sample_rate} Hz.") 80 | audio = resample(audio, orig_sr=sr, target_sr=sample_rate) 81 | 82 | return audio 83 | 84 | 85 | def read_hdf5(hdf5_name: str, hdf5_path: str) -> Any: 86 | """ 87 | Reads a dataset from an HDF5 file. 88 | 89 | Args: 90 | hdf5_name (str): Path to the HDF5 file. 91 | hdf5_path (str): Dataset path within the HDF5 file. 92 | 93 | Returns: 94 | Any: Dataset values from the HDF5 file. 
95 | """ 96 | if not os.path.exists(hdf5_name): 97 | logger.error(f"There is no such a hdf5 file ({hdf5_name}).") 98 | sys.exit(1) 99 | 100 | hdf5_file = h5py.File(hdf5_name, "r") 101 | 102 | if hdf5_path not in hdf5_file: 103 | logger.error(f"There is no data named {hdf5_path} in {hdf5_name}.") 104 | sys.exit(1) 105 | 106 | hdf5_data = hdf5_file[hdf5_path][()] 107 | hdf5_file.close() 108 | 109 | return hdf5_data 110 | 111 | 112 | def write_hdf5( 113 | hdf5_name: str, 114 | hdf5_path: str, 115 | write_data: ndarray, 116 | is_overwrite: Optional[bool] = True, 117 | ) -> None: 118 | """ 119 | Writes a dataset to an HDF5 file, optionally overwriting existing datasets. 120 | 121 | Args: 122 | hdf5_name (str): HDF5 file path. 123 | hdf5_path (str): Dataset path within the HDF5 file. 124 | write_data (ndarray): Data to write into the HDF5 file. 125 | is_overwrite (bool, optional): Whether to overwrite existing datasets (default: True). 126 | """ 127 | # Convert to numpy array 128 | write_data = np.array(write_data) 129 | 130 | # Check folder existence 131 | folder_name, _ = os.path.split(hdf5_name) 132 | if not os.path.exists(folder_name) and len(folder_name) != 0: 133 | os.makedirs(folder_name) 134 | 135 | # Check hdf5 existence 136 | if os.path.exists(hdf5_name): 137 | # If already exists, open with r+ mode 138 | hdf5_file = h5py.File(hdf5_name, "r+") 139 | # Check dataset existence 140 | if hdf5_path in hdf5_file: 141 | if is_overwrite: 142 | hdf5_file.__delitem__(hdf5_path) 143 | else: 144 | logger.error( 145 | "Dataset in hdf5 file already exists. " 146 | "if you want to overwrite, please set is_overwrite = True." 
147 | ) 148 | hdf5_file.close() 149 | sys.exit(1) 150 | else: 151 | # If not exists, open with w mode 152 | hdf5_file = h5py.File(hdf5_name, "w") 153 | 154 | # Write data to hdf5 155 | hdf5_file.create_dataset(hdf5_path, data=write_data) 156 | hdf5_file.flush() 157 | hdf5_file.close() 158 | 159 | 160 | def check_hdf5(hdf5_name: str, hdf5_path: str) -> bool: 161 | """ 162 | Checks if a specified dataset exists in an HDF5 file. 163 | 164 | Args: 165 | hdf5_name (str): HDF5 file path. 166 | hdf5_path (str): Dataset path within the HDF5 file. 167 | 168 | Returns: 169 | bool: True if the dataset exists, False otherwise. 170 | """ 171 | if not os.path.exists(hdf5_name): 172 | return False 173 | 174 | with h5py.File(hdf5_name, "r") as hdf5_file: 175 | return hdf5_path in hdf5_file 176 | 177 | 178 | def read_txt(file_list: str) -> List[str]: 179 | """ 180 | Read lines from a text file, removing newline characters. 181 | 182 | Args: 183 | file_list (str): Path to the text file containing filenames. 184 | 185 | Returns: 186 | List[str]: A list of filenames, with newline characters removed. 187 | """ 188 | with open(file_list) as f: 189 | filenames = f.readlines() 190 | return [filename.replace("\n", "") for filename in filenames] 191 | 192 | 193 | def check_filename(list1: List[str], list2: List[str]) -> bool: 194 | """ 195 | Check if the filenames in two lists (without extensions) are identical. 196 | 197 | Args: 198 | list1 (List[str]): First list of file paths or names. 199 | list2 (List[str]): Second list of file paths or names. 200 | 201 | Returns: 202 | bool: True if the filenames (without extensions) in both lists match, False otherwise. 
203 | """ 204 | 205 | def _filename(x): 206 | return os.path.basename(x).split(".")[0] 207 | 208 | list1 = list(map(_filename, list1)) 209 | list2 = list(map(_filename, list2)) 210 | 211 | return list1 == list2 212 | 213 | 214 | def validate_length( 215 | xs: List, ys: Optional[List] = None, hop_size: Optional[int] = None 216 | ) -> List: 217 | """ 218 | Validates and adjusts the lengths of feature arrays and corresponding audio data 219 | for alignment during audio processing. If audio data is provided, their lengths 220 | are adjusted relative to the hop size. 221 | 222 | Args: 223 | xs (List): List of feature arrays in ndarray. 224 | ys (List, optional): List of audio arrays in ndarray (default: None). 225 | hop_size (int, optional): Frame shift in samples (default: None). 226 | 227 | Returns: 228 | List: A list of length-adjusted features and optionally audios if provided. 229 | """ 230 | # Get minimum length for features and audios 231 | min_len_x = min([x.shape[0] for x in xs]) 232 | if ys is not None: 233 | min_len_y = min([y.shape[0] for y in ys]) 234 | if min_len_y < min_len_x * hop_size: 235 | min_len_x = min_len_y // hop_size 236 | if min_len_y > min_len_x * hop_size: 237 | min_len_y = min_len_x * hop_size 238 | ys = [y[:min_len_y] for y in ys] 239 | xs = [x[:min_len_x] for x in xs] 240 | 241 | return xs + ys if ys is not None else xs 242 | --------------------------------------------------------------------------------