├── .gitignore ├── .idea ├── misc.xml ├── modules.xml ├── vcs.xml ├── wavenet_vocoder.iml └── workspace.xml ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── audio.py ├── cmu_arctic.py ├── docs ├── .gitignore ├── config.toml ├── content │ └── index.md ├── layouts │ ├── _default │ │ ├── list.html │ │ └── single.html │ ├── index.html │ └── partials │ │ ├── footer.html │ │ ├── header.html │ │ ├── mathjax.html │ │ └── social.html └── static │ ├── css │ ├── custom.css │ ├── normalize.css │ └── skeleton.css │ ├── favicon.png │ └── images │ └── r9y9.jpg ├── dump_hparams_to_json.py ├── evaluate.py ├── hparams.py ├── librivox.py ├── ljspeech.py ├── lrschedule.py ├── preprocess.py ├── presets ├── cmu_arctic_8bit.json ├── ljspeech_gaussian.json ├── ljspeech_mixture.json └── multispeaker_cmu_arctic_mixture.json ├── release.sh ├── resyn.wav ├── setup.py ├── synthesis.py ├── synthesis_student.py ├── tests ├── test_audio.py ├── test_misc.py ├── test_mixture.py ├── test_model.py └── test_upsample.py ├── train.py ├── train_student.py └── wavenet_vocoder ├── __init__.py ├── builder.py ├── clari_wavenet.py ├── conv.py ├── mixture.py ├── modules.py ├── student_wavenet.py ├── upsample.py ├── util.py └── wavenet.py /.gitignore: -------------------------------------------------------------------------------- 1 | foobar* 2 | pretrained_models 3 | notebooks 4 | wavenet_vocoder/version.py 5 | checkpoints/* 6 | log 7 | generated 8 | data/ 9 | text 10 | teacher_checkpoints/ 11 | student_checkpoints/ 12 | # Created by https://www.gitignore.io 13 | 14 | ### Python ### 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | env/ 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | 71 | ### IPythonNotebook ### 72 | # Temporary data 73 | .ipynb_checkpoints/ 74 | 75 | 76 | ### SublimeText ### 77 | # cache files for sublime text 78 | *.tmlanguage.cache 79 | *.tmPreferences.cache 80 | *.stTheme.cache 81 | 82 | # workspace files are user-specific 83 | *.sublime-workspace 84 | 85 | # project files should be checked into the repository, unless a significant 86 | # proportion of contributors will probably not be using SublimeText 87 | # *.sublime-project 88 | 89 | # sftp configuration file 90 | sftp-config.json 91 | 92 | 93 | ### Emacs ### 94 | # -*- mode: gitignore; -*- 95 | *~ 96 | \#*\# 97 | /.emacs.desktop 98 | /.emacs.desktop.lock 99 | *.elc 100 | auto-save-list 101 | tramp 102 | .\#* 103 | 104 | # Org-mode 105 | .org-id-locations 106 | *_archive 107 | 108 | # flymake-mode 109 | *_flymake.* 110 | 111 | # eshell files 112 | /eshell/history 113 | /eshell/lastdir 114 | 115 | # elpa packages 116 | /elpa/ 117 | 118 | # reftex files 119 | *.rel 120 | 121 | # AUCTeX auto folder 122 | /auto/ 123 | 124 | # cask packages 125 | .cask/ 126 | 127 | 128 | ### Vim ### 129 | [._]*.s[a-w][a-z] 130 | [._]s[a-w][a-z] 131 | *.un~ 132 | Session.vim 133 | .netrwhist 134 | *~ 135 | 136 | 137 | ### C++ ### 138 | # Compiled Object files 139 | *.slo 140 | *.lo 141 | *.o 142 | *.obj 143 | 144 | # Precompiled Headers 145 | *.gch 146 | *.pch 147 | 148 | # Compiled Dynamic libraries 149 | *.so 150 | *.dylib 151 | *.dll 152 | 153 | # Fortran module files 154 | *.mod 155 | 156 | # Compiled Static libraries 157 | *.lai 158 | *.la 159 | *.a 160 | *.lib 161 | 162 | # Executables 163 | *.exe 164 | *.out 165 | *.app 166 | 167 | 168 | ### OSX ### 169 | .DS_Store 170 | .AppleDouble 171 | .LSOverride 172 | 173 | # Icon must end with two \r 174 | Icon 175 | 176 | 177 | # Thumbnails 178 | ._* 179 | 180 | # Files that might appear on external disk 181 | .Spotlight-V100 182 | .Trashes 183 | 184 | # Directories potentially created on remote AFP share 185 | .AppleDB 186 | .AppleDesktop 187 | Network Trash Folder 188 | Temporary Items 189 | .apdisk 190 | 191 | 192 | ### Linux ### 193 | *~ 194 | 195 | # KDE directory preferences 196 | .directory 197 | 198 | # Linux trash folder which might appear on any partition or disk 199 | .Trash-* 200 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/wavenet_vocoder.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 13 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | notifications: 7 | email: false 8 | 9 | before_install: 10 | - sudo apt-get update 11 | - if [["$TRAVIS_PYTHON_VERSION" == "2.7"]]; then 12 | wget http://repo.continuum.io/miniconda/Miniconda-3.8.3-Linux-x86_64.sh -O miniconda.sh; 13 | else 14 | wget http://repo.continuum.io/miniconda/Miniconda3-3.8.3-Linux-x86_64.sh -O miniconda.sh; 15 | fi 16 | - bash miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda config --add channels pypi 23 | - conda info -a 24 | - deps='pip numpy scipy cython nose pytorch' 25 | - conda create -q -n test-environment "python=$TRAVIS_PYTHON_VERSION" $deps -c pytorch 26 | - source activate test-environment 27 | 28 | install: 29 | - pip install -e ".[test]" 30 | script: 31 | - nosetests -v -w tests/ -a '!local_only' 32 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The wavenet_vocoder package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Ryuichi Yamamoto. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WaveNet vocoder 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/wavenet_vocoder.svg)](https://pypi.python.org/pypi/wavenet_vocoder) 4 | [![Build Status](https://travis-ci.org/r9y9/wavenet_vocoder.svg?branch=master)](https://travis-ci.org/r9y9/wavenet_vocoder) 5 | 6 | The goal of the repository is to provide an implementation of the WaveNet vocoder, which can generate high quality raw speech samples conditioned on linguistic or acoustic features. 
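In practice, the acoustic features used throughout this repository are mel-spectrograms saved as `.npy` files by `preprocess.py`, and the length of the generated waveform is tied to the number of conditioning frames. The snippet below is a minimal, hypothetical illustration of that relationship, not code shipped in the repository; the file path is the one used in the synthesis example later in this README, and `hop_size=256` / 80 mel bands are the values reported for the provided presets.

```py
import numpy as np

# Local-conditioning feature produced by preprocess.py
# (path taken from the synthesis example below; adjust to your data location).
mel = np.load("./data/ljspeech/ljspeech-mel-00001.npy")
num_frames, num_mels = mel.shape  # (T_frames, 80) with the provided presets

# Assuming hop_size=256 (as in the presets), each conditioning frame corresponds
# to 256 raw audio samples, so synthesis will generate roughly this many samples.
hop_size = 256
expected_samples = num_frames * hop_size
print(num_frames, num_mels, expected_samples)
```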
7 | 8 | Audio samples are available at https://r9y9.github.io/wavenet_vocoder/. 9 | 10 | See https://github.com/r9y9/wavenet_vocoder/issues/1 for planned TODOs and current progress. 11 | 12 | 13 | ## Highlights 14 | 15 | - Focus on local and global conditioning of WaveNet, which is essential for a vocoder. 16 | - Mixture of logistic distributions loss / sampling (experimental) 17 | 18 | ## Pre-trained models 19 | 20 | **Note**: This is not a text-to-speech (TTS) model. With a pre-trained model provided here, you can synthesize a waveform given a *mel spectrogram*, not raw text. Pre-trained models for TTS are planned to be released once I finish up [deepvoice3_pytorch/#21](https://github.com/r9y9/deepvoice3_pytorch/pull/21). 21 | 22 | | Model URL | Data | Hyper params URL | Git commit | Steps | 23 | |-----------|------|------------------|------------|-------| 24 | | [link](https://www.dropbox.com/s/8qgcbd1mm2xsqgq/20180127_mixture_lj_checkpoint_step000410000_ema.pth?dl=0) | LJSpeech | [link](https://www.dropbox.com/s/stxasitb56y1zw8/20180127_ljspeech_mixture.json?dl=0) | [489e6fa](https://github.com/r9y9/wavenet_vocoder/commit/489e6fa92eda9ecf5b953b2783d5975d2fdee27a) | 1000k~ steps | 25 | | [link](https://www.dropbox.com/s/d0qk4ow9uuh2lww/20180212_mixture_multispeaker_cmu_arctic_checkpoint_step000740000_ema.pth?dl=0) | CMU ARCTIC | [link](https://www.dropbox.com/s/i35yigj5hvmeol8/20180212_multispeaker_cmu_arctic_mixture.json?dl=0) | [b1a1076](https://github.com/r9y9/wavenet_vocoder/tree/b1a1076e8b5d9b3e275c28f2f7f4d7cd0e75dae4) | 740k steps | 26 | 27 | To use a pre-trained model, first check out the specific git commit noted above, i.e., 28 | 29 | ``` 30 | git checkout ${commit_hash} 31 | ``` 32 | 33 | Then follow the "Synthesize from a checkpoint" section in the README. Note that older versions of synthesis.py may not accept the `--preset=` parameter, and you might have to change `hparams.py` according to the preset (json) file. 34 | 35 | For example, you could try: 36 | 37 | ``` 38 | # Assuming you have downloaded LJSpeech-1.0 at ~/data/LJSpeech-1.0 39 | # pretrained model (20180127_mixture_lj_checkpoint_step000410000_ema.pth) 40 | git checkout 489e6fa 41 | python preprocess.py ljspeech ~/data/LJSpeech-1.0 ./data/ljspeech 42 | python synthesis.py --hparams="input_type=raw,quantize_channels=65536,out_channels=30" \ 43 | --conditional=./data/ljspeech/ljspeech-mel-00001.npy \ 44 | 20180127_mixture_lj_checkpoint_step000410000_ema.pth \ 45 | generated 46 | ``` 47 | 48 | You can find the generated wav file in the `generated` directory. Wondering how it works? Take a look at the code :) 49 | 50 | ## Requirements 51 | 52 | - Python 3 53 | - CUDA >= 8.0 54 | - TensorFlow >= v1.3 55 | 56 | ## Installation 57 | 58 | The repository contains a core library (a PyTorch implementation of WaveNet) and utility scripts. 
The library and its dependencies can be installed by: 59 | 60 | ``` 61 | git clone https://github.com/r9y9/wavenet_vocoder 62 | cd wavenet_vocoder 63 | pip install -e ".[train]" 64 | ``` 65 | 66 | If you only need the library part, you can install it with the following command: 67 | 68 | ``` 69 | pip install wavenet_vocoder 70 | ``` 71 | 72 | ## Getting started 73 | 74 | ### Preset parameters 75 | 76 | There are many hyperparameters to be tuned depending on the data. For typical datasets, parameters known to work well (**presets**) are provided in the repository. See the `presets` directory for details. Notice that 77 | 78 | 1. `preprocess.py` 79 | 2. `train.py` 80 | 3. `synthesis.py` 81 | 82 | accept an *optional* `--preset=` parameter, which specifies where to load preset parameters from. If you are going to use preset parameters, you must use the same `--preset=` throughout preprocessing, training and evaluation. e.g., 83 | 84 | ``` 85 | python preprocess.py --preset=presets/cmu_arctic_8bit.json cmu_arctic ~/data/cmu_arctic 86 | python train.py --preset=presets/cmu_arctic_8bit.json --data-root=./data/cmu_arctic 87 | ``` 88 | 89 | instead of 90 | 91 | ``` 92 | python preprocess.py cmu_arctic ~/data/cmu_arctic 93 | # warning! this may use hyperparameters different from those used at the preprocessing stage 94 | python train.py --preset=presets/cmu_arctic_8bit.json --data-root=./data/cmu_arctic 95 | ``` 96 | 97 | ### 0. Download dataset 98 | 99 | - CMU ARCTIC (en): http://festvox.org/cmu_arctic/ 100 | - LJSpeech (en): https://keithito.com/LJ-Speech-Dataset/ 101 | 102 | ### 1. Preprocessing 103 | 104 | Usage: 105 | 106 | ``` 107 | python preprocess.py ${dataset_name} ${dataset_path} ${out_dir} --preset= 108 | ``` 109 | 110 | Supported `${dataset_name}`s for now are 111 | 112 | - `cmu_arctic` (multi-speaker) 113 | - `ljspeech` (single speaker) 114 | 115 | Assuming you use the preset parameters known to work well for the CMU ARCTIC dataset and have the data in `~/data/cmu_arctic`, you can preprocess the data by: 116 | 117 | ``` 118 | python preprocess.py cmu_arctic ~/data/cmu_arctic ./data/cmu_arctic --preset=presets/cmu_arctic_8bit.json 119 | ``` 120 | 121 | When this is done, you will see time-aligned extracted features (pairs of audio and mel-spectrogram) in `./data/cmu_arctic`. 122 | 123 | ### 2. Training 124 | 125 | Usage: 126 | 127 | ``` 128 | python train.py --data-root=${data-root} --preset= --hparams="parameters you want to override" 129 | ``` 130 | 131 | Important options: 132 | 133 | - `--speaker-id=`: (Multi-speaker dataset only) specifies which speaker's data is used for training. If this is not specified, all training data are used. This should only be specified when you are dealing with a multi-speaker dataset. For example, if you are trying to build a speaker-dependent WaveNet vocoder for speaker `awb` of CMU ARCTIC, you have to specify `--speaker-id=0`. 
Speaker ID is automatically assigned as follows: 134 | 135 | ```py 136 | In [1]: from nnmnkwii.datasets import cmu_arctic 137 | 138 | In [2]: [(i, s) for (i,s) in enumerate(cmu_arctic.available_speakers)] 139 | Out[2]: 140 | 141 | [(0, 'awb'), 142 | (1, 'bdl'), 143 | (2, 'clb'), 144 | (3, 'jmk'), 145 | (4, 'ksp'), 146 | (5, 'rms'), 147 | (6, 'slt')] 148 | ``` 149 | 150 | #### Training an unconditional WaveNet 151 | 152 | ``` 153 | python train.py --data-root=./data/cmu_arctic/ 154 | --hparams="cin_channels=-1,gin_channels=-1" 155 | ``` 156 | 157 | You have to disable global and local conditioning by setting `gin_channels` and `cin_channels` to negative values. 158 | 159 | #### Training WaveNet conditioned on mel-spectrogram 160 | 161 | ``` 162 | python train.py --data-root=./data/cmu_arctic/ --speaker-id=0 \ 163 | --hparams="cin_channels=80,gin_channels=-1" 164 | ``` 165 | 166 | #### Training WaveNet conditioned on mel-spectrogram and speaker embedding 167 | 168 | ``` 169 | python train.py --data-root=./data/cmu_arctic/ \ 170 | --hparams="cin_channels=80,gin_channels=16,n_speakers=7" 171 | ``` 172 | 173 | ### 3. Monitor with Tensorboard 174 | 175 | Logs are dumped into the `./log` directory by default. You can monitor them with TensorBoard: 176 | 177 | ``` 178 | tensorboard --logdir=log 179 | ``` 180 | 181 | ### 4. Synthesize from a checkpoint 182 | 183 | Usage: 184 | 185 | ``` 186 | python synthesis.py ${checkpoint_path} ${output_dir} --preset= --hparams="parameters you want to override" 187 | ``` 188 | 189 | Important options: 190 | 191 | - `--length=`: (Unconditional WaveNet only) Number of time steps to generate. 192 | - `--conditional=`: (Required for conditional WaveNet) Path of local conditional features (.npy). If this is specified, the number of time steps to generate is determined by the size of the conditional feature. 193 | 194 | e.g., 195 | 196 | ``` 197 | python synthesis.py --hparams="parameters you want to override" \ 198 | checkpoints_awb/checkpoint_step000100000.pth \ 199 | generated/test_awb \ 200 | --conditional=./data/cmu_arctic/cmu_arctic-mel-00001.npy 201 | ``` 202 | 203 | ## Misc 204 | 205 | ### Synthesize audio samples for testset 206 | 207 | Usage: 208 | 209 | 210 | ``` 211 | python evaluate.py ${checkpoint_path} ${output_dir} --data-root="data location"\ 212 | --hparams="parameters you want to override" 213 | ``` 214 | 215 | This script is used for generating sounds for https://r9y9.github.io/wavenet_vocoder/. 216 | 217 | Options: 218 | 219 | - `--data-root`: Data root. This is required to collect the testset. 220 | - `--num-utterances`: (For multi-speaker models) number of utterances to be generated per speaker. This is especially useful when the testset is large and you don't want to generate all utterances. For a single-speaker dataset, you can hit `ctrl-c` whenever you want to stop evaluation. 221 | 222 | e.g., 223 | 224 | ``` 225 | python evaluate.py --data-root=./data/cmu_arctic/ \ 226 | ./checkpoints_awb/checkpoint_step000100000.pth \ 227 | ./generated/cmu_arctic_awb 228 | ``` 229 | 230 | ## References 231 | 232 | - [Aaron van den Oord, Sander Dieleman, Heiga Zen, et al, "WaveNet: A Generative Model for Raw Audio", arXiv:1609.03499, Sep 2016.](https://arxiv.org/abs/1609.03499) 233 | - [Aaron van den Oord, Yazhe Li, Igor Babuschkin, et al, "Parallel WaveNet: Fast High-Fidelity Speech Synthesis", arXiv:1711.10433, Nov 2017.](https://arxiv.org/abs/1711.10433) 234 | - [Tamamori, Akira, et al. "Speaker-dependent WaveNet vocoder." Proceedings of Interspeech. 
2017.](http://www.isca-speech.org/archive/Interspeech_2017/pdfs/0314.PDF) 235 | - [Jonathan Shen, Ruoming Pang, Ron J. Weiss, et al, "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions", arXiv:1712.05884, Dec 2017.](https://arxiv.org/abs/1712.05884) 236 | - [Wei Ping, Kainan Peng, Andrew Gibiansky, et al, "Deep Voice 3: 2000-Speaker Neural Text-to-Speech", arXiv:1710.07654, Oct. 2017.](https://arxiv.org/abs/1710.07654) 237 | -------------------------------------------------------------------------------- /audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def trim(quantized): 22 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 23 | return quantized[start:end] 24 | 25 | 26 | def adjust_time_resolution(quantized, mel): 27 | """Adjust time resolution by repeating features 28 | 29 | Args: 30 | quantized (ndarray): (T,) 31 | mel (ndarray): (N, D) 32 | 33 | Returns: 34 | tuple: Tuple of (T,) and (T, D) 35 | """ 36 | assert len(quantized.shape) == 1 37 | assert len(mel.shape) == 2 38 | 39 | upsample_factor = quantized.size // mel.shape[0] 40 | mel = np.repeat(mel, upsample_factor, axis=0) 41 | n_pad = quantized.size - mel.shape[0] 42 | if n_pad != 0: 43 | assert n_pad > 0 44 | mel = np.pad(mel, [(0, n_pad), (0, 0)], mode="constant", constant_values=0) 45 | 46 | # trim 47 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 48 | 49 | return quantized[start:end], mel[start:end, :] 50 | adjast_time_resolution = adjust_time_resolution # 'adjust' is correct spelling, this is for compatibility 51 | 52 | 53 | def start_and_end_indices(quantized, silence_threshold=2): 54 | for start in range(quantized.size): 55 | if abs(quantized[start] - 127) > silence_threshold: 56 | break 57 | for end in range(quantized.size - 1, 1, -1): 58 | if abs(quantized[end] - 127) > silence_threshold: 59 | break 60 | 61 | assert abs(quantized[start] - 127) > silence_threshold 62 | assert abs(quantized[end] - 127) > silence_threshold 63 | 64 | return start, end 65 | 66 | 67 | def melspectrogram(y): 68 | D = _lws_processor().stft(y).T 69 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 70 | if not hparams.allow_clipping_in_normalization: 71 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 72 | return _normalize(S) 73 | 74 | 75 | def get_hop_size(): 76 | hop_size = hparams.hop_size 77 | if hop_size is None: 78 | assert hparams.frame_shift_ms is not None 79 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 80 | return hop_size 81 | 82 | 83 | def _lws_processor(): 84 | return lws.lws(hparams.fft_size, get_hop_size(), mode="speech") 85 | 86 | 87 | def lws_num_frames(length, fsize, fshift): 88 | """Compute number of time frames of lws spectrogram 89 | """ 90 | pad = (fsize - fshift) 91 | if length % fshift == 0: 92 | M = (length + pad * 2 - fsize) // fshift + 1 93 | else: 94 | M = (length + pad * 2 - fsize) // fshift + 2 95 | return M 96 | 97 | 98 | def lws_pad_lr(x, fsize, fshift): 99 | """Compute left and 
right padding lws internally uses 100 | """ 101 | M = lws_num_frames(len(x), fsize, fshift) 102 | pad = (fsize - fshift) 103 | T = len(x) + 2 * pad 104 | r = (M - 1) * fshift + fsize - T 105 | return pad, pad + r 106 | 107 | # Conversions: 108 | 109 | 110 | _mel_basis = None 111 | 112 | 113 | def _linear_to_mel(spectrogram): 114 | global _mel_basis 115 | if _mel_basis is None: 116 | _mel_basis = _build_mel_basis() 117 | return np.dot(_mel_basis, spectrogram) 118 | 119 | 120 | def _build_mel_basis(): 121 | assert hparams.fmax <= hparams.sample_rate // 2 122 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, 123 | fmin=hparams.fmin, fmax=hparams.fmax, 124 | n_mels=hparams.num_mels) 125 | 126 | 127 | def _amp_to_db(x): 128 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 129 | return 20 * np.log10(np.maximum(min_level, x)) 130 | 131 | 132 | def _db_to_amp(x): 133 | return np.power(10.0, x * 0.05) 134 | 135 | 136 | def _normalize(S): 137 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 138 | 139 | 140 | def _denormalize(S): 141 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 142 | -------------------------------------------------------------------------------- /cmu_arctic.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import cmu_arctic 7 | from nnmnkwii.io import hts 8 | from nnmnkwii import preprocessing as P 9 | from hparams import hparams 10 | from os.path import exists 11 | import librosa 12 | 13 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 14 | 15 | from hparams import hparams 16 | 17 | 18 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 19 | executor = ProcessPoolExecutor(max_workers=num_workers) 20 | futures = [] 21 | 22 | speakers = cmu_arctic.available_speakers 23 | 24 | wd = cmu_arctic.WavFileDataSource(in_dir, speakers=speakers) 25 | wav_paths = wd.collect_files() 26 | speaker_ids = wd.labels 27 | 28 | for index, (speaker_id, wav_path) in enumerate( 29 | zip(speaker_ids, wav_paths)): 30 | futures.append(executor.submit( 31 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, "N/A"))) 32 | return [future.result() for future in tqdm(futures)] 33 | 34 | 35 | def start_at(labels): 36 | has_silence = labels[0][-1] == "pau" 37 | if not has_silence: 38 | return labels[0][0] 39 | for i in range(1, len(labels)): 40 | if labels[i][-1] != "pau": 41 | return labels[i][0] 42 | assert False 43 | 44 | 45 | def end_at(labels): 46 | has_silence = labels[-1][-1] == "pau" 47 | if not has_silence: 48 | return labels[-1][1] 49 | for i in range(len(labels) - 2, 0, -1): 50 | if labels[i][-1] != "pau": 51 | return labels[i][1] 52 | assert False 53 | 54 | 55 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 56 | sr = hparams.sample_rate 57 | 58 | # Load the audio to a numpy array. 
Resampled if needed 59 | wav = audio.load_wav(wav_path) 60 | 61 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 62 | 63 | # Trim silence from hts labels if available 64 | # TODO 65 | if exists(lab_path) and False: 66 | labels = hts.load(lab_path) 67 | b = int(start_at(labels) * 1e-7 * sr) 68 | e = int(end_at(labels) * 1e-7 * sr) 69 | wav = wav[b:e] 70 | wav, _ = librosa.effects.trim(wav, top_db=20) 71 | else: 72 | wav, _ = librosa.effects.trim(wav, top_db=20) 73 | 74 | if hparams.rescaling: 75 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 76 | 77 | # Mu-law quantize 78 | if is_mulaw_quantize(hparams.input_type): 79 | # [0, quantize_channels) 80 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 81 | 82 | # Trim silences 83 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 84 | wav = wav[start:end] 85 | out = out[start:end] 86 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 87 | out_dtype = np.int16 88 | elif is_mulaw(hparams.input_type): 89 | # [-1, 1] 90 | out = P.mulaw(wav, hparams.quantize_channels) 91 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 92 | out_dtype = np.float32 93 | else: 94 | # [-1, 1] 95 | out = wav 96 | constant_values = 0.0 97 | out_dtype = np.float32 98 | 99 | # Compute a mel-scale spectrogram from the trimmed wav: 100 | # (N, D) 101 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 102 | # lws pads zeros internally before performing stft 103 | # this is needed to adjust time resolution between audio and mel-spectrogram 104 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 105 | 106 | # zero pad for quantized signal 107 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 108 | N = mel_spectrogram.shape[0] 109 | assert len(out) >= N * audio.get_hop_size() 110 | 111 | # time resolution adjustment 112 | # ensure length of raw audio is multiple of hop_size so that we can use 113 | # transposed convolution to upsample 114 | out = out[:N * audio.get_hop_size()] 115 | assert len(out) % audio.get_hop_size() == 0 116 | 117 | timesteps = len(out) 118 | 119 | # Write the spectrograms to disk: 120 | audio_filename = 'cmu_arctic-audio-%05d.npy' % index 121 | mel_filename = 'cmu_arctic-mel-%05d.npy' % index 122 | np.save(os.path.join(out_dir, audio_filename), 123 | out.astype(out_dtype), allow_pickle=False) 124 | np.save(os.path.join(out_dir, mel_filename), 125 | mel_spectrogram.astype(np.float32), allow_pickle=False) 126 | 127 | # Return a tuple describing this training example: 128 | return (audio_filename, mel_filename, timesteps, text, speaker_id) 129 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | public 2 | static/audio -------------------------------------------------------------------------------- /docs/config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "https://r9y9.github.io/wavenet_vocoder/" 2 | languageCode = "ja-jp" 3 | title = "An open source implementation of WaveNet vocoder" 4 | author = "Ryuichi YAMAMOTO" 5 | 6 | [params] 7 | author = "Ryuichi YAMAMOTO" 8 | logo = "/images/r9y9.jpg" 9 | twitter = "r9y9" 10 | github = "r9y9" 11 | analytics = "UA-44433856-1" 12 | -------------------------------------------------------------------------------- /docs/content/index.md: 
-------------------------------------------------------------------------------- 1 | +++ 2 | Categories = [] 3 | Description = "" 4 | Keywords = [] 5 | Tags = [] 6 | date = "2018-01-04T19:42:01+09:00" 7 | title = "index" 8 | type = "index" 9 | +++ 10 | 11 |
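As a quick sanity check on the model configurations listed in the tables below, the reported receptive field sizes (505 samples / 22.9 ms and 1021 samples / 63.8 ms) can be reproduced from the layer counts alone, assuming a kernel size of 3 and dilations that double within each cycle (1, 2, 4, ...), which is consistent with the reported numbers. The snippet below is an illustrative calculation, not code from the repository.

```py
def receptive_field(total_layers, num_cycles, kernel_size=3):
    # Dilations double within each cycle: 1, 2, 4, ..., 2**(layers_per_cycle - 1).
    layers_per_cycle = total_layers // num_cycles
    dilations = [2 ** i for i in range(layers_per_cycle)] * num_cycles
    return (kernel_size - 1) * sum(dilations) + 1

# 24 layers / 4 cycles: 505 samples, ~22.9 ms at 22.05 kHz
print(receptive_field(24, 4), receptive_field(24, 4) / 22050 * 1e3)

# 16 layers / 2 cycles: 1021 samples, ~63.8 ms at 16 kHz
print(receptive_field(16, 2), receptive_field(16, 2) / 16000 * 1e3)
```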
12 | 13 | - Github: https://github.com/r9y9/wavenet_vocoder 14 | 15 | This page provides audio samples for the open source implementation of the **WaveNet (WN)** vocoder. 16 | 17 | 1. WN conditioned on mel-spectrogram (16-bit linear PCM, 22.5kHz) 18 | 2. WN conditioned on mel-spectrogram (8-bit mu-law, 16kHz) 19 | 3. WN conditioned on mel-spectrogram and speaker-embedding (16-bit linear PCM, 16kHz) 20 | 3. (Not yet) DeepVoice3 + WaveNet vocoder 21 | 22 | ## WN conditioned on mel-spectrogram (16-bit linear PCM, 22.5kHz) 23 | 24 | - Samples from a model trained for over 400k steps. 25 | - Left: generated, Right: ground truth 26 | 27 | 31 | 35 | 36 | 40 | 44 | 45 | 49 | 53 | 54 | 58 | 62 | 63 | 67 | 71 | 72 | 76 | 80 | 81 | 85 | 89 | 90 | 94 | 98 | 99 | 103 | 107 | 108 | 112 | 116 | 117 | | key | value | 118 | |---------------------------------|------------------------------------------------------| 119 | | Data | LJSpeech (12522 for training, 578 for testing) | 120 | | Input type | 16-bit linear PCM | 121 | | Sampling frequency | 22.5kHz | 122 | | Local conditioning | 80-dim mel-spectrogram | 123 | | Hop size | 256 | 124 | | Global conditioning | N/A | 125 | | Total layers | 24 | 126 | | Num cycles | 4 | 127 | | Residual / Gate / Skip-out channels | 512 / 512 / 256 | 128 | | Receptive field (samples / ms) | 505 / 22.9 | 129 | | Numer of mixtures | 10 | 130 | | Number of upsampling layers | 4 | 131 | 132 | ## WN conditioned on mel-spectrogram (8-bit mu-law, 16kHz) 133 | 134 | - Samples from a model trained for 100k steps (~22 hours) 135 | - Left: generated, Right: (mu-law encoded) ground truth 136 | 137 | 141 | 145 | 146 | 150 | 154 | 155 | 159 | 163 | 164 | 168 | 172 | 173 | 177 | 181 | 182 | 186 | 190 | 191 | 195 | 199 | 200 | 204 | 208 | 209 | 213 | 217 | 218 | 222 | 226 | 227 | | key | value | 228 | |---------------------------------|------------------------------------------------------| 229 | | Data | CMU ARCTIC (`clb`) (1183 for training, 50 for testing) | 230 | | Input type | 8-bit mu-law encoded one-hot vector | 231 | | Sampling frequency | 16kHz | 232 | | Local conditioning | 80-dim mel-spectrogram | 233 | | Hop size | 256 | 234 | | Global conditioning | N/A | 235 | | Total layers | 16 | 236 | | Num cycles | 2 | 237 | | Residual / Gate / Skip-out channels | 512 / 512 / 256 | 238 | | Receptive field (samples / ms) | 1021 / 63.8 | 239 | | Number of upsampling layers | N/A | 240 | 241 | 242 | ## WN conditioned on mel-spectrogram and speaker-embedding (16-bit linear PCM, 16kHz) 243 | 244 | - Samples from a model trained for over 1000k steps 245 | - Left: generated, Right: ground truth 246 | 247 | **awb** 248 | 249 | 253 | 257 | 258 | 262 | 266 | 267 | **bdl** 268 | 269 | 273 | 277 | 278 | 282 | 286 | 287 | **clb** 288 | 289 | 293 | 297 | 298 | 302 | 306 | 307 | **jmk** 308 | 309 | 313 | 317 | 318 | 322 | 326 | 327 | 328 | **ksp** 329 | 330 | 334 | 338 | 339 | 343 | 347 | 348 | 349 | **rms** 350 | 351 | 355 | 359 | 360 | 364 | 368 | 369 | **slt** 370 | 371 | 375 | 379 | 380 | 384 | 388 | 389 | | key | value | 390 | |---------------------------------|------------------------------------------------------| 391 | | Data | CMU ARCTIC (7580 for training, 350 for testing) | 392 | | Input type | 8-bit mu-law encoded one-hot vector | 393 | | Local conditioning | 80-dim mel-spectrogram | 394 | | Hop size | 256 | 395 | | Global conditioning | 16-dim speaker embedding [^1] | 396 | | Total layers | 24 | 397 | | Num cycles | 4 | 398 | | Residual / Gate / Skip-out channels | 512 / 512 / 256 | 399 | | 
Receptive field (samples / ms) | 505 / 22.9 | 400 | | Number of mixtures | 10 | 401 | | Number of upsampling layers | 4 | 402 | 403 | [^1]: Note that the mel-spectrogram used for local conditioning is itself dependent on speaker characteristics, so we cannot simply change the speaker identity of the generated audio samples with this model. It should work without the speaker embedding, but the embedding might have helped training speed. 404 | 405 | ## DeepVoice3 + WaveNet vocoder 406 | 407 | TODO 408 | 409 | ## References 410 | 411 | - [Aaron van den Oord, Sander Dieleman, Heiga Zen, et al, "WaveNet: A Generative Model for Raw Audio", arXiv:1609.03499, Sep 2016.](https://arxiv.org/abs/1609.03499) 412 | - [Aaron van den Oord, Yazhe Li, Igor Babuschkin, et al, "Parallel WaveNet: Fast High-Fidelity Speech Synthesis", arXiv:1711.10433, Nov 2017.](https://arxiv.org/abs/1711.10433) 413 | - [Tamamori, Akira, et al. "Speaker-dependent WaveNet vocoder." Proceedings of Interspeech. 2017.](http://www.isca-speech.org/archive/Interspeech_2017/pdfs/0314.PDF) 414 | - [Jonathan Shen, Ruoming Pang, Ron J. Weiss, et al, "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions", arXiv:1712.05884, Dec 2017.](https://arxiv.org/abs/1712.05884) 415 | - [Wei Ping, Kainan Peng, Andrew Gibiansky, et al, "Deep Voice 3: 2000-Speaker Neural Text-to-Speech", arXiv:1710.07654, Oct. 2017.](https://arxiv.org/abs/1710.07654) 416 | -------------------------------------------------------------------------------- /docs/layouts/_default/list.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 | 
4 |

{{ .Title }}

5 | {{ range .Data.Pages }} 6 | 10 | {{ end }} 11 |
12 | 13 | {{ partial "footer.html" . }} -------------------------------------------------------------------------------- /docs/layouts/_default/single.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 |
4 |
5 |

{{ .Title }}

6 | 7 |
8 | {{ .Content }} 9 | {{ partial "social.html" . }} 10 |
11 |
12 |
13 | 14 | {{ partial "footer.html" . }} 15 | -------------------------------------------------------------------------------- /docs/layouts/index.html: -------------------------------------------------------------------------------- 1 | {{ template "partials/header.html" . }} 2 | {{ range .Data.Pages }} 3 | {{if eq .Type "index" }} 4 | {{.Content}} 5 | {{end}} 6 | {{ end }} 7 | {{ template "partials/footer.html" . }} 8 | -------------------------------------------------------------------------------- /docs/layouts/partials/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 22 | 23 | 24 | 25 | {{ with .Site.Params.analytics }}{{ end }} 33 | 34 | 35 | 36 | 37 | {{ partial "mathjax.html" . }} 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/layouts/partials/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ .Hugo.Generator }} 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | {{ $isHomePage := eq .Title .Site.Title }}{{ .Title }}{{ if eq $isHomePage false }} - {{ .Site.Title }}{{ end }} 15 | 16 | 17 | 18 |
19 | 20 |
21 | 24 | {{ if eq $isHomePage true }}

{{ .Site.Title }}

{{ end }} 25 |
26 | -------------------------------------------------------------------------------- /docs/layouts/partials/mathjax.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 | 30 | 31 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/layouts/partials/social.html: -------------------------------------------------------------------------------- 1 | {{ if isset .Site.Params "twitter" }} 2 | 8 | {{ end }} 9 | -------------------------------------------------------------------------------- /docs/static/css/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: "Roboto", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | background-color: #FCFCFC; 4 | -webkit-font-smoothing: antialiased; 5 | font-size: 1.8em; 6 | line-height: 1.5; 7 | font-weight: 300; 8 | } 9 | 10 | h1, h2, h3, h4, h5, h6 { 11 | color: #263c4c; 12 | } 13 | h2, h3, h4, h5, h6 { 14 | margin-top: 5rem; 15 | margin-bottom: 3rem; 16 | font-weight: bold; 17 | padding-bottom: 10px; 18 | } 19 | 20 | h1 { font-size: 3.0rem; } 21 | h2 { 22 | margin-top: 6rem; 23 | font-size: 2.6rem; 24 | } 25 | h3 { font-size: 2.1rem; } 26 | h4, 27 | h5, 28 | h6 { font-size: 1.9rem; } 29 | 30 | h2.entry-title { 31 | font-size: 2.1rem; 32 | margin-top: 0; 33 | font-weight: 400; 34 | border-bottom: none; 35 | } 36 | 37 | li { 38 | margin-bottom: 0.5rem; 39 | margin-left: 0.7em; 40 | } 41 | 42 | img { 43 | max-width: 100%; 44 | height: auto; 45 | vertical-align: middle; 46 | border: 0; 47 | margin: 1em 0; 48 | } 49 | 50 | header, 51 | footer { 52 | margin: 4rem 0; 53 | text-align: center; 54 | } 55 | 56 | main { 57 | margin: 4rem 0; 58 | } 59 | 60 | .container { 61 | width: 90%; 62 | max-width: 700px; 63 | } 64 | 65 | .header-logo img { 66 | border-radius: 50%; 67 | border: 2px solid #E1E1E1; 68 | } 69 | 70 | .header-logo img:hover { 71 | border-color: #F1F1F1; 72 | } 73 | 74 | .site-title { 75 | margin-top: 2rem; 76 | } 77 | 78 | .entry-title { 79 | margin-bottom: 0; 80 | } 81 | 82 | .entry-title a { 83 | text-decoration: none; 84 | } 85 | 86 | .entry-meta { 87 | display: inline-block; 88 | margin-bottom: 2rem; 89 | font-size: 1.6rem; 90 | color: #888; 91 | } 92 | 93 | .footer-link { 94 | margin: 2rem 0; 95 | } 96 | 97 | .hr { 98 | height: 1px; 99 | margin: 2rem 0; 100 | background: #E1E1E1; 101 | background: -webkit-gradient(linear, left top, right top, from(white), color-stop(#E1E1E1), to(white)); 102 | background: -webkit-linear-gradient(left, white, #E1E1E1, white); 103 | background: linear-gradient(to right, white, #E1E1E1, white); 104 | } 105 | 106 | article .social { 107 | height: 40px; 108 | padding: 10px 0; 109 | } 110 | 111 | address { 112 | margin: 0; 113 | font-size:0.9em; 114 | max-height: 60px; 115 | font-weight: 300; 116 | font-style: normal; 117 | display: block; 118 | } 119 | 120 | address a { 121 | text-decoration: none; 122 | } 123 | 124 | .avatar-bottom img { 125 | border-radius: 50%; 126 | border: 1px solid #E1E1E1; 127 | float: left; 128 | max-width: 100%; 129 | vertical-align: middle; 130 | width: 32px; 131 | height: 32px; 132 | margin: 0 20px 0 0; 133 | margin-top: -7px; 134 | } 135 | 136 | .avatar-bottom img:hover { 137 | border-color: #F1F1F1; 138 | } 139 | 140 | .copyright { 141 | font-size:0.9em; 142 | font-weight: 300; 143 | } 144 | 145 | .github { 146 | float: right; 147 | } 148 | 149 | blockquote { 150 | position: relative; 151 | padding: 
10px 10px 10px 32px; 152 | box-sizing: border-box; 153 | font-style: italic; 154 | color: #464646; 155 | background: #e0e0e0; 156 | } 157 | 158 | blockquote:before{ 159 | display: inline-block; 160 | position: absolute; 161 | top: 0; 162 | left: 0; 163 | vertical-align: middle; 164 | content: "\f10d"; 165 | font-family: FontAwesome; 166 | color: #e0e0e0; 167 | font-size: 22px; 168 | line-height: 1; 169 | z-index: 2; 170 | } 171 | 172 | blockquote:after{ 173 | position: absolute; 174 | content: ''; 175 | left: 0; 176 | top: 0; 177 | border-width: 0 0 40px 40px; 178 | border-style: solid; 179 | border-color: transparent #ffffff; 180 | } 181 | 182 | blockquote p { 183 | position: relative; 184 | padding: 0; 185 | margin: 10px 0; 186 | z-index: 3; 187 | line-height: 1.7; 188 | } 189 | 190 | blockquote cite { 191 | display: block; 192 | text-align: right; 193 | color: #888888; 194 | font-size: 0.9em; 195 | } 196 | -------------------------------------------------------------------------------- /docs/static/css/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v3.0.2 | MIT License | git.io/normalize */ 2 | 3 | /** 4 | * 1. Set default font family to sans-serif. 5 | * 2. Prevent iOS text size adjust after orientation change, without disabling 6 | * user zoom. 7 | */ 8 | 9 | html { 10 | font-family: sans-serif; /* 1 */ 11 | -ms-text-size-adjust: 100%; /* 2 */ 12 | -webkit-text-size-adjust: 100%; /* 2 */ 13 | } 14 | 15 | /** 16 | * Remove default margin. 17 | */ 18 | 19 | body { 20 | margin: 0; 21 | } 22 | 23 | /* HTML5 display definitions 24 | ========================================================================== */ 25 | 26 | /** 27 | * Correct `block` display not defined for any HTML5 element in IE 8/9. 28 | * Correct `block` display not defined for `details` or `summary` in IE 10/11 29 | * and Firefox. 30 | * Correct `block` display not defined for `main` in IE 11. 31 | */ 32 | 33 | article, 34 | aside, 35 | details, 36 | figcaption, 37 | figure, 38 | footer, 39 | header, 40 | hgroup, 41 | main, 42 | menu, 43 | nav, 44 | section, 45 | summary { 46 | display: block; 47 | } 48 | 49 | /** 50 | * 1. Correct `inline-block` display not defined in IE 8/9. 51 | * 2. Normalize vertical alignment of `progress` in Chrome, Firefox, and Opera. 52 | */ 53 | 54 | audio, 55 | canvas, 56 | progress, 57 | video { 58 | display: inline-block; /* 1 */ 59 | vertical-align: baseline; /* 2 */ 60 | } 61 | 62 | /** 63 | * Prevent modern browsers from displaying `audio` without controls. 64 | * Remove excess height in iOS 5 devices. 65 | */ 66 | 67 | audio:not([controls]) { 68 | display: none; 69 | height: 0; 70 | } 71 | 72 | /** 73 | * Address `[hidden]` styling not present in IE 8/9/10. 74 | * Hide the `template` element in IE 8/9/11, Safari, and Firefox < 22. 75 | */ 76 | 77 | [hidden], 78 | template { 79 | display: none; 80 | } 81 | 82 | /* Links 83 | ========================================================================== */ 84 | 85 | /** 86 | * Remove the gray background color from active links in IE 10. 87 | */ 88 | 89 | a { 90 | background-color: transparent; 91 | } 92 | 93 | /** 94 | * Improve readability when focused and also mouse hovered in all browsers. 95 | */ 96 | 97 | a:active, 98 | a:hover { 99 | outline: 0; 100 | } 101 | 102 | /* Text-level semantics 103 | ========================================================================== */ 104 | 105 | /** 106 | * Address styling not present in IE 8/9/10/11, Safari, and Chrome. 
107 | */ 108 | 109 | abbr[title] { 110 | border-bottom: 1px dotted; 111 | } 112 | 113 | /** 114 | * Address style set to `bolder` in Firefox 4+, Safari, and Chrome. 115 | */ 116 | 117 | b, 118 | strong { 119 | font-weight: bold; 120 | } 121 | 122 | /** 123 | * Address styling not present in Safari and Chrome. 124 | */ 125 | 126 | dfn { 127 | font-style: italic; 128 | } 129 | 130 | /** 131 | * Address variable `h1` font-size and margin within `section` and `article` 132 | * contexts in Firefox 4+, Safari, and Chrome. 133 | */ 134 | 135 | h1 { 136 | font-size: 2em; 137 | margin: 0.67em 0; 138 | } 139 | 140 | /** 141 | * Address styling not present in IE 8/9. 142 | */ 143 | 144 | mark { 145 | background: #ff0; 146 | color: #000; 147 | } 148 | 149 | /** 150 | * Address inconsistent and variable font size in all browsers. 151 | */ 152 | 153 | small { 154 | font-size: 80%; 155 | } 156 | 157 | /** 158 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 159 | */ 160 | 161 | sub, 162 | sup { 163 | font-size: 75%; 164 | line-height: 0; 165 | position: relative; 166 | vertical-align: baseline; 167 | } 168 | 169 | sup { 170 | top: -0.5em; 171 | } 172 | 173 | sub { 174 | bottom: -0.25em; 175 | } 176 | 177 | /* Embedded content 178 | ========================================================================== */ 179 | 180 | /** 181 | * Remove border when inside `a` element in IE 8/9/10. 182 | */ 183 | 184 | img { 185 | border: 0; 186 | } 187 | 188 | /** 189 | * Correct overflow not hidden in IE 9/10/11. 190 | */ 191 | 192 | svg:not(:root) { 193 | overflow: hidden; 194 | } 195 | 196 | /* Grouping content 197 | ========================================================================== */ 198 | 199 | /** 200 | * Address margin not present in IE 8/9 and Safari. 201 | */ 202 | 203 | figure { 204 | margin: 1em 40px; 205 | } 206 | 207 | /** 208 | * Address differences between Firefox and other browsers. 209 | */ 210 | 211 | hr { 212 | -moz-box-sizing: content-box; 213 | box-sizing: content-box; 214 | height: 0; 215 | } 216 | 217 | /** 218 | * Contain overflow in all browsers. 219 | */ 220 | 221 | pre { 222 | overflow: auto; 223 | } 224 | 225 | /** 226 | * Address odd `em`-unit font size rendering in all browsers. 227 | */ 228 | 229 | code, 230 | kbd, 231 | pre, 232 | samp { 233 | font-family: monospace, monospace; 234 | font-size: 1em; 235 | } 236 | 237 | /* Forms 238 | ========================================================================== */ 239 | 240 | /** 241 | * Known limitation: by default, Chrome and Safari on OS X allow very limited 242 | * styling of `select`, unless a `border` property is set. 243 | */ 244 | 245 | /** 246 | * 1. Correct color not being inherited. 247 | * Known issue: affects color of disabled elements. 248 | * 2. Correct font properties not being inherited. 249 | * 3. Address margins set differently in Firefox 4+, Safari, and Chrome. 250 | */ 251 | 252 | button, 253 | input, 254 | optgroup, 255 | select, 256 | textarea { 257 | color: inherit; /* 1 */ 258 | font: inherit; /* 2 */ 259 | margin: 0; /* 3 */ 260 | } 261 | 262 | /** 263 | * Address `overflow` set to `hidden` in IE 8/9/10/11. 264 | */ 265 | 266 | button { 267 | overflow: visible; 268 | } 269 | 270 | /** 271 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 272 | * All other form control elements do not inherit `text-transform` values. 273 | * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera. 274 | * Correct `select` style inheritance in Firefox. 
275 | */ 276 | 277 | button, 278 | select { 279 | text-transform: none; 280 | } 281 | 282 | /** 283 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 284 | * and `video` controls. 285 | * 2. Correct inability to style clickable `input` types in iOS. 286 | * 3. Improve usability and consistency of cursor style between image-type 287 | * `input` and others. 288 | */ 289 | 290 | button, 291 | html input[type="button"], /* 1 */ 292 | input[type="reset"], 293 | input[type="submit"] { 294 | -webkit-appearance: button; /* 2 */ 295 | cursor: pointer; /* 3 */ 296 | } 297 | 298 | /** 299 | * Re-set default cursor for disabled elements. 300 | */ 301 | 302 | button[disabled], 303 | html input[disabled] { 304 | cursor: default; 305 | } 306 | 307 | /** 308 | * Remove inner padding and border in Firefox 4+. 309 | */ 310 | 311 | button::-moz-focus-inner, 312 | input::-moz-focus-inner { 313 | border: 0; 314 | padding: 0; 315 | } 316 | 317 | /** 318 | * Address Firefox 4+ setting `line-height` on `input` using `!important` in 319 | * the UA stylesheet. 320 | */ 321 | 322 | input { 323 | line-height: normal; 324 | } 325 | 326 | /** 327 | * It's recommended that you don't attempt to style these elements. 328 | * Firefox's implementation doesn't respect box-sizing, padding, or width. 329 | * 330 | * 1. Address box sizing set to `content-box` in IE 8/9/10. 331 | * 2. Remove excess padding in IE 8/9/10. 332 | */ 333 | 334 | input[type="checkbox"], 335 | input[type="radio"] { 336 | box-sizing: border-box; /* 1 */ 337 | padding: 0; /* 2 */ 338 | } 339 | 340 | /** 341 | * Fix the cursor style for Chrome's increment/decrement buttons. For certain 342 | * `font-size` values of the `input`, it causes the cursor style of the 343 | * decrement button to change from `default` to `text`. 344 | */ 345 | 346 | input[type="number"]::-webkit-inner-spin-button, 347 | input[type="number"]::-webkit-outer-spin-button { 348 | height: auto; 349 | } 350 | 351 | /** 352 | * 1. Address `appearance` set to `searchfield` in Safari and Chrome. 353 | * 2. Address `box-sizing` set to `border-box` in Safari and Chrome 354 | * (include `-moz` to future-proof). 355 | */ 356 | 357 | input[type="search"] { 358 | -webkit-appearance: textfield; /* 1 */ 359 | -moz-box-sizing: content-box; 360 | -webkit-box-sizing: content-box; /* 2 */ 361 | box-sizing: content-box; 362 | } 363 | 364 | /** 365 | * Remove inner padding and search cancel button in Safari and Chrome on OS X. 366 | * Safari (but not Chrome) clips the cancel button when the search input has 367 | * padding (and `textfield` appearance). 368 | */ 369 | 370 | input[type="search"]::-webkit-search-cancel-button, 371 | input[type="search"]::-webkit-search-decoration { 372 | -webkit-appearance: none; 373 | } 374 | 375 | /** 376 | * Define consistent border, margin, and padding. 377 | */ 378 | 379 | fieldset { 380 | border: 1px solid #c0c0c0; 381 | margin: 0 2px; 382 | padding: 0.35em 0.625em 0.75em; 383 | } 384 | 385 | /** 386 | * 1. Correct `color` not being inherited in IE 8/9/10/11. 387 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 388 | */ 389 | 390 | legend { 391 | border: 0; /* 1 */ 392 | padding: 0; /* 2 */ 393 | } 394 | 395 | /** 396 | * Remove default vertical scrollbar in IE 8/9/10/11. 397 | */ 398 | 399 | textarea { 400 | overflow: auto; 401 | } 402 | 403 | /** 404 | * Don't inherit the `font-weight` (applied by a rule above). 405 | * NOTE: the default cannot safely be changed in Chrome and Safari on OS X. 
406 | */ 407 | 408 | optgroup { 409 | font-weight: bold; 410 | } 411 | 412 | /* Tables 413 | ========================================================================== */ 414 | 415 | /** 416 | * Remove most spacing between table cells. 417 | */ 418 | 419 | table { 420 | border-collapse: collapse; 421 | border-spacing: 0; 422 | } 423 | 424 | td, 425 | th { 426 | padding: 0; 427 | } -------------------------------------------------------------------------------- /docs/static/css/skeleton.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Skeleton V2.0.4 3 | * Copyright 2014, Dave Gamache 4 | * www.getskeleton.com 5 | * Free to use under the MIT license. 6 | * http://www.opensource.org/licenses/mit-license.php 7 | * 12/29/2014 8 | */ 9 | 10 | 11 | /* Table of contents 12 | –––––––––––––––––––––––––––––––––––––––––––––––––– 13 | - Grid 14 | - Base Styles 15 | - Typography 16 | - Links 17 | - Buttons 18 | - Forms 19 | - Lists 20 | - Code 21 | - Tables 22 | - Spacing 23 | - Utilities 24 | - Clearing 25 | - Media Queries 26 | */ 27 | 28 | 29 | /* Grid 30 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 31 | .container { 32 | position: relative; 33 | width: 100%; 34 | max-width: 960px; 35 | margin: 0 auto; 36 | padding: 0 20px; 37 | box-sizing: border-box; } 38 | .column, 39 | .columns { 40 | width: 100%; 41 | float: left; 42 | box-sizing: border-box; } 43 | 44 | /* For devices larger than 400px */ 45 | @media (min-width: 400px) { 46 | .container { 47 | width: 85%; 48 | padding: 0; } 49 | } 50 | 51 | /* For devices larger than 550px */ 52 | @media (min-width: 550px) { 53 | .container { 54 | width: 80%; } 55 | .column, 56 | .columns { 57 | margin-left: 4%; } 58 | .column:first-child, 59 | .columns:first-child { 60 | margin-left: 0; } 61 | 62 | .one.column, 63 | .one.columns { width: 4.66666666667%; } 64 | .two.columns { width: 13.3333333333%; } 65 | .three.columns { width: 22%; } 66 | .four.columns { width: 30.6666666667%; } 67 | .five.columns { width: 39.3333333333%; } 68 | .six.columns { width: 48%; } 69 | .seven.columns { width: 56.6666666667%; } 70 | .eight.columns { width: 65.3333333333%; } 71 | .nine.columns { width: 74.0%; } 72 | .ten.columns { width: 82.6666666667%; } 73 | .eleven.columns { width: 91.3333333333%; } 74 | .twelve.columns { width: 100%; margin-left: 0; } 75 | 76 | .one-third.column { width: 30.6666666667%; } 77 | .two-thirds.column { width: 65.3333333333%; } 78 | 79 | .one-half.column { width: 48%; } 80 | 81 | /* Offsets */ 82 | .offset-by-one.column, 83 | .offset-by-one.columns { margin-left: 8.66666666667%; } 84 | .offset-by-two.column, 85 | .offset-by-two.columns { margin-left: 17.3333333333%; } 86 | .offset-by-three.column, 87 | .offset-by-three.columns { margin-left: 26%; } 88 | .offset-by-four.column, 89 | .offset-by-four.columns { margin-left: 34.6666666667%; } 90 | .offset-by-five.column, 91 | .offset-by-five.columns { margin-left: 43.3333333333%; } 92 | .offset-by-six.column, 93 | .offset-by-six.columns { margin-left: 52%; } 94 | .offset-by-seven.column, 95 | .offset-by-seven.columns { margin-left: 60.6666666667%; } 96 | .offset-by-eight.column, 97 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 98 | .offset-by-nine.column, 99 | .offset-by-nine.columns { margin-left: 78.0%; } 100 | .offset-by-ten.column, 101 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 102 | .offset-by-eleven.column, 103 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 104 | 105 | .offset-by-one-third.column, 
106 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 107 | .offset-by-two-thirds.column, 108 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 109 | 110 | .offset-by-one-half.column, 111 | .offset-by-one-half.columns { margin-left: 52%; } 112 | 113 | } 114 | 115 | 116 | /* Base Styles 117 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 118 | /* NOTE 119 | html is set to 62.5% so that all the REM measurements throughout Skeleton 120 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 121 | html { 122 | font-size: 62.5%; } 123 | body { 124 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 125 | line-height: 1.6; 126 | font-weight: 400; 127 | font-family: "Raleway", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 128 | color: #222; } 129 | 130 | 131 | /* Typography 132 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 133 | h1, h2, h3, h4, h5, h6 { 134 | margin-top: 0; 135 | margin-bottom: 2rem; 136 | font-weight: 300; } 137 | h1 { font-size: 4.0rem; line-height: 1.2; letter-spacing: -.1rem;} 138 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; } 139 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; } 140 | h4 { font-size: 2.4rem; line-height: 1.35; letter-spacing: -.08rem; } 141 | h5 { font-size: 1.8rem; line-height: 1.5; letter-spacing: -.05rem; } 142 | h6 { font-size: 1.5rem; line-height: 1.6; letter-spacing: 0; } 143 | 144 | /* Larger than phablet */ 145 | @media (min-width: 550px) { 146 | h1 { font-size: 5.0rem; } 147 | h2 { font-size: 4.2rem; } 148 | h3 { font-size: 3.6rem; } 149 | h4 { font-size: 3.0rem; } 150 | h5 { font-size: 2.4rem; } 151 | h6 { font-size: 1.5rem; } 152 | } 153 | 154 | p { 155 | margin-top: 0; } 156 | 157 | 158 | /* Links 159 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 160 | a { 161 | color: #1EAEDB; } 162 | a:hover { 163 | color: #0FA0CE; } 164 | 165 | 166 | /* Buttons 167 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 168 | .button, 169 | button, 170 | input[type="submit"], 171 | input[type="reset"], 172 | input[type="button"] { 173 | display: inline-block; 174 | height: 38px; 175 | padding: 0 30px; 176 | color: #555; 177 | text-align: center; 178 | font-size: 11px; 179 | font-weight: 600; 180 | line-height: 38px; 181 | letter-spacing: .1rem; 182 | text-transform: uppercase; 183 | text-decoration: none; 184 | white-space: nowrap; 185 | background-color: transparent; 186 | border-radius: 4px; 187 | border: 1px solid #bbb; 188 | cursor: pointer; 189 | box-sizing: border-box; } 190 | .button:hover, 191 | button:hover, 192 | input[type="submit"]:hover, 193 | input[type="reset"]:hover, 194 | input[type="button"]:hover, 195 | .button:focus, 196 | button:focus, 197 | input[type="submit"]:focus, 198 | input[type="reset"]:focus, 199 | input[type="button"]:focus { 200 | color: #333; 201 | border-color: #888; 202 | outline: 0; } 203 | .button.button-primary, 204 | button.button-primary, 205 | input[type="submit"].button-primary, 206 | input[type="reset"].button-primary, 207 | input[type="button"].button-primary { 208 | color: #FFF; 209 | background-color: #33C3F0; 210 | border-color: #33C3F0; } 211 | .button.button-primary:hover, 212 | button.button-primary:hover, 213 | input[type="submit"].button-primary:hover, 214 | input[type="reset"].button-primary:hover, 215 | input[type="button"].button-primary:hover, 216 | .button.button-primary:focus, 217 | button.button-primary:focus, 218 | 
input[type="submit"].button-primary:focus, 219 | input[type="reset"].button-primary:focus, 220 | input[type="button"].button-primary:focus { 221 | color: #FFF; 222 | background-color: #1EAEDB; 223 | border-color: #1EAEDB; } 224 | 225 | 226 | /* Forms 227 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 228 | input[type="email"], 229 | input[type="number"], 230 | input[type="search"], 231 | input[type="text"], 232 | input[type="tel"], 233 | input[type="url"], 234 | input[type="password"], 235 | textarea, 236 | select { 237 | height: 38px; 238 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 239 | background-color: #fff; 240 | border: 1px solid #D1D1D1; 241 | border-radius: 4px; 242 | box-shadow: none; 243 | box-sizing: border-box; } 244 | /* Removes awkward default styles on some inputs for iOS */ 245 | input[type="email"], 246 | input[type="number"], 247 | input[type="search"], 248 | input[type="text"], 249 | input[type="tel"], 250 | input[type="url"], 251 | input[type="password"], 252 | textarea { 253 | -webkit-appearance: none; 254 | -moz-appearance: none; 255 | appearance: none; } 256 | textarea { 257 | min-height: 65px; 258 | padding-top: 6px; 259 | padding-bottom: 6px; } 260 | input[type="email"]:focus, 261 | input[type="number"]:focus, 262 | input[type="search"]:focus, 263 | input[type="text"]:focus, 264 | input[type="tel"]:focus, 265 | input[type="url"]:focus, 266 | input[type="password"]:focus, 267 | textarea:focus, 268 | select:focus { 269 | border: 1px solid #33C3F0; 270 | outline: 0; } 271 | label, 272 | legend { 273 | display: block; 274 | margin-bottom: .5rem; 275 | font-weight: 600; } 276 | fieldset { 277 | padding: 0; 278 | border-width: 0; } 279 | input[type="checkbox"], 280 | input[type="radio"] { 281 | display: inline; } 282 | label > .label-body { 283 | display: inline-block; 284 | margin-left: .5rem; 285 | font-weight: normal; } 286 | 287 | 288 | /* Lists 289 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 290 | ul { 291 | list-style: circle inside; } 292 | ol { 293 | list-style: decimal inside; } 294 | ol, ul { 295 | padding-left: 0; 296 | margin-top: 0; } 297 | ul ul, 298 | ul ol, 299 | ol ol, 300 | ol ul { 301 | margin: 1.5rem 0 1.5rem 3rem; 302 | font-size: 90%; } 303 | li { 304 | margin-bottom: 1rem; } 305 | 306 | 307 | /* Code 308 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 309 | code { 310 | padding: .2rem .5rem; 311 | margin: 0 .2rem; 312 | font-size: 90%; 313 | white-space: nowrap; 314 | background: #F1F1F1; 315 | border: 1px solid #E1E1E1; 316 | border-radius: 4px; } 317 | pre > code { 318 | display: block; 319 | padding: 1rem 1.5rem; 320 | white-space: pre; } 321 | 322 | 323 | /* Tables 324 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 325 | th, 326 | td { 327 | padding: 12px 15px; 328 | text-align: left; 329 | border-bottom: 1px solid #E1E1E1; } 330 | th:first-child, 331 | td:first-child { 332 | padding-left: 0; } 333 | th:last-child, 334 | td:last-child { 335 | padding-right: 0; } 336 | 337 | 338 | /* Spacing 339 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 340 | button, 341 | .button { 342 | margin-bottom: 1rem; } 343 | input, 344 | textarea, 345 | select, 346 | fieldset { 347 | margin-bottom: 1.5rem; } 348 | pre, 349 | blockquote, 350 | dl, 351 | figure, 352 | table, 353 | p, 354 | ul, 355 | ol, 356 | form { 357 | margin-bottom: 2.5rem; } 358 | 359 | 360 | /* Utilities 361 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 362 | .u-full-width { 363 | width: 100%; 364 | 
box-sizing: border-box; } 365 | .u-max-full-width { 366 | max-width: 100%; 367 | box-sizing: border-box; } 368 | .u-pull-right { 369 | float: right; } 370 | .u-pull-left { 371 | float: left; } 372 | 373 | 374 | /* Misc 375 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 376 | hr { 377 | margin-top: 3rem; 378 | margin-bottom: 3.5rem; 379 | border-width: 0; 380 | border-top: 1px solid #E1E1E1; } 381 | 382 | 383 | /* Clearing 384 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 385 | 386 | /* Self Clearing Goodness */ 387 | .container:after, 388 | .row:after, 389 | .u-cf { 390 | content: ""; 391 | display: table; 392 | clear: both; } 393 | 394 | 395 | /* Media Queries 396 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 397 | /* 398 | Note: The best way to structure the use of media queries is to create the queries 399 | near the relevant code. For example, if you wanted to change the styles for buttons 400 | on small devices, paste the mobile query code up in the buttons section and style it 401 | there. 402 | */ 403 | 404 | 405 | /* Larger than mobile */ 406 | @media (min-width: 400px) {} 407 | 408 | /* Larger than phablet (also point when grid becomes active) */ 409 | @media (min-width: 550px) {} 410 | 411 | /* Larger than tablet */ 412 | @media (min-width: 750px) {} 413 | 414 | /* Larger than desktop */ 415 | @media (min-width: 1000px) {} 416 | 417 | /* Larger than Desktop HD */ 418 | @media (min-width: 1200px) {} 419 | -------------------------------------------------------------------------------- /docs/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaiFengZeng/clari_wavenet_vocoder/c1c290237898f17f3006b6ecbd4bad3d61d631a8/docs/static/favicon.png -------------------------------------------------------------------------------- /docs/static/images/r9y9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaiFengZeng/clari_wavenet_vocoder/c1c290237898f17f3006b6ecbd4bad3d61d631a8/docs/static/images/r9y9.jpg -------------------------------------------------------------------------------- /dump_hparams_to_json.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Dump hyper parameters to json file. 4 | 5 | usage: dump_hparams_to_json.py [options] 6 | 7 | options: 8 | -h, --help Show help message. 9 | """ 10 | from docopt import docopt 11 | 12 | import sys 13 | import os 14 | from os.path import dirname, join, basename, splitext 15 | import json 16 | 17 | from hparams import hparams 18 | 19 | if __name__ == "__main__": 20 | args = docopt(__doc__) 21 | output_json_path = args[""] 22 | 23 | j = hparams.values() 24 | 25 | # for compat legacy 26 | for k in ["preset", "presets"]: 27 | if k in j: 28 | del j[k] 29 | 30 | with open(output_json_path, "w") as f: 31 | json.dump(j, f, indent=2) 32 | sys.exit(0) 33 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesis waveform for testset 4 | 5 | usage: evaluate.py [options] 6 | 7 | options: 8 | --data-root= Directory contains preprocessed features. 9 | --hparams= Hyper parameters [default: ]. 10 | --length= Steps to generate [default: 32000]. 11 | --speaker-id= Use specific speaker of data in case for multi-speaker datasets. 
12 | --initial-value= Initial value for the WaveNet decoder. 13 | --file-name-suffix= File name suffix [default: ]. 14 | --output-html Output html for blog post. 15 | --num-utterances=N> Generate N utterenaces per speaker [default: -1]. 16 | -h, --help Show help message. 17 | """ 18 | from docopt import docopt 19 | 20 | import sys 21 | import os 22 | from os.path import dirname, join, basename, splitext 23 | import torch 24 | from torch.autograd import Variable 25 | import numpy as np 26 | from nnmnkwii import preprocessing as P 27 | from keras.utils import np_utils 28 | from tqdm import tqdm 29 | import librosa 30 | 31 | 32 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 33 | 34 | import audio 35 | from hparams import hparams 36 | 37 | 38 | use_cuda = torch.cuda.is_available() 39 | 40 | 41 | if __name__ == "__main__": 42 | args = docopt(__doc__) 43 | print("Command line args:\n", args) 44 | data_root = args["--data-root"] 45 | if data_root is None: 46 | data_root = join(dirname(__file__), "data", "cmu_arctic") 47 | checkpoint_path = args[""] 48 | dst_dir = args[""] 49 | 50 | length = int(args["--length"]) 51 | # Note that speaker-id is used for filtering out unrelated-speaker from 52 | # multi-speaker dataset. 53 | speaker_id = args["--speaker-id"] 54 | speaker_id = int(speaker_id) if speaker_id is not None else None 55 | initial_value = args["--initial-value"] 56 | initial_value = None if initial_value is None else float(initial_value) 57 | file_name_suffix = args["--file-name-suffix"] 58 | output_html = args["--output-html"] 59 | num_utterances = int(args["--num-utterances"]) 60 | 61 | # Override hyper parameters 62 | hparams.parse(args["--hparams"]) 63 | assert hparams.name == "wavenet_vocoder" 64 | 65 | from train import build_model, get_data_loaders 66 | from synthesis import wavegen 67 | 68 | # Data 69 | # Use exactly same testset used in training script 70 | # disable shuffle for convenience 71 | test_data_loader = get_data_loaders(data_root, speaker_id, test_shuffle=False)["test"] 72 | test_dataset = test_data_loader.dataset 73 | 74 | # Model 75 | model = build_model() 76 | 77 | # Load checkpoint 78 | print("Load checkpoint from {}".format(checkpoint_path)) 79 | checkpoint = torch.load(checkpoint_path) 80 | model.load_state_dict(checkpoint["state_dict"]) 81 | checkpoint_name = splitext(basename(checkpoint_path))[0] 82 | 83 | os.makedirs(dst_dir, exist_ok=True) 84 | dst_dir_name = basename(os.path.normpath(dst_dir)) 85 | 86 | generated_utterances = {} 87 | for idx, (x, c, g) in enumerate(test_dataset): 88 | target_audio_path = test_dataset.X.collected_files[idx][0] 89 | if num_utterances > 0 and g is not None: 90 | try: 91 | generated_utterances[g] += 1 92 | if generated_utterances[g] > num_utterances: 93 | continue 94 | except KeyError: 95 | generated_utterances[g] = 1 96 | 97 | if output_html: 98 | def _tqdm(x): return x 99 | else: 100 | _tqdm = tqdm 101 | print("Target audio is {}".format(target_audio_path)) 102 | if c is not None: 103 | print("Local conditioned by {}".format(test_dataset.Mel.collected_files[idx][0])) 104 | if g is not None: 105 | print("Global conditioned by speaker id {}".format(g)) 106 | 107 | # Paths 108 | if g is None: 109 | dst_wav_path = join(dst_dir, "{}_{}{}_predicted.wav".format( 110 | idx, checkpoint_name, file_name_suffix)) 111 | target_wav_path = join(dst_dir, "{}_{}{}_target.wav".format( 112 | idx, checkpoint_name, file_name_suffix)) 113 | else: 114 | dst_wav_path = join(dst_dir, "speaker{}_{}_{}{}_predicted.wav".format( 115 
| g, idx, checkpoint_name, file_name_suffix)) 116 | target_wav_path = join(dst_dir, "speaker{}_{}_{}{}_target.wav".format( 117 | g, idx, checkpoint_name, file_name_suffix)) 118 | 119 | # Generate 120 | waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value, 121 | fast=True, tqdm=_tqdm) 122 | 123 | # save 124 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 125 | if is_mulaw_quantize(hparams.input_type): 126 | x = P.inv_mulaw_quantize(x, hparams.quantize_channels) 127 | elif is_mulaw(hparams.input_type): 128 | x = P.inv_mulaw(x, hparams.quantize_channels) 129 | librosa.output.write_wav(target_wav_path, x, sr=hparams.sample_rate) 130 | 131 | # log 132 | if output_html: 133 | print(""" 134 | 138 | """.format(hparams.name, dst_dir_name, basename(dst_wav_path))) 139 | 140 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 141 | sys.exit(0) 142 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # NOTE: If you want full control over the model architecture, please take a look 5 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 6 | 7 | # Default hyperparameters: 8 | hparams = tf.contrib.training.HParams( 9 | name="wavenet_vocoder", 10 | 11 | # Convenient model builder 12 | builder="wavenet", 13 | 14 | # Input type: 15 | # 1. raw [-1, 1] 16 | # 2. mulaw [-1, 1] 17 | # 3. mulaw-quantize [0, mu] 18 | # If input_type is raw or mulaw, the network assumes scalar input and 19 | # discretized mixture of logistic distributions output, otherwise one-hot 20 | # input and softmax output are assumed. 21 | # **NOTE**: if you change one of the two parameters below, you need to 22 | # re-run preprocessing before training. 23 | # **NOTE**: scalar input (raw or mulaw) is experimental. Use it at your own risk. 24 | input_type="raw", 25 | output_type="Gaussian",  # ['Gaussian', 'MOG', 'MOL', 'softmax'] 26 | quantize_channels=65536, # 65536 or 256 27 | 28 | # Audio: 29 | sample_rate=22050, 30 | # this is only valid when mulaw is True 31 | silence_threshold=2, 32 | num_mels=80, 33 | fmin=125, 34 | fmax=7600, 35 | fft_size=1024, 36 | # shift can be specified by either hop_size or frame_shift_ms 37 | hop_size=256, 38 | frame_shift_ms=None, 39 | min_level_db=-100, 40 | ref_level_db=20, 41 | # whether to rescale waveform or not. 42 | # Let x be an input waveform; the rescaled waveform y is given by: 43 | # y = x / np.abs(x).max() * rescaling_max 44 | rescaling=True, 45 | rescaling_max=0.999, 46 | # mel-spectrogram is normalized to [0, 1] for each utterance and clipping may 47 | # happen depending on min_level_db and ref_level_db, causing clipping noise.
48 | # If False, an assertion is added to ensure no clipping happens. 49 | allow_clipping_in_normalization=True, 50 | 51 | # Mixture of logistic distributions: 52 | log_scale_min=float(np.log(1e-14)), 53 | 54 | # Model: 55 | # This should be equal to `quantize_channels` if mu-law quantization is enabled, 56 | # otherwise num_mixture * 3 (pi, mean, log_scale) for mixture outputs, or 2 (mean, log_scale) for Gaussian output 57 | out_channels=2, 58 | use_skip=True, 59 | layers=24, 60 | stacks=4, 61 | residual_channels=512, 62 | gate_channels=512, # split into 2 groups internally for gated activation 63 | skip_out_channels=256, 64 | dropout=1 - 0.95, 65 | kernel_size=3, 66 | # If True, apply weight normalization, the same as in DeepVoice3 67 | weight_normalization=True, 68 | 69 | # Local conditioning (set negative value to disable) 70 | cin_channels=80, 71 | # If True, use transposed convolutions to upsample conditional features, 72 | # otherwise repeat features to adjust time resolution 73 | upsample_conditional_features=True, 74 | # np.prod(upsample_scales) should equal hop_size (here 4 * 4 * 4 * 4 == 256) 75 | upsample_scales=[4, 4, 4, 4], 76 | upsample_size=[[30,3],[40,3]], 77 | # Freq axis kernel size for upsampling network 78 | freq_axis_kernel_size=3, 79 | 80 | # Global conditioning (set negative value to disable) 81 | # currently limited to speaker embedding 82 | # this should only be enabled for multi-speaker datasets 83 | gin_channels=-1, # i.e., speaker embedding dim 84 | n_speakers=7, # 7 for CMU ARCTIC 85 | 86 | # Data loader 87 | pin_memory=True, 88 | num_workers=2, 89 | 90 | # train/test 91 | # test size can be specified as a portion or as a number of samples 92 | test_size=0.0441, # 50 for CMU ARCTIC single speaker 93 | test_num_samples=None, 94 | random_state=1234, 95 | 96 | # Loss 97 | 98 | # Training: 99 | batch_size=2, 100 | adam_beta1=0.9, 101 | adam_beta2=0.999, 102 | adam_eps=1e-8, 103 | initial_learning_rate=1e-3, 104 | # see lrschedule.py for available lr_schedule 105 | lr_schedule="noam_learning_rate_decay", 106 | lr_schedule_kwargs={}, # {"anneal_rate": 0.5, "anneal_interval": 50000}, 107 | nepochs=2000, 108 | weight_decay=0.0, 109 | clip_thresh=-1, 110 | # max time steps can be specified either as seconds or as steps 111 | # This is needed for those who don't have huge GPU memory...
112 | # if both are None, then full audio samples are used 113 | max_time_sec=None, 114 | max_time_steps=8000, 115 | # Hold moving averaged parameters and use them for evaluation 116 | exponential_moving_average=True, 117 | # averaged = decay * averaged + (1 - decay) * x 118 | ema_decay=0.9999, 119 | 120 | # Save 121 | # per-step intervals 122 | checkpoint_interval=10000, 123 | train_eval_interval=10000, 124 | # per-epoch interval 125 | test_eval_epoch_interval=5, 126 | save_optimizer_state=True, 127 | 128 | # Eval: 129 | 130 | # Student Model 131 | student_out_channels=2, 132 | student_layers=60, 133 | student_stacks=6, 134 | student_residual_channels=128, 135 | student_skip_channels=128, 136 | iaf_layer_sizes=[10, 10, 10, 10,10,10], 137 | student_gate_channels=128, 138 | use_scale=False, 139 | iaf_shift=False, 140 | share_condition_net=True 141 | ) 142 | 143 | 144 | def hparams_debug_string(): 145 | values = hparams.values() 146 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 147 | return 'Hyperparameters:\n' + '\n'.join(hp) 148 | -------------------------------------------------------------------------------- /librivox.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | from hparams import hparams 15 | 16 | 17 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 18 | executor = ProcessPoolExecutor(max_workers=num_workers) 19 | futures = [] 20 | index = 1 21 | 22 | #with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 23 | # for line in f: 24 | # parts = line.strip().split('|') 25 | # wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 26 | # text = parts[2] 27 | # futures.append(executor.submit( 28 | # partial(_process_utterance, out_dir, index, wav_path, text))) 29 | # index += 1 30 | 31 | valid_ext = '.ogg .wav .mp3'.split() 32 | for f in sorted(os.listdir(in_dir)): 33 | valid = sum([ f.endswith(ext) for ext in valid_ext ]) 34 | if valid<1: continue 35 | 36 | audio_filepath = os.path.join(in_dir, f) 37 | text = audio_filepath # Not very informative 38 | futures.append(executor.submit( 39 | partial(_process_utterance, out_dir, index, audio_filepath, text))) 40 | index += 1 41 | return [tup for future in tqdm(futures) for tup in future.result()] 42 | 43 | 44 | def _process_utterance(out_dir, index, audio_filepath, text): 45 | # Load the audio to a numpy array: 46 | wav_whole = audio.load_wav(audio_filepath) 47 | 48 | if hparams.rescaling: 49 | wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max 50 | 51 | # This is a librivox source, so the audio files are going to be v. 
long 52 | # compared to a typical 'utterance' : So split the wav into chunks 53 | 54 | tup_results = [] 55 | 56 | n_samples = int( 8.0 * hparams.sample_rate ) # All 8 second utterances 57 | n_chunks = wav_whole.shape[0] // n_samples 58 | 59 | for chunk_idx in range(n_chunks): 60 | chunk_start, chunk_end = chunk_idx*n_samples, (chunk_idx+1)*n_samples 61 | if chunk_idx == n_chunks-1: # This is the last chunk - allow it to extend to the end of the file 62 | chunk_end = None 63 | wav = wav_whole[ chunk_start : chunk_end ] 64 | 65 | # Mu-law quantize 66 | if is_mulaw_quantize(hparams.input_type): 67 | # [0, quantize_channels) 68 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 69 | 70 | # Trim silences 71 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 72 | wav = wav[start:end] 73 | out = out[start:end] 74 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 75 | out_dtype = np.int16 76 | elif is_mulaw(hparams.input_type): 77 | # [-1, 1] 78 | out = P.mulaw(wav, hparams.quantize_channels) 79 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 80 | out_dtype = np.float32 81 | else: 82 | # [-1, 1] 83 | out = wav 84 | constant_values = 0.0 85 | out_dtype = np.float32 86 | 87 | # Compute a mel-scale spectrogram from the trimmed wav: 88 | # (N, D) 89 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 90 | # lws pads zeros internally before performing stft 91 | # this is needed to adjust time resolution between audio and mel-spectrogram 92 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 93 | 94 | # zero pad for quantized signal 95 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 96 | N = mel_spectrogram.shape[0] 97 | assert len(out) >= N * audio.get_hop_size() 98 | 99 | # time resolution adjustment 100 | # ensure length of raw audio is multiple of hop_size so that we can use 101 | # transposed convolution to upsample 102 | out = out[:N * audio.get_hop_size()] 103 | assert len(out) % audio.get_hop_size() == 0 104 | 105 | timesteps = len(out) 106 | 107 | # Write the spectrograms to disk: 108 | audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,) 109 | mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,) 110 | text_idx = '%s - %05d' % (text, chunk_idx,) 111 | np.save(os.path.join(out_dir, audio_filename), 112 | out.astype(out_dtype), allow_pickle=False) 113 | np.save(os.path.join(out_dir, mel_filename), 114 | mel_spectrogram.astype(np.float32), allow_pickle=False) 115 | 116 | # Add results tuple describing this training example: 117 | tup_results.append( (audio_filename, mel_filename, timesteps, text_idx) ) 118 | 119 | # Return all the audio results tuples (unpack in caller) 120 | return tup_results 121 | 122 | -------------------------------------------------------------------------------- /ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | from hparams import hparams 15 | 16 | 17 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 18 | executor = ProcessPoolExecutor(max_workers=num_workers) 19 | futures = [] 20 | index = 1 21 | 
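    # (Added note, not part of the original file) LJSpeech's metadata.csv is assumed
    # to be pipe-separated with one utterance per line, roughly:
    #   LJ001-0001|raw transcript|normalized transcript
    # so parts[0] below is the wav file id and parts[2] is the transcript kept as `text`.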
with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 22 | for line in f: 23 | parts = line.strip().split('|') 24 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 25 | text = parts[2] 26 | futures.append(executor.submit( 27 | partial(_process_utterance, out_dir, index, wav_path, text))) 28 | index += 1 29 | return [future.result() for future in tqdm(futures)] 30 | 31 | 32 | def _process_utterance(out_dir, index, wav_path, text): 33 | # Load the audio to a numpy array: 34 | wav = audio.load_wav(wav_path) 35 | 36 | if hparams.rescaling: 37 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 38 | 39 | # Mu-law quantize 40 | if is_mulaw_quantize(hparams.input_type): 41 | # [0, quantize_channels) 42 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 43 | 44 | # Trim silences 45 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 46 | wav = wav[start:end] 47 | out = out[start:end] 48 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 49 | out_dtype = np.int16 50 | elif is_mulaw(hparams.input_type): 51 | # [-1, 1] 52 | out = P.mulaw(wav, hparams.quantize_channels) 53 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 54 | out_dtype = np.float32 55 | else: 56 | # [-1, 1] 57 | out = wav 58 | constant_values = 0.0 59 | out_dtype = np.float32 60 | 61 | # Compute a mel-scale spectrogram from the trimmed wav: 62 | # (N, D) 63 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 64 | # lws pads zeros internally before performing stft 65 | # this is needed to adjust time resolution between audio and mel-spectrogram 66 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 67 | 68 | # zero pad for quantized signal 69 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 70 | N = mel_spectrogram.shape[0] 71 | assert len(out) >= N * audio.get_hop_size() 72 | 73 | # time resolution adjustment 74 | # ensure length of raw audio is multiple of hop_size so that we can use 75 | # transposed convolution to upsample 76 | out = out[:N * audio.get_hop_size()] 77 | assert len(out) % audio.get_hop_size() == 0 78 | 79 | timesteps = len(out) 80 | 81 | # Write the spectrograms to disk: 82 | audio_filename = 'ljspeech-audio-%05d.npy' % index 83 | mel_filename = 'ljspeech-mel-%05d.npy' % index 84 | np.save(os.path.join(out_dir, audio_filename), 85 | out.astype(out_dtype), allow_pickle=False) 86 | np.save(os.path.join(out_dir, mel_filename), 87 | mel_spectrogram.astype(np.float32), allow_pickle=False) 88 | 89 | # Return a tuple describing this training example: 90 | return (audio_filename, mel_filename, timesteps, text) 91 | -------------------------------------------------------------------------------- /lrschedule.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329 5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 6 | # Noam scheme from tensor2tensor: 7 | warmup_steps = float(warmup_steps) 8 | step = global_step + 1. 
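    # (Added note) Worked example with the defaults used in this repo
    # (init_lr=1e-3, warmup_steps=4000): the two terms inside np.minimum below
    # intersect at step == warmup_steps, so the learning rate ramps up linearly
    # to init_lr over the first 4000 steps and then decays proportionally to step**-0.5.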
9 | lr = init_lr * warmup_steps**0.5 * np.minimum( 10 | step * warmup_steps**-1.5, step**-0.5) 11 | return lr 12 | 13 | 14 | def step_learning_rate_decay(init_lr, global_step, 15 | anneal_rate=0.98, 16 | anneal_interval=30000): 17 | return init_lr * anneal_rate ** (global_step // anneal_interval) 18 | 19 | 20 | def step_learning_rate(init_lr, global_step, 21 | gamma=0.5, 22 | step_size=100000): 23 | return init_lr * pow(gamma,int(global_step/step_size)) 24 | 25 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 26 | """Cyclic cosine annealing 27 | 28 | https://arxiv.org/pdf/1704.00109.pdf 29 | 30 | Args: 31 | init_lr (float): Initial learning rate 32 | global_step (int): Current iteration number 33 | T (int): Total iteration number (i,e. nepoch) 34 | M (int): Number of ensembles we want 35 | 36 | Returns: 37 | float: Annealed learning rate 38 | """ 39 | TdivM = T // M 40 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) 41 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Preprocess dataset 4 | 5 | usage: preprocess.py [options] 6 | 7 | options: 8 | --num_workers= Num workers. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | -h, --help Show help message. 12 | """ 13 | from docopt import docopt 14 | import os 15 | from multiprocessing import cpu_count 16 | from tqdm import tqdm 17 | import importlib 18 | from hparams import hparams 19 | 20 | 21 | def preprocess(mod, in_dir, out_root, num_workers): 22 | os.makedirs(out_dir, exist_ok=True) 23 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm) 24 | write_metadata(metadata, out_dir) 25 | 26 | 27 | def write_metadata(metadata, out_dir): 28 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 29 | for m in metadata: 30 | f.write('|'.join([str(x) for x in m]) + '\n') 31 | frames = sum([m[2] for m in metadata]) 32 | sr = hparams.sample_rate 33 | hours = frames / sr / 3600 34 | print('Wrote %d utterances, %d time steps (%.2f hours)' % (len(metadata), frames, hours)) 35 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 36 | print('Max output length: %d' % max(m[2] for m in metadata)) 37 | 38 | 39 | if __name__ == "__main__": 40 | # args = docopt(__doc__) 41 | name = 'ljspeech' # args[""] 42 | in_dir = '/home/jinqiangzeng/work/data/speech/ljspeech/LJSpeech-1.0' # args[""] 43 | out_dir = './data/{}'.format(name)#args[""] 44 | num_workers = None #args["--num_workers"] 45 | num_workers = cpu_count() if num_workers is None else int(num_workers) 46 | preset = '/home/jinqiangzeng/work/mypycharm/wavenet/wavenet_vocoder/presets/ljspeech_gaussian.json'#args["--preset"] 47 | 48 | # Load preset if specified 49 | if preset is not None: 50 | with open(preset) as f: 51 | hparams.parse_json(f.read()) 52 | # Override hyper parameters 53 | # hparams.parse(args["--hparams"]) 54 | assert hparams.name == "wavenet_vocoder" 55 | 56 | print("Sampling frequency: {}".format(hparams.sample_rate)) 57 | 58 | assert name in ["cmu_arctic", "ljspeech", "librivox", ] 59 | mod = importlib.import_module(name) 60 | preprocess(mod, in_dir, out_dir, num_workers) 61 | -------------------------------------------------------------------------------- /presets/cmu_arctic_8bit.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "mulaw-quantize", 5 | "quantize_channels": 256, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 256, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": false, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /presets/ljspeech_gaussian.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "clari", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "output_type": "Gaussian", 6 | "quantize_channels": 65536, 7 | "sample_rate": 22050, 8 | "silence_threshold": 2, 9 | "num_mels": 80, 10 | "fmin": 125, 11 | "fmax": 7600, 12 | "fft_size": 1024, 13 | "hop_size": 256, 14 | "frame_shift_ms": null, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": true, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "log_scale_min": -32.23619130191664, 21 | "out_channels": 2, 22 | "layers": 20, 23 | "stacks": 2, 24 | "residual_channels": 128, 25 | "gate_channels": 256, 26 | "skip_out_channels": 128, 27 | "dropout": 0.050000000000000044, 28 | "kernel_size": 3, 29 | "weight_normalization": true, 30 | "cin_channels": 80, 31 | "upsample_conditional_features": true, 32 | "upsample_scales": [ 33 | 4, 34 | 4, 35 | 4, 36 | 4 37 | ], 38 | "upsample_size": [ 39 | [ 40 | 30, 41 | 3 42 | ], 43 | [ 44 | 40, 45 | 3 46 | ] 47 | ], 48 | "freq_axis_kernel_size": 3, 49 | "gin_channels": -1, 50 | "n_speakers": 7, 51 | "pin_memory": true, 52 | "num_workers": 4, 53 | "test_size": 0.0441, 54 | "test_num_samples": null, 55 | "random_state": 1234, 56 | "batch_size": 4, 57 | "adam_beta1": 0.9, 58 | "adam_beta2": 0.999, 59 | "adam_eps": 1e-08, 60 | "initial_learning_rate": 0.001, 61 | "lr_schedule": "step_learning_rate", 62 | "lr_schedule_kwargs": {}, 63 | "nepochs": 2000, 64 | "weight_decay": 0.0, 65 | "clip_thresh": -1, 66 | "max_time_sec": null, 67 | "max_time_steps": 10000, 68 | "exponential_moving_average": true, 69 | "ema_decay": 0.9999, 70 | "checkpoint_interval": 3000, 71 | "train_eval_interval": 3000, 72 | "test_eval_epoch_interval": 5, 73 | 
"save_optimizer_state": true, 74 | "student_out_channels": 2, 75 | "student_layers": 60, 76 | "student_stacks": 6, 77 | "student_residual_channels": 128, 78 | "student_skip_channels": 128, 79 | "iaf_layer_sizes": [ 80 | 10, 81 | 10, 82 | 10, 83 | 10, 84 | 10, 85 | 10 86 | ], 87 | "student_gate_channels": 128, 88 | "use_scale": false, 89 | "iaf_shift": false, 90 | "use_skip":true, 91 | "share_condition_net":true 92 | } 93 | -------------------------------------------------------------------------------- /presets/ljspeech_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 22050, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 4, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 12, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.0002, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true, 63 | "student_out_channels": 2, 64 | "student_layers": 60, 65 | "student_stacks": 6, 66 | "student_residual_channels": 64, 67 | "iaf_layer_sizes": [10, 10, 10, 30], 68 | "student_gate_channels": 64, 69 | "use_scale": false 70 | } 71 | -------------------------------------------------------------------------------- /presets/multispeaker_cmu_arctic_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 
| 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": 16, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for Pypi release 4 | # 0. Make sure you are on git tag 5 | # 1. Run the script 6 | # 2. Upload sdist 7 | 8 | set -e 9 | 10 | script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd) 11 | cd $script_dir 12 | 13 | TAG=$(git describe --exact-match --tags HEAD) 14 | 15 | VERSION=${TAG/v/} 16 | 17 | WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py develop sdist 18 | echo "*** Ready to release! deepvoice3_pytorch $TAG ***" 19 | echo "Please run the following command manually:" 20 | echo WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py sdist upload 21 | echo "Please make sure that release verion is correct." 22 | cat wavenet_vocoder/version.py 23 | -------------------------------------------------------------------------------- /resyn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaiFengZeng/clari_wavenet_vocoder/c1c290237898f17f3006b6ecbd4bad3d61d631a8/resyn.wav -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import setuptools.command.develop 5 | import setuptools.command.build_py 6 | import os 7 | import subprocess 8 | 9 | version = '0.0.4' 10 | 11 | # Adapted from https://github.com/pytorch/pytorch 12 | cwd = os.path.dirname(os.path.abspath(__file__)) 13 | if os.getenv('WAVENET_VOCODER_BUILD_VERSION'): 14 | version = os.getenv('WAVENET_VOCODER_BUILD_VERSION') 15 | else: 16 | try: 17 | sha = subprocess.check_output( 18 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 19 | version += '+' + sha[:7] 20 | except subprocess.CalledProcessError: 21 | pass 22 | except IOError: # FileNotFoundError for python 3 23 | pass 24 | 25 | 26 | class build_py(setuptools.command.build_py.build_py): 27 | 28 | def run(self): 29 | self.create_version_file() 30 | setuptools.command.build_py.build_py.run(self) 31 | 32 | @staticmethod 33 | def create_version_file(): 34 | global version, cwd 35 | print('-- Building version ' + version) 36 | version_path = os.path.join(cwd, 'wavenet_vocoder', 'version.py') 37 | with open(version_path, 'w') as f: 38 | f.write("__version__ = '{}'\n".format(version)) 39 | 40 | 41 | class develop(setuptools.command.develop.develop): 42 | 43 | def run(self): 44 | build_py.create_version_file() 45 | setuptools.command.develop.develop.run(self) 46 | 47 | 48 | setup(name='wavenet_vocoder', 49 | version=version, 50 | 
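      # (Added note) With the extras_require section further below, the optional training
      # and test dependencies can be pulled in via standard setuptools extras syntax,
      # e.g. `pip install -e ".[train]"` or `pip install -e ".[test]"`; the exact
      # environment setup is left to the reader.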
description='PyTorch implementation of WaveNet vocoder', 51 | packages=find_packages(), 52 | cmdclass={ 53 | 'build_py': build_py, 54 | 'develop': develop, 55 | }, 56 | install_requires=[ 57 | "numpy", 58 | "scipy", 59 | "torch >= 0.3.0", 60 | "deepvoice3_pytorch >= 0.0.2", 61 | ], 62 | extras_require={ 63 | "train": [ 64 | "docopt", 65 | "tqdm", 66 | "tensorboardX", 67 | "nnmnkwii >= 0.0.11", 68 | "keras", 69 | "scikit-learn", 70 | ], 71 | "test": [ 72 | "nose", 73 | "pysptk >= 0.1.9", 74 | "librosa", 75 | "matplotlib", 76 | "tqdm", 77 | "nnmnkwii >= 0.0.11", 78 | ], 79 | }) 80 | -------------------------------------------------------------------------------- /synthesis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Synthesis waveform from trained WaveNet. 3 | 4 | usage: synthesis.py [options] 5 | 6 | options: 7 | --hparams= Hyper parameters [default: ]. 8 | --preset= Path of preset parameters (json). 9 | --length= Steps to generate [default: 32000]. 10 | --initial-value= Initial value for the WaveNet decoder. 11 | --conditional=

Conditional features path. 12 | --file-name-suffix= File name suffix [default: ]. 13 | --speaker-id= Speaker ID (for multi-speaker model). 14 | --output-html Output html for blog post. 15 | -h, --help Show help message. 16 | """ 17 | from docopt import docopt 18 | 19 | import sys 20 | import os 21 | from os.path import dirname, join, basename, splitext 22 | import torch 23 | from torch.autograd import Variable 24 | import numpy as np 25 | from nnmnkwii import preprocessing as P 26 | from keras.utils import np_utils 27 | from tqdm import tqdm 28 | import librosa 29 | 30 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 31 | 32 | import audio 33 | from hparams import hparams 34 | 35 | torch.set_num_threads(4) 36 | use_cuda = torch.cuda.is_available() 37 | 38 | 39 | def _to_numpy(x): 40 | # this is ugly 41 | if x is None: 42 | return None 43 | if isinstance(x, np.ndarray) or np.isscalar(x): 44 | return x 45 | # remove batch axis 46 | if x.dim() == 3: 47 | x = x.squeeze(0) 48 | return x.numpy() 49 | 50 | 51 | def wavegen(model, length=None, c=None, g=None, initial_value=None, 52 | fast=False, tqdm=tqdm): 53 | """Generate waveform samples by WaveNet. 54 | 55 | Args: 56 | model (nn.Module) : WaveNet decoder 57 | length (int): Time steps to generate. If conditinlal features are given, 58 | then this is determined by the feature size. 59 | c (numpy.ndarray): Conditional features, of shape T x C 60 | g (scaler): Speaker ID 61 | initial_value (int) : initial_value for the WaveNet decoder. 62 | fast (Bool): Whether to remove weight normalization or not. 63 | tqdm (lambda): tqdm 64 | 65 | Returns: 66 | numpy.ndarray : Generated waveform samples 67 | """ 68 | from train import sanity_check 69 | sanity_check(model, c, g) 70 | 71 | c = _to_numpy(c) 72 | g = _to_numpy(g) 73 | 74 | if use_cuda: 75 | model = model.cuda() 76 | model.eval() 77 | if fast: 78 | model.make_generation_fast_() 79 | 80 | if c is None: 81 | assert length is not None 82 | else: 83 | # (Tc, D) 84 | if c.ndim != 2: 85 | raise RuntimeError( 86 | "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape)) 87 | assert c.ndim == 2 88 | Tc = c.shape[0] 89 | upsample_factor = audio.get_hop_size() 90 | # Overwrite length according to feature size 91 | length = Tc * upsample_factor 92 | # (Tc, D) -> (Tc', D) 93 | # Repeat features before feeding it to the network 94 | if not hparams.upsample_conditional_features: 95 | c = np.repeat(c, upsample_factor, axis=0) 96 | 97 | # B x C x T 98 | c = Variable(torch.FloatTensor(c.T).unsqueeze(0)) 99 | 100 | if initial_value is None: 101 | if is_mulaw_quantize(hparams.input_type): 102 | initial_value = P.mulaw_quantize(0, hparams.quantize_channels) 103 | else: 104 | initial_value = 0.0 105 | 106 | if is_mulaw_quantize(hparams.input_type): 107 | assert 0 <= initial_value < hparams.quantize_channels 108 | initial_input = np_utils.to_categorical( 109 | initial_value, num_classes=hparams.quantize_channels).astype(np.float32) 110 | initial_input = Variable(torch.from_numpy(initial_input)).view( 111 | 1, 1, hparams.quantize_channels) 112 | else: 113 | initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value) 114 | 115 | g = None if g is None else Variable(torch.LongTensor([g])) 116 | if use_cuda: 117 | initial_input = initial_input.cuda() 118 | g = None if g is None else g.cuda() 119 | c = None if c is None else c.cuda() 120 | 121 | y_hat = model.incremental_forward( 122 | initial_input, c=c, g=g, T=length, 
tqdm=tqdm, softmax=True, quantize=True, 123 | log_scale_min=hparams.log_scale_min) 124 | 125 | if is_mulaw_quantize(hparams.input_type): 126 | y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() 127 | y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) 128 | elif is_mulaw(hparams.input_type): 129 | y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) 130 | else: 131 | y_hat = y_hat.view(-1).cpu().data.numpy() 132 | 133 | return y_hat 134 | 135 | 136 | if __name__ == "__main__": 137 | args = docopt(__doc__) 138 | print("Command line args:\n", args) 139 | checkpoint_path = args[""] 140 | dst_dir = args[""] 141 | 142 | length = int(args["--length"]) 143 | initial_value = args["--initial-value"] 144 | initial_value = None if initial_value is None else float(initial_value) 145 | conditional_path = args["--conditional"] 146 | file_name_suffix = args["--file-name-suffix"] 147 | output_html = args["--output-html"] 148 | speaker_id = args["--speaker-id"] 149 | speaker_id = None if speaker_id is None else int(speaker_id) 150 | preset = args["--preset"] 151 | 152 | # Load preset if specified 153 | if preset is not None: 154 | with open(preset) as f: 155 | hparams.parse_json(f.read()) 156 | # Override hyper parameters 157 | hparams.parse(args["--hparams"]) 158 | assert hparams.name == "wavenet_vocoder" 159 | 160 | # Load conditional features 161 | if conditional_path is not None: 162 | c = np.load(conditional_path) 163 | else: 164 | c = None 165 | 166 | from train import build_model 167 | 168 | # Model 169 | model = build_model() 170 | 171 | # Load checkpoint 172 | print("Load checkpoint from {}".format(checkpoint_path)) 173 | 174 | if use_cuda: 175 | checkpoint = torch.load(checkpoint_path) 176 | else: 177 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 178 | model.load_state_dict(checkpoint["state_dict"]) 179 | 180 | checkpoint_name = splitext(basename(checkpoint_path))[0] 181 | 182 | wav_id = conditional_path.split("/")[-1].split(".")[0].split("-")[-1] 183 | dataset_name = conditional_path.split("/")[-1].split(".")[0].split("-")[0] 184 | save_dir = join(dst_dir, checkpoint_name, dataset_name) 185 | os.makedirs(save_dir, exist_ok=True) 186 | 187 | dst_wav_path = join(save_dir, "{}{}.wav".format(wav_id, file_name_suffix)) 188 | 189 | # DO generate 190 | waveform = wavegen(model, length, c=c, g=speaker_id, initial_value=initial_value, fast=True) 191 | 192 | # save 193 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 194 | 195 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 196 | sys.exit(0) 197 | -------------------------------------------------------------------------------- /synthesis_student.py: -------------------------------------------------------------------------------- 1 | """ 2 | Synthesis waveform from trained WaveNet. 3 | 4 | usage: synthesis.py [options] 5 | 6 | options: 7 | --hparams= Hyper parameters [default: ]. 8 | --preset= Path of preset parameters (json). 9 | --length= Steps to generate [default: 32000]. 10 | --initial-value= Initial value for the WaveNet decoder. 11 | --conditional=

Conditional features path. 12 | --file-name-suffix= File name suffix [default: ]. 13 | --speaker-id= Speaker ID (for multi-speaker model). 14 | --output-html Output html for blog post. 15 | -h, --help Show help message. 16 | """ 17 | from docopt import docopt 18 | 19 | import sys 20 | import os 21 | from os.path import dirname, join, basename, splitext 22 | import torch 23 | from torch.autograd import Variable 24 | import numpy as np 25 | from nnmnkwii import preprocessing as P 26 | from keras.utils import np_utils 27 | from tqdm import tqdm 28 | import librosa 29 | 30 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 31 | 32 | import audio 33 | from hparams import hparams 34 | 35 | torch.set_num_threads(1) 36 | # use_cuda = torch.cuda.is_available() 37 | use_cuda = False 38 | 39 | 40 | def _to_numpy(x): 41 | # this is ugly 42 | if x is None: 43 | return None 44 | if isinstance(x, np.ndarray) or np.isscalar(x): 45 | return x 46 | # remove batch axis 47 | if x.dim() == 3: 48 | x = x.squeeze(0) 49 | return x.numpy() 50 | 51 | 52 | def wavegen(model, length=None, c=None, g=None, tqdm=tqdm): 53 | """Generate waveform samples by WaveNet. 54 | 55 | Args: 56 | model (nn.Module) : WaveNet decoder 57 | length (int): Time steps to generate. If conditinlal features are given, 58 | then this is determined by the feature size. 59 | c (numpy.ndarray): Conditional features, of shape T x C 60 | g (scaler): Speaker ID 61 | initial_value (int) : initial_value for the WaveNet decoder. 62 | fast (Bool): Whether to remove weight normalization or not. 63 | tqdm (lambda): tqdm 64 | 65 | Returns: 66 | numpy.ndarray : Generated waveform samples 67 | """ 68 | from train import sanity_check 69 | sanity_check(model, c, g) 70 | 71 | c = _to_numpy(c) 72 | g = _to_numpy(g) 73 | 74 | if use_cuda: 75 | model = model.cuda() 76 | model.eval() 77 | 78 | if c is None: 79 | assert length is not None 80 | else: 81 | # (Tc, D) 82 | if c.ndim != 2: 83 | raise RuntimeError( 84 | "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape)) 85 | assert c.ndim == 2 86 | Tc = c.shape[0] 87 | upsample_factor = audio.get_hop_size() 88 | # Overwrite length according to feature size 89 | length = Tc * upsample_factor 90 | # (Tc, D) -> (Tc', D) 91 | # Repeat features before feeding it to the network 92 | if not hparams.upsample_conditional_features: 93 | c = np.repeat(c, upsample_factor, axis=0) 94 | 95 | # B x C x T 96 | c = Variable(torch.FloatTensor(c.T).unsqueeze(0)) 97 | 98 | # if initial_value is None: 99 | # if is_mulaw_quantize(hparams.input_type): 100 | # initial_value = P.mulaw_quantize(0, hparams.quantize_channels) 101 | # else: 102 | # initial_value = 0.0 103 | # 104 | # if is_mulaw_quantize(hparams.input_type): 105 | # assert 0 <= initial_value < hparams.quantize_channels 106 | # initial_input = np_utils.to_categorical( 107 | # initial_value, num_classes=hparams.quantize_channels).astype(np.float32) 108 | # initial_input = Variable(torch.from_numpy(initial_input)).view( 109 | # 1, 1, hparams.quantize_channels) 110 | # else: 111 | # initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value) 112 | # 113 | # g = None if g is None else Variable(torch.LongTensor([g])) 114 | # if use_cuda: 115 | # initial_input = initial_input.cuda() 116 | # g = None if g is None else g.cuda() 117 | # c = None if c is None else c.cuda() 118 | 119 | # y_hat = model.incremental_forward( 120 | # initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, 
quantize=True, 121 | # log_scale_min=hparams.log_scale_min) 122 | 123 | with torch.no_grad(): 124 | y_student, _, _ = model(None, c, g, softmax=False, use_cuda=use_cuda) 125 | y_student = y_student.view(-1).cpu().data.numpy() 126 | return y_student 127 | 128 | # if is_mulaw_quantize(hparams.input_type): 129 | # y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() 130 | # y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) 131 | # elif is_mulaw(hparams.input_type): 132 | # y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) 133 | # else: 134 | # y_hat = y_hat.view(-1).cpu().data.numpy() 135 | 136 | 137 | if __name__ == "__main__": 138 | args = docopt(__doc__) 139 | print("Command line args:\n", args) 140 | checkpoint_path = args[""] 141 | dst_dir = args[""] 142 | 143 | length = int(args["--length"]) 144 | initial_value = args["--initial-value"] 145 | initial_value = None if initial_value is None else float(initial_value) 146 | conditional_path = args["--conditional"] 147 | file_name_suffix = args["--file-name-suffix"] 148 | output_html = args["--output-html"] 149 | speaker_id = args["--speaker-id"] 150 | speaker_id = None if speaker_id is None else int(speaker_id) 151 | preset = args["--preset"] 152 | 153 | # Load preset if specified 154 | if preset is not None: 155 | with open(preset) as f: 156 | hparams.parse_json(f.read()) 157 | # Override hyper parameters 158 | hparams.parse(args["--hparams"]) 159 | assert hparams.name == "wavenet_vocoder" 160 | 161 | # Load conditional features 162 | if conditional_path is not None: 163 | c = np.load(conditional_path) 164 | else: 165 | c = None 166 | 167 | from train_student import build_model 168 | 169 | # Model 170 | model = build_model(name="student") 171 | # Load checkpoint 172 | print("Load checkpoint from {}".format(checkpoint_path)) 173 | 174 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 175 | model.load_state_dict(checkpoint["state_dict"]) 176 | 177 | checkpoint_name = splitext(basename(checkpoint_path))[0] 178 | 179 | wav_id = conditional_path.split("/")[-1].split(".")[0].split("-")[-1] 180 | dataset_name = conditional_path.split("/")[-1].split(".")[0].split("-")[0] 181 | save_dir = join(dst_dir, checkpoint_name + "_student", dataset_name) 182 | os.makedirs(save_dir, exist_ok=True) 183 | 184 | dst_wav_path = join(save_dir, "{}{}.wav".format(wav_id, file_name_suffix)) 185 | 186 | # DO generate 187 | waveform = wavegen(model, length, c=c, g=speaker_id) 188 | 189 | # save 190 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 191 | 192 | print("Finished! 
Check out {} for generated audio samples.".format(dst_dir)) 193 | sys.exit(0) 194 | -------------------------------------------------------------------------------- /tests/test_audio.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join 6 | sys.path.insert(0, join(dirname(__file__), "..")) 7 | 8 | import numpy as np 9 | from nose.plugins.attrib import attr 10 | 11 | import logging 12 | logging.getLogger('tensorflow').disabled = True 13 | 14 | 15 | @attr("local_only") 16 | def test_amp_to_db(): 17 | import audio 18 | x = np.random.rand(10) 19 | x_hat = audio._db_to_amp(audio._amp_to_db(x)) 20 | assert np.allclose(x, x_hat) 21 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | from wavenet_vocoder import receptive_field_size 5 | 6 | 7 | def test_receptive_field_size(): 8 | # Table 4 in https://arxiv.org/abs/1711.10433 9 | assert receptive_field_size(total_layers=30, num_cycles=3, kernel_size=3) == 6139 10 | assert receptive_field_size(total_layers=24, num_cycles=4, kernel_size=3) == 505 11 | assert receptive_field_size(total_layers=12, num_cycles=2, kernel_size=3) == 253 12 | assert receptive_field_size(total_layers=30, num_cycles=1, 13 | kernel_size=3, dilation=lambda x: 1) == 61 14 | -------------------------------------------------------------------------------- /tests/test_mixture.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch.autograd import Variable 8 | from torch.nn import functional as F 9 | 10 | import librosa 11 | import pysptk 12 | 13 | from wavenet_vocoder.mixture import discretized_mix_logistic_loss 14 | from wavenet_vocoder.mixture import sample_from_discretized_mix_logistic 15 | 16 | 17 | def log_prob_from_logits(x): 18 | """ numerically stable log_softmax implementation that prevents overflow """ 19 | # TF ordering 20 | axis = len(x.size()) - 1 21 | m, _ = torch.max(x, dim=-1, keepdim=True) 22 | return x - m - torch.log(torch.sum(torch.exp(x - m), dim=axis, keepdim=True)) 23 | 24 | 25 | def test_log_softmax(): 26 | x = Variable(torch.rand(2, 16000, 30)) 27 | y = log_prob_from_logits(x) 28 | y_hat = F.log_softmax(x, -1) 29 | 30 | y = y.data.cpu().numpy() 31 | y_hat = y_hat.data.cpu().numpy() 32 | assert np.allclose(y, y_hat) 33 | 34 | 35 | def test_mixture(): 36 | np.random.seed(1234) 37 | 38 | x, sr = librosa.load(pysptk.util.example_audio_file(), sr=None) 39 | assert sr == 16000 40 | 41 | T = len(x) 42 | x = x.reshape(1, T, 1) 43 | y = Variable(torch.from_numpy(x)).float() 44 | y_hat = Variable(torch.rand(1, 30, T)).float() 45 | 46 | print(y.shape, y_hat.shape) 47 | 48 | loss = discretized_mix_logistic_loss(y_hat, y) 49 | print(loss) 50 | 51 | loss = discretized_mix_logistic_loss(y_hat, y, reduce=False) 52 | print(loss.size(), y.size()) 53 | assert loss.size() == y.size() 54 | 55 | y = sample_from_discretized_mix_logistic(y_hat) 56 | print(y.shape) 57 | 58 | 59 | def test_misc(): 60 | # https://en.wikipedia.org/wiki/Logistic_distribution 61 | # what 
i have learned 62 | # m = (x - mu) / s 63 | m = Variable(torch.rand(10, 10)) 64 | log_pdf_mid1 = -2 * torch.log(torch.exp(m / 2) + torch.exp(-m / 2)) 65 | log_pdf_mid2 = m - 2 * F.softplus(m) 66 | assert np.allclose(log_pdf_mid1.data.numpy(), log_pdf_mid2.data.numpy()) 67 | 68 | # Edge case for 0 69 | plus_in = Variable(torch.rand(10, 10)) 70 | log_cdf_plus1 = F.sigmoid(m).log() 71 | log_cdf_plus2 = m - F.softplus(m) 72 | assert np.allclose(log_cdf_plus1.data.numpy(), log_cdf_plus2.data.numpy()) 73 | 74 | # Edge case for 255 75 | min_in = Variable(torch.rand(10, 10)) 76 | log_one_minus_cdf_min1 = (1 - F.sigmoid(min_in)).log() 77 | log_one_minus_cdf_min2 = -F.softplus(min_in) 78 | assert np.allclose(log_one_minus_cdf_min1.data.numpy(), log_one_minus_cdf_min2.data.numpy()) 79 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | from torch.nn import functional as F 8 | from nnmnkwii import preprocessing as P 9 | from pysptk.util import example_audio_file 10 | import librosa 11 | import numpy as np 12 | from tqdm import tqdm 13 | from os.path import join, dirname, exists 14 | from functools import partial 15 | from nose.plugins.attrib import attr 16 | 17 | from wavenet_vocoder.modules import ResidualConv1dGLU 18 | from wavenet_vocoder import WaveNet 19 | 20 | use_cuda = False 21 | 22 | # For test 23 | build_compact_model = partial(WaveNet, layers=4, stacks=2, residual_channels=32, 24 | gate_channels=32, skip_out_channels=32, 25 | scalar_input=False) 26 | 27 | # https://github.com/keras-team/keras/blob/master/keras/utils/np_utils.py 28 | # copied to avoid keras dependency in tests 29 | 30 | 31 | def to_categorical(y, num_classes=None): 32 | """Converts a class vector (integers) to binary class matrix. 33 | E.g. for use with categorical_crossentropy. 34 | # Arguments 35 | y: class vector to be converted into a matrix 36 | (integers from 0 to num_classes). 37 | num_classes: total number of classes. 38 | # Returns 39 | A binary matrix representation of the input. 
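    # Example
        For illustration (hypothetical values):
        >>> to_categorical([0, 2, 1], num_classes=3)
        array([[1., 0., 0.],
               [0., 0., 1.],
               [0., 1., 0.]])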
40 | """ 41 | y = np.array(y, dtype='int') 42 | input_shape = y.shape 43 | if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: 44 | input_shape = tuple(input_shape[:-1]) 45 | y = y.ravel() 46 | if not num_classes: 47 | num_classes = np.max(y) + 1 48 | n = y.shape[0] 49 | categorical = np.zeros((n, num_classes)) 50 | categorical[np.arange(n), y] = 1 51 | output_shape = input_shape + (num_classes,) 52 | categorical = np.reshape(categorical, output_shape) 53 | return categorical 54 | 55 | 56 | def test_conv_block(): 57 | conv = ResidualConv1dGLU(30, 30, kernel_size=3, dropout=1 - 0.95) 58 | print(conv) 59 | x = Variable(torch.zeros(16, 30, 16000)) 60 | y, h = conv(x) 61 | print(y.size(), h.size()) 62 | 63 | 64 | def test_wavenet(): 65 | model = build_compact_model() 66 | print(model) 67 | x = Variable(torch.zeros(16, 256, 1000)) 68 | y = model(x) 69 | print(y.size()) 70 | 71 | 72 | def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True): 73 | x, _ = librosa.load(example_audio_file(), sr=sr) 74 | x, _ = librosa.effects.trim(x, top_db=15) 75 | 76 | # To save computational cost 77 | x = x[:N] 78 | 79 | # For power conditioning wavenet 80 | if returns_power: 81 | # (1 x N') 82 | p = librosa.feature.rmse(x, frame_length=256, hop_length=128) 83 | upsample_factor = x.size // p.size 84 | # (1 x N) 85 | p = np.repeat(p, upsample_factor, axis=-1) 86 | if p.size < x.size: 87 | # pad against time axis 88 | p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant", constant_values=0) 89 | 90 | # shape adajst 91 | p = p.reshape(1, 1, -1) 92 | 93 | # (T,) 94 | if mulaw: 95 | x = P.mulaw_quantize(x) 96 | x_org = P.inv_mulaw_quantize(x) 97 | # (C, T) 98 | x = to_categorical(x, num_classes=256).T 99 | # (1, C, T) 100 | x = x.reshape(1, 256, -1).astype(np.float32) 101 | else: 102 | x_org = x 103 | x = x.reshape(1, 1, -1) 104 | 105 | if returns_power: 106 | return x, x_org, p 107 | 108 | return x, x_org 109 | 110 | 111 | @attr("mixture") 112 | def test_mixture_wavenet(): 113 | x, x_org, c = _test_data(returns_power=True, mulaw=False) 114 | # 10 mixtures 115 | model = build_compact_model(out_channels=3 * 10, cin_channels=1, 116 | scalar_input=True) 117 | T = x.shape[-1] 118 | print(model.first_conv) 119 | 120 | # scalar input, not one-hot 121 | assert x.shape[1] == 1 122 | 123 | x = Variable(torch.from_numpy(x).contiguous()) 124 | x = x.cuda() if use_cuda else x 125 | 126 | c = Variable(torch.from_numpy(c).contiguous()) 127 | c = c.cuda() if use_cuda else c 128 | print(c.size()) 129 | 130 | model.eval() 131 | 132 | # Incremental forward with forced teaching 133 | y_online = model.incremental_forward( 134 | test_inputs=x, c=c, T=None, tqdm=tqdm) 135 | 136 | assert y_online.size() == x.size() 137 | 138 | y_online2 = model.incremental_forward( 139 | test_inputs=None, c=c, T=T, tqdm=tqdm) 140 | 141 | assert y_online2.size() == x.size() 142 | print(x.size()) 143 | 144 | 145 | @attr("local_conditioning") 146 | def test_local_conditioning_correctness(): 147 | # condition by power 148 | x, x_org, c = _test_data(returns_power=True) 149 | model = build_compact_model(cin_channels=1) 150 | assert model.local_conditioning_enabled() 151 | assert not model.has_speaker_embedding() 152 | 153 | x = Variable(torch.from_numpy(x).contiguous()) 154 | x = x.cuda() if use_cuda else x 155 | 156 | c = Variable(torch.from_numpy(c).contiguous()) 157 | c = c.cuda() if use_cuda else c 158 | print(x.size(), c.size()) 159 | 160 | model.eval() 161 | 162 | y_offline = model(x, c=c, softmax=True) 163 | 164 | # 
Incremental forward with forced teaching 165 | y_online = model.incremental_forward( 166 | test_inputs=x, c=c, T=None, tqdm=tqdm, softmax=True, quantize=False) 167 | 168 | # (1 x C x T) 169 | c = (y_offline - y_online).abs() 170 | print(c.mean(), c.max()) 171 | 172 | try: 173 | assert np.allclose(y_offline.cpu().data.numpy(), 174 | y_online.cpu().data.numpy(), atol=1e-4) 175 | except: 176 | from warnings import warn 177 | warn("oops! must be a bug!") 178 | 179 | 180 | @attr("local_conditioning") 181 | def test_local_conditioning_upsample_correctness(): 182 | # condition by power 183 | x, x_org, c = _test_data(returns_power=True) 184 | 185 | # downsample by 4 186 | assert c.shape[-1] % 4 == 0 187 | c = c[:, :, 0::4] 188 | 189 | model = build_compact_model( 190 | cin_channels=1, upsample_conditional_features=True, 191 | upsample_scales=[2, 2]) 192 | assert model.local_conditioning_enabled() 193 | assert not model.has_speaker_embedding() 194 | 195 | x = Variable(torch.from_numpy(x).contiguous()) 196 | x = x.cuda() if use_cuda else x 197 | 198 | c = Variable(torch.from_numpy(c).contiguous()) 199 | c = c.cuda() if use_cuda else c 200 | print(x.size(), c.size()) 201 | 202 | model.eval() 203 | 204 | y_offline = model(x, c=c, softmax=True) 205 | 206 | # Incremental forward with forced teaching 207 | y_online = model.incremental_forward( 208 | test_inputs=x, c=c, T=None, tqdm=tqdm, softmax=True, quantize=False) 209 | 210 | # (1 x C x T) 211 | c = (y_offline - y_online).abs() 212 | print(c.mean(), c.max()) 213 | 214 | try: 215 | assert np.allclose(y_offline.cpu().data.numpy(), 216 | y_online.cpu().data.numpy(), atol=1e-4) 217 | except: 218 | from warnings import warn 219 | warn("oops! must be a bug!") 220 | 221 | 222 | @attr("global_conditioning") 223 | def test_global_conditioning_with_embedding_correctness(): 224 | # condition by mean power 225 | x, x_org, c = _test_data(returns_power=True) 226 | g = c.mean(axis=-1, keepdims=True).astype(np.int) 227 | model = build_compact_model(gin_channels=16, n_speakers=256, 228 | use_speaker_embedding=True) 229 | assert not model.local_conditioning_enabled() 230 | assert model.has_speaker_embedding() 231 | 232 | x = Variable(torch.from_numpy(x).contiguous()) 233 | x = x.cuda() if use_cuda else x 234 | 235 | g = Variable(torch.from_numpy(g).contiguous()) 236 | g = g.cuda() if use_cuda else g 237 | print(g.size()) 238 | 239 | model.eval() 240 | 241 | y_offline = model(x, g=g, softmax=True) 242 | 243 | # Incremental forward with forced teaching 244 | y_online = model.incremental_forward( 245 | test_inputs=x, g=g, T=None, tqdm=tqdm, softmax=True, quantize=False) 246 | 247 | # (1 x C x T) 248 | c = (y_offline - y_online).abs() 249 | print(c.mean(), c.max()) 250 | 251 | try: 252 | assert np.allclose(y_offline.cpu().data.numpy(), 253 | y_online.cpu().data.numpy(), atol=1e-4) 254 | except: 255 | from warnings import warn 256 | warn("oops! 
must be a bug!") 257 | 258 | 259 | @attr("global_conditioning") 260 | def test_global_conditioning_correctness(): 261 | # condition by mean power 262 | x, x_org, c = _test_data(returns_power=True) 263 | # must be floating-point type 264 | g = c.mean(axis=-1, keepdims=True).astype(np.float32) 265 | model = build_compact_model(gin_channels=1, use_speaker_embedding=False) 266 | assert not model.local_conditioning_enabled() 267 | # `use_speaker_embedding` False should diable embedding layer 268 | assert not model.has_speaker_embedding() 269 | 270 | x = Variable(torch.from_numpy(x).contiguous()) 271 | x = x.cuda() if use_cuda else x 272 | 273 | g = Variable(torch.from_numpy(g).contiguous()) 274 | g = g.cuda() if use_cuda else g 275 | print(g.size()) 276 | 277 | model.eval() 278 | y_offline = model(x, g=g, softmax=True) 279 | 280 | # Incremental forward with forced teaching 281 | y_online = model.incremental_forward( 282 | test_inputs=x, g=g, T=None, tqdm=tqdm, softmax=True, quantize=False) 283 | 284 | # (1 x C x T) 285 | c = (y_offline - y_online).abs() 286 | print(c.mean(), c.max()) 287 | 288 | try: 289 | assert np.allclose(y_offline.cpu().data.numpy(), 290 | y_online.cpu().data.numpy(), atol=1e-4) 291 | except: 292 | from warnings import warn 293 | warn("oops! must be a bug!") 294 | 295 | 296 | @attr("local_and_global_conditioning") 297 | def test_global_and_local_conditioning_correctness(): 298 | x, x_org, c = _test_data(returns_power=True) 299 | g = c.mean(axis=-1, keepdims=True).astype(np.int) 300 | model = build_compact_model(cin_channels=1, gin_channels=16, n_speakers=256) 301 | assert model.local_conditioning_enabled() 302 | assert model.has_speaker_embedding() 303 | 304 | x = Variable(torch.from_numpy(x).contiguous()) 305 | x = x.cuda() if use_cuda else x 306 | 307 | # per-sample power 308 | c = Variable(torch.from_numpy(c).contiguous()) 309 | c = c.cuda() if use_cuda else c 310 | 311 | # mean power 312 | g = Variable(torch.from_numpy(g).contiguous()) 313 | g = g.cuda() if use_cuda else g 314 | 315 | print(c.size(), g.size()) 316 | 317 | model.eval() 318 | 319 | y_offline = model(x, c=c, g=g, softmax=True) 320 | 321 | # Incremental forward with forced teaching 322 | y_online = model.incremental_forward( 323 | test_inputs=x, c=c, g=g, T=None, tqdm=tqdm, softmax=True, quantize=False) 324 | # (1 x C x T) 325 | 326 | c = (y_offline - y_online).abs() 327 | print(c.mean(), c.max()) 328 | 329 | try: 330 | assert np.allclose(y_offline.cpu().data.numpy(), 331 | y_online.cpu().data.numpy(), atol=1e-4) 332 | except: 333 | from warnings import warn 334 | warn("oops! 
must be a bug!") 335 | 336 | 337 | @attr("local_only") 338 | def test_incremental_forward_correctness(): 339 | import librosa.display 340 | from matplotlib import pyplot as plt 341 | 342 | model = build_compact_model() 343 | 344 | checkpoint_path = join(dirname(__file__), "..", "foobar/checkpoint_step000058000.pth") 345 | if exists(checkpoint_path): 346 | print("Loading from:", checkpoint_path) 347 | checkpoint = torch.load(checkpoint_path) 348 | model.load_state_dict(checkpoint["state_dict"]) 349 | 350 | if use_cuda: 351 | model = model.cuda() 352 | 353 | sr = 4000 354 | x, x_org = _test_data(sr=sr, N=3000) 355 | x = Variable(torch.from_numpy(x).contiguous()) 356 | x = x.cuda() if use_cuda else x 357 | 358 | model.eval() 359 | 360 | # Batch forward 361 | y_offline = model(x, softmax=True) 362 | 363 | # Test from zero start 364 | y_online = model.incremental_forward(initial_input=None, T=100, tqdm=tqdm, softmax=True) 365 | 366 | # Incremental forward with forced teaching 367 | y_online = model.incremental_forward(test_inputs=x, tqdm=tqdm, softmax=True, quantize=False) 368 | 369 | # (1 x C x T) 370 | c = (y_offline - y_online).abs() 371 | print(c.mean(), c.max()) 372 | 373 | try: 374 | assert np.allclose(y_offline.cpu().data.numpy(), 375 | y_online.cpu().data.numpy(), atol=1e-4) 376 | except: 377 | from warnings import warn 378 | warn("oops! must be a bug!") 379 | 380 | # (1, T, C) 381 | xt = x.transpose(1, 2).contiguous() 382 | 383 | initial_input = xt[:, 0, :].unsqueeze(1).contiguous() 384 | print(initial_input.size()) 385 | print("Inital value:", initial_input.view(-1).max(0)[1]) 386 | 387 | # With zero start 388 | zerostart = True 389 | if zerostart: 390 | y_inference = model.incremental_forward( 391 | initial_input=initial_input, T=xt.size(1), tqdm=tqdm, softmax=True, quantize=True) 392 | else: 393 | # Feed a few samples as test_inputs and then generate auto-regressively 394 | N = 1000 395 | y_inference = model.incremental_forward( 396 | initial_input=None, test_inputs=xt[:, :N, :], 397 | T=xt.size(1), tqdm=tqdm, softmax=True, quantize=True) 398 | 399 | # Waveforms 400 | # (T,) 401 | y_offline = y_offline.max(1)[1].view(-1) 402 | y_online = y_online.max(1)[1].view(-1) 403 | y_inference = y_inference.max(1)[1].view(-1) 404 | 405 | y_offline = P.inv_mulaw_quantize(y_offline.cpu().data.long().numpy()) 406 | y_online = P.inv_mulaw_quantize(y_online.cpu().data.long().numpy()) 407 | y_inference = P.inv_mulaw_quantize(y_inference.cpu().data.long().numpy()) 408 | 409 | plt.figure(figsize=(16, 10)) 410 | plt.subplot(4, 1, 1) 411 | librosa.display.waveplot(x_org, sr=sr) 412 | plt.subplot(4, 1, 2) 413 | librosa.display.waveplot(y_offline, sr=sr) 414 | plt.subplot(4, 1, 3) 415 | librosa.display.waveplot(y_online, sr=sr) 416 | plt.subplot(4, 1, 4) 417 | librosa.display.waveplot(y_inference, sr=sr) 418 | plt.show() 419 | 420 | save_audio = False 421 | if save_audio: 422 | librosa.output.write_wav("target.wav", x_org, sr=sr) 423 | librosa.output.write_wav("online.wav", y_online, sr=sr) 424 | librosa.output.write_wav("inference.wav", y_inference, sr=sr) 425 | -------------------------------------------------------------------------------- /tests/test_upsample.py: -------------------------------------------------------------------------------- 1 | from wavenet_vocoder.upsample import UpSampleConv,ClariUpsampleConv 2 | from train import get_data_loaders 3 | from train import eval_model,load_checkpoint,build_model 4 | 5 | def test_upsample(): 6 | data_loaders = get_data_loaders('../data/ljspeech',-1) 7 | 
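    # UpSampleConv stretches the conditioning features along time by
    # prod(hparams.upsample_scales), i.e. the hop size; the loop below just
    # pushes one batch through it as a smoke test, without asserting on the result.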
for phase, data_loader in data_loaders.items(): 8 | train = (phase == "train") 9 | running_loss = 0. 10 | test_evaluated = False 11 | for step, (x, y, c, g, input_lengths) in enumerate(data_loader): 12 | c = c.unsqueeze(1) 13 | upconv1 = UpSampleConv() 14 | c1 = upconv1(c) 15 | break 16 | 17 | 18 | 19 | def test_sample(): 20 | preste = '../presets/ljspeech_gaussian.json' 21 | model = build_model() -------------------------------------------------------------------------------- /wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | from .wavenet import receptive_field_size, WaveNet 6 | from .student_wavenet import StudentWaveNet 7 | from .clari_wavenet import ClariWaveNet -------------------------------------------------------------------------------- /wavenet_vocoder/builder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def wavenet(out_channels=256, 6 | layers=20, 7 | stacks=2, 8 | residual_channels=512, 9 | gate_channels=512, 10 | skip_out_channels=512, 11 | cin_channels=-1, 12 | gin_channels=-1, 13 | weight_normalization=True, 14 | dropout=1 - 0.95, 15 | kernel_size=3, 16 | n_speakers=None, 17 | upsample_conditional_features=False, 18 | upsample_scales=[16, 16], 19 | freq_axis_kernel_size=3, 20 | scalar_input=False, 21 | use_speaker_embedding=True, 22 | output_type="Gaussian" 23 | ): 24 | from wavenet_vocoder import WaveNet 25 | 26 | model = WaveNet(out_channels=out_channels, layers=layers, stacks=stacks, 27 | residual_channels=residual_channels, 28 | gate_channels=gate_channels, 29 | skip_out_channels=skip_out_channels, 30 | kernel_size=kernel_size, dropout=dropout, 31 | weight_normalization=weight_normalization, 32 | cin_channels=cin_channels, gin_channels=gin_channels, 33 | n_speakers=n_speakers, 34 | upsample_conditional_features=upsample_conditional_features, 35 | upsample_scales=upsample_scales, 36 | freq_axis_kernel_size=freq_axis_kernel_size, 37 | scalar_input=scalar_input, 38 | use_speaker_embedding=use_speaker_embedding, 39 | output_type=output_type 40 | ) 41 | 42 | return model 43 | 44 | 45 | def student_wavenet(out_channels=2, 46 | layers=20, 47 | stacks=2, 48 | residual_channels=64, 49 | iaf_layer_sizes=[10, 10, 10, 30], 50 | gate_channels=64, 51 | kernel_size=3, dropout=1 - 0.95, 52 | cin_channels=-1, gin_channels=-1, n_speakers=None, 53 | weight_normalization=True, 54 | upsample_conditional_features=False, 55 | upsample_scales=None, 56 | freq_axis_kernel_size=3, 57 | scalar_input=False, 58 | use_speaker_embedding=True 59 | ): 60 | from wavenet_vocoder import StudentWaveNet 61 | 62 | model = StudentWaveNet(out_channels=out_channels, 63 | layers=layers, stacks=stacks, 64 | residual_channels=residual_channels, 65 | iaf_layer_sizes=iaf_layer_sizes, gate_channels=gate_channels, kernel_size=kernel_size, 66 | dropout=dropout, 67 | cin_channels=cin_channels, gin_channels=gin_channels, 68 | n_speakers=n_speakers, 69 | upsample_conditional_features=upsample_conditional_features, 70 | upsample_scales=upsample_scales, 71 | freq_axis_kernel_size=freq_axis_kernel_size, 72 | scalar_input=scalar_input, 73 | use_speaker_embedding=use_speaker_embedding, 74 | ) 75 | return model 76 | 77 | 78 | def clari_wavenet(out_channels=2, 79 | layers=20, 80 | stacks=2, 81 | residual_channels=64, 82 | 
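                  # Each entry of iaf_layer_sizes (the next argument) is the number of
                  # ResidualConv1dGLU blocks in one IAF flow of the student ClariWaveNet,
                  # so the default [10, 10, 10, 30] builds four stacked flows.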
iaf_layer_sizes=[10, 10, 10, 30], 83 | gate_channels=64, 84 | kernel_size=3, 85 | dropout=1 - 0.95, 86 | cin_channels=-1, gin_channels=-1, n_speakers=None, 87 | weight_normalization=True, 88 | upsample_conditional_features=False, 89 | upsample_scales=None, 90 | freq_axis_kernel_size=3, 91 | scalar_input=False, 92 | use_speaker_embedding=True, 93 | skip_channels=128, 94 | use_skip=True, 95 | iaf_shift=False 96 | ): 97 | from wavenet_vocoder import ClariWaveNet 98 | 99 | model = ClariWaveNet(out_channels=out_channels, 100 | layers=layers, stacks=stacks, 101 | residual_channels=residual_channels, 102 | iaf_layer_sizes=iaf_layer_sizes, gate_channels=gate_channels, kernel_size=kernel_size, 103 | dropout=dropout, 104 | cin_channels=cin_channels, gin_channels=gin_channels, 105 | n_speakers=n_speakers, 106 | upsample_conditional_features=upsample_conditional_features, 107 | upsample_scales=upsample_scales, 108 | freq_axis_kernel_size=freq_axis_kernel_size, 109 | scalar_input=scalar_input, 110 | use_speaker_embedding=use_speaker_embedding, 111 | skip_out_channels=skip_channels, 112 | use_skip=use_skip, 113 | iaf_shift=iaf_shift 114 | ) 115 | return model 116 | -------------------------------------------------------------------------------- /wavenet_vocoder/clari_wavenet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import librosa 6 | import numpy as np 7 | from hparams import hparams 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | from torch.autograd import Variable 12 | from wavenet_vocoder.modules import Embedding, Conv1d1x1, ResidualConv1dGLU, ConvTranspose2d 13 | from train import build_model 14 | from wavenet_vocoder import receptive_field_size 15 | from wavenet_vocoder.wavenet import _expand_global_features, WaveNet 16 | from wavenet_vocoder.mixture import sample_from_discretized_mix_logistic 17 | from wavenet_vocoder.upsample import UpSampleConv 18 | 19 | 20 | class ClariWaveNet(nn.Module): 21 | 22 | def __init__(self, out_channels=2, layers=20, stacks=2, 23 | residual_channels=64, 24 | iaf_layer_sizes=[10, 10, 10, 10, 10, 10], 25 | gate_channels=64, 26 | kernel_size=3, dropout=1 - 0.95, 27 | cin_channels=-1, gin_channels=-1, n_speakers=None, 28 | weight_normalization=True, 29 | upsample_conditional_features=False, 30 | upsample_scales=None, 31 | skip_out_channels=64, 32 | freq_axis_kernel_size=3, 33 | scalar_input=False, 34 | use_speaker_embedding=True, 35 | use_skip=True, 36 | iaf_shift=False 37 | ): 38 | super(ClariWaveNet, self).__init__() 39 | self.scalar_input = scalar_input 40 | self.residual_channels = residual_channels 41 | self.out_channels = out_channels 42 | self.cin_channels = cin_channels 43 | self.iaf_layers_size = iaf_layer_sizes 44 | self.last_layers = [] 45 | self.use_skip = use_skip 46 | self.iaf_shift = iaf_shift 47 | assert layers % stacks == 0 48 | layers_per_stack = layers // stacks 49 | 50 | self.first_layers = nn.ModuleList() 51 | self.iaf_layers = nn.ModuleList() 52 | self.last_layers = nn.ModuleList() 53 | for i in range(len(iaf_layer_sizes)): 54 | if scalar_input: 55 | self.first_layers.append( 56 | Conv1d1x1(1, self.residual_channels)) 57 | else: 58 | self.first_layers.append(Conv1d1x1(self.out_channels, self.residual_channels)) 59 | 60 | for iaf_layer_size in iaf_layer_sizes: 61 | iaf_layer = nn.ModuleList() 62 | for layer_index in range(iaf_layer_size): 63 | 
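                # Dilations repeat every `layers_per_stack` layers within a flow;
                # with the defaults (layers=20, stacks=2) they cycle through
                # 1, 2, 4, ..., 512 before wrapping around.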
dilation = 2 ** (layer_index % layers_per_stack) 64 | conv = ResidualConv1dGLU( 65 | residual_channels, 66 | gate_channels, 67 | skip_out_channels=skip_out_channels, 68 | kernel_size=kernel_size, 69 | bias=True, 70 | dilation=dilation, 71 | dropout=dropout, 72 | cin_channels=cin_channels, 73 | gin_channels=gin_channels, 74 | weight_normalization=weight_normalization 75 | ) 76 | iaf_layer.append(conv) 77 | 78 | self.iaf_layers.append(iaf_layer) 79 | self.last_layers.append(nn.ModuleList([ 80 | nn.ReLU(), 81 | Conv1d1x1(skip_out_channels, residual_channels, 82 | weight_normalization=weight_normalization) if self.use_skip else 83 | Conv1d1x1(residual_channels, residual_channels, weight_normalization=weight_normalization), 84 | nn.ReLU(), 85 | Conv1d1x1(residual_channels, out_channels, weight_normalization=weight_normalization) 86 | ])) 87 | 88 | if gin_channels > 0 and use_speaker_embedding: 89 | assert n_speakers is not None 90 | self.embed_speakers = Embedding( 91 | n_speakers, gin_channels, padding_idx=None, std=0.1) 92 | else: 93 | self.embed_speakers = None 94 | 95 | # Upsample conv net 96 | if upsample_conditional_features: 97 | self.upsample_conv = UpSampleConv() 98 | else: 99 | self.upsample_conv = None 100 | 101 | self.receptive_field = receptive_field_size(layers, stacks, kernel_size) 102 | 103 | def load_teacher_upsample_conv(self, teacher): 104 | upsample_state_dict = teacher.upsample_conv.state_dict() 105 | self.upsample_conv.load_state_dict(upsample_state_dict) 106 | for param in self.upsample_conv.parameters(): 107 | param.requires_grad = False 108 | self.upsample_conv.eval() 109 | 110 | def has_speaker_embedding(self): 111 | return self.embed_speakers is not None 112 | 113 | def local_conditioning_enabled(self): 114 | return self.cin_channels > 0 115 | 116 | def forward(self, z, c=None, g=None, softmax=False, use_cuda=True, use_scale=False): 117 | 118 | if c is not None and self.upsample_conv is not None: 119 | # B x 1 x C x T 120 | c = c.unsqueeze(1) 121 | # B x C x T 122 | c = self.upsample_conv(c) 123 | c = c.squeeze(1) 124 | 125 | assert c.size(-1) == z.size(-1) 126 | 127 | B, _, T = z.size() 128 | iaf_layers_len = len(self.iaf_layers_size) 129 | if g is not None: 130 | if self.embed_speakers is not None: 131 | # (B x 1) -> (B x 1 x gin_channels) 132 | g = self.embed_speakers(g.view(B, -1)) 133 | # (B x gin_channels x 1) 134 | g = g.transpose(1, 2) 135 | assert g.dim() == 3 136 | # Expand global conditioning features to all time steps 137 | g_bct = _expand_global_features(B, T, g, bct=True) 138 | if self.iaf_shift: 139 | z = z[:, :, len(self.iaf_layers_size):] 140 | mu_tot = torch.zeros(z.size(), requires_grad=True) 141 | scale_tot = torch.ones(z.size(), requires_grad=True) 142 | if use_cuda: 143 | mu_tot, scale_tot = mu_tot.cuda(), scale_tot.cuda() 144 | 145 | layer = 0 146 | original_c = c 147 | 148 | length = z.size(-1) 149 | z_list = [] 150 | 151 | for first_conv, iaf_layer, last_layer in zip(self.first_layers, self.iaf_layers, self.last_layers): 152 | if self.iaf_shift: 153 | c = original_c[:, :, layer:layer + length] 154 | 155 | skips = None 156 | new_z = first_conv(z) 157 | for f in iaf_layer: 158 | if isinstance(f, ResidualConv1dGLU): 159 | new_z, h = f(new_z, c, g_bct) 160 | if skips is None: 161 | skips = h 162 | else: 163 | skips += h 164 | skips *= math.sqrt(0.5) 165 | if self.use_skip: 166 | new_z = skips 167 | for f in last_layer: 168 | new_z = f(new_z) 169 | if use_scale: 170 | mu_s_f, scale_s_f = new_z[:, :1, :], new_z[:, 1:, :] 171 | else: 172 | mu_s_f, 
scale_s_f = new_z[:, :1, :], torch.exp(torch.clamp(new_z[:, 1:, :], min=-7)) # log_scale 173 | # mu_s_f = torch.clamp(mu_s_f, -1, 1 - 2.0 / hparams.quantize_channels) 174 | mu_tot = mu_s_f + mu_tot * scale_s_f 175 | scale_tot = scale_tot * scale_s_f 176 | z = z * scale_s_f + mu_s_f 177 | z_list.append(z) 178 | layer += 1 179 | return z_list, z, mu_tot, scale_tot 180 | -------------------------------------------------------------------------------- /wavenet_vocoder/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Variable 5 | from torch.nn import functional as F 6 | 7 | 8 | class Conv1d(nn.Conv1d): 9 | """Extended nn.Conv1d for incremental dilated convolutions 10 | """ 11 | 12 | def __init__(self, *args, **kwargs): 13 | super().__init__(*args, **kwargs) 14 | self.clear_buffer() 15 | self._linearized_weight = None 16 | self.register_backward_hook(self._clear_linearized_weight) 17 | 18 | def incremental_forward(self, input): 19 | # input: (B, T, C) 20 | if self.training: 21 | raise RuntimeError('incremental_forward only supports eval mode') 22 | 23 | # run forward pre hooks (e.g., weight norm) 24 | for hook in self._forward_pre_hooks.values(): 25 | hook(self, input) 26 | 27 | # reshape weight 28 | weight = self._get_linearized_weight() 29 | kw = self.kernel_size[0] 30 | dilation = self.dilation[0] 31 | 32 | bsz = input.size(0) # input: bsz x len x dim 33 | if kw > 1: 34 | input = input.data 35 | if self.input_buffer is None: 36 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2)) 37 | self.input_buffer.zero_() 38 | else: 39 | # shift buffer 40 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 41 | # append next input 42 | self.input_buffer[:, -1, :] = input[:, -1, :] 43 | with torch.no_grad(): 44 | input = torch.autograd.Variable(self.input_buffer) 45 | if dilation > 1: 46 | input = input[:, 0::dilation, :].contiguous() 47 | output = F.linear(input.view(bsz, -1), weight, self.bias) 48 | return output.view(bsz, 1, -1) 49 | 50 | def clear_buffer(self): 51 | self.input_buffer = None 52 | 53 | def _get_linearized_weight(self): 54 | if self._linearized_weight is None: 55 | kw = self.kernel_size[0] 56 | # nn.Conv1d 57 | if self.weight.size() == (self.out_channels, self.in_channels, kw): 58 | weight = self.weight.transpose(1, 2).contiguous() 59 | else: 60 | # fairseq.modules.conv_tbc.ConvTBC 61 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 62 | assert weight.size() == (self.out_channels, kw, self.in_channels) 63 | self._linearized_weight = weight.view(self.out_channels, -1) 64 | return self._linearized_weight 65 | 66 | def _clear_linearized_weight(self, *args): 67 | self._linearized_weight = None -------------------------------------------------------------------------------- /wavenet_vocoder/mixture.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code is adapted from: 3 | # https://github.com/pclucas14/pixel-cnn-pp 4 | # https://github.com/openai/pixel-cnn 5 | 6 | from __future__ import with_statement, print_function, absolute_import 7 | 8 | import math 9 | import numpy as np 10 | 11 | import torch 12 | from torch import nn 13 | from torch.autograd import Variable 14 | from torch.nn import functional as F 15 | from torch.distributions import Normal 16 | 17 | 18 | def log_sum_exp(x): 19 | """ numerically stable log_sum_exp 
implementation that prevents overflow """ 20 | # TF ordering 21 | axis = len(x.size()) - 1 22 | m, _ = torch.max(x, dim=axis) 23 | m2, _ = torch.max(x, dim=axis, keepdim=True) 24 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 25 | 26 | 27 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True): 28 | """Discretized mixture of logistic distributions loss 29 | 30 | Note that it is assumed that input is scaled to [-1, 1]. 31 | 32 | Args: 33 | y_hat (Variable): Predicted output (B x C x T) 34 | y (Variable): Target (B x T x 1). 35 | num_classes (int): Number of classes 36 | log_scale_min (float): Log scale minimum value 37 | reduce (bool): If True, the losses are averaged or summed for each 38 | minibatch. 39 | 40 | Returns 41 | Variable: loss 42 | """ 43 | assert y_hat.dim() == 3 44 | assert y_hat.size(1) % 3 == 0 45 | nr_mix = y_hat.size(1) // 3 46 | 47 | # (B x T x C) 48 | y_hat = y_hat.transpose(1, 2) 49 | 50 | # unpack parameters. (B, T, num_mixtures) x 3 51 | logit_probs = y_hat[:, :, :nr_mix] 52 | means = y_hat[:, :, nr_mix:2 * nr_mix] 53 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 54 | 55 | # B x T x 1 -> B x T x num_mixtures 56 | y = y.expand_as(means) 57 | 58 | centered_y = y - means 59 | inv_stdv = torch.exp(-log_scales) 60 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 61 | cdf_plus = F.sigmoid(plus_in) 62 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 63 | cdf_min = F.sigmoid(min_in) 64 | 65 | # log probability for edge case of 0 (before scaling) 66 | # equivalent: torch.log(F.sigmoid(plus_in)) 67 | log_cdf_plus = plus_in - F.softplus(plus_in) 68 | 69 | # log probability for edge case of 255 (before scaling) 70 | # equivalent: (1 - F.sigmoid(min_in)).log() 71 | log_one_minus_cdf_min = -F.softplus(min_in) 72 | 73 | # probability for all other cases 74 | cdf_delta = cdf_plus - cdf_min 75 | 76 | mid_in = inv_stdv * centered_y 77 | # log probability in the center of the bin, to be used in extreme cases 78 | # (not actually used in our code) 79 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 80 | 81 | # tf equivalent 82 | """ 83 | log_probs = tf.where(x < -0.999, log_cdf_plus, 84 | tf.where(x > 0.999, log_one_minus_cdf_min, 85 | tf.where(cdf_delta > 1e-5, 86 | tf.log(tf.maximum(cdf_delta, 1e-12)), 87 | log_pdf_mid - np.log(127.5)))) 88 | """ 89 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 90 | # for num_classes=65536 case? 1e-7? not sure.. 91 | inner_inner_cond = (cdf_delta > 1e-5).float() 92 | 93 | inner_inner_out = inner_inner_cond * \ 94 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 95 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 96 | inner_cond = (y > 0.999).float() 97 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 98 | cond = (y < -0.999).float() 99 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 100 | 101 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 102 | 103 | if reduce: 104 | return -torch.sum(log_sum_exp(log_probs)) 105 | else: 106 | return -log_sum_exp(log_probs).unsqueeze(-1) 107 | 108 | 109 | def discretized_mix_gaussian_loss(y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True, use_gaussian=True): 110 | """Discretized mixture of logistic distributions loss 111 | 112 | Note that it is assumed that input is scaled to [-1, 1]. 
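    When ``use_gaussian`` is True, the mixture code below is bypassed and a
    single-Gaussian negative log-likelihood of the form
    (y - mean)^2 / (2 * scale^2) + log(scale) + const
    is returned element-wise (``reduce`` has no effect on that path).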
113 | 114 | Args: 115 | y_hat (Variable): Predicted output (B x C x T) 116 | y (Variable): Target (B x T x 1). 117 | num_classes (int): Number of classes 118 | log_scale_min (float): Log scale minimum value 119 | reduce (bool): If True, the losses are averaged or summed for each 120 | minibatch. 121 | 122 | Returns 123 | Variable: loss 124 | """ 125 | assert y_hat.dim() == 3 126 | assert y_hat.size(1) % 3 == 0 or y_hat.size(1) == 2 127 | nr_mix = y_hat.size(1) // 3 128 | 129 | # (B x T x C) 130 | y_hat = y_hat.transpose(1, 2) 131 | if use_gaussian: 132 | from torch.distributions import Normal 133 | mean, log_scale = y_hat[:, :, :1], y_hat[:, :, 1:] 134 | scales = torch.exp(torch.clamp(log_scale, min=log_scale_min)) 135 | norm = Normal(mean, scales) 136 | return (y - mean) ** 2 / (2 * scales ** 2) + torch.log(scales) + math.log(math.sqrt(2) * math.pi) 137 | # unpack parameters. (B, T, num_mixtures) x 3 138 | logit_probs = y_hat[:, :, :nr_mix] 139 | means = y_hat[:, :, nr_mix:2 * nr_mix] 140 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 141 | 142 | # B x T x 1 -> B x T x num_mixtures 143 | y = y.expand_as(means) 144 | 145 | centered_y = y - means 146 | inv_stdv = torch.exp(-log_scales) 147 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 148 | cdf_plus = F.sigmoid(plus_in) 149 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 150 | cdf_min = F.sigmoid(min_in) 151 | 152 | # log probability for edge case of 0 (before scaling) 153 | # equivalent: torch.log(F.sigmoid(plus_in)) 154 | log_cdf_plus = plus_in - F.softplus(plus_in) 155 | 156 | # log probability for edge case of 255 (before scaling) 157 | # equivalent: (1 - F.sigmoid(min_in)).log() 158 | log_one_minus_cdf_min = -F.softplus(min_in) 159 | 160 | # probability for all other cases 161 | cdf_delta = cdf_plus - cdf_min 162 | 163 | mid_in = inv_stdv * centered_y 164 | # log probability in the center of the bin, to be used in extreme cases 165 | # (not actually used in our code) 166 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 167 | 168 | # tf equivalent 169 | """ 170 | log_probs = tf.where(x < -0.999, log_cdf_plus, 171 | tf.where(x > 0.999, log_one_minus_cdf_min, 172 | tf.where(cdf_delta > 1e-5, 173 | tf.log(tf.maximum(cdf_delta, 1e-12)), 174 | log_pdf_mid - np.log(127.5)))) 175 | """ 176 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 177 | # for num_classes=65536 case? 1e-7? not sure.. 178 | inner_inner_cond = (cdf_delta > 1e-5).float() 179 | 180 | inner_inner_out = inner_inner_cond * \ 181 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 182 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 183 | inner_cond = (y > 0.999).float() 184 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 185 | cond = (y < -0.999).float() 186 | log_probs = cond * log_cdf_plus + (1. 
- cond) * inner_out
187 | 
188 |     log_probs = log_probs + F.log_softmax(logit_probs, -1)
189 | 
190 |     if reduce:
191 |         return -torch.sum(log_sum_exp(log_probs))
192 |     else:
193 |         return -log_sum_exp(log_probs).unsqueeze(-1)
194 | 
195 | 
196 | def to_one_hot(tensor, n, fill_with=1.):
197 |     # we perform one-hot encoding with respect to the last axis
198 |     one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
199 |     if tensor.is_cuda:
200 |         one_hot = one_hot.cuda()
201 |     one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
202 |     return Variable(one_hot)
203 | 
204 | 
205 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0):
206 |     """
207 |     Sample from discretized mixture of logistic distributions
208 | 
209 |     Args:
210 |         y (Variable): B x C x T
211 |         log_scale_min (float): Log scale minimum value
212 | 
213 |     Returns:
214 |         Variable: sample in range of [-1, 1].
215 |     """
216 |     assert y.size(1) % 3 == 0
217 |     nr_mix = y.size(1) // 3
218 | 
219 |     # B x T x C
220 |     y = y.transpose(1, 2)
221 |     logit_probs = y[:, :, :nr_mix]
222 | 
223 |     # sample mixture indicator from softmax
224 |     temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
225 |     temp = logit_probs.data - torch.log(- torch.log(temp))
226 |     _, argmax = temp.max(dim=-1)
227 | 
228 |     # (B, T) -> (B, T, nr_mix)
229 |     one_hot = to_one_hot(argmax, nr_mix)
230 |     # select logistic parameters
231 |     means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
232 |     log_scales = torch.clamp(torch.sum(
233 |         y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min)
234 |     # sample from logistic & clip to interval
235 |     # we don't actually round to the nearest 8bit value when sampling
236 |     u = Variable(means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5))
237 |     x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))
238 | 
239 |     x = torch.clamp(torch.clamp(x, min=-1.), max=1.)
240 | 
241 |     return x
242 | 
243 | 
244 | def sample_from_discretized_gaussian(y, log_scale_min=-7.0, use_norm=True):
245 |     """
246 |     Sample from a Gaussian output distribution
247 | 
248 |     Args:
249 |         y (Variable): B x C x T
250 |         log_scale_min (float): Log scale minimum value
251 | 
252 |     Returns:
253 |         Variable: sample in range of [-1, 1].
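    Note: when ``use_norm`` is True, the two channels of ``y`` are read as
    (mean, log_scale) and a reparameterised sample is drawn via
    ``torch.distributions.Normal(mean, scale).rsample()``; on that path the
    result is returned without the [-1, 1] clamp used further below.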
254 | """ 255 | assert y.size(1) % 2 == 0 256 | nr_mix = y.size(1) // 2 257 | 258 | # B x T x C 259 | y = y.transpose(1, 2) 260 | if use_norm: 261 | mean, log_scale = y[:, :, 0], torch.clamp(y[:, :, 1], min=log_scale_min) 262 | scale = torch.exp(log_scale) 263 | norm = Normal(mean,scale) 264 | x = norm.rsample() 265 | # sample = torch.randn(mean.size()).cuda() * scale + mean 266 | # x = torch.clamp(torch.clamp(sample, min=-1), max=1.0) 267 | return x 268 | 269 | logit_probs = y[:, :, :nr_mix] 270 | 271 | # sample mixture indicator from softmax 272 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 273 | temp = logit_probs.data - torch.log(- torch.log(temp)) 274 | _, argmax = temp.max(dim=-1) 275 | 276 | # (B, T) -> (B, T, nr_mix) 277 | one_hot = to_one_hot(argmax, nr_mix) 278 | # select logistic parameters 279 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 280 | log_scales = torch.clamp(torch.sum( 281 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 282 | # sample from logistic & clip to interval 283 | # we don't actually round to the nearest 8bit value when sampling 284 | u = Variable(torch.randn(means.size())) 285 | x = means + torch.exp(log_scales) * u 286 | 287 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 288 | 289 | return x 290 | -------------------------------------------------------------------------------- /wavenet_vocoder/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import numpy as np 6 | 7 | import torch 8 | from wavenet_vocoder import conv 9 | from torch import nn 10 | from torch.autograd import Variable 11 | from torch.nn import functional as F 12 | 13 | 14 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 15 | m = conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs) 16 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 17 | m.weight.data.normal_(mean=0, std=std) 18 | m.bias.data.zero_() 19 | return nn.utils.weight_norm(m) 20 | 21 | 22 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 23 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 24 | m.weight.data.normal_(0, std) 25 | return m 26 | 27 | 28 | def ConvTranspose2d(in_channels, out_channels, kernel_size, 29 | weight_normalization=True, **kwargs): 30 | freq_axis_kernel_size = kernel_size[0] 31 | m = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, **kwargs) 32 | m.weight.data.fill_(1.0 / freq_axis_kernel_size) 33 | m.bias.data.zero_() 34 | if weight_normalization: 35 | return nn.utils.weight_norm(m) 36 | else: 37 | return m 38 | 39 | 40 | def Conv1d1x1(in_channels, out_channels, bias=True, weight_normalization=True): 41 | """1-by-1 convolution layer 42 | """ 43 | if weight_normalization: 44 | assert bias 45 | return Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 46 | dilation=1, bias=bias, std_mul=1.0) 47 | else: 48 | return conv.Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 49 | dilation=1, bias=bias) 50 | 51 | 52 | def _conv1x1_forward(conv, x, is_incremental): 53 | """Conv1x1 forward 54 | """ 55 | if is_incremental: 56 | x = conv.incremental_forward(x) 57 | else: 58 | x = conv(x) 59 | return x 60 | 61 | 62 | class ResidualConv1dGLU(nn.Module): 63 | """Residual dilated conv1d + Gated linear unit 64 | 65 | Args: 66 | 
residual_channels (int): Residual input / output channels 67 | gate_channels (int): Gated activation channels. 68 | kernel_size (int): Kernel size of convolution layers. 69 | skip_out_channels (int): Skip connection channels. If None, set to same 70 | as ``residual_channels``. 71 | cin_channels (int): Local conditioning channels. If negative value is 72 | set, local conditioning is disabled. 73 | gin_channels (int): Global conditioning channels. If negative value is 74 | set, global conditioning is disabled. 75 | dropout (float): Dropout probability. 76 | padding (int): Padding for convolution layers. If None, proper padding 77 | is computed depends on dilation and kernel_size. 78 | dilation (int): Dilation factor. 79 | weight_normalization (bool): If True, DeepVoice3-style weight 80 | normalization is applied. 81 | """ 82 | 83 | def __init__(self, residual_channels, gate_channels, kernel_size, 84 | skip_out_channels=None, 85 | cin_channels=-1, gin_channels=-1, 86 | dropout=1 - 0.95, padding=None, dilation=1, causal=True, 87 | bias=True, weight_normalization=True, *args, **kwargs): 88 | super(ResidualConv1dGLU, self).__init__() 89 | self.dropout = dropout 90 | if skip_out_channels is None: 91 | skip_out_channels = residual_channels 92 | if padding is None: 93 | # no future time stamps available 94 | if causal: 95 | padding = (kernel_size - 1) * dilation 96 | else: 97 | padding = (kernel_size - 1) // 2 * dilation 98 | self.causal = causal 99 | 100 | if weight_normalization: 101 | assert bias 102 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 103 | padding=padding, dilation=dilation, 104 | bias=bias, std_mul=1.0, *args, **kwargs) 105 | else: 106 | self.conv = conv.Conv1d(residual_channels, gate_channels, kernel_size, 107 | padding=padding, dilation=dilation, 108 | bias=bias, *args, **kwargs) 109 | 110 | # local conditioning 111 | if cin_channels > 0: 112 | self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, 113 | bias=bias, 114 | weight_normalization=weight_normalization) 115 | else: 116 | self.conv1x1c = None 117 | 118 | # global conditioning 119 | if gin_channels > 0: 120 | self.conv1x1g = Conv1d1x1(gin_channels, gate_channels, bias=bias, 121 | weight_normalization=weight_normalization) 122 | else: 123 | self.conv1x1g = None 124 | 125 | # conv output is split into two groups 126 | gate_out_channels = gate_channels // 2 127 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias, 128 | weight_normalization=weight_normalization) 129 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, bias=bias, 130 | weight_normalization=weight_normalization) 131 | 132 | def forward(self, x, c=None, g=None): 133 | return self._forward(x, c, g, False) 134 | 135 | def incremental_forward(self, x, c=None, g=None): 136 | return self._forward(x, c, g, True) 137 | 138 | def _forward(self, x, c, g, is_incremental): 139 | """Forward 140 | 141 | Args: 142 | x (Variable): B x C x T 143 | c (Variable): B x C x T, Local conditioning features 144 | g (Variable): B x C x T, Expanded global conditioning features 145 | is_incremental (Bool) : Whether incremental mode or not 146 | 147 | Returns: 148 | Variable: output 149 | """ 150 | residual = x 151 | x = F.dropout(x, p=self.dropout, training=self.training) 152 | if is_incremental: 153 | splitdim = -1 154 | x = self.conv.incremental_forward(x) 155 | else: 156 | splitdim = 1 157 | x = self.conv(x) 158 | # remove future time steps 159 | x = x[:, :, :residual.size(-1)] if self.causal else x 160 | 161 | a, b = 
x.split(x.size(splitdim) // 2, dim=splitdim) 162 | 163 | # local conditioning 164 | if c is not None: 165 | assert self.conv1x1c is not None 166 | c = _conv1x1_forward(self.conv1x1c, c, is_incremental) 167 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 168 | a, b = a + ca, b + cb 169 | 170 | # global conditioning 171 | if g is not None: 172 | assert self.conv1x1g is not None 173 | g = _conv1x1_forward(self.conv1x1g, g, is_incremental) 174 | ga, gb = g.split(g.size(splitdim) // 2, dim=splitdim) 175 | a, b = a + ga, b + gb 176 | 177 | x = F.tanh(a) * F.sigmoid(b) 178 | 179 | # For skip connection 180 | s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental) 181 | 182 | # For residual connection 183 | x = _conv1x1_forward(self.conv1x1_out, x, is_incremental) 184 | 185 | x = (x + residual) * math.sqrt(0.5) 186 | return x, s 187 | 188 | def clear_buffer(self): 189 | for conv in [self.conv, self.conv1x1_out, self.conv1x1_skip, 190 | self.conv1x1c, self.conv1x1g]: 191 | if conv is not None: 192 | self.conv.clear_buffer() 193 | -------------------------------------------------------------------------------- /wavenet_vocoder/student_wavenet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import librosa 6 | import numpy as np 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | 12 | from .modules import Embedding, Conv1d1x1, ResidualConv1dGLU, ConvTranspose2d 13 | from train import build_model 14 | from wavenet_vocoder import receptive_field_size 15 | from wavenet_vocoder.wavenet import _expand_global_features, WaveNet 16 | from .mixture import sample_from_discretized_mix_logistic 17 | 18 | 19 | class StudentWaveNet(nn.Module): 20 | 21 | def __init__(self, out_channels=2, layers=20, stacks=2, 22 | residual_channels=64, 23 | iaf_layer_sizes=[10, 10, 10, 30], 24 | gate_channels=64, 25 | kernel_size=3, dropout=1 - 0.95, 26 | cin_channels=-1, gin_channels=-1, n_speakers=None, 27 | weight_normalization=True, 28 | upsample_conditional_features=False, 29 | upsample_scales=None, 30 | freq_axis_kernel_size=3, 31 | scalar_input=False, 32 | use_speaker_embedding=True, 33 | ): 34 | super(StudentWaveNet, self).__init__() 35 | self.scalar_input = scalar_input 36 | self.out_channels = out_channels 37 | self.cin_channels = cin_channels 38 | self.last_layers = [] 39 | 40 | assert layers % stacks == 0 41 | layers_per_stack = layers // stacks 42 | 43 | if scalar_input: 44 | self.first_conv = nn.ModuleList([Conv1d1x1(1, residual_channels) 45 | for _ in range(len(iaf_layer_sizes))]) 46 | else: 47 | self.first_conv = nn.ModuleList([Conv1d1x1(out_channels, residual_channels) 48 | for _ in range(len(iaf_layer_sizes))]) 49 | 50 | self.iaf_layers = nn.ModuleList() 51 | self.last_layers = nn.ModuleList() 52 | 53 | for iaf_layer_size in iaf_layer_sizes: 54 | iaf_layer = nn.ModuleList() 55 | for layer_index in range(iaf_layer_size): 56 | dilation = 2 ** (layer_index % layers_per_stack) 57 | conv = ResidualConv1dGLU( 58 | residual_channels, 59 | gate_channels, 60 | kernel_size=kernel_size, 61 | bias=True, 62 | dilation=dilation, 63 | dropout=dropout, 64 | cin_channels=cin_channels, 65 | gin_channels=gin_channels, 66 | weight_normalization=weight_normalization 67 | ) 68 | iaf_layer.append(conv) 69 | self.iaf_layers.append(iaf_layer) 70 | self.last_layers.append(nn.ModuleList([ 71 | nn.ReLU(), 72 | 
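                # the closing 1x1 conv maps residual_channels down to out_channels
                # (2 by default), which forward() reads as (mu, scale) for the IAF update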
Conv1d1x1(residual_channels, out_channels, weight_normalization=weight_normalization) 73 | ])) 74 | 75 | if gin_channels > 0 and use_speaker_embedding: 76 | assert n_speakers is not None 77 | self.embed_speakers = Embedding( 78 | n_speakers, gin_channels, padding_idx=None, std=0.1) 79 | else: 80 | self.embed_speakers = None 81 | 82 | # Upsample conv net 83 | if upsample_conditional_features: 84 | self.upsample_conv = nn.ModuleList() 85 | for s in upsample_scales: 86 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2 87 | convt = ConvTranspose2d(1, 1, (freq_axis_kernel_size, s), 88 | padding=(freq_axis_padding, 0), 89 | dilation=1, stride=(1, s), 90 | weight_normalization=weight_normalization) 91 | self.upsample_conv.append(convt) 92 | # assuming we use [0, 1] scaled features 93 | # this should avoid non-negative upsampling output 94 | self.upsample_conv.append(nn.ReLU(inplace=True)) 95 | else: 96 | self.upsample_conv = None 97 | 98 | self.receptive_field = receptive_field_size(layers, stacks, kernel_size) 99 | 100 | def has_speaker_embedding(self): 101 | return self.embed_speakers is not None 102 | 103 | def local_conditioning_enabled(self): 104 | return self.cin_channels > 0 105 | 106 | def forward(self, z, c=None, g=None, softmax=False, use_cuda=True, use_scale=False): 107 | 108 | if c is not None and self.upsample_conv is not None: 109 | # B x 1 x C x T 110 | c = c.unsqueeze(1) 111 | for f in self.upsample_conv: 112 | c = f(c) 113 | # B x C x T 114 | c = c.squeeze(1) 115 | 116 | if z is None: # for inference 117 | z = np.random.logistic(0, 1, (1, 1, c.size(-1))) 118 | z = torch.from_numpy(z).float() 119 | if use_cuda: 120 | z = z.cuda() 121 | 122 | assert c.size(-1) == z.size(-1) 123 | 124 | B, _, T = z.size() 125 | 126 | if g is not None: 127 | if self.embed_speakers is not None: 128 | # (B x 1) -> (B x 1 x gin_channels) 129 | g = self.embed_speakers(g.view(B, -1)) 130 | # (B x gin_channels x 1) 131 | g = g.transpose(1, 2) 132 | assert g.dim() == 3 133 | # Expand global conditioning features to all time steps 134 | g_bct = _expand_global_features(B, T, g, bct=True) 135 | 136 | mu_tot = torch.zeros(z.size(), requires_grad=True) 137 | scale_tot = torch.ones(z.size(), requires_grad=True) 138 | if use_cuda: 139 | mu_tot, scale_tot = mu_tot.cuda(), scale_tot.cuda() 140 | 141 | for first_conv, iaf_layer, last_layer in zip(self.first_conv, self.iaf_layers, self.last_layers): 142 | new_z = first_conv(z) 143 | for f in iaf_layer: 144 | new_z, _ = f(new_z, c, g_bct) 145 | for f in last_layer: 146 | new_z = f(new_z) 147 | if use_scale: 148 | mu_s_f, scale_s_f = new_z[:, :1, :], new_z[:, 1:, :] 149 | else: 150 | mu_s_f, scale_s_f = new_z[:, :1, :], torch.exp(new_z[:, 1:, :]) 151 | mu_tot = mu_s_f + mu_tot * scale_s_f 152 | scale_tot = scale_tot * scale_s_f 153 | z = z*scale_s_f + mu_s_f 154 | 155 | return z, mu_tot, scale_tot 156 | 157 | def clear_buffer(self): 158 | self.first_conv.clear_buffer() 159 | for f in self.conv_layers: 160 | f.clear_buffer() 161 | for f in self.last_conv_layers: 162 | try: 163 | f.clear_buffer() 164 | except AttributeError: 165 | pass 166 | 167 | def make_generation_fast_(self): 168 | def remove_weight_norm(m): 169 | try: 170 | nn.utils.remove_weight_norm(m) 171 | except ValueError: # this module didn't have weight norm 172 | return 173 | self.apply(remove_weight_norm) 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /wavenet_vocoder/upsample.py: 
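UpSampleConv below stretches frame-rate conditioning features (e.g. mel-spectrograms) to sample rate by stacking transposed convolutions with stride (1, s). A minimal shape-check sketch, assuming a hop size of 256 realised as upsample_scales [4, 4, 4, 4] and freq_axis_kernel_size 3 (the presets in this repo may use different values):

import torch
from torch import nn

scales = [4, 4, 4, 4]              # assumed; np.prod(scales) must equal the hop size
c = torch.randn(1, 1, 80, 100)     # B x 1 x n_mels x T_frames
layers = []
for s in scales:
    layers += [nn.ConvTranspose2d(1, 1, (3, s), padding=(1, 0), stride=(1, s)),
               nn.LeakyReLU(negative_slope=0.2)]
net = nn.Sequential(*layers)
print(net(c).shape)                # torch.Size([1, 1, 80, 25600]) == T_frames * 256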
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from hparams import hparams 4 | from wavenet_vocoder.modules import ConvTranspose2d 5 | import os 6 | from hparams import hparams 7 | 8 | class UpSampleConv(nn.Module): 9 | def __init__(self, 10 | path=None, 11 | share_condition=True, 12 | weight_normalization=True): 13 | super(UpSampleConv, self).__init__() 14 | self.path = path 15 | self.upsample_conv = nn.ModuleList() 16 | for s in hparams.upsample_scales: 17 | freq_axis_padding = (hparams.freq_axis_kernel_size - 1) // 2 18 | convt = ConvTranspose2d(1, 1, (hparams.freq_axis_kernel_size, s), 19 | padding=(freq_axis_padding, 0), 20 | dilation=1, stride=[1, s], 21 | weight_normalization=weight_normalization) 22 | self.upsample_conv.append(convt) 23 | self.upsample_conv.append(nn.LeakyReLU(inplace=True,negative_slope=0.2)) 24 | # load condition form teacher wavenet 25 | if path and share_condition: 26 | self.load() 27 | 28 | def forward(self, c): 29 | for f in self.upsample_conv: 30 | c = f(c) 31 | return c 32 | 33 | def load(self): 34 | if self.path and os.path.exists(self.path): 35 | self.upsample_conv.load_state_dict(torch.load(self.path)) 36 | else: 37 | raise Exception("can't load state dict, check path, see get_model in train_student.py !") 38 | 39 | 40 | class ClariUpsampleConv(nn.Module): 41 | def __init__(self, weight_normalization=True): 42 | super(ClariUpsampleConv, self).__init__() 43 | self.upsample_conv = nn.ModuleList() 44 | for s in hparams.upsample_size: 45 | convt = ConvTranspose2d(1, 1, kernel_size=s, stride=(1,s[0] / 2), weight_normalization=weight_normalization) 46 | self.upsample_conv.append(convt) 47 | self.upsample_conv.append(nn.LeakyReLU(inplace=True, negative_slope=0.4)) 48 | 49 | def forward(self, c): 50 | for f in self.upsample_conv: 51 | c = f(c) 52 | return c 53 | 54 | if __name__ == '__main__': 55 | checkpoint = torch.load('/home/jinqiangzeng/work/mypycharm/wavenet/clari_wavenet_vocoder/checkpoints/checkpoint_step000430000_ema.pth') 56 | preset = '/home/jinqiangzeng/work/mypycharm/wavenet/clari_wavenet_vocoder/presets/ljspeech_gaussian.json' 57 | with open(preset) as f: 58 | hparams.parse_json(f.read()) 59 | from train_student import build_model 60 | teacher = build_model(hparams,'teacher') 61 | teacher.load_state_dict(checkpoint['state_dict']) 62 | upsample_state_dict = teacher.upsample_conv.state_dict() 63 | upsample_conv = UpSampleConv() 64 | upsample_conv.load_state_dict(upsample_state_dict) 65 | for para in upsample_conv.parameters(): 66 | para.requires_grad=False -------------------------------------------------------------------------------- /wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def _assert_valid_input_type(s): 6 | assert s == "mulaw-quantize" or s == "mulaw" or s == "raw" 7 | 8 | 9 | def is_mulaw_quantize(s): 10 | _assert_valid_input_type(s) 11 | return s == "mulaw-quantize" 12 | 13 | 14 | def is_mulaw(s): 15 | _assert_valid_input_type(s) 16 | return s == "mulaw" 17 | 18 | 19 | def is_raw(s): 20 | _assert_valid_input_type(s) 21 | return s == "raw" 22 | 23 | 24 | def is_scalar_input(s): 25 | return is_raw(s) or is_mulaw(s) 26 | -------------------------------------------------------------------------------- /wavenet_vocoder/wavenet.py: 
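For reference, receptive_field_size (defined in the file below) works out to (kernel_size - 1) * sum(dilations) + 1. A quick worked check using one of the values already asserted in tests/test_misc.py:

from wavenet_vocoder import receptive_field_size

# 24 layers in 4 cycles of 6 -> dilation sum per cycle is 1 + 2 + 4 + 8 + 16 + 32 = 63,
# so the receptive field is (3 - 1) * 4 * 63 + 1 = 505 samples.
assert receptive_field_size(total_layers=24, num_cycles=4, kernel_size=3) == 505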
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import with_statement, print_function, absolute_import
3 | 
4 | import math
5 | import numpy as np
6 | 
7 | import torch
8 | from torch import nn
9 | from torch.autograd import Variable
10 | from torch.nn import functional as F
11 | 
12 | from .modules import Embedding
13 | 
14 | from .modules import Conv1d1x1, ResidualConv1dGLU, ConvTranspose2d
15 | from .mixture import sample_from_discretized_mix_logistic, sample_from_discretized_gaussian
16 | from .upsample import UpSampleConv, ClariUpsampleConv
17 | 
18 | def _expand_global_features(B, T, g, bct=True):
19 |     """Expand global conditioning features to all time steps
20 | 
21 |     Args:
22 |         B (int): Batch size.
23 |         T (int): Time length.
24 |         g (Variable): Global features, (B x C) or (B x C x 1).
25 |         bct (bool) : returns (B x C x T) if True, otherwise (B x T x C)
26 | 
27 |     Returns:
28 |         Variable: B x C x T or B x T x C or None
29 |     """
30 |     if g is None:
31 |         return None
32 |     g = g.unsqueeze(-1) if g.dim() == 2 else g
33 |     if bct:
34 |         g_bct = g.expand(B, -1, T)
35 |         return g_bct.contiguous()
36 |     else:
37 |         g_btc = g.expand(B, -1, T).transpose(1, 2)
38 |         return g_btc.contiguous()
39 | 
40 | 
41 | def receptive_field_size(total_layers, num_cycles, kernel_size,
42 |                          dilation=lambda x: 2**x):
43 |     """Compute receptive field size
44 | 
45 |     Args:
46 |         total_layers (int): total layers
47 |         num_cycles (int): cycles
48 |         kernel_size (int): kernel size
49 |         dilation (lambda): lambda to compute dilation factor. ``lambda x : 1``
50 |             to disable dilated convolution.
51 | 
52 |     Returns:
53 |         int: receptive field size in samples
54 | 
55 |     """
56 |     assert total_layers % num_cycles == 0
57 |     layers_per_cycle = total_layers // num_cycles
58 |     dilations = [dilation(i % layers_per_cycle) for i in range(total_layers)]
59 |     return (kernel_size - 1) * sum(dilations) + 1
60 | 
61 | 
62 | class WaveNet(nn.Module):
63 |     """The WaveNet model that supports local and global conditioning.
64 | 
65 |     Args:
66 |         out_channels (int): Output channels. If the input type is a mu-law
67 |             quantized one-hot vector, this must be equal to the number of
68 |             quantize channels. Otherwise it is num_mixtures x 3 (pi, mu, log_scale).
69 |         layers (int): Number of total layers
70 |         stacks (int): Number of dilation cycles
71 |         residual_channels (int): Residual input / output channels
72 |         gate_channels (int): Gated activation channels.
73 |         skip_out_channels (int): Skip connection channels.
74 |         kernel_size (int): Kernel size of convolution layers.
75 |         dropout (float): Dropout probability.
76 |         cin_channels (int): Local conditioning channels. If a negative value is
77 |             set, local conditioning is disabled.
78 |         gin_channels (int): Global conditioning channels. If a negative value is
79 |             set, global conditioning is disabled.
80 |         n_speakers (int): Number of speakers. Used only if global conditioning
81 |             is enabled.
82 |         weight_normalization (bool): If True, DeepVoice3-style weight
83 |             normalization is applied.
84 |         upsample_conditional_features (bool): Whether to upsample local
85 |             conditioning features by transposed convolution layers or not.
86 |         upsample_scales (list): List of upsample scales.
87 |             ``np.prod(upsample_scales)`` must be equal to the hop size. Used only if
88 |             upsample_conditional_features is enabled.
89 |         freq_axis_kernel_size (int): Freq-axis kernel_size for the transposed
90 |             convolution layers used for upsampling. If you only care about time-axis
91 |             upsampling, set this to 1.

class WaveNet(nn.Module):
    """The WaveNet model that supports local and global conditioning.

    Args:
        out_channels (int): Output channels. If the input is a mu-law quantized
            one-hot vector, this must equal the number of quantize channels.
            Otherwise it is num_mixtures x 3 (pi, mu, log_scale).
        layers (int): Number of total layers.
        stacks (int): Number of dilation cycles.
        residual_channels (int): Residual input / output channels.
        gate_channels (int): Gated activation channels.
        skip_out_channels (int): Skip connection channels.
        kernel_size (int): Kernel size of convolution layers.
        dropout (float): Dropout probability.
        cin_channels (int): Local conditioning channels. If a negative value is
            set, local conditioning is disabled.
        gin_channels (int): Global conditioning channels. If a negative value is
            set, global conditioning is disabled.
        n_speakers (int): Number of speakers. Used only if global conditioning
            is enabled.
        weight_normalization (bool): If True, DeepVoice3-style weight
            normalization is applied.
        upsample_conditional_features (bool): Whether to upsample local
            conditioning features by transposed convolution layers.
        upsample_scales (list): List of upsample scales.
            ``np.prod(upsample_scales)`` must equal the hop size. Used only if
            upsample_conditional_features is enabled.
        freq_axis_kernel_size (int): Freq-axis kernel size for the transposed
            convolution layers used for upsampling. If you only care about
            time-axis upsampling, set this to 1.
        scalar_input (bool): If True, scalar input ([-1, 1]) is expected;
            otherwise a quantized one-hot vector is expected.
        use_speaker_embedding (bool): Whether to use a speaker embedding. Set to
            False if you want to disable the embedding layer and use external
            features directly.
    """

    def __init__(self, out_channels=256, layers=20, stacks=2,
                 residual_channels=512,
                 gate_channels=512,
                 skip_out_channels=512,
                 kernel_size=3, dropout=1 - 0.95,
                 cin_channels=-1, gin_channels=-1, n_speakers=None,
                 weight_normalization=True,
                 upsample_conditional_features=False,
                 upsample_scales=None,
                 freq_axis_kernel_size=3,
                 scalar_input=False,
                 use_speaker_embedding=True,
                 output_type='Gaussian'
                 ):
        super(WaveNet, self).__init__()
        self.scalar_input = scalar_input
        self.out_channels = out_channels
        self.cin_channels = cin_channels
        self.output_type = output_type
        assert layers % stacks == 0
        layers_per_stack = layers // stacks
        if scalar_input:
            self.first_conv = Conv1d1x1(1, residual_channels)
        else:
            self.first_conv = Conv1d1x1(out_channels, residual_channels)

        self.conv_layers = nn.ModuleList()
        for layer in range(layers):
            dilation = 2**(layer % layers_per_stack)
            conv = ResidualConv1dGLU(
                residual_channels, gate_channels,
                kernel_size=kernel_size,
                skip_out_channels=skip_out_channels,
                bias=True,  # magenta uses bias, but musyoku doesn't
                dilation=dilation, dropout=dropout,
                cin_channels=cin_channels,
                gin_channels=gin_channels,
                weight_normalization=weight_normalization)
            self.conv_layers.append(conv)
        self.last_conv_layers = nn.ModuleList([
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels, skip_out_channels,
                      weight_normalization=weight_normalization),
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels, out_channels,
                      weight_normalization=weight_normalization),
        ])

        if gin_channels > 0 and use_speaker_embedding:
            assert n_speakers is not None
            self.embed_speakers = Embedding(
                n_speakers, gin_channels, padding_idx=None, std=0.1)
        else:
            self.embed_speakers = None

        # Upsample conv net
        if upsample_conditional_features:
            self.upsample_conv = UpSampleConv()
            # NOTE: the ClariUpsampleConv branch below is effectively disabled
            # by the "and False", so UpSampleConv is always used here.
            if output_type == 'Gaussian' and False:
                self.upsample_conv = ClariUpsampleConv()
        else:
            self.upsample_conv = None

        self.receptive_field = receptive_field_size(layers, stacks, kernel_size)

    def has_speaker_embedding(self):
        return self.embed_speakers is not None

    def local_conditioning_enabled(self):
        return self.cin_channels > 0

    def forward(self, x, c=None, g=None, softmax=False):
        """Forward step.

        Args:
            x (Variable): One-hot encoded audio signal, shape (B x C x T).
            c (Variable): Local conditioning features,
                shape (B x cin_channels x T).
            g (Variable): Global conditioning features,
                shape (B x gin_channels x 1) or speaker Ids of shape (B x 1).
                Note that ``use_speaker_embedding`` must be False when you
                want to disable the embedding layer and use external features
                directly (e.g., a one-hot vector). In that case the input
                tensor must be a FloatTensor, not a LongTensor.
            softmax (bool): Whether to apply softmax.

        Returns:
            Variable: output, shape B x out_channels x T
        """
        B, _, T = x.size()

        if g is not None:
            if self.embed_speakers is not None:
                # (B x 1) -> (B x 1 x gin_channels)
                g = self.embed_speakers(g.view(B, -1))
                # (B x gin_channels x 1)
                g = g.transpose(1, 2)
                assert g.dim() == 3
        # Expand global conditioning features to all time steps
        g_bct = _expand_global_features(B, T, g, bct=True)

        if c is not None and self.upsample_conv is not None:
            # B x 1 x C x T
            c = c.unsqueeze(1)
            c = self.upsample_conv(c)
            # B x C x T
            c = c.squeeze(1)
            assert c.size(-1) == x.size(-1)

        # Feed data to network
        x = self.first_conv(x)
        skips = None
        for f in self.conv_layers:
            x, h = f(x, c, g_bct)
            if skips is None:
                skips = h
            else:
                skips += h
                # Scale by sqrt(0.5) after each addition so the accumulated
                # skip signal keeps roughly constant variance across layers.
                skips *= math.sqrt(0.5)
            # skips = h if skips is None else (skips + h) * math.sqrt(0.5)

        x = skips
        for f in self.last_conv_layers:
            x = f(x)

        x = F.softmax(x, dim=1) if softmax else x

        return x

    def incremental_forward(self, initial_input=None, c=None, g=None,
                            T=100, test_inputs=None,
                            tqdm=lambda x: x, softmax=True, quantize=True,
                            log_scale_min=-7.0):
        """Incremental forward step.

        Due to linearized convolutions, inputs of shape (B x C x T) are reshaped
        to (B x T x C) internally and fed to the network one time step at a
        time. The input at each time step has shape (B x 1 x C).

        Args:
            initial_input (Variable): Initial decoder input, (B x C x 1).
            c (Variable): Local conditioning features, shape (B x C' x T).
            g (Variable): Global conditioning features,
                shape (B x C'') or (B x C'' x 1).
            T (int): Number of time steps to generate.
            test_inputs (Variable): Teacher-forcing inputs (for debugging).
            tqdm (lambda): tqdm wrapper for the generation loop.
            softmax (bool): Whether to apply softmax.
            quantize (bool): Whether to quantize the softmax output before
                feeding it back as input for the next time step. TODO: rename
            log_scale_min (float): Log scale minimum value.

        Returns:
            Variable: Generated one-hot encoded samples, B x C x T,
            or a scalar vector B x 1 x T.
        """
        self.clear_buffer()
        B = 1

        # Note: shape should be **(B x T x C)**, not (B x C x T), as opposed to
        # batch forward, due to linearized convolutions.
        if test_inputs is not None:
            if self.scalar_input:
                if test_inputs.size(1) == 1:
                    test_inputs = test_inputs.transpose(1, 2).contiguous()
            else:
                if test_inputs.size(1) == self.out_channels:
                    test_inputs = test_inputs.transpose(1, 2).contiguous()

            B = test_inputs.size(0)
            if T is None:
                T = test_inputs.size(1)
            else:
                T = max(T, test_inputs.size(1))
        # cast to int in case of numpy.int64...
        T = int(T)

        # Global conditioning
        if g is not None:
            if self.embed_speakers is not None:
                g = self.embed_speakers(g.view(B, -1))
                # (B x gin_channels x 1)
                g = g.transpose(1, 2)
                assert g.dim() == 3
        g_btc = _expand_global_features(B, T, g, bct=False)

        # Local conditioning
        if c is not None and self.upsample_conv is not None:
            assert c is not None
            # B x 1 x C x T
            c = c.unsqueeze(1)
            c = self.upsample_conv(c)
            # B x C x T
            c = c.squeeze(1)
            assert c.size(-1) == T
        if c is not None and c.size(-1) == T:
            c = c.transpose(1, 2).contiguous()

        outputs = []
        if initial_input is None:
            if self.scalar_input:
                initial_input = Variable(torch.zeros(B, 1, 1))
            else:
                initial_input = Variable(torch.zeros(B, 1, self.out_channels))
                initial_input[:, :, 127] = 1  # TODO: is this ok?
            # https://github.com/pytorch/pytorch/issues/584#issuecomment-275169567
            if next(self.parameters()).is_cuda:
                initial_input = initial_input.cuda()
        else:
            if initial_input.size(1) == self.out_channels:
                initial_input = initial_input.transpose(1, 2).contiguous()

        current_input = initial_input

        for t in tqdm(range(T)):
            if test_inputs is not None and t < test_inputs.size(1):
                current_input = test_inputs[:, t, :].unsqueeze(1)
            else:
                if t > 0:
                    current_input = outputs[-1]
                    current_input = Variable(current_input)

            # Conditioning features for a single time step
            ct = None if c is None else c[:, t, :].unsqueeze(1)
            gt = None if g is None else g_btc[:, t, :].unsqueeze(1)

            x = current_input
            x = self.first_conv.incremental_forward(x)
            skips = None
            for f in self.conv_layers:
                x, h = f.incremental_forward(x, ct, gt)
                skips = h if skips is None else (skips + h) * math.sqrt(0.5)
            x = skips
            for f in self.last_conv_layers:
                try:
                    x = f.incremental_forward(x)
                except AttributeError:
                    x = f(x)

            # Generate the next input by sampling
            if self.scalar_input:
                sample_fn = sample_from_discretized_mix_logistic
                if self.output_type == "Gaussian":
                    sample_fn = sample_from_discretized_gaussian
                x = sample_fn(
                    x.view(B, -1, 1), log_scale_min=log_scale_min)
            else:
                x = F.softmax(x.view(B, -1), dim=1) if softmax else x.view(B, -1)
                if quantize:
                    sample = np.random.choice(
                        np.arange(self.out_channels), p=x.view(-1).data.cpu().numpy())
                    x.zero_()
                    x[:, sample] = 1.0
            outputs += [x.data]
        # T x B x C
        outputs = torch.stack(outputs)
        # B x C x T
        outputs = outputs.transpose(0, 1).transpose(1, 2).contiguous()

        self.clear_buffer()
        return outputs

    def clear_buffer(self):
        self.first_conv.clear_buffer()
        for f in self.conv_layers:
            f.clear_buffer()
        for f in self.last_conv_layers:
            try:
                f.clear_buffer()
            except AttributeError:
                pass

    def make_generation_fast_(self):
        def remove_weight_norm(m):
            try:
                nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return
        self.apply(remove_weight_norm)
--------------------------------------------------------------------------------
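For reference, a minimal usage sketch for this module. The hyperparameter values below are deliberately tiny assumptions, not the settings shipped in ``presets/``; the conditioning features are fed already at sample resolution (so ``upsample_conditional_features=False``), and the ``output_type`` string only needs to differ from ``'Gaussian'`` for the logistic-mixture sampling path above to be used:

import torch
from torch.autograd import Variable

from wavenet_vocoder.wavenet import WaveNet

# Tiny scalar-input model with a 10-component logistic mixture output.
model = WaveNet(out_channels=10 * 3,          # (pi, mu, log_scale) per mixture component
                layers=4, stacks=2,
                residual_channels=32, gate_channels=32, skip_out_channels=32,
                cin_channels=80,              # assume 80-band mel conditioning
                upsample_conditional_features=False,
                scalar_input=True,
                output_type='mixture')        # any value other than 'Gaussian' -> logistic sampling

T = 100
x = Variable(torch.randn(1, 1, T))            # B x 1 x T waveform segment in [-1, 1]
c = Variable(torch.randn(1, 80, T))           # B x cin_channels x T local conditioning

# Teacher-forced pass (training-style): mixture parameters for every time step.
y_hat = model(x, c=c)                         # B x out_channels x T

# Autoregressive generation: remove weight normalization first for speed.
model.eval()
model.make_generation_fast_()
y_gen = model.incremental_forward(c=c, T=T, softmax=False, quantize=False)

print(y_hat.size(), y_gen.size())             # (1, 30, 100) and (1, 1, 100)

``incremental_forward`` runs one Python-level loop iteration per generated sample, so it is inherently sequential and slow; the sketch is only meant to illustrate the call signatures and tensor shapes.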