├── .gitignore ├── .idea ├── misc.xml ├── modules.xml ├── vcs.xml ├── wavenet_vocoder.iml └── workspace.xml ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── audio.py ├── cmu_arctic.py ├── docs ├── .gitignore ├── config.toml ├── content │ └── index.md ├── layouts │ ├── _default │ │ ├── list.html │ │ └── single.html │ ├── index.html │ └── partials │ │ ├── footer.html │ │ ├── header.html │ │ ├── mathjax.html │ │ └── social.html └── static │ ├── css │ ├── custom.css │ ├── normalize.css │ └── skeleton.css │ ├── favicon.png │ └── images │ └── r9y9.jpg ├── dump_hparams_to_json.py ├── evaluate.py ├── hparams.py ├── librivox.py ├── ljspeech.py ├── lrschedule.py ├── preprocess.py ├── presets ├── cmu_arctic_8bit.json ├── ljspeech_gaussian.json ├── ljspeech_mixture.json └── multispeaker_cmu_arctic_mixture.json ├── release.sh ├── resyn.wav ├── setup.py ├── synthesis.py ├── synthesis_student.py ├── tests ├── test_audio.py ├── test_misc.py ├── test_mixture.py ├── test_model.py └── test_upsample.py ├── train.py ├── train_student.py └── wavenet_vocoder ├── __init__.py ├── builder.py ├── clari_wavenet.py ├── conv.py ├── mixture.py ├── modules.py ├── student_wavenet.py ├── upsample.py ├── util.py └── wavenet.py /.gitignore: -------------------------------------------------------------------------------- 1 | foobar* 2 | pretrained_models 3 | notebooks 4 | wavenet_vocoder/version.py 5 | checkpoints/* 6 | log 7 | generated 8 | data/ 9 | text 10 | teacher_checkpoints/ 11 | student_checkpoints/ 12 | # Created by https://www.gitignore.io 13 | 14 | ### Python ### 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | env/ 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | 71 | ### IPythonNotebook ### 72 | # Temporary data 73 | .ipynb_checkpoints/ 74 | 75 | 76 | ### SublimeText ### 77 | # cache files for sublime text 78 | *.tmlanguage.cache 79 | *.tmPreferences.cache 80 | *.stTheme.cache 81 | 82 | # workspace files are user-specific 83 | *.sublime-workspace 84 | 85 | # project files should be checked into the repository, unless a significant 86 | # proportion of contributors will probably not be using SublimeText 87 | # *.sublime-project 88 | 89 | # sftp configuration file 90 | sftp-config.json 91 | 92 | 93 | ### Emacs ### 94 | # -*- mode: gitignore; -*- 95 | *~ 96 | \#*\# 97 | /.emacs.desktop 98 | /.emacs.desktop.lock 99 | *.elc 100 | auto-save-list 101 | tramp 102 | .\#* 103 | 104 | # Org-mode 105 | .org-id-locations 106 | *_archive 107 | 108 | # flymake-mode 109 | *_flymake.* 110 | 111 | # eshell files 112 | /eshell/history 113 | /eshell/lastdir 114 | 115 | # elpa packages 116 | /elpa/ 117 | 118 | # reftex files 119 | *.rel 120 | 121 | # AUCTeX auto folder 122 | /auto/ 123 | 124 | # cask packages 125 | .cask/ 126 | 127 | 128 | ### Vim ### 129 | [._]*.s[a-w][a-z] 130 | [._]s[a-w][a-z] 131 | *.un~ 132 | Session.vim 133 | .netrwhist 134 | *~ 135 | 136 | 137 | ### C++ ### 138 | # Compiled Object files 139 | *.slo 140 | *.lo 141 | *.o 142 | *.obj 143 | 144 | # Precompiled Headers 145 | *.gch 146 | *.pch 147 | 148 | # Compiled Dynamic libraries 149 | *.so 150 | *.dylib 151 | *.dll 152 | 153 | # Fortran module files 154 | *.mod 155 | 156 | # Compiled Static libraries 157 | *.lai 158 | *.la 159 | *.a 160 | *.lib 161 | 162 | # Executables 163 | *.exe 164 | *.out 165 | *.app 166 | 167 | 168 | ### OSX ### 169 | .DS_Store 170 | .AppleDouble 171 | .LSOverride 172 | 173 | # Icon must end with two \r 174 | Icon 175 | 176 | 177 | # Thumbnails 178 | ._* 179 | 180 | # Files that might appear on external disk 181 | .Spotlight-V100 182 | .Trashes 183 | 184 | # Directories potentially created on remote AFP share 185 | .AppleDB 186 | .AppleDesktop 187 | Network Trash Folder 188 | Temporary Items 189 | .apdisk 190 | 191 | 192 | ### Linux ### 193 | *~ 194 | 195 | # KDE directory preferences 196 | .directory 197 | 198 | # Linux trash folder which might appear on any partition or disk 199 | .Trash-* 200 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/wavenet_vocoder.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 13 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | notifications: 7 | email: false 8 | 9 | before_install: 10 | - sudo apt-get update 11 | - if [["$TRAVIS_PYTHON_VERSION" == "2.7"]]; then 12 | wget http://repo.continuum.io/miniconda/Miniconda-3.8.3-Linux-x86_64.sh -O miniconda.sh; 13 | else 14 | wget http://repo.continuum.io/miniconda/Miniconda3-3.8.3-Linux-x86_64.sh -O miniconda.sh; 15 | fi 16 | - bash miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda config --add channels pypi 23 | - conda info -a 24 | - deps='pip numpy scipy cython nose pytorch' 25 | - conda create -q -n test-environment "python=$TRAVIS_PYTHON_VERSION" $deps -c pytorch 26 | - source activate test-environment 27 | 28 | install: 29 | - pip install -e ".[test]" 30 | script: 31 | - nosetests -v -w tests/ -a '!local_only' 32 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The wavenet_vocoder package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Ryuichi Yamamoto. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WaveNet vocoder 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/wavenet_vocoder.svg)](https://pypi.python.org/pypi/wavenet_vocoder) 4 | [![Build Status](https://travis-ci.org/r9y9/wavenet_vocoder.svg?branch=master)](https://travis-ci.org/r9y9/wavenet_vocoder) 5 | 6 | The goal of the repository is to provide an implementation of the WaveNet vocoder, which can generate high quality raw speech samples conditioned on linguistic or acoustic features. 
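In practice, the acoustic features used throughout this repository are mel-spectrograms saved as `.npy` files by `preprocess.py`, and the length of the generated waveform is tied to the number of conditioning frames. The snippet below is a minimal, hypothetical illustration of that relationship, not code shipped in the repository; the file path is the one used in the synthesis example later in this README, and `hop_size=256` / 80 mel bands are the values reported for the provided presets.

```py
import numpy as np

# Local-conditioning feature produced by preprocess.py
# (path taken from the synthesis example below; adjust to your data location).
mel = np.load("./data/ljspeech/ljspeech-mel-00001.npy")
num_frames, num_mels = mel.shape  # (T_frames, 80) with the provided presets

# Assuming hop_size=256 (as in the presets), each conditioning frame corresponds
# to 256 raw audio samples, so synthesis will generate roughly this many samples.
hop_size = 256
expected_samples = num_frames * hop_size
print(num_frames, num_mels, expected_samples)
```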
7 | 8 | Audio samples are available at https://r9y9.github.io/wavenet_vocoder/. 9 | 10 | See https://github.com/r9y9/wavenet_vocoder/issues/1 for planned TODOs and current progress. 11 | 12 | 13 | ## Highlights 14 | 15 | - Focus on local and global conditioning of WaveNet, which is essential for a vocoder. 16 | - Mixture of logistic distributions loss / sampling (experimental) 17 | 18 | ## Pre-trained models 19 | 20 | **Note**: This is not a text-to-speech (TTS) model. With a pre-trained model provided here, you can synthesize a waveform given a *mel spectrogram*, not raw text. Pre-trained models for TTS are planned to be released once I finish up [deepvoice3_pytorch/#21](https://github.com/r9y9/deepvoice3_pytorch/pull/21). 21 | 22 | | Model URL | Data | Hyper params URL | Git commit | Steps | 23 | |-----------|------|------------------|------------|-------| 24 | | [link](https://www.dropbox.com/s/8qgcbd1mm2xsqgq/20180127_mixture_lj_checkpoint_step000410000_ema.pth?dl=0) | LJSpeech | [link](https://www.dropbox.com/s/stxasitb56y1zw8/20180127_ljspeech_mixture.json?dl=0) | [489e6fa](https://github.com/r9y9/wavenet_vocoder/commit/489e6fa92eda9ecf5b953b2783d5975d2fdee27a) | 1000k~ steps | 25 | | [link](https://www.dropbox.com/s/d0qk4ow9uuh2lww/20180212_mixture_multispeaker_cmu_arctic_checkpoint_step000740000_ema.pth?dl=0) | CMU ARCTIC | [link](https://www.dropbox.com/s/i35yigj5hvmeol8/20180212_multispeaker_cmu_arctic_mixture.json?dl=0) | [b1a1076](https://github.com/r9y9/wavenet_vocoder/tree/b1a1076e8b5d9b3e275c28f2f7f4d7cd0e75dae4) | 740k steps | 26 | 27 | To use a pre-trained model, first check out the specific git commit noted above, i.e., 28 | 29 | ``` 30 | git checkout ${commit_hash} 31 | ``` 32 | 33 | Then follow the "Synthesize from a checkpoint" section in the README. Note that older versions of synthesis.py may not accept the `--preset=` parameter, and you might have to change `hparams.py` according to the preset (json) file. 34 | 35 | For example, you could try: 36 | 37 | ``` 38 | # Assuming you have downloaded LJSpeech-1.0 at ~/data/LJSpeech-1.0 39 | # pretrained model (20180127_mixture_lj_checkpoint_step000410000_ema.pth) 40 | git checkout 489e6fa 41 | python preprocess.py ljspeech ~/data/LJSpeech-1.0 ./data/ljspeech 42 | python synthesis.py --hparams="input_type=raw,quantize_channels=65536,out_channels=30" \ 43 | --conditional=./data/ljspeech/ljspeech-mel-00001.npy \ 44 | 20180127_mixture_lj_checkpoint_step000410000_ema.pth \ 45 | generated 46 | ``` 47 | 48 | You can find the generated wav file in the `generated` directory. Wondering how it works? Take a look at the code :) 49 | 50 | ## Requirements 51 | 52 | - Python 3 53 | - CUDA >= 8.0 54 | - TensorFlow >= v1.3 55 | 56 | ## Installation 57 | 58 | The repository contains a core library (a PyTorch implementation of WaveNet) and utility scripts. 
The library and its dependencies can be installed by: 59 | 60 | ``` 61 | git clone https://github.com/r9y9/wavenet_vocoder 62 | cd wavenet_vocoder 63 | pip install -e ".[train]" 64 | ``` 65 | 66 | If you only need the library part, you can install it with the following command: 67 | 68 | ``` 69 | pip install wavenet_vocoder 70 | ``` 71 | 72 | ## Getting started 73 | 74 | ### Preset parameters 75 | 76 | There are many hyperparameters to be tuned depending on the data. For typical datasets, parameters known to work well (**presets**) are provided in the repository. See the `presets` directory for details. Notice that 77 | 78 | 1. `preprocess.py` 79 | 2. `train.py` 80 | 3. `synthesis.py` 81 | 82 | accept an *optional* `--preset=` parameter, which specifies where to load preset parameters from. If you are going to use preset parameters, you must use the same `--preset=` throughout preprocessing, training and evaluation. e.g., 83 | 84 | ``` 85 | python preprocess.py --preset=presets/cmu_arctic_8bit.json cmu_arctic ~/data/cmu_arctic 86 | python train.py --preset=presets/cmu_arctic_8bit.json --data-root=./data/cmu_arctic 87 | ``` 88 | 89 | instead of 90 | 91 | ``` 92 | python preprocess.py cmu_arctic ~/data/cmu_arctic 93 | # warning! this may use hyperparameters different from those used at the preprocessing stage 94 | python train.py --preset=presets/cmu_arctic_8bit.json --data-root=./data/cmu_arctic 95 | ``` 96 | 97 | ### 0. Download dataset 98 | 99 | - CMU ARCTIC (en): http://festvox.org/cmu_arctic/ 100 | - LJSpeech (en): https://keithito.com/LJ-Speech-Dataset/ 101 | 102 | ### 1. Preprocessing 103 | 104 | Usage: 105 | 106 | ``` 107 | python preprocess.py ${dataset_name} ${dataset_path} ${out_dir} --preset= 108 | ``` 109 | 110 | Supported `${dataset_name}`s for now are 111 | 112 | - `cmu_arctic` (multi-speaker) 113 | - `ljspeech` (single speaker) 114 | 115 | Assuming you use the preset parameters known to work well for the CMU ARCTIC dataset and have the data in `~/data/cmu_arctic`, you can preprocess the data by: 116 | 117 | ``` 118 | python preprocess.py cmu_arctic ~/data/cmu_arctic ./data/cmu_arctic --preset=presets/cmu_arctic_8bit.json 119 | ``` 120 | 121 | When this is done, you will see time-aligned extracted features (pairs of audio and mel-spectrogram) in `./data/cmu_arctic`. 122 | 123 | ### 2. Training 124 | 125 | Usage: 126 | 127 | ``` 128 | python train.py --data-root=${data-root} --preset= --hparams="parameters you want to override" 129 | ``` 130 | 131 | Important options: 132 | 133 | - `--speaker-id=`: (Multi-speaker dataset only) specifies which speaker's data is used for training. If this is not specified, all training data are used. This should only be specified when you are dealing with a multi-speaker dataset. For example, if you are trying to build a speaker-dependent WaveNet vocoder for speaker `awb` of CMU ARCTIC, you have to specify `--speaker-id=0`. 
Speaker ID is automatically assigned as follows: 134 | 135 | ```py 136 | In [1]: from nnmnkwii.datasets import cmu_arctic 137 | 138 | In [2]: [(i, s) for (i,s) in enumerate(cmu_arctic.available_speakers)] 139 | Out[2]: 140 | 141 | [(0, 'awb'), 142 | (1, 'bdl'), 143 | (2, 'clb'), 144 | (3, 'jmk'), 145 | (4, 'ksp'), 146 | (5, 'rms'), 147 | (6, 'slt')] 148 | ``` 149 | 150 | #### Training an unconditional WaveNet 151 | 152 | ``` 153 | python train.py --data-root=./data/cmu_arctic/ 154 | --hparams="cin_channels=-1,gin_channels=-1" 155 | ``` 156 | 157 | You have to disable global and local conditioning by setting `gin_channels` and `cin_channels` to negative values. 158 | 159 | #### Training WaveNet conditioned on mel-spectrogram 160 | 161 | ``` 162 | python train.py --data-root=./data/cmu_arctic/ --speaker-id=0 \ 163 | --hparams="cin_channels=80,gin_channels=-1" 164 | ``` 165 | 166 | #### Training WaveNet conditioned on mel-spectrogram and speaker embedding 167 | 168 | ``` 169 | python train.py --data-root=./data/cmu_arctic/ \ 170 | --hparams="cin_channels=80,gin_channels=16,n_speakers=7" 171 | ``` 172 | 173 | ### 3. Monitor with Tensorboard 174 | 175 | Logs are dumped into the `./log` directory by default. You can monitor them with TensorBoard: 176 | 177 | ``` 178 | tensorboard --logdir=log 179 | ``` 180 | 181 | ### 4. Synthesize from a checkpoint 182 | 183 | Usage: 184 | 185 | ``` 186 | python synthesis.py ${checkpoint_path} ${output_dir} --preset= --hparams="parameters you want to override" 187 | ``` 188 | 189 | Important options: 190 | 191 | - `--length=`: (Unconditional WaveNet only) Number of time steps to generate. 192 | - `--conditional=`: (Required for conditional WaveNet) Path of local conditional features (.npy). If this is specified, the number of time steps to generate is determined by the size of the conditional feature. 193 | 194 | e.g., 195 | 196 | ``` 197 | python synthesis.py --hparams="parameters you want to override" \ 198 | checkpoints_awb/checkpoint_step000100000.pth \ 199 | generated/test_awb \ 200 | --conditional=./data/cmu_arctic/cmu_arctic-mel-00001.npy 201 | ``` 202 | 203 | ## Misc 204 | 205 | ### Synthesize audio samples for testset 206 | 207 | Usage: 208 | 209 | 210 | ``` 211 | python evaluate.py ${checkpoint_path} ${output_dir} --data-root="data location"\ 212 | --hparams="parameters you want to override" 213 | ``` 214 | 215 | This script is used for generating sounds for https://r9y9.github.io/wavenet_vocoder/. 216 | 217 | Options: 218 | 219 | - `--data-root`: Data root. This is required to collect the testset. 220 | - `--num-utterances`: (For multi-speaker models) number of utterances to be generated per speaker. This is especially useful when the testset is large and you don't want to generate all utterances. For a single-speaker dataset, you can hit `ctrl-c` whenever you want to stop evaluation. 221 | 222 | e.g., 223 | 224 | ``` 225 | python evaluate.py --data-root=./data/cmu_arctic/ \ 226 | ./checkpoints_awb/checkpoint_step000100000.pth \ 227 | ./generated/cmu_arctic_awb 228 | ``` 229 | 230 | ## References 231 | 232 | - [Aaron van den Oord, Sander Dieleman, Heiga Zen, et al, "WaveNet: A Generative Model for Raw Audio", arXiv:1609.03499, Sep 2016.](https://arxiv.org/abs/1609.03499) 233 | - [Aaron van den Oord, Yazhe Li, Igor Babuschkin, et al, "Parallel WaveNet: Fast High-Fidelity Speech Synthesis", arXiv:1711.10433, Nov 2017.](https://arxiv.org/abs/1711.10433) 234 | - [Tamamori, Akira, et al. "Speaker-dependent WaveNet vocoder." Proceedings of Interspeech. 
2017.](http://www.isca-speech.org/archive/Interspeech_2017/pdfs/0314.PDF) 235 | - [Jonathan Shen, Ruoming Pang, Ron J. Weiss, et al, "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions", arXiv:1712.05884, Dec 2017.](https://arxiv.org/abs/1712.05884) 236 | - [Wei Ping, Kainan Peng, Andrew Gibiansky, et al, "Deep Voice 3: 2000-Speaker Neural Text-to-Speech", arXiv:1710.07654, Oct. 2017.](https://arxiv.org/abs/1710.07654) 237 | -------------------------------------------------------------------------------- /audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def trim(quantized): 22 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 23 | return quantized[start:end] 24 | 25 | 26 | def adjust_time_resolution(quantized, mel): 27 | """Adjust time resolution by repeating features 28 | 29 | Args: 30 | quantized (ndarray): (T,) 31 | mel (ndarray): (N, D) 32 | 33 | Returns: 34 | tuple: Tuple of (T,) and (T, D) 35 | """ 36 | assert len(quantized.shape) == 1 37 | assert len(mel.shape) == 2 38 | 39 | upsample_factor = quantized.size // mel.shape[0] 40 | mel = np.repeat(mel, upsample_factor, axis=0) 41 | n_pad = quantized.size - mel.shape[0] 42 | if n_pad != 0: 43 | assert n_pad > 0 44 | mel = np.pad(mel, [(0, n_pad), (0, 0)], mode="constant", constant_values=0) 45 | 46 | # trim 47 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 48 | 49 | return quantized[start:end], mel[start:end, :] 50 | adjast_time_resolution = adjust_time_resolution # 'adjust' is correct spelling, this is for compatibility 51 | 52 | 53 | def start_and_end_indices(quantized, silence_threshold=2): 54 | for start in range(quantized.size): 55 | if abs(quantized[start] - 127) > silence_threshold: 56 | break 57 | for end in range(quantized.size - 1, 1, -1): 58 | if abs(quantized[end] - 127) > silence_threshold: 59 | break 60 | 61 | assert abs(quantized[start] - 127) > silence_threshold 62 | assert abs(quantized[end] - 127) > silence_threshold 63 | 64 | return start, end 65 | 66 | 67 | def melspectrogram(y): 68 | D = _lws_processor().stft(y).T 69 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 70 | if not hparams.allow_clipping_in_normalization: 71 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 72 | return _normalize(S) 73 | 74 | 75 | def get_hop_size(): 76 | hop_size = hparams.hop_size 77 | if hop_size is None: 78 | assert hparams.frame_shift_ms is not None 79 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 80 | return hop_size 81 | 82 | 83 | def _lws_processor(): 84 | return lws.lws(hparams.fft_size, get_hop_size(), mode="speech") 85 | 86 | 87 | def lws_num_frames(length, fsize, fshift): 88 | """Compute number of time frames of lws spectrogram 89 | """ 90 | pad = (fsize - fshift) 91 | if length % fshift == 0: 92 | M = (length + pad * 2 - fsize) // fshift + 1 93 | else: 94 | M = (length + pad * 2 - fsize) // fshift + 2 95 | return M 96 | 97 | 98 | def lws_pad_lr(x, fsize, fshift): 99 | """Compute left and 
right padding lws internally uses 100 | """ 101 | M = lws_num_frames(len(x), fsize, fshift) 102 | pad = (fsize - fshift) 103 | T = len(x) + 2 * pad 104 | r = (M - 1) * fshift + fsize - T 105 | return pad, pad + r 106 | 107 | # Conversions: 108 | 109 | 110 | _mel_basis = None 111 | 112 | 113 | def _linear_to_mel(spectrogram): 114 | global _mel_basis 115 | if _mel_basis is None: 116 | _mel_basis = _build_mel_basis() 117 | return np.dot(_mel_basis, spectrogram) 118 | 119 | 120 | def _build_mel_basis(): 121 | assert hparams.fmax <= hparams.sample_rate // 2 122 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, 123 | fmin=hparams.fmin, fmax=hparams.fmax, 124 | n_mels=hparams.num_mels) 125 | 126 | 127 | def _amp_to_db(x): 128 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 129 | return 20 * np.log10(np.maximum(min_level, x)) 130 | 131 | 132 | def _db_to_amp(x): 133 | return np.power(10.0, x * 0.05) 134 | 135 | 136 | def _normalize(S): 137 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 138 | 139 | 140 | def _denormalize(S): 141 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 142 | -------------------------------------------------------------------------------- /cmu_arctic.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import cmu_arctic 7 | from nnmnkwii.io import hts 8 | from nnmnkwii import preprocessing as P 9 | from hparams import hparams 10 | from os.path import exists 11 | import librosa 12 | 13 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 14 | 15 | from hparams import hparams 16 | 17 | 18 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 19 | executor = ProcessPoolExecutor(max_workers=num_workers) 20 | futures = [] 21 | 22 | speakers = cmu_arctic.available_speakers 23 | 24 | wd = cmu_arctic.WavFileDataSource(in_dir, speakers=speakers) 25 | wav_paths = wd.collect_files() 26 | speaker_ids = wd.labels 27 | 28 | for index, (speaker_id, wav_path) in enumerate( 29 | zip(speaker_ids, wav_paths)): 30 | futures.append(executor.submit( 31 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, "N/A"))) 32 | return [future.result() for future in tqdm(futures)] 33 | 34 | 35 | def start_at(labels): 36 | has_silence = labels[0][-1] == "pau" 37 | if not has_silence: 38 | return labels[0][0] 39 | for i in range(1, len(labels)): 40 | if labels[i][-1] != "pau": 41 | return labels[i][0] 42 | assert False 43 | 44 | 45 | def end_at(labels): 46 | has_silence = labels[-1][-1] == "pau" 47 | if not has_silence: 48 | return labels[-1][1] 49 | for i in range(len(labels) - 2, 0, -1): 50 | if labels[i][-1] != "pau": 51 | return labels[i][1] 52 | assert False 53 | 54 | 55 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 56 | sr = hparams.sample_rate 57 | 58 | # Load the audio to a numpy array. 
Resampled if needed 59 | wav = audio.load_wav(wav_path) 60 | 61 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 62 | 63 | # Trim silence from hts labels if available 64 | # TODO 65 | if exists(lab_path) and False: 66 | labels = hts.load(lab_path) 67 | b = int(start_at(labels) * 1e-7 * sr) 68 | e = int(end_at(labels) * 1e-7 * sr) 69 | wav = wav[b:e] 70 | wav, _ = librosa.effects.trim(wav, top_db=20) 71 | else: 72 | wav, _ = librosa.effects.trim(wav, top_db=20) 73 | 74 | if hparams.rescaling: 75 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 76 | 77 | # Mu-law quantize 78 | if is_mulaw_quantize(hparams.input_type): 79 | # [0, quantize_channels) 80 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 81 | 82 | # Trim silences 83 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 84 | wav = wav[start:end] 85 | out = out[start:end] 86 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 87 | out_dtype = np.int16 88 | elif is_mulaw(hparams.input_type): 89 | # [-1, 1] 90 | out = P.mulaw(wav, hparams.quantize_channels) 91 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 92 | out_dtype = np.float32 93 | else: 94 | # [-1, 1] 95 | out = wav 96 | constant_values = 0.0 97 | out_dtype = np.float32 98 | 99 | # Compute a mel-scale spectrogram from the trimmed wav: 100 | # (N, D) 101 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 102 | # lws pads zeros internally before performing stft 103 | # this is needed to adjust time resolution between audio and mel-spectrogram 104 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 105 | 106 | # zero pad for quantized signal 107 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 108 | N = mel_spectrogram.shape[0] 109 | assert len(out) >= N * audio.get_hop_size() 110 | 111 | # time resolution adjustment 112 | # ensure length of raw audio is multiple of hop_size so that we can use 113 | # transposed convolution to upsample 114 | out = out[:N * audio.get_hop_size()] 115 | assert len(out) % audio.get_hop_size() == 0 116 | 117 | timesteps = len(out) 118 | 119 | # Write the spectrograms to disk: 120 | audio_filename = 'cmu_arctic-audio-%05d.npy' % index 121 | mel_filename = 'cmu_arctic-mel-%05d.npy' % index 122 | np.save(os.path.join(out_dir, audio_filename), 123 | out.astype(out_dtype), allow_pickle=False) 124 | np.save(os.path.join(out_dir, mel_filename), 125 | mel_spectrogram.astype(np.float32), allow_pickle=False) 126 | 127 | # Return a tuple describing this training example: 128 | return (audio_filename, mel_filename, timesteps, text, speaker_id) 129 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | public 2 | static/audio -------------------------------------------------------------------------------- /docs/config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "https://r9y9.github.io/wavenet_vocoder/" 2 | languageCode = "ja-jp" 3 | title = "An open source implementation of WaveNet vocoder" 4 | author = "Ryuichi YAMAMOTO" 5 | 6 | [params] 7 | author = "Ryuichi YAMAMOTO" 8 | logo = "/images/r9y9.jpg" 9 | twitter = "r9y9" 10 | github = "r9y9" 11 | analytics = "UA-44433856-1" 12 | -------------------------------------------------------------------------------- /docs/content/index.md: 
-------------------------------------------------------------------------------- 1 | +++ 2 | Categories = [] 3 | Description = "" 4 | Keywords = [] 5 | Tags = [] 6 | date = "2018-01-04T19:42:01+09:00" 7 | title = "index" 8 | type = "index" 9 | +++ 10 | 11 |
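As a quick sanity check on the model configurations listed in the tables below, the reported receptive field sizes (505 samples / 22.9 ms and 1021 samples / 63.8 ms) can be reproduced from the layer counts alone, assuming a kernel size of 3 and dilations that double within each cycle (1, 2, 4, ...), which is consistent with the reported numbers. The snippet below is an illustrative calculation, not code from the repository.

```py
def receptive_field(total_layers, num_cycles, kernel_size=3):
    # Dilations double within each cycle: 1, 2, 4, ..., 2**(layers_per_cycle - 1).
    layers_per_cycle = total_layers // num_cycles
    dilations = [2 ** i for i in range(layers_per_cycle)] * num_cycles
    return (kernel_size - 1) * sum(dilations) + 1

# 24 layers / 4 cycles: 505 samples, ~22.9 ms at 22.05 kHz
print(receptive_field(24, 4), receptive_field(24, 4) / 22050 * 1e3)

# 16 layers / 2 cycles: 1021 samples, ~63.8 ms at 16 kHz
print(receptive_field(16, 2), receptive_field(16, 2) / 16000 * 1e3)
```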
12 | 13 | - Github: https://github.com/r9y9/wavenet_vocoder 14 | 15 | This page provides audio samples for the open source implementation of the **WaveNet (WN)** vocoder. 16 | 17 | 1. WN conditioned on mel-spectrogram (16-bit linear PCM, 22.5kHz) 18 | 2. WN conditioned on mel-spectrogram (8-bit mu-law, 16kHz) 19 | 3. WN conditioned on mel-spectrogram and speaker-embedding (16-bit linear PCM, 16kHz) 20 | 3. (Not yet) DeepVoice3 + WaveNet vocoder 21 | 22 | ## WN conditioned on mel-spectrogram (16-bit linear PCM, 22.5kHz) 23 | 24 | - Samples from a model trained for over 400k steps. 25 | - Left: generated, Right: ground truth 26 | 27 | 31 | 35 | 36 | 40 | 44 | 45 | 49 | 53 | 54 | 58 | 62 | 63 | 67 | 71 | 72 | 76 | 80 | 81 | 85 | 89 | 90 | 94 | 98 | 99 | 103 | 107 | 108 | 112 | 116 | 117 | | key | value | 118 | |---------------------------------|------------------------------------------------------| 119 | | Data | LJSpeech (12522 for training, 578 for testing) | 120 | | Input type | 16-bit linear PCM | 121 | | Sampling frequency | 22.5kHz | 122 | | Local conditioning | 80-dim mel-spectrogram | 123 | | Hop size | 256 | 124 | | Global conditioning | N/A | 125 | | Total layers | 24 | 126 | | Num cycles | 4 | 127 | | Residual / Gate / Skip-out channels | 512 / 512 / 256 | 128 | | Receptive field (samples / ms) | 505 / 22.9 | 129 | | Numer of mixtures | 10 | 130 | | Number of upsampling layers | 4 | 131 | 132 | ## WN conditioned on mel-spectrogram (8-bit mu-law, 16kHz) 133 | 134 | - Samples from a model trained for 100k steps (~22 hours) 135 | - Left: generated, Right: (mu-law encoded) ground truth 136 | 137 | 141 | 145 | 146 | 150 | 154 | 155 | 159 | 163 | 164 | 168 | 172 | 173 | 177 | 181 | 182 | 186 | 190 | 191 | 195 | 199 | 200 | 204 | 208 | 209 | 213 | 217 | 218 | 222 | 226 | 227 | | key | value | 228 | |---------------------------------|------------------------------------------------------| 229 | | Data | CMU ARCTIC (`clb`) (1183 for training, 50 for testing) | 230 | | Input type | 8-bit mu-law encoded one-hot vector | 231 | | Sampling frequency | 16kHz | 232 | | Local conditioning | 80-dim mel-spectrogram | 233 | | Hop size | 256 | 234 | | Global conditioning | N/A | 235 | | Total layers | 16 | 236 | | Num cycles | 2 | 237 | | Residual / Gate / Skip-out channels | 512 / 512 / 256 | 238 | | Receptive field (samples / ms) | 1021 / 63.8 | 239 | | Number of upsampling layers | N/A | 240 | 241 | 242 | ## WN conditioned on mel-spectrogram and speaker-embedding (16-bit linear PCM, 16kHz) 243 | 244 | - Samples from a model trained for over 1000k steps 245 | - Left: generated, Right: ground truth 246 | 247 | **awb** 248 | 249 | 253 | 257 | 258 | 262 | 266 | 267 | **bdl** 268 | 269 | 273 | 277 | 278 | 282 | 286 | 287 | **clb** 288 | 289 | 293 | 297 | 298 | 302 | 306 | 307 | **jmk** 308 | 309 | 313 | 317 | 318 | 322 | 326 | 327 | 328 | **ksp** 329 | 330 | 334 | 338 | 339 | 343 | 347 | 348 | 349 | **rms** 350 | 351 | 355 | 359 | 360 | 364 | 368 | 369 | **slt** 370 | 371 | 375 | 379 | 380 | 384 | 388 | 389 | | key | value | 390 | |---------------------------------|------------------------------------------------------| 391 | | Data | CMU ARCTIC (7580 for training, 350 for testing) | 392 | | Input type | 8-bit mu-law encoded one-hot vector | 393 | | Local conditioning | 80-dim mel-spectrogram | 394 | | Hop size | 256 | 395 | | Global conditioning | 16-dim speaker embedding [^1] | 396 | | Total layers | 24 | 397 | | Num cycles | 4 | 398 | | Residual / Gate / Skip-out channels | 512 / 512 / 256 | 399 | | 
Receptive field (samples / ms) | 505 / 22.9 | 400 | | Number of mixtures | 10 | 401 | | Number of upsampling layers | 4 | 402 | 403 | [^1]: Note that the mel-spectrogram used for local conditioning is itself dependent on speaker characteristics, so we cannot simply change the speaker identity of the generated audio samples with this model. It should work without the speaker embedding, but the embedding might have helped training speed. 404 | 405 | ## DeepVoice3 + WaveNet vocoder 406 | 407 | TODO 408 | 409 | ## References 410 | 411 | - [Aaron van den Oord, Sander Dieleman, Heiga Zen, et al, "WaveNet: A Generative Model for Raw Audio", arXiv:1609.03499, Sep 2016.](https://arxiv.org/abs/1609.03499) 412 | - [Aaron van den Oord, Yazhe Li, Igor Babuschkin, et al, "Parallel WaveNet: Fast High-Fidelity Speech Synthesis", arXiv:1711.10433, Nov 2017.](https://arxiv.org/abs/1711.10433) 413 | - [Tamamori, Akira, et al. "Speaker-dependent WaveNet vocoder." Proceedings of Interspeech. 2017.](http://www.isca-speech.org/archive/Interspeech_2017/pdfs/0314.PDF) 414 | - [Jonathan Shen, Ruoming Pang, Ron J. Weiss, et al, "Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions", arXiv:1712.05884, Dec 2017.](https://arxiv.org/abs/1712.05884) 415 | - [Wei Ping, Kainan Peng, Andrew Gibiansky, et al, "Deep Voice 3: 2000-Speaker Neural Text-to-Speech", arXiv:1710.07654, Oct. 2017.](https://arxiv.org/abs/1710.07654) 416 | -------------------------------------------------------------------------------- /docs/layouts/_default/list.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 | 
4 |

{{ .Title }}

5 | {{ range .Data.Pages }} 6 | 10 | {{ end }} 11 |
12 | 13 | {{ partial "footer.html" . }} -------------------------------------------------------------------------------- /docs/layouts/_default/single.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 |
4 |
5 |

{{ .Title }}

6 | 7 |
8 | {{ .Content }} 9 | {{ partial "social.html" . }} 10 |
11 |
12 |
13 | 14 | {{ partial "footer.html" . }} 15 | -------------------------------------------------------------------------------- /docs/layouts/index.html: -------------------------------------------------------------------------------- 1 | {{ template "partials/header.html" . }} 2 | {{ range .Data.Pages }} 3 | {{if eq .Type "index" }} 4 | {{.Content}} 5 | {{end}} 6 | {{ end }} 7 | {{ template "partials/footer.html" . }} 8 | -------------------------------------------------------------------------------- /docs/layouts/partials/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 22 | 23 | 24 | 25 | {{ with .Site.Params.analytics }}{{ end }} 33 | 34 | 35 | 36 | 37 | {{ partial "mathjax.html" . }} 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /docs/layouts/partials/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ .Hugo.Generator }} 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | {{ $isHomePage := eq .Title .Site.Title }}{{ .Title }}{{ if eq $isHomePage false }} - {{ .Site.Title }}{{ end }} 15 | 16 | 17 | 18 |
19 | 20 |
21 | 24 | {{ if eq $isHomePage true }}

{{ .Site.Title }}

{{ end }} 25 |
26 | -------------------------------------------------------------------------------- /docs/layouts/partials/mathjax.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 | 30 | 31 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/layouts/partials/social.html: -------------------------------------------------------------------------------- 1 | {{ if isset .Site.Params "twitter" }} 2 | 8 | {{ end }} 9 | -------------------------------------------------------------------------------- /docs/static/css/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: "Roboto", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | background-color: #FCFCFC; 4 | -webkit-font-smoothing: antialiased; 5 | font-size: 1.8em; 6 | line-height: 1.5; 7 | font-weight: 300; 8 | } 9 | 10 | h1, h2, h3, h4, h5, h6 { 11 | color: #263c4c; 12 | } 13 | h2, h3, h4, h5, h6 { 14 | margin-top: 5rem; 15 | margin-bottom: 3rem; 16 | font-weight: bold; 17 | padding-bottom: 10px; 18 | } 19 | 20 | h1 { font-size: 3.0rem; } 21 | h2 { 22 | margin-top: 6rem; 23 | font-size: 2.6rem; 24 | } 25 | h3 { font-size: 2.1rem; } 26 | h4, 27 | h5, 28 | h6 { font-size: 1.9rem; } 29 | 30 | h2.entry-title { 31 | font-size: 2.1rem; 32 | margin-top: 0; 33 | font-weight: 400; 34 | border-bottom: none; 35 | } 36 | 37 | li { 38 | margin-bottom: 0.5rem; 39 | margin-left: 0.7em; 40 | } 41 | 42 | img { 43 | max-width: 100%; 44 | height: auto; 45 | vertical-align: middle; 46 | border: 0; 47 | margin: 1em 0; 48 | } 49 | 50 | header, 51 | footer { 52 | margin: 4rem 0; 53 | text-align: center; 54 | } 55 | 56 | main { 57 | margin: 4rem 0; 58 | } 59 | 60 | .container { 61 | width: 90%; 62 | max-width: 700px; 63 | } 64 | 65 | .header-logo img { 66 | border-radius: 50%; 67 | border: 2px solid #E1E1E1; 68 | } 69 | 70 | .header-logo img:hover { 71 | border-color: #F1F1F1; 72 | } 73 | 74 | .site-title { 75 | margin-top: 2rem; 76 | } 77 | 78 | .entry-title { 79 | margin-bottom: 0; 80 | } 81 | 82 | .entry-title a { 83 | text-decoration: none; 84 | } 85 | 86 | .entry-meta { 87 | display: inline-block; 88 | margin-bottom: 2rem; 89 | font-size: 1.6rem; 90 | color: #888; 91 | } 92 | 93 | .footer-link { 94 | margin: 2rem 0; 95 | } 96 | 97 | .hr { 98 | height: 1px; 99 | margin: 2rem 0; 100 | background: #E1E1E1; 101 | background: -webkit-gradient(linear, left top, right top, from(white), color-stop(#E1E1E1), to(white)); 102 | background: -webkit-linear-gradient(left, white, #E1E1E1, white); 103 | background: linear-gradient(to right, white, #E1E1E1, white); 104 | } 105 | 106 | article .social { 107 | height: 40px; 108 | padding: 10px 0; 109 | } 110 | 111 | address { 112 | margin: 0; 113 | font-size:0.9em; 114 | max-height: 60px; 115 | font-weight: 300; 116 | font-style: normal; 117 | display: block; 118 | } 119 | 120 | address a { 121 | text-decoration: none; 122 | } 123 | 124 | .avatar-bottom img { 125 | border-radius: 50%; 126 | border: 1px solid #E1E1E1; 127 | float: left; 128 | max-width: 100%; 129 | vertical-align: middle; 130 | width: 32px; 131 | height: 32px; 132 | margin: 0 20px 0 0; 133 | margin-top: -7px; 134 | } 135 | 136 | .avatar-bottom img:hover { 137 | border-color: #F1F1F1; 138 | } 139 | 140 | .copyright { 141 | font-size:0.9em; 142 | font-weight: 300; 143 | } 144 | 145 | .github { 146 | float: right; 147 | } 148 | 149 | blockquote { 150 | position: relative; 151 | padding: 
10px 10px 10px 32px; 152 | box-sizing: border-box; 153 | font-style: italic; 154 | color: #464646; 155 | background: #e0e0e0; 156 | } 157 | 158 | blockquote:before{ 159 | display: inline-block; 160 | position: absolute; 161 | top: 0; 162 | left: 0; 163 | vertical-align: middle; 164 | content: "\f10d"; 165 | font-family: FontAwesome; 166 | color: #e0e0e0; 167 | font-size: 22px; 168 | line-height: 1; 169 | z-index: 2; 170 | } 171 | 172 | blockquote:after{ 173 | position: absolute; 174 | content: ''; 175 | left: 0; 176 | top: 0; 177 | border-width: 0 0 40px 40px; 178 | border-style: solid; 179 | border-color: transparent #ffffff; 180 | } 181 | 182 | blockquote p { 183 | position: relative; 184 | padding: 0; 185 | margin: 10px 0; 186 | z-index: 3; 187 | line-height: 1.7; 188 | } 189 | 190 | blockquote cite { 191 | display: block; 192 | text-align: right; 193 | color: #888888; 194 | font-size: 0.9em; 195 | } 196 | -------------------------------------------------------------------------------- /docs/static/css/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v3.0.2 | MIT License | git.io/normalize */ 2 | 3 | /** 4 | * 1. Set default font family to sans-serif. 5 | * 2. Prevent iOS text size adjust after orientation change, without disabling 6 | * user zoom. 7 | */ 8 | 9 | html { 10 | font-family: sans-serif; /* 1 */ 11 | -ms-text-size-adjust: 100%; /* 2 */ 12 | -webkit-text-size-adjust: 100%; /* 2 */ 13 | } 14 | 15 | /** 16 | * Remove default margin. 17 | */ 18 | 19 | body { 20 | margin: 0; 21 | } 22 | 23 | /* HTML5 display definitions 24 | ========================================================================== */ 25 | 26 | /** 27 | * Correct `block` display not defined for any HTML5 element in IE 8/9. 28 | * Correct `block` display not defined for `details` or `summary` in IE 10/11 29 | * and Firefox. 30 | * Correct `block` display not defined for `main` in IE 11. 31 | */ 32 | 33 | article, 34 | aside, 35 | details, 36 | figcaption, 37 | figure, 38 | footer, 39 | header, 40 | hgroup, 41 | main, 42 | menu, 43 | nav, 44 | section, 45 | summary { 46 | display: block; 47 | } 48 | 49 | /** 50 | * 1. Correct `inline-block` display not defined in IE 8/9. 51 | * 2. Normalize vertical alignment of `progress` in Chrome, Firefox, and Opera. 52 | */ 53 | 54 | audio, 55 | canvas, 56 | progress, 57 | video { 58 | display: inline-block; /* 1 */ 59 | vertical-align: baseline; /* 2 */ 60 | } 61 | 62 | /** 63 | * Prevent modern browsers from displaying `audio` without controls. 64 | * Remove excess height in iOS 5 devices. 65 | */ 66 | 67 | audio:not([controls]) { 68 | display: none; 69 | height: 0; 70 | } 71 | 72 | /** 73 | * Address `[hidden]` styling not present in IE 8/9/10. 74 | * Hide the `template` element in IE 8/9/11, Safari, and Firefox < 22. 75 | */ 76 | 77 | [hidden], 78 | template { 79 | display: none; 80 | } 81 | 82 | /* Links 83 | ========================================================================== */ 84 | 85 | /** 86 | * Remove the gray background color from active links in IE 10. 87 | */ 88 | 89 | a { 90 | background-color: transparent; 91 | } 92 | 93 | /** 94 | * Improve readability when focused and also mouse hovered in all browsers. 95 | */ 96 | 97 | a:active, 98 | a:hover { 99 | outline: 0; 100 | } 101 | 102 | /* Text-level semantics 103 | ========================================================================== */ 104 | 105 | /** 106 | * Address styling not present in IE 8/9/10/11, Safari, and Chrome. 
107 | */ 108 | 109 | abbr[title] { 110 | border-bottom: 1px dotted; 111 | } 112 | 113 | /** 114 | * Address style set to `bolder` in Firefox 4+, Safari, and Chrome. 115 | */ 116 | 117 | b, 118 | strong { 119 | font-weight: bold; 120 | } 121 | 122 | /** 123 | * Address styling not present in Safari and Chrome. 124 | */ 125 | 126 | dfn { 127 | font-style: italic; 128 | } 129 | 130 | /** 131 | * Address variable `h1` font-size and margin within `section` and `article` 132 | * contexts in Firefox 4+, Safari, and Chrome. 133 | */ 134 | 135 | h1 { 136 | font-size: 2em; 137 | margin: 0.67em 0; 138 | } 139 | 140 | /** 141 | * Address styling not present in IE 8/9. 142 | */ 143 | 144 | mark { 145 | background: #ff0; 146 | color: #000; 147 | } 148 | 149 | /** 150 | * Address inconsistent and variable font size in all browsers. 151 | */ 152 | 153 | small { 154 | font-size: 80%; 155 | } 156 | 157 | /** 158 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 159 | */ 160 | 161 | sub, 162 | sup { 163 | font-size: 75%; 164 | line-height: 0; 165 | position: relative; 166 | vertical-align: baseline; 167 | } 168 | 169 | sup { 170 | top: -0.5em; 171 | } 172 | 173 | sub { 174 | bottom: -0.25em; 175 | } 176 | 177 | /* Embedded content 178 | ========================================================================== */ 179 | 180 | /** 181 | * Remove border when inside `a` element in IE 8/9/10. 182 | */ 183 | 184 | img { 185 | border: 0; 186 | } 187 | 188 | /** 189 | * Correct overflow not hidden in IE 9/10/11. 190 | */ 191 | 192 | svg:not(:root) { 193 | overflow: hidden; 194 | } 195 | 196 | /* Grouping content 197 | ========================================================================== */ 198 | 199 | /** 200 | * Address margin not present in IE 8/9 and Safari. 201 | */ 202 | 203 | figure { 204 | margin: 1em 40px; 205 | } 206 | 207 | /** 208 | * Address differences between Firefox and other browsers. 209 | */ 210 | 211 | hr { 212 | -moz-box-sizing: content-box; 213 | box-sizing: content-box; 214 | height: 0; 215 | } 216 | 217 | /** 218 | * Contain overflow in all browsers. 219 | */ 220 | 221 | pre { 222 | overflow: auto; 223 | } 224 | 225 | /** 226 | * Address odd `em`-unit font size rendering in all browsers. 227 | */ 228 | 229 | code, 230 | kbd, 231 | pre, 232 | samp { 233 | font-family: monospace, monospace; 234 | font-size: 1em; 235 | } 236 | 237 | /* Forms 238 | ========================================================================== */ 239 | 240 | /** 241 | * Known limitation: by default, Chrome and Safari on OS X allow very limited 242 | * styling of `select`, unless a `border` property is set. 243 | */ 244 | 245 | /** 246 | * 1. Correct color not being inherited. 247 | * Known issue: affects color of disabled elements. 248 | * 2. Correct font properties not being inherited. 249 | * 3. Address margins set differently in Firefox 4+, Safari, and Chrome. 250 | */ 251 | 252 | button, 253 | input, 254 | optgroup, 255 | select, 256 | textarea { 257 | color: inherit; /* 1 */ 258 | font: inherit; /* 2 */ 259 | margin: 0; /* 3 */ 260 | } 261 | 262 | /** 263 | * Address `overflow` set to `hidden` in IE 8/9/10/11. 264 | */ 265 | 266 | button { 267 | overflow: visible; 268 | } 269 | 270 | /** 271 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 272 | * All other form control elements do not inherit `text-transform` values. 273 | * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera. 274 | * Correct `select` style inheritance in Firefox. 
275 | */ 276 | 277 | button, 278 | select { 279 | text-transform: none; 280 | } 281 | 282 | /** 283 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 284 | * and `video` controls. 285 | * 2. Correct inability to style clickable `input` types in iOS. 286 | * 3. Improve usability and consistency of cursor style between image-type 287 | * `input` and others. 288 | */ 289 | 290 | button, 291 | html input[type="button"], /* 1 */ 292 | input[type="reset"], 293 | input[type="submit"] { 294 | -webkit-appearance: button; /* 2 */ 295 | cursor: pointer; /* 3 */ 296 | } 297 | 298 | /** 299 | * Re-set default cursor for disabled elements. 300 | */ 301 | 302 | button[disabled], 303 | html input[disabled] { 304 | cursor: default; 305 | } 306 | 307 | /** 308 | * Remove inner padding and border in Firefox 4+. 309 | */ 310 | 311 | button::-moz-focus-inner, 312 | input::-moz-focus-inner { 313 | border: 0; 314 | padding: 0; 315 | } 316 | 317 | /** 318 | * Address Firefox 4+ setting `line-height` on `input` using `!important` in 319 | * the UA stylesheet. 320 | */ 321 | 322 | input { 323 | line-height: normal; 324 | } 325 | 326 | /** 327 | * It's recommended that you don't attempt to style these elements. 328 | * Firefox's implementation doesn't respect box-sizing, padding, or width. 329 | * 330 | * 1. Address box sizing set to `content-box` in IE 8/9/10. 331 | * 2. Remove excess padding in IE 8/9/10. 332 | */ 333 | 334 | input[type="checkbox"], 335 | input[type="radio"] { 336 | box-sizing: border-box; /* 1 */ 337 | padding: 0; /* 2 */ 338 | } 339 | 340 | /** 341 | * Fix the cursor style for Chrome's increment/decrement buttons. For certain 342 | * `font-size` values of the `input`, it causes the cursor style of the 343 | * decrement button to change from `default` to `text`. 344 | */ 345 | 346 | input[type="number"]::-webkit-inner-spin-button, 347 | input[type="number"]::-webkit-outer-spin-button { 348 | height: auto; 349 | } 350 | 351 | /** 352 | * 1. Address `appearance` set to `searchfield` in Safari and Chrome. 353 | * 2. Address `box-sizing` set to `border-box` in Safari and Chrome 354 | * (include `-moz` to future-proof). 355 | */ 356 | 357 | input[type="search"] { 358 | -webkit-appearance: textfield; /* 1 */ 359 | -moz-box-sizing: content-box; 360 | -webkit-box-sizing: content-box; /* 2 */ 361 | box-sizing: content-box; 362 | } 363 | 364 | /** 365 | * Remove inner padding and search cancel button in Safari and Chrome on OS X. 366 | * Safari (but not Chrome) clips the cancel button when the search input has 367 | * padding (and `textfield` appearance). 368 | */ 369 | 370 | input[type="search"]::-webkit-search-cancel-button, 371 | input[type="search"]::-webkit-search-decoration { 372 | -webkit-appearance: none; 373 | } 374 | 375 | /** 376 | * Define consistent border, margin, and padding. 377 | */ 378 | 379 | fieldset { 380 | border: 1px solid #c0c0c0; 381 | margin: 0 2px; 382 | padding: 0.35em 0.625em 0.75em; 383 | } 384 | 385 | /** 386 | * 1. Correct `color` not being inherited in IE 8/9/10/11. 387 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 388 | */ 389 | 390 | legend { 391 | border: 0; /* 1 */ 392 | padding: 0; /* 2 */ 393 | } 394 | 395 | /** 396 | * Remove default vertical scrollbar in IE 8/9/10/11. 397 | */ 398 | 399 | textarea { 400 | overflow: auto; 401 | } 402 | 403 | /** 404 | * Don't inherit the `font-weight` (applied by a rule above). 405 | * NOTE: the default cannot safely be changed in Chrome and Safari on OS X. 
406 | */ 407 | 408 | optgroup { 409 | font-weight: bold; 410 | } 411 | 412 | /* Tables 413 | ========================================================================== */ 414 | 415 | /** 416 | * Remove most spacing between table cells. 417 | */ 418 | 419 | table { 420 | border-collapse: collapse; 421 | border-spacing: 0; 422 | } 423 | 424 | td, 425 | th { 426 | padding: 0; 427 | } -------------------------------------------------------------------------------- /docs/static/css/skeleton.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Skeleton V2.0.4 3 | * Copyright 2014, Dave Gamache 4 | * www.getskeleton.com 5 | * Free to use under the MIT license. 6 | * http://www.opensource.org/licenses/mit-license.php 7 | * 12/29/2014 8 | */ 9 | 10 | 11 | /* Table of contents 12 | –––––––––––––––––––––––––––––––––––––––––––––––––– 13 | - Grid 14 | - Base Styles 15 | - Typography 16 | - Links 17 | - Buttons 18 | - Forms 19 | - Lists 20 | - Code 21 | - Tables 22 | - Spacing 23 | - Utilities 24 | - Clearing 25 | - Media Queries 26 | */ 27 | 28 | 29 | /* Grid 30 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 31 | .container { 32 | position: relative; 33 | width: 100%; 34 | max-width: 960px; 35 | margin: 0 auto; 36 | padding: 0 20px; 37 | box-sizing: border-box; } 38 | .column, 39 | .columns { 40 | width: 100%; 41 | float: left; 42 | box-sizing: border-box; } 43 | 44 | /* For devices larger than 400px */ 45 | @media (min-width: 400px) { 46 | .container { 47 | width: 85%; 48 | padding: 0; } 49 | } 50 | 51 | /* For devices larger than 550px */ 52 | @media (min-width: 550px) { 53 | .container { 54 | width: 80%; } 55 | .column, 56 | .columns { 57 | margin-left: 4%; } 58 | .column:first-child, 59 | .columns:first-child { 60 | margin-left: 0; } 61 | 62 | .one.column, 63 | .one.columns { width: 4.66666666667%; } 64 | .two.columns { width: 13.3333333333%; } 65 | .three.columns { width: 22%; } 66 | .four.columns { width: 30.6666666667%; } 67 | .five.columns { width: 39.3333333333%; } 68 | .six.columns { width: 48%; } 69 | .seven.columns { width: 56.6666666667%; } 70 | .eight.columns { width: 65.3333333333%; } 71 | .nine.columns { width: 74.0%; } 72 | .ten.columns { width: 82.6666666667%; } 73 | .eleven.columns { width: 91.3333333333%; } 74 | .twelve.columns { width: 100%; margin-left: 0; } 75 | 76 | .one-third.column { width: 30.6666666667%; } 77 | .two-thirds.column { width: 65.3333333333%; } 78 | 79 | .one-half.column { width: 48%; } 80 | 81 | /* Offsets */ 82 | .offset-by-one.column, 83 | .offset-by-one.columns { margin-left: 8.66666666667%; } 84 | .offset-by-two.column, 85 | .offset-by-two.columns { margin-left: 17.3333333333%; } 86 | .offset-by-three.column, 87 | .offset-by-three.columns { margin-left: 26%; } 88 | .offset-by-four.column, 89 | .offset-by-four.columns { margin-left: 34.6666666667%; } 90 | .offset-by-five.column, 91 | .offset-by-five.columns { margin-left: 43.3333333333%; } 92 | .offset-by-six.column, 93 | .offset-by-six.columns { margin-left: 52%; } 94 | .offset-by-seven.column, 95 | .offset-by-seven.columns { margin-left: 60.6666666667%; } 96 | .offset-by-eight.column, 97 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 98 | .offset-by-nine.column, 99 | .offset-by-nine.columns { margin-left: 78.0%; } 100 | .offset-by-ten.column, 101 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 102 | .offset-by-eleven.column, 103 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 104 | 105 | .offset-by-one-third.column, 
106 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 107 | .offset-by-two-thirds.column, 108 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 109 | 110 | .offset-by-one-half.column, 111 | .offset-by-one-half.columns { margin-left: 52%; } 112 | 113 | } 114 | 115 | 116 | /* Base Styles 117 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 118 | /* NOTE 119 | html is set to 62.5% so that all the REM measurements throughout Skeleton 120 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 121 | html { 122 | font-size: 62.5%; } 123 | body { 124 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 125 | line-height: 1.6; 126 | font-weight: 400; 127 | font-family: "Raleway", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 128 | color: #222; } 129 | 130 | 131 | /* Typography 132 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 133 | h1, h2, h3, h4, h5, h6 { 134 | margin-top: 0; 135 | margin-bottom: 2rem; 136 | font-weight: 300; } 137 | h1 { font-size: 4.0rem; line-height: 1.2; letter-spacing: -.1rem;} 138 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; } 139 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; } 140 | h4 { font-size: 2.4rem; line-height: 1.35; letter-spacing: -.08rem; } 141 | h5 { font-size: 1.8rem; line-height: 1.5; letter-spacing: -.05rem; } 142 | h6 { font-size: 1.5rem; line-height: 1.6; letter-spacing: 0; } 143 | 144 | /* Larger than phablet */ 145 | @media (min-width: 550px) { 146 | h1 { font-size: 5.0rem; } 147 | h2 { font-size: 4.2rem; } 148 | h3 { font-size: 3.6rem; } 149 | h4 { font-size: 3.0rem; } 150 | h5 { font-size: 2.4rem; } 151 | h6 { font-size: 1.5rem; } 152 | } 153 | 154 | p { 155 | margin-top: 0; } 156 | 157 | 158 | /* Links 159 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 160 | a { 161 | color: #1EAEDB; } 162 | a:hover { 163 | color: #0FA0CE; } 164 | 165 | 166 | /* Buttons 167 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 168 | .button, 169 | button, 170 | input[type="submit"], 171 | input[type="reset"], 172 | input[type="button"] { 173 | display: inline-block; 174 | height: 38px; 175 | padding: 0 30px; 176 | color: #555; 177 | text-align: center; 178 | font-size: 11px; 179 | font-weight: 600; 180 | line-height: 38px; 181 | letter-spacing: .1rem; 182 | text-transform: uppercase; 183 | text-decoration: none; 184 | white-space: nowrap; 185 | background-color: transparent; 186 | border-radius: 4px; 187 | border: 1px solid #bbb; 188 | cursor: pointer; 189 | box-sizing: border-box; } 190 | .button:hover, 191 | button:hover, 192 | input[type="submit"]:hover, 193 | input[type="reset"]:hover, 194 | input[type="button"]:hover, 195 | .button:focus, 196 | button:focus, 197 | input[type="submit"]:focus, 198 | input[type="reset"]:focus, 199 | input[type="button"]:focus { 200 | color: #333; 201 | border-color: #888; 202 | outline: 0; } 203 | .button.button-primary, 204 | button.button-primary, 205 | input[type="submit"].button-primary, 206 | input[type="reset"].button-primary, 207 | input[type="button"].button-primary { 208 | color: #FFF; 209 | background-color: #33C3F0; 210 | border-color: #33C3F0; } 211 | .button.button-primary:hover, 212 | button.button-primary:hover, 213 | input[type="submit"].button-primary:hover, 214 | input[type="reset"].button-primary:hover, 215 | input[type="button"].button-primary:hover, 216 | .button.button-primary:focus, 217 | button.button-primary:focus, 218 | 
input[type="submit"].button-primary:focus, 219 | input[type="reset"].button-primary:focus, 220 | input[type="button"].button-primary:focus { 221 | color: #FFF; 222 | background-color: #1EAEDB; 223 | border-color: #1EAEDB; } 224 | 225 | 226 | /* Forms 227 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 228 | input[type="email"], 229 | input[type="number"], 230 | input[type="search"], 231 | input[type="text"], 232 | input[type="tel"], 233 | input[type="url"], 234 | input[type="password"], 235 | textarea, 236 | select { 237 | height: 38px; 238 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 239 | background-color: #fff; 240 | border: 1px solid #D1D1D1; 241 | border-radius: 4px; 242 | box-shadow: none; 243 | box-sizing: border-box; } 244 | /* Removes awkward default styles on some inputs for iOS */ 245 | input[type="email"], 246 | input[type="number"], 247 | input[type="search"], 248 | input[type="text"], 249 | input[type="tel"], 250 | input[type="url"], 251 | input[type="password"], 252 | textarea { 253 | -webkit-appearance: none; 254 | -moz-appearance: none; 255 | appearance: none; } 256 | textarea { 257 | min-height: 65px; 258 | padding-top: 6px; 259 | padding-bottom: 6px; } 260 | input[type="email"]:focus, 261 | input[type="number"]:focus, 262 | input[type="search"]:focus, 263 | input[type="text"]:focus, 264 | input[type="tel"]:focus, 265 | input[type="url"]:focus, 266 | input[type="password"]:focus, 267 | textarea:focus, 268 | select:focus { 269 | border: 1px solid #33C3F0; 270 | outline: 0; } 271 | label, 272 | legend { 273 | display: block; 274 | margin-bottom: .5rem; 275 | font-weight: 600; } 276 | fieldset { 277 | padding: 0; 278 | border-width: 0; } 279 | input[type="checkbox"], 280 | input[type="radio"] { 281 | display: inline; } 282 | label > .label-body { 283 | display: inline-block; 284 | margin-left: .5rem; 285 | font-weight: normal; } 286 | 287 | 288 | /* Lists 289 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 290 | ul { 291 | list-style: circle inside; } 292 | ol { 293 | list-style: decimal inside; } 294 | ol, ul { 295 | padding-left: 0; 296 | margin-top: 0; } 297 | ul ul, 298 | ul ol, 299 | ol ol, 300 | ol ul { 301 | margin: 1.5rem 0 1.5rem 3rem; 302 | font-size: 90%; } 303 | li { 304 | margin-bottom: 1rem; } 305 | 306 | 307 | /* Code 308 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 309 | code { 310 | padding: .2rem .5rem; 311 | margin: 0 .2rem; 312 | font-size: 90%; 313 | white-space: nowrap; 314 | background: #F1F1F1; 315 | border: 1px solid #E1E1E1; 316 | border-radius: 4px; } 317 | pre > code { 318 | display: block; 319 | padding: 1rem 1.5rem; 320 | white-space: pre; } 321 | 322 | 323 | /* Tables 324 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 325 | th, 326 | td { 327 | padding: 12px 15px; 328 | text-align: left; 329 | border-bottom: 1px solid #E1E1E1; } 330 | th:first-child, 331 | td:first-child { 332 | padding-left: 0; } 333 | th:last-child, 334 | td:last-child { 335 | padding-right: 0; } 336 | 337 | 338 | /* Spacing 339 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 340 | button, 341 | .button { 342 | margin-bottom: 1rem; } 343 | input, 344 | textarea, 345 | select, 346 | fieldset { 347 | margin-bottom: 1.5rem; } 348 | pre, 349 | blockquote, 350 | dl, 351 | figure, 352 | table, 353 | p, 354 | ul, 355 | ol, 356 | form { 357 | margin-bottom: 2.5rem; } 358 | 359 | 360 | /* Utilities 361 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 362 | .u-full-width { 363 | width: 100%; 364 | 
box-sizing: border-box; } 365 | .u-max-full-width { 366 | max-width: 100%; 367 | box-sizing: border-box; } 368 | .u-pull-right { 369 | float: right; } 370 | .u-pull-left { 371 | float: left; } 372 | 373 | 374 | /* Misc 375 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 376 | hr { 377 | margin-top: 3rem; 378 | margin-bottom: 3.5rem; 379 | border-width: 0; 380 | border-top: 1px solid #E1E1E1; } 381 | 382 | 383 | /* Clearing 384 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 385 | 386 | /* Self Clearing Goodness */ 387 | .container:after, 388 | .row:after, 389 | .u-cf { 390 | content: ""; 391 | display: table; 392 | clear: both; } 393 | 394 | 395 | /* Media Queries 396 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 397 | /* 398 | Note: The best way to structure the use of media queries is to create the queries 399 | near the relevant code. For example, if you wanted to change the styles for buttons 400 | on small devices, paste the mobile query code up in the buttons section and style it 401 | there. 402 | */ 403 | 404 | 405 | /* Larger than mobile */ 406 | @media (min-width: 400px) {} 407 | 408 | /* Larger than phablet (also point when grid becomes active) */ 409 | @media (min-width: 550px) {} 410 | 411 | /* Larger than tablet */ 412 | @media (min-width: 750px) {} 413 | 414 | /* Larger than desktop */ 415 | @media (min-width: 1000px) {} 416 | 417 | /* Larger than Desktop HD */ 418 | @media (min-width: 1200px) {} 419 | -------------------------------------------------------------------------------- /docs/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaiFengZeng/clari_wavenet_vocoder/c1c290237898f17f3006b6ecbd4bad3d61d631a8/docs/static/favicon.png -------------------------------------------------------------------------------- /docs/static/images/r9y9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaiFengZeng/clari_wavenet_vocoder/c1c290237898f17f3006b6ecbd4bad3d61d631a8/docs/static/images/r9y9.jpg -------------------------------------------------------------------------------- /dump_hparams_to_json.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Dump hyper parameters to json file. 4 | 5 | usage: dump_hparams_to_json.py [options] 6 | 7 | options: 8 | -h, --help Show help message. 9 | """ 10 | from docopt import docopt 11 | 12 | import sys 13 | import os 14 | from os.path import dirname, join, basename, splitext 15 | import json 16 | 17 | from hparams import hparams 18 | 19 | if __name__ == "__main__": 20 | args = docopt(__doc__) 21 | output_json_path = args[""] 22 | 23 | j = hparams.values() 24 | 25 | # for compat legacy 26 | for k in ["preset", "presets"]: 27 | if k in j: 28 | del j[k] 29 | 30 | with open(output_json_path, "w") as f: 31 | json.dump(j, f, indent=2) 32 | sys.exit(0) 33 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesis waveform for testset 4 | 5 | usage: evaluate.py [options] 6 | 7 | options: 8 | --data-root= Directory contains preprocessed features. 9 | --hparams= Hyper parameters [default: ]. 10 | --length= Steps to generate [default: 32000]. 11 | --speaker-id= Use specific speaker of data in case for multi-speaker datasets. 
12 | --initial-value= Initial value for the WaveNet decoder. 13 | --file-name-suffix= File name suffix [default: ]. 14 | --output-html Output html for blog post. 15 | --num-utterances=N> Generate N utterenaces per speaker [default: -1]. 16 | -h, --help Show help message. 17 | """ 18 | from docopt import docopt 19 | 20 | import sys 21 | import os 22 | from os.path import dirname, join, basename, splitext 23 | import torch 24 | from torch.autograd import Variable 25 | import numpy as np 26 | from nnmnkwii import preprocessing as P 27 | from keras.utils import np_utils 28 | from tqdm import tqdm 29 | import librosa 30 | 31 | 32 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 33 | 34 | import audio 35 | from hparams import hparams 36 | 37 | 38 | use_cuda = torch.cuda.is_available() 39 | 40 | 41 | if __name__ == "__main__": 42 | args = docopt(__doc__) 43 | print("Command line args:\n", args) 44 | data_root = args["--data-root"] 45 | if data_root is None: 46 | data_root = join(dirname(__file__), "data", "cmu_arctic") 47 | checkpoint_path = args[""] 48 | dst_dir = args[""] 49 | 50 | length = int(args["--length"]) 51 | # Note that speaker-id is used for filtering out unrelated-speaker from 52 | # multi-speaker dataset. 53 | speaker_id = args["--speaker-id"] 54 | speaker_id = int(speaker_id) if speaker_id is not None else None 55 | initial_value = args["--initial-value"] 56 | initial_value = None if initial_value is None else float(initial_value) 57 | file_name_suffix = args["--file-name-suffix"] 58 | output_html = args["--output-html"] 59 | num_utterances = int(args["--num-utterances"]) 60 | 61 | # Override hyper parameters 62 | hparams.parse(args["--hparams"]) 63 | assert hparams.name == "wavenet_vocoder" 64 | 65 | from train import build_model, get_data_loaders 66 | from synthesis import wavegen 67 | 68 | # Data 69 | # Use exactly same testset used in training script 70 | # disable shuffle for convenience 71 | test_data_loader = get_data_loaders(data_root, speaker_id, test_shuffle=False)["test"] 72 | test_dataset = test_data_loader.dataset 73 | 74 | # Model 75 | model = build_model() 76 | 77 | # Load checkpoint 78 | print("Load checkpoint from {}".format(checkpoint_path)) 79 | checkpoint = torch.load(checkpoint_path) 80 | model.load_state_dict(checkpoint["state_dict"]) 81 | checkpoint_name = splitext(basename(checkpoint_path))[0] 82 | 83 | os.makedirs(dst_dir, exist_ok=True) 84 | dst_dir_name = basename(os.path.normpath(dst_dir)) 85 | 86 | generated_utterances = {} 87 | for idx, (x, c, g) in enumerate(test_dataset): 88 | target_audio_path = test_dataset.X.collected_files[idx][0] 89 | if num_utterances > 0 and g is not None: 90 | try: 91 | generated_utterances[g] += 1 92 | if generated_utterances[g] > num_utterances: 93 | continue 94 | except KeyError: 95 | generated_utterances[g] = 1 96 | 97 | if output_html: 98 | def _tqdm(x): return x 99 | else: 100 | _tqdm = tqdm 101 | print("Target audio is {}".format(target_audio_path)) 102 | if c is not None: 103 | print("Local conditioned by {}".format(test_dataset.Mel.collected_files[idx][0])) 104 | if g is not None: 105 | print("Global conditioned by speaker id {}".format(g)) 106 | 107 | # Paths 108 | if g is None: 109 | dst_wav_path = join(dst_dir, "{}_{}{}_predicted.wav".format( 110 | idx, checkpoint_name, file_name_suffix)) 111 | target_wav_path = join(dst_dir, "{}_{}{}_target.wav".format( 112 | idx, checkpoint_name, file_name_suffix)) 113 | else: 114 | dst_wav_path = join(dst_dir, "speaker{}_{}_{}{}_predicted.wav".format( 115 
| g, idx, checkpoint_name, file_name_suffix)) 116 | target_wav_path = join(dst_dir, "speaker{}_{}_{}{}_target.wav".format( 117 | g, idx, checkpoint_name, file_name_suffix)) 118 | 119 | # Generate 120 | waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value, 121 | fast=True, tqdm=_tqdm) 122 | 123 | # save 124 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 125 | if is_mulaw_quantize(hparams.input_type): 126 | x = P.inv_mulaw_quantize(x, hparams.quantize_channels) 127 | elif is_mulaw(hparams.input_type): 128 | x = P.inv_mulaw(x, hparams.quantize_channels) 129 | librosa.output.write_wav(target_wav_path, x, sr=hparams.sample_rate) 130 | 131 | # log 132 | if output_html: 133 | print(""" 134 | 138 | """.format(hparams.name, dst_dir_name, basename(dst_wav_path))) 139 | 140 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 141 | sys.exit(0) 142 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # NOTE: If you want full control over the model architecture, please take a look 5 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 6 | 7 | # Default hyperparameters: 8 | hparams = tf.contrib.training.HParams( 9 | name="wavenet_vocoder", 10 | 11 | # Convenient model builder 12 | builder="wavenet", 13 | 14 | # Input type: 15 | # 1. raw [-1, 1] 16 | # 2. mulaw [-1, 1] 17 | # 3. mulaw-quantize [0, mu] 18 | # If input_type is raw or mulaw, the network assumes scalar input and 19 | # discretized mixture of logistic distributions output, otherwise one-hot 20 | # input and softmax output are assumed. 21 | # **NOTE**: if you change one of the two parameters below, you need to 22 | # re-run preprocessing before training. 23 | # **NOTE**: scalar input (raw or mulaw) is experimental. Use it at your own risk. 24 | input_type="raw", 25 | output_type="Gaussian",  # ['Gaussian', 'MOG', 'MOL', 'softmax'] 26 | quantize_channels=65536, # 65536 or 256 27 | 28 | # Audio: 29 | sample_rate=22050, 30 | # this is only valid when mulaw is True 31 | silence_threshold=2, 32 | num_mels=80, 33 | fmin=125, 34 | fmax=7600, 35 | fft_size=1024, 36 | # shift can be specified by either hop_size or frame_shift_ms 37 | hop_size=256, 38 | frame_shift_ms=None, 39 | min_level_db=-100, 40 | ref_level_db=20, 41 | # whether to rescale waveform or not. 42 | # Let x be an input waveform; the rescaled waveform y is given by: 43 | # y = x / np.abs(x).max() * rescaling_max 44 | rescaling=True, 45 | rescaling_max=0.999, 46 | # mel-spectrogram is normalized to [0, 1] for each utterance and clipping may 47 | # happen depending on min_level_db and ref_level_db, causing clipping noise.
48 | # If False, an assertion is added to ensure no clipping happens. 49 | allow_clipping_in_normalization=True, 50 | 51 | # Mixture of logistic distributions: 52 | log_scale_min=float(np.log(1e-14)), 53 | 54 | # Model: 55 | # This should be equal to `quantize_channels` if mu-law quantization is enabled, 56 | # otherwise num_mixture * 3 (pi, mean, log_scale) for mixture outputs, or 2 (mean, log_scale) for Gaussian output 57 | out_channels=2, 58 | use_skip=True, 59 | layers=24, 60 | stacks=4, 61 | residual_channels=512, 62 | gate_channels=512, # split into 2 groups internally for gated activation 63 | skip_out_channels=256, 64 | dropout=1 - 0.95, 65 | kernel_size=3, 66 | # If True, apply weight normalization, the same as in DeepVoice3 67 | weight_normalization=True, 68 | 69 | # Local conditioning (set negative value to disable) 70 | cin_channels=80, 71 | # If True, use transposed convolutions to upsample conditional features, 72 | # otherwise repeat features to adjust time resolution 73 | upsample_conditional_features=True, 74 | # np.prod(upsample_scales) should equal hop_size (here 4 * 4 * 4 * 4 == 256) 75 | upsample_scales=[4, 4, 4, 4], 76 | upsample_size=[[30,3],[40,3]], 77 | # Freq axis kernel size for upsampling network 78 | freq_axis_kernel_size=3, 79 | 80 | # Global conditioning (set negative value to disable) 81 | # currently limited to speaker embedding 82 | # this should only be enabled for multi-speaker datasets 83 | gin_channels=-1, # i.e., speaker embedding dim 84 | n_speakers=7, # 7 for CMU ARCTIC 85 | 86 | # Data loader 87 | pin_memory=True, 88 | num_workers=2, 89 | 90 | # train/test 91 | # test size can be specified as a portion or as a number of samples 92 | test_size=0.0441, # 50 for CMU ARCTIC single speaker 93 | test_num_samples=None, 94 | random_state=1234, 95 | 96 | # Loss 97 | 98 | # Training: 99 | batch_size=2, 100 | adam_beta1=0.9, 101 | adam_beta2=0.999, 102 | adam_eps=1e-8, 103 | initial_learning_rate=1e-3, 104 | # see lrschedule.py for available lr_schedule 105 | lr_schedule="noam_learning_rate_decay", 106 | lr_schedule_kwargs={}, # {"anneal_rate": 0.5, "anneal_interval": 50000}, 107 | nepochs=2000, 108 | weight_decay=0.0, 109 | clip_thresh=-1, 110 | # max time steps can be specified either as seconds or as steps 111 | # This is needed for those who don't have huge GPU memory...
112 | # if both are None, then full audio samples are used 113 | max_time_sec=None, 114 | max_time_steps=8000, 115 | # Hold moving averaged parameters and use them for evaluation 116 | exponential_moving_average=True, 117 | # averaged = decay * averaged + (1 - decay) * x 118 | ema_decay=0.9999, 119 | 120 | # Save 121 | # per-step intervals 122 | checkpoint_interval=10000, 123 | train_eval_interval=10000, 124 | # per-epoch interval 125 | test_eval_epoch_interval=5, 126 | save_optimizer_state=True, 127 | 128 | # Eval: 129 | 130 | # Student Model 131 | student_out_channels=2, 132 | student_layers=60, 133 | student_stacks=6, 134 | student_residual_channels=128, 135 | student_skip_channels=128, 136 | iaf_layer_sizes=[10, 10, 10, 10,10,10], 137 | student_gate_channels=128, 138 | use_scale=False, 139 | iaf_shift=False, 140 | share_condition_net=True 141 | ) 142 | 143 | 144 | def hparams_debug_string(): 145 | values = hparams.values() 146 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 147 | return 'Hyperparameters:\n' + '\n'.join(hp) 148 | -------------------------------------------------------------------------------- /librivox.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | from hparams import hparams 15 | 16 | 17 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 18 | executor = ProcessPoolExecutor(max_workers=num_workers) 19 | futures = [] 20 | index = 1 21 | 22 | #with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 23 | # for line in f: 24 | # parts = line.strip().split('|') 25 | # wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 26 | # text = parts[2] 27 | # futures.append(executor.submit( 28 | # partial(_process_utterance, out_dir, index, wav_path, text))) 29 | # index += 1 30 | 31 | valid_ext = '.ogg .wav .mp3'.split() 32 | for f in sorted(os.listdir(in_dir)): 33 | valid = sum([ f.endswith(ext) for ext in valid_ext ]) 34 | if valid<1: continue 35 | 36 | audio_filepath = os.path.join(in_dir, f) 37 | text = audio_filepath # Not very informative 38 | futures.append(executor.submit( 39 | partial(_process_utterance, out_dir, index, audio_filepath, text))) 40 | index += 1 41 | return [tup for future in tqdm(futures) for tup in future.result()] 42 | 43 | 44 | def _process_utterance(out_dir, index, audio_filepath, text): 45 | # Load the audio to a numpy array: 46 | wav_whole = audio.load_wav(audio_filepath) 47 | 48 | if hparams.rescaling: 49 | wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max 50 | 51 | # This is a librivox source, so the audio files are going to be v. 
long 52 | # compared to a typical 'utterance' : So split the wav into chunks 53 | 54 | tup_results = [] 55 | 56 | n_samples = int( 8.0 * hparams.sample_rate ) # All 8 second utterances 57 | n_chunks = wav_whole.shape[0] // n_samples 58 | 59 | for chunk_idx in range(n_chunks): 60 | chunk_start, chunk_end = chunk_idx*n_samples, (chunk_idx+1)*n_samples 61 | if chunk_idx == n_chunks-1: # This is the last chunk - allow it to extend to the end of the file 62 | chunk_end = None 63 | wav = wav_whole[ chunk_start : chunk_end ] 64 | 65 | # Mu-law quantize 66 | if is_mulaw_quantize(hparams.input_type): 67 | # [0, quantize_channels) 68 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 69 | 70 | # Trim silences 71 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 72 | wav = wav[start:end] 73 | out = out[start:end] 74 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 75 | out_dtype = np.int16 76 | elif is_mulaw(hparams.input_type): 77 | # [-1, 1] 78 | out = P.mulaw(wav, hparams.quantize_channels) 79 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 80 | out_dtype = np.float32 81 | else: 82 | # [-1, 1] 83 | out = wav 84 | constant_values = 0.0 85 | out_dtype = np.float32 86 | 87 | # Compute a mel-scale spectrogram from the trimmed wav: 88 | # (N, D) 89 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 90 | # lws pads zeros internally before performing stft 91 | # this is needed to adjust time resolution between audio and mel-spectrogram 92 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 93 | 94 | # zero pad for quantized signal 95 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 96 | N = mel_spectrogram.shape[0] 97 | assert len(out) >= N * audio.get_hop_size() 98 | 99 | # time resolution adjustment 100 | # ensure length of raw audio is multiple of hop_size so that we can use 101 | # transposed convolution to upsample 102 | out = out[:N * audio.get_hop_size()] 103 | assert len(out) % audio.get_hop_size() == 0 104 | 105 | timesteps = len(out) 106 | 107 | # Write the spectrograms to disk: 108 | audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,) 109 | mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,) 110 | text_idx = '%s - %05d' % (text, chunk_idx,) 111 | np.save(os.path.join(out_dir, audio_filename), 112 | out.astype(out_dtype), allow_pickle=False) 113 | np.save(os.path.join(out_dir, mel_filename), 114 | mel_spectrogram.astype(np.float32), allow_pickle=False) 115 | 116 | # Add results tuple describing this training example: 117 | tup_results.append( (audio_filename, mel_filename, timesteps, text_idx) ) 118 | 119 | # Return all the audio results tuples (unpack in caller) 120 | return tup_results 121 | 122 | -------------------------------------------------------------------------------- /ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | from hparams import hparams 15 | 16 | 17 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 18 | executor = ProcessPoolExecutor(max_workers=num_workers) 19 | futures = [] 20 | index = 1 21 | 
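    # (Added note, not part of the original file) LJSpeech's metadata.csv is assumed
    # to be pipe-separated with one utterance per line, roughly:
    #   LJ001-0001|raw transcript|normalized transcript
    # so parts[0] below is the wav file id and parts[2] is the transcript kept as `text`.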
with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 22 | for line in f: 23 | parts = line.strip().split('|') 24 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 25 | text = parts[2] 26 | futures.append(executor.submit( 27 | partial(_process_utterance, out_dir, index, wav_path, text))) 28 | index += 1 29 | return [future.result() for future in tqdm(futures)] 30 | 31 | 32 | def _process_utterance(out_dir, index, wav_path, text): 33 | # Load the audio to a numpy array: 34 | wav = audio.load_wav(wav_path) 35 | 36 | if hparams.rescaling: 37 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 38 | 39 | # Mu-law quantize 40 | if is_mulaw_quantize(hparams.input_type): 41 | # [0, quantize_channels) 42 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 43 | 44 | # Trim silences 45 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 46 | wav = wav[start:end] 47 | out = out[start:end] 48 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 49 | out_dtype = np.int16 50 | elif is_mulaw(hparams.input_type): 51 | # [-1, 1] 52 | out = P.mulaw(wav, hparams.quantize_channels) 53 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 54 | out_dtype = np.float32 55 | else: 56 | # [-1, 1] 57 | out = wav 58 | constant_values = 0.0 59 | out_dtype = np.float32 60 | 61 | # Compute a mel-scale spectrogram from the trimmed wav: 62 | # (N, D) 63 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 64 | # lws pads zeros internally before performing stft 65 | # this is needed to adjust time resolution between audio and mel-spectrogram 66 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 67 | 68 | # zero pad for quantized signal 69 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 70 | N = mel_spectrogram.shape[0] 71 | assert len(out) >= N * audio.get_hop_size() 72 | 73 | # time resolution adjustment 74 | # ensure length of raw audio is multiple of hop_size so that we can use 75 | # transposed convolution to upsample 76 | out = out[:N * audio.get_hop_size()] 77 | assert len(out) % audio.get_hop_size() == 0 78 | 79 | timesteps = len(out) 80 | 81 | # Write the spectrograms to disk: 82 | audio_filename = 'ljspeech-audio-%05d.npy' % index 83 | mel_filename = 'ljspeech-mel-%05d.npy' % index 84 | np.save(os.path.join(out_dir, audio_filename), 85 | out.astype(out_dtype), allow_pickle=False) 86 | np.save(os.path.join(out_dir, mel_filename), 87 | mel_spectrogram.astype(np.float32), allow_pickle=False) 88 | 89 | # Return a tuple describing this training example: 90 | return (audio_filename, mel_filename, timesteps, text) 91 | -------------------------------------------------------------------------------- /lrschedule.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329 5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 6 | # Noam scheme from tensor2tensor: 7 | warmup_steps = float(warmup_steps) 8 | step = global_step + 1. 
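    # (Added note) Worked example with the defaults used in this repo
    # (init_lr=1e-3, warmup_steps=4000): the two terms inside np.minimum below
    # intersect at step == warmup_steps, so the learning rate ramps up linearly
    # to init_lr over the first 4000 steps and then decays proportionally to step**-0.5.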
9 | lr = init_lr * warmup_steps**0.5 * np.minimum( 10 | step * warmup_steps**-1.5, step**-0.5) 11 | return lr 12 | 13 | 14 | def step_learning_rate_decay(init_lr, global_step, 15 | anneal_rate=0.98, 16 | anneal_interval=30000): 17 | return init_lr * anneal_rate ** (global_step // anneal_interval) 18 | 19 | 20 | def step_learning_rate(init_lr, global_step, 21 | gamma=0.5, 22 | step_size=100000): 23 | return init_lr * pow(gamma,int(global_step/step_size)) 24 | 25 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 26 | """Cyclic cosine annealing 27 | 28 | https://arxiv.org/pdf/1704.00109.pdf 29 | 30 | Args: 31 | init_lr (float): Initial learning rate 32 | global_step (int): Current iteration number 33 | T (int): Total iteration number (i,e. nepoch) 34 | M (int): Number of ensembles we want 35 | 36 | Returns: 37 | float: Annealed learning rate 38 | """ 39 | TdivM = T // M 40 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) 41 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Preprocess dataset 4 | 5 | usage: preprocess.py [options] 6 | 7 | options: 8 | --num_workers= Num workers. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | -h, --help Show help message. 12 | """ 13 | from docopt import docopt 14 | import os 15 | from multiprocessing import cpu_count 16 | from tqdm import tqdm 17 | import importlib 18 | from hparams import hparams 19 | 20 | 21 | def preprocess(mod, in_dir, out_root, num_workers): 22 | os.makedirs(out_dir, exist_ok=True) 23 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm) 24 | write_metadata(metadata, out_dir) 25 | 26 | 27 | def write_metadata(metadata, out_dir): 28 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 29 | for m in metadata: 30 | f.write('|'.join([str(x) for x in m]) + '\n') 31 | frames = sum([m[2] for m in metadata]) 32 | sr = hparams.sample_rate 33 | hours = frames / sr / 3600 34 | print('Wrote %d utterances, %d time steps (%.2f hours)' % (len(metadata), frames, hours)) 35 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 36 | print('Max output length: %d' % max(m[2] for m in metadata)) 37 | 38 | 39 | if __name__ == "__main__": 40 | # args = docopt(__doc__) 41 | name = 'ljspeech' # args[""] 42 | in_dir = '/home/jinqiangzeng/work/data/speech/ljspeech/LJSpeech-1.0' # args[""] 43 | out_dir = './data/{}'.format(name)#args[""] 44 | num_workers = None #args["--num_workers"] 45 | num_workers = cpu_count() if num_workers is None else int(num_workers) 46 | preset = '/home/jinqiangzeng/work/mypycharm/wavenet/wavenet_vocoder/presets/ljspeech_gaussian.json'#args["--preset"] 47 | 48 | # Load preset if specified 49 | if preset is not None: 50 | with open(preset) as f: 51 | hparams.parse_json(f.read()) 52 | # Override hyper parameters 53 | # hparams.parse(args["--hparams"]) 54 | assert hparams.name == "wavenet_vocoder" 55 | 56 | print("Sampling frequency: {}".format(hparams.sample_rate)) 57 | 58 | assert name in ["cmu_arctic", "ljspeech", "librivox", ] 59 | mod = importlib.import_module(name) 60 | preprocess(mod, in_dir, out_dir, num_workers) 61 | -------------------------------------------------------------------------------- /presets/cmu_arctic_8bit.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "mulaw-quantize", 5 | "quantize_channels": 256, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 256, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": false, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /presets/ljspeech_gaussian.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "clari", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "output_type": "Gaussian", 6 | "quantize_channels": 65536, 7 | "sample_rate": 22050, 8 | "silence_threshold": 2, 9 | "num_mels": 80, 10 | "fmin": 125, 11 | "fmax": 7600, 12 | "fft_size": 1024, 13 | "hop_size": 256, 14 | "frame_shift_ms": null, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": true, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "log_scale_min": -32.23619130191664, 21 | "out_channels": 2, 22 | "layers": 20, 23 | "stacks": 2, 24 | "residual_channels": 128, 25 | "gate_channels": 256, 26 | "skip_out_channels": 128, 27 | "dropout": 0.050000000000000044, 28 | "kernel_size": 3, 29 | "weight_normalization": true, 30 | "cin_channels": 80, 31 | "upsample_conditional_features": true, 32 | "upsample_scales": [ 33 | 4, 34 | 4, 35 | 4, 36 | 4 37 | ], 38 | "upsample_size": [ 39 | [ 40 | 30, 41 | 3 42 | ], 43 | [ 44 | 40, 45 | 3 46 | ] 47 | ], 48 | "freq_axis_kernel_size": 3, 49 | "gin_channels": -1, 50 | "n_speakers": 7, 51 | "pin_memory": true, 52 | "num_workers": 4, 53 | "test_size": 0.0441, 54 | "test_num_samples": null, 55 | "random_state": 1234, 56 | "batch_size": 4, 57 | "adam_beta1": 0.9, 58 | "adam_beta2": 0.999, 59 | "adam_eps": 1e-08, 60 | "initial_learning_rate": 0.001, 61 | "lr_schedule": "step_learning_rate", 62 | "lr_schedule_kwargs": {}, 63 | "nepochs": 2000, 64 | "weight_decay": 0.0, 65 | "clip_thresh": -1, 66 | "max_time_sec": null, 67 | "max_time_steps": 10000, 68 | "exponential_moving_average": true, 69 | "ema_decay": 0.9999, 70 | "checkpoint_interval": 3000, 71 | "train_eval_interval": 3000, 72 | "test_eval_epoch_interval": 5, 73 | 
"save_optimizer_state": true, 74 | "student_out_channels": 2, 75 | "student_layers": 60, 76 | "student_stacks": 6, 77 | "student_residual_channels": 128, 78 | "student_skip_channels": 128, 79 | "iaf_layer_sizes": [ 80 | 10, 81 | 10, 82 | 10, 83 | 10, 84 | 10, 85 | 10 86 | ], 87 | "student_gate_channels": 128, 88 | "use_scale": false, 89 | "iaf_shift": false, 90 | "use_skip":true, 91 | "share_condition_net":true 92 | } 93 | -------------------------------------------------------------------------------- /presets/ljspeech_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 22050, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 4, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 12, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.0002, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true, 63 | "student_out_channels": 2, 64 | "student_layers": 60, 65 | "student_stacks": 6, 66 | "student_residual_channels": 64, 67 | "iaf_layer_sizes": [10, 10, 10, 30], 68 | "student_gate_channels": 64, 69 | "use_scale": false 70 | } 71 | -------------------------------------------------------------------------------- /presets/multispeaker_cmu_arctic_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 
| 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": 16, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for Pypi release 4 | # 0. Make sure you are on git tag 5 | # 1. Run the script 6 | # 2. Upload sdist 7 | 8 | set -e 9 | 10 | script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd) 11 | cd $script_dir 12 | 13 | TAG=$(git describe --exact-match --tags HEAD) 14 | 15 | VERSION=${TAG/v/} 16 | 17 | WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py develop sdist 18 | echo "*** Ready to release! deepvoice3_pytorch $TAG ***" 19 | echo "Please run the following command manually:" 20 | echo WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py sdist upload 21 | echo "Please make sure that release verion is correct." 22 | cat wavenet_vocoder/version.py 23 | -------------------------------------------------------------------------------- /resyn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaiFengZeng/clari_wavenet_vocoder/c1c290237898f17f3006b6ecbd4bad3d61d631a8/resyn.wav -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import setuptools.command.develop 5 | import setuptools.command.build_py 6 | import os 7 | import subprocess 8 | 9 | version = '0.0.4' 10 | 11 | # Adapted from https://github.com/pytorch/pytorch 12 | cwd = os.path.dirname(os.path.abspath(__file__)) 13 | if os.getenv('WAVENET_VOCODER_BUILD_VERSION'): 14 | version = os.getenv('WAVENET_VOCODER_BUILD_VERSION') 15 | else: 16 | try: 17 | sha = subprocess.check_output( 18 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 19 | version += '+' + sha[:7] 20 | except subprocess.CalledProcessError: 21 | pass 22 | except IOError: # FileNotFoundError for python 3 23 | pass 24 | 25 | 26 | class build_py(setuptools.command.build_py.build_py): 27 | 28 | def run(self): 29 | self.create_version_file() 30 | setuptools.command.build_py.build_py.run(self) 31 | 32 | @staticmethod 33 | def create_version_file(): 34 | global version, cwd 35 | print('-- Building version ' + version) 36 | version_path = os.path.join(cwd, 'wavenet_vocoder', 'version.py') 37 | with open(version_path, 'w') as f: 38 | f.write("__version__ = '{}'\n".format(version)) 39 | 40 | 41 | class develop(setuptools.command.develop.develop): 42 | 43 | def run(self): 44 | build_py.create_version_file() 45 | setuptools.command.develop.develop.run(self) 46 | 47 | 48 | setup(name='wavenet_vocoder', 49 | version=version, 50 | 
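      # (Added note) With the extras_require section further below, the optional training
      # and test dependencies can be pulled in via standard setuptools extras syntax,
      # e.g. `pip install -e ".[train]"` or `pip install -e ".[test]"`; the exact
      # environment setup is left to the reader.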
description='PyTorch implementation of WaveNet vocoder', 51 | packages=find_packages(), 52 | cmdclass={ 53 | 'build_py': build_py, 54 | 'develop': develop, 55 | }, 56 | install_requires=[ 57 | "numpy", 58 | "scipy", 59 | "torch >= 0.3.0", 60 | "deepvoice3_pytorch >= 0.0.2", 61 | ], 62 | extras_require={ 63 | "train": [ 64 | "docopt", 65 | "tqdm", 66 | "tensorboardX", 67 | "nnmnkwii >= 0.0.11", 68 | "keras", 69 | "scikit-learn", 70 | ], 71 | "test": [ 72 | "nose", 73 | "pysptk >= 0.1.9", 74 | "librosa", 75 | "matplotlib", 76 | "tqdm", 77 | "nnmnkwii >= 0.0.11", 78 | ], 79 | }) 80 | -------------------------------------------------------------------------------- /synthesis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Synthesis waveform from trained WaveNet. 3 | 4 | usage: synthesis.py [options] 5 | 6 | options: 7 | --hparams= Hyper parameters [default: ]. 8 | --preset= Path of preset parameters (json). 9 | --length= Steps to generate [default: 32000]. 10 | --initial-value= Initial value for the WaveNet decoder. 11 | --conditional=

Conditional features path. 12 | --file-name-suffix= File name suffix [default: ]. 13 | --speaker-id= Speaker ID (for multi-speaker model). 14 | --output-html Output html for blog post. 15 | -h, --help Show help message. 16 | """ 17 | from docopt import docopt 18 | 19 | import sys 20 | import os 21 | from os.path import dirname, join, basename, splitext 22 | import torch 23 | from torch.autograd import Variable 24 | import numpy as np 25 | from nnmnkwii import preprocessing as P 26 | from keras.utils import np_utils 27 | from tqdm import tqdm 28 | import librosa 29 | 30 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 31 | 32 | import audio 33 | from hparams import hparams 34 | 35 | torch.set_num_threads(4) 36 | use_cuda = torch.cuda.is_available() 37 | 38 | 39 | def _to_numpy(x): 40 | # this is ugly 41 | if x is None: 42 | return None 43 | if isinstance(x, np.ndarray) or np.isscalar(x): 44 | return x 45 | # remove batch axis 46 | if x.dim() == 3: 47 | x = x.squeeze(0) 48 | return x.numpy() 49 | 50 | 51 | def wavegen(model, length=None, c=None, g=None, initial_value=None, 52 | fast=False, tqdm=tqdm): 53 | """Generate waveform samples by WaveNet. 54 | 55 | Args: 56 | model (nn.Module) : WaveNet decoder 57 | length (int): Time steps to generate. If conditinlal features are given, 58 | then this is determined by the feature size. 59 | c (numpy.ndarray): Conditional features, of shape T x C 60 | g (scaler): Speaker ID 61 | initial_value (int) : initial_value for the WaveNet decoder. 62 | fast (Bool): Whether to remove weight normalization or not. 63 | tqdm (lambda): tqdm 64 | 65 | Returns: 66 | numpy.ndarray : Generated waveform samples 67 | """ 68 | from train import sanity_check 69 | sanity_check(model, c, g) 70 | 71 | c = _to_numpy(c) 72 | g = _to_numpy(g) 73 | 74 | if use_cuda: 75 | model = model.cuda() 76 | model.eval() 77 | if fast: 78 | model.make_generation_fast_() 79 | 80 | if c is None: 81 | assert length is not None 82 | else: 83 | # (Tc, D) 84 | if c.ndim != 2: 85 | raise RuntimeError( 86 | "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape)) 87 | assert c.ndim == 2 88 | Tc = c.shape[0] 89 | upsample_factor = audio.get_hop_size() 90 | # Overwrite length according to feature size 91 | length = Tc * upsample_factor 92 | # (Tc, D) -> (Tc', D) 93 | # Repeat features before feeding it to the network 94 | if not hparams.upsample_conditional_features: 95 | c = np.repeat(c, upsample_factor, axis=0) 96 | 97 | # B x C x T 98 | c = Variable(torch.FloatTensor(c.T).unsqueeze(0)) 99 | 100 | if initial_value is None: 101 | if is_mulaw_quantize(hparams.input_type): 102 | initial_value = P.mulaw_quantize(0, hparams.quantize_channels) 103 | else: 104 | initial_value = 0.0 105 | 106 | if is_mulaw_quantize(hparams.input_type): 107 | assert 0 <= initial_value < hparams.quantize_channels 108 | initial_input = np_utils.to_categorical( 109 | initial_value, num_classes=hparams.quantize_channels).astype(np.float32) 110 | initial_input = Variable(torch.from_numpy(initial_input)).view( 111 | 1, 1, hparams.quantize_channels) 112 | else: 113 | initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value) 114 | 115 | g = None if g is None else Variable(torch.LongTensor([g])) 116 | if use_cuda: 117 | initial_input = initial_input.cuda() 118 | g = None if g is None else g.cuda() 119 | c = None if c is None else c.cuda() 120 | 121 | y_hat = model.incremental_forward( 122 | initial_input, c=c, g=g, T=length, 
tqdm=tqdm, softmax=True, quantize=True, 123 | log_scale_min=hparams.log_scale_min) 124 | 125 | if is_mulaw_quantize(hparams.input_type): 126 | y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() 127 | y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) 128 | elif is_mulaw(hparams.input_type): 129 | y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) 130 | else: 131 | y_hat = y_hat.view(-1).cpu().data.numpy() 132 | 133 | return y_hat 134 | 135 | 136 | if __name__ == "__main__": 137 | args = docopt(__doc__) 138 | print("Command line args:\n", args) 139 | checkpoint_path = args[""] 140 | dst_dir = args[""] 141 | 142 | length = int(args["--length"]) 143 | initial_value = args["--initial-value"] 144 | initial_value = None if initial_value is None else float(initial_value) 145 | conditional_path = args["--conditional"] 146 | file_name_suffix = args["--file-name-suffix"] 147 | output_html = args["--output-html"] 148 | speaker_id = args["--speaker-id"] 149 | speaker_id = None if speaker_id is None else int(speaker_id) 150 | preset = args["--preset"] 151 | 152 | # Load preset if specified 153 | if preset is not None: 154 | with open(preset) as f: 155 | hparams.parse_json(f.read()) 156 | # Override hyper parameters 157 | hparams.parse(args["--hparams"]) 158 | assert hparams.name == "wavenet_vocoder" 159 | 160 | # Load conditional features 161 | if conditional_path is not None: 162 | c = np.load(conditional_path) 163 | else: 164 | c = None 165 | 166 | from train import build_model 167 | 168 | # Model 169 | model = build_model() 170 | 171 | # Load checkpoint 172 | print("Load checkpoint from {}".format(checkpoint_path)) 173 | 174 | if use_cuda: 175 | checkpoint = torch.load(checkpoint_path) 176 | else: 177 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 178 | model.load_state_dict(checkpoint["state_dict"]) 179 | 180 | checkpoint_name = splitext(basename(checkpoint_path))[0] 181 | 182 | wav_id = conditional_path.split("/")[-1].split(".")[0].split("-")[-1] 183 | dataset_name = conditional_path.split("/")[-1].split(".")[0].split("-")[0] 184 | save_dir = join(dst_dir, checkpoint_name, dataset_name) 185 | os.makedirs(save_dir, exist_ok=True) 186 | 187 | dst_wav_path = join(save_dir, "{}{}.wav".format(wav_id, file_name_suffix)) 188 | 189 | # DO generate 190 | waveform = wavegen(model, length, c=c, g=speaker_id, initial_value=initial_value, fast=True) 191 | 192 | # save 193 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 194 | 195 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 196 | sys.exit(0) 197 | -------------------------------------------------------------------------------- /synthesis_student.py: -------------------------------------------------------------------------------- 1 | """ 2 | Synthesis waveform from trained WaveNet. 3 | 4 | usage: synthesis.py [options] 5 | 6 | options: 7 | --hparams= Hyper parameters [default: ]. 8 | --preset= Path of preset parameters (json). 9 | --length= Steps to generate [default: 32000]. 10 | --initial-value= Initial value for the WaveNet decoder. 11 | --conditional=

Conditional features path. 12 | --file-name-suffix= File name suffix [default: ]. 13 | --speaker-id= Speaker ID (for multi-speaker model). 14 | --output-html Output html for blog post. 15 | -h, --help Show help message. 16 | """ 17 | from docopt import docopt 18 | 19 | import sys 20 | import os 21 | from os.path import dirname, join, basename, splitext 22 | import torch 23 | from torch.autograd import Variable 24 | import numpy as np 25 | from nnmnkwii import preprocessing as P 26 | from keras.utils import np_utils 27 | from tqdm import tqdm 28 | import librosa 29 | 30 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 31 | 32 | import audio 33 | from hparams import hparams 34 | 35 | torch.set_num_threads(1) 36 | # use_cuda = torch.cuda.is_available() 37 | use_cuda = False 38 | 39 | 40 | def _to_numpy(x): 41 | # this is ugly 42 | if x is None: 43 | return None 44 | if isinstance(x, np.ndarray) or np.isscalar(x): 45 | return x 46 | # remove batch axis 47 | if x.dim() == 3: 48 | x = x.squeeze(0) 49 | return x.numpy() 50 | 51 | 52 | def wavegen(model, length=None, c=None, g=None, tqdm=tqdm): 53 | """Generate waveform samples by WaveNet. 54 | 55 | Args: 56 | model (nn.Module) : WaveNet decoder 57 | length (int): Time steps to generate. If conditinlal features are given, 58 | then this is determined by the feature size. 59 | c (numpy.ndarray): Conditional features, of shape T x C 60 | g (scaler): Speaker ID 61 | initial_value (int) : initial_value for the WaveNet decoder. 62 | fast (Bool): Whether to remove weight normalization or not. 63 | tqdm (lambda): tqdm 64 | 65 | Returns: 66 | numpy.ndarray : Generated waveform samples 67 | """ 68 | from train import sanity_check 69 | sanity_check(model, c, g) 70 | 71 | c = _to_numpy(c) 72 | g = _to_numpy(g) 73 | 74 | if use_cuda: 75 | model = model.cuda() 76 | model.eval() 77 | 78 | if c is None: 79 | assert length is not None 80 | else: 81 | # (Tc, D) 82 | if c.ndim != 2: 83 | raise RuntimeError( 84 | "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape)) 85 | assert c.ndim == 2 86 | Tc = c.shape[0] 87 | upsample_factor = audio.get_hop_size() 88 | # Overwrite length according to feature size 89 | length = Tc * upsample_factor 90 | # (Tc, D) -> (Tc', D) 91 | # Repeat features before feeding it to the network 92 | if not hparams.upsample_conditional_features: 93 | c = np.repeat(c, upsample_factor, axis=0) 94 | 95 | # B x C x T 96 | c = Variable(torch.FloatTensor(c.T).unsqueeze(0)) 97 | 98 | # if initial_value is None: 99 | # if is_mulaw_quantize(hparams.input_type): 100 | # initial_value = P.mulaw_quantize(0, hparams.quantize_channels) 101 | # else: 102 | # initial_value = 0.0 103 | # 104 | # if is_mulaw_quantize(hparams.input_type): 105 | # assert 0 <= initial_value < hparams.quantize_channels 106 | # initial_input = np_utils.to_categorical( 107 | # initial_value, num_classes=hparams.quantize_channels).astype(np.float32) 108 | # initial_input = Variable(torch.from_numpy(initial_input)).view( 109 | # 1, 1, hparams.quantize_channels) 110 | # else: 111 | # initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value) 112 | # 113 | # g = None if g is None else Variable(torch.LongTensor([g])) 114 | # if use_cuda: 115 | # initial_input = initial_input.cuda() 116 | # g = None if g is None else g.cuda() 117 | # c = None if c is None else c.cuda() 118 | 119 | # y_hat = model.incremental_forward( 120 | # initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, 
quantize=True, 121 | # log_scale_min=hparams.log_scale_min) 122 | 123 | with torch.no_grad(): 124 | y_student, _, _ = model(None, c, g, softmax=False, use_cuda=use_cuda) 125 | y_student = y_student.view(-1).cpu().data.numpy() 126 | return y_student 127 | 128 | # if is_mulaw_quantize(hparams.input_type): 129 | # y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() 130 | # y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) 131 | # elif is_mulaw(hparams.input_type): 132 | # y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) 133 | # else: 134 | # y_hat = y_hat.view(-1).cpu().data.numpy() 135 | 136 | 137 | if __name__ == "__main__": 138 | args = docopt(__doc__) 139 | print("Command line args:\n", args) 140 | checkpoint_path = args[""] 141 | dst_dir = args[""] 142 | 143 | length = int(args["--length"]) 144 | initial_value = args["--initial-value"] 145 | initial_value = None if initial_value is None else float(initial_value) 146 | conditional_path = args["--conditional"] 147 | file_name_suffix = args["--file-name-suffix"] 148 | output_html = args["--output-html"] 149 | speaker_id = args["--speaker-id"] 150 | speaker_id = None if speaker_id is None else int(speaker_id) 151 | preset = args["--preset"] 152 | 153 | # Load preset if specified 154 | if preset is not None: 155 | with open(preset) as f: 156 | hparams.parse_json(f.read()) 157 | # Override hyper parameters 158 | hparams.parse(args["--hparams"]) 159 | assert hparams.name == "wavenet_vocoder" 160 | 161 | # Load conditional features 162 | if conditional_path is not None: 163 | c = np.load(conditional_path) 164 | else: 165 | c = None 166 | 167 | from train_student import build_model 168 | 169 | # Model 170 | model = build_model(name="student") 171 | # Load checkpoint 172 | print("Load checkpoint from {}".format(checkpoint_path)) 173 | 174 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 175 | model.load_state_dict(checkpoint["state_dict"]) 176 | 177 | checkpoint_name = splitext(basename(checkpoint_path))[0] 178 | 179 | wav_id = conditional_path.split("/")[-1].split(".")[0].split("-")[-1] 180 | dataset_name = conditional_path.split("/")[-1].split(".")[0].split("-")[0] 181 | save_dir = join(dst_dir, checkpoint_name + "_student", dataset_name) 182 | os.makedirs(save_dir, exist_ok=True) 183 | 184 | dst_wav_path = join(save_dir, "{}{}.wav".format(wav_id, file_name_suffix)) 185 | 186 | # DO generate 187 | waveform = wavegen(model, length, c=c, g=speaker_id) 188 | 189 | # save 190 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 191 | 192 | print("Finished! 
Check out {} for generated audio samples.".format(dst_dir)) 193 | sys.exit(0) 194 | -------------------------------------------------------------------------------- /tests/test_audio.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join 6 | sys.path.insert(0, join(dirname(__file__), "..")) 7 | 8 | import numpy as np 9 | from nose.plugins.attrib import attr 10 | 11 | import logging 12 | logging.getLogger('tensorflow').disabled = True 13 | 14 | 15 | @attr("local_only") 16 | def test_amp_to_db(): 17 | import audio 18 | x = np.random.rand(10) 19 | x_hat = audio._db_to_amp(audio._amp_to_db(x)) 20 | assert np.allclose(x, x_hat) 21 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | from wavenet_vocoder import receptive_field_size 5 | 6 | 7 | def test_receptive_field_size(): 8 | # Table 4 in https://arxiv.org/abs/1711.10433 9 | assert receptive_field_size(total_layers=30, num_cycles=3, kernel_size=3) == 6139 10 | assert receptive_field_size(total_layers=24, num_cycles=4, kernel_size=3) == 505 11 | assert receptive_field_size(total_layers=12, num_cycles=2, kernel_size=3) == 253 12 | assert receptive_field_size(total_layers=30, num_cycles=1, 13 | kernel_size=3, dilation=lambda x: 1) == 61 14 | -------------------------------------------------------------------------------- /tests/test_mixture.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch.autograd import Variable 8 | from torch.nn import functional as F 9 | 10 | import librosa 11 | import pysptk 12 | 13 | from wavenet_vocoder.mixture import discretized_mix_logistic_loss 14 | from wavenet_vocoder.mixture import sample_from_discretized_mix_logistic 15 | 16 | 17 | def log_prob_from_logits(x): 18 | """ numerically stable log_softmax implementation that prevents overflow """ 19 | # TF ordering 20 | axis = len(x.size()) - 1 21 | m, _ = torch.max(x, dim=-1, keepdim=True) 22 | return x - m - torch.log(torch.sum(torch.exp(x - m), dim=axis, keepdim=True)) 23 | 24 | 25 | def test_log_softmax(): 26 | x = Variable(torch.rand(2, 16000, 30)) 27 | y = log_prob_from_logits(x) 28 | y_hat = F.log_softmax(x, -1) 29 | 30 | y = y.data.cpu().numpy() 31 | y_hat = y_hat.data.cpu().numpy() 32 | assert np.allclose(y, y_hat) 33 | 34 | 35 | def test_mixture(): 36 | np.random.seed(1234) 37 | 38 | x, sr = librosa.load(pysptk.util.example_audio_file(), sr=None) 39 | assert sr == 16000 40 | 41 | T = len(x) 42 | x = x.reshape(1, T, 1) 43 | y = Variable(torch.from_numpy(x)).float() 44 | y_hat = Variable(torch.rand(1, 30, T)).float() 45 | 46 | print(y.shape, y_hat.shape) 47 | 48 | loss = discretized_mix_logistic_loss(y_hat, y) 49 | print(loss) 50 | 51 | loss = discretized_mix_logistic_loss(y_hat, y, reduce=False) 52 | print(loss.size(), y.size()) 53 | assert loss.size() == y.size() 54 | 55 | y = sample_from_discretized_mix_logistic(y_hat) 56 | print(y.shape) 57 | 58 | 59 | def test_misc(): 60 | # https://en.wikipedia.org/wiki/Logistic_distribution 61 | # what 
i have learned 62 | # m = (x - mu) / s 63 | m = Variable(torch.rand(10, 10)) 64 | log_pdf_mid1 = -2 * torch.log(torch.exp(m / 2) + torch.exp(-m / 2)) 65 | log_pdf_mid2 = m - 2 * F.softplus(m) 66 | assert np.allclose(log_pdf_mid1.data.numpy(), log_pdf_mid2.data.numpy()) 67 | 68 | # Edge case for 0 69 | plus_in = Variable(torch.rand(10, 10)) 70 | log_cdf_plus1 = F.sigmoid(m).log() 71 | log_cdf_plus2 = m - F.softplus(m) 72 | assert np.allclose(log_cdf_plus1.data.numpy(), log_cdf_plus2.data.numpy()) 73 | 74 | # Edge case for 255 75 | min_in = Variable(torch.rand(10, 10)) 76 | log_one_minus_cdf_min1 = (1 - F.sigmoid(min_in)).log() 77 | log_one_minus_cdf_min2 = -F.softplus(min_in) 78 | assert np.allclose(log_one_minus_cdf_min1.data.numpy(), log_one_minus_cdf_min2.data.numpy()) 79 | -------------------------------------------------------------------------------- /tests/test_model.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | from torch.nn import functional as F 8 | from nnmnkwii import preprocessing as P 9 | from pysptk.util import example_audio_file 10 | import librosa 11 | import numpy as np 12 | from tqdm import tqdm 13 | from os.path import join, dirname, exists 14 | from functools import partial 15 | from nose.plugins.attrib import attr 16 | 17 | from wavenet_vocoder.modules import ResidualConv1dGLU 18 | from wavenet_vocoder import WaveNet 19 | 20 | use_cuda = False 21 | 22 | # For test 23 | build_compact_model = partial(WaveNet, layers=4, stacks=2, residual_channels=32, 24 | gate_channels=32, skip_out_channels=32, 25 | scalar_input=False) 26 | 27 | # https://github.com/keras-team/keras/blob/master/keras/utils/np_utils.py 28 | # copied to avoid keras dependency in tests 29 | 30 | 31 | def to_categorical(y, num_classes=None): 32 | """Converts a class vector (integers) to binary class matrix. 33 | E.g. for use with categorical_crossentropy. 34 | # Arguments 35 | y: class vector to be converted into a matrix 36 | (integers from 0 to num_classes). 37 | num_classes: total number of classes. 38 | # Returns 39 | A binary matrix representation of the input. 
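    # Example
        For illustration (hypothetical values):
        >>> to_categorical([0, 2, 1], num_classes=3)
        array([[1., 0., 0.],
               [0., 0., 1.],
               [0., 1., 0.]])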
40 | """ 41 | y = np.array(y, dtype='int') 42 | input_shape = y.shape 43 | if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: 44 | input_shape = tuple(input_shape[:-1]) 45 | y = y.ravel() 46 | if not num_classes: 47 | num_classes = np.max(y) + 1 48 | n = y.shape[0] 49 | categorical = np.zeros((n, num_classes)) 50 | categorical[np.arange(n), y] = 1 51 | output_shape = input_shape + (num_classes,) 52 | categorical = np.reshape(categorical, output_shape) 53 | return categorical 54 | 55 | 56 | def test_conv_block(): 57 | conv = ResidualConv1dGLU(30, 30, kernel_size=3, dropout=1 - 0.95) 58 | print(conv) 59 | x = Variable(torch.zeros(16, 30, 16000)) 60 | y, h = conv(x) 61 | print(y.size(), h.size()) 62 | 63 | 64 | def test_wavenet(): 65 | model = build_compact_model() 66 | print(model) 67 | x = Variable(torch.zeros(16, 256, 1000)) 68 | y = model(x) 69 | print(y.size()) 70 | 71 | 72 | def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True): 73 | x, _ = librosa.load(example_audio_file(), sr=sr) 74 | x, _ = librosa.effects.trim(x, top_db=15) 75 | 76 | # To save computational cost 77 | x = x[:N] 78 | 79 | # For power conditioning wavenet 80 | if returns_power: 81 | # (1 x N') 82 | p = librosa.feature.rmse(x, frame_length=256, hop_length=128) 83 | upsample_factor = x.size // p.size 84 | # (1 x N) 85 | p = np.repeat(p, upsample_factor, axis=-1) 86 | if p.size < x.size: 87 | # pad against time axis 88 | p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant", constant_values=0) 89 | 90 | # shape adajst 91 | p = p.reshape(1, 1, -1) 92 | 93 | # (T,) 94 | if mulaw: 95 | x = P.mulaw_quantize(x) 96 | x_org = P.inv_mulaw_quantize(x) 97 | # (C, T) 98 | x = to_categorical(x, num_classes=256).T 99 | # (1, C, T) 100 | x = x.reshape(1, 256, -1).astype(np.float32) 101 | else: 102 | x_org = x 103 | x = x.reshape(1, 1, -1) 104 | 105 | if returns_power: 106 | return x, x_org, p 107 | 108 | return x, x_org 109 | 110 | 111 | @attr("mixture") 112 | def test_mixture_wavenet(): 113 | x, x_org, c = _test_data(returns_power=True, mulaw=False) 114 | # 10 mixtures 115 | model = build_compact_model(out_channels=3 * 10, cin_channels=1, 116 | scalar_input=True) 117 | T = x.shape[-1] 118 | print(model.first_conv) 119 | 120 | # scalar input, not one-hot 121 | assert x.shape[1] == 1 122 | 123 | x = Variable(torch.from_numpy(x).contiguous()) 124 | x = x.cuda() if use_cuda else x 125 | 126 | c = Variable(torch.from_numpy(c).contiguous()) 127 | c = c.cuda() if use_cuda else c 128 | print(c.size()) 129 | 130 | model.eval() 131 | 132 | # Incremental forward with forced teaching 133 | y_online = model.incremental_forward( 134 | test_inputs=x, c=c, T=None, tqdm=tqdm) 135 | 136 | assert y_online.size() == x.size() 137 | 138 | y_online2 = model.incremental_forward( 139 | test_inputs=None, c=c, T=T, tqdm=tqdm) 140 | 141 | assert y_online2.size() == x.size() 142 | print(x.size()) 143 | 144 | 145 | @attr("local_conditioning") 146 | def test_local_conditioning_correctness(): 147 | # condition by power 148 | x, x_org, c = _test_data(returns_power=True) 149 | model = build_compact_model(cin_channels=1) 150 | assert model.local_conditioning_enabled() 151 | assert not model.has_speaker_embedding() 152 | 153 | x = Variable(torch.from_numpy(x).contiguous()) 154 | x = x.cuda() if use_cuda else x 155 | 156 | c = Variable(torch.from_numpy(c).contiguous()) 157 | c = c.cuda() if use_cuda else c 158 | print(x.size(), c.size()) 159 | 160 | model.eval() 161 | 162 | y_offline = model(x, c=c, softmax=True) 163 | 164 | # 
Incremental forward with forced teaching 165 | y_online = model.incremental_forward( 166 | test_inputs=x, c=c, T=None, tqdm=tqdm, softmax=True, quantize=False) 167 | 168 | # (1 x C x T) 169 | c = (y_offline - y_online).abs() 170 | print(c.mean(), c.max()) 171 | 172 | try: 173 | assert np.allclose(y_offline.cpu().data.numpy(), 174 | y_online.cpu().data.numpy(), atol=1e-4) 175 | except: 176 | from warnings import warn 177 | warn("oops! must be a bug!") 178 | 179 | 180 | @attr("local_conditioning") 181 | def test_local_conditioning_upsample_correctness(): 182 | # condition by power 183 | x, x_org, c = _test_data(returns_power=True) 184 | 185 | # downsample by 4 186 | assert c.shape[-1] % 4 == 0 187 | c = c[:, :, 0::4] 188 | 189 | model = build_compact_model( 190 | cin_channels=1, upsample_conditional_features=True, 191 | upsample_scales=[2, 2]) 192 | assert model.local_conditioning_enabled() 193 | assert not model.has_speaker_embedding() 194 | 195 | x = Variable(torch.from_numpy(x).contiguous()) 196 | x = x.cuda() if use_cuda else x 197 | 198 | c = Variable(torch.from_numpy(c).contiguous()) 199 | c = c.cuda() if use_cuda else c 200 | print(x.size(), c.size()) 201 | 202 | model.eval() 203 | 204 | y_offline = model(x, c=c, softmax=True) 205 | 206 | # Incremental forward with forced teaching 207 | y_online = model.incremental_forward( 208 | test_inputs=x, c=c, T=None, tqdm=tqdm, softmax=True, quantize=False) 209 | 210 | # (1 x C x T) 211 | c = (y_offline - y_online).abs() 212 | print(c.mean(), c.max()) 213 | 214 | try: 215 | assert np.allclose(y_offline.cpu().data.numpy(), 216 | y_online.cpu().data.numpy(), atol=1e-4) 217 | except: 218 | from warnings import warn 219 | warn("oops! must be a bug!") 220 | 221 | 222 | @attr("global_conditioning") 223 | def test_global_conditioning_with_embedding_correctness(): 224 | # condition by mean power 225 | x, x_org, c = _test_data(returns_power=True) 226 | g = c.mean(axis=-1, keepdims=True).astype(np.int) 227 | model = build_compact_model(gin_channels=16, n_speakers=256, 228 | use_speaker_embedding=True) 229 | assert not model.local_conditioning_enabled() 230 | assert model.has_speaker_embedding() 231 | 232 | x = Variable(torch.from_numpy(x).contiguous()) 233 | x = x.cuda() if use_cuda else x 234 | 235 | g = Variable(torch.from_numpy(g).contiguous()) 236 | g = g.cuda() if use_cuda else g 237 | print(g.size()) 238 | 239 | model.eval() 240 | 241 | y_offline = model(x, g=g, softmax=True) 242 | 243 | # Incremental forward with forced teaching 244 | y_online = model.incremental_forward( 245 | test_inputs=x, g=g, T=None, tqdm=tqdm, softmax=True, quantize=False) 246 | 247 | # (1 x C x T) 248 | c = (y_offline - y_online).abs() 249 | print(c.mean(), c.max()) 250 | 251 | try: 252 | assert np.allclose(y_offline.cpu().data.numpy(), 253 | y_online.cpu().data.numpy(), atol=1e-4) 254 | except: 255 | from warnings import warn 256 | warn("oops! 
must be a bug!") 257 | 258 | 259 | @attr("global_conditioning") 260 | def test_global_conditioning_correctness(): 261 | # condition by mean power 262 | x, x_org, c = _test_data(returns_power=True) 263 | # must be floating-point type 264 | g = c.mean(axis=-1, keepdims=True).astype(np.float32) 265 | model = build_compact_model(gin_channels=1, use_speaker_embedding=False) 266 | assert not model.local_conditioning_enabled() 267 | # `use_speaker_embedding` False should diable embedding layer 268 | assert not model.has_speaker_embedding() 269 | 270 | x = Variable(torch.from_numpy(x).contiguous()) 271 | x = x.cuda() if use_cuda else x 272 | 273 | g = Variable(torch.from_numpy(g).contiguous()) 274 | g = g.cuda() if use_cuda else g 275 | print(g.size()) 276 | 277 | model.eval() 278 | y_offline = model(x, g=g, softmax=True) 279 | 280 | # Incremental forward with forced teaching 281 | y_online = model.incremental_forward( 282 | test_inputs=x, g=g, T=None, tqdm=tqdm, softmax=True, quantize=False) 283 | 284 | # (1 x C x T) 285 | c = (y_offline - y_online).abs() 286 | print(c.mean(), c.max()) 287 | 288 | try: 289 | assert np.allclose(y_offline.cpu().data.numpy(), 290 | y_online.cpu().data.numpy(), atol=1e-4) 291 | except: 292 | from warnings import warn 293 | warn("oops! must be a bug!") 294 | 295 | 296 | @attr("local_and_global_conditioning") 297 | def test_global_and_local_conditioning_correctness(): 298 | x, x_org, c = _test_data(returns_power=True) 299 | g = c.mean(axis=-1, keepdims=True).astype(np.int) 300 | model = build_compact_model(cin_channels=1, gin_channels=16, n_speakers=256) 301 | assert model.local_conditioning_enabled() 302 | assert model.has_speaker_embedding() 303 | 304 | x = Variable(torch.from_numpy(x).contiguous()) 305 | x = x.cuda() if use_cuda else x 306 | 307 | # per-sample power 308 | c = Variable(torch.from_numpy(c).contiguous()) 309 | c = c.cuda() if use_cuda else c 310 | 311 | # mean power 312 | g = Variable(torch.from_numpy(g).contiguous()) 313 | g = g.cuda() if use_cuda else g 314 | 315 | print(c.size(), g.size()) 316 | 317 | model.eval() 318 | 319 | y_offline = model(x, c=c, g=g, softmax=True) 320 | 321 | # Incremental forward with forced teaching 322 | y_online = model.incremental_forward( 323 | test_inputs=x, c=c, g=g, T=None, tqdm=tqdm, softmax=True, quantize=False) 324 | # (1 x C x T) 325 | 326 | c = (y_offline - y_online).abs() 327 | print(c.mean(), c.max()) 328 | 329 | try: 330 | assert np.allclose(y_offline.cpu().data.numpy(), 331 | y_online.cpu().data.numpy(), atol=1e-4) 332 | except: 333 | from warnings import warn 334 | warn("oops! 
must be a bug!") 335 | 336 | 337 | @attr("local_only") 338 | def test_incremental_forward_correctness(): 339 | import librosa.display 340 | from matplotlib import pyplot as plt 341 | 342 | model = build_compact_model() 343 | 344 | checkpoint_path = join(dirname(__file__), "..", "foobar/checkpoint_step000058000.pth") 345 | if exists(checkpoint_path): 346 | print("Loading from:", checkpoint_path) 347 | checkpoint = torch.load(checkpoint_path) 348 | model.load_state_dict(checkpoint["state_dict"]) 349 | 350 | if use_cuda: 351 | model = model.cuda() 352 | 353 | sr = 4000 354 | x, x_org = _test_data(sr=sr, N=3000) 355 | x = Variable(torch.from_numpy(x).contiguous()) 356 | x = x.cuda() if use_cuda else x 357 | 358 | model.eval() 359 | 360 | # Batch forward 361 | y_offline = model(x, softmax=True) 362 | 363 | # Test from zero start 364 | y_online = model.incremental_forward(initial_input=None, T=100, tqdm=tqdm, softmax=True) 365 | 366 | # Incremental forward with forced teaching 367 | y_online = model.incremental_forward(test_inputs=x, tqdm=tqdm, softmax=True, quantize=False) 368 | 369 | # (1 x C x T) 370 | c = (y_offline - y_online).abs() 371 | print(c.mean(), c.max()) 372 | 373 | try: 374 | assert np.allclose(y_offline.cpu().data.numpy(), 375 | y_online.cpu().data.numpy(), atol=1e-4) 376 | except: 377 | from warnings import warn 378 | warn("oops! must be a bug!") 379 | 380 | # (1, T, C) 381 | xt = x.transpose(1, 2).contiguous() 382 | 383 | initial_input = xt[:, 0, :].unsqueeze(1).contiguous() 384 | print(initial_input.size()) 385 | print("Inital value:", initial_input.view(-1).max(0)[1]) 386 | 387 | # With zero start 388 | zerostart = True 389 | if zerostart: 390 | y_inference = model.incremental_forward( 391 | initial_input=initial_input, T=xt.size(1), tqdm=tqdm, softmax=True, quantize=True) 392 | else: 393 | # Feed a few samples as test_inputs and then generate auto-regressively 394 | N = 1000 395 | y_inference = model.incremental_forward( 396 | initial_input=None, test_inputs=xt[:, :N, :], 397 | T=xt.size(1), tqdm=tqdm, softmax=True, quantize=True) 398 | 399 | # Waveforms 400 | # (T,) 401 | y_offline = y_offline.max(1)[1].view(-1) 402 | y_online = y_online.max(1)[1].view(-1) 403 | y_inference = y_inference.max(1)[1].view(-1) 404 | 405 | y_offline = P.inv_mulaw_quantize(y_offline.cpu().data.long().numpy()) 406 | y_online = P.inv_mulaw_quantize(y_online.cpu().data.long().numpy()) 407 | y_inference = P.inv_mulaw_quantize(y_inference.cpu().data.long().numpy()) 408 | 409 | plt.figure(figsize=(16, 10)) 410 | plt.subplot(4, 1, 1) 411 | librosa.display.waveplot(x_org, sr=sr) 412 | plt.subplot(4, 1, 2) 413 | librosa.display.waveplot(y_offline, sr=sr) 414 | plt.subplot(4, 1, 3) 415 | librosa.display.waveplot(y_online, sr=sr) 416 | plt.subplot(4, 1, 4) 417 | librosa.display.waveplot(y_inference, sr=sr) 418 | plt.show() 419 | 420 | save_audio = False 421 | if save_audio: 422 | librosa.output.write_wav("target.wav", x_org, sr=sr) 423 | librosa.output.write_wav("online.wav", y_online, sr=sr) 424 | librosa.output.write_wav("inference.wav", y_inference, sr=sr) 425 | -------------------------------------------------------------------------------- /tests/test_upsample.py: -------------------------------------------------------------------------------- 1 | from wavenet_vocoder.upsample import UpSampleConv,ClariUpsampleConv 2 | from train import get_data_loaders 3 | from train import eval_model,load_checkpoint,build_model 4 | 5 | def test_upsample(): 6 | data_loaders = get_data_loaders('../data/ljspeech',-1) 7 | 
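    # UpSampleConv stretches the conditioning features along time by
    # prod(hparams.upsample_scales), i.e. the hop size; the loop below just
    # pushes one batch through it as a smoke test, without asserting on the result.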
for phase, data_loader in data_loaders.items(): 8 | train = (phase == "train") 9 | running_loss = 0. 10 | test_evaluated = False 11 | for step, (x, y, c, g, input_lengths) in enumerate(data_loader): 12 | c = c.unsqueeze(1) 13 | upconv1 = UpSampleConv() 14 | c1 = upconv1(c) 15 | break 16 | 17 | 18 | 19 | def test_sample(): 20 | preste = '../presets/ljspeech_gaussian.json' 21 | model = build_model() -------------------------------------------------------------------------------- /wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | from .wavenet import receptive_field_size, WaveNet 6 | from .student_wavenet import StudentWaveNet 7 | from .clari_wavenet import ClariWaveNet -------------------------------------------------------------------------------- /wavenet_vocoder/builder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def wavenet(out_channels=256, 6 | layers=20, 7 | stacks=2, 8 | residual_channels=512, 9 | gate_channels=512, 10 | skip_out_channels=512, 11 | cin_channels=-1, 12 | gin_channels=-1, 13 | weight_normalization=True, 14 | dropout=1 - 0.95, 15 | kernel_size=3, 16 | n_speakers=None, 17 | upsample_conditional_features=False, 18 | upsample_scales=[16, 16], 19 | freq_axis_kernel_size=3, 20 | scalar_input=False, 21 | use_speaker_embedding=True, 22 | output_type="Gaussian" 23 | ): 24 | from wavenet_vocoder import WaveNet 25 | 26 | model = WaveNet(out_channels=out_channels, layers=layers, stacks=stacks, 27 | residual_channels=residual_channels, 28 | gate_channels=gate_channels, 29 | skip_out_channels=skip_out_channels, 30 | kernel_size=kernel_size, dropout=dropout, 31 | weight_normalization=weight_normalization, 32 | cin_channels=cin_channels, gin_channels=gin_channels, 33 | n_speakers=n_speakers, 34 | upsample_conditional_features=upsample_conditional_features, 35 | upsample_scales=upsample_scales, 36 | freq_axis_kernel_size=freq_axis_kernel_size, 37 | scalar_input=scalar_input, 38 | use_speaker_embedding=use_speaker_embedding, 39 | output_type=output_type 40 | ) 41 | 42 | return model 43 | 44 | 45 | def student_wavenet(out_channels=2, 46 | layers=20, 47 | stacks=2, 48 | residual_channels=64, 49 | iaf_layer_sizes=[10, 10, 10, 30], 50 | gate_channels=64, 51 | kernel_size=3, dropout=1 - 0.95, 52 | cin_channels=-1, gin_channels=-1, n_speakers=None, 53 | weight_normalization=True, 54 | upsample_conditional_features=False, 55 | upsample_scales=None, 56 | freq_axis_kernel_size=3, 57 | scalar_input=False, 58 | use_speaker_embedding=True 59 | ): 60 | from wavenet_vocoder import StudentWaveNet 61 | 62 | model = StudentWaveNet(out_channels=out_channels, 63 | layers=layers, stacks=stacks, 64 | residual_channels=residual_channels, 65 | iaf_layer_sizes=iaf_layer_sizes, gate_channels=gate_channels, kernel_size=kernel_size, 66 | dropout=dropout, 67 | cin_channels=cin_channels, gin_channels=gin_channels, 68 | n_speakers=n_speakers, 69 | upsample_conditional_features=upsample_conditional_features, 70 | upsample_scales=upsample_scales, 71 | freq_axis_kernel_size=freq_axis_kernel_size, 72 | scalar_input=scalar_input, 73 | use_speaker_embedding=use_speaker_embedding, 74 | ) 75 | return model 76 | 77 | 78 | def clari_wavenet(out_channels=2, 79 | layers=20, 80 | stacks=2, 81 | residual_channels=64, 82 | 
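                  # Each entry of iaf_layer_sizes (the next argument) is the number of
                  # ResidualConv1dGLU blocks in one IAF flow of the student ClariWaveNet,
                  # so the default [10, 10, 10, 30] builds four stacked flows.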
iaf_layer_sizes=[10, 10, 10, 30], 83 | gate_channels=64, 84 | kernel_size=3, 85 | dropout=1 - 0.95, 86 | cin_channels=-1, gin_channels=-1, n_speakers=None, 87 | weight_normalization=True, 88 | upsample_conditional_features=False, 89 | upsample_scales=None, 90 | freq_axis_kernel_size=3, 91 | scalar_input=False, 92 | use_speaker_embedding=True, 93 | skip_channels=128, 94 | use_skip=True, 95 | iaf_shift=False 96 | ): 97 | from wavenet_vocoder import ClariWaveNet 98 | 99 | model = ClariWaveNet(out_channels=out_channels, 100 | layers=layers, stacks=stacks, 101 | residual_channels=residual_channels, 102 | iaf_layer_sizes=iaf_layer_sizes, gate_channels=gate_channels, kernel_size=kernel_size, 103 | dropout=dropout, 104 | cin_channels=cin_channels, gin_channels=gin_channels, 105 | n_speakers=n_speakers, 106 | upsample_conditional_features=upsample_conditional_features, 107 | upsample_scales=upsample_scales, 108 | freq_axis_kernel_size=freq_axis_kernel_size, 109 | scalar_input=scalar_input, 110 | use_speaker_embedding=use_speaker_embedding, 111 | skip_out_channels=skip_channels, 112 | use_skip=use_skip, 113 | iaf_shift=iaf_shift 114 | ) 115 | return model 116 | -------------------------------------------------------------------------------- /wavenet_vocoder/clari_wavenet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import librosa 6 | import numpy as np 7 | from hparams import hparams 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | from torch.autograd import Variable 12 | from wavenet_vocoder.modules import Embedding, Conv1d1x1, ResidualConv1dGLU, ConvTranspose2d 13 | from train import build_model 14 | from wavenet_vocoder import receptive_field_size 15 | from wavenet_vocoder.wavenet import _expand_global_features, WaveNet 16 | from wavenet_vocoder.mixture import sample_from_discretized_mix_logistic 17 | from wavenet_vocoder.upsample import UpSampleConv 18 | 19 | 20 | class ClariWaveNet(nn.Module): 21 | 22 | def __init__(self, out_channels=2, layers=20, stacks=2, 23 | residual_channels=64, 24 | iaf_layer_sizes=[10, 10, 10, 10, 10, 10], 25 | gate_channels=64, 26 | kernel_size=3, dropout=1 - 0.95, 27 | cin_channels=-1, gin_channels=-1, n_speakers=None, 28 | weight_normalization=True, 29 | upsample_conditional_features=False, 30 | upsample_scales=None, 31 | skip_out_channels=64, 32 | freq_axis_kernel_size=3, 33 | scalar_input=False, 34 | use_speaker_embedding=True, 35 | use_skip=True, 36 | iaf_shift=False 37 | ): 38 | super(ClariWaveNet, self).__init__() 39 | self.scalar_input = scalar_input 40 | self.residual_channels = residual_channels 41 | self.out_channels = out_channels 42 | self.cin_channels = cin_channels 43 | self.iaf_layers_size = iaf_layer_sizes 44 | self.last_layers = [] 45 | self.use_skip = use_skip 46 | self.iaf_shift = iaf_shift 47 | assert layers % stacks == 0 48 | layers_per_stack = layers // stacks 49 | 50 | self.first_layers = nn.ModuleList() 51 | self.iaf_layers = nn.ModuleList() 52 | self.last_layers = nn.ModuleList() 53 | for i in range(len(iaf_layer_sizes)): 54 | if scalar_input: 55 | self.first_layers.append( 56 | Conv1d1x1(1, self.residual_channels)) 57 | else: 58 | self.first_layers.append(Conv1d1x1(self.out_channels, self.residual_channels)) 59 | 60 | for iaf_layer_size in iaf_layer_sizes: 61 | iaf_layer = nn.ModuleList() 62 | for layer_index in range(iaf_layer_size): 63 | 
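                # Dilations repeat every `layers_per_stack` layers within a flow;
                # with the defaults (layers=20, stacks=2) they cycle through
                # 1, 2, 4, ..., 512 before wrapping around.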
dilation = 2 ** (layer_index % layers_per_stack) 64 | conv = ResidualConv1dGLU( 65 | residual_channels, 66 | gate_channels, 67 | skip_out_channels=skip_out_channels, 68 | kernel_size=kernel_size, 69 | bias=True, 70 | dilation=dilation, 71 | dropout=dropout, 72 | cin_channels=cin_channels, 73 | gin_channels=gin_channels, 74 | weight_normalization=weight_normalization 75 | ) 76 | iaf_layer.append(conv) 77 | 78 | self.iaf_layers.append(iaf_layer) 79 | self.last_layers.append(nn.ModuleList([ 80 | nn.ReLU(), 81 | Conv1d1x1(skip_out_channels, residual_channels, 82 | weight_normalization=weight_normalization) if self.use_skip else 83 | Conv1d1x1(residual_channels, residual_channels, weight_normalization=weight_normalization), 84 | nn.ReLU(), 85 | Conv1d1x1(residual_channels, out_channels, weight_normalization=weight_normalization) 86 | ])) 87 | 88 | if gin_channels > 0 and use_speaker_embedding: 89 | assert n_speakers is not None 90 | self.embed_speakers = Embedding( 91 | n_speakers, gin_channels, padding_idx=None, std=0.1) 92 | else: 93 | self.embed_speakers = None 94 | 95 | # Upsample conv net 96 | if upsample_conditional_features: 97 | self.upsample_conv = UpSampleConv() 98 | else: 99 | self.upsample_conv = None 100 | 101 | self.receptive_field = receptive_field_size(layers, stacks, kernel_size) 102 | 103 | def load_teacher_upsample_conv(self, teacher): 104 | upsample_state_dict = teacher.upsample_conv.state_dict() 105 | self.upsample_conv.load_state_dict(upsample_state_dict) 106 | for param in self.upsample_conv.parameters(): 107 | param.requires_grad = False 108 | self.upsample_conv.eval() 109 | 110 | def has_speaker_embedding(self): 111 | return self.embed_speakers is not None 112 | 113 | def local_conditioning_enabled(self): 114 | return self.cin_channels > 0 115 | 116 | def forward(self, z, c=None, g=None, softmax=False, use_cuda=True, use_scale=False): 117 | 118 | if c is not None and self.upsample_conv is not None: 119 | # B x 1 x C x T 120 | c = c.unsqueeze(1) 121 | # B x C x T 122 | c = self.upsample_conv(c) 123 | c = c.squeeze(1) 124 | 125 | assert c.size(-1) == z.size(-1) 126 | 127 | B, _, T = z.size() 128 | iaf_layers_len = len(self.iaf_layers_size) 129 | if g is not None: 130 | if self.embed_speakers is not None: 131 | # (B x 1) -> (B x 1 x gin_channels) 132 | g = self.embed_speakers(g.view(B, -1)) 133 | # (B x gin_channels x 1) 134 | g = g.transpose(1, 2) 135 | assert g.dim() == 3 136 | # Expand global conditioning features to all time steps 137 | g_bct = _expand_global_features(B, T, g, bct=True) 138 | if self.iaf_shift: 139 | z = z[:, :, len(self.iaf_layers_size):] 140 | mu_tot = torch.zeros(z.size(), requires_grad=True) 141 | scale_tot = torch.ones(z.size(), requires_grad=True) 142 | if use_cuda: 143 | mu_tot, scale_tot = mu_tot.cuda(), scale_tot.cuda() 144 | 145 | layer = 0 146 | original_c = c 147 | 148 | length = z.size(-1) 149 | z_list = [] 150 | 151 | for first_conv, iaf_layer, last_layer in zip(self.first_layers, self.iaf_layers, self.last_layers): 152 | if self.iaf_shift: 153 | c = original_c[:, :, layer:layer + length] 154 | 155 | skips = None 156 | new_z = first_conv(z) 157 | for f in iaf_layer: 158 | if isinstance(f, ResidualConv1dGLU): 159 | new_z, h = f(new_z, c, g_bct) 160 | if skips is None: 161 | skips = h 162 | else: 163 | skips += h 164 | skips *= math.sqrt(0.5) 165 | if self.use_skip: 166 | new_z = skips 167 | for f in last_layer: 168 | new_z = f(new_z) 169 | if use_scale: 170 | mu_s_f, scale_s_f = new_z[:, :1, :], new_z[:, 1:, :] 171 | else: 172 | mu_s_f, 
scale_s_f = new_z[:, :1, :], torch.exp(torch.clamp(new_z[:, 1:, :], min=-7)) # log_scale 173 | # mu_s_f = torch.clamp(mu_s_f, -1, 1 - 2.0 / hparams.quantize_channels) 174 | mu_tot = mu_s_f + mu_tot * scale_s_f 175 | scale_tot = scale_tot * scale_s_f 176 | z = z * scale_s_f + mu_s_f 177 | z_list.append(z) 178 | layer += 1 179 | return z_list, z, mu_tot, scale_tot 180 | -------------------------------------------------------------------------------- /wavenet_vocoder/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Variable 5 | from torch.nn import functional as F 6 | 7 | 8 | class Conv1d(nn.Conv1d): 9 | """Extended nn.Conv1d for incremental dilated convolutions 10 | """ 11 | 12 | def __init__(self, *args, **kwargs): 13 | super().__init__(*args, **kwargs) 14 | self.clear_buffer() 15 | self._linearized_weight = None 16 | self.register_backward_hook(self._clear_linearized_weight) 17 | 18 | def incremental_forward(self, input): 19 | # input: (B, T, C) 20 | if self.training: 21 | raise RuntimeError('incremental_forward only supports eval mode') 22 | 23 | # run forward pre hooks (e.g., weight norm) 24 | for hook in self._forward_pre_hooks.values(): 25 | hook(self, input) 26 | 27 | # reshape weight 28 | weight = self._get_linearized_weight() 29 | kw = self.kernel_size[0] 30 | dilation = self.dilation[0] 31 | 32 | bsz = input.size(0) # input: bsz x len x dim 33 | if kw > 1: 34 | input = input.data 35 | if self.input_buffer is None: 36 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2)) 37 | self.input_buffer.zero_() 38 | else: 39 | # shift buffer 40 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 41 | # append next input 42 | self.input_buffer[:, -1, :] = input[:, -1, :] 43 | with torch.no_grad(): 44 | input = torch.autograd.Variable(self.input_buffer) 45 | if dilation > 1: 46 | input = input[:, 0::dilation, :].contiguous() 47 | output = F.linear(input.view(bsz, -1), weight, self.bias) 48 | return output.view(bsz, 1, -1) 49 | 50 | def clear_buffer(self): 51 | self.input_buffer = None 52 | 53 | def _get_linearized_weight(self): 54 | if self._linearized_weight is None: 55 | kw = self.kernel_size[0] 56 | # nn.Conv1d 57 | if self.weight.size() == (self.out_channels, self.in_channels, kw): 58 | weight = self.weight.transpose(1, 2).contiguous() 59 | else: 60 | # fairseq.modules.conv_tbc.ConvTBC 61 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 62 | assert weight.size() == (self.out_channels, kw, self.in_channels) 63 | self._linearized_weight = weight.view(self.out_channels, -1) 64 | return self._linearized_weight 65 | 66 | def _clear_linearized_weight(self, *args): 67 | self._linearized_weight = None -------------------------------------------------------------------------------- /wavenet_vocoder/mixture.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code is adapted from: 3 | # https://github.com/pclucas14/pixel-cnn-pp 4 | # https://github.com/openai/pixel-cnn 5 | 6 | from __future__ import with_statement, print_function, absolute_import 7 | 8 | import math 9 | import numpy as np 10 | 11 | import torch 12 | from torch import nn 13 | from torch.autograd import Variable 14 | from torch.nn import functional as F 15 | from torch.distributions import Normal 16 | 17 | 18 | def log_sum_exp(x): 19 | """ numerically stable log_sum_exp 
implementation that prevents overflow """ 20 | # TF ordering 21 | axis = len(x.size()) - 1 22 | m, _ = torch.max(x, dim=axis) 23 | m2, _ = torch.max(x, dim=axis, keepdim=True) 24 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 25 | 26 | 27 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True): 28 | """Discretized mixture of logistic distributions loss 29 | 30 | Note that it is assumed that input is scaled to [-1, 1]. 31 | 32 | Args: 33 | y_hat (Variable): Predicted output (B x C x T) 34 | y (Variable): Target (B x T x 1). 35 | num_classes (int): Number of classes 36 | log_scale_min (float): Log scale minimum value 37 | reduce (bool): If True, the losses are averaged or summed for each 38 | minibatch. 39 | 40 | Returns 41 | Variable: loss 42 | """ 43 | assert y_hat.dim() == 3 44 | assert y_hat.size(1) % 3 == 0 45 | nr_mix = y_hat.size(1) // 3 46 | 47 | # (B x T x C) 48 | y_hat = y_hat.transpose(1, 2) 49 | 50 | # unpack parameters. (B, T, num_mixtures) x 3 51 | logit_probs = y_hat[:, :, :nr_mix] 52 | means = y_hat[:, :, nr_mix:2 * nr_mix] 53 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 54 | 55 | # B x T x 1 -> B x T x num_mixtures 56 | y = y.expand_as(means) 57 | 58 | centered_y = y - means 59 | inv_stdv = torch.exp(-log_scales) 60 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 61 | cdf_plus = F.sigmoid(plus_in) 62 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 63 | cdf_min = F.sigmoid(min_in) 64 | 65 | # log probability for edge case of 0 (before scaling) 66 | # equivalent: torch.log(F.sigmoid(plus_in)) 67 | log_cdf_plus = plus_in - F.softplus(plus_in) 68 | 69 | # log probability for edge case of 255 (before scaling) 70 | # equivalent: (1 - F.sigmoid(min_in)).log() 71 | log_one_minus_cdf_min = -F.softplus(min_in) 72 | 73 | # probability for all other cases 74 | cdf_delta = cdf_plus - cdf_min 75 | 76 | mid_in = inv_stdv * centered_y 77 | # log probability in the center of the bin, to be used in extreme cases 78 | # (not actually used in our code) 79 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 80 | 81 | # tf equivalent 82 | """ 83 | log_probs = tf.where(x < -0.999, log_cdf_plus, 84 | tf.where(x > 0.999, log_one_minus_cdf_min, 85 | tf.where(cdf_delta > 1e-5, 86 | tf.log(tf.maximum(cdf_delta, 1e-12)), 87 | log_pdf_mid - np.log(127.5)))) 88 | """ 89 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 90 | # for num_classes=65536 case? 1e-7? not sure.. 91 | inner_inner_cond = (cdf_delta > 1e-5).float() 92 | 93 | inner_inner_out = inner_inner_cond * \ 94 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 95 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 96 | inner_cond = (y > 0.999).float() 97 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 98 | cond = (y < -0.999).float() 99 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 100 | 101 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 102 | 103 | if reduce: 104 | return -torch.sum(log_sum_exp(log_probs)) 105 | else: 106 | return -log_sum_exp(log_probs).unsqueeze(-1) 107 | 108 | 109 | def discretized_mix_gaussian_loss(y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True, use_gaussian=True): 110 | """Discretized mixture of logistic distributions loss 111 | 112 | Note that it is assumed that input is scaled to [-1, 1]. 
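    When ``use_gaussian`` is True, the mixture code below is bypassed and a
    single-Gaussian negative log-likelihood of the form
    (y - mean)^2 / (2 * scale^2) + log(scale) + const
    is returned element-wise (``reduce`` has no effect on that path).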
113 | 114 | Args: 115 | y_hat (Variable): Predicted output (B x C x T) 116 | y (Variable): Target (B x T x 1). 117 | num_classes (int): Number of classes 118 | log_scale_min (float): Log scale minimum value 119 | reduce (bool): If True, the losses are averaged or summed for each 120 | minibatch. 121 | 122 | Returns 123 | Variable: loss 124 | """ 125 | assert y_hat.dim() == 3 126 | assert y_hat.size(1) % 3 == 0 or y_hat.size(1) == 2 127 | nr_mix = y_hat.size(1) // 3 128 | 129 | # (B x T x C) 130 | y_hat = y_hat.transpose(1, 2) 131 | if use_gaussian: 132 | from torch.distributions import Normal 133 | mean, log_scale = y_hat[:, :, :1], y_hat[:, :, 1:] 134 | scales = torch.exp(torch.clamp(log_scale, min=log_scale_min)) 135 | norm = Normal(mean, scales) 136 | return (y - mean) ** 2 / (2 * scales ** 2) + torch.log(scales) + math.log(math.sqrt(2) * math.pi) 137 | # unpack parameters. (B, T, num_mixtures) x 3 138 | logit_probs = y_hat[:, :, :nr_mix] 139 | means = y_hat[:, :, nr_mix:2 * nr_mix] 140 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 141 | 142 | # B x T x 1 -> B x T x num_mixtures 143 | y = y.expand_as(means) 144 | 145 | centered_y = y - means 146 | inv_stdv = torch.exp(-log_scales) 147 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 148 | cdf_plus = F.sigmoid(plus_in) 149 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 150 | cdf_min = F.sigmoid(min_in) 151 | 152 | # log probability for edge case of 0 (before scaling) 153 | # equivalent: torch.log(F.sigmoid(plus_in)) 154 | log_cdf_plus = plus_in - F.softplus(plus_in) 155 | 156 | # log probability for edge case of 255 (before scaling) 157 | # equivalent: (1 - F.sigmoid(min_in)).log() 158 | log_one_minus_cdf_min = -F.softplus(min_in) 159 | 160 | # probability for all other cases 161 | cdf_delta = cdf_plus - cdf_min 162 | 163 | mid_in = inv_stdv * centered_y 164 | # log probability in the center of the bin, to be used in extreme cases 165 | # (not actually used in our code) 166 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 167 | 168 | # tf equivalent 169 | """ 170 | log_probs = tf.where(x < -0.999, log_cdf_plus, 171 | tf.where(x > 0.999, log_one_minus_cdf_min, 172 | tf.where(cdf_delta > 1e-5, 173 | tf.log(tf.maximum(cdf_delta, 1e-12)), 174 | log_pdf_mid - np.log(127.5)))) 175 | """ 176 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 177 | # for num_classes=65536 case? 1e-7? not sure.. 178 | inner_inner_cond = (cdf_delta > 1e-5).float() 179 | 180 | inner_inner_out = inner_inner_cond * \ 181 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 182 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 183 | inner_cond = (y > 0.999).float() 184 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 185 | cond = (y < -0.999).float() 186 | log_probs = cond * log_cdf_plus + (1. 
- cond) * inner_out
187 | 
188 |     log_probs = log_probs + F.log_softmax(logit_probs, -1)
189 | 
190 |     if reduce:
191 |         return -torch.sum(log_sum_exp(log_probs))
192 |     else:
193 |         return -log_sum_exp(log_probs).unsqueeze(-1)
194 | 
195 | 
196 | def to_one_hot(tensor, n, fill_with=1.):
197 |     # we perform one-hot encoding with respect to the last axis
198 |     one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
199 |     if tensor.is_cuda:
200 |         one_hot = one_hot.cuda()
201 |     one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
202 |     return Variable(one_hot)
203 | 
204 | 
205 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0):
206 |     """
207 |     Sample from discretized mixture of logistic distributions
208 | 
209 |     Args:
210 |         y (Variable): B x C x T
211 |         log_scale_min (float): Log scale minimum value
212 | 
213 |     Returns:
214 |         Variable: sample in range of [-1, 1].
215 |     """
216 |     assert y.size(1) % 3 == 0
217 |     nr_mix = y.size(1) // 3
218 | 
219 |     # B x T x C
220 |     y = y.transpose(1, 2)
221 |     logit_probs = y[:, :, :nr_mix]
222 | 
223 |     # sample mixture indicator from softmax
224 |     temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
225 |     temp = logit_probs.data - torch.log(- torch.log(temp))
226 |     _, argmax = temp.max(dim=-1)
227 | 
228 |     # (B, T) -> (B, T, nr_mix)
229 |     one_hot = to_one_hot(argmax, nr_mix)
230 |     # select logistic parameters
231 |     means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
232 |     log_scales = torch.clamp(torch.sum(
233 |         y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min)
234 |     # sample from logistic & clip to interval
235 |     # we don't actually round to the nearest 8bit value when sampling
236 |     u = Variable(means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5))
237 |     x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))
238 | 
239 |     x = torch.clamp(torch.clamp(x, min=-1.), max=1.)
240 | 
241 |     return x
242 | 
243 | 
244 | def sample_from_discretized_gaussian(y, log_scale_min=-7.0, use_norm=True):
245 |     """
246 |     Sample from a Gaussian output distribution
247 | 
248 |     Args:
249 |         y (Variable): B x C x T
250 |         log_scale_min (float): Log scale minimum value
251 | 
252 |     Returns:
253 |         Variable: sample in range of [-1, 1].
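    Note: when ``use_norm`` is True, the two channels of ``y`` are read as
    (mean, log_scale) and a reparameterised sample is drawn via
    ``torch.distributions.Normal(mean, scale).rsample()``; on that path the
    result is returned without the [-1, 1] clamp used further below.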
254 | """ 255 | assert y.size(1) % 2 == 0 256 | nr_mix = y.size(1) // 2 257 | 258 | # B x T x C 259 | y = y.transpose(1, 2) 260 | if use_norm: 261 | mean, log_scale = y[:, :, 0], torch.clamp(y[:, :, 1], min=log_scale_min) 262 | scale = torch.exp(log_scale) 263 | norm = Normal(mean,scale) 264 | x = norm.rsample() 265 | # sample = torch.randn(mean.size()).cuda() * scale + mean 266 | # x = torch.clamp(torch.clamp(sample, min=-1), max=1.0) 267 | return x 268 | 269 | logit_probs = y[:, :, :nr_mix] 270 | 271 | # sample mixture indicator from softmax 272 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 273 | temp = logit_probs.data - torch.log(- torch.log(temp)) 274 | _, argmax = temp.max(dim=-1) 275 | 276 | # (B, T) -> (B, T, nr_mix) 277 | one_hot = to_one_hot(argmax, nr_mix) 278 | # select logistic parameters 279 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 280 | log_scales = torch.clamp(torch.sum( 281 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 282 | # sample from logistic & clip to interval 283 | # we don't actually round to the nearest 8bit value when sampling 284 | u = Variable(torch.randn(means.size())) 285 | x = means + torch.exp(log_scales) * u 286 | 287 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 288 | 289 | return x 290 | -------------------------------------------------------------------------------- /wavenet_vocoder/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import numpy as np 6 | 7 | import torch 8 | from wavenet_vocoder import conv 9 | from torch import nn 10 | from torch.autograd import Variable 11 | from torch.nn import functional as F 12 | 13 | 14 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 15 | m = conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs) 16 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 17 | m.weight.data.normal_(mean=0, std=std) 18 | m.bias.data.zero_() 19 | return nn.utils.weight_norm(m) 20 | 21 | 22 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 23 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 24 | m.weight.data.normal_(0, std) 25 | return m 26 | 27 | 28 | def ConvTranspose2d(in_channels, out_channels, kernel_size, 29 | weight_normalization=True, **kwargs): 30 | freq_axis_kernel_size = kernel_size[0] 31 | m = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, **kwargs) 32 | m.weight.data.fill_(1.0 / freq_axis_kernel_size) 33 | m.bias.data.zero_() 34 | if weight_normalization: 35 | return nn.utils.weight_norm(m) 36 | else: 37 | return m 38 | 39 | 40 | def Conv1d1x1(in_channels, out_channels, bias=True, weight_normalization=True): 41 | """1-by-1 convolution layer 42 | """ 43 | if weight_normalization: 44 | assert bias 45 | return Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 46 | dilation=1, bias=bias, std_mul=1.0) 47 | else: 48 | return conv.Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 49 | dilation=1, bias=bias) 50 | 51 | 52 | def _conv1x1_forward(conv, x, is_incremental): 53 | """Conv1x1 forward 54 | """ 55 | if is_incremental: 56 | x = conv.incremental_forward(x) 57 | else: 58 | x = conv(x) 59 | return x 60 | 61 | 62 | class ResidualConv1dGLU(nn.Module): 63 | """Residual dilated conv1d + Gated linear unit 64 | 65 | Args: 66 | 
residual_channels (int): Residual input / output channels 67 | gate_channels (int): Gated activation channels. 68 | kernel_size (int): Kernel size of convolution layers. 69 | skip_out_channels (int): Skip connection channels. If None, set to same 70 | as ``residual_channels``. 71 | cin_channels (int): Local conditioning channels. If negative value is 72 | set, local conditioning is disabled. 73 | gin_channels (int): Global conditioning channels. If negative value is 74 | set, global conditioning is disabled. 75 | dropout (float): Dropout probability. 76 | padding (int): Padding for convolution layers. If None, proper padding 77 | is computed depends on dilation and kernel_size. 78 | dilation (int): Dilation factor. 79 | weight_normalization (bool): If True, DeepVoice3-style weight 80 | normalization is applied. 81 | """ 82 | 83 | def __init__(self, residual_channels, gate_channels, kernel_size, 84 | skip_out_channels=None, 85 | cin_channels=-1, gin_channels=-1, 86 | dropout=1 - 0.95, padding=None, dilation=1, causal=True, 87 | bias=True, weight_normalization=True, *args, **kwargs): 88 | super(ResidualConv1dGLU, self).__init__() 89 | self.dropout = dropout 90 | if skip_out_channels is None: 91 | skip_out_channels = residual_channels 92 | if padding is None: 93 | # no future time stamps available 94 | if causal: 95 | padding = (kernel_size - 1) * dilation 96 | else: 97 | padding = (kernel_size - 1) // 2 * dilation 98 | self.causal = causal 99 | 100 | if weight_normalization: 101 | assert bias 102 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 103 | padding=padding, dilation=dilation, 104 | bias=bias, std_mul=1.0, *args, **kwargs) 105 | else: 106 | self.conv = conv.Conv1d(residual_channels, gate_channels, kernel_size, 107 | padding=padding, dilation=dilation, 108 | bias=bias, *args, **kwargs) 109 | 110 | # local conditioning 111 | if cin_channels > 0: 112 | self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, 113 | bias=bias, 114 | weight_normalization=weight_normalization) 115 | else: 116 | self.conv1x1c = None 117 | 118 | # global conditioning 119 | if gin_channels > 0: 120 | self.conv1x1g = Conv1d1x1(gin_channels, gate_channels, bias=bias, 121 | weight_normalization=weight_normalization) 122 | else: 123 | self.conv1x1g = None 124 | 125 | # conv output is split into two groups 126 | gate_out_channels = gate_channels // 2 127 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias, 128 | weight_normalization=weight_normalization) 129 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, bias=bias, 130 | weight_normalization=weight_normalization) 131 | 132 | def forward(self, x, c=None, g=None): 133 | return self._forward(x, c, g, False) 134 | 135 | def incremental_forward(self, x, c=None, g=None): 136 | return self._forward(x, c, g, True) 137 | 138 | def _forward(self, x, c, g, is_incremental): 139 | """Forward 140 | 141 | Args: 142 | x (Variable): B x C x T 143 | c (Variable): B x C x T, Local conditioning features 144 | g (Variable): B x C x T, Expanded global conditioning features 145 | is_incremental (Bool) : Whether incremental mode or not 146 | 147 | Returns: 148 | Variable: output 149 | """ 150 | residual = x 151 | x = F.dropout(x, p=self.dropout, training=self.training) 152 | if is_incremental: 153 | splitdim = -1 154 | x = self.conv.incremental_forward(x) 155 | else: 156 | splitdim = 1 157 | x = self.conv(x) 158 | # remove future time steps 159 | x = x[:, :, :residual.size(-1)] if self.causal else x 160 | 161 | a, b = 
x.split(x.size(splitdim) // 2, dim=splitdim) 162 | 163 | # local conditioning 164 | if c is not None: 165 | assert self.conv1x1c is not None 166 | c = _conv1x1_forward(self.conv1x1c, c, is_incremental) 167 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 168 | a, b = a + ca, b + cb 169 | 170 | # global conditioning 171 | if g is not None: 172 | assert self.conv1x1g is not None 173 | g = _conv1x1_forward(self.conv1x1g, g, is_incremental) 174 | ga, gb = g.split(g.size(splitdim) // 2, dim=splitdim) 175 | a, b = a + ga, b + gb 176 | 177 | x = F.tanh(a) * F.sigmoid(b) 178 | 179 | # For skip connection 180 | s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental) 181 | 182 | # For residual connection 183 | x = _conv1x1_forward(self.conv1x1_out, x, is_incremental) 184 | 185 | x = (x + residual) * math.sqrt(0.5) 186 | return x, s 187 | 188 | def clear_buffer(self): 189 | for conv in [self.conv, self.conv1x1_out, self.conv1x1_skip, 190 | self.conv1x1c, self.conv1x1g]: 191 | if conv is not None: 192 | self.conv.clear_buffer() 193 | -------------------------------------------------------------------------------- /wavenet_vocoder/student_wavenet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import librosa 6 | import numpy as np 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | 12 | from .modules import Embedding, Conv1d1x1, ResidualConv1dGLU, ConvTranspose2d 13 | from train import build_model 14 | from wavenet_vocoder import receptive_field_size 15 | from wavenet_vocoder.wavenet import _expand_global_features, WaveNet 16 | from .mixture import sample_from_discretized_mix_logistic 17 | 18 | 19 | class StudentWaveNet(nn.Module): 20 | 21 | def __init__(self, out_channels=2, layers=20, stacks=2, 22 | residual_channels=64, 23 | iaf_layer_sizes=[10, 10, 10, 30], 24 | gate_channels=64, 25 | kernel_size=3, dropout=1 - 0.95, 26 | cin_channels=-1, gin_channels=-1, n_speakers=None, 27 | weight_normalization=True, 28 | upsample_conditional_features=False, 29 | upsample_scales=None, 30 | freq_axis_kernel_size=3, 31 | scalar_input=False, 32 | use_speaker_embedding=True, 33 | ): 34 | super(StudentWaveNet, self).__init__() 35 | self.scalar_input = scalar_input 36 | self.out_channels = out_channels 37 | self.cin_channels = cin_channels 38 | self.last_layers = [] 39 | 40 | assert layers % stacks == 0 41 | layers_per_stack = layers // stacks 42 | 43 | if scalar_input: 44 | self.first_conv = nn.ModuleList([Conv1d1x1(1, residual_channels) 45 | for _ in range(len(iaf_layer_sizes))]) 46 | else: 47 | self.first_conv = nn.ModuleList([Conv1d1x1(out_channels, residual_channels) 48 | for _ in range(len(iaf_layer_sizes))]) 49 | 50 | self.iaf_layers = nn.ModuleList() 51 | self.last_layers = nn.ModuleList() 52 | 53 | for iaf_layer_size in iaf_layer_sizes: 54 | iaf_layer = nn.ModuleList() 55 | for layer_index in range(iaf_layer_size): 56 | dilation = 2 ** (layer_index % layers_per_stack) 57 | conv = ResidualConv1dGLU( 58 | residual_channels, 59 | gate_channels, 60 | kernel_size=kernel_size, 61 | bias=True, 62 | dilation=dilation, 63 | dropout=dropout, 64 | cin_channels=cin_channels, 65 | gin_channels=gin_channels, 66 | weight_normalization=weight_normalization 67 | ) 68 | iaf_layer.append(conv) 69 | self.iaf_layers.append(iaf_layer) 70 | self.last_layers.append(nn.ModuleList([ 71 | nn.ReLU(), 72 | 
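                # the closing 1x1 conv maps residual_channels down to out_channels
                # (2 by default), which forward() reads as (mu, scale) for the IAF update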
Conv1d1x1(residual_channels, out_channels, weight_normalization=weight_normalization) 73 | ])) 74 | 75 | if gin_channels > 0 and use_speaker_embedding: 76 | assert n_speakers is not None 77 | self.embed_speakers = Embedding( 78 | n_speakers, gin_channels, padding_idx=None, std=0.1) 79 | else: 80 | self.embed_speakers = None 81 | 82 | # Upsample conv net 83 | if upsample_conditional_features: 84 | self.upsample_conv = nn.ModuleList() 85 | for s in upsample_scales: 86 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2 87 | convt = ConvTranspose2d(1, 1, (freq_axis_kernel_size, s), 88 | padding=(freq_axis_padding, 0), 89 | dilation=1, stride=(1, s), 90 | weight_normalization=weight_normalization) 91 | self.upsample_conv.append(convt) 92 | # assuming we use [0, 1] scaled features 93 | # this should avoid non-negative upsampling output 94 | self.upsample_conv.append(nn.ReLU(inplace=True)) 95 | else: 96 | self.upsample_conv = None 97 | 98 | self.receptive_field = receptive_field_size(layers, stacks, kernel_size) 99 | 100 | def has_speaker_embedding(self): 101 | return self.embed_speakers is not None 102 | 103 | def local_conditioning_enabled(self): 104 | return self.cin_channels > 0 105 | 106 | def forward(self, z, c=None, g=None, softmax=False, use_cuda=True, use_scale=False): 107 | 108 | if c is not None and self.upsample_conv is not None: 109 | # B x 1 x C x T 110 | c = c.unsqueeze(1) 111 | for f in self.upsample_conv: 112 | c = f(c) 113 | # B x C x T 114 | c = c.squeeze(1) 115 | 116 | if z is None: # for inference 117 | z = np.random.logistic(0, 1, (1, 1, c.size(-1))) 118 | z = torch.from_numpy(z).float() 119 | if use_cuda: 120 | z = z.cuda() 121 | 122 | assert c.size(-1) == z.size(-1) 123 | 124 | B, _, T = z.size() 125 | 126 | if g is not None: 127 | if self.embed_speakers is not None: 128 | # (B x 1) -> (B x 1 x gin_channels) 129 | g = self.embed_speakers(g.view(B, -1)) 130 | # (B x gin_channels x 1) 131 | g = g.transpose(1, 2) 132 | assert g.dim() == 3 133 | # Expand global conditioning features to all time steps 134 | g_bct = _expand_global_features(B, T, g, bct=True) 135 | 136 | mu_tot = torch.zeros(z.size(), requires_grad=True) 137 | scale_tot = torch.ones(z.size(), requires_grad=True) 138 | if use_cuda: 139 | mu_tot, scale_tot = mu_tot.cuda(), scale_tot.cuda() 140 | 141 | for first_conv, iaf_layer, last_layer in zip(self.first_conv, self.iaf_layers, self.last_layers): 142 | new_z = first_conv(z) 143 | for f in iaf_layer: 144 | new_z, _ = f(new_z, c, g_bct) 145 | for f in last_layer: 146 | new_z = f(new_z) 147 | if use_scale: 148 | mu_s_f, scale_s_f = new_z[:, :1, :], new_z[:, 1:, :] 149 | else: 150 | mu_s_f, scale_s_f = new_z[:, :1, :], torch.exp(new_z[:, 1:, :]) 151 | mu_tot = mu_s_f + mu_tot * scale_s_f 152 | scale_tot = scale_tot * scale_s_f 153 | z = z*scale_s_f + mu_s_f 154 | 155 | return z, mu_tot, scale_tot 156 | 157 | def clear_buffer(self): 158 | self.first_conv.clear_buffer() 159 | for f in self.conv_layers: 160 | f.clear_buffer() 161 | for f in self.last_conv_layers: 162 | try: 163 | f.clear_buffer() 164 | except AttributeError: 165 | pass 166 | 167 | def make_generation_fast_(self): 168 | def remove_weight_norm(m): 169 | try: 170 | nn.utils.remove_weight_norm(m) 171 | except ValueError: # this module didn't have weight norm 172 | return 173 | self.apply(remove_weight_norm) 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /wavenet_vocoder/upsample.py: 
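UpSampleConv below stretches frame-rate conditioning features (e.g. mel-spectrograms) to sample rate by stacking transposed convolutions with stride (1, s). A minimal shape-check sketch, assuming a hop size of 256 realised as upsample_scales [4, 4, 4, 4] and freq_axis_kernel_size 3 (the presets in this repo may use different values):

import torch
from torch import nn

scales = [4, 4, 4, 4]              # assumed; np.prod(scales) must equal the hop size
c = torch.randn(1, 1, 80, 100)     # B x 1 x n_mels x T_frames
layers = []
for s in scales:
    layers += [nn.ConvTranspose2d(1, 1, (3, s), padding=(1, 0), stride=(1, s)),
               nn.LeakyReLU(negative_slope=0.2)]
net = nn.Sequential(*layers)
print(net(c).shape)                # torch.Size([1, 1, 80, 25600]) == T_frames * 256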
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from hparams import hparams 4 | from wavenet_vocoder.modules import ConvTranspose2d 5 | import os 6 | from hparams import hparams 7 | 8 | class UpSampleConv(nn.Module): 9 | def __init__(self, 10 | path=None, 11 | share_condition=True, 12 | weight_normalization=True): 13 | super(UpSampleConv, self).__init__() 14 | self.path = path 15 | self.upsample_conv = nn.ModuleList() 16 | for s in hparams.upsample_scales: 17 | freq_axis_padding = (hparams.freq_axis_kernel_size - 1) // 2 18 | convt = ConvTranspose2d(1, 1, (hparams.freq_axis_kernel_size, s), 19 | padding=(freq_axis_padding, 0), 20 | dilation=1, stride=[1, s], 21 | weight_normalization=weight_normalization) 22 | self.upsample_conv.append(convt) 23 | self.upsample_conv.append(nn.LeakyReLU(inplace=True,negative_slope=0.2)) 24 | # load condition form teacher wavenet 25 | if path and share_condition: 26 | self.load() 27 | 28 | def forward(self, c): 29 | for f in self.upsample_conv: 30 | c = f(c) 31 | return c 32 | 33 | def load(self): 34 | if self.path and os.path.exists(self.path): 35 | self.upsample_conv.load_state_dict(torch.load(self.path)) 36 | else: 37 | raise Exception("can't load state dict, check path, see get_model in train_student.py !") 38 | 39 | 40 | class ClariUpsampleConv(nn.Module): 41 | def __init__(self, weight_normalization=True): 42 | super(ClariUpsampleConv, self).__init__() 43 | self.upsample_conv = nn.ModuleList() 44 | for s in hparams.upsample_size: 45 | convt = ConvTranspose2d(1, 1, kernel_size=s, stride=(1,s[0] / 2), weight_normalization=weight_normalization) 46 | self.upsample_conv.append(convt) 47 | self.upsample_conv.append(nn.LeakyReLU(inplace=True, negative_slope=0.4)) 48 | 49 | def forward(self, c): 50 | for f in self.upsample_conv: 51 | c = f(c) 52 | return c 53 | 54 | if __name__ == '__main__': 55 | checkpoint = torch.load('/home/jinqiangzeng/work/mypycharm/wavenet/clari_wavenet_vocoder/checkpoints/checkpoint_step000430000_ema.pth') 56 | preset = '/home/jinqiangzeng/work/mypycharm/wavenet/clari_wavenet_vocoder/presets/ljspeech_gaussian.json' 57 | with open(preset) as f: 58 | hparams.parse_json(f.read()) 59 | from train_student import build_model 60 | teacher = build_model(hparams,'teacher') 61 | teacher.load_state_dict(checkpoint['state_dict']) 62 | upsample_state_dict = teacher.upsample_conv.state_dict() 63 | upsample_conv = UpSampleConv() 64 | upsample_conv.load_state_dict(upsample_state_dict) 65 | for para in upsample_conv.parameters(): 66 | para.requires_grad=False -------------------------------------------------------------------------------- /wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def _assert_valid_input_type(s): 6 | assert s == "mulaw-quantize" or s == "mulaw" or s == "raw" 7 | 8 | 9 | def is_mulaw_quantize(s): 10 | _assert_valid_input_type(s) 11 | return s == "mulaw-quantize" 12 | 13 | 14 | def is_mulaw(s): 15 | _assert_valid_input_type(s) 16 | return s == "mulaw" 17 | 18 | 19 | def is_raw(s): 20 | _assert_valid_input_type(s) 21 | return s == "raw" 22 | 23 | 24 | def is_scalar_input(s): 25 | return is_raw(s) or is_mulaw(s) 26 | -------------------------------------------------------------------------------- /wavenet_vocoder/wavenet.py: 
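For reference, receptive_field_size (defined in the file below) works out to (kernel_size - 1) * sum(dilations) + 1. A quick worked check using one of the values already asserted in tests/test_misc.py:

from wavenet_vocoder import receptive_field_size

# 24 layers in 4 cycles of 6 -> dilation sum per cycle is 1 + 2 + 4 + 8 + 16 + 32 = 63,
# so the receptive field is (3 - 1) * 4 * 63 + 1 = 505 samples.
assert receptive_field_size(total_layers=24, num_cycles=4, kernel_size=3) == 505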
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from __future__ import with_statement, print_function, absolute_import
3 | 
4 | import math
5 | import numpy as np
6 | 
7 | import torch
8 | from torch import nn
9 | from torch.autograd import Variable
10 | from torch.nn import functional as F
11 | 
12 | from .modules import Embedding
13 | 
14 | from .modules import Conv1d1x1, ResidualConv1dGLU, ConvTranspose2d
15 | from .mixture import sample_from_discretized_mix_logistic, sample_from_discretized_gaussian
16 | from .upsample import UpSampleConv, ClariUpsampleConv
17 | 
18 | def _expand_global_features(B, T, g, bct=True):
19 |     """Expand global conditioning features to all time steps
20 | 
21 |     Args:
22 |         B (int): Batch size.
23 |         T (int): Time length.
24 |         g (Variable): Global features, (B x C) or (B x C x 1).
25 |         bct (bool) : returns (B x C x T) if True, otherwise (B x T x C)
26 | 
27 |     Returns:
28 |         Variable: B x C x T or B x T x C or None
29 |     """
30 |     if g is None:
31 |         return None
32 |     g = g.unsqueeze(-1) if g.dim() == 2 else g
33 |     if bct:
34 |         g_bct = g.expand(B, -1, T)
35 |         return g_bct.contiguous()
36 |     else:
37 |         g_btc = g.expand(B, -1, T).transpose(1, 2)
38 |         return g_btc.contiguous()
39 | 
40 | 
41 | def receptive_field_size(total_layers, num_cycles, kernel_size,
42 |                          dilation=lambda x: 2**x):
43 |     """Compute receptive field size
44 | 
45 |     Args:
46 |         total_layers (int): total layers
47 |         num_cycles (int): cycles
48 |         kernel_size (int): kernel size
49 |         dilation (lambda): lambda to compute dilation factor. ``lambda x : 1``
50 |             to disable dilated convolution.
51 | 
52 |     Returns:
53 |         int: receptive field size in samples
54 | 
55 |     """
56 |     assert total_layers % num_cycles == 0
57 |     layers_per_cycle = total_layers // num_cycles
58 |     dilations = [dilation(i % layers_per_cycle) for i in range(total_layers)]
59 |     return (kernel_size - 1) * sum(dilations) + 1
60 | 
61 | 
62 | class WaveNet(nn.Module):
63 |     """The WaveNet model that supports local and global conditioning.
64 | 
65 |     Args:
66 |         out_channels (int): Output channels. If the input type is a mu-law
67 |             quantized one-hot vector, this must be equal to the number of
68 |             quantize channels. Otherwise it is num_mixtures x 3 (pi, mu, log_scale).
69 |         layers (int): Number of total layers
70 |         stacks (int): Number of dilation cycles
71 |         residual_channels (int): Residual input / output channels
72 |         gate_channels (int): Gated activation channels.
73 |         skip_out_channels (int): Skip connection channels.
74 |         kernel_size (int): Kernel size of convolution layers.
75 |         dropout (float): Dropout probability.
76 |         cin_channels (int): Local conditioning channels. If a negative value is
77 |             set, local conditioning is disabled.
78 |         gin_channels (int): Global conditioning channels. If a negative value is
79 |             set, global conditioning is disabled.
80 |         n_speakers (int): Number of speakers. Used only if global conditioning
81 |             is enabled.
82 |         weight_normalization (bool): If True, DeepVoice3-style weight
83 |             normalization is applied.
84 |         upsample_conditional_features (bool): Whether to upsample local
85 |             conditioning features by transposed convolution layers or not.
86 |         upsample_scales (list): List of upsample scales.
87 |             ``np.prod(upsample_scales)`` must be equal to the hop size. Used only if
88 |             upsample_conditional_features is enabled.
89 |         freq_axis_kernel_size (int): Freq-axis kernel_size for the transposed
90 |             convolution layers used for upsampling. If you only care about time-axis
91 |             upsampling, set this to 1.

class WaveNet(nn.Module):
    """The WaveNet model that supports local and global conditioning.

    Args:
        out_channels (int): Output channels. If the input is a mu-law quantized
            one-hot vector, this must equal the number of quantize channels.
            Otherwise it is num_mixtures x 3 (pi, mu, log_scale).
        layers (int): Number of total layers.
        stacks (int): Number of dilation cycles.
        residual_channels (int): Residual input / output channels.
        gate_channels (int): Gated activation channels.
        skip_out_channels (int): Skip connection channels.
        kernel_size (int): Kernel size of convolution layers.
        dropout (float): Dropout probability.
        cin_channels (int): Local conditioning channels. If a negative value is
            set, local conditioning is disabled.
        gin_channels (int): Global conditioning channels. If a negative value is
            set, global conditioning is disabled.
        n_speakers (int): Number of speakers. Used only if global conditioning
            is enabled.
        weight_normalization (bool): If True, DeepVoice3-style weight
            normalization is applied.
        upsample_conditional_features (bool): Whether to upsample local
            conditioning features by transposed convolution layers.
        upsample_scales (list): List of upsample scales.
            ``np.prod(upsample_scales)`` must equal the hop size. Used only if
            upsample_conditional_features is enabled.
        freq_axis_kernel_size (int): Freq-axis kernel size for the transposed
            convolution layers used for upsampling. If you only care about
            time-axis upsampling, set this to 1.
        scalar_input (bool): If True, scalar input ([-1, 1]) is expected;
            otherwise a quantized one-hot vector is expected.
        use_speaker_embedding (bool): Whether to use a speaker embedding. Set to
            False if you want to disable the embedding layer and use external
            features directly.
    """

    def __init__(self, out_channels=256, layers=20, stacks=2,
                 residual_channels=512,
                 gate_channels=512,
                 skip_out_channels=512,
                 kernel_size=3, dropout=1 - 0.95,
                 cin_channels=-1, gin_channels=-1, n_speakers=None,
                 weight_normalization=True,
                 upsample_conditional_features=False,
                 upsample_scales=None,
                 freq_axis_kernel_size=3,
                 scalar_input=False,
                 use_speaker_embedding=True,
                 output_type='Gaussian'
                 ):
        super(WaveNet, self).__init__()
        self.scalar_input = scalar_input
        self.out_channels = out_channels
        self.cin_channels = cin_channels
        self.output_type = output_type
        assert layers % stacks == 0
        layers_per_stack = layers // stacks
        if scalar_input:
            self.first_conv = Conv1d1x1(1, residual_channels)
        else:
            self.first_conv = Conv1d1x1(out_channels, residual_channels)

        self.conv_layers = nn.ModuleList()
        for layer in range(layers):
            dilation = 2**(layer % layers_per_stack)
            conv = ResidualConv1dGLU(
                residual_channels, gate_channels,
                kernel_size=kernel_size,
                skip_out_channels=skip_out_channels,
                bias=True,  # magenta uses bias, but musyoku doesn't
                dilation=dilation, dropout=dropout,
                cin_channels=cin_channels,
                gin_channels=gin_channels,
                weight_normalization=weight_normalization)
            self.conv_layers.append(conv)
        self.last_conv_layers = nn.ModuleList([
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels, skip_out_channels,
                      weight_normalization=weight_normalization),
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels, out_channels,
                      weight_normalization=weight_normalization),
        ])

        if gin_channels > 0 and use_speaker_embedding:
            assert n_speakers is not None
            self.embed_speakers = Embedding(
                n_speakers, gin_channels, padding_idx=None, std=0.1)
        else:
            self.embed_speakers = None

        # Upsample conv net
        if upsample_conditional_features:
            self.upsample_conv = UpSampleConv()
            # NOTE: the ClariUpsampleConv branch below is effectively disabled
            # by the "and False", so UpSampleConv is always used here.
            if output_type == 'Gaussian' and False:
                self.upsample_conv = ClariUpsampleConv()
        else:
            self.upsample_conv = None

        self.receptive_field = receptive_field_size(layers, stacks, kernel_size)

    def has_speaker_embedding(self):
        return self.embed_speakers is not None

    def local_conditioning_enabled(self):
        return self.cin_channels > 0

    def forward(self, x, c=None, g=None, softmax=False):
        """Forward step.

        Args:
            x (Variable): One-hot encoded audio signal, shape (B x C x T).
            c (Variable): Local conditioning features,
                shape (B x cin_channels x T).
            g (Variable): Global conditioning features,
                shape (B x gin_channels x 1) or speaker Ids of shape (B x 1).
                Note that ``use_speaker_embedding`` must be False when you
                want to disable the embedding layer and use external features
                directly (e.g., a one-hot vector). In that case the input
                tensor must be a FloatTensor, not a LongTensor.
            softmax (bool): Whether to apply softmax.

        Returns:
            Variable: output, shape B x out_channels x T
        """
        B, _, T = x.size()

        if g is not None:
            if self.embed_speakers is not None:
                # (B x 1) -> (B x 1 x gin_channels)
                g = self.embed_speakers(g.view(B, -1))
                # (B x gin_channels x 1)
                g = g.transpose(1, 2)
                assert g.dim() == 3
        # Expand global conditioning features to all time steps
        g_bct = _expand_global_features(B, T, g, bct=True)

        if c is not None and self.upsample_conv is not None:
            # B x 1 x C x T
            c = c.unsqueeze(1)
            c = self.upsample_conv(c)
            # B x C x T
            c = c.squeeze(1)
            assert c.size(-1) == x.size(-1)

        # Feed data to network
        x = self.first_conv(x)
        skips = None
        for f in self.conv_layers:
            x, h = f(x, c, g_bct)
            if skips is None:
                skips = h
            else:
                skips += h
                # Scale by sqrt(0.5) after each addition so the accumulated
                # skip signal keeps roughly constant variance across layers.
                skips *= math.sqrt(0.5)
            # skips = h if skips is None else (skips + h) * math.sqrt(0.5)

        x = skips
        for f in self.last_conv_layers:
            x = f(x)

        x = F.softmax(x, dim=1) if softmax else x

        return x

    def incremental_forward(self, initial_input=None, c=None, g=None,
                            T=100, test_inputs=None,
                            tqdm=lambda x: x, softmax=True, quantize=True,
                            log_scale_min=-7.0):
        """Incremental forward step.

        Due to linearized convolutions, inputs of shape (B x C x T) are reshaped
        to (B x T x C) internally and fed to the network one time step at a
        time. The input at each time step has shape (B x 1 x C).

        Args:
            initial_input (Variable): Initial decoder input, (B x C x 1).
            c (Variable): Local conditioning features, shape (B x C' x T).
            g (Variable): Global conditioning features,
                shape (B x C'') or (B x C'' x 1).
            T (int): Number of time steps to generate.
            test_inputs (Variable): Teacher-forcing inputs (for debugging).
            tqdm (lambda): tqdm wrapper for the generation loop.
            softmax (bool): Whether to apply softmax.
            quantize (bool): Whether to quantize the softmax output before
                feeding it back as input for the next time step. TODO: rename
            log_scale_min (float): Log scale minimum value.

        Returns:
            Variable: Generated one-hot encoded samples, B x C x T,
            or a scalar vector B x 1 x T.
        """
        self.clear_buffer()
        B = 1

        # Note: shape should be **(B x T x C)**, not (B x C x T), as opposed to
        # batch forward, due to linearized convolutions.
        if test_inputs is not None:
            if self.scalar_input:
                if test_inputs.size(1) == 1:
                    test_inputs = test_inputs.transpose(1, 2).contiguous()
            else:
                if test_inputs.size(1) == self.out_channels:
                    test_inputs = test_inputs.transpose(1, 2).contiguous()

            B = test_inputs.size(0)
            if T is None:
                T = test_inputs.size(1)
            else:
                T = max(T, test_inputs.size(1))
        # cast to int in case of numpy.int64...
        T = int(T)

        # Global conditioning
        if g is not None:
            if self.embed_speakers is not None:
                g = self.embed_speakers(g.view(B, -1))
                # (B x gin_channels x 1)
                g = g.transpose(1, 2)
                assert g.dim() == 3
        g_btc = _expand_global_features(B, T, g, bct=False)

        # Local conditioning
        if c is not None and self.upsample_conv is not None:
            assert c is not None
            # B x 1 x C x T
            c = c.unsqueeze(1)
            c = self.upsample_conv(c)
            # B x C x T
            c = c.squeeze(1)
            assert c.size(-1) == T
        if c is not None and c.size(-1) == T:
            c = c.transpose(1, 2).contiguous()

        outputs = []
        if initial_input is None:
            if self.scalar_input:
                initial_input = Variable(torch.zeros(B, 1, 1))
            else:
                initial_input = Variable(torch.zeros(B, 1, self.out_channels))
                initial_input[:, :, 127] = 1  # TODO: is this ok?
            # https://github.com/pytorch/pytorch/issues/584#issuecomment-275169567
            if next(self.parameters()).is_cuda:
                initial_input = initial_input.cuda()
        else:
            if initial_input.size(1) == self.out_channels:
                initial_input = initial_input.transpose(1, 2).contiguous()

        current_input = initial_input

        for t in tqdm(range(T)):
            if test_inputs is not None and t < test_inputs.size(1):
                current_input = test_inputs[:, t, :].unsqueeze(1)
            else:
                if t > 0:
                    current_input = outputs[-1]
                    current_input = Variable(current_input)

            # Conditioning features for a single time step
            ct = None if c is None else c[:, t, :].unsqueeze(1)
            gt = None if g is None else g_btc[:, t, :].unsqueeze(1)

            x = current_input
            x = self.first_conv.incremental_forward(x)
            skips = None
            for f in self.conv_layers:
                x, h = f.incremental_forward(x, ct, gt)
                skips = h if skips is None else (skips + h) * math.sqrt(0.5)
            x = skips
            for f in self.last_conv_layers:
                try:
                    x = f.incremental_forward(x)
                except AttributeError:
                    x = f(x)

            # Generate the next input by sampling
            if self.scalar_input:
                sample_fn = sample_from_discretized_mix_logistic
                if self.output_type == "Gaussian":
                    sample_fn = sample_from_discretized_gaussian
                x = sample_fn(
                    x.view(B, -1, 1), log_scale_min=log_scale_min)
            else:
                x = F.softmax(x.view(B, -1), dim=1) if softmax else x.view(B, -1)
                if quantize:
                    sample = np.random.choice(
                        np.arange(self.out_channels), p=x.view(-1).data.cpu().numpy())
                    x.zero_()
                    x[:, sample] = 1.0
            outputs += [x.data]
        # T x B x C
        outputs = torch.stack(outputs)
        # B x C x T
        outputs = outputs.transpose(0, 1).transpose(1, 2).contiguous()

        self.clear_buffer()
        return outputs

    def clear_buffer(self):
        self.first_conv.clear_buffer()
        for f in self.conv_layers:
            f.clear_buffer()
        for f in self.last_conv_layers:
            try:
                f.clear_buffer()
            except AttributeError:
                pass

    def make_generation_fast_(self):
        def remove_weight_norm(m):
            try:
                nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return
        self.apply(remove_weight_norm)
--------------------------------------------------------------------------------
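For reference, a minimal usage sketch for this module. The hyperparameter values below are deliberately tiny assumptions, not the settings shipped in ``presets/``; the conditioning features are fed already at sample resolution (so ``upsample_conditional_features=False``), and the ``output_type`` string only needs to differ from ``'Gaussian'`` for the logistic-mixture sampling path above to be used:

import torch
from torch.autograd import Variable

from wavenet_vocoder.wavenet import WaveNet

# Tiny scalar-input model with a 10-component logistic mixture output.
model = WaveNet(out_channels=10 * 3,          # (pi, mu, log_scale) per mixture component
                layers=4, stacks=2,
                residual_channels=32, gate_channels=32, skip_out_channels=32,
                cin_channels=80,              # assume 80-band mel conditioning
                upsample_conditional_features=False,
                scalar_input=True,
                output_type='mixture')        # any value other than 'Gaussian' -> logistic sampling

T = 100
x = Variable(torch.randn(1, 1, T))            # B x 1 x T waveform segment in [-1, 1]
c = Variable(torch.randn(1, 80, T))           # B x cin_channels x T local conditioning

# Teacher-forced pass (training-style): mixture parameters for every time step.
y_hat = model(x, c=c)                         # B x out_channels x T

# Autoregressive generation: remove weight normalization first for speed.
model.eval()
model.make_generation_fast_()
y_gen = model.incremental_forward(c=c, T=T, softmax=False, quantize=False)

print(y_hat.size(), y_gen.size())             # (1, 30, 100) and (1, 1, 100)

``incremental_forward`` runs one Python-level loop iteration per generated sample, so it is inherently sequential and slow; the sketch is only meant to illustrate the call signatures and tensor shapes.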