├── .github └── stale.yml ├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── appveyor.yml ├── assets └── banner.jpg ├── audio.py ├── compute_timestamp_ratio.py ├── deepvoice3_pytorch ├── __init__.py ├── builder.py ├── conv.py ├── deepvoice3.py ├── frontend │ ├── __init__.py │ ├── en │ │ └── __init__.py │ ├── es │ │ └── __init__.py │ ├── jp │ │ └── __init__.py │ ├── ko │ │ └── __init__.py │ └── text │ │ ├── __init__.py │ │ ├── cleaners.py │ │ ├── cmudict.py │ │ ├── numbers.py │ │ └── symbols.py ├── modules.py ├── nyanko.py └── tfcompat │ ├── __init__.py │ ├── hparam.py │ └── readme.md ├── docs ├── .gitignore ├── config.toml ├── content │ └── index.md ├── layouts │ ├── _default │ │ ├── list.html │ │ └── single.html │ ├── index.html │ └── partials │ │ ├── footer.html │ │ ├── header.html │ │ ├── mathjax.html │ │ └── social.html └── static │ ├── audio │ ├── deepvoice3 │ │ └── 3_keithito │ │ │ ├── 0_checkpoint_step000210000.wav │ │ │ ├── 0_checkpoint_step000210000_alignment.png │ │ │ ├── 1_checkpoint_step000210000.wav │ │ │ ├── 1_checkpoint_step000210000_alignment.png │ │ │ ├── 2_checkpoint_step000210000.wav │ │ │ ├── 2_checkpoint_step000210000_alignment.png │ │ │ ├── 3_checkpoint_step000210000.wav │ │ │ ├── 3_checkpoint_step000210000_alignment.png │ │ │ ├── 4_checkpoint_step000210000.wav │ │ │ ├── 4_checkpoint_step000210000_alignment.png │ │ │ ├── 5_checkpoint_step000210000.wav │ │ │ └── 5_checkpoint_step000210000_alignment.png │ ├── deepvoice3_multispeaker │ │ ├── 3_keithito │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ └── 
5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ └── loop │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker0.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker1.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker10.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker11.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker2.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker3.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker4.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker5.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker6.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker7.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker8.wav │ │ │ └── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker9.wav │ └── nyanko │ │ └── 3_keithito │ │ ├── 0_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 0_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 1_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 1_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 2_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 2_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 3_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 3_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 4_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 4_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 5_20171129_nyanko_checkpoint_step000585000.wav │ │ └── 5_20171129_nyanko_checkpoint_step000585000_alignment.png │ ├── css │ ├── custom.css │ ├── normalize.css │ └── skeleton.css │ ├── favicon.png │ └── images │ └── 512logotipo.png ├── dump_hparams_to_json.py ├── gentle_web_align.py ├── hparams.py ├── json_meta.py ├── jsut.py ├── ljspeech.py ├── lrschedule.py ├── nikl_m.py ├── nikl_preprocess ├── README.md └── prepare_metafile.py ├── nikl_s.py ├── preprocess.py ├── presets ├── deepvoice3_ljspeech.json ├── deepvoice3_niklm.json ├── deepvoice3_nikls.json ├── deepvoice3_vctk.json └── nyanko_ljspeech.json ├── release.sh ├── setup.py ├── synthesis.py ├── tests ├── data │ └── ljspeech-mel-00001.npy ├── test_audio.py ├── test_conv.py ├── test_deepvoice3.py ├── test_embedding.py ├── test_frontend.py └── test_nyanko.py ├── tox.ini ├── train.py ├── vctk.py └── vctk_preprocess ├── .gitignore ├── README.md ├── extract_feats.py ├── prepare_htk_alignments_vctk.py └── prepare_vctk_labels.py /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an Issue or Pull Request becomes stale 2 | daysUntilStale: 60 3 | 4 | # Number of days of inactivity before an Issue or Pull Request with the stale label is closed. 5 | # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale. 6 | daysUntilClose: 7 7 | 8 | # Only issues or pull requests with all of these labels are check if stale. Defaults to `[]` (disabled) 9 | onlyLabels: [] 10 | 11 | # Issues or Pull Requests with these labels will never be considered stale. 
Set to `[]` to disable 12 | exemptLabels: 13 | - roadmap 14 | - bug 15 | - design 16 | 17 | # Set to true to ignore issues in a project (defaults to false) 18 | exemptProjects: true 19 | 20 | # Set to true to ignore issues in a milestone (defaults to false) 21 | exemptMilestones: true 22 | 23 | # Label to use when marking as stale 24 | staleLabel: wontfix 25 | 26 | # Comment to post when marking as stale. Set to `false` to disable 27 | markComment: > 28 | This issue has been automatically marked as stale because it has not had 29 | recent activity. It will be closed if no further activity occurs. Thank you 30 | for your contributions. 31 | 32 | # Limit the number of actions per hour, from 1-30. Default is 30 33 | limitPerRun: 30 34 | 35 | # Limit to only `issues` or `pulls` 36 | only: issues 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | legacy 2 | notebooks 3 | foobar* 4 | run.sh 5 | README.rst 6 | pretrained_models 7 | deepvoice3_pytorch/version.py 8 | checkpoints* 9 | log 10 | generated 11 | data 12 | datasets 13 | testout 14 | 15 | # Created by https://www.gitignore.io 16 | 17 | ### Python ### 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | env/ 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | 74 | ### IPythonNotebook ### 75 | # Temporary data 76 | .ipynb_checkpoints/ 77 | 78 | 79 | ### SublimeText ### 80 | # cache files for sublime text 81 | *.tmlanguage.cache 82 | *.tmPreferences.cache 83 | *.stTheme.cache 84 | 85 | # workspace files are user-specific 86 | *.sublime-workspace 87 | 88 | # project files should be checked into the repository, unless a significant 89 | # proportion of contributors will probably not be using SublimeText 90 | # *.sublime-project 91 | 92 | # sftp configuration file 93 | sftp-config.json 94 | 95 | 96 | ### Emacs ### 97 | # -*- mode: gitignore; -*- 98 | *~ 99 | \#*\# 100 | /.emacs.desktop 101 | /.emacs.desktop.lock 102 | *.elc 103 | auto-save-list 104 | tramp 105 | .\#* 106 | 107 | # Org-mode 108 | .org-id-locations 109 | *_archive 110 | 111 | # flymake-mode 112 | *_flymake.* 113 | 114 | # eshell files 115 | /eshell/history 116 | /eshell/lastdir 117 | 118 | # elpa packages 119 | /elpa/ 120 | 121 | # reftex files 122 | *.rel 123 | 124 | # AUCTeX auto folder 125 | /auto/ 126 | 127 | # cask packages 128 | .cask/ 129 | 130 | 131 | ### Vim ### 132 | [._]*.s[a-w][a-z] 133 | [._]s[a-w][a-z] 134 | *.un~ 135 | Session.vim 136 | .netrwhist 137 | *~ 138 | 139 | 140 | ### C++ ### 141 | # Compiled Object files 142 | *.slo 143 | *.lo 144 | *.o 145 | *.obj 146 | 147 | # Precompiled Headers 148 | *.gch 149 | *.pch 150 | 151 | # Compiled Dynamic libraries 152 | *.so 153 | *.dylib 154 | *.dll 155 | 156 | # Fortran module files 157 | *.mod 158 | 159 | # Compiled Static libraries 160 | *.lai 161 | *.la 162 | *.a 163 | *.lib 164 | 165 | # Executables 166 | *.exe 167 | *.out 168 | *.app 169 | 170 | 171 | ### OSX ### 172 | .DS_Store 173 | .AppleDouble 174 | .LSOverride 175 | 176 | # Icon must end with two \r 177 | Icon 178 | 179 | 180 | # Thumbnails 181 | ._* 182 | 183 | # Files that might appear on external disk 184 | .Spotlight-V100 185 | .Trashes 186 | 187 | # Directories potentially created on remote AFP share 188 | .AppleDB 189 | .AppleDesktop 190 | Network Trash Folder 191 | Temporary Items 192 | .apdisk 193 | 194 | 195 | ### Linux ### 196 | *~ 197 | 198 | # KDE directory preferences 199 | .directory 200 | 201 | # Linux trash folder which might appear on any partition or disk 202 | .Trash-* 203 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/.gitmodules -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | notifications: 7 | email: false 8 | 9 | before_install: 10 | - sudo apt-get update 11 | - if [["$TRAVIS_PYTHON_VERSION" == "2.7"]]; then 12 | wget http://repo.continuum.io/miniconda/Miniconda-3.8.3-Linux-x86_64.sh -O miniconda.sh; 13 | else 14 | wget http://repo.continuum.io/miniconda/Miniconda3-3.8.3-Linux-x86_64.sh -O miniconda.sh; 15 | fi 16 | - bash 
miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda config --add channels pypi 23 | - conda info -a 24 | - deps='pip numpy scipy cython nose pytorch flake8' 25 | - conda create -q -n test-environment "python=$TRAVIS_PYTHON_VERSION" $deps -c pytorch 26 | - source activate test-environment 27 | 28 | install: 29 | - pip install -e ".[test]" 30 | - python -c "import nltk; nltk.download('cmudict')" 31 | 32 | before_script: 33 | # stop the build if there are Python syntax errors or undefined names 34 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | 38 | script: 39 | - nosetests -v -w tests/ -a '!local_only' 40 | 41 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The deepvoice3_pytorch package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Ryuichi Yamamoto. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | > # Part of code was adapted from https://github.com/facebookresearch/fairseq-py 25 | > # Copyright (c) 2017-present, Facebook, Inc. 26 | > # Thier licenses apply. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE.md 2 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON_VERSION: "3.6" 4 | PYTHON_ARCH: "64" 5 | MINICONDA: C:\Miniconda36-x64 6 | 7 | branches: 8 | only: 9 | - master 10 | - /release-.*/ 11 | 12 | skip_commits: 13 | message: /\[av skip\]/ 14 | 15 | notifications: 16 | - provider: Email 17 | on_build_success: false 18 | on_build_failure: false 19 | on_build_status_changed: false 20 | 21 | init: 22 | - "ECHO %PYTHON_VERSION% %PYTHON_ARCH% %MINICONDA%" 23 | 24 | install: 25 | - "SET PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" 26 | - conda config --set always_yes yes --set changeps1 no 27 | - conda update -q conda 28 | - conda info -a 29 | - "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy cython nose pytorch -c pytorch" 30 | - activate test-environment 31 | 32 | build_script: 33 | - pip install -e ".[test]" 34 | - python -c "import nltk; nltk.download('cmudict')" 35 | 36 | test_script: 37 | - nosetests -v -w tests/ -a "!local_only" 38 | -------------------------------------------------------------------------------- /assets/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/assets/banner.jpg -------------------------------------------------------------------------------- /audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav = wav * 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def preemphasis(x): 22 | from nnmnkwii.preprocessing import preemphasis 23 | return preemphasis(x, hparams.preemphasis) 24 | 25 | 26 | def inv_preemphasis(x): 27 | from nnmnkwii.preprocessing import inv_preemphasis 28 | return inv_preemphasis(x, hparams.preemphasis) 29 | 30 | 31 | def spectrogram(y): 32 | D = _lws_processor().stft(preemphasis(y)).T 33 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 34 | return _normalize(S) 35 | 36 | 37 | def inv_spectrogram(spectrogram): 38 | '''Converts spectrogram to waveform using librosa''' 39 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 40 | processor = _lws_processor() 41 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 42 | y = processor.istft(D).astype(np.float32) 43 | return inv_preemphasis(y) 44 | 45 | 46 | def melspectrogram(y): 47 | D = _lws_processor().stft(preemphasis(y)).T 48 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 49 | if not hparams.allow_clipping_in_normalization: 50 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 51 | return _normalize(S) 52 | 53 | 54 | def _lws_processor(): 55 | return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech") 56 | 57 | 58 | # Conversions: 
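# --- Illustrative usage (editor's sketch, not part of the original module) ---
# A round trip through the helpers defined above; "utterance.wav" is a hypothetical
# path, and hparams.py is assumed to provide the usual fields (sample_rate, fft_size,
# hop_size, preemphasis, min_level_db, ref_level_db, ...).
#
#     wav = load_wav("utterance.wav")          # float32 waveform at hparams.sample_rate
#     mel = melspectrogram(wav)                # mel spectrogram normalized to [0, 1]
#     linear = spectrogram(wav)                # linear spectrogram normalized to [0, 1]
#     wav_hat = inv_spectrogram(linear)        # waveform recovered via LWS phase estimation
#     save_wav(wav_hat, "reconstructed.wav")   # written as 16-bit PCM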
59 | 60 | 61 | _mel_basis = None 62 | 63 | 64 | def _linear_to_mel(spectrogram): 65 | global _mel_basis 66 | if _mel_basis is None: 67 | _mel_basis = _build_mel_basis() 68 | return np.dot(_mel_basis, spectrogram) 69 | 70 | 71 | def _build_mel_basis(): 72 | if hparams.fmax is not None: 73 | assert hparams.fmax <= hparams.sample_rate // 2 74 | return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.fft_size, 75 | fmin=hparams.fmin, fmax=hparams.fmax, 76 | n_mels=hparams.num_mels) 77 | 78 | 79 | def _amp_to_db(x): 80 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 81 | return 20 * np.log10(np.maximum(min_level, x)) 82 | 83 | 84 | def _db_to_amp(x): 85 | return np.power(10.0, x * 0.05) 86 | 87 | 88 | def _normalize(S): 89 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 90 | 91 | 92 | def _denormalize(S): 93 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 94 | -------------------------------------------------------------------------------- /compute_timestamp_ratio.py: -------------------------------------------------------------------------------- 1 | """Compute output/input timestamp ratio. 2 | 3 | usage: compute_timestamp_ratio.py [options] 4 | 5 | options: 6 | --hparams= Hyper parameters [default: ]. 7 | --preset= Path of preset parameters (json). 8 | -h, --help Show this help message and exit 9 | """ 10 | from docopt import docopt 11 | import sys 12 | import numpy as np 13 | from hparams import hparams, hparams_debug_string 14 | import train 15 | from train import TextDataSource, MelSpecDataSource 16 | from nnmnkwii.datasets import FileSourceDataset 17 | from tqdm import trange 18 | from deepvoice3_pytorch import frontend 19 | 20 | if __name__ == "__main__": 21 | args = docopt(__doc__) 22 | data_root = args[""] 23 | preset = args["--preset"] 24 | 25 | # Load preset if specified 26 | if preset is not None: 27 | with open(preset) as f: 28 | hparams.parse_json(f.read()) 29 | # Override hyper parameters 30 | hparams.parse(args["--hparams"]) 31 | assert hparams.name == "deepvoice3" 32 | 33 | train._frontend = getattr(frontend, hparams.frontend) 34 | 35 | # Code below 36 | X = FileSourceDataset(TextDataSource(data_root)) 37 | Mel = FileSourceDataset(MelSpecDataSource(data_root)) 38 | 39 | in_sizes = [] 40 | out_sizes = [] 41 | for i in trange(len(X)): 42 | x, m = X[i], Mel[i] 43 | if X.file_data_source.multi_speaker: 44 | x = x[0] 45 | in_sizes.append(x.shape[0]) 46 | out_sizes.append(m.shape[0]) 47 | 48 | in_sizes = np.array(in_sizes) 49 | out_sizes = np.array(out_sizes) 50 | 51 | input_timestamps = np.sum(in_sizes) 52 | output_timestamps = np.sum(out_sizes) / hparams.outputs_per_step / hparams.downsample_step 53 | 54 | print(input_timestamps, output_timestamps, output_timestamps / input_timestamps) 55 | sys.exit(0) 56 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from .version import __version__ 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from .modules import Embedding 9 | 10 | 11 | class MultiSpeakerTTSModel(nn.Module): 12 | """Attention seq2seq model + post processing network 13 | """ 14 | 15 | def __init__(self, seq2seq, postnet, 16 | mel_dim=80, linear_dim=513, 17 | n_speakers=1, speaker_embed_dim=16, padding_idx=None, 18 | trainable_positional_encodings=False, 19 | use_decoder_state_for_postnet_input=False, 20 | 
speaker_embedding_weight_std=0.01, 21 | freeze_embedding=False): 22 | super(MultiSpeakerTTSModel, self).__init__() 23 | self.seq2seq = seq2seq 24 | self.postnet = postnet # referred as "Converter" in DeepVoice3 25 | self.mel_dim = mel_dim 26 | self.linear_dim = linear_dim 27 | self.trainable_positional_encodings = trainable_positional_encodings 28 | self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input 29 | self.freeze_embedding = freeze_embedding 30 | 31 | # Speaker embedding 32 | if n_speakers > 1: 33 | self.embed_speakers = Embedding( 34 | n_speakers, speaker_embed_dim, padding_idx=None, 35 | std=speaker_embedding_weight_std) 36 | self.n_speakers = n_speakers 37 | self.speaker_embed_dim = speaker_embed_dim 38 | 39 | def make_generation_fast_(self): 40 | 41 | def remove_weight_norm(m): 42 | try: 43 | nn.utils.remove_weight_norm(m) 44 | except ValueError: # this module didn't have weight norm 45 | return 46 | self.apply(remove_weight_norm) 47 | 48 | def get_trainable_parameters(self): 49 | freezed_param_ids = set() 50 | 51 | encoder, decoder = self.seq2seq.encoder, self.seq2seq.decoder 52 | 53 | # Avoid updating the position encoding 54 | if not self.trainable_positional_encodings: 55 | pe_query_param_ids = set(map(id, decoder.embed_query_positions.parameters())) 56 | pe_keys_param_ids = set(map(id, decoder.embed_keys_positions.parameters())) 57 | freezed_param_ids |= (pe_query_param_ids | pe_keys_param_ids) 58 | # Avoid updating the text embedding 59 | if self.freeze_embedding: 60 | embed_param_ids = set(map(id, encoder.embed_tokens.parameters())) 61 | freezed_param_ids |= embed_param_ids 62 | 63 | return (p for p in self.parameters() if id(p) not in freezed_param_ids) 64 | 65 | def forward(self, text_sequences, mel_targets=None, speaker_ids=None, 66 | text_positions=None, frame_positions=None, input_lengths=None): 67 | B = text_sequences.size(0) 68 | 69 | if speaker_ids is not None: 70 | assert self.n_speakers > 1 71 | speaker_embed = self.embed_speakers(speaker_ids) 72 | else: 73 | speaker_embed = None 74 | 75 | # Apply seq2seq 76 | # (B, T//r, mel_dim*r) 77 | mel_outputs, alignments, done, decoder_states = self.seq2seq( 78 | text_sequences, mel_targets, speaker_embed, 79 | text_positions, frame_positions, input_lengths) 80 | 81 | # Reshape 82 | # (B, T, mel_dim) 83 | mel_outputs = mel_outputs.view(B, -1, self.mel_dim) 84 | 85 | # Prepare postnet inputs 86 | if self.use_decoder_state_for_postnet_input: 87 | postnet_inputs = decoder_states.view(B, mel_outputs.size(1), -1) 88 | else: 89 | postnet_inputs = mel_outputs 90 | 91 | # (B, T, linear_dim) 92 | # Convert coarse mel-spectrogram (or decoder hidden states) to 93 | # high resolution spectrogram 94 | linear_outputs = self.postnet(postnet_inputs, speaker_embed) 95 | assert linear_outputs.size(-1) == self.linear_dim 96 | 97 | return mel_outputs, linear_outputs, alignments, done 98 | 99 | 100 | class AttentionSeq2Seq(nn.Module): 101 | """Encoder + Decoder with attention 102 | """ 103 | 104 | def __init__(self, encoder, decoder): 105 | super(AttentionSeq2Seq, self).__init__() 106 | self.encoder = encoder 107 | self.decoder = decoder 108 | if isinstance(self.decoder.attention, nn.ModuleList): 109 | self.encoder.num_attention_layers = sum( 110 | [layer is not None for layer in decoder.attention]) 111 | 112 | def forward(self, text_sequences, mel_targets=None, speaker_embed=None, 113 | text_positions=None, frame_positions=None, input_lengths=None): 114 | # (B, T, text_embed_dim) 115 | encoder_outputs = self.encoder( 
116 | text_sequences, lengths=input_lengths, speaker_embed=speaker_embed) 117 | 118 | # Mel: (B, T//r, mel_dim*r) 119 | # Alignments: (N, B, T_target, T_input) 120 | # Done: (B, T//r, 1) 121 | mel_outputs, alignments, done, decoder_states = self.decoder( 122 | encoder_outputs, mel_targets, 123 | text_positions=text_positions, frame_positions=frame_positions, 124 | speaker_embed=speaker_embed, lengths=input_lengths) 125 | 126 | return mel_outputs, alignments, done, decoder_states 127 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 5 | 6 | 7 | def deepvoice3(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, 8 | downsample_step=1, 9 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 10 | dropout=(1 - 0.95), kernel_size=5, 11 | encoder_channels=128, 12 | decoder_channels=256, 13 | converter_channels=256, 14 | query_position_rate=1.0, 15 | key_position_rate=1.29, 16 | use_memory_mask=False, 17 | trainable_positional_encodings=False, 18 | force_monotonic_attention=True, 19 | use_decoder_state_for_postnet_input=True, 20 | max_positions=512, 21 | embedding_weight_std=0.1, 22 | speaker_embedding_weight_std=0.01, 23 | freeze_embedding=False, 24 | window_ahead=3, 25 | window_backward=1, 26 | key_projection=False, 27 | value_projection=False, 28 | ): 29 | """Build deepvoice3 30 | """ 31 | from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 32 | 33 | time_upsampling = max(downsample_step // r, 1) 34 | 35 | # Seq2seq 36 | h = encoder_channels # hidden dim (channels) 37 | k = kernel_size # kernel size 38 | encoder = Encoder( 39 | n_vocab, embed_dim, padding_idx=padding_idx, 40 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 41 | dropout=dropout, max_positions=max_positions, 42 | embedding_weight_std=embedding_weight_std, 43 | # (channels, kernel_size, dilation) 44 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 45 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 46 | (h, k, 1), (h, k, 3)], 47 | ) 48 | 49 | h = decoder_channels 50 | decoder = Decoder( 51 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 52 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 53 | dropout=dropout, max_positions=max_positions, 54 | preattention=[(h, k, 1), (h, k, 3)], 55 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 56 | (h, k, 1)], 57 | attention=[True, False, False, False, True], 58 | force_monotonic_attention=force_monotonic_attention, 59 | query_position_rate=query_position_rate, 60 | key_position_rate=key_position_rate, 61 | use_memory_mask=use_memory_mask, 62 | window_ahead=window_ahead, 63 | window_backward=window_backward, 64 | key_projection=key_projection, 65 | value_projection=value_projection, 66 | ) 67 | 68 | seq2seq = AttentionSeq2Seq(encoder, decoder) 69 | 70 | # Post net 71 | if use_decoder_state_for_postnet_input: 72 | in_dim = h // r 73 | else: 74 | in_dim = mel_dim 75 | h = converter_channels 76 | converter = Converter( 77 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 78 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 79 | time_upsampling=time_upsampling, 80 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], 81 | ) 82 | 83 | # Seq2seq + post net 84 | model = MultiSpeakerTTSModel( 85 | seq2seq, converter, 
padding_idx=padding_idx, 86 | mel_dim=mel_dim, linear_dim=linear_dim, 87 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 88 | trainable_positional_encodings=trainable_positional_encodings, 89 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 90 | speaker_embedding_weight_std=speaker_embedding_weight_std, 91 | freeze_embedding=freeze_embedding) 92 | 93 | return model 94 | 95 | 96 | def nyanko(n_vocab, embed_dim=128, mel_dim=80, linear_dim=513, r=1, 97 | downsample_step=4, 98 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 99 | dropout=(1 - 0.95), kernel_size=3, 100 | encoder_channels=256, 101 | decoder_channels=256, 102 | converter_channels=512, 103 | query_position_rate=1.0, 104 | key_position_rate=1.29, 105 | use_memory_mask=False, 106 | trainable_positional_encodings=False, 107 | force_monotonic_attention=True, 108 | use_decoder_state_for_postnet_input=False, 109 | max_positions=512, embedding_weight_std=0.01, 110 | speaker_embedding_weight_std=0.01, 111 | freeze_embedding=False, 112 | window_ahead=3, 113 | window_backward=1, 114 | key_projection=False, 115 | value_projection=False, 116 | ): 117 | from deepvoice3_pytorch.nyanko import Encoder, Decoder, Converter 118 | assert encoder_channels == decoder_channels 119 | 120 | if n_speakers != 1: 121 | raise ValueError("Multi-speaker is not supported") 122 | if not (downsample_step == 4 and r == 1): 123 | raise ValueError("Not supported. You need to change hardcoded parameters") 124 | 125 | # Seq2seq 126 | encoder = Encoder( 127 | n_vocab, embed_dim, channels=encoder_channels, kernel_size=kernel_size, 128 | padding_idx=padding_idx, 129 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 130 | dropout=dropout, embedding_weight_std=embedding_weight_std, 131 | ) 132 | 133 | decoder = Decoder( 134 | embed_dim, in_dim=mel_dim, r=r, channels=decoder_channels, 135 | kernel_size=kernel_size, padding_idx=padding_idx, 136 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 137 | dropout=dropout, max_positions=max_positions, 138 | force_monotonic_attention=force_monotonic_attention, 139 | query_position_rate=query_position_rate, 140 | key_position_rate=key_position_rate, 141 | use_memory_mask=use_memory_mask, 142 | window_ahead=window_ahead, 143 | window_backward=window_backward, 144 | key_projection=key_projection, 145 | value_projection=value_projection, 146 | ) 147 | 148 | seq2seq = AttentionSeq2Seq(encoder, decoder) 149 | 150 | if use_decoder_state_for_postnet_input: 151 | in_dim = decoder_channels // r 152 | else: 153 | in_dim = mel_dim 154 | 155 | converter = Converter( 156 | in_dim=in_dim, out_dim=linear_dim, channels=converter_channels, 157 | kernel_size=kernel_size, dropout=dropout) 158 | 159 | # Seq2seq + post net 160 | model = MultiSpeakerTTSModel( 161 | seq2seq, converter, padding_idx=padding_idx, 162 | mel_dim=mel_dim, linear_dim=linear_dim, 163 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 164 | trainable_positional_encodings=trainable_positional_encodings, 165 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 166 | speaker_embedding_weight_std=speaker_embedding_weight_std, 167 | freeze_embedding=freeze_embedding) 168 | 169 | return model 170 | 171 | 172 | def deepvoice3_multispeaker(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, 173 | downsample_step=1, 174 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 175 | dropout=(1 - 0.95), kernel_size=5, 176 | encoder_channels=128, 177 | decoder_channels=256, 178 | 
converter_channels=256, 179 | query_position_rate=1.0, 180 | key_position_rate=1.29, 181 | use_memory_mask=False, 182 | trainable_positional_encodings=False, 183 | force_monotonic_attention=True, 184 | use_decoder_state_for_postnet_input=True, 185 | max_positions=512, 186 | embedding_weight_std=0.1, 187 | speaker_embedding_weight_std=0.01, 188 | freeze_embedding=False, 189 | window_ahead=3, 190 | window_backward=1, 191 | key_projection=True, 192 | value_projection=True, 193 | ): 194 | """Build multi-speaker deepvoice3 195 | """ 196 | from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 197 | 198 | time_upsampling = max(downsample_step // r, 1) 199 | 200 | # Seq2seq 201 | h = encoder_channels # hidden dim (channels) 202 | k = kernel_size # kernel size 203 | encoder = Encoder( 204 | n_vocab, embed_dim, padding_idx=padding_idx, 205 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 206 | dropout=dropout, max_positions=max_positions, 207 | embedding_weight_std=embedding_weight_std, 208 | # (channels, kernel_size, dilation) 209 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 210 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 211 | (h, k, 1), (h, k, 3)], 212 | ) 213 | 214 | h = decoder_channels 215 | decoder = Decoder( 216 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 217 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 218 | dropout=dropout, max_positions=max_positions, 219 | preattention=[(h, k, 1)], 220 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 221 | (h, k, 1)], 222 | attention=[True, False, False, False, False], 223 | force_monotonic_attention=force_monotonic_attention, 224 | query_position_rate=query_position_rate, 225 | key_position_rate=key_position_rate, 226 | use_memory_mask=use_memory_mask, 227 | window_ahead=window_ahead, 228 | window_backward=window_backward, 229 | key_projection=key_projection, 230 | value_projection=value_projection, 231 | ) 232 | 233 | seq2seq = AttentionSeq2Seq(encoder, decoder) 234 | 235 | # Post net 236 | if use_decoder_state_for_postnet_input: 237 | in_dim = h // r 238 | else: 239 | in_dim = mel_dim 240 | h = converter_channels 241 | converter = Converter( 242 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 243 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 244 | time_upsampling=time_upsampling, 245 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], 246 | ) 247 | 248 | # Seq2seq + post net 249 | model = MultiSpeakerTTSModel( 250 | seq2seq, converter, padding_idx=padding_idx, 251 | mel_dim=mel_dim, linear_dim=linear_dim, 252 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 253 | trainable_positional_encodings=trainable_positional_encodings, 254 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 255 | speaker_embedding_weight_std=speaker_embedding_weight_std, 256 | freeze_embedding=freeze_embedding) 257 | 258 | return model 259 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | class Conv1d(nn.Conv1d): 8 | """Extended nn.Conv1d for incremental dilated convolutions 9 | """ 10 | 11 | def __init__(self, *args, **kwargs): 12 | super().__init__(*args, **kwargs) 13 | self.clear_buffer() 14 | self._linearized_weight = None 15 | 
self.register_backward_hook(self._clear_linearized_weight) 16 | 17 | def incremental_forward(self, input): 18 | # input: (B, T, C) 19 | if self.training: 20 | raise RuntimeError('incremental_forward only supports eval mode') 21 | 22 | # run forward pre hooks (e.g., weight norm) 23 | for hook in self._forward_pre_hooks.values(): 24 | hook(self, input) 25 | 26 | # reshape weight 27 | weight = self._get_linearized_weight() 28 | kw = self.kernel_size[0] 29 | dilation = self.dilation[0] 30 | 31 | bsz = input.size(0) # input: bsz x len x dim 32 | if kw > 1: 33 | input = input.data 34 | if self.input_buffer is None: 35 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2)) 36 | self.input_buffer.zero_() 37 | else: 38 | # shift buffer 39 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 40 | # append next input 41 | self.input_buffer[:, -1, :] = input[:, -1, :] 42 | input = self.input_buffer 43 | if dilation > 1: 44 | input = input[:, 0::dilation, :].contiguous() 45 | output = F.linear(input.view(bsz, -1), weight, self.bias) 46 | return output.view(bsz, 1, -1) 47 | 48 | def clear_buffer(self): 49 | self.input_buffer = None 50 | 51 | def _get_linearized_weight(self): 52 | if self._linearized_weight is None: 53 | kw = self.kernel_size[0] 54 | # nn.Conv1d 55 | if self.weight.size() == (self.out_channels, self.in_channels, kw): 56 | weight = self.weight.transpose(1, 2).contiguous() 57 | else: 58 | # fairseq.modules.conv_tbc.ConvTBC 59 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 60 | assert weight.size() == (self.out_channels, kw, self.in_channels) 61 | self._linearized_weight = weight.view(self.out_channels, -1) 62 | return self._linearized_weight 63 | 64 | def _clear_linearized_weight(self, *args): 65 | self._linearized_weight = None 66 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """Text processing frontend 4 | 5 | All frontend module should have the following functions: 6 | 7 | - text_to_sequence(text, p) 8 | - sequence_to_text(sequence) 9 | 10 | and the property: 11 | 12 | - n_vocab 13 | 14 | """ 15 | from deepvoice3_pytorch.frontend import en 16 | 17 | # optinoal Japanese frontend 18 | try: 19 | from deepvoice3_pytorch.frontend import jp 20 | except ImportError: 21 | jp = None 22 | 23 | try: 24 | from deepvoice3_pytorch.frontend import ko 25 | except ImportError: 26 | ko = None 27 | 28 | # if you are going to use the frontend, you need to modify _characters in symbol.py: 29 | # _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? 
' + '¡¿ñáéíóúÁÉÍÓÚÑ' 30 | try: 31 | from deepvoice3_pytorch.frontend import es 32 | except ImportError: 33 | es = None 34 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/en/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from deepvoice3_pytorch.frontend.text.symbols import symbols 3 | 4 | import nltk 5 | from random import random 6 | 7 | n_vocab = len(symbols) 8 | 9 | _arpabet = nltk.corpus.cmudict.dict() 10 | 11 | 12 | def _maybe_get_arpabet(word, p): 13 | try: 14 | phonemes = _arpabet[word][0] 15 | phonemes = " ".join(phonemes) 16 | except KeyError: 17 | return word 18 | 19 | return '{%s}' % phonemes if random() < p else word 20 | 21 | 22 | def mix_pronunciation(text, p): 23 | text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' ')) 24 | return text 25 | 26 | 27 | def text_to_sequence(text, p=0.0): 28 | if p >= 0: 29 | text = mix_pronunciation(text, p) 30 | from deepvoice3_pytorch.frontend.text import text_to_sequence 31 | text = text_to_sequence(text, ["english_cleaners"]) 32 | return text 33 | 34 | 35 | from deepvoice3_pytorch.frontend.text import sequence_to_text 36 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/es/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from deepvoice3_pytorch.frontend.text.symbols import symbols 3 | 4 | import nltk 5 | from random import random 6 | 7 | n_vocab = len(symbols) 8 | 9 | 10 | def text_to_sequence(text, p=0.0): 11 | from deepvoice3_pytorch.frontend.text import text_to_sequence 12 | text = text_to_sequence(text, ["basic_cleaners"]) 13 | return text 14 | 15 | 16 | from deepvoice3_pytorch.frontend.text import sequence_to_text 17 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/jp/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | import MeCab 5 | import jaconv 6 | from random import random 7 | 8 | n_vocab = 0xffff 9 | 10 | _eos = 1 11 | _pad = 0 12 | _tagger = None 13 | 14 | 15 | def _yomi(mecab_result): 16 | tokens = [] 17 | yomis = [] 18 | for line in mecab_result.split("\n")[:-1]: 19 | s = line.split("\t") 20 | if len(s) == 1: 21 | break 22 | token, rest = s 23 | rest = rest.split(",") 24 | tokens.append(token) 25 | yomi = rest[7] if len(rest) > 7 else None 26 | yomi = None if yomi == "*" else yomi 27 | yomis.append(yomi) 28 | 29 | return tokens, yomis 30 | 31 | 32 | def _mix_pronunciation(tokens, yomis, p): 33 | return "".join( 34 | yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx] 35 | for idx in range(len(tokens))) 36 | 37 | 38 | def mix_pronunciation(text, p): 39 | global _tagger 40 | if _tagger is None: 41 | _tagger = MeCab.Tagger("") 42 | tokens, yomis = _yomi(_tagger.parse(text)) 43 | return _mix_pronunciation(tokens, yomis, p) 44 | 45 | 46 | def add_punctuation(text): 47 | last = text[-1] 48 | if last not in [".", ",", "、", "。", "!", "?", "!", "?"]: 49 | text = text + "。" 50 | return text 51 | 52 | 53 | def normalize_delimitor(text): 54 | text = text.replace(",", "、") 55 | text = text.replace(".", "。") 56 | text = text.replace(",", "、") 57 | text = text.replace(".", "。") 58 | return text 59 | 60 | 61 | def text_to_sequence(text, p=0.0): 62 | for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", 63 | 
"(", ")", "(", ")"]: 64 | text = text.replace(c, "") 65 | text = text.replace("!", "!") 66 | text = text.replace("?", "?") 67 | 68 | text = normalize_delimitor(text) 69 | text = jaconv.normalize(text) 70 | if p > 0: 71 | text = mix_pronunciation(text, p) 72 | text = jaconv.hira2kata(text) 73 | text = add_punctuation(text) 74 | 75 | return [ord(c) for c in text] + [_eos] # EOS 76 | 77 | 78 | def sequence_to_text(seq): 79 | return "".join(chr(n) for n in seq) 80 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/ko/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | from random import random 5 | 6 | n_vocab = 0xffff 7 | 8 | _eos = 1 9 | _pad = 0 10 | _tagger = None 11 | 12 | 13 | def text_to_sequence(text, p=0.0): 14 | return [ord(c) for c in text] + [_eos] # EOS 15 | 16 | def sequence_to_text(seq): 17 | return "".join(chr(n) for n in seq) 18 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from deepvoice3_pytorch.frontend.text import cleaners 3 | from deepvoice3_pytorch.frontend.text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, ' ', text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def add_punctuation(text): 67 | if len(text) == 0: 68 | return text 69 | if text[-1] not in '!,.:;?': 70 | text = text + '.' # without this decoder is confused when to output EOS 71 | return text 72 | 73 | 74 | def basic_cleaners(text): 75 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def transliteration_cleaners(text): 82 | '''Pipeline for non-English text that transliterates to ASCII.''' 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = collapse_whitespace(text) 86 | return text 87 | 88 | 89 | def english_cleaners(text): 90 | '''Pipeline for English text, including number and abbreviation expansion.''' 91 | text = convert_to_ascii(text) 92 | text = add_punctuation(text) 93 | text = lowercase(text) 94 | text = expand_numbers(text) 95 | text = expand_abbreviations(text) 96 | text = collapse_whitespace(text) 97 | return text 98 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
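Entries map upper-cased words to one or more ARPAbet pronunciations; with a standard
dictionary file, lookup('hello') would give something like ['HH AH0 L OW1'] (illustrative).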
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | _alt_re = re.compile(r'\([0-9]+\)') 39 | 40 | 41 | def _parse_cmudict(file): 42 | cmudict = {} 43 | for line in file: 44 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 45 | parts = line.split(' ') 46 | word = re.sub(_alt_re, '', parts[0]) 47 | pronunciation = _get_pronunciation(parts[1]) 48 | if pronunciation: 49 | if word in cmudict: 50 | cmudict[word].append(pronunciation) 51 | else: 52 | cmudict[word] = [pronunciation] 53 | return cmudict 54 | 55 | 56 | def _get_pronunciation(s): 57 | parts = s.strip().split(' ') 58 | for part in parts: 59 | if part not in _valid_symbol_set: 60 | return None 61 | return ' '.join(parts) 62 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, 
_remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from .cmudict import valid_symbols 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | import math 6 | import numpy as np 7 | from torch.nn import functional as F 8 | 9 | 10 | def position_encoding_init(n_position, d_pos_vec, position_rate=1.0, 11 | sinusoidal=True): 12 | ''' Init the sinusoid position encoding table ''' 13 | 14 | # keep dim 0 for padding token position encoding zero vector 15 | position_enc = np.array([ 16 | [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) for i in range(d_pos_vec)] 17 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) 18 | 19 | position_enc = torch.from_numpy(position_enc).float() 20 | if sinusoidal: 21 | position_enc[1:, 0::2] = torch.sin(position_enc[1:, 0::2]) # dim 2i 22 | position_enc[1:, 1::2] = torch.cos(position_enc[1:, 1::2]) # dim 2i+1 23 | 24 | return position_enc 25 | 26 | 27 | def sinusoidal_encode(x, w): 28 | y = w * x 29 | y[1:, 0::2] = torch.sin(y[1:, 0::2].clone()) 30 | y[1:, 1::2] = torch.cos(y[1:, 1::2].clone()) 31 | return y 32 | 33 | 34 | class SinusoidalEncoding(nn.Embedding): 35 | 36 | def __init__(self, num_embeddings, embedding_dim, 37 | *args, **kwargs): 38 | super(SinusoidalEncoding, self).__init__(num_embeddings, embedding_dim, 39 | padding_idx=0, 40 | *args, **kwargs) 41 | self.weight.data = position_encoding_init(num_embeddings, embedding_dim, 42 | position_rate=1.0, 43 | sinusoidal=False) 44 | 45 | def forward(self, x, w=1.0): 46 | isscaler = np.isscalar(w) 47 | assert self.padding_idx is not None 48 | 49 | if isscaler or w.size(0) == 1: 50 | weight = sinusoidal_encode(self.weight, w) 51 | return F.embedding( 52 | x, weight, self.padding_idx, self.max_norm, 53 | self.norm_type, self.scale_grad_by_freq, self.sparse) 54 | else: 55 | # TODO: cannot simply apply for batch 56 | # better to implement efficient function 57 | pe = [] 58 | for batch_idx, we in enumerate(w): 59 | weight = sinusoidal_encode(self.weight, we) 60 | pe.append(F.embedding( 61 | x[batch_idx], weight, self.padding_idx, self.max_norm, 62 | self.norm_type, self.scale_grad_by_freq, self.sparse)) 63 | pe = torch.stack(pe) 64 | return pe 65 | 66 | 67 
| class GradMultiply(torch.autograd.Function): 68 | @staticmethod 69 | def forward(ctx, x, scale): 70 | ctx.scale = scale 71 | res = x.new(x) 72 | ctx.mark_shared_storage((x, res)) 73 | return res 74 | 75 | @staticmethod 76 | def backward(ctx, grad): 77 | return grad * ctx.scale, None 78 | 79 | 80 | def Linear(in_features, out_features, dropout=0): 81 | """Weight-normalized Linear layer (input: N x T x C)""" 82 | m = nn.Linear(in_features, out_features) 83 | m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) 84 | m.bias.data.zero_() 85 | return nn.utils.weight_norm(m) 86 | 87 | 88 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 89 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 90 | m.weight.data.normal_(0, std) 91 | return m 92 | 93 | 94 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 95 | from .conv import Conv1d 96 | m = Conv1d(in_channels, out_channels, kernel_size, **kwargs) 97 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 98 | m.weight.data.normal_(mean=0, std=std) 99 | m.bias.data.zero_() 100 | return nn.utils.weight_norm(m) 101 | 102 | 103 | def ConvTranspose1d(in_channels, out_channels, kernel_size, dropout=0, 104 | std_mul=1.0, **kwargs): 105 | m = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, **kwargs) 106 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 107 | m.weight.data.normal_(mean=0, std=std) 108 | m.bias.data.zero_() 109 | return nn.utils.weight_norm(m) 110 | 111 | 112 | class Conv1dGLU(nn.Module): 113 | """(Dilated) Conv1d + Gated linear unit + (optionally) speaker embedding 114 | """ 115 | 116 | def __init__(self, n_speakers, speaker_embed_dim, 117 | in_channels, out_channels, kernel_size, 118 | dropout, padding=None, dilation=1, causal=False, residual=False, 119 | *args, **kwargs): 120 | super(Conv1dGLU, self).__init__() 121 | self.dropout = dropout 122 | self.residual = residual 123 | if padding is None: 124 | # no future time stamps available 125 | if causal: 126 | padding = (kernel_size - 1) * dilation 127 | else: 128 | padding = (kernel_size - 1) // 2 * dilation 129 | self.causal = causal 130 | 131 | self.conv = Conv1d(in_channels, 2 * out_channels, kernel_size, 132 | dropout=dropout, padding=padding, dilation=dilation, 133 | *args, **kwargs) 134 | if n_speakers > 1: 135 | self.speaker_proj = Linear(speaker_embed_dim, out_channels) 136 | else: 137 | self.speaker_proj = None 138 | 139 | def forward(self, x, speaker_embed=None): 140 | return self._forward(x, speaker_embed, False) 141 | 142 | def incremental_forward(self, x, speaker_embed=None): 143 | return self._forward(x, speaker_embed, True) 144 | 145 | def _forward(self, x, speaker_embed, is_incremental): 146 | residual = x 147 | x = F.dropout(x, p=self.dropout, training=self.training) 148 | if is_incremental: 149 | splitdim = -1 150 | x = self.conv.incremental_forward(x) 151 | else: 152 | splitdim = 1 153 | x = self.conv(x) 154 | # remove future time steps 155 | x = x[:, :, :residual.size(-1)] if self.causal else x 156 | 157 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 158 | if self.speaker_proj is not None: 159 | softsign = F.softsign(self.speaker_proj(speaker_embed)) 160 | # Since conv layer assumes BCT, we need to transpose 161 | softsign = softsign if is_incremental else softsign.transpose(1, 2) 162 | a = a + softsign 163 | x = a * torch.sigmoid(b) 164 | return (x + residual) * math.sqrt(0.5) if 
self.residual else x 165 | 166 | def clear_buffer(self): 167 | self.conv.clear_buffer() 168 | 169 | 170 | class HighwayConv1d(nn.Module): 171 | """Weight-normalized Conv1d + Highway network (supports incremental forward) 172 | """ 173 | 174 | def __init__(self, in_channels, out_channels, kernel_size=1, padding=None, 175 | dilation=1, causal=False, dropout=0, std_mul=None, glu=False): 176 | super(HighwayConv1d, self).__init__() 177 | if std_mul is None: 178 | std_mul = 4.0 if glu else 1.0 179 | if padding is None: 180 | # no future time stamps available 181 | if causal: 182 | padding = (kernel_size - 1) * dilation 183 | else: 184 | padding = (kernel_size - 1) // 2 * dilation 185 | self.causal = causal 186 | self.dropout = dropout 187 | self.glu = glu 188 | 189 | self.conv = Conv1d(in_channels, 2 * out_channels, 190 | kernel_size=kernel_size, padding=padding, 191 | dilation=dilation, dropout=dropout, 192 | std_mul=std_mul) 193 | 194 | def forward(self, x): 195 | return self._forward(x, False) 196 | 197 | def incremental_forward(self, x): 198 | return self._forward(x, True) 199 | 200 | def _forward(self, x, is_incremental): 201 | """Forward 202 | 203 | Args: 204 | x: (B, in_channels, T) 205 | returns: 206 | (B, out_channels, T) 207 | """ 208 | 209 | residual = x 210 | x = F.dropout(x, p=self.dropout, training=self.training) 211 | if is_incremental: 212 | splitdim = -1 213 | x = self.conv.incremental_forward(x) 214 | else: 215 | splitdim = 1 216 | x = self.conv(x) 217 | # remove future time steps 218 | x = x[:, :, :residual.size(-1)] if self.causal else x 219 | 220 | if self.glu: 221 | x = F.glu(x, dim=splitdim) 222 | return (x + residual) * math.sqrt(0.5) 223 | else: 224 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 225 | T = torch.sigmoid(b) 226 | return (T * a + (1 - T) * residual) 227 | 228 | def clear_buffer(self): 229 | self.conv.clear_buffer() 230 | 231 | 232 | def get_mask_from_lengths(memory, memory_lengths): 233 | """Get mask tensor from a list of lengths 234 | Args: 235 | memory: (batch, max_time, dim) 236 | memory_lengths: array like 237 | """ 238 | max_len = max(memory_lengths) 239 | mask = torch.arange(max_len).expand(memory.size(0), max_len) < torch.tensor(memory_lengths).unsqueeze(-1) 240 | mask = mask.to(memory.device) 241 | return ~mask 242 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/tfcompat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/deepvoice3_pytorch/tfcompat/__init__.py -------------------------------------------------------------------------------- /deepvoice3_pytorch/tfcompat/readme.md: -------------------------------------------------------------------------------- 1 | Source: hparam.py copied from tensorflow v1.12.0. 2 | 3 | https://github.com/tensorflow/tensorflow/blob/v1.12.0/tensorflow/contrib/training/python/training/hparam.py 4 | 5 | downloaded with the following command: 6 | wget https://github.com/tensorflow/tensorflow/raw/v1.12.0/tensorflow/contrib/training/python/training/hparam.py 7 | 8 | Once all other tensorflow dependencies of this file are removed, the class keeps its original purpose. Functions that are no longer available because of this removal are not used in this project.
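
The vendored hparam.py exists so the project can keep the familiar tf.contrib.training.HParams interface without a TensorFlow dependency. A minimal usage sketch follows; the hyperparameter names below are hypothetical placeholders, not the ones actually defined in hparams.py at the repository root.

# Minimal HParams sketch (hypothetical hyperparameter names).
from deepvoice3_pytorch.tfcompat.hparam import HParams

hparams = HParams(batch_size=16, outputs_per_step=4, builder="deepvoice3")

# Overrides use the same "name=value,name=value" syntax as
# tf.contrib.training.HParams.parse(), e.g. coming from a --hparams CLI flag.
hparams.parse("batch_size=32,outputs_per_step=1")

assert hparams.batch_size == 32   # attribute-style access, values cast to declared types
print(hparams.values())           # dict of all current hyperparameter values
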
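Looking back at modules.py above, Conv1dGLU and SinusoidalEncoding are the basic building blocks of the convolutional encoder/decoder. A small smoke-test sketch, assuming the package is installed and using the (B, C, T) layout from the docstrings; shapes and hyperparameter values here are arbitrary illustrations, not the values used in training.

# Smoke test for the building blocks defined in modules.py above.
import torch
from deepvoice3_pytorch.modules import Conv1dGLU, SinusoidalEncoding

B, C, T = 2, 128, 16
x = torch.randn(B, C, T)  # (batch, channels, time), as in the docstrings

# Causal gated conv block; the residual connection requires in_channels == out_channels.
conv = Conv1dGLU(n_speakers=1, speaker_embed_dim=16,
                 in_channels=C, out_channels=C, kernel_size=3,
                 dropout=0.1, causal=True, residual=True)
y = conv(x)
assert y.shape == (B, C, T)

# Position embeddings: index 0 is reserved for padding (an all-zero vector),
# so positions start at 1.
pe = SinusoidalEncoding(num_embeddings=T + 1, embedding_dim=C)
positions = torch.arange(1, T + 1).unsqueeze(0)  # (1, T)
p = pe(positions)                                # (1, T, C)
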
9 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | public -------------------------------------------------------------------------------- /docs/config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "https://r9y9.github.io/deepvoice3_pytorch/" 2 | languageCode = "ja-jp" 3 | title = "An open source implementation of Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning" 4 | author = "Ryuichi YAMAMOTO" 5 | 6 | [params] 7 | author = "Ryuichi YAMAMOTO" 8 | project = "deepvoice3_pytorch" 9 | logo = "/images/512logotipo.png" 10 | twitter = "r9y9" 11 | github = "r9y9" 12 | analytics = "UA-44433856-1" 13 | -------------------------------------------------------------------------------- /docs/layouts/_default/list.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 |
4 |

{{ .Title }}

5 | {{ range .Data.Pages }} 6 | 10 | {{ end }} 11 |
12 | 13 | {{ partial "footer.html" . }} -------------------------------------------------------------------------------- /docs/layouts/_default/single.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 |
4 |
5 |

{{ .Title }}

6 | 7 |
8 | {{ .Content }} 9 | {{ partial "social.html" . }} 10 |
11 |
12 |
13 | 14 | {{ partial "footer.html" . }} 15 | -------------------------------------------------------------------------------- /docs/layouts/index.html: -------------------------------------------------------------------------------- 1 | {{ template "partials/header.html" . }} 2 | {{ range .Data.Pages }} 3 | {{if eq .Type "index" }} 4 | {{.Content}} 5 | {{end}} 6 | {{ end }} 7 | {{ template "partials/footer.html" . }} 8 | -------------------------------------------------------------------------------- /docs/layouts/partials/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 20 | {{ with .Site.Params.analytics }}{{ end }} 28 | 29 | 30 | 31 | 32 | {{ partial "mathjax.html" . }} 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/layouts/partials/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ .Hugo.Generator }} 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | {{ $isHomePage := eq .Title .Site.Title }}{{ .Title }}{{ if eq $isHomePage false }} - {{ .Site.Title }}{{ end }} 15 | 16 | 17 | 18 |
19 | 20 |
21 | 24 | {{ if eq $isHomePage true }}

{{ .Site.Title }}

{{ end }} 25 |
26 | -------------------------------------------------------------------------------- /docs/layouts/partials/mathjax.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 | 30 | 31 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/layouts/partials/social.html: -------------------------------------------------------------------------------- 1 | {{ if isset .Site.Params "twitter" }} 2 | 8 | {{ end }} 9 | -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000.wav -------------------------------------------------------------------------------- 
/docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav 
-------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- 
/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker0.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker1.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker10.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker10.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker11.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker11.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker2.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker3.wav -------------------------------------------------------------------------------- 
/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker4.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker5.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker6.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker7.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker8.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker9.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000_alignment.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000_alignment.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/css/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: "Roboto", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | background-color: #FCFCFC; 4 | -webkit-font-smoothing: antialiased; 5 | font-size: 1.8em; 6 | line-height: 1.5; 7 | font-weight: 300; 8 | } 9 | 10 | h1, h2, h3, h4, h5, h6 { 11 | color: #263c4c; 12 | } 13 | h2, h3, h4, h5, h6 { 14 | margin-top: 5rem; 15 | margin-bottom: 3rem; 16 | font-weight: bold; 17 | padding-bottom: 10px; 18 | } 19 | 20 | h1 { font-size: 3.0rem; } 21 | h2 { 22 | margin-top: 6rem; 23 | font-size: 2.6rem; 24 | } 25 | h3 { font-size: 2.1rem; } 26 | h4, 27 | h5, 28 | h6 { font-size: 1.9rem; } 29 | 30 | h2.entry-title { 31 | font-size: 2.1rem; 32 | margin-top: 0; 33 | font-weight: 400; 34 | border-bottom: none; 35 | } 36 | 37 | li { 38 | margin-bottom: 0.5rem; 39 | margin-left: 0.7em; 40 | } 41 | 42 | img { 43 | max-width: 100%; 44 | height: auto; 45 | vertical-align: middle; 46 | border: 0; 47 | margin: 1em 0; 48 | } 49 | 50 | header, 51 | footer { 52 | margin: 4rem 0; 53 | text-align: center; 54 | } 55 | 56 | main { 57 | margin: 4rem 0; 58 | } 59 | 60 | .container { 61 | width: 90%; 62 | max-width: 700px; 63 | } 64 | 65 | .site-title { 66 | margin-top: 2rem; 67 | } 68 | 69 | .entry-title { 70 | margin-bottom: 0; 71 | } 72 | 73 | .entry-title a { 74 | text-decoration: none; 75 | } 76 | 77 | .entry-meta { 78 | display: inline-block; 79 | margin-bottom: 2rem; 80 | font-size: 1.6rem; 81 | color: #888; 82 | } 83 | 84 | .footer-link { 85 | margin: 2rem 0; 86 | } 87 | 88 | .hr { 89 | height: 1px; 90 | margin: 2rem 0; 91 | background: #E1E1E1; 92 | background: -webkit-gradient(linear, left top, right top, from(white), color-stop(#E1E1E1), to(white)); 93 | background: -webkit-linear-gradient(left, white, #E1E1E1, white); 94 | background: linear-gradient(to right, white, #E1E1E1, white); 95 | } 96 | 97 | article .social { 98 | height: 40px; 99 | padding: 10px 0; 100 | } 101 | 102 | address { 103 | margin: 0; 104 | font-size:0.9em; 105 | max-height: 60px; 106 | font-weight: 300; 107 | font-style: normal; 108 | display: block; 109 | } 110 | 111 | address a { 112 | text-decoration: none; 113 | } 114 | 115 | .avatar-bottom img { 116 | border-radius: 50%; 117 | 
border: 1px solid #E1E1E1; 118 | float: left; 119 | max-width: 100%; 120 | vertical-align: middle; 121 | width: 32px; 122 | height: 32px; 123 | margin: 0 20px 0 0; 124 | margin-top: -7px; 125 | } 126 | 127 | .avatar-bottom img:hover { 128 | border-color: #F1F1F1; 129 | } 130 | 131 | .copyright { 132 | font-size:0.9em; 133 | font-weight: 300; 134 | } 135 | 136 | .github { 137 | float: right; 138 | } 139 | 140 | blockquote { 141 | position: relative; 142 | padding: 10px 10px 10px 32px; 143 | box-sizing: border-box; 144 | font-style: italic; 145 | color: #464646; 146 | background: #e0e0e0; 147 | } 148 | 149 | blockquote:before{ 150 | display: inline-block; 151 | position: absolute; 152 | top: 0; 153 | left: 0; 154 | vertical-align: middle; 155 | content: "\f10d"; 156 | font-family: FontAwesome; 157 | color: #e0e0e0; 158 | font-size: 22px; 159 | line-height: 1; 160 | z-index: 2; 161 | } 162 | 163 | blockquote:after{ 164 | position: absolute; 165 | content: ''; 166 | left: 0; 167 | top: 0; 168 | border-width: 0 0 40px 40px; 169 | border-style: solid; 170 | border-color: transparent #ffffff; 171 | } 172 | 173 | blockquote p { 174 | position: relative; 175 | padding: 0; 176 | margin: 10px 0; 177 | z-index: 3; 178 | line-height: 1.7; 179 | } 180 | 181 | blockquote cite { 182 | display: block; 183 | text-align: right; 184 | color: #888888; 185 | font-size: 0.9em; 186 | } 187 | -------------------------------------------------------------------------------- /docs/static/css/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v3.0.2 | MIT License | git.io/normalize */ 2 | 3 | /** 4 | * 1. Set default font family to sans-serif. 5 | * 2. Prevent iOS text size adjust after orientation change, without disabling 6 | * user zoom. 7 | */ 8 | 9 | html { 10 | font-family: sans-serif; /* 1 */ 11 | -ms-text-size-adjust: 100%; /* 2 */ 12 | -webkit-text-size-adjust: 100%; /* 2 */ 13 | } 14 | 15 | /** 16 | * Remove default margin. 17 | */ 18 | 19 | body { 20 | margin: 0; 21 | } 22 | 23 | /* HTML5 display definitions 24 | ========================================================================== */ 25 | 26 | /** 27 | * Correct `block` display not defined for any HTML5 element in IE 8/9. 28 | * Correct `block` display not defined for `details` or `summary` in IE 10/11 29 | * and Firefox. 30 | * Correct `block` display not defined for `main` in IE 11. 31 | */ 32 | 33 | article, 34 | aside, 35 | details, 36 | figcaption, 37 | figure, 38 | footer, 39 | header, 40 | hgroup, 41 | main, 42 | menu, 43 | nav, 44 | section, 45 | summary { 46 | display: block; 47 | } 48 | 49 | /** 50 | * 1. Correct `inline-block` display not defined in IE 8/9. 51 | * 2. Normalize vertical alignment of `progress` in Chrome, Firefox, and Opera. 52 | */ 53 | 54 | audio, 55 | canvas, 56 | progress, 57 | video { 58 | display: inline-block; /* 1 */ 59 | vertical-align: baseline; /* 2 */ 60 | } 61 | 62 | /** 63 | * Prevent modern browsers from displaying `audio` without controls. 64 | * Remove excess height in iOS 5 devices. 65 | */ 66 | 67 | audio:not([controls]) { 68 | display: none; 69 | height: 0; 70 | } 71 | 72 | /** 73 | * Address `[hidden]` styling not present in IE 8/9/10. 74 | * Hide the `template` element in IE 8/9/11, Safari, and Firefox < 22. 
75 | */ 76 | 77 | [hidden], 78 | template { 79 | display: none; 80 | } 81 | 82 | /* Links 83 | ========================================================================== */ 84 | 85 | /** 86 | * Remove the gray background color from active links in IE 10. 87 | */ 88 | 89 | a { 90 | background-color: transparent; 91 | } 92 | 93 | /** 94 | * Improve readability when focused and also mouse hovered in all browsers. 95 | */ 96 | 97 | a:active, 98 | a:hover { 99 | outline: 0; 100 | } 101 | 102 | /* Text-level semantics 103 | ========================================================================== */ 104 | 105 | /** 106 | * Address styling not present in IE 8/9/10/11, Safari, and Chrome. 107 | */ 108 | 109 | abbr[title] { 110 | border-bottom: 1px dotted; 111 | } 112 | 113 | /** 114 | * Address style set to `bolder` in Firefox 4+, Safari, and Chrome. 115 | */ 116 | 117 | b, 118 | strong { 119 | font-weight: bold; 120 | } 121 | 122 | /** 123 | * Address styling not present in Safari and Chrome. 124 | */ 125 | 126 | dfn { 127 | font-style: italic; 128 | } 129 | 130 | /** 131 | * Address variable `h1` font-size and margin within `section` and `article` 132 | * contexts in Firefox 4+, Safari, and Chrome. 133 | */ 134 | 135 | h1 { 136 | font-size: 2em; 137 | margin: 0.67em 0; 138 | } 139 | 140 | /** 141 | * Address styling not present in IE 8/9. 142 | */ 143 | 144 | mark { 145 | background: #ff0; 146 | color: #000; 147 | } 148 | 149 | /** 150 | * Address inconsistent and variable font size in all browsers. 151 | */ 152 | 153 | small { 154 | font-size: 80%; 155 | } 156 | 157 | /** 158 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 159 | */ 160 | 161 | sub, 162 | sup { 163 | font-size: 75%; 164 | line-height: 0; 165 | position: relative; 166 | vertical-align: baseline; 167 | } 168 | 169 | sup { 170 | top: -0.5em; 171 | } 172 | 173 | sub { 174 | bottom: -0.25em; 175 | } 176 | 177 | /* Embedded content 178 | ========================================================================== */ 179 | 180 | /** 181 | * Remove border when inside `a` element in IE 8/9/10. 182 | */ 183 | 184 | img { 185 | border: 0; 186 | } 187 | 188 | /** 189 | * Correct overflow not hidden in IE 9/10/11. 190 | */ 191 | 192 | svg:not(:root) { 193 | overflow: hidden; 194 | } 195 | 196 | /* Grouping content 197 | ========================================================================== */ 198 | 199 | /** 200 | * Address margin not present in IE 8/9 and Safari. 201 | */ 202 | 203 | figure { 204 | margin: 1em 40px; 205 | } 206 | 207 | /** 208 | * Address differences between Firefox and other browsers. 209 | */ 210 | 211 | hr { 212 | -moz-box-sizing: content-box; 213 | box-sizing: content-box; 214 | height: 0; 215 | } 216 | 217 | /** 218 | * Contain overflow in all browsers. 219 | */ 220 | 221 | pre { 222 | overflow: auto; 223 | } 224 | 225 | /** 226 | * Address odd `em`-unit font size rendering in all browsers. 227 | */ 228 | 229 | code, 230 | kbd, 231 | pre, 232 | samp { 233 | font-family: monospace, monospace; 234 | font-size: 1em; 235 | } 236 | 237 | /* Forms 238 | ========================================================================== */ 239 | 240 | /** 241 | * Known limitation: by default, Chrome and Safari on OS X allow very limited 242 | * styling of `select`, unless a `border` property is set. 243 | */ 244 | 245 | /** 246 | * 1. Correct color not being inherited. 247 | * Known issue: affects color of disabled elements. 248 | * 2. Correct font properties not being inherited. 249 | * 3. 
Address margins set differently in Firefox 4+, Safari, and Chrome. 250 | */ 251 | 252 | button, 253 | input, 254 | optgroup, 255 | select, 256 | textarea { 257 | color: inherit; /* 1 */ 258 | font: inherit; /* 2 */ 259 | margin: 0; /* 3 */ 260 | } 261 | 262 | /** 263 | * Address `overflow` set to `hidden` in IE 8/9/10/11. 264 | */ 265 | 266 | button { 267 | overflow: visible; 268 | } 269 | 270 | /** 271 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 272 | * All other form control elements do not inherit `text-transform` values. 273 | * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera. 274 | * Correct `select` style inheritance in Firefox. 275 | */ 276 | 277 | button, 278 | select { 279 | text-transform: none; 280 | } 281 | 282 | /** 283 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 284 | * and `video` controls. 285 | * 2. Correct inability to style clickable `input` types in iOS. 286 | * 3. Improve usability and consistency of cursor style between image-type 287 | * `input` and others. 288 | */ 289 | 290 | button, 291 | html input[type="button"], /* 1 */ 292 | input[type="reset"], 293 | input[type="submit"] { 294 | -webkit-appearance: button; /* 2 */ 295 | cursor: pointer; /* 3 */ 296 | } 297 | 298 | /** 299 | * Re-set default cursor for disabled elements. 300 | */ 301 | 302 | button[disabled], 303 | html input[disabled] { 304 | cursor: default; 305 | } 306 | 307 | /** 308 | * Remove inner padding and border in Firefox 4+. 309 | */ 310 | 311 | button::-moz-focus-inner, 312 | input::-moz-focus-inner { 313 | border: 0; 314 | padding: 0; 315 | } 316 | 317 | /** 318 | * Address Firefox 4+ setting `line-height` on `input` using `!important` in 319 | * the UA stylesheet. 320 | */ 321 | 322 | input { 323 | line-height: normal; 324 | } 325 | 326 | /** 327 | * It's recommended that you don't attempt to style these elements. 328 | * Firefox's implementation doesn't respect box-sizing, padding, or width. 329 | * 330 | * 1. Address box sizing set to `content-box` in IE 8/9/10. 331 | * 2. Remove excess padding in IE 8/9/10. 332 | */ 333 | 334 | input[type="checkbox"], 335 | input[type="radio"] { 336 | box-sizing: border-box; /* 1 */ 337 | padding: 0; /* 2 */ 338 | } 339 | 340 | /** 341 | * Fix the cursor style for Chrome's increment/decrement buttons. For certain 342 | * `font-size` values of the `input`, it causes the cursor style of the 343 | * decrement button to change from `default` to `text`. 344 | */ 345 | 346 | input[type="number"]::-webkit-inner-spin-button, 347 | input[type="number"]::-webkit-outer-spin-button { 348 | height: auto; 349 | } 350 | 351 | /** 352 | * 1. Address `appearance` set to `searchfield` in Safari and Chrome. 353 | * 2. Address `box-sizing` set to `border-box` in Safari and Chrome 354 | * (include `-moz` to future-proof). 355 | */ 356 | 357 | input[type="search"] { 358 | -webkit-appearance: textfield; /* 1 */ 359 | -moz-box-sizing: content-box; 360 | -webkit-box-sizing: content-box; /* 2 */ 361 | box-sizing: content-box; 362 | } 363 | 364 | /** 365 | * Remove inner padding and search cancel button in Safari and Chrome on OS X. 366 | * Safari (but not Chrome) clips the cancel button when the search input has 367 | * padding (and `textfield` appearance). 
368 | */ 369 | 370 | input[type="search"]::-webkit-search-cancel-button, 371 | input[type="search"]::-webkit-search-decoration { 372 | -webkit-appearance: none; 373 | } 374 | 375 | /** 376 | * Define consistent border, margin, and padding. 377 | */ 378 | 379 | fieldset { 380 | border: 1px solid #c0c0c0; 381 | margin: 0 2px; 382 | padding: 0.35em 0.625em 0.75em; 383 | } 384 | 385 | /** 386 | * 1. Correct `color` not being inherited in IE 8/9/10/11. 387 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 388 | */ 389 | 390 | legend { 391 | border: 0; /* 1 */ 392 | padding: 0; /* 2 */ 393 | } 394 | 395 | /** 396 | * Remove default vertical scrollbar in IE 8/9/10/11. 397 | */ 398 | 399 | textarea { 400 | overflow: auto; 401 | } 402 | 403 | /** 404 | * Don't inherit the `font-weight` (applied by a rule above). 405 | * NOTE: the default cannot safely be changed in Chrome and Safari on OS X. 406 | */ 407 | 408 | optgroup { 409 | font-weight: bold; 410 | } 411 | 412 | /* Tables 413 | ========================================================================== */ 414 | 415 | /** 416 | * Remove most spacing between table cells. 417 | */ 418 | 419 | table { 420 | border-collapse: collapse; 421 | border-spacing: 0; 422 | } 423 | 424 | td, 425 | th { 426 | padding: 0; 427 | } -------------------------------------------------------------------------------- /docs/static/css/skeleton.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Skeleton V2.0.4 3 | * Copyright 2014, Dave Gamache 4 | * www.getskeleton.com 5 | * Free to use under the MIT license. 6 | * http://www.opensource.org/licenses/mit-license.php 7 | * 12/29/2014 8 | */ 9 | 10 | 11 | /* Table of contents 12 | –––––––––––––––––––––––––––––––––––––––––––––––––– 13 | - Grid 14 | - Base Styles 15 | - Typography 16 | - Links 17 | - Buttons 18 | - Forms 19 | - Lists 20 | - Code 21 | - Tables 22 | - Spacing 23 | - Utilities 24 | - Clearing 25 | - Media Queries 26 | */ 27 | 28 | 29 | /* Grid 30 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 31 | .container { 32 | position: relative; 33 | width: 100%; 34 | max-width: 960px; 35 | margin: 0 auto; 36 | padding: 0 20px; 37 | box-sizing: border-box; } 38 | .column, 39 | .columns { 40 | width: 100%; 41 | float: left; 42 | box-sizing: border-box; } 43 | 44 | /* For devices larger than 400px */ 45 | @media (min-width: 400px) { 46 | .container { 47 | width: 85%; 48 | padding: 0; } 49 | } 50 | 51 | /* For devices larger than 550px */ 52 | @media (min-width: 550px) { 53 | .container { 54 | width: 80%; } 55 | .column, 56 | .columns { 57 | margin-left: 4%; } 58 | .column:first-child, 59 | .columns:first-child { 60 | margin-left: 0; } 61 | 62 | .one.column, 63 | .one.columns { width: 4.66666666667%; } 64 | .two.columns { width: 13.3333333333%; } 65 | .three.columns { width: 22%; } 66 | .four.columns { width: 30.6666666667%; } 67 | .five.columns { width: 39.3333333333%; } 68 | .six.columns { width: 48%; } 69 | .seven.columns { width: 56.6666666667%; } 70 | .eight.columns { width: 65.3333333333%; } 71 | .nine.columns { width: 74.0%; } 72 | .ten.columns { width: 82.6666666667%; } 73 | .eleven.columns { width: 91.3333333333%; } 74 | .twelve.columns { width: 100%; margin-left: 0; } 75 | 76 | .one-third.column { width: 30.6666666667%; } 77 | .two-thirds.column { width: 65.3333333333%; } 78 | 79 | .one-half.column { width: 48%; } 80 | 81 | /* Offsets */ 82 | .offset-by-one.column, 83 | .offset-by-one.columns { margin-left: 8.66666666667%; } 84 | 
.offset-by-two.column, 85 | .offset-by-two.columns { margin-left: 17.3333333333%; } 86 | .offset-by-three.column, 87 | .offset-by-three.columns { margin-left: 26%; } 88 | .offset-by-four.column, 89 | .offset-by-four.columns { margin-left: 34.6666666667%; } 90 | .offset-by-five.column, 91 | .offset-by-five.columns { margin-left: 43.3333333333%; } 92 | .offset-by-six.column, 93 | .offset-by-six.columns { margin-left: 52%; } 94 | .offset-by-seven.column, 95 | .offset-by-seven.columns { margin-left: 60.6666666667%; } 96 | .offset-by-eight.column, 97 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 98 | .offset-by-nine.column, 99 | .offset-by-nine.columns { margin-left: 78.0%; } 100 | .offset-by-ten.column, 101 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 102 | .offset-by-eleven.column, 103 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 104 | 105 | .offset-by-one-third.column, 106 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 107 | .offset-by-two-thirds.column, 108 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 109 | 110 | .offset-by-one-half.column, 111 | .offset-by-one-half.columns { margin-left: 52%; } 112 | 113 | } 114 | 115 | 116 | /* Base Styles 117 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 118 | /* NOTE 119 | html is set to 62.5% so that all the REM measurements throughout Skeleton 120 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 121 | html { 122 | font-size: 62.5%; } 123 | body { 124 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 125 | line-height: 1.6; 126 | font-weight: 400; 127 | font-family: "Raleway", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 128 | color: #222; } 129 | 130 | 131 | /* Typography 132 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 133 | h1, h2, h3, h4, h5, h6 { 134 | margin-top: 0; 135 | margin-bottom: 2rem; 136 | font-weight: 300; } 137 | h1 { font-size: 4.0rem; line-height: 1.2; letter-spacing: -.1rem;} 138 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; } 139 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; } 140 | h4 { font-size: 2.4rem; line-height: 1.35; letter-spacing: -.08rem; } 141 | h5 { font-size: 1.8rem; line-height: 1.5; letter-spacing: -.05rem; } 142 | h6 { font-size: 1.5rem; line-height: 1.6; letter-spacing: 0; } 143 | 144 | /* Larger than phablet */ 145 | @media (min-width: 550px) { 146 | h1 { font-size: 5.0rem; } 147 | h2 { font-size: 4.2rem; } 148 | h3 { font-size: 3.6rem; } 149 | h4 { font-size: 3.0rem; } 150 | h5 { font-size: 2.4rem; } 151 | h6 { font-size: 1.5rem; } 152 | } 153 | 154 | p { 155 | margin-top: 0; } 156 | 157 | 158 | /* Links 159 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 160 | a { 161 | color: #1EAEDB; } 162 | a:hover { 163 | color: #0FA0CE; } 164 | 165 | 166 | /* Buttons 167 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 168 | .button, 169 | button, 170 | input[type="submit"], 171 | input[type="reset"], 172 | input[type="button"] { 173 | display: inline-block; 174 | height: 38px; 175 | padding: 0 30px; 176 | color: #555; 177 | text-align: center; 178 | font-size: 11px; 179 | font-weight: 600; 180 | line-height: 38px; 181 | letter-spacing: .1rem; 182 | text-transform: uppercase; 183 | text-decoration: none; 184 | white-space: nowrap; 185 | background-color: transparent; 186 | border-radius: 4px; 187 | border: 1px solid #bbb; 188 | cursor: pointer; 189 | box-sizing: border-box; } 190 | 
.button:hover, 191 | button:hover, 192 | input[type="submit"]:hover, 193 | input[type="reset"]:hover, 194 | input[type="button"]:hover, 195 | .button:focus, 196 | button:focus, 197 | input[type="submit"]:focus, 198 | input[type="reset"]:focus, 199 | input[type="button"]:focus { 200 | color: #333; 201 | border-color: #888; 202 | outline: 0; } 203 | .button.button-primary, 204 | button.button-primary, 205 | input[type="submit"].button-primary, 206 | input[type="reset"].button-primary, 207 | input[type="button"].button-primary { 208 | color: #FFF; 209 | background-color: #33C3F0; 210 | border-color: #33C3F0; } 211 | .button.button-primary:hover, 212 | button.button-primary:hover, 213 | input[type="submit"].button-primary:hover, 214 | input[type="reset"].button-primary:hover, 215 | input[type="button"].button-primary:hover, 216 | .button.button-primary:focus, 217 | button.button-primary:focus, 218 | input[type="submit"].button-primary:focus, 219 | input[type="reset"].button-primary:focus, 220 | input[type="button"].button-primary:focus { 221 | color: #FFF; 222 | background-color: #1EAEDB; 223 | border-color: #1EAEDB; } 224 | 225 | 226 | /* Forms 227 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 228 | input[type="email"], 229 | input[type="number"], 230 | input[type="search"], 231 | input[type="text"], 232 | input[type="tel"], 233 | input[type="url"], 234 | input[type="password"], 235 | textarea, 236 | select { 237 | height: 38px; 238 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 239 | background-color: #fff; 240 | border: 1px solid #D1D1D1; 241 | border-radius: 4px; 242 | box-shadow: none; 243 | box-sizing: border-box; } 244 | /* Removes awkward default styles on some inputs for iOS */ 245 | input[type="email"], 246 | input[type="number"], 247 | input[type="search"], 248 | input[type="text"], 249 | input[type="tel"], 250 | input[type="url"], 251 | input[type="password"], 252 | textarea { 253 | -webkit-appearance: none; 254 | -moz-appearance: none; 255 | appearance: none; } 256 | textarea { 257 | min-height: 65px; 258 | padding-top: 6px; 259 | padding-bottom: 6px; } 260 | input[type="email"]:focus, 261 | input[type="number"]:focus, 262 | input[type="search"]:focus, 263 | input[type="text"]:focus, 264 | input[type="tel"]:focus, 265 | input[type="url"]:focus, 266 | input[type="password"]:focus, 267 | textarea:focus, 268 | select:focus { 269 | border: 1px solid #33C3F0; 270 | outline: 0; } 271 | label, 272 | legend { 273 | display: block; 274 | margin-bottom: .5rem; 275 | font-weight: 600; } 276 | fieldset { 277 | padding: 0; 278 | border-width: 0; } 279 | input[type="checkbox"], 280 | input[type="radio"] { 281 | display: inline; } 282 | label > .label-body { 283 | display: inline-block; 284 | margin-left: .5rem; 285 | font-weight: normal; } 286 | 287 | 288 | /* Lists 289 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 290 | ul { 291 | list-style: circle inside; } 292 | ol { 293 | list-style: decimal inside; } 294 | ol, ul { 295 | padding-left: 0; 296 | margin-top: 0; } 297 | ul ul, 298 | ul ol, 299 | ol ol, 300 | ol ul { 301 | margin: 1.5rem 0 1.5rem 3rem; 302 | font-size: 90%; } 303 | li { 304 | margin-bottom: 1rem; } 305 | 306 | 307 | /* Code 308 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 309 | code { 310 | padding: .2rem .5rem; 311 | margin: 0 .2rem; 312 | font-size: 90%; 313 | white-space: nowrap; 314 | background: #F1F1F1; 315 | border: 1px solid #E1E1E1; 316 | border-radius: 4px; } 317 | pre > code { 318 | display: block; 
319 | padding: 1rem 1.5rem; 320 | white-space: pre; } 321 | 322 | 323 | /* Tables 324 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 325 | th, 326 | td { 327 | padding: 12px 15px; 328 | text-align: left; 329 | border-bottom: 1px solid #E1E1E1; } 330 | th:first-child, 331 | td:first-child { 332 | padding-left: 0; } 333 | th:last-child, 334 | td:last-child { 335 | padding-right: 0; } 336 | 337 | 338 | /* Spacing 339 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 340 | button, 341 | .button { 342 | margin-bottom: 1rem; } 343 | input, 344 | textarea, 345 | select, 346 | fieldset { 347 | margin-bottom: 1.5rem; } 348 | pre, 349 | blockquote, 350 | dl, 351 | figure, 352 | table, 353 | p, 354 | ul, 355 | ol, 356 | form { 357 | margin-bottom: 2.5rem; } 358 | 359 | 360 | /* Utilities 361 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 362 | .u-full-width { 363 | width: 100%; 364 | box-sizing: border-box; } 365 | .u-max-full-width { 366 | max-width: 100%; 367 | box-sizing: border-box; } 368 | .u-pull-right { 369 | float: right; } 370 | .u-pull-left { 371 | float: left; } 372 | 373 | 374 | /* Misc 375 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 376 | hr { 377 | margin-top: 3rem; 378 | margin-bottom: 3.5rem; 379 | border-width: 0; 380 | border-top: 1px solid #E1E1E1; } 381 | 382 | 383 | /* Clearing 384 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 385 | 386 | /* Self Clearing Goodness */ 387 | .container:after, 388 | .row:after, 389 | .u-cf { 390 | content: ""; 391 | display: table; 392 | clear: both; } 393 | 394 | 395 | /* Media Queries 396 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 397 | /* 398 | Note: The best way to structure the use of media queries is to create the queries 399 | near the relevant code. For example, if you wanted to change the styles for buttons 400 | on small devices, paste the mobile query code up in the buttons section and style it 401 | there. 402 | */ 403 | 404 | 405 | /* Larger than mobile */ 406 | @media (min-width: 400px) {} 407 | 408 | /* Larger than phablet (also point when grid becomes active) */ 409 | @media (min-width: 550px) {} 410 | 411 | /* Larger than tablet */ 412 | @media (min-width: 750px) {} 413 | 414 | /* Larger than desktop */ 415 | @media (min-width: 1000px) {} 416 | 417 | /* Larger than Desktop HD */ 418 | @media (min-width: 1200px) {} 419 | -------------------------------------------------------------------------------- /docs/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/favicon.png -------------------------------------------------------------------------------- /docs/static/images/512logotipo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/images/512logotipo.png -------------------------------------------------------------------------------- /dump_hparams_to_json.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Dump hyper parameters to json file. 4 | 5 | usage: dump_hparams_to_json.py [options] 6 | 7 | options: 8 | -h, --help Show help message. 
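example (illustrative; the output filename is arbitrary): python dump_hparams_to_json.py hparams_dump.json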
9 | """ 10 | from docopt import docopt 11 | 12 | import sys 13 | import os 14 | from os.path import dirname, join, basename, splitext 15 | 16 | import audio 17 | 18 | # The deepvoice3 model 19 | from deepvoice3_pytorch import frontend 20 | from hparams import hparams 21 | import json 22 | 23 | if __name__ == "__main__": 24 | args = docopt(__doc__) 25 | output_json_path = args[""] 26 | 27 | j = hparams.values() 28 | 29 | # for compat legacy 30 | for k in ["preset", "presets"]: 31 | if k in j: 32 | del j[k] 33 | 34 | with open(output_json_path, "w") as f: 35 | json.dump(j, f, indent=2) 36 | sys.exit(0) 37 | -------------------------------------------------------------------------------- /gentle_web_align.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 21 09:06:37 2018 4 | Phoneme alignment and conversion in HTK-style label file using Web-served Gentle 5 | This works on any type of english dataset. 6 | Unlike prepare_htk_alignments_vctk.py, this is Python3 and Windows(with Docker) compatible. 7 | Preliminary results show that gentle has better performance with noisy dataset 8 | (e.g. movie extracted audioclips) 9 | *This work was derived from vctk_preprocess/prepare_htk_alignments_vctk.py 10 | @author: engiecat(github) 11 | 12 | usage: 13 | gentle_web_align.py (-w wav_pattern) (-t text_pattern) [options] 14 | gentle_web_align.py (--nested-directories=) [options] 15 | 16 | options: 17 | -w --wav_pattern= Pattern of wav files to be aligned 18 | -t --txt_pattern= Pattern of txt transcript files to be aligned (same name required) 19 | --nested-directories= Process every wav/txt file in the subfolders of the given folder 20 | --server_addr= Server address that serves gentle. [default: localhost] 21 | --port= Server port that serves gentle. 
[default: 8567] 22 | --max_unalign= Maximum threshold for unalignment occurence (0.0 ~ 1.0) [default: 0.3] 23 | --skip-already-done Skips if there are preexisting .lab file 24 | -h --help show this help message and exit 25 | """ 26 | 27 | from docopt import docopt 28 | from glob import glob 29 | from tqdm import tqdm 30 | import os.path 31 | import requests 32 | import numpy as np 33 | 34 | def write_hts_label(labels, lab_path): 35 | lab = "" 36 | for s, e, l in labels: 37 | s, e = float(s) * 1e7, float(e) * 1e7 38 | s, e = int(s), int(e) 39 | lab += "{} {} {}\n".format(s, e, l) 40 | print(lab) 41 | with open(lab_path, "w", encoding='utf-8') as f: 42 | f.write(lab) 43 | 44 | 45 | def json2hts(data): 46 | emit_bos = False 47 | emit_eos = False 48 | 49 | phone_start = 0 50 | phone_end = None 51 | labels = [] 52 | failure_count = 0 53 | 54 | for word in data["words"]: 55 | case = word["case"] 56 | if case != "success": 57 | failure_count += 1 # instead of failing everything, 58 | #raise RuntimeError("Alignment failed") 59 | continue 60 | start = float(word["start"]) 61 | word_end = float(word["end"]) 62 | 63 | if not emit_bos: 64 | labels.append((phone_start, start, "silB")) 65 | emit_bos = True 66 | 67 | phone_start = start 68 | phone_end = None 69 | for phone in word["phones"]: 70 | ph = str(phone["phone"][:-2]) 71 | duration = float(phone["duration"]) 72 | phone_end = phone_start + duration 73 | labels.append((phone_start, phone_end, ph)) 74 | phone_start += duration 75 | assert np.allclose(phone_end, word_end) 76 | if not emit_eos: 77 | labels.append((phone_start, phone_end, "silE")) 78 | emit_eos = True 79 | unalign_ratio = float(failure_count) / len(data['words']) 80 | return unalign_ratio, labels 81 | 82 | 83 | def gentle_request(wav_path,txt_path, server_addr, port, debug=False): 84 | print('\n') 85 | response = None 86 | wav_name = os.path.basename(wav_path) 87 | txt_name = os.path.basename(txt_path) 88 | if os.path.splitext(wav_name)[0] != os.path.splitext(txt_name)[0]: 89 | print(' [!] wav name and transcript name does not match - exiting...') 90 | return response 91 | with open(txt_path, 'r', encoding='utf-8-sig') as txt_file: 92 | print('Transcript - '+''.join(txt_file.readlines())) 93 | with open(wav_path,'rb') as wav_file, open(txt_path, 'rb') as txt_file: 94 | params = (('async','false'),) 95 | files={'audio':(wav_name,wav_file), 96 | 'transcript':(txt_name,txt_file), 97 | } 98 | server_path = 'http://'+server_addr+':'+str(port)+'/transcriptions' 99 | response = requests.post(server_path, params=params,files=files) 100 | if response.status_code != 200: 101 | print(' [!] External server({}) returned bad response({})'.format(server_path, response.status_code)) 102 | if debug: 103 | print('Response') 104 | print(response.json()) 105 | return response 106 | 107 | if __name__ == '__main__': 108 | arguments = docopt(__doc__) 109 | server_addr = arguments['--server_addr'] 110 | port = int(arguments['--port']) 111 | max_unalign = float(arguments['--max_unalign']) 112 | if arguments['--nested-directories'] is None: 113 | wav_paths = sorted(glob(arguments['--wav_pattern'])) 114 | txt_paths = sorted(glob(arguments['--txt_pattern'])) 115 | else: 116 | # if this is multi-foldered environment 117 | # (e.g. 
DATASET/speaker1/blahblah.wav) 118 | wav_paths=[] 119 | txt_paths=[] 120 | topdir = arguments['--nested-directories'] 121 | subdirs = [f for f in os.listdir(topdir) if os.path.isdir(os.path.join(topdir, f))] 122 | for subdir in subdirs: 123 | wav_pattern_subdir = os.path.join(topdir, subdir, '*.wav') 124 | txt_pattern_subdir = os.path.join(topdir, subdir, '*.txt') 125 | wav_paths.extend(sorted(glob(wav_pattern_subdir))) 126 | txt_paths.extend(sorted(glob(txt_pattern_subdir))) 127 | 128 | t = tqdm(range(len(wav_paths))) 129 | for idx in t: 130 | try: 131 | t.set_description("Align via Gentle") 132 | wav_path = wav_paths[idx] 133 | txt_path = txt_paths[idx] 134 | lab_path = os.path.splitext(wav_path)[0]+'.lab' 135 | if os.path.exists(lab_path) and arguments['--skip-already-done']: 136 | print('[!] skipping because of pre-existing .lab file - {}'.format(lab_path)) 137 | continue 138 | res=gentle_request(wav_path,txt_path, server_addr, port) 139 | unalign_ratio, lab = json2hts(res.json()) 140 | print('[*] Unaligned Ratio - {}'.format(unalign_ratio)) 141 | if unalign_ratio > max_unalign: 142 | print('[!] skipping this due to bad alignment') 143 | continue 144 | write_hts_label(lab, lab_path) 145 | except: 146 | # if something goes wrong, skip this file 147 | import traceback 148 | tb = traceback.format_exc() 149 | print('[!] ERROR while processing {}'.format(wav_paths[idx])) 150 | print('[!] StackTrace - ') 151 | print(tb) 152 | 153 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | from deepvoice3_pytorch.tfcompat.hparam import HParams 2 | 3 | # NOTE: If you want full control over the model architecture, please take a look 4 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 5 | 6 | # Default hyperparameters: 7 | hparams = HParams( 8 | name="deepvoice3", 9 | 10 | # Text: 11 | # [en, jp] 12 | frontend='en', 13 | 14 | # Replace words with their pronunciations with a fixed probability. 15 | # e.g., 'hello' to 'HH AH0 L OW1' 16 | # [en, jp] 17 | # en: Word -> pronunciation using CMUDict 18 | # jp: Word -> pronunciation using MeCab 19 | # [0 ~ 1.0]: 0 means no replacement happens. 20 | replace_pronunciation_prob=0.5, 21 | 22 | # Convenient model builder 23 | # [deepvoice3, deepvoice3_multispeaker, nyanko] 24 | # Definitions can be found at deepvoice3_pytorch/builder.py 25 | # deepvoice3: DeepVoice3 https://arxiv.org/abs/1710.07654 26 | # deepvoice3_multispeaker: Multi-speaker version of DeepVoice3 27 | # nyanko: https://arxiv.org/abs/1710.08969 28 | builder="deepvoice3", 29 | 30 | # Must be configured depending on the dataset and model you use 31 | n_speakers=1, 32 | speaker_embed_dim=16, 33 | 34 | # Audio: 35 | num_mels=80, 36 | fmin=125, 37 | fmax=7600, 38 | fft_size=1024, 39 | hop_size=256, 40 | sample_rate=22050, 41 | preemphasis=0.97, 42 | min_level_db=-100, 43 | ref_level_db=20, 44 | # Whether to rescale the waveform or not. 45 | # Let x be an input waveform; the rescaled waveform y is given by: 46 | # y = x / np.abs(x).max() * rescaling_max 47 | rescaling=False, 48 | rescaling_max=0.999, 49 | # mel-spectrogram is normalized to [0, 1] for each utterance and clipping may 50 | # happen depending on min_level_db and ref_level_db, causing clipping noise. 51 | # If False, an assertion is added to ensure no clipping happens.
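# A rough sketch of the normalization implied above (this assumes the usual
# audio.py convention in this family of code and is not copied verbatim from it):
#   S_db = 20 * log10(max(|STFT(y)|, 1e-5)) - ref_level_db
#   S_norm = clip((S_db - min_level_db) / -min_level_db, 0, 1)
# so frames below min_level_db or above ref_level_db are the ones that get clipped.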
52 | allow_clipping_in_normalization=True, 53 | 54 | # Model: 55 | downsample_step=4, # must be 4 when builder="nyanko" 56 | outputs_per_step=1, # must be 1 when builder="nyanko" 57 | embedding_weight_std=0.1, 58 | speaker_embedding_weight_std=0.01, 59 | padding_idx=0, 60 | # Maximum length of input text 61 | # try setting a larger value if you want to feed very long text input 62 | max_positions=512, 63 | dropout=1 - 0.95, 64 | kernel_size=3, 65 | text_embed_dim=128, 66 | encoder_channels=256, 67 | decoder_channels=256, 68 | # Note: large converter channels require significant computational cost 69 | converter_channels=256, 70 | query_position_rate=1.0, 71 | # can be computed by `compute_timestamp_ratio.py`. 72 | key_position_rate=1.385, # 2.37 for jsut 73 | key_projection=False, 74 | value_projection=False, 75 | use_memory_mask=True, 76 | trainable_positional_encodings=False, 77 | freeze_embedding=False, 78 | # If True, use decoder's internal representation for postnet inputs, 79 | # otherwise use mel-spectrogram. 80 | use_decoder_state_for_postnet_input=True, 81 | 82 | # Data loader 83 | pin_memory=True, 84 | num_workers=2, # Set it to 1 when on Windows (MemoryError, THAllocator.c 0x5) 85 | 86 | # Loss 87 | masked_loss_weight=0.5, # (1-w)*loss + w * masked_loss 88 | priority_freq=3000, # heuristic: prioritize [0 ~ priority_freq] for linear loss 89 | priority_freq_weight=0.0, # (1-w)*linear_loss + w*priority_linear_loss 90 | # https://arxiv.org/pdf/1710.08969.pdf 91 | # Adding the divergence to the loss stabilizes training, especially for 92 | # very deep (> 10 layers) networks. 93 | # Binary div loss seems to have approx. 10x the scale of the L1 loss, so I choose 0.1. 94 | binary_divergence_weight=0.1, # set 0 to disable 95 | use_guided_attention=True, 96 | guided_attention_sigma=0.2, 97 | 98 | # Training: 99 | batch_size=16, 100 | adam_beta1=0.5, 101 | adam_beta2=0.9, 102 | adam_eps=1e-6, 103 | amsgrad=False, 104 | initial_learning_rate=5e-4, # 0.001, 105 | lr_schedule="noam_learning_rate_decay", 106 | lr_schedule_kwargs={}, 107 | nepochs=2000, 108 | weight_decay=0.0, 109 | clip_thresh=0.1, 110 | 111 | # Save 112 | checkpoint_interval=10000, 113 | eval_interval=10000, 114 | save_optimizer_state=True, 115 | 116 | # Eval: 117 | # this can be a list for multiple layers of attention 118 | # e.g., [True, False, False, False, True] 119 | force_monotonic_attention=True, 120 | # Attention constraint for incremental decoding 121 | window_ahead=3, 122 | # 0 tends to prevent word repetition, but sometimes causes skipped words 123 | window_backward=1, 124 | power=1.4, # Power to raise magnitudes to, prior to phase retrieval 125 | 126 | # GC: 127 | # Forced garbage collection probability 128 | # Use only when MemoryError continues on Windows (Disabled by default) 129 | #gc_probability = 0.001, 130 | 131 | # json_meta mode only 132 | # 0: "use all", 133 | # 1: "ignore only unmatched_alignment", 134 | # 2: "fully ignore recognition", 135 | ignore_recognition_level=2, 136 | # when dealing with a non-dedicated speech dataset (e.g. movie excerpts), setting min_text above 15 is desirable. Can be adjusted per dataset.
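# e.g. with min_text=20, utterances whose transcript is shorter than 20 characters
# are skipped at preprocessing time (see the len(text) checks in ljspeech.py and json_meta.py).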
137 | min_text=20, 138 | # if true, data without phoneme alignment file(.lab) will be ignored 139 | process_only_htk_aligned=False, 140 | ) 141 | 142 | 143 | def hparams_debug_string(): 144 | values = hparams.values() 145 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 146 | return 'Hyperparameters:\n' + '\n'.join(hp) 147 | -------------------------------------------------------------------------------- /json_meta.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Started in 1945h, Mar 10, 2018 3 | First done in 2103h, Mar 11, 2018 4 | Test done in 2324h, Mar 11, 2018 5 | Modified for HTK labeling in 1426h, Apr 21, 2018 6 | by engiecat(github) 7 | 8 | This makes r9y9/deepvoice3_pytorch compatible with json format of carpedm20/multi-speaker-tacotron-tensorflow and keithito/tacotron. 9 | The json file is given per speaker, generated in the format of 10 | (if completely aligned) 11 | (path-to-the-audio):aligned text 12 | 13 | (if partially aligned) 14 | (path-to-the-audio):[candidate sentence - not aligned,recognized words] 15 | 16 | (if non-aligned) 17 | (path-to-the-audio):[recognized words] 18 | is given per speaker. 19 | 20 | (e.g. python preprocess.py json_meta "./datasets/LJSpeech_1_0/alignment.json,./datasets/GoTBookRev/alignment.json" "./datasets/LJ+GoTBookRev" --preset=./presets/deepvoice3_vctk.json ) 21 | 22 | usage: 23 | python preprocess.py [option] 24 | 25 | 26 | options: 27 | --preset Path of preset parameters (json). 28 | -h --help show this help message and exit 29 | 30 | 31 | ''' 32 | 33 | from concurrent.futures import ProcessPoolExecutor 34 | from functools import partial 35 | import numpy as np 36 | import os 37 | import audio 38 | from nnmnkwii.io import hts 39 | from hparams import hparams 40 | from os.path import exists 41 | import librosa 42 | import json 43 | 44 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 45 | executor = ProcessPoolExecutor(max_workers=num_workers) 46 | futures = [] 47 | 48 | json_paths = in_dir.split(',') 49 | json_paths = [json_path.replace("'", "").replace('"',"") for json_path in json_paths] 50 | num_speakers = len(json_paths) 51 | is_aligned = {} 52 | 53 | speaker_id=0 54 | for json_path in json_paths: 55 | # Loads json metadata info 56 | if json_path.endswith("json"): 57 | with open(json_path, encoding='utf8') as f: 58 | content = f.read() 59 | info = json.loads(content) 60 | elif json_path.endswith("csv"): 61 | with open(json_path) as f: 62 | info = {} 63 | for line in f: 64 | path, text = line.strip().split('|') 65 | info[path] = text 66 | else: 67 | raise Exception(" [!] Unknown metadata format: {}".format(json_path)) 68 | 69 | print(" [*] Loaded - {}".format(json_path)) 70 | # check audio file existence 71 | base_dir = os.path.dirname(json_path) 72 | new_info = {} 73 | for path in info.keys(): 74 | if not os.path.exists(path): 75 | new_path = os.path.join(base_dir, path) 76 | if not os.path.exists(new_path): 77 | print(" [!] 
Audio not found: {}".format([path, new_path])) 78 | continue 79 | else: 80 | new_path = path 81 | 82 | new_info[new_path] = info[path] 83 | 84 | info = new_info 85 | 86 | # ignore_recognition_level check 87 | for path in info.keys(): 88 | is_aligned[path] = True 89 | if isinstance(info[path], list): 90 | if hparams.ignore_recognition_level == 1 and len(info[path]) == 1 or \ 91 | hparams.ignore_recognition_level == 2: 92 | # flag the path to be 'non-aligned' text 93 | is_aligned[path] = False 94 | info[path] = info[path][0] 95 | 96 | # Reserve for future processing 97 | queue_count = 0 98 | for audio_path, text in info.items(): 99 | if isinstance(text, list): 100 | if hparams.ignore_recognition_level == 0: 101 | text = text[-1] 102 | else: 103 | text = text[0] 104 | if hparams.ignore_recognition_level > 0 and not is_aligned[audio_path]: 105 | continue 106 | if hparams.min_text > len(text): 107 | continue 108 | if num_speakers == 1: 109 | # Single-speaker 110 | futures.append(executor.submit( 111 | partial(_process_utterance_single, out_dir, text, audio_path))) 112 | else: 113 | # Multi-speaker 114 | futures.append(executor.submit( 115 | partial(_process_utterance, out_dir, text, audio_path, speaker_id))) 116 | queue_count += 1 117 | print(" [*] Appended {} entries in the queue".format(queue_count)) 118 | 119 | # increase speaker_id 120 | speaker_id += 1 121 | 122 | # Show ignore_recognition_level description 123 | ignore_description = { 124 | 0: "use all", 125 | 1: "ignore only unmatched_alignment", 126 | 2: "fully ignore recognition", 127 | } 128 | print(" [!] Skip recognition level: {} ({})". \ 129 | format(hparams.ignore_recognition_level, 130 | ignore_description[hparams.ignore_recognition_level])) 131 | 132 | if num_speakers == 1: 133 | print(" [!] Single-speaker mode activated!") 134 | else: 135 | print(" [!] Multi-speaker({}) mode activated!".format(num_speakers)) 136 | 137 | # Now, Do the job! 
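# Each future resolves to the tuple that is later written to train.txt:
# (spectrogram_filename, mel_filename, n_frames, text) in single-speaker mode,
# with a trailing speaker_id appended in multi-speaker mode (see _process_utterance below).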
138 | results = [future.result() for future in tqdm(futures)] 139 | # Remove entries with None (That has been filtered due to bad htk alginment (if process_only_htk_aligned is enabled in hparams) 140 | results = [result for result in results if result != None] 141 | return results 142 | 143 | 144 | def start_at(labels): 145 | has_silence = labels[0][-1] == "pau" 146 | if not has_silence: 147 | return labels[0][0] 148 | for i in range(1, len(labels)): 149 | if labels[i][-1] != "pau": 150 | return labels[i][0] 151 | assert False 152 | 153 | 154 | def end_at(labels): 155 | has_silence = labels[-1][-1] == "pau" 156 | if not has_silence: 157 | return labels[-1][1] 158 | for i in range(len(labels) - 2, 0, -1): 159 | if labels[i][-1] != "pau": 160 | return labels[i][1] 161 | assert False 162 | 163 | 164 | def _process_utterance(out_dir, text, wav_path, speaker_id=None): 165 | 166 | # check whether singlespeaker_mode 167 | if speaker_id is None: 168 | return _process_utterance_single(out_dir,text,wav_path) 169 | # modified version of VCTK _process_utterance 170 | sr = hparams.sample_rate 171 | 172 | # Load the audio to a numpy array: 173 | wav = audio.load_wav(wav_path) 174 | 175 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") 176 | if not exists(lab_path): 177 | lab_path = os.path.splitext(wav_path)[0]+'.lab' 178 | 179 | # Trim silence from hts labels if available 180 | if exists(lab_path): 181 | labels = hts.load(lab_path) 182 | b = int(start_at(labels) * 1e-7 * sr) 183 | e = int(end_at(labels) * 1e-7 * sr) 184 | wav = wav[b:e] 185 | wav, _ = librosa.effects.trim(wav, top_db=25) 186 | else: 187 | if hparams.process_only_htk_aligned: 188 | return None 189 | wav, _ = librosa.effects.trim(wav, top_db=15) 190 | 191 | if hparams.rescaling: 192 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 193 | 194 | # Compute the linear-scale spectrogram from the wav: 195 | spectrogram = audio.spectrogram(wav).astype(np.float32) 196 | n_frames = spectrogram.shape[1] 197 | 198 | # Compute a mel-scale spectrogram from the wav: 199 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 200 | 201 | # Write the spectrograms to disk: 202 | # Get filename from wav_path 203 | wav_name = os.path.basename(wav_path) 204 | wav_name = os.path.splitext(wav_name)[0] 205 | 206 | # case if wave files across different speakers have the same naming format. 207 | # e.g. 
Recording0.wav 208 | spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name) 209 | mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name) 210 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 211 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 212 | # Return a tuple describing this training example: 213 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id) 214 | 215 | def _process_utterance_single(out_dir, text, wav_path): 216 | # modified version of LJSpeech _process_utterance 217 | 218 | # Load the audio to a numpy array: 219 | wav = audio.load_wav(wav_path) 220 | sr = hparams.sample_rate 221 | # Added from the multispeaker version 222 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") 223 | if not exists(lab_path): 224 | lab_path = os.path.splitext(wav_path)[0]+'.lab' 225 | 226 | # Trim silence from hts labels if available 227 | if exists(lab_path): 228 | labels = hts.load(lab_path) 229 | b = int(start_at(labels) * 1e-7 * sr) 230 | e = int(end_at(labels) * 1e-7 * sr) 231 | wav = wav[b:e] 232 | wav, _ = librosa.effects.trim(wav, top_db=25) 233 | else: 234 | if hparams.process_only_htk_aligned: 235 | return None 236 | wav, _ = librosa.effects.trim(wav, top_db=15) 237 | # End added from the multispeaker version 238 | 239 | if hparams.rescaling: 240 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 241 | 242 | # Compute the linear-scale spectrogram from the wav: 243 | spectrogram = audio.spectrogram(wav).astype(np.float32) 244 | n_frames = spectrogram.shape[1] 245 | 246 | # Compute a mel-scale spectrogram from the wav: 247 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 248 | 249 | # Write the spectrograms to disk: 250 | # Get filename from wav_path 251 | wav_name = os.path.basename(wav_path) 252 | wav_name = os.path.splitext(wav_name)[0] 253 | spectrogram_filename = 'spec-{}.npy'.format(wav_name) 254 | mel_filename = 'mel-{}.npy'.format(wav_name) 255 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 256 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 257 | 258 | # Return a tuple describing this training example: 259 | return (spectrogram_filename, mel_filename, n_frames, text) 260 | 261 | -------------------------------------------------------------------------------- /jsut.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import jsut 7 | from nnmnkwii.io import hts 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | 13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 14 | executor = ProcessPoolExecutor(max_workers=num_workers) 15 | futures = [] 16 | 17 | transcriptions = jsut.TranscriptionDataSource( 18 | in_dir, subsets=jsut.available_subsets).collect_files() 19 | wav_paths = jsut.WavFileDataSource( 20 | in_dir, subsets=jsut.available_subsets).collect_files() 21 | 22 | for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)): 23 | futures.append(executor.submit( 24 | partial(_process_utterance, out_dir, index + 1, wav_path, text))) 25 | return [future.result() for future in tqdm(futures)] 26 | 27 | 28 | def _process_utterance(out_dir, index, wav_path, text): 29 | sr = 
hparams.sample_rate 30 | 31 | # Load the audio to a numpy array: 32 | wav = audio.load_wav(wav_path) 33 | 34 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 35 | 36 | # Trim silence from hts labels if available 37 | if exists(lab_path): 38 | labels = hts.load(lab_path) 39 | assert labels[0][-1] == "silB" 40 | assert labels[-1][-1] == "silE" 41 | b = int(labels[0][1] * 1e-7 * sr) 42 | e = int(labels[-1][0] * 1e-7 * sr) 43 | wav = wav[b:e] 44 | else: 45 | wav, _ = librosa.effects.trim(wav, top_db=30) 46 | 47 | if hparams.rescaling: 48 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 49 | 50 | # Compute the linear-scale spectrogram from the wav: 51 | spectrogram = audio.spectrogram(wav).astype(np.float32) 52 | n_frames = spectrogram.shape[1] 53 | 54 | # Compute a mel-scale spectrogram from the wav: 55 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 56 | 57 | # Write the spectrograms to disk: 58 | spectrogram_filename = 'jsut-spec-%05d.npy' % index 59 | mel_filename = 'jsut-mel-%05d.npy' % index 60 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 61 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 62 | 63 | # Return a tuple describing this training example: 64 | return (spectrogram_filename, mel_filename, n_frames, text) 65 | -------------------------------------------------------------------------------- /ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from hparams import hparams 7 | 8 | 9 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 10 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 11 | 12 | Args: 13 | in_dir: The directory where you have downloaded the LJ Speech dataset 14 | out_dir: The directory to write the output into 15 | num_workers: Optional number of worker processes to parallelize across 16 | tqdm: You can optionally pass tqdm to get a nice progress bar 17 | 18 | Returns: 19 | A list of tuples describing the training examples. This should be written to train.txt 20 | ''' 21 | 22 | # We use ProcessPoolExecutor to parallize across processes. This is just an optimization and you 23 | # can omit it and just call _process_utterance on each input if you want. 24 | executor = ProcessPoolExecutor(max_workers=num_workers) 25 | futures = [] 26 | index = 1 27 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 28 | for line in f: 29 | parts = line.strip().split('|') 30 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 31 | text = parts[2] 32 | if len(text) < hparams.min_text: 33 | continue 34 | futures.append(executor.submit( 35 | partial(_process_utterance, out_dir, index, wav_path, text))) 36 | index += 1 37 | return [future.result() for future in tqdm(futures)] 38 | 39 | 40 | def _process_utterance(out_dir, index, wav_path, text): 41 | '''Preprocesses a single utterance audio/text pair. 42 | 43 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 44 | to the train.txt file. 45 | 46 | Args: 47 | out_dir: The directory to write the spectrograms into 48 | index: The numeric index to use in the spectrogram filenames. 
49 | wav_path: Path to the audio file containing the speech input 50 | text: The text spoken in the input audio file 51 | 52 | Returns: 53 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 54 | ''' 55 | 56 | # Load the audio to a numpy array: 57 | wav = audio.load_wav(wav_path) 58 | 59 | if hparams.rescaling: 60 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 61 | 62 | # Compute the linear-scale spectrogram from the wav: 63 | spectrogram = audio.spectrogram(wav).astype(np.float32) 64 | n_frames = spectrogram.shape[1] 65 | 66 | # Compute a mel-scale spectrogram from the wav: 67 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 68 | 69 | # Write the spectrograms to disk: 70 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index 71 | mel_filename = 'ljspeech-mel-%05d.npy' % index 72 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 73 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 74 | 75 | # Return a tuple describing this training example: 76 | return (spectrogram_filename, mel_filename, n_frames, text) 77 | -------------------------------------------------------------------------------- /lrschedule.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329 5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 6 | # Noam scheme from tensor2tensor: 7 | warmup_steps = float(warmup_steps) 8 | step = global_step + 1. 9 | lr = init_lr * warmup_steps**0.5 * np.minimum( 10 | step * warmup_steps**-1.5, step**-0.5) 11 | return lr 12 | 13 | 14 | def step_learning_rate_decay(init_lr, global_step, 15 | anneal_rate=0.98, 16 | anneal_interval=30000): 17 | return init_lr * anneal_rate ** (global_step // anneal_interval) 18 | 19 | 20 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 21 | """Cyclic cosine annealing 22 | 23 | https://arxiv.org/pdf/1704.00109.pdf 24 | 25 | Args: 26 | init_lr (float): Initial learning rate 27 | global_step (int): Current iteration number 28 | T (int): Total iteration number (i,e. nepoch) 29 | M (int): Number of ensembles we want 30 | 31 | Returns: 32 | float: Annealed learning rate 33 | """ 34 | TdivM = T // M 35 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) 36 | -------------------------------------------------------------------------------- /nikl_m.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | import re 7 | 8 | from hparams import hparams 9 | 10 | 11 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 12 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 13 | 14 | Args: 15 | in_dir: The directory where you have downloaded the LJ Speech dataset 16 | out_dir: The directory to write the output into 17 | num_workers: Optional number of worker processes to parallelize across 18 | tqdm: You can optionally pass tqdm to get a nice progress bar 19 | 20 | Returns: 21 | A list of tuples describing the training examples. This should be written to train.txt 22 | ''' 23 | 24 | # We use ProcessPoolExecutor to parallize across processes. 
This is just an optimization and you 25 | # can omit it and just call _process_utterance on each input if you want. 26 | 27 | # You will need to modify and format NIKL transcrption file will UTF-8 format 28 | # please check https://github.com/homink/deepspeech.pytorch.ko/blob/master/data/local/clean_corpus.sh 29 | 30 | executor = ProcessPoolExecutor(max_workers=num_workers) 31 | futures = [] 32 | 33 | spk_id = {} 34 | with open(in_dir + '/speaker.mid', encoding='utf-8') as f: 35 | for i, line in enumerate(f): 36 | spk_id[line.rstrip()] = i 37 | 38 | index = 1 39 | with open(in_dir + '/metadata.txt', encoding='utf-8') as f: 40 | for line in f: 41 | parts = line.strip().split('|') 42 | wav_path = parts[0] 43 | text = parts[1] 44 | uid = re.search(r'([a-z][a-z][0-9][0-9]_t)', wav_path) 45 | uid = uid.group(1).replace('_t', '') 46 | futures.append(executor.submit( 47 | partial(_process_utterance, out_dir, index + 1, spk_id[uid], wav_path, text))) 48 | index += 1 49 | return [future.result() for future in tqdm(futures)] 50 | 51 | 52 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 53 | '''Preprocesses a single utterance audio/text pair. 54 | 55 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 56 | to the train.txt file. 57 | 58 | Args: 59 | out_dir: The directory to write the spectrograms into 60 | index: The numeric index to use in the spectrogram filenames. 61 | wav_path: Path to the audio file containing the speech input 62 | text: The text spoken in the input audio file 63 | 64 | Returns: 65 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 66 | ''' 67 | 68 | # Load the audio to a numpy array: 69 | wav = audio.load_wav(wav_path) 70 | 71 | if hparams.rescaling: 72 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 73 | 74 | # Compute the linear-scale spectrogram from the wav: 75 | spectrogram = audio.spectrogram(wav).astype(np.float32) 76 | n_frames = spectrogram.shape[1] 77 | 78 | # Compute a mel-scale spectrogram from the wav: 79 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 80 | 81 | # Write the spectrograms to disk: 82 | spectrogram_filename = 'nikl-multi-spec-%05d.npy' % index 83 | mel_filename = 'nikl-multi-mel-%05d.npy' % index 84 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 85 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 86 | 87 | # Return a tuple describing this training example: 88 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id) 89 | -------------------------------------------------------------------------------- /nikl_preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Preparation for Korean speech 2 | 3 | ## Corpus 4 | https://github.com/homink/speech.ko 5 | 6 | ## Command 7 | 8 | ### Multi-speaker 9 | ``` 10 | cd nikl_preprocess 11 | python prepare_metadata.py --corpus ${corpus location} --trans_file ${corpus location}/trans.txt --spk_id ${corpus location}/speaker.mid 12 | ``` 13 | ### Single-speaker 14 | ``` 15 | cd nikl_preprocess 16 | python prepare_metadata.py --corpus ${corpus location} --trans_file ${corpus location}/trans.txt --spk_id ${corpus location}/speaker.sid 17 | ``` 18 | Default single speaker id is fv01. You can edit it by speaker id in [here](https://github.com/homink/speech.ko). 
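Once `metadata.txt` and the speaker id files are in place, preprocessing can proceed with the top-level `preprocess.py` (output directories below are illustrative):

```
# Multi-speaker
python preprocess.py nikl_m ${corpus location} ./data/nikl_m --preset=presets/deepvoice3_niklm.json

# Single-speaker
python preprocess.py nikl_s ${corpus location} ./data/nikl_s --preset=presets/deepvoice3_nikls.json
```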
19 | -------------------------------------------------------------------------------- /nikl_preprocess/prepare_metafile.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import subprocess,os,re 3 | 4 | def pwrap(args, shell=False): 5 | p = subprocess.Popen(args, shell=shell, stdout=subprocess.PIPE, 6 | stdin=subprocess.PIPE, stderr=subprocess.PIPE, 7 | universal_newlines=True) 8 | return p 9 | 10 | def execute(cmd, shell=False): 11 | popen = pwrap(cmd, shell=shell) 12 | for stdout_line in iter(popen.stdout.readline, ""): 13 | yield stdout_line 14 | 15 | popen.stdout.close() 16 | return_code = popen.wait() 17 | if return_code: 18 | raise subprocess.CalledProcessError(return_code, cmd) 19 | 20 | def pe(cmd, shell=False): 21 | """ 22 | Print and execute command on system 23 | """ 24 | ret = [] 25 | for line in execute(cmd, shell=shell): 26 | ret.append(line) 27 | print(line, end="") 28 | return ret 29 | 30 | 31 | if __name__ == "__main__": 32 | import argparse 33 | parser = argparse.ArgumentParser(description="Produce metafile where wav file path and its transcription are aligned", 34 | epilog="Example usage: python preprea_metadata $HOME/copora/NIKL") 35 | parser.add_argument("--corpus_dir", "-c", 36 | help="filepath for the root directory of corpus", 37 | required=True) 38 | 39 | parser.add_argument("--trans_file", "-t", 40 | help="Extracted transcription file obatained from extract_trans.py", 41 | required=True) 42 | 43 | parser.add_argument("--spk_id", "-sid", 44 | help="Speaker ID for single speaker such as fv01", 45 | required=False) 46 | args = parser.parse_args() 47 | 48 | print("Prepare metadata file for all speakers") 49 | pe("find %s -name %s | grep -v 'Bad\|Non\|Invalid' > %s/wav.lst" % (args.corpus_dir,"*.wav",args.corpus_dir),shell=True) 50 | 51 | trans={} 52 | with open(args.trans_file,"r") as f: 53 | for line in f: 54 | line = line.rstrip() 55 | line_split = line.split(" ") 56 | trans[line_split[0]] = " ".join(line_split[1:]) 57 | 58 | with open(args.corpus_dir+"/wav.lst", "r") as f: 59 | wavfiles = f.readlines() 60 | 61 | pe("rm -f %s/metadata.txt" % (args.corpus_dir),shell=True) 62 | for w in wavfiles: 63 | w = w.rstrip() 64 | tid = re.search(r'(t[0-9][0-9]_s[0-9][0-9])',w) 65 | if tid: 66 | tid_found = tid.group(1) 67 | pe('echo %s"|"%s >> %s/metadata.txt' % (w,trans.get(tid_found),args.corpus_dir),shell=True) 68 | 69 | print("Metadata files is created in %s/metadata.txt" % (args.corpus_dir)) 70 | pe("ls -d -- %s/*/ | grep -v 'Bad\|Non\|Invalid' | rev | cut -d'/' -f2 | rev > %s/speaker.mid" % (args.corpus_dir,args.corpus_dir),shell=True) 71 | pe("head -n 1 %s/speaker.mid > %s/speaker.sid" % (args.corpus_dir,args.corpus_dir),shell=True) 72 | -------------------------------------------------------------------------------- /nikl_s.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | import re 7 | 8 | from hparams import hparams 9 | 10 | 11 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 12 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 
13 | 14 | Args: 15 | in_dir: The directory where you have downloaded the LJ Speech dataset 16 | out_dir: The directory to write the output into 17 | num_workers: Optional number of worker processes to parallelize across 18 | tqdm: You can optionally pass tqdm to get a nice progress bar 19 | 20 | Returns: 21 | A list of tuples describing the training examples. This should be written to train.txt 22 | ''' 23 | 24 | # We use ProcessPoolExecutor to parallize across processes. This is just an optimization and you 25 | # can omit it and just call _process_utterance on each input if you want. 26 | 27 | # You will need to modify and format NIKL transcrption file will UTF-8 format 28 | # please check https://github.com/homink/deepspeech.pytorch.ko/blob/master/data/local/clean_corpus.sh 29 | 30 | executor = ProcessPoolExecutor(max_workers=num_workers) 31 | futures = [] 32 | 33 | with open(in_dir + '/speaker.sid', encoding='utf-8') as f: 34 | spk_id = f.readline().rstrip() 35 | 36 | index = 1 37 | with open(in_dir + '/metadata.txt', encoding='utf-8') as f: 38 | for line in f: 39 | if spk_id in line: 40 | parts = line.strip().split('|') 41 | wav_path = parts[0] 42 | text = parts[1] 43 | futures.append(executor.submit( 44 | partial(_process_utterance, out_dir, index + 1, wav_path, text))) 45 | index += 1 46 | return [future.result() for future in tqdm(futures)] 47 | 48 | 49 | def _process_utterance(out_dir, index, wav_path, text): 50 | '''Preprocesses a single utterance audio/text pair. 51 | 52 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 53 | to the train.txt file. 54 | 55 | Args: 56 | out_dir: The directory to write the spectrograms into 57 | index: The numeric index to use in the spectrogram filenames. 58 | wav_path: Path to the audio file containing the speech input 59 | text: The text spoken in the input audio file 60 | 61 | Returns: 62 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 63 | ''' 64 | 65 | # Load the audio to a numpy array: 66 | wav = audio.load_wav(wav_path) 67 | 68 | if hparams.rescaling: 69 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 70 | 71 | # Compute the linear-scale spectrogram from the wav: 72 | spectrogram = audio.spectrogram(wav).astype(np.float32) 73 | n_frames = spectrogram.shape[1] 74 | 75 | # Compute a mel-scale spectrogram from the wav: 76 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 77 | 78 | # Write the spectrograms to disk: 79 | spectrogram_filename = 'nikl-single-spec-%05d.npy' % index 80 | mel_filename = 'nikl-single-mel-%05d.npy' % index 81 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 82 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 83 | 84 | # Return a tuple describing this training example: 85 | return (spectrogram_filename, mel_filename, n_frames, text) 86 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Preprocess dataset 4 | 5 | usage: preprocess.py [options] 6 | 7 | options: 8 | --num_workers= Num workers. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | -h, --help Show help message. 
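example (illustrative paths; the dataset name must be one of the modules asserted below):
    python preprocess.py ljspeech ~/data/LJSpeech-1.1 ./data/ljspeech --preset=presets/deepvoice3_ljspeech.json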
12 | """ 13 | from docopt import docopt 14 | import os 15 | from multiprocessing import cpu_count 16 | from tqdm import tqdm 17 | import importlib 18 | from hparams import hparams, hparams_debug_string 19 | 20 | 21 | def preprocess(mod, in_dir, out_root, num_workers): 22 | os.makedirs(out_dir, exist_ok=True) 23 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm) 24 | write_metadata(metadata, out_dir) 25 | 26 | 27 | def write_metadata(metadata, out_dir): 28 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 29 | for m in metadata: 30 | f.write('|'.join([str(x) for x in m]) + '\n') 31 | frames = sum([m[2] for m in metadata]) 32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000 33 | hours = frames * frame_shift_ms / (3600 * 1000) 34 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours)) 35 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 36 | print('Max output length: %d' % max(m[2] for m in metadata)) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = docopt(__doc__) 41 | name = args[""] 42 | in_dir = args[""] 43 | out_dir = args[""] 44 | num_workers = args["--num_workers"] 45 | num_workers = cpu_count() if num_workers is None else int(num_workers) 46 | preset = args["--preset"] 47 | 48 | # Load preset if specified 49 | if preset is not None: 50 | with open(preset) as f: 51 | hparams.parse_json(f.read()) 52 | # Override hyper parameters 53 | hparams.parse(args["--hparams"]) 54 | assert hparams.name == "deepvoice3" 55 | print(hparams_debug_string()) 56 | 57 | assert name in ["jsut", "ljspeech", "vctk", "nikl_m", "nikl_s", "json_meta"] 58 | mod = importlib.import_module(name) 59 | preprocess(mod, in_dir, out_dir, num_workers) 60 | -------------------------------------------------------------------------------- /presets/deepvoice3_ljspeech.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "en", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3", 6 | "n_speakers": 1, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.01, 24 | "padding_idx": 0, 25 | "max_positions": 512, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 1.0, 33 | "key_position_rate": 1.385, 34 | "key_projection": true, 35 | "value_projection": true, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.2, 48 | "batch_size": 16, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": 
{}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } -------------------------------------------------------------------------------- /presets/deepvoice3_niklm.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "ko", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3_multispeaker", 6 | "n_speakers": 118, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.05, 24 | "padding_idx": 0, 25 | "max_positions": 1200, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 2.0, 33 | "key_position_rate": 7.6, 34 | "key_projection": true, 35 | "value_projection": true, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.4, 48 | "batch_size": 8, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /presets/deepvoice3_nikls.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "ko", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3", 6 | "n_speakers": 1, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.05, 24 | "padding_idx": 0, 25 | "max_positions": 512, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 2.0, 33 | "key_position_rate": 7.6, 34 | "key_projection": true, 35 | "value_projection": true, 36 | 
"use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.4, 48 | "batch_size": 8, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /presets/deepvoice3_vctk.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "en", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3_multispeaker", 6 | "n_speakers": 108, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.05, 24 | "padding_idx": 0, 25 | "max_positions": 1024, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 2.0, 33 | "key_position_rate": 7.6, 34 | "key_projection": true, 35 | "value_projection": true, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.4, 48 | "batch_size": 16, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /presets/nyanko_ljspeech.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "en", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "nyanko", 6 | "n_speakers": 1, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | 
"ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.01, 23 | "speaker_embedding_weight_std": 0.01, 24 | "padding_idx": 0, 25 | "max_positions": 512, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 128, 29 | "encoder_channels": 256, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 1.0, 33 | "key_position_rate": 1.385, 34 | "key_projection": false, 35 | "value_projection": false, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.2, 48 | "batch_size": 16, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for Pypi release 4 | # 0. Make sure you are on git tag 5 | # 1. Run the script 6 | # 2. Upload sdist 7 | 8 | set -e 9 | 10 | script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd) 11 | cd $script_dir 12 | 13 | TAG=$(git describe --exact-match --tags HEAD) 14 | 15 | VERSION=${TAG/v/} 16 | 17 | DEEPVOICE3_PYTORCH_BUILD_VERSION=$VERSION python setup.py develop sdist 18 | echo "*** Ready to release! deepvoice3_pytorch $TAG ***" 19 | echo "Please make sure that release verion is correct." 
20 | cat deepvoice3_pytorch/version.py 21 | echo "Please run the following command manually:" 22 | echo twine upload dist/deepvoice3_pytorch-${VERSION}.tar.gz --repository-url https://upload.pypi.org/legacy/ 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import setuptools.command.develop 5 | import setuptools.command.build_py 6 | import os 7 | import subprocess 8 | from os.path import exists 9 | 10 | version = '0.1.1' 11 | 12 | # Adapted from https://github.com/pytorch/pytorch 13 | cwd = os.path.dirname(os.path.abspath(__file__)) 14 | if os.getenv('DEEPVOICE3_PYTORCH_BUILD_VERSION'): 15 | version = os.getenv('DEEPVOICE3_PYTORCH_BUILD_VERSION') 16 | else: 17 | try: 18 | sha = subprocess.check_output( 19 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 20 | version += '+' + sha[:7] 21 | except subprocess.CalledProcessError: 22 | pass 23 | except IOError: # FileNotFoundError for python 3 24 | pass 25 | 26 | 27 | class build_py(setuptools.command.build_py.build_py): 28 | 29 | def run(self): 30 | self.create_version_file() 31 | setuptools.command.build_py.build_py.run(self) 32 | 33 | @staticmethod 34 | def create_version_file(): 35 | global version, cwd 36 | print('-- Building version ' + version) 37 | version_path = os.path.join(cwd, 'deepvoice3_pytorch', 'version.py') 38 | with open(version_path, 'w') as f: 39 | f.write("__version__ = '{}'\n".format(version)) 40 | 41 | 42 | class develop(setuptools.command.develop.develop): 43 | 44 | def run(self): 45 | build_py.create_version_file() 46 | setuptools.command.develop.develop.run(self) 47 | 48 | 49 | def create_readme_rst(): 50 | global cwd 51 | try: 52 | subprocess.check_call( 53 | ["pandoc", "--from=markdown", "--to=rst", "--output=README.rst", 54 | "README.md"], cwd=cwd) 55 | print("Generated README.rst from README.md using pandoc.") 56 | except subprocess.CalledProcessError: 57 | pass 58 | except OSError: 59 | pass 60 | 61 | 62 | if not exists('README.rst'): 63 | create_readme_rst() 64 | 65 | if exists('README.rst'): 66 | README = open('README.rst', 'rb').read().decode("utf-8") 67 | else: 68 | README = '' 69 | 70 | setup(name='deepvoice3_pytorch', 71 | version=version, 72 | description='PyTorch implementation of convolutional networks-based text-to-speech synthesis models.', 73 | long_description=README, 74 | packages=find_packages(), 75 | cmdclass={ 76 | 'build_py': build_py, 77 | 'develop': develop, 78 | }, 79 | install_requires=[ 80 | "numpy", 81 | "scipy", 82 | "torch >= 1.0.0", 83 | "unidecode", 84 | "inflect", 85 | "librosa", 86 | "numba", 87 | "lws", 88 | "nltk", 89 | ], 90 | extras_require={ 91 | "bin": [ 92 | "docopt", 93 | "tqdm", 94 | "tensorboardX <= 1.2", 95 | "nnmnkwii >= 0.0.19", 96 | "requests", 97 | "matplotlib", 98 | ], 99 | "test": [ 100 | "nose", 101 | ], 102 | "jp": [ 103 | "jaconv", 104 | "mecab-python3", 105 | ], 106 | }) 107 | -------------------------------------------------------------------------------- /synthesis.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesize a waveform from a trained model. 4 | 5 | usage: synthesis.py [options] <checkpoint> <text_list_file> <dst_dir> 6 | 7 | options: 8 | --hparams=<params> Hyper parameters [default: ]. 9 | --preset=<json> Path of preset parameters (json). 10 | --checkpoint-seq2seq=<path> Load seq2seq model from checkpoint path.
11 | --checkpoint-postnet=<path> Load postnet model from checkpoint path. 12 | --file-name-suffix=<s> File name suffix [default: ]. 13 | --max-decoder-steps=<N> Max decoder steps [default: 500]. 14 | --replace_pronunciation_prob=<N> Probability of replacing a word with its pronunciation [default: 0.0]. 15 | --speaker_id=<id> Speaker ID (for multi-speaker model). 16 | --output-html Output html for blog post. 17 | -h, --help Show help message. 18 | """ 19 | from docopt import docopt 20 | 21 | import sys 22 | import os 23 | from os.path import dirname, join, basename, splitext 24 | 25 | import audio 26 | 27 | import torch 28 | import numpy as np 29 | import nltk 30 | 31 | # The deepvoice3 model 32 | from deepvoice3_pytorch import frontend 33 | from hparams import hparams, hparams_debug_string 34 | 35 | from tqdm import tqdm 36 | 37 | use_cuda = torch.cuda.is_available() 38 | device = torch.device("cuda" if use_cuda else "cpu") 39 | _frontend = None # to be set later 40 | 41 | 42 | def tts(model, text, p=0, speaker_id=None, fast=False): 43 | """Convert text to speech waveform given a deepvoice3 model. 44 | 45 | Args: 46 | text (str) : Input text to be synthesized 47 | p (float) : Probability of replacing a word with its pronunciation if p > 0. Default is 0. 48 | """ 49 | model = model.to(device) 50 | model.eval() 51 | if fast: 52 | model.make_generation_fast_() 53 | 54 | sequence = np.array(_frontend.text_to_sequence(text, p=p)) 55 | sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device) 56 | text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(device) 57 | speaker_ids = None if speaker_id is None else torch.LongTensor([speaker_id]).to(device) 58 | 59 | # Greedy decoding 60 | with torch.no_grad(): 61 | mel_outputs, linear_outputs, alignments, done = model( 62 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 63 | 64 | linear_output = linear_outputs[0].cpu().data.numpy() 65 | spectrogram = audio._denormalize(linear_output) 66 | alignment = alignments[0].cpu().data.numpy() 67 | mel = mel_outputs[0].cpu().data.numpy() 68 | mel = audio._denormalize(mel) 69 | 70 | # Predicted audio signal 71 | waveform = audio.inv_spectrogram(linear_output.T) 72 | 73 | return waveform, alignment, spectrogram, mel 74 | 75 | 76 | def _load(checkpoint_path): 77 | if use_cuda: 78 | checkpoint = torch.load(checkpoint_path) 79 | else: 80 | checkpoint = torch.load(checkpoint_path, 81 | map_location=lambda storage, loc: storage) 82 | return checkpoint 83 | 84 | 85 | if __name__ == "__main__": 86 | args = docopt(__doc__) 87 | print("Command line args:\n", args) 88 | checkpoint_path = args["<checkpoint>"] 89 | text_list_file_path = args["<text_list_file>"] 90 | dst_dir = args["<dst_dir>"] 91 | checkpoint_seq2seq_path = args["--checkpoint-seq2seq"] 92 | checkpoint_postnet_path = args["--checkpoint-postnet"] 93 | max_decoder_steps = int(args["--max-decoder-steps"]) 94 | file_name_suffix = args["--file-name-suffix"] 95 | replace_pronunciation_prob = float(args["--replace_pronunciation_prob"]) 96 | output_html = args["--output-html"] 97 | speaker_id = args["--speaker_id"] 98 | if speaker_id is not None: 99 | speaker_id = int(speaker_id) 100 | preset = args["--preset"] 101 | 102 | # Load preset if specified 103 | if preset is not None: 104 | with open(preset) as f: 105 | hparams.parse_json(f.read()) 106 | # Override hyper parameters 107 | hparams.parse(args["--hparams"]) 108 | assert hparams.name == "deepvoice3" 109 | 110 | _frontend = getattr(frontend, hparams.frontend) 111 | import train 112 | train._frontend = _frontend 113 | from train import plot_alignment, build_model 114 | 115 | # Model 116 | 
model = build_model() 117 | 118 | # Load checkpoints separately 119 | if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None: 120 | checkpoint = _load(checkpoint_seq2seq_path) 121 | model.seq2seq.load_state_dict(checkpoint["state_dict"]) 122 | checkpoint = _load(checkpoint_postnet_path) 123 | model.postnet.load_state_dict(checkpoint["state_dict"]) 124 | checkpoint_name = splitext(basename(checkpoint_seq2seq_path))[0] 125 | else: 126 | checkpoint = _load(checkpoint_path) 127 | model.load_state_dict(checkpoint["state_dict"]) 128 | checkpoint_name = splitext(basename(checkpoint_path))[0] 129 | 130 | model.seq2seq.decoder.max_decoder_steps = max_decoder_steps 131 | 132 | os.makedirs(dst_dir, exist_ok=True) 133 | with open(text_list_file_path, "rb") as f: 134 | lines = f.readlines() 135 | for idx, line in enumerate(lines): 136 | text = line.decode("utf-8")[:-1] 137 | words = nltk.word_tokenize(text) 138 | waveform, alignment, _, _ = tts( 139 | model, text, p=replace_pronunciation_prob, speaker_id=speaker_id, fast=True) 140 | dst_wav_path = join(dst_dir, "{}_{}{}.wav".format( 141 | idx, checkpoint_name, file_name_suffix)) 142 | dst_alignment_path = join( 143 | dst_dir, "{}_{}{}_alignment.png".format(idx, checkpoint_name, 144 | file_name_suffix)) 145 | plot_alignment(alignment.T, dst_alignment_path, 146 | info="{}, {}".format(hparams.builder, basename(checkpoint_path))) 147 | audio.save_wav(waveform, dst_wav_path) 148 | name = splitext(basename(text_list_file_path))[0] 149 | if output_html: 150 | print(""" 151 | {} 152 | 153 | ({} chars, {} words) 154 | 155 | 159 | 160 |
161 | """.format(text, len(text), len(words), 162 | hparams.builder, name, basename(dst_wav_path), 163 | hparams.builder, name, basename(dst_alignment_path))) 164 | else: 165 | print(idx, ": {}\n ({} chars, {} words)".format(text, len(text), len(words))) 166 | 167 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 168 | sys.exit(0) 169 | -------------------------------------------------------------------------------- /tests/data/ljspeech-mel-00001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/tests/data/ljspeech-mel-00001.npy -------------------------------------------------------------------------------- /tests/test_audio.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join 6 | sys.path.insert(0, join(dirname(__file__), "..")) 7 | 8 | import numpy as np 9 | from nose.plugins.attrib import attr 10 | 11 | import logging 12 | logging.getLogger('tensorflow').disabled = True 13 | 14 | 15 | @attr("local_only") 16 | def test_amp_to_db(): 17 | import audio 18 | x = np.random.rand(10) 19 | x_hat = audio._db_to_amp(audio._amp_to_db(x)) 20 | assert np.allclose(x, x_hat) 21 | -------------------------------------------------------------------------------- /tests/test_conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | from deepvoice3_pytorch.conv import Conv1d 8 | 9 | 10 | def test_conv1d_incremental(): 11 | def __test(kernel_size, dilation, T, B, C, causual=True): 12 | dilation = (dilation,) 13 | 14 | # dilation = (4,) 15 | # causual 16 | assert causual 17 | if causual: 18 | padding = (kernel_size - 1) * dilation[0] 19 | else: 20 | padding = (kernel_size - 1) // 2 * dilation[0] 21 | 22 | # weight: (Cout, Cin, K) 23 | conv = nn.Conv1d( 24 | C, C * 2, kernel_size=kernel_size, padding=padding, 25 | dilation=dilation).eval() 26 | conv.weight.data.fill_(1.0) 27 | conv.bias.data.zero_() 28 | 29 | # weight: (K, Cin, Cout) 30 | # weight (linearized): (Cout*K, Cin) 31 | conv_online = Conv1d( 32 | C, C * 2, kernel_size=kernel_size, padding=padding, 33 | dilation=dilation).eval() 34 | conv_online.weight.data.fill_(1.0) 35 | conv_online.bias.data.zero_() 36 | 37 | # (B, C, T) 38 | bct = torch.zeros(B, C, T) + torch.arange(0, T).float() 39 | output_conv = conv(bct) 40 | 41 | # Remove future time stamps 42 | output_conv = output_conv[:, :, :T] 43 | 44 | output_conv_online = [] 45 | 46 | # B, T, C 47 | btc = bct.transpose(1, 2).contiguous() 48 | for t in range(btc.size(1)): 49 | input = btc[:, t, :].contiguous().view(B, -1, C) 50 | output = conv_online.incremental_forward(input) 51 | output_conv_online += [output] 52 | 53 | output_conv_online = torch.stack(output_conv_online).squeeze(2) 54 | output_conv_online = output_conv_online.transpose(0, 1).transpose(1, 2) 55 | 56 | assert (output_conv == output_conv_online).all() 57 | 58 | for B in [1, 4]: 59 | for T in [5, 10]: 60 | for C in [1, 2, 4]: 61 | for kernel_size in [2, 3]: 62 | for dilation in [1, 2, 3, 4, 5, 9, 27]: 63 | yield __test, kernel_size, dilation, T, B, C 64 | 
-------------------------------------------------------------------------------- /tests/test_deepvoice3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join, exists 6 | 7 | from deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab 8 | 9 | import torch 10 | from torch import nn 11 | import numpy as np 12 | 13 | from nose.plugins.attrib import attr 14 | 15 | from deepvoice3_pytorch.builder import deepvoice3 16 | from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 17 | 18 | 19 | use_cuda = torch.cuda.is_available() and False 20 | torch.backends.cudnn.deterministic = True 21 | num_mels = 80 22 | num_freq = 513 23 | outputs_per_step = 4 24 | padding_idx = 0 25 | 26 | 27 | def _get_model(n_speakers=1, speaker_embed_dim=None, 28 | force_monotonic_attention=False, 29 | use_decoder_state_for_postnet_input=False, use_memory_mask=False): 30 | model = deepvoice3(n_vocab=n_vocab, 31 | embed_dim=32, 32 | mel_dim=num_mels, 33 | linear_dim=num_freq, 34 | r=outputs_per_step, 35 | padding_idx=padding_idx, 36 | n_speakers=n_speakers, 37 | speaker_embed_dim=speaker_embed_dim, 38 | dropout=1 - 0.95, 39 | kernel_size=5, 40 | encoder_channels=16, 41 | decoder_channels=32, 42 | converter_channels=32, 43 | force_monotonic_attention=force_monotonic_attention, 44 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 45 | use_memory_mask=use_memory_mask, 46 | ) 47 | return model 48 | 49 | 50 | def _pad(seq, max_len): 51 | return np.pad(seq, (0, max_len - len(seq)), 52 | mode='constant', constant_values=0) 53 | 54 | 55 | def _test_data(): 56 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 57 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 58 | input_lengths = np.array([len(s) for s in seqs]) 59 | max_len = np.max(input_lengths) 60 | seqs = np.array([_pad(s, max_len) for s in seqs]) 61 | 62 | # Test encoder 63 | x = torch.LongTensor(seqs) 64 | y = torch.rand(x.size(0), 12, 80) 65 | 66 | return x, y, input_lengths 67 | 68 | 69 | def _deepvoice3(n_vocab, embed_dim=256, mel_dim=80, 70 | linear_dim=4096, r=5, 71 | n_speakers=1, speaker_embed_dim=16, 72 | padding_idx=None, 73 | dropout=(1 - 0.95), dilation=1): 74 | 75 | from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 76 | h = 128 77 | encoder = Encoder( 78 | n_vocab, embed_dim, padding_idx=padding_idx, 79 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 80 | dropout=dropout, 81 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 82 | (h, 3, dilation), (h, 3, dilation)], 83 | ) 84 | 85 | h = 256 86 | decoder = Decoder( 87 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 88 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 89 | dropout=dropout, 90 | preattention=[(h, 3, 1)], 91 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 92 | (h, 3, dilation), (h, 3, dilation)], 93 | attention=[True, False, False, False, True], 94 | force_monotonic_attention=False) 95 | 96 | seq2seq = AttentionSeq2Seq(encoder, decoder) 97 | 98 | in_dim = mel_dim 99 | h = 256 100 | converter = Converter(n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 101 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 102 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 103 | (h, 3, dilation), (h, 3, dilation)]) 104 | 105 | 
model = MultiSpeakerTTSModel( 106 | seq2seq, converter, padding_idx=padding_idx, 107 | mel_dim=mel_dim, linear_dim=linear_dim, 108 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim) 109 | 110 | return model 111 | 112 | 113 | def test_single_speaker_deepvoice3(): 114 | x, y, lengths = _test_data() 115 | 116 | for v in [False, True]: 117 | model = _get_model(use_decoder_state_for_postnet_input=v) 118 | mel_outputs, linear_outputs, alignments, done = model(x, y, input_lengths=lengths) 119 | 120 | model = _get_model(use_memory_mask=True) 121 | mel_outputs, linear_outputs, alignments, done = model(x, y, input_lengths=lengths) 122 | 123 | 124 | def _pad_2d(x, max_len, b_pad=0): 125 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)], 126 | mode="constant", constant_values=0) 127 | return x 128 | 129 | 130 | def test_multi_speaker_deepvoice3(): 131 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 132 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 133 | input_lengths = np.array([len(s) for s in seqs]) 134 | max_len = np.max(input_lengths) 135 | seqs = np.array([_pad(s, max_len) for s in seqs]) 136 | 137 | # Test encoder 138 | x = torch.LongTensor(seqs) 139 | y = torch.rand(x.size(0), 4 * 33, 80) 140 | model = _get_model(n_speakers=32, speaker_embed_dim=16) 141 | speaker_ids = torch.LongTensor([1, 2, 3]) 142 | 143 | mel_outputs, linear_outputs, alignments, done = model(x, y, speaker_ids=speaker_ids) 144 | print("Input text:", x.size()) 145 | print("Input mel:", y.size()) 146 | print("Mel:", mel_outputs.size()) 147 | print("Linear:", linear_outputs.size()) 148 | print("Alignments:", alignments.size()) 149 | print("Done:", done.size()) 150 | 151 | 152 | @attr("issue38") 153 | def test_incremental_path_multiple_times(): 154 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 155 | seqs = np.array([text_to_sequence(t) for t in texts]) 156 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 157 | 158 | r = 4 159 | mel_dim = 80 160 | sequence = torch.LongTensor(seqs) 161 | text_positions = torch.LongTensor(text_positions) 162 | 163 | for model, speaker_ids in [ 164 | (_get_model(force_monotonic_attention=False), None), 165 | (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), torch.LongTensor([1]))]: 166 | model.eval() 167 | 168 | # first call 169 | mel_outputs, linear_outputs, alignments, done = model( 170 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 171 | 172 | # second call 173 | mel_outputs2, linear_outputs2, alignments2, done2 = model( 174 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 175 | 176 | # Should get same result 177 | c = (mel_outputs - mel_outputs2).abs() 178 | print(c.mean(), c.max()) 179 | 180 | assert np.allclose(mel_outputs.cpu().data.numpy(), 181 | mel_outputs2.cpu().data.numpy(), atol=1e-5) 182 | 183 | 184 | def test_incremental_correctness(): 185 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 186 | seqs = np.array([text_to_sequence(t) for t in texts]) 187 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 188 | 189 | mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy") 190 | mel = np.load(mel_path) 191 | max_target_len = mel.shape[0] 192 | r = 4 193 | mel_dim = 80 194 | if max_target_len % r != 0: 195 | max_target_len += r - max_target_len % r 196 | assert max_target_len % r == 0 197 | mel = _pad_2d(mel, 
max_target_len) 198 | mel = torch.from_numpy(mel) 199 | mel_reshaped = mel.contiguous().view(1, -1, mel_dim * r) 200 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 201 | 202 | x = torch.LongTensor(seqs) 203 | text_positions = torch.LongTensor(text_positions) 204 | frame_positions = torch.LongTensor(frame_positions) 205 | 206 | for model, speaker_ids in [ 207 | (_get_model(force_monotonic_attention=False), None), 208 | (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), torch.LongTensor([1]))]: 209 | model.eval() 210 | 211 | if speaker_ids is not None: 212 | speaker_embed = model.embed_speakers(speaker_ids) 213 | else: 214 | speaker_embed = None 215 | 216 | # Encoder 217 | encoder_outs = model.seq2seq.encoder(x, speaker_embed=speaker_embed) 218 | 219 | # Off line decoding 220 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( 221 | encoder_outs, mel_reshaped, speaker_embed=speaker_embed, 222 | text_positions=text_positions, frame_positions=frame_positions) 223 | 224 | # Online decoding with test inputs 225 | model.seq2seq.decoder.start_fresh_sequence() 226 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( 227 | encoder_outs, text_positions, speaker_embed=speaker_embed, 228 | test_inputs=mel_reshaped) 229 | 230 | # Should get same result 231 | c = (mel_outputs_offline - mel_outputs_online).abs() 232 | print(c.mean(), c.max()) 233 | 234 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(), 235 | mel_outputs_online.cpu().data.numpy(), atol=1e-5) 236 | 237 | 238 | @attr("local_only") 239 | def test_incremental_forward(): 240 | checkpoint_path = join(dirname(__file__), "../test_whole/checkpoint_step000265000.pth") 241 | if not exists(checkpoint_path): 242 | return 243 | model = _get_model() 244 | 245 | use_cuda = False 246 | 247 | checkpoint = torch.load(checkpoint_path) 248 | model.load_state_dict(checkpoint["state_dict"]) 249 | model.make_generation_fast_() 250 | model = model.cuda() if use_cuda else model 251 | 252 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 253 | seqs = np.array([text_to_sequence(t) for t in texts]) 254 | input_lengths = [len(s) for s in seqs] 255 | 256 | use_manual_padding = False 257 | if use_manual_padding: 258 | max_input_len = np.max(input_lengths) + 10 # manuall padding 259 | seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int) 260 | input_lengths = torch.LongTensor(input_lengths) 261 | input_lengths = input_lengths.cuda() if use_cuda else input_lengths 262 | else: 263 | input_lengths = None 264 | 265 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 266 | 267 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 268 | max_target_len = mel.shape[0] 269 | r = 4 270 | mel_dim = 80 271 | if max_target_len % r != 0: 272 | max_target_len += r - max_target_len % r 273 | assert max_target_len % r == 0 274 | mel = _pad_2d(mel, max_target_len) 275 | mel = torch.from_numpy(mel) 276 | mel_reshaped = mel.contiguous().view(1, -1, mel_dim * r) 277 | 278 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 279 | 280 | x = torch.LongTensor(seqs) 281 | text_positions = torch.LongTensor(text_positions) 282 | frame_positions = torch.LongTensor(frame_positions) 283 | 284 | if use_cuda: 285 | x = x.cuda() 286 | text_positions = text_positions.cuda() 287 | 
frame_positions = frame_positions.cuda() 288 | mel_reshaped = mel_reshaped.cuda() 289 | 290 | model.eval() 291 | 292 | def _plot(mel, mel_predicted, alignments): 293 | from matplotlib import pylab as plt 294 | plt.figure(figsize=(16, 10)) 295 | plt.subplot(3, 1, 1) 296 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", 297 | aspect="auto", cmap="magma") 298 | plt.colorbar() 299 | 300 | plt.subplot(3, 1, 2) 301 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, 302 | origin="lower bottom", aspect="auto", cmap="magma") 303 | plt.colorbar() 304 | 305 | plt.subplot(3, 1, 3) 306 | if alignments.dim() == 4: 307 | alignments = alignments.mean(0) 308 | plt.imshow(alignments[0].data.cpu( 309 | ).numpy().T, origin="lower bottom", aspect="auto") 310 | plt.colorbar() 311 | plt.show() 312 | 313 | # Encoder 314 | encoder_outs = model.seq2seq.encoder(x, lengths=input_lengths) 315 | 316 | # Off line decoding 317 | mel_output_offline, alignments_offline, done = model.seq2seq.decoder( 318 | encoder_outs, mel_reshaped, 319 | text_positions=text_positions, frame_positions=frame_positions, 320 | lengths=input_lengths) 321 | 322 | _plot(mel, mel_output_offline, alignments_offline) 323 | 324 | # Online decoding 325 | test_inputs = None 326 | # test_inputs = mel_reshaped 327 | model.seq2seq.decoder.start_fresh_sequence() 328 | mel_outputs, alignments, dones_online = model.seq2seq.decoder.incremental_forward( 329 | encoder_outs, text_positions, 330 | # initial_input=mel_reshaped[:, :1, :], 331 | test_inputs=test_inputs) 332 | 333 | if test_inputs is not None: 334 | c = (mel_output_offline - mel_outputs).abs() 335 | print(c.mean(), c.max()) 336 | _plot(mel, c, alignments) 337 | 338 | _plot(mel, mel_outputs, alignments) 339 | -------------------------------------------------------------------------------- /tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from deepvoice3_pytorch.modules import SinusoidalEncoding, position_encoding_init 7 | import numpy as np 8 | 9 | 10 | def test_sinusoidal(): 11 | num_embedding = 512 12 | embedding_dim = 128 13 | 14 | for w in [1.0, 0.5, 2.0, 10.0, 20.0]: 15 | a = nn.Embedding(num_embedding, embedding_dim, padding_idx=0) 16 | a.weight.data = position_encoding_init( 17 | num_embedding, embedding_dim, position_rate=w) 18 | 19 | b = SinusoidalEncoding(num_embedding, embedding_dim) 20 | 21 | x = torch.arange(0, 128).long() 22 | ax = a(x).data.numpy() 23 | bx = b(x, w).data.numpy() 24 | 25 | print(w, np.abs(ax - bx).mean()) 26 | try: 27 | assert np.allclose(ax, bx) 28 | except: 29 | print("TODO: has little numerical errors?") 30 | assert np.abs(ax - bx).mean() < 1e-5 31 | -------------------------------------------------------------------------------- /tests/test_frontend.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | from deepvoice3_pytorch import frontend 5 | from nose.plugins.attrib import attr 6 | 7 | eos = 1 8 | 9 | 10 | def test_en(): 11 | f = getattr(frontend, "en") 12 | seq = f.text_to_sequence("hello world.") 13 | assert seq[-1] == eos 14 | t = f.sequence_to_text(seq) 15 | assert t == "hello world.~" 16 | 17 | 18 | @attr("local_only") 19 | def test_ja(): 20 | f = getattr(frontend, "jp") 21 | seq = 
f.text_to_sequence("こんにちわ") 22 | assert seq[-1] == eos 23 | t = f.sequence_to_text(seq) 24 | assert t[:-1] == "コンニチワ。" 25 | 26 | 27 | @attr("local_only") 28 | def test_en_lj(): 29 | f = getattr(frontend, "en") 30 | from nnmnkwii.datasets import ljspeech 31 | from tqdm import trange 32 | import jaconv 33 | 34 | d = ljspeech.TranscriptionDataSource("/home/ryuichi/data/LJSpeech-1.0") 35 | texts = d.collect_files() 36 | 37 | for p in [0.0, 0.5, 1.0]: 38 | for idx in trange(len(texts)): 39 | text = texts[idx] 40 | seq = f.text_to_sequence(text, p=p) 41 | assert seq[-1] == eos 42 | t = f.sequence_to_text(seq) 43 | 44 | if idx < 10: 45 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t)) 46 | 47 | 48 | @attr("local_only") 49 | def test_ja_jsut(): 50 | f = getattr(frontend, "jp") 51 | from nnmnkwii.datasets import jsut 52 | from tqdm import trange 53 | import jaconv 54 | 55 | d = jsut.TranscriptionDataSource("/home/ryuichi/data/jsut_ver1.1/", 56 | subsets=jsut.available_subsets) 57 | texts = d.collect_files() 58 | 59 | for p in [0.0, 0.5, 1.0]: 60 | for idx in trange(len(texts)): 61 | text = texts[idx] 62 | seq = f.text_to_sequence(text, p=p) 63 | assert seq[-1] == eos 64 | t = f.sequence_to_text(seq) 65 | 66 | if idx < 10: 67 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t)) 68 | -------------------------------------------------------------------------------- /tests/test_nyanko.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join, exists 6 | 7 | from deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab 8 | 9 | import torch 10 | from torch import nn 11 | import numpy as np 12 | 13 | from nose.plugins.attrib import attr 14 | 15 | from deepvoice3_pytorch.builder import nyanko 16 | from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 17 | 18 | use_cuda = torch.cuda.is_available() and False 19 | num_mels = 80 20 | num_freq = 513 21 | outputs_per_step = 4 22 | padding_idx = 0 23 | 24 | 25 | def _pad(seq, max_len): 26 | return np.pad(seq, (0, max_len - len(seq)), 27 | mode='constant', constant_values=0) 28 | 29 | 30 | def _test_data(): 31 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 32 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 33 | input_lengths = np.array([len(s) for s in seqs]) 34 | max_len = np.max(input_lengths) 35 | seqs = np.array([_pad(s, max_len) for s in seqs]) 36 | 37 | # Test encoder 38 | x = torch.LongTensor(seqs) 39 | y = torch.rand(x.size(0), 12, 80) 40 | 41 | return x, y 42 | 43 | 44 | def _pad_2d(x, max_len, b_pad=0): 45 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)], 46 | mode="constant", constant_values=0) 47 | return x 48 | 49 | 50 | def test_nyanko_basics(): 51 | x, y = _test_data() 52 | 53 | for v in [False, True]: 54 | model = nyanko(n_vocab, mel_dim=num_mels, linear_dim=num_freq, r=1, downsample_step=4, 55 | use_decoder_state_for_postnet_input=v) 56 | mel_outputs, linear_outputs, alignments, done = model(x, y) 57 | 58 | 59 | @attr("issue38") 60 | def test_incremental_path_multiple_times(): 61 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 62 | seqs = np.array([text_to_sequence(t) for t in texts]) 63 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 64 | 65 | r = 1 66 | mel_dim = 80 67 | 68 | sequence = torch.LongTensor(seqs) 69 | 
text_positions = torch.LongTensor(text_positions) 70 | 71 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, 72 | r=r, force_monotonic_attention=False) 73 | model.eval() 74 | 75 | # first call 76 | mel_outputs, linear_outputs, alignments, done = model( 77 | sequence, text_positions=text_positions, speaker_ids=None) 78 | 79 | # second call 80 | mel_outputs2, linear_outputs2, alignments2, done2 = model( 81 | sequence, text_positions=text_positions, speaker_ids=None) 82 | 83 | # Should get same result 84 | c = (mel_outputs - mel_outputs2).abs() 85 | print(c.mean(), c.max()) 86 | 87 | assert np.allclose(mel_outputs.cpu().data.numpy(), 88 | mel_outputs2.cpu().data.numpy(), atol=1e-5) 89 | 90 | 91 | def test_incremental_correctness(): 92 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 93 | seqs = np.array([text_to_sequence(t) for t in texts]) 94 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 95 | 96 | mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy") 97 | mel = np.load(mel_path)[::4] 98 | max_target_len = mel.shape[0] 99 | r = 1 100 | mel_dim = 80 101 | if max_target_len % r != 0: 102 | max_target_len += r - max_target_len % r 103 | assert max_target_len % r == 0 104 | mel = _pad_2d(mel, max_target_len) 105 | mel = torch.from_numpy(mel) 106 | mel_reshaped = mel.view(1, -1, mel_dim * r) 107 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 108 | 109 | x = torch.LongTensor(seqs) 110 | text_positions = torch.LongTensor(text_positions) 111 | frame_positions = torch.LongTensor(frame_positions) 112 | 113 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, 114 | r=r, force_monotonic_attention=False) 115 | model.eval() 116 | 117 | # Encoder 118 | encoder_outs = model.seq2seq.encoder(x) 119 | 120 | # Off line decoding 121 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( 122 | encoder_outs, mel_reshaped, 123 | text_positions=text_positions, frame_positions=frame_positions) 124 | 125 | # Online decoding with test inputs 126 | model.seq2seq.decoder.start_fresh_sequence() 127 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( 128 | encoder_outs, text_positions, 129 | test_inputs=mel_reshaped) 130 | 131 | # Should get same result 132 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(), 133 | mel_outputs_online.cpu().data.numpy()) 134 | 135 | 136 | @attr("local_only") 137 | def test_nyanko(): 138 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 139 | seqs = np.array([text_to_sequence(t) for t in texts]) 140 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 141 | 142 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 143 | max_target_len = mel.shape[0] 144 | r = 1 145 | mel_dim = 80 146 | if max_target_len % r != 0: 147 | max_target_len += r - max_target_len % r 148 | assert max_target_len % r == 0 149 | mel = _pad_2d(mel, max_target_len) 150 | mel = torch.from_numpy(mel) 151 | mel_reshaped = mel.view(1, -1, mel_dim * r) 152 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 153 | 154 | x = torch.LongTensor(seqs) 155 | text_positions = torch.LongTensor(text_positions) 156 | frame_positions = torch.LongTensor(frame_positions) 157 | 158 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, 
downsample_step=4, 159 | r=r, force_monotonic_attention=False) 160 | model.eval() 161 | 162 | def _plot(mel, mel_predicted, alignments): 163 | from matplotlib import pylab as plt 164 | plt.figure(figsize=(16, 10)) 165 | plt.subplot(3, 1, 1) 166 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma") 167 | plt.colorbar() 168 | 169 | plt.subplot(3, 1, 2) 170 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, 171 | origin="lower bottom", aspect="auto", cmap="magma") 172 | plt.colorbar() 173 | 174 | plt.subplot(3, 1, 3) 175 | if alignments.dim() == 4: 176 | alignments = alignments.mean(0) 177 | plt.imshow(alignments[0].data.cpu( 178 | ).numpy().T, origin="lower bottom", aspect="auto") 179 | plt.colorbar() 180 | plt.show() 181 | 182 | seq2seq = model.seq2seq 183 | 184 | # Encoder 185 | encoder_outs = seq2seq.encoder(x) 186 | 187 | # Off line decoding 188 | print("Offline decoding") 189 | mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder( 190 | encoder_outs, mel_reshaped, 191 | text_positions=text_positions, frame_positions=frame_positions) 192 | 193 | _plot(mel, mel_outputs_offline, alignments_offline) 194 | 195 | # Online decoding with test inputs 196 | print("Online decoding") 197 | seq2seq.decoder.start_fresh_sequence() 198 | mel_outputs_online, alignments, dones_online, _ = seq2seq.decoder.incremental_forward( 199 | encoder_outs, text_positions, 200 | test_inputs=mel_reshaped) 201 | 202 | a = mel_outputs_offline.cpu().data.numpy() 203 | b = mel_outputs_online.cpu().data.numpy() 204 | c = (mel_outputs_offline - mel_outputs_online).abs() 205 | print(c.mean(), c.max()) 206 | 207 | _plot(mel, mel_outputs_offline, alignments_offline) 208 | _plot(mel, mel_outputs_online, alignments) 209 | _plot(mel, c, alignments) 210 | 211 | # Should get same result 212 | assert np.allclose(a, b) 213 | 214 | postnet = model.postnet 215 | 216 | linear_outputs = postnet(mel_outputs_offline) 217 | print(linear_outputs.size()) 218 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E305,E402,E721,E741,F401,F403,F405,F821,F841,F999 4 | exclude = docs/,data,build,dist,notebooks,checkpoints*,legacy,vctk_preprocess,nikl_preprocess 5 | -------------------------------------------------------------------------------- /vctk.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import vctk 7 | from nnmnkwii.io import hts 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | 13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 14 | executor = ProcessPoolExecutor(max_workers=num_workers) 15 | futures = [] 16 | 17 | speakers = vctk.available_speakers 18 | 19 | td = vctk.TranscriptionDataSource(in_dir, speakers=speakers) 20 | transcriptions = td.collect_files() 21 | speaker_ids = td.labels 22 | wav_paths = vctk.WavFileDataSource( 23 | in_dir, speakers=speakers).collect_files() 24 | 25 | for index, (speaker_id, text, wav_path) in enumerate( 26 | zip(speaker_ids, transcriptions, wav_paths)): 27 | futures.append(executor.submit( 28 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text))) 29 | return 
[future.result() for future in tqdm(futures)] 30 | 31 | 32 | def start_at(labels): 33 | has_silence = labels[0][-1] == "pau" 34 | if not has_silence: 35 | return labels[0][0] 36 | for i in range(1, len(labels)): 37 | if labels[i][-1] != "pau": 38 | return labels[i][0] 39 | assert False 40 | 41 | 42 | def end_at(labels): 43 | has_silence = labels[-1][-1] == "pau" 44 | if not has_silence: 45 | return labels[-1][1] 46 | for i in range(len(labels) - 2, 0, -1): 47 | if labels[i][-1] != "pau": 48 | return labels[i][1] 49 | assert False 50 | 51 | 52 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 53 | sr = hparams.sample_rate 54 | 55 | # Load the audio to a numpy array: 56 | wav = audio.load_wav(wav_path) 57 | 58 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") 59 | 60 | # Trim silence from hts labels if available 61 | if exists(lab_path): 62 | labels = hts.load(lab_path) 63 | b = int(start_at(labels) * 1e-7 * sr) 64 | e = int(end_at(labels) * 1e-7 * sr) 65 | wav = wav[b:e] 66 | wav, _ = librosa.effects.trim(wav, top_db=25) 67 | else: 68 | wav, _ = librosa.effects.trim(wav, top_db=15) 69 | 70 | if hparams.rescaling: 71 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 72 | 73 | # Compute the linear-scale spectrogram from the wav: 74 | spectrogram = audio.spectrogram(wav).astype(np.float32) 75 | n_frames = spectrogram.shape[1] 76 | 77 | # Compute a mel-scale spectrogram from the wav: 78 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 79 | 80 | # Write the spectrograms to disk: 81 | spectrogram_filename = 'vctk-spec-%05d.npy' % index 82 | mel_filename = 'vctk-mel-%05d.npy' % index 83 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 84 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 85 | 86 | # Return a tuple describing this training example: 87 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id) 88 | -------------------------------------------------------------------------------- /vctk_preprocess/.gitignore: -------------------------------------------------------------------------------- 1 | latest_features 2 | tts_env.sh 3 | -------------------------------------------------------------------------------- /vctk_preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing for VCTK 2 | 3 | Wav files in VCTK contains lots of long silences, which affects training char-level seq2seq models. To deal with the problem, we will 4 | 5 | - **Prepare phoneme alignments for all utterances** (code in the directory) 6 | - Cut silences during preprocessing (code in the parent directory) 7 | 8 | ## Note 9 | 10 | Code in the directory heavily relies on https://gist.github.com/kastnerkyle/cc0ac48d34860c5bb3f9112f4d9a0300 (which is hard copied in the repo). If you have any issues, please make sure that you can successfully run the script. 11 | 12 | ## Steps 13 | 14 | 1. Download VCTK: http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html 15 | 2. Install HTK/speech_tools/festival/merlin and prepare `tts_env.sh`. If you don't have speech tools, you can install them by https://gist.github.com/kastnerkyle/001a58a58d090658ee5350cb6129f857. 
For reference, my `tts_env.sh` looks like this: 16 | ``` 17 | export ESTDIR=/home/ryuichi/Dropbox/sp/speech_tools/ 18 | export FESTDIR=/home/ryuichi/Dropbox/sp/festival/ 19 | export FESTVOXDIR=/home/ryuichi/Dropbox/sp/festvox/ 20 | export VCTKDIR=/home/ryuichi/data/VCTK-Corpus/ 21 | export HTKDIR=/usr/local/HTS-2.3/bin/ 22 | export SPTKDIR=/usr/local/bin/ 23 | export MERLINDIR=/home/ryuichi/Dropbox/sp/merlin_pr/ 24 | ``` 25 | 3. Run the script (takes ~24 hours) 26 | ``` 27 | python prepare_vctk_labels.py ${your_vctk_dir} ${dst_dir} 28 | ``` 29 | This will process all utterances of VCTK and copy HTK-style alignments to `${dst_dir}`. 30 | It is recommended to copy the alignments to the top of the VCTK corpus, i.e., 31 | ``` 32 | python prepare_vctk_labels.py ~/data/VCTK-Corpus ~/data/VCTK-Corpus/lab 33 | ``` 34 | 35 | After the above steps, you will get alignments as follows: 36 | 37 | ``` 38 | tree ~/data/VCTK-Corpus/lab/ | head /home/ryuichi/data/VCTK-Corpus/lab/ 39 | ├── p225 40 | │   ├── p225_001.lab 41 | │   ├── p225_002.lab 42 | │   ├── p225_003.lab 43 | │   ├── p225_004.lab 44 | │   ├── p225_005.lab 45 | │   ├── p225_006.lab 46 | │   ├── p225_007.lab 47 | │   ├── p225_008.lab 48 | ``` 49 | 50 | ``` 51 | cat ~/data/VCTK-Corpus/lab/p225/p225_001.lab 52 | 53 | 0 850000 pau 54 | 850000 2850000 pau 55 | 2850000 3600000 p 56 | 3600000 3900000 l 57 | 3900000 6000000 iy 58 | 6000000 8450000 z 59 | 8450000 8600000 k 60 | 8600000 11300000 ao 61 | 11300000 11450000 l 62 | 11450000 12800000 s 63 | 12800000 13099999 t 64 | 13099999 15800000 eh 65 | 15800000 16050000 l 66 | 16050000 17600000 ax 67 | 17600000 20400000 pau 68 | ``` 69 | 70 | ## Using Gentle? 71 | 72 | `prepare_htk_alignments_vctk.py` does the same thing as above using [Gentle](https://github.com/lowerquality/gentle), but it turned out not to work very well. The code is left here in case it can be improved in the future. 73 | -------------------------------------------------------------------------------- /vctk_preprocess/prepare_htk_alignments_vctk.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Script to do forced alignment with Gentle for VCTK. This script takes approximately 4 | 40 hours to finish. It processes all utterances in VCTK. 5 | 6 | NOTE: Must be run with Python 2, since Gentle doesn't work with Python 3. 7 | 8 | Usage: 9 | 1. Install https://github.com/lowerquality/gentle 10 | 2. 
Download VCTK http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html 11 | 12 | and then run the script by: 13 | 14 | python2 prepare_htk_alignments_vctk.py ${your_vctk_data_path} 15 | 16 | After running the script, you will see alignment files in `lab` directory as 17 | follows: 18 | 19 | > tree ~/data/VCTK-Corpus/ -d -L 20 | 21 | /home/ryuichi/data/VCTK-Corpus/ 22 | ├── lab 23 | ├── txt 24 | └── wav48 25 | """ 26 | import argparse 27 | import logging 28 | import multiprocessing 29 | import os 30 | import sys 31 | from tqdm import tqdm 32 | import json 33 | from os.path import join, basename, dirname, exists 34 | import numpy as np 35 | 36 | import gentle 37 | import librosa 38 | from nnmnkwii.datasets import vctk 39 | 40 | 41 | def on_progress(p): 42 | for k, v in p.items(): 43 | logging.debug("%s: %s" % (k, v)) 44 | 45 | 46 | def write_hts_label(labels, lab_path): 47 | lab = "" 48 | for s, e, l in labels: 49 | s, e = float(s) * 1e7, float(e) * 1e7 50 | s, e = int(s), int(e) 51 | lab += "{} {} {}\n".format(s, e, l) 52 | print(lab) 53 | with open(lab_path, "w") as f: 54 | f.write(lab) 55 | 56 | 57 | def json2hts(data): 58 | emit_bos = False 59 | emit_eos = False 60 | 61 | phone_start = 0 62 | phone_end = None 63 | labels = [] 64 | 65 | for word in data["words"]: 66 | case = word["case"] 67 | if case != "success": 68 | raise RuntimeError("Alignment failed") 69 | start = float(word["start"]) 70 | word_end = float(word["end"]) 71 | 72 | if not emit_bos: 73 | labels.append((phone_start, start, "silB")) 74 | emit_bos = True 75 | 76 | phone_start = start 77 | phone_end = None 78 | for phone in word["phones"]: 79 | ph = str(phone["phone"][:-2]) 80 | duration = float(phone["duration"]) 81 | phone_end = phone_start + duration 82 | labels.append((phone_start, phone_end, ph)) 83 | phone_start += duration 84 | assert np.allclose(phone_end, word_end) 85 | if not emit_eos: 86 | labels.append((phone_start, phone_end, "silE")) 87 | emit_eos = True 88 | 89 | return labels 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser( 94 | description='Do force alignment for VCTK and save HTK-style alignments') 95 | parser.add_argument( 96 | '--nthreads', default=multiprocessing.cpu_count(), type=int, 97 | help='number of alignment threads') 98 | parser.add_argument( 99 | '--conservative', dest='conservative', action='store_true', 100 | help='conservative alignment') 101 | parser.set_defaults(conservative=False) 102 | parser.add_argument( 103 | '--disfluency', dest='disfluency', action='store_true', 104 | help='include disfluencies (uh, um) in alignment') 105 | parser.set_defaults(disfluency=False) 106 | parser.add_argument( 107 | '--log', default="INFO", 108 | help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)') 109 | parser.add_argument('data_root', type=str, help='Data root') 110 | 111 | args = parser.parse_args() 112 | 113 | log_level = args.log.upper() 114 | logging.getLogger().setLevel(log_level) 115 | disfluencies = set(['uh', 'um']) 116 | 117 | data_root = args.data_root 118 | 119 | # Do for all speakers 120 | speakers = vctk.available_speakers 121 | 122 | # Collect all transcripts/wav files 123 | td = vctk.TranscriptionDataSource(data_root, speakers=speakers) 124 | transcriptions = td.collect_files() 125 | wav_paths = vctk.WavFileDataSource( 126 | data_root, speakers=speakers).collect_files() 127 | 128 | # Save dir 129 | save_dir = join(data_root, "lab") 130 | if not exists(save_dir): 131 | os.makedirs(save_dir) 132 | 133 | resources = gentle.Resources() 134 | 
135 | for idx in tqdm(range(len(wav_paths))): 136 | transcript = transcriptions[idx] 137 | audiofile = wav_paths[idx] 138 | lab_path = audiofile.replace("wav48/", "lab/").replace(".wav", ".lab") 139 | print(transcript) 140 | print(audiofile) 141 | print(lab_path) 142 | lab_dir = dirname(lab_path) 143 | if not exists(lab_dir): 144 | os.makedirs(lab_dir) 145 | 146 | logging.info("converting audio to 8K sampled wav") 147 | with gentle.resampled(audiofile) as wavfile: 148 | logging.info("starting alignment") 149 | aligner = gentle.ForcedAligner(resources, transcript, 150 | nthreads=args.nthreads, 151 | disfluency=args.disfluency, 152 | conservative=args.conservative, 153 | disfluencies=disfluencies) 154 | result = aligner.transcribe( 155 | wavfile, progress_cb=on_progress, logging=logging) 156 | 157 | # convert to HTK format 158 | a = json.loads(result.to_json()) 159 | try: 160 | labels = json2hts(a) 161 | except RuntimeError as e: 162 | from warnings import warn 163 | warn(str(e)) 164 | continue 165 | 166 | # Insert end time 167 | x, sr = librosa.load(wavfile, sr=8000) 168 | endtime = float(len(x)) / sr 169 | labels[-1] = (labels[-1][0], endtime, labels[-1][-1]) 170 | 171 | # write to file 172 | write_hts_label(labels, lab_path) 173 | -------------------------------------------------------------------------------- /vctk_preprocess/prepare_vctk_labels.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Prepare HTS alignments for VCTK. 4 | 5 | usage: prepare_vctk_labels.py [options] <data_root> <out_dir> 6 | 7 | options: 8 | -h, --help Show help message. 9 | """ 10 | from docopt import docopt 11 | import os 12 | from nnmnkwii.datasets import vctk 13 | from os.path import join, exists, splitext, basename 14 | import sys 15 | from glob import glob 16 | 17 | from subprocess import Popen, PIPE 18 | from tqdm import tqdm 19 | 20 | 21 | def do(cmd): 22 | print(cmd) 23 | p = Popen(cmd, shell=True) 24 | p.wait() 25 | 26 | 27 | if __name__ == "__main__": 28 | args = docopt(__doc__) 29 | data_root = args["<data_root>"] 30 | out_dir = args["<out_dir>"] 31 | 32 | for idx in tqdm(range(len(vctk.available_speakers))): 33 | speaker = vctk.available_speakers[idx] 34 | 35 | wav_root = join(data_root, "wav48/p{}".format(speaker)) 36 | txt_root = join(data_root, "txt/p{}".format(speaker)) 37 | assert exists(wav_root) 38 | assert exists(txt_root) 39 | print(wav_root, txt_root) 40 | 41 | # Do alignments 42 | cmd = "python ./extract_feats.py -w {} -t {}".format(wav_root, txt_root) 43 | do(cmd) 44 | 45 | # Copy 46 | lab_dir = join(out_dir, "p{}".format(speaker)) 47 | if not exists(lab_dir): 48 | os.makedirs(lab_dir) 49 | cmd = "cp ./latest_features/merlin/misc/scripts/alignment/phone_align/full-context-labels/mono/*.lab {}".format( 50 | lab_dir) 51 | do(cmd) 52 | 53 | # Remove 54 | do("rm -rf ./latest_features") 55 | 56 | sys.exit(0) 57 | --------------------------------------------------------------------------------
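The preset files and `synthesis.py` above are tied together by `hparams.parse_json`. For reference, here is a minimal programmatic sketch of that flow; it mirrors the `__main__` block of `synthesis.py`, the checkpoint path and output file name are placeholders, and it assumes the repository root is the working directory with the `bin` extras installed.

```
# Minimal sketch mirroring synthesis.py's __main__ (checkpoint path and output name are placeholders).
import audio
import synthesis
import train
from deepvoice3_pytorch import frontend
from hparams import hparams
from synthesis import tts, _load
from train import build_model

# 1. Load one of the shipped presets so hparams describes the model to build.
with open("presets/nyanko_ljspeech.json") as f:
    hparams.parse_json(f.read())

# 2. Resolve the text frontend named by the preset ("en" here) for both modules.
synthesis._frontend = getattr(frontend, hparams.frontend)
train._frontend = synthesis._frontend

# 3. Build the model and restore weights from a trained checkpoint (placeholder path).
model = build_model()
checkpoint = _load("checkpoints/checkpoint_step000585000.pth")
model.load_state_dict(checkpoint["state_dict"])

# 4. Greedy decoding; tts() returns (waveform, alignment, spectrogram, mel).
waveform, alignment, _, _ = tts(model, "Hello world.", p=0.0, speaker_id=None, fast=True)
audio.save_wav(waveform, "hello_world.wav")
```

The CLI in the docstring of `synthesis.py` does the same thing, with the preset passed through `--preset` and the checkpoint, text list file, and output directory given as positional arguments.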