├── .github └── stale.yml ├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── appveyor.yml ├── assets └── banner.jpg ├── audio.py ├── compute_timestamp_ratio.py ├── deepvoice3_pytorch ├── __init__.py ├── builder.py ├── conv.py ├── deepvoice3.py ├── frontend │ ├── __init__.py │ ├── en │ │ └── __init__.py │ ├── es │ │ └── __init__.py │ ├── jp │ │ └── __init__.py │ ├── ko │ │ └── __init__.py │ └── text │ │ ├── __init__.py │ │ ├── cleaners.py │ │ ├── cmudict.py │ │ ├── numbers.py │ │ └── symbols.py ├── modules.py ├── nyanko.py └── tfcompat │ ├── __init__.py │ ├── hparam.py │ └── readme.md ├── docs ├── .gitignore ├── config.toml ├── content │ └── index.md ├── layouts │ ├── _default │ │ ├── list.html │ │ └── single.html │ ├── index.html │ └── partials │ │ ├── footer.html │ │ ├── header.html │ │ ├── mathjax.html │ │ └── social.html └── static │ ├── audio │ ├── deepvoice3 │ │ └── 3_keithito │ │ │ ├── 0_checkpoint_step000210000.wav │ │ │ ├── 0_checkpoint_step000210000_alignment.png │ │ │ ├── 1_checkpoint_step000210000.wav │ │ │ ├── 1_checkpoint_step000210000_alignment.png │ │ │ ├── 2_checkpoint_step000210000.wav │ │ │ ├── 2_checkpoint_step000210000_alignment.png │ │ │ ├── 3_checkpoint_step000210000.wav │ │ │ ├── 3_checkpoint_step000210000_alignment.png │ │ │ ├── 4_checkpoint_step000210000.wav │ │ │ ├── 4_checkpoint_step000210000_alignment.png │ │ │ ├── 5_checkpoint_step000210000.wav │ │ │ └── 5_checkpoint_step000210000_alignment.png │ ├── deepvoice3_multispeaker │ │ ├── 3_keithito │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ ├── 4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ │ ├── 5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav │ │ │ ├── 5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png │ │ │ ├── 5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav │ │ │ └── 
5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png │ │ └── loop │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker0.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker1.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker10.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker11.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker2.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker3.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker4.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker5.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker6.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker7.wav │ │ │ ├── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker8.wav │ │ │ └── 0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker9.wav │ └── nyanko │ │ └── 3_keithito │ │ ├── 0_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 0_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 1_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 1_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 2_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 2_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 3_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 3_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 4_20171129_nyanko_checkpoint_step000585000.wav │ │ ├── 4_20171129_nyanko_checkpoint_step000585000_alignment.png │ │ ├── 5_20171129_nyanko_checkpoint_step000585000.wav │ │ └── 5_20171129_nyanko_checkpoint_step000585000_alignment.png │ ├── css │ ├── custom.css │ ├── normalize.css │ └── skeleton.css │ ├── favicon.png │ └── images │ └── 512logotipo.png ├── dump_hparams_to_json.py ├── gentle_web_align.py ├── hparams.py ├── json_meta.py ├── jsut.py ├── ljspeech.py ├── lrschedule.py ├── nikl_m.py ├── nikl_preprocess ├── README.md └── prepare_metafile.py ├── nikl_s.py ├── preprocess.py ├── presets ├── deepvoice3_ljspeech.json ├── deepvoice3_niklm.json ├── deepvoice3_nikls.json ├── deepvoice3_vctk.json └── nyanko_ljspeech.json ├── release.sh ├── setup.py ├── synthesis.py ├── tests ├── data │ └── ljspeech-mel-00001.npy ├── test_audio.py ├── test_conv.py ├── test_deepvoice3.py ├── test_embedding.py ├── test_frontend.py └── test_nyanko.py ├── tox.ini ├── train.py ├── vctk.py └── vctk_preprocess ├── .gitignore ├── README.md ├── extract_feats.py ├── prepare_htk_alignments_vctk.py └── prepare_vctk_labels.py /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an Issue or Pull Request becomes stale 2 | daysUntilStale: 60 3 | 4 | # Number of days of inactivity before an Issue or Pull Request with the stale label is closed. 5 | # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale. 6 | daysUntilClose: 7 7 | 8 | # Only issues or pull requests with all of these labels are check if stale. Defaults to `[]` (disabled) 9 | onlyLabels: [] 10 | 11 | # Issues or Pull Requests with these labels will never be considered stale. 
Set to `[]` to disable 12 | exemptLabels: 13 | - roadmap 14 | - bug 15 | - design 16 | 17 | # Set to true to ignore issues in a project (defaults to false) 18 | exemptProjects: true 19 | 20 | # Set to true to ignore issues in a milestone (defaults to false) 21 | exemptMilestones: true 22 | 23 | # Label to use when marking as stale 24 | staleLabel: wontfix 25 | 26 | # Comment to post when marking as stale. Set to `false` to disable 27 | markComment: > 28 | This issue has been automatically marked as stale because it has not had 29 | recent activity. It will be closed if no further activity occurs. Thank you 30 | for your contributions. 31 | 32 | # Limit the number of actions per hour, from 1-30. Default is 30 33 | limitPerRun: 30 34 | 35 | # Limit to only `issues` or `pulls` 36 | only: issues 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | legacy 2 | notebooks 3 | foobar* 4 | run.sh 5 | README.rst 6 | pretrained_models 7 | deepvoice3_pytorch/version.py 8 | checkpoints* 9 | log 10 | generated 11 | data 12 | datasets 13 | testout 14 | 15 | # Created by https://www.gitignore.io 16 | 17 | ### Python ### 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | env/ 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | 74 | ### IPythonNotebook ### 75 | # Temporary data 76 | .ipynb_checkpoints/ 77 | 78 | 79 | ### SublimeText ### 80 | # cache files for sublime text 81 | *.tmlanguage.cache 82 | *.tmPreferences.cache 83 | *.stTheme.cache 84 | 85 | # workspace files are user-specific 86 | *.sublime-workspace 87 | 88 | # project files should be checked into the repository, unless a significant 89 | # proportion of contributors will probably not be using SublimeText 90 | # *.sublime-project 91 | 92 | # sftp configuration file 93 | sftp-config.json 94 | 95 | 96 | ### Emacs ### 97 | # -*- mode: gitignore; -*- 98 | *~ 99 | \#*\# 100 | /.emacs.desktop 101 | /.emacs.desktop.lock 102 | *.elc 103 | auto-save-list 104 | tramp 105 | .\#* 106 | 107 | # Org-mode 108 | .org-id-locations 109 | *_archive 110 | 111 | # flymake-mode 112 | *_flymake.* 113 | 114 | # eshell files 115 | /eshell/history 116 | /eshell/lastdir 117 | 118 | # elpa packages 119 | /elpa/ 120 | 121 | # reftex files 122 | *.rel 123 | 124 | # AUCTeX auto folder 125 | /auto/ 126 | 127 | # cask packages 128 | .cask/ 129 | 130 | 131 | ### Vim ### 132 | [._]*.s[a-w][a-z] 133 | [._]s[a-w][a-z] 134 | *.un~ 135 | Session.vim 136 | .netrwhist 137 | *~ 138 | 139 | 140 | ### C++ ### 141 | # Compiled Object files 142 | *.slo 143 | *.lo 144 | *.o 145 | *.obj 146 | 147 | # Precompiled Headers 148 | *.gch 149 | *.pch 150 | 151 | # Compiled Dynamic libraries 152 | *.so 153 | *.dylib 154 | *.dll 155 | 156 | # Fortran module files 157 | *.mod 158 | 159 | # Compiled Static libraries 160 | *.lai 161 | *.la 162 | *.a 163 | *.lib 164 | 165 | # Executables 166 | *.exe 167 | *.out 168 | *.app 169 | 170 | 171 | ### OSX ### 172 | .DS_Store 173 | .AppleDouble 174 | .LSOverride 175 | 176 | # Icon must end with two \r 177 | Icon 178 | 179 | 180 | # Thumbnails 181 | ._* 182 | 183 | # Files that might appear on external disk 184 | .Spotlight-V100 185 | .Trashes 186 | 187 | # Directories potentially created on remote AFP share 188 | .AppleDB 189 | .AppleDesktop 190 | Network Trash Folder 191 | Temporary Items 192 | .apdisk 193 | 194 | 195 | ### Linux ### 196 | *~ 197 | 198 | # KDE directory preferences 199 | .directory 200 | 201 | # Linux trash folder which might appear on any partition or disk 202 | .Trash-* 203 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/.gitmodules -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | notifications: 7 | email: false 8 | 9 | before_install: 10 | - sudo apt-get update 11 | - if [["$TRAVIS_PYTHON_VERSION" == "2.7"]]; then 12 | wget http://repo.continuum.io/miniconda/Miniconda-3.8.3-Linux-x86_64.sh -O miniconda.sh; 13 | else 14 | wget http://repo.continuum.io/miniconda/Miniconda3-3.8.3-Linux-x86_64.sh -O miniconda.sh; 15 | fi 16 | - bash 
miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda config --add channels pypi 23 | - conda info -a 24 | - deps='pip numpy scipy cython nose pytorch flake8' 25 | - conda create -q -n test-environment "python=$TRAVIS_PYTHON_VERSION" $deps -c pytorch 26 | - source activate test-environment 27 | 28 | install: 29 | - pip install -e ".[test]" 30 | - python -c "import nltk; nltk.download('cmudict')" 31 | 32 | before_script: 33 | # stop the build if there are Python syntax errors or undefined names 34 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | 38 | script: 39 | - nosetests -v -w tests/ -a '!local_only' 40 | 41 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The deepvoice3_pytorch package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Ryuichi Yamamoto. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | > # Part of code was adapted from https://github.com/facebookresearch/fairseq-py 25 | > # Copyright (c) 2017-present, Facebook, Inc. 26 | > # Thier licenses apply. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE.md 2 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON_VERSION: "3.6" 4 | PYTHON_ARCH: "64" 5 | MINICONDA: C:\Miniconda36-x64 6 | 7 | branches: 8 | only: 9 | - master 10 | - /release-.*/ 11 | 12 | skip_commits: 13 | message: /\[av skip\]/ 14 | 15 | notifications: 16 | - provider: Email 17 | on_build_success: false 18 | on_build_failure: false 19 | on_build_status_changed: false 20 | 21 | init: 22 | - "ECHO %PYTHON_VERSION% %PYTHON_ARCH% %MINICONDA%" 23 | 24 | install: 25 | - "SET PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" 26 | - conda config --set always_yes yes --set changeps1 no 27 | - conda update -q conda 28 | - conda info -a 29 | - "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy cython nose pytorch -c pytorch" 30 | - activate test-environment 31 | 32 | build_script: 33 | - pip install -e ".[test]" 34 | - python -c "import nltk; nltk.download('cmudict')" 35 | 36 | test_script: 37 | - nosetests -v -w tests/ -a "!local_only" 38 | -------------------------------------------------------------------------------- /assets/banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/assets/banner.jpg -------------------------------------------------------------------------------- /audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav = wav * 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def preemphasis(x): 22 | from nnmnkwii.preprocessing import preemphasis 23 | return preemphasis(x, hparams.preemphasis) 24 | 25 | 26 | def inv_preemphasis(x): 27 | from nnmnkwii.preprocessing import inv_preemphasis 28 | return inv_preemphasis(x, hparams.preemphasis) 29 | 30 | 31 | def spectrogram(y): 32 | D = _lws_processor().stft(preemphasis(y)).T 33 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 34 | return _normalize(S) 35 | 36 | 37 | def inv_spectrogram(spectrogram): 38 | '''Converts spectrogram to waveform using librosa''' 39 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 40 | processor = _lws_processor() 41 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 42 | y = processor.istft(D).astype(np.float32) 43 | return inv_preemphasis(y) 44 | 45 | 46 | def melspectrogram(y): 47 | D = _lws_processor().stft(preemphasis(y)).T 48 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 49 | if not hparams.allow_clipping_in_normalization: 50 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 51 | return _normalize(S) 52 | 53 | 54 | def _lws_processor(): 55 | return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech") 56 | 57 | 58 | # Conversions: 
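# --- Illustrative usage (editor's sketch, not part of the original module) ---
# A round trip through the helpers defined above; "utterance.wav" is a hypothetical
# path, and hparams.py is assumed to provide the usual fields (sample_rate, fft_size,
# hop_size, preemphasis, min_level_db, ref_level_db, ...).
#
#     wav = load_wav("utterance.wav")          # float32 waveform at hparams.sample_rate
#     mel = melspectrogram(wav)                # mel spectrogram normalized to [0, 1]
#     linear = spectrogram(wav)                # linear spectrogram normalized to [0, 1]
#     wav_hat = inv_spectrogram(linear)        # waveform recovered via LWS phase estimation
#     save_wav(wav_hat, "reconstructed.wav")   # written as 16-bit PCM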
59 | 60 | 61 | _mel_basis = None 62 | 63 | 64 | def _linear_to_mel(spectrogram): 65 | global _mel_basis 66 | if _mel_basis is None: 67 | _mel_basis = _build_mel_basis() 68 | return np.dot(_mel_basis, spectrogram) 69 | 70 | 71 | def _build_mel_basis(): 72 | if hparams.fmax is not None: 73 | assert hparams.fmax <= hparams.sample_rate // 2 74 | return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.fft_size, 75 | fmin=hparams.fmin, fmax=hparams.fmax, 76 | n_mels=hparams.num_mels) 77 | 78 | 79 | def _amp_to_db(x): 80 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 81 | return 20 * np.log10(np.maximum(min_level, x)) 82 | 83 | 84 | def _db_to_amp(x): 85 | return np.power(10.0, x * 0.05) 86 | 87 | 88 | def _normalize(S): 89 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 90 | 91 | 92 | def _denormalize(S): 93 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 94 | -------------------------------------------------------------------------------- /compute_timestamp_ratio.py: -------------------------------------------------------------------------------- 1 | """Compute output/input timestamp ratio. 2 | 3 | usage: compute_timestamp_ratio.py [options] 4 | 5 | options: 6 | --hparams= Hyper parameters [default: ]. 7 | --preset= Path of preset parameters (json). 8 | -h, --help Show this help message and exit 9 | """ 10 | from docopt import docopt 11 | import sys 12 | import numpy as np 13 | from hparams import hparams, hparams_debug_string 14 | import train 15 | from train import TextDataSource, MelSpecDataSource 16 | from nnmnkwii.datasets import FileSourceDataset 17 | from tqdm import trange 18 | from deepvoice3_pytorch import frontend 19 | 20 | if __name__ == "__main__": 21 | args = docopt(__doc__) 22 | data_root = args[""] 23 | preset = args["--preset"] 24 | 25 | # Load preset if specified 26 | if preset is not None: 27 | with open(preset) as f: 28 | hparams.parse_json(f.read()) 29 | # Override hyper parameters 30 | hparams.parse(args["--hparams"]) 31 | assert hparams.name == "deepvoice3" 32 | 33 | train._frontend = getattr(frontend, hparams.frontend) 34 | 35 | # Code below 36 | X = FileSourceDataset(TextDataSource(data_root)) 37 | Mel = FileSourceDataset(MelSpecDataSource(data_root)) 38 | 39 | in_sizes = [] 40 | out_sizes = [] 41 | for i in trange(len(X)): 42 | x, m = X[i], Mel[i] 43 | if X.file_data_source.multi_speaker: 44 | x = x[0] 45 | in_sizes.append(x.shape[0]) 46 | out_sizes.append(m.shape[0]) 47 | 48 | in_sizes = np.array(in_sizes) 49 | out_sizes = np.array(out_sizes) 50 | 51 | input_timestamps = np.sum(in_sizes) 52 | output_timestamps = np.sum(out_sizes) / hparams.outputs_per_step / hparams.downsample_step 53 | 54 | print(input_timestamps, output_timestamps, output_timestamps / input_timestamps) 55 | sys.exit(0) 56 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from .version import __version__ 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from .modules import Embedding 9 | 10 | 11 | class MultiSpeakerTTSModel(nn.Module): 12 | """Attention seq2seq model + post processing network 13 | """ 14 | 15 | def __init__(self, seq2seq, postnet, 16 | mel_dim=80, linear_dim=513, 17 | n_speakers=1, speaker_embed_dim=16, padding_idx=None, 18 | trainable_positional_encodings=False, 19 | use_decoder_state_for_postnet_input=False, 20 | 
speaker_embedding_weight_std=0.01, 21 | freeze_embedding=False): 22 | super(MultiSpeakerTTSModel, self).__init__() 23 | self.seq2seq = seq2seq 24 | self.postnet = postnet # referred as "Converter" in DeepVoice3 25 | self.mel_dim = mel_dim 26 | self.linear_dim = linear_dim 27 | self.trainable_positional_encodings = trainable_positional_encodings 28 | self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input 29 | self.freeze_embedding = freeze_embedding 30 | 31 | # Speaker embedding 32 | if n_speakers > 1: 33 | self.embed_speakers = Embedding( 34 | n_speakers, speaker_embed_dim, padding_idx=None, 35 | std=speaker_embedding_weight_std) 36 | self.n_speakers = n_speakers 37 | self.speaker_embed_dim = speaker_embed_dim 38 | 39 | def make_generation_fast_(self): 40 | 41 | def remove_weight_norm(m): 42 | try: 43 | nn.utils.remove_weight_norm(m) 44 | except ValueError: # this module didn't have weight norm 45 | return 46 | self.apply(remove_weight_norm) 47 | 48 | def get_trainable_parameters(self): 49 | freezed_param_ids = set() 50 | 51 | encoder, decoder = self.seq2seq.encoder, self.seq2seq.decoder 52 | 53 | # Avoid updating the position encoding 54 | if not self.trainable_positional_encodings: 55 | pe_query_param_ids = set(map(id, decoder.embed_query_positions.parameters())) 56 | pe_keys_param_ids = set(map(id, decoder.embed_keys_positions.parameters())) 57 | freezed_param_ids |= (pe_query_param_ids | pe_keys_param_ids) 58 | # Avoid updating the text embedding 59 | if self.freeze_embedding: 60 | embed_param_ids = set(map(id, encoder.embed_tokens.parameters())) 61 | freezed_param_ids |= embed_param_ids 62 | 63 | return (p for p in self.parameters() if id(p) not in freezed_param_ids) 64 | 65 | def forward(self, text_sequences, mel_targets=None, speaker_ids=None, 66 | text_positions=None, frame_positions=None, input_lengths=None): 67 | B = text_sequences.size(0) 68 | 69 | if speaker_ids is not None: 70 | assert self.n_speakers > 1 71 | speaker_embed = self.embed_speakers(speaker_ids) 72 | else: 73 | speaker_embed = None 74 | 75 | # Apply seq2seq 76 | # (B, T//r, mel_dim*r) 77 | mel_outputs, alignments, done, decoder_states = self.seq2seq( 78 | text_sequences, mel_targets, speaker_embed, 79 | text_positions, frame_positions, input_lengths) 80 | 81 | # Reshape 82 | # (B, T, mel_dim) 83 | mel_outputs = mel_outputs.view(B, -1, self.mel_dim) 84 | 85 | # Prepare postnet inputs 86 | if self.use_decoder_state_for_postnet_input: 87 | postnet_inputs = decoder_states.view(B, mel_outputs.size(1), -1) 88 | else: 89 | postnet_inputs = mel_outputs 90 | 91 | # (B, T, linear_dim) 92 | # Convert coarse mel-spectrogram (or decoder hidden states) to 93 | # high resolution spectrogram 94 | linear_outputs = self.postnet(postnet_inputs, speaker_embed) 95 | assert linear_outputs.size(-1) == self.linear_dim 96 | 97 | return mel_outputs, linear_outputs, alignments, done 98 | 99 | 100 | class AttentionSeq2Seq(nn.Module): 101 | """Encoder + Decoder with attention 102 | """ 103 | 104 | def __init__(self, encoder, decoder): 105 | super(AttentionSeq2Seq, self).__init__() 106 | self.encoder = encoder 107 | self.decoder = decoder 108 | if isinstance(self.decoder.attention, nn.ModuleList): 109 | self.encoder.num_attention_layers = sum( 110 | [layer is not None for layer in decoder.attention]) 111 | 112 | def forward(self, text_sequences, mel_targets=None, speaker_embed=None, 113 | text_positions=None, frame_positions=None, input_lengths=None): 114 | # (B, T, text_embed_dim) 115 | encoder_outputs = self.encoder( 
116 | text_sequences, lengths=input_lengths, speaker_embed=speaker_embed) 117 | 118 | # Mel: (B, T//r, mel_dim*r) 119 | # Alignments: (N, B, T_target, T_input) 120 | # Done: (B, T//r, 1) 121 | mel_outputs, alignments, done, decoder_states = self.decoder( 122 | encoder_outputs, mel_targets, 123 | text_positions=text_positions, frame_positions=frame_positions, 124 | speaker_embed=speaker_embed, lengths=input_lengths) 125 | 126 | return mel_outputs, alignments, done, decoder_states 127 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 5 | 6 | 7 | def deepvoice3(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, 8 | downsample_step=1, 9 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 10 | dropout=(1 - 0.95), kernel_size=5, 11 | encoder_channels=128, 12 | decoder_channels=256, 13 | converter_channels=256, 14 | query_position_rate=1.0, 15 | key_position_rate=1.29, 16 | use_memory_mask=False, 17 | trainable_positional_encodings=False, 18 | force_monotonic_attention=True, 19 | use_decoder_state_for_postnet_input=True, 20 | max_positions=512, 21 | embedding_weight_std=0.1, 22 | speaker_embedding_weight_std=0.01, 23 | freeze_embedding=False, 24 | window_ahead=3, 25 | window_backward=1, 26 | key_projection=False, 27 | value_projection=False, 28 | ): 29 | """Build deepvoice3 30 | """ 31 | from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 32 | 33 | time_upsampling = max(downsample_step // r, 1) 34 | 35 | # Seq2seq 36 | h = encoder_channels # hidden dim (channels) 37 | k = kernel_size # kernel size 38 | encoder = Encoder( 39 | n_vocab, embed_dim, padding_idx=padding_idx, 40 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 41 | dropout=dropout, max_positions=max_positions, 42 | embedding_weight_std=embedding_weight_std, 43 | # (channels, kernel_size, dilation) 44 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 45 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 46 | (h, k, 1), (h, k, 3)], 47 | ) 48 | 49 | h = decoder_channels 50 | decoder = Decoder( 51 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 52 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 53 | dropout=dropout, max_positions=max_positions, 54 | preattention=[(h, k, 1), (h, k, 3)], 55 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 56 | (h, k, 1)], 57 | attention=[True, False, False, False, True], 58 | force_monotonic_attention=force_monotonic_attention, 59 | query_position_rate=query_position_rate, 60 | key_position_rate=key_position_rate, 61 | use_memory_mask=use_memory_mask, 62 | window_ahead=window_ahead, 63 | window_backward=window_backward, 64 | key_projection=key_projection, 65 | value_projection=value_projection, 66 | ) 67 | 68 | seq2seq = AttentionSeq2Seq(encoder, decoder) 69 | 70 | # Post net 71 | if use_decoder_state_for_postnet_input: 72 | in_dim = h // r 73 | else: 74 | in_dim = mel_dim 75 | h = converter_channels 76 | converter = Converter( 77 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 78 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 79 | time_upsampling=time_upsampling, 80 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], 81 | ) 82 | 83 | # Seq2seq + post net 84 | model = MultiSpeakerTTSModel( 85 | seq2seq, converter, 
padding_idx=padding_idx, 86 | mel_dim=mel_dim, linear_dim=linear_dim, 87 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 88 | trainable_positional_encodings=trainable_positional_encodings, 89 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 90 | speaker_embedding_weight_std=speaker_embedding_weight_std, 91 | freeze_embedding=freeze_embedding) 92 | 93 | return model 94 | 95 | 96 | def nyanko(n_vocab, embed_dim=128, mel_dim=80, linear_dim=513, r=1, 97 | downsample_step=4, 98 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 99 | dropout=(1 - 0.95), kernel_size=3, 100 | encoder_channels=256, 101 | decoder_channels=256, 102 | converter_channels=512, 103 | query_position_rate=1.0, 104 | key_position_rate=1.29, 105 | use_memory_mask=False, 106 | trainable_positional_encodings=False, 107 | force_monotonic_attention=True, 108 | use_decoder_state_for_postnet_input=False, 109 | max_positions=512, embedding_weight_std=0.01, 110 | speaker_embedding_weight_std=0.01, 111 | freeze_embedding=False, 112 | window_ahead=3, 113 | window_backward=1, 114 | key_projection=False, 115 | value_projection=False, 116 | ): 117 | from deepvoice3_pytorch.nyanko import Encoder, Decoder, Converter 118 | assert encoder_channels == decoder_channels 119 | 120 | if n_speakers != 1: 121 | raise ValueError("Multi-speaker is not supported") 122 | if not (downsample_step == 4 and r == 1): 123 | raise ValueError("Not supported. You need to change hardcoded parameters") 124 | 125 | # Seq2seq 126 | encoder = Encoder( 127 | n_vocab, embed_dim, channels=encoder_channels, kernel_size=kernel_size, 128 | padding_idx=padding_idx, 129 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 130 | dropout=dropout, embedding_weight_std=embedding_weight_std, 131 | ) 132 | 133 | decoder = Decoder( 134 | embed_dim, in_dim=mel_dim, r=r, channels=decoder_channels, 135 | kernel_size=kernel_size, padding_idx=padding_idx, 136 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 137 | dropout=dropout, max_positions=max_positions, 138 | force_monotonic_attention=force_monotonic_attention, 139 | query_position_rate=query_position_rate, 140 | key_position_rate=key_position_rate, 141 | use_memory_mask=use_memory_mask, 142 | window_ahead=window_ahead, 143 | window_backward=window_backward, 144 | key_projection=key_projection, 145 | value_projection=value_projection, 146 | ) 147 | 148 | seq2seq = AttentionSeq2Seq(encoder, decoder) 149 | 150 | if use_decoder_state_for_postnet_input: 151 | in_dim = decoder_channels // r 152 | else: 153 | in_dim = mel_dim 154 | 155 | converter = Converter( 156 | in_dim=in_dim, out_dim=linear_dim, channels=converter_channels, 157 | kernel_size=kernel_size, dropout=dropout) 158 | 159 | # Seq2seq + post net 160 | model = MultiSpeakerTTSModel( 161 | seq2seq, converter, padding_idx=padding_idx, 162 | mel_dim=mel_dim, linear_dim=linear_dim, 163 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 164 | trainable_positional_encodings=trainable_positional_encodings, 165 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 166 | speaker_embedding_weight_std=speaker_embedding_weight_std, 167 | freeze_embedding=freeze_embedding) 168 | 169 | return model 170 | 171 | 172 | def deepvoice3_multispeaker(n_vocab, embed_dim=256, mel_dim=80, linear_dim=513, r=4, 173 | downsample_step=1, 174 | n_speakers=1, speaker_embed_dim=16, padding_idx=0, 175 | dropout=(1 - 0.95), kernel_size=5, 176 | encoder_channels=128, 177 | decoder_channels=256, 178 | 
converter_channels=256, 179 | query_position_rate=1.0, 180 | key_position_rate=1.29, 181 | use_memory_mask=False, 182 | trainable_positional_encodings=False, 183 | force_monotonic_attention=True, 184 | use_decoder_state_for_postnet_input=True, 185 | max_positions=512, 186 | embedding_weight_std=0.1, 187 | speaker_embedding_weight_std=0.01, 188 | freeze_embedding=False, 189 | window_ahead=3, 190 | window_backward=1, 191 | key_projection=True, 192 | value_projection=True, 193 | ): 194 | """Build multi-speaker deepvoice3 195 | """ 196 | from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 197 | 198 | time_upsampling = max(downsample_step // r, 1) 199 | 200 | # Seq2seq 201 | h = encoder_channels # hidden dim (channels) 202 | k = kernel_size # kernel size 203 | encoder = Encoder( 204 | n_vocab, embed_dim, padding_idx=padding_idx, 205 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 206 | dropout=dropout, max_positions=max_positions, 207 | embedding_weight_std=embedding_weight_std, 208 | # (channels, kernel_size, dilation) 209 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 210 | (h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 211 | (h, k, 1), (h, k, 3)], 212 | ) 213 | 214 | h = decoder_channels 215 | decoder = Decoder( 216 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 217 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 218 | dropout=dropout, max_positions=max_positions, 219 | preattention=[(h, k, 1)], 220 | convolutions=[(h, k, 1), (h, k, 3), (h, k, 9), (h, k, 27), 221 | (h, k, 1)], 222 | attention=[True, False, False, False, False], 223 | force_monotonic_attention=force_monotonic_attention, 224 | query_position_rate=query_position_rate, 225 | key_position_rate=key_position_rate, 226 | use_memory_mask=use_memory_mask, 227 | window_ahead=window_ahead, 228 | window_backward=window_backward, 229 | key_projection=key_projection, 230 | value_projection=value_projection, 231 | ) 232 | 233 | seq2seq = AttentionSeq2Seq(encoder, decoder) 234 | 235 | # Post net 236 | if use_decoder_state_for_postnet_input: 237 | in_dim = h // r 238 | else: 239 | in_dim = mel_dim 240 | h = converter_channels 241 | converter = Converter( 242 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 243 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 244 | time_upsampling=time_upsampling, 245 | convolutions=[(h, k, 1), (h, k, 3), (2 * h, k, 1), (2 * h, k, 3)], 246 | ) 247 | 248 | # Seq2seq + post net 249 | model = MultiSpeakerTTSModel( 250 | seq2seq, converter, padding_idx=padding_idx, 251 | mel_dim=mel_dim, linear_dim=linear_dim, 252 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 253 | trainable_positional_encodings=trainable_positional_encodings, 254 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 255 | speaker_embedding_weight_std=speaker_embedding_weight_std, 256 | freeze_embedding=freeze_embedding) 257 | 258 | return model 259 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | class Conv1d(nn.Conv1d): 8 | """Extended nn.Conv1d for incremental dilated convolutions 9 | """ 10 | 11 | def __init__(self, *args, **kwargs): 12 | super().__init__(*args, **kwargs) 13 | self.clear_buffer() 14 | self._linearized_weight = None 15 | 
self.register_backward_hook(self._clear_linearized_weight) 16 | 17 | def incremental_forward(self, input): 18 | # input: (B, T, C) 19 | if self.training: 20 | raise RuntimeError('incremental_forward only supports eval mode') 21 | 22 | # run forward pre hooks (e.g., weight norm) 23 | for hook in self._forward_pre_hooks.values(): 24 | hook(self, input) 25 | 26 | # reshape weight 27 | weight = self._get_linearized_weight() 28 | kw = self.kernel_size[0] 29 | dilation = self.dilation[0] 30 | 31 | bsz = input.size(0) # input: bsz x len x dim 32 | if kw > 1: 33 | input = input.data 34 | if self.input_buffer is None: 35 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2)) 36 | self.input_buffer.zero_() 37 | else: 38 | # shift buffer 39 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 40 | # append next input 41 | self.input_buffer[:, -1, :] = input[:, -1, :] 42 | input = self.input_buffer 43 | if dilation > 1: 44 | input = input[:, 0::dilation, :].contiguous() 45 | output = F.linear(input.view(bsz, -1), weight, self.bias) 46 | return output.view(bsz, 1, -1) 47 | 48 | def clear_buffer(self): 49 | self.input_buffer = None 50 | 51 | def _get_linearized_weight(self): 52 | if self._linearized_weight is None: 53 | kw = self.kernel_size[0] 54 | # nn.Conv1d 55 | if self.weight.size() == (self.out_channels, self.in_channels, kw): 56 | weight = self.weight.transpose(1, 2).contiguous() 57 | else: 58 | # fairseq.modules.conv_tbc.ConvTBC 59 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 60 | assert weight.size() == (self.out_channels, kw, self.in_channels) 61 | self._linearized_weight = weight.view(self.out_channels, -1) 62 | return self._linearized_weight 63 | 64 | def _clear_linearized_weight(self, *args): 65 | self._linearized_weight = None 66 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """Text processing frontend 4 | 5 | All frontend module should have the following functions: 6 | 7 | - text_to_sequence(text, p) 8 | - sequence_to_text(sequence) 9 | 10 | and the property: 11 | 12 | - n_vocab 13 | 14 | """ 15 | from deepvoice3_pytorch.frontend import en 16 | 17 | # optinoal Japanese frontend 18 | try: 19 | from deepvoice3_pytorch.frontend import jp 20 | except ImportError: 21 | jp = None 22 | 23 | try: 24 | from deepvoice3_pytorch.frontend import ko 25 | except ImportError: 26 | ko = None 27 | 28 | # if you are going to use the frontend, you need to modify _characters in symbol.py: 29 | # _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? 
' + '¡¿ñáéíóúÁÉÍÓÚÑ' 30 | try: 31 | from deepvoice3_pytorch.frontend import es 32 | except ImportError: 33 | es = None 34 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/en/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from deepvoice3_pytorch.frontend.text.symbols import symbols 3 | 4 | import nltk 5 | from random import random 6 | 7 | n_vocab = len(symbols) 8 | 9 | _arpabet = nltk.corpus.cmudict.dict() 10 | 11 | 12 | def _maybe_get_arpabet(word, p): 13 | try: 14 | phonemes = _arpabet[word][0] 15 | phonemes = " ".join(phonemes) 16 | except KeyError: 17 | return word 18 | 19 | return '{%s}' % phonemes if random() < p else word 20 | 21 | 22 | def mix_pronunciation(text, p): 23 | text = ' '.join(_maybe_get_arpabet(word, p) for word in text.split(' ')) 24 | return text 25 | 26 | 27 | def text_to_sequence(text, p=0.0): 28 | if p >= 0: 29 | text = mix_pronunciation(text, p) 30 | from deepvoice3_pytorch.frontend.text import text_to_sequence 31 | text = text_to_sequence(text, ["english_cleaners"]) 32 | return text 33 | 34 | 35 | from deepvoice3_pytorch.frontend.text import sequence_to_text 36 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/es/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from deepvoice3_pytorch.frontend.text.symbols import symbols 3 | 4 | import nltk 5 | from random import random 6 | 7 | n_vocab = len(symbols) 8 | 9 | 10 | def text_to_sequence(text, p=0.0): 11 | from deepvoice3_pytorch.frontend.text import text_to_sequence 12 | text = text_to_sequence(text, ["basic_cleaners"]) 13 | return text 14 | 15 | 16 | from deepvoice3_pytorch.frontend.text import sequence_to_text 17 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/jp/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | import MeCab 5 | import jaconv 6 | from random import random 7 | 8 | n_vocab = 0xffff 9 | 10 | _eos = 1 11 | _pad = 0 12 | _tagger = None 13 | 14 | 15 | def _yomi(mecab_result): 16 | tokens = [] 17 | yomis = [] 18 | for line in mecab_result.split("\n")[:-1]: 19 | s = line.split("\t") 20 | if len(s) == 1: 21 | break 22 | token, rest = s 23 | rest = rest.split(",") 24 | tokens.append(token) 25 | yomi = rest[7] if len(rest) > 7 else None 26 | yomi = None if yomi == "*" else yomi 27 | yomis.append(yomi) 28 | 29 | return tokens, yomis 30 | 31 | 32 | def _mix_pronunciation(tokens, yomis, p): 33 | return "".join( 34 | yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx] 35 | for idx in range(len(tokens))) 36 | 37 | 38 | def mix_pronunciation(text, p): 39 | global _tagger 40 | if _tagger is None: 41 | _tagger = MeCab.Tagger("") 42 | tokens, yomis = _yomi(_tagger.parse(text)) 43 | return _mix_pronunciation(tokens, yomis, p) 44 | 45 | 46 | def add_punctuation(text): 47 | last = text[-1] 48 | if last not in [".", ",", "、", "。", "!", "?", "!", "?"]: 49 | text = text + "。" 50 | return text 51 | 52 | 53 | def normalize_delimitor(text): 54 | text = text.replace(",", "、") 55 | text = text.replace(".", "。") 56 | text = text.replace(",", "、") 57 | text = text.replace(".", "。") 58 | return text 59 | 60 | 61 | def text_to_sequence(text, p=0.0): 62 | for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", 63 | 
"(", ")", "(", ")"]: 64 | text = text.replace(c, "") 65 | text = text.replace("!", "!") 66 | text = text.replace("?", "?") 67 | 68 | text = normalize_delimitor(text) 69 | text = jaconv.normalize(text) 70 | if p > 0: 71 | text = mix_pronunciation(text, p) 72 | text = jaconv.hira2kata(text) 73 | text = add_punctuation(text) 74 | 75 | return [ord(c) for c in text] + [_eos] # EOS 76 | 77 | 78 | def sequence_to_text(seq): 79 | return "".join(chr(n) for n in seq) 80 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/ko/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | from random import random 5 | 6 | n_vocab = 0xffff 7 | 8 | _eos = 1 9 | _pad = 0 10 | _tagger = None 11 | 12 | 13 | def text_to_sequence(text, p=0.0): 14 | return [ord(c) for c in text] + [_eos] # EOS 15 | 16 | def sequence_to_text(seq): 17 | return "".join(chr(n) for n in seq) 18 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from deepvoice3_pytorch.frontend.text import cleaners 3 | from deepvoice3_pytorch.frontend.text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, ' ', text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def add_punctuation(text): 67 | if len(text) == 0: 68 | return text 69 | if text[-1] not in '!,.:;?': 70 | text = text + '.' # without this decoder is confused when to output EOS 71 | return text 72 | 73 | 74 | def basic_cleaners(text): 75 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def transliteration_cleaners(text): 82 | '''Pipeline for non-English text that transliterates to ASCII.''' 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = collapse_whitespace(text) 86 | return text 87 | 88 | 89 | def english_cleaners(text): 90 | '''Pipeline for English text, including number and abbreviation expansion.''' 91 | text = convert_to_ascii(text) 92 | text = add_punctuation(text) 93 | text = lowercase(text) 94 | text = expand_numbers(text) 95 | text = expand_abbreviations(text) 96 | text = collapse_whitespace(text) 97 | return text 98 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
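Entries map upper-cased words to one or more ARPAbet pronunciations; with a standard
dictionary file, lookup('hello') would give something like ['HH AH0 L OW1'] (illustrative).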
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | _alt_re = re.compile(r'\([0-9]+\)') 39 | 40 | 41 | def _parse_cmudict(file): 42 | cmudict = {} 43 | for line in file: 44 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 45 | parts = line.split(' ') 46 | word = re.sub(_alt_re, '', parts[0]) 47 | pronunciation = _get_pronunciation(parts[1]) 48 | if pronunciation: 49 | if word in cmudict: 50 | cmudict[word].append(pronunciation) 51 | else: 52 | cmudict[word] = [pronunciation] 53 | return cmudict 54 | 55 | 56 | def _get_pronunciation(s): 57 | parts = s.strip().split(' ') 58 | for part in parts: 59 | if part not in _valid_symbol_set: 60 | return None 61 | return ' '.join(parts) 62 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return 'two thousand' 54 | elif num > 2000 and num < 2010: 55 | return 'two thousand ' + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + ' hundred' 58 | else: 59 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 60 | else: 61 | return _inflect.number_to_words(num, andword='') 62 | 63 | 64 | def normalize_numbers(text): 65 | text = re.sub(_comma_number_re, 
_remove_commas, text) 66 | text = re.sub(_pounds_re, r'\1 pounds', text) 67 | text = re.sub(_dollars_re, _expand_dollars, text) 68 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 69 | text = re.sub(_ordinal_re, _expand_ordinal, text) 70 | text = re.sub(_number_re, _expand_number, text) 71 | return text 72 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/frontend/text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from .cmudict import valid_symbols 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import torch 4 | from torch import nn 5 | import math 6 | import numpy as np 7 | from torch.nn import functional as F 8 | 9 | 10 | def position_encoding_init(n_position, d_pos_vec, position_rate=1.0, 11 | sinusoidal=True): 12 | ''' Init the sinusoid position encoding table ''' 13 | 14 | # keep dim 0 for padding token position encoding zero vector 15 | position_enc = np.array([ 16 | [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) for i in range(d_pos_vec)] 17 | if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) 18 | 19 | position_enc = torch.from_numpy(position_enc).float() 20 | if sinusoidal: 21 | position_enc[1:, 0::2] = torch.sin(position_enc[1:, 0::2]) # dim 2i 22 | position_enc[1:, 1::2] = torch.cos(position_enc[1:, 1::2]) # dim 2i+1 23 | 24 | return position_enc 25 | 26 | 27 | def sinusoidal_encode(x, w): 28 | y = w * x 29 | y[1:, 0::2] = torch.sin(y[1:, 0::2].clone()) 30 | y[1:, 1::2] = torch.cos(y[1:, 1::2].clone()) 31 | return y 32 | 33 | 34 | class SinusoidalEncoding(nn.Embedding): 35 | 36 | def __init__(self, num_embeddings, embedding_dim, 37 | *args, **kwargs): 38 | super(SinusoidalEncoding, self).__init__(num_embeddings, embedding_dim, 39 | padding_idx=0, 40 | *args, **kwargs) 41 | self.weight.data = position_encoding_init(num_embeddings, embedding_dim, 42 | position_rate=1.0, 43 | sinusoidal=False) 44 | 45 | def forward(self, x, w=1.0): 46 | isscaler = np.isscalar(w) 47 | assert self.padding_idx is not None 48 | 49 | if isscaler or w.size(0) == 1: 50 | weight = sinusoidal_encode(self.weight, w) 51 | return F.embedding( 52 | x, weight, self.padding_idx, self.max_norm, 53 | self.norm_type, self.scale_grad_by_freq, self.sparse) 54 | else: 55 | # TODO: cannot simply apply for batch 56 | # better to implement efficient function 57 | pe = [] 58 | for batch_idx, we in enumerate(w): 59 | weight = sinusoidal_encode(self.weight, we) 60 | pe.append(F.embedding( 61 | x[batch_idx], weight, self.padding_idx, self.max_norm, 62 | self.norm_type, self.scale_grad_by_freq, self.sparse)) 63 | pe = torch.stack(pe) 64 | return pe 65 | 66 | 67 
| class GradMultiply(torch.autograd.Function): 68 | @staticmethod 69 | def forward(ctx, x, scale): 70 | ctx.scale = scale 71 | res = x.new(x) 72 | ctx.mark_shared_storage((x, res)) 73 | return res 74 | 75 | @staticmethod 76 | def backward(ctx, grad): 77 | return grad * ctx.scale, None 78 | 79 | 80 | def Linear(in_features, out_features, dropout=0): 81 | """Weight-normalized Linear layer (input: N x T x C)""" 82 | m = nn.Linear(in_features, out_features) 83 | m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) 84 | m.bias.data.zero_() 85 | return nn.utils.weight_norm(m) 86 | 87 | 88 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 89 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 90 | m.weight.data.normal_(0, std) 91 | return m 92 | 93 | 94 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 95 | from .conv import Conv1d 96 | m = Conv1d(in_channels, out_channels, kernel_size, **kwargs) 97 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 98 | m.weight.data.normal_(mean=0, std=std) 99 | m.bias.data.zero_() 100 | return nn.utils.weight_norm(m) 101 | 102 | 103 | def ConvTranspose1d(in_channels, out_channels, kernel_size, dropout=0, 104 | std_mul=1.0, **kwargs): 105 | m = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, **kwargs) 106 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 107 | m.weight.data.normal_(mean=0, std=std) 108 | m.bias.data.zero_() 109 | return nn.utils.weight_norm(m) 110 | 111 | 112 | class Conv1dGLU(nn.Module): 113 | """(Dilated) Conv1d + Gated linear unit + (optionally) speaker embedding 114 | """ 115 | 116 | def __init__(self, n_speakers, speaker_embed_dim, 117 | in_channels, out_channels, kernel_size, 118 | dropout, padding=None, dilation=1, causal=False, residual=False, 119 | *args, **kwargs): 120 | super(Conv1dGLU, self).__init__() 121 | self.dropout = dropout 122 | self.residual = residual 123 | if padding is None: 124 | # no future time stamps available 125 | if causal: 126 | padding = (kernel_size - 1) * dilation 127 | else: 128 | padding = (kernel_size - 1) // 2 * dilation 129 | self.causal = causal 130 | 131 | self.conv = Conv1d(in_channels, 2 * out_channels, kernel_size, 132 | dropout=dropout, padding=padding, dilation=dilation, 133 | *args, **kwargs) 134 | if n_speakers > 1: 135 | self.speaker_proj = Linear(speaker_embed_dim, out_channels) 136 | else: 137 | self.speaker_proj = None 138 | 139 | def forward(self, x, speaker_embed=None): 140 | return self._forward(x, speaker_embed, False) 141 | 142 | def incremental_forward(self, x, speaker_embed=None): 143 | return self._forward(x, speaker_embed, True) 144 | 145 | def _forward(self, x, speaker_embed, is_incremental): 146 | residual = x 147 | x = F.dropout(x, p=self.dropout, training=self.training) 148 | if is_incremental: 149 | splitdim = -1 150 | x = self.conv.incremental_forward(x) 151 | else: 152 | splitdim = 1 153 | x = self.conv(x) 154 | # remove future time steps 155 | x = x[:, :, :residual.size(-1)] if self.causal else x 156 | 157 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 158 | if self.speaker_proj is not None: 159 | softsign = F.softsign(self.speaker_proj(speaker_embed)) 160 | # Since conv layer assumes BCT, we need to transpose 161 | softsign = softsign if is_incremental else softsign.transpose(1, 2) 162 | a = a + softsign 163 | x = a * torch.sigmoid(b) 164 | return (x + residual) * math.sqrt(0.5) if 
self.residual else x 165 | 166 | def clear_buffer(self): 167 | self.conv.clear_buffer() 168 | 169 | 170 | class HighwayConv1d(nn.Module): 171 | """Weight-normalized Conv1d + Highway network (supports incremental forward) 172 | """ 173 | 174 | def __init__(self, in_channels, out_channels, kernel_size=1, padding=None, 175 | dilation=1, causal=False, dropout=0, std_mul=None, glu=False): 176 | super(HighwayConv1d, self).__init__() 177 | if std_mul is None: 178 | std_mul = 4.0 if glu else 1.0 179 | if padding is None: 180 | # no future time stamps available 181 | if causal: 182 | padding = (kernel_size - 1) * dilation 183 | else: 184 | padding = (kernel_size - 1) // 2 * dilation 185 | self.causal = causal 186 | self.dropout = dropout 187 | self.glu = glu 188 | 189 | self.conv = Conv1d(in_channels, 2 * out_channels, 190 | kernel_size=kernel_size, padding=padding, 191 | dilation=dilation, dropout=dropout, 192 | std_mul=std_mul) 193 | 194 | def forward(self, x): 195 | return self._forward(x, False) 196 | 197 | def incremental_forward(self, x): 198 | return self._forward(x, True) 199 | 200 | def _forward(self, x, is_incremental): 201 | """Forward 202 | 203 | Args: 204 | x: (B, in_channels, T) 205 | returns: 206 | (B, out_channels, T) 207 | """ 208 | 209 | residual = x 210 | x = F.dropout(x, p=self.dropout, training=self.training) 211 | if is_incremental: 212 | splitdim = -1 213 | x = self.conv.incremental_forward(x) 214 | else: 215 | splitdim = 1 216 | x = self.conv(x) 217 | # remove future time steps 218 | x = x[:, :, :residual.size(-1)] if self.causal else x 219 | 220 | if self.glu: 221 | x = F.glu(x, dim=splitdim) 222 | return (x + residual) * math.sqrt(0.5) 223 | else: 224 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 225 | T = torch.sigmoid(b) 226 | return (T * a + (1 - T) * residual) 227 | 228 | def clear_buffer(self): 229 | self.conv.clear_buffer() 230 | 231 | 232 | def get_mask_from_lengths(memory, memory_lengths): 233 | """Get mask tensor from a list of lengths 234 | Args: 235 | memory: (batch, max_time, dim) 236 | memory_lengths: array like 237 | """ 238 | max_len = max(memory_lengths) 239 | mask = torch.arange(max_len).expand(memory.size(0), max_len) < torch.tensor(memory_lengths).unsqueeze(-1) 240 | mask = mask.to(memory.device) 241 | return ~mask 242 | -------------------------------------------------------------------------------- /deepvoice3_pytorch/tfcompat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/deepvoice3_pytorch/tfcompat/__init__.py -------------------------------------------------------------------------------- /deepvoice3_pytorch/tfcompat/readme.md: -------------------------------------------------------------------------------- 1 | Source: hparam.py copied from tensorflow v1.12.0. 2 | 3 | https://github.com/tensorflow/tensorflow/blob/v1.12.0/tensorflow/contrib/training/python/training/hparam.py 4 | 5 | downloaded with the following command: 6 | wget https://github.com/tensorflow/tensorflow/raw/v1.12.0/tensorflow/contrib/training/python/training/hparam.py 7 | 8 | Once all other tensorflow dependencies of this file are removed, the class keeps its original purpose. Functions that are no longer available because of this removal are not used in this project.
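
The vendored hparam.py exists so the project can keep the familiar tf.contrib.training.HParams interface without a TensorFlow dependency. A minimal usage sketch follows; the hyperparameter names below are hypothetical placeholders, not the ones actually defined in hparams.py at the repository root.

# Minimal HParams sketch (hypothetical hyperparameter names).
from deepvoice3_pytorch.tfcompat.hparam import HParams

hparams = HParams(batch_size=16, outputs_per_step=4, builder="deepvoice3")

# Overrides use the same "name=value,name=value" syntax as
# tf.contrib.training.HParams.parse(), e.g. coming from a --hparams CLI flag.
hparams.parse("batch_size=32,outputs_per_step=1")

assert hparams.batch_size == 32   # attribute-style access, values cast to declared types
print(hparams.values())           # dict of all current hyperparameter values
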
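Looking back at modules.py above, Conv1dGLU and SinusoidalEncoding are the basic building blocks of the convolutional encoder/decoder. A small smoke-test sketch, assuming the package is installed and using the (B, C, T) layout from the docstrings; shapes and hyperparameter values here are arbitrary illustrations, not the values used in training.

# Smoke test for the building blocks defined in modules.py above.
import torch
from deepvoice3_pytorch.modules import Conv1dGLU, SinusoidalEncoding

B, C, T = 2, 128, 16
x = torch.randn(B, C, T)  # (batch, channels, time), as in the docstrings

# Causal gated conv block; the residual connection requires in_channels == out_channels.
conv = Conv1dGLU(n_speakers=1, speaker_embed_dim=16,
                 in_channels=C, out_channels=C, kernel_size=3,
                 dropout=0.1, causal=True, residual=True)
y = conv(x)
assert y.shape == (B, C, T)

# Position embeddings: index 0 is reserved for padding (an all-zero vector),
# so positions start at 1.
pe = SinusoidalEncoding(num_embeddings=T + 1, embedding_dim=C)
positions = torch.arange(1, T + 1).unsqueeze(0)  # (1, T)
p = pe(positions)                                # (1, T, C)
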
9 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | public -------------------------------------------------------------------------------- /docs/config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "https://r9y9.github.io/deepvoice3_pytorch/" 2 | languageCode = "ja-jp" 3 | title = "An open source implementation of Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning" 4 | author = "Ryuichi YAMAMOTO" 5 | 6 | [params] 7 | author = "Ryuichi YAMAMOTO" 8 | project = "deepvoice3_pytorch" 9 | logo = "/images/512logotipo.png" 10 | twitter = "r9y9" 11 | github = "r9y9" 12 | analytics = "UA-44433856-1" 13 | -------------------------------------------------------------------------------- /docs/layouts/_default/list.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 |
4 |

{{ .Title }}

5 | {{ range .Data.Pages }} 6 | 10 | {{ end }} 11 |
12 | 13 | {{ partial "footer.html" . }} -------------------------------------------------------------------------------- /docs/layouts/_default/single.html: -------------------------------------------------------------------------------- 1 | {{ partial "header.html" . }} 2 | 3 |
4 |
5 |

{{ .Title }}

6 | 7 |
8 | {{ .Content }} 9 | {{ partial "social.html" . }} 10 |
11 |
12 |
13 | 14 | {{ partial "footer.html" . }} 15 | -------------------------------------------------------------------------------- /docs/layouts/index.html: -------------------------------------------------------------------------------- 1 | {{ template "partials/header.html" . }} 2 | {{ range .Data.Pages }} 3 | {{if eq .Type "index" }} 4 | {{.Content}} 5 | {{end}} 6 | {{ end }} 7 | {{ template "partials/footer.html" . }} 8 | -------------------------------------------------------------------------------- /docs/layouts/partials/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 20 | {{ with .Site.Params.analytics }}{{ end }} 28 | 29 | 30 | 31 | 32 | {{ partial "mathjax.html" . }} 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/layouts/partials/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ .Hugo.Generator }} 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | {{ $isHomePage := eq .Title .Site.Title }}{{ .Title }}{{ if eq $isHomePage false }} - {{ .Site.Title }}{{ end }} 15 | 16 | 17 | 18 |
19 | 20 |
21 | 24 | {{ if eq $isHomePage true }}

{{ .Site.Title }}

{{ end }} 25 |
26 | -------------------------------------------------------------------------------- /docs/layouts/partials/mathjax.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | 23 | 30 | 31 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/layouts/partials/social.html: -------------------------------------------------------------------------------- 1 | {{ if isset .Site.Params "twitter" }} 2 | 8 | {{ end }} 9 | -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/0_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/1_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/2_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000.wav -------------------------------------------------------------------------------- 
/docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/3_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/4_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3/3_keithito/5_checkpoint_step000210000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav 
-------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/1_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/2_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/3_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/4_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker61_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62.wav -------------------------------------------------------------------------------- 
/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/3_keithito/5_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker62_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker0.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker1.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker10.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker10.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker11.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker11.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker2.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker3.wav -------------------------------------------------------------------------------- 
/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker4.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker5.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker6.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker7.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker7.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker8.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker8.wav -------------------------------------------------------------------------------- /docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker9.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/deepvoice3_multispeaker/loop/0_20171222_deepvoice3_vctk108_checkpoint_step000300000_speaker9.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000_alignment.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/0_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/1_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/2_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/3_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000_alignment.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/4_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000.wav -------------------------------------------------------------------------------- /docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000_alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/audio/nyanko/3_keithito/5_20171129_nyanko_checkpoint_step000585000_alignment.png -------------------------------------------------------------------------------- /docs/static/css/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: "Roboto", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | background-color: #FCFCFC; 4 | -webkit-font-smoothing: antialiased; 5 | font-size: 1.8em; 6 | line-height: 1.5; 7 | font-weight: 300; 8 | } 9 | 10 | h1, h2, h3, h4, h5, h6 { 11 | color: #263c4c; 12 | } 13 | h2, h3, h4, h5, h6 { 14 | margin-top: 5rem; 15 | margin-bottom: 3rem; 16 | font-weight: bold; 17 | padding-bottom: 10px; 18 | } 19 | 20 | h1 { font-size: 3.0rem; } 21 | h2 { 22 | margin-top: 6rem; 23 | font-size: 2.6rem; 24 | } 25 | h3 { font-size: 2.1rem; } 26 | h4, 27 | h5, 28 | h6 { font-size: 1.9rem; } 29 | 30 | h2.entry-title { 31 | font-size: 2.1rem; 32 | margin-top: 0; 33 | font-weight: 400; 34 | border-bottom: none; 35 | } 36 | 37 | li { 38 | margin-bottom: 0.5rem; 39 | margin-left: 0.7em; 40 | } 41 | 42 | img { 43 | max-width: 100%; 44 | height: auto; 45 | vertical-align: middle; 46 | border: 0; 47 | margin: 1em 0; 48 | } 49 | 50 | header, 51 | footer { 52 | margin: 4rem 0; 53 | text-align: center; 54 | } 55 | 56 | main { 57 | margin: 4rem 0; 58 | } 59 | 60 | .container { 61 | width: 90%; 62 | max-width: 700px; 63 | } 64 | 65 | .site-title { 66 | margin-top: 2rem; 67 | } 68 | 69 | .entry-title { 70 | margin-bottom: 0; 71 | } 72 | 73 | .entry-title a { 74 | text-decoration: none; 75 | } 76 | 77 | .entry-meta { 78 | display: inline-block; 79 | margin-bottom: 2rem; 80 | font-size: 1.6rem; 81 | color: #888; 82 | } 83 | 84 | .footer-link { 85 | margin: 2rem 0; 86 | } 87 | 88 | .hr { 89 | height: 1px; 90 | margin: 2rem 0; 91 | background: #E1E1E1; 92 | background: -webkit-gradient(linear, left top, right top, from(white), color-stop(#E1E1E1), to(white)); 93 | background: -webkit-linear-gradient(left, white, #E1E1E1, white); 94 | background: linear-gradient(to right, white, #E1E1E1, white); 95 | } 96 | 97 | article .social { 98 | height: 40px; 99 | padding: 10px 0; 100 | } 101 | 102 | address { 103 | margin: 0; 104 | font-size:0.9em; 105 | max-height: 60px; 106 | font-weight: 300; 107 | font-style: normal; 108 | display: block; 109 | } 110 | 111 | address a { 112 | text-decoration: none; 113 | } 114 | 115 | .avatar-bottom img { 116 | border-radius: 50%; 117 | 
border: 1px solid #E1E1E1; 118 | float: left; 119 | max-width: 100%; 120 | vertical-align: middle; 121 | width: 32px; 122 | height: 32px; 123 | margin: 0 20px 0 0; 124 | margin-top: -7px; 125 | } 126 | 127 | .avatar-bottom img:hover { 128 | border-color: #F1F1F1; 129 | } 130 | 131 | .copyright { 132 | font-size:0.9em; 133 | font-weight: 300; 134 | } 135 | 136 | .github { 137 | float: right; 138 | } 139 | 140 | blockquote { 141 | position: relative; 142 | padding: 10px 10px 10px 32px; 143 | box-sizing: border-box; 144 | font-style: italic; 145 | color: #464646; 146 | background: #e0e0e0; 147 | } 148 | 149 | blockquote:before{ 150 | display: inline-block; 151 | position: absolute; 152 | top: 0; 153 | left: 0; 154 | vertical-align: middle; 155 | content: "\f10d"; 156 | font-family: FontAwesome; 157 | color: #e0e0e0; 158 | font-size: 22px; 159 | line-height: 1; 160 | z-index: 2; 161 | } 162 | 163 | blockquote:after{ 164 | position: absolute; 165 | content: ''; 166 | left: 0; 167 | top: 0; 168 | border-width: 0 0 40px 40px; 169 | border-style: solid; 170 | border-color: transparent #ffffff; 171 | } 172 | 173 | blockquote p { 174 | position: relative; 175 | padding: 0; 176 | margin: 10px 0; 177 | z-index: 3; 178 | line-height: 1.7; 179 | } 180 | 181 | blockquote cite { 182 | display: block; 183 | text-align: right; 184 | color: #888888; 185 | font-size: 0.9em; 186 | } 187 | -------------------------------------------------------------------------------- /docs/static/css/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v3.0.2 | MIT License | git.io/normalize */ 2 | 3 | /** 4 | * 1. Set default font family to sans-serif. 5 | * 2. Prevent iOS text size adjust after orientation change, without disabling 6 | * user zoom. 7 | */ 8 | 9 | html { 10 | font-family: sans-serif; /* 1 */ 11 | -ms-text-size-adjust: 100%; /* 2 */ 12 | -webkit-text-size-adjust: 100%; /* 2 */ 13 | } 14 | 15 | /** 16 | * Remove default margin. 17 | */ 18 | 19 | body { 20 | margin: 0; 21 | } 22 | 23 | /* HTML5 display definitions 24 | ========================================================================== */ 25 | 26 | /** 27 | * Correct `block` display not defined for any HTML5 element in IE 8/9. 28 | * Correct `block` display not defined for `details` or `summary` in IE 10/11 29 | * and Firefox. 30 | * Correct `block` display not defined for `main` in IE 11. 31 | */ 32 | 33 | article, 34 | aside, 35 | details, 36 | figcaption, 37 | figure, 38 | footer, 39 | header, 40 | hgroup, 41 | main, 42 | menu, 43 | nav, 44 | section, 45 | summary { 46 | display: block; 47 | } 48 | 49 | /** 50 | * 1. Correct `inline-block` display not defined in IE 8/9. 51 | * 2. Normalize vertical alignment of `progress` in Chrome, Firefox, and Opera. 52 | */ 53 | 54 | audio, 55 | canvas, 56 | progress, 57 | video { 58 | display: inline-block; /* 1 */ 59 | vertical-align: baseline; /* 2 */ 60 | } 61 | 62 | /** 63 | * Prevent modern browsers from displaying `audio` without controls. 64 | * Remove excess height in iOS 5 devices. 65 | */ 66 | 67 | audio:not([controls]) { 68 | display: none; 69 | height: 0; 70 | } 71 | 72 | /** 73 | * Address `[hidden]` styling not present in IE 8/9/10. 74 | * Hide the `template` element in IE 8/9/11, Safari, and Firefox < 22. 
75 | */ 76 | 77 | [hidden], 78 | template { 79 | display: none; 80 | } 81 | 82 | /* Links 83 | ========================================================================== */ 84 | 85 | /** 86 | * Remove the gray background color from active links in IE 10. 87 | */ 88 | 89 | a { 90 | background-color: transparent; 91 | } 92 | 93 | /** 94 | * Improve readability when focused and also mouse hovered in all browsers. 95 | */ 96 | 97 | a:active, 98 | a:hover { 99 | outline: 0; 100 | } 101 | 102 | /* Text-level semantics 103 | ========================================================================== */ 104 | 105 | /** 106 | * Address styling not present in IE 8/9/10/11, Safari, and Chrome. 107 | */ 108 | 109 | abbr[title] { 110 | border-bottom: 1px dotted; 111 | } 112 | 113 | /** 114 | * Address style set to `bolder` in Firefox 4+, Safari, and Chrome. 115 | */ 116 | 117 | b, 118 | strong { 119 | font-weight: bold; 120 | } 121 | 122 | /** 123 | * Address styling not present in Safari and Chrome. 124 | */ 125 | 126 | dfn { 127 | font-style: italic; 128 | } 129 | 130 | /** 131 | * Address variable `h1` font-size and margin within `section` and `article` 132 | * contexts in Firefox 4+, Safari, and Chrome. 133 | */ 134 | 135 | h1 { 136 | font-size: 2em; 137 | margin: 0.67em 0; 138 | } 139 | 140 | /** 141 | * Address styling not present in IE 8/9. 142 | */ 143 | 144 | mark { 145 | background: #ff0; 146 | color: #000; 147 | } 148 | 149 | /** 150 | * Address inconsistent and variable font size in all browsers. 151 | */ 152 | 153 | small { 154 | font-size: 80%; 155 | } 156 | 157 | /** 158 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 159 | */ 160 | 161 | sub, 162 | sup { 163 | font-size: 75%; 164 | line-height: 0; 165 | position: relative; 166 | vertical-align: baseline; 167 | } 168 | 169 | sup { 170 | top: -0.5em; 171 | } 172 | 173 | sub { 174 | bottom: -0.25em; 175 | } 176 | 177 | /* Embedded content 178 | ========================================================================== */ 179 | 180 | /** 181 | * Remove border when inside `a` element in IE 8/9/10. 182 | */ 183 | 184 | img { 185 | border: 0; 186 | } 187 | 188 | /** 189 | * Correct overflow not hidden in IE 9/10/11. 190 | */ 191 | 192 | svg:not(:root) { 193 | overflow: hidden; 194 | } 195 | 196 | /* Grouping content 197 | ========================================================================== */ 198 | 199 | /** 200 | * Address margin not present in IE 8/9 and Safari. 201 | */ 202 | 203 | figure { 204 | margin: 1em 40px; 205 | } 206 | 207 | /** 208 | * Address differences between Firefox and other browsers. 209 | */ 210 | 211 | hr { 212 | -moz-box-sizing: content-box; 213 | box-sizing: content-box; 214 | height: 0; 215 | } 216 | 217 | /** 218 | * Contain overflow in all browsers. 219 | */ 220 | 221 | pre { 222 | overflow: auto; 223 | } 224 | 225 | /** 226 | * Address odd `em`-unit font size rendering in all browsers. 227 | */ 228 | 229 | code, 230 | kbd, 231 | pre, 232 | samp { 233 | font-family: monospace, monospace; 234 | font-size: 1em; 235 | } 236 | 237 | /* Forms 238 | ========================================================================== */ 239 | 240 | /** 241 | * Known limitation: by default, Chrome and Safari on OS X allow very limited 242 | * styling of `select`, unless a `border` property is set. 243 | */ 244 | 245 | /** 246 | * 1. Correct color not being inherited. 247 | * Known issue: affects color of disabled elements. 248 | * 2. Correct font properties not being inherited. 249 | * 3. 
Address margins set differently in Firefox 4+, Safari, and Chrome. 250 | */ 251 | 252 | button, 253 | input, 254 | optgroup, 255 | select, 256 | textarea { 257 | color: inherit; /* 1 */ 258 | font: inherit; /* 2 */ 259 | margin: 0; /* 3 */ 260 | } 261 | 262 | /** 263 | * Address `overflow` set to `hidden` in IE 8/9/10/11. 264 | */ 265 | 266 | button { 267 | overflow: visible; 268 | } 269 | 270 | /** 271 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 272 | * All other form control elements do not inherit `text-transform` values. 273 | * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera. 274 | * Correct `select` style inheritance in Firefox. 275 | */ 276 | 277 | button, 278 | select { 279 | text-transform: none; 280 | } 281 | 282 | /** 283 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 284 | * and `video` controls. 285 | * 2. Correct inability to style clickable `input` types in iOS. 286 | * 3. Improve usability and consistency of cursor style between image-type 287 | * `input` and others. 288 | */ 289 | 290 | button, 291 | html input[type="button"], /* 1 */ 292 | input[type="reset"], 293 | input[type="submit"] { 294 | -webkit-appearance: button; /* 2 */ 295 | cursor: pointer; /* 3 */ 296 | } 297 | 298 | /** 299 | * Re-set default cursor for disabled elements. 300 | */ 301 | 302 | button[disabled], 303 | html input[disabled] { 304 | cursor: default; 305 | } 306 | 307 | /** 308 | * Remove inner padding and border in Firefox 4+. 309 | */ 310 | 311 | button::-moz-focus-inner, 312 | input::-moz-focus-inner { 313 | border: 0; 314 | padding: 0; 315 | } 316 | 317 | /** 318 | * Address Firefox 4+ setting `line-height` on `input` using `!important` in 319 | * the UA stylesheet. 320 | */ 321 | 322 | input { 323 | line-height: normal; 324 | } 325 | 326 | /** 327 | * It's recommended that you don't attempt to style these elements. 328 | * Firefox's implementation doesn't respect box-sizing, padding, or width. 329 | * 330 | * 1. Address box sizing set to `content-box` in IE 8/9/10. 331 | * 2. Remove excess padding in IE 8/9/10. 332 | */ 333 | 334 | input[type="checkbox"], 335 | input[type="radio"] { 336 | box-sizing: border-box; /* 1 */ 337 | padding: 0; /* 2 */ 338 | } 339 | 340 | /** 341 | * Fix the cursor style for Chrome's increment/decrement buttons. For certain 342 | * `font-size` values of the `input`, it causes the cursor style of the 343 | * decrement button to change from `default` to `text`. 344 | */ 345 | 346 | input[type="number"]::-webkit-inner-spin-button, 347 | input[type="number"]::-webkit-outer-spin-button { 348 | height: auto; 349 | } 350 | 351 | /** 352 | * 1. Address `appearance` set to `searchfield` in Safari and Chrome. 353 | * 2. Address `box-sizing` set to `border-box` in Safari and Chrome 354 | * (include `-moz` to future-proof). 355 | */ 356 | 357 | input[type="search"] { 358 | -webkit-appearance: textfield; /* 1 */ 359 | -moz-box-sizing: content-box; 360 | -webkit-box-sizing: content-box; /* 2 */ 361 | box-sizing: content-box; 362 | } 363 | 364 | /** 365 | * Remove inner padding and search cancel button in Safari and Chrome on OS X. 366 | * Safari (but not Chrome) clips the cancel button when the search input has 367 | * padding (and `textfield` appearance). 
368 | */ 369 | 370 | input[type="search"]::-webkit-search-cancel-button, 371 | input[type="search"]::-webkit-search-decoration { 372 | -webkit-appearance: none; 373 | } 374 | 375 | /** 376 | * Define consistent border, margin, and padding. 377 | */ 378 | 379 | fieldset { 380 | border: 1px solid #c0c0c0; 381 | margin: 0 2px; 382 | padding: 0.35em 0.625em 0.75em; 383 | } 384 | 385 | /** 386 | * 1. Correct `color` not being inherited in IE 8/9/10/11. 387 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 388 | */ 389 | 390 | legend { 391 | border: 0; /* 1 */ 392 | padding: 0; /* 2 */ 393 | } 394 | 395 | /** 396 | * Remove default vertical scrollbar in IE 8/9/10/11. 397 | */ 398 | 399 | textarea { 400 | overflow: auto; 401 | } 402 | 403 | /** 404 | * Don't inherit the `font-weight` (applied by a rule above). 405 | * NOTE: the default cannot safely be changed in Chrome and Safari on OS X. 406 | */ 407 | 408 | optgroup { 409 | font-weight: bold; 410 | } 411 | 412 | /* Tables 413 | ========================================================================== */ 414 | 415 | /** 416 | * Remove most spacing between table cells. 417 | */ 418 | 419 | table { 420 | border-collapse: collapse; 421 | border-spacing: 0; 422 | } 423 | 424 | td, 425 | th { 426 | padding: 0; 427 | } -------------------------------------------------------------------------------- /docs/static/css/skeleton.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Skeleton V2.0.4 3 | * Copyright 2014, Dave Gamache 4 | * www.getskeleton.com 5 | * Free to use under the MIT license. 6 | * http://www.opensource.org/licenses/mit-license.php 7 | * 12/29/2014 8 | */ 9 | 10 | 11 | /* Table of contents 12 | –––––––––––––––––––––––––––––––––––––––––––––––––– 13 | - Grid 14 | - Base Styles 15 | - Typography 16 | - Links 17 | - Buttons 18 | - Forms 19 | - Lists 20 | - Code 21 | - Tables 22 | - Spacing 23 | - Utilities 24 | - Clearing 25 | - Media Queries 26 | */ 27 | 28 | 29 | /* Grid 30 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 31 | .container { 32 | position: relative; 33 | width: 100%; 34 | max-width: 960px; 35 | margin: 0 auto; 36 | padding: 0 20px; 37 | box-sizing: border-box; } 38 | .column, 39 | .columns { 40 | width: 100%; 41 | float: left; 42 | box-sizing: border-box; } 43 | 44 | /* For devices larger than 400px */ 45 | @media (min-width: 400px) { 46 | .container { 47 | width: 85%; 48 | padding: 0; } 49 | } 50 | 51 | /* For devices larger than 550px */ 52 | @media (min-width: 550px) { 53 | .container { 54 | width: 80%; } 55 | .column, 56 | .columns { 57 | margin-left: 4%; } 58 | .column:first-child, 59 | .columns:first-child { 60 | margin-left: 0; } 61 | 62 | .one.column, 63 | .one.columns { width: 4.66666666667%; } 64 | .two.columns { width: 13.3333333333%; } 65 | .three.columns { width: 22%; } 66 | .four.columns { width: 30.6666666667%; } 67 | .five.columns { width: 39.3333333333%; } 68 | .six.columns { width: 48%; } 69 | .seven.columns { width: 56.6666666667%; } 70 | .eight.columns { width: 65.3333333333%; } 71 | .nine.columns { width: 74.0%; } 72 | .ten.columns { width: 82.6666666667%; } 73 | .eleven.columns { width: 91.3333333333%; } 74 | .twelve.columns { width: 100%; margin-left: 0; } 75 | 76 | .one-third.column { width: 30.6666666667%; } 77 | .two-thirds.column { width: 65.3333333333%; } 78 | 79 | .one-half.column { width: 48%; } 80 | 81 | /* Offsets */ 82 | .offset-by-one.column, 83 | .offset-by-one.columns { margin-left: 8.66666666667%; } 84 | 
.offset-by-two.column, 85 | .offset-by-two.columns { margin-left: 17.3333333333%; } 86 | .offset-by-three.column, 87 | .offset-by-three.columns { margin-left: 26%; } 88 | .offset-by-four.column, 89 | .offset-by-four.columns { margin-left: 34.6666666667%; } 90 | .offset-by-five.column, 91 | .offset-by-five.columns { margin-left: 43.3333333333%; } 92 | .offset-by-six.column, 93 | .offset-by-six.columns { margin-left: 52%; } 94 | .offset-by-seven.column, 95 | .offset-by-seven.columns { margin-left: 60.6666666667%; } 96 | .offset-by-eight.column, 97 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 98 | .offset-by-nine.column, 99 | .offset-by-nine.columns { margin-left: 78.0%; } 100 | .offset-by-ten.column, 101 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 102 | .offset-by-eleven.column, 103 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 104 | 105 | .offset-by-one-third.column, 106 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 107 | .offset-by-two-thirds.column, 108 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 109 | 110 | .offset-by-one-half.column, 111 | .offset-by-one-half.columns { margin-left: 52%; } 112 | 113 | } 114 | 115 | 116 | /* Base Styles 117 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 118 | /* NOTE 119 | html is set to 62.5% so that all the REM measurements throughout Skeleton 120 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 121 | html { 122 | font-size: 62.5%; } 123 | body { 124 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 125 | line-height: 1.6; 126 | font-weight: 400; 127 | font-family: "Raleway", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 128 | color: #222; } 129 | 130 | 131 | /* Typography 132 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 133 | h1, h2, h3, h4, h5, h6 { 134 | margin-top: 0; 135 | margin-bottom: 2rem; 136 | font-weight: 300; } 137 | h1 { font-size: 4.0rem; line-height: 1.2; letter-spacing: -.1rem;} 138 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; } 139 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; } 140 | h4 { font-size: 2.4rem; line-height: 1.35; letter-spacing: -.08rem; } 141 | h5 { font-size: 1.8rem; line-height: 1.5; letter-spacing: -.05rem; } 142 | h6 { font-size: 1.5rem; line-height: 1.6; letter-spacing: 0; } 143 | 144 | /* Larger than phablet */ 145 | @media (min-width: 550px) { 146 | h1 { font-size: 5.0rem; } 147 | h2 { font-size: 4.2rem; } 148 | h3 { font-size: 3.6rem; } 149 | h4 { font-size: 3.0rem; } 150 | h5 { font-size: 2.4rem; } 151 | h6 { font-size: 1.5rem; } 152 | } 153 | 154 | p { 155 | margin-top: 0; } 156 | 157 | 158 | /* Links 159 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 160 | a { 161 | color: #1EAEDB; } 162 | a:hover { 163 | color: #0FA0CE; } 164 | 165 | 166 | /* Buttons 167 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 168 | .button, 169 | button, 170 | input[type="submit"], 171 | input[type="reset"], 172 | input[type="button"] { 173 | display: inline-block; 174 | height: 38px; 175 | padding: 0 30px; 176 | color: #555; 177 | text-align: center; 178 | font-size: 11px; 179 | font-weight: 600; 180 | line-height: 38px; 181 | letter-spacing: .1rem; 182 | text-transform: uppercase; 183 | text-decoration: none; 184 | white-space: nowrap; 185 | background-color: transparent; 186 | border-radius: 4px; 187 | border: 1px solid #bbb; 188 | cursor: pointer; 189 | box-sizing: border-box; } 190 | 
.button:hover, 191 | button:hover, 192 | input[type="submit"]:hover, 193 | input[type="reset"]:hover, 194 | input[type="button"]:hover, 195 | .button:focus, 196 | button:focus, 197 | input[type="submit"]:focus, 198 | input[type="reset"]:focus, 199 | input[type="button"]:focus { 200 | color: #333; 201 | border-color: #888; 202 | outline: 0; } 203 | .button.button-primary, 204 | button.button-primary, 205 | input[type="submit"].button-primary, 206 | input[type="reset"].button-primary, 207 | input[type="button"].button-primary { 208 | color: #FFF; 209 | background-color: #33C3F0; 210 | border-color: #33C3F0; } 211 | .button.button-primary:hover, 212 | button.button-primary:hover, 213 | input[type="submit"].button-primary:hover, 214 | input[type="reset"].button-primary:hover, 215 | input[type="button"].button-primary:hover, 216 | .button.button-primary:focus, 217 | button.button-primary:focus, 218 | input[type="submit"].button-primary:focus, 219 | input[type="reset"].button-primary:focus, 220 | input[type="button"].button-primary:focus { 221 | color: #FFF; 222 | background-color: #1EAEDB; 223 | border-color: #1EAEDB; } 224 | 225 | 226 | /* Forms 227 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 228 | input[type="email"], 229 | input[type="number"], 230 | input[type="search"], 231 | input[type="text"], 232 | input[type="tel"], 233 | input[type="url"], 234 | input[type="password"], 235 | textarea, 236 | select { 237 | height: 38px; 238 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 239 | background-color: #fff; 240 | border: 1px solid #D1D1D1; 241 | border-radius: 4px; 242 | box-shadow: none; 243 | box-sizing: border-box; } 244 | /* Removes awkward default styles on some inputs for iOS */ 245 | input[type="email"], 246 | input[type="number"], 247 | input[type="search"], 248 | input[type="text"], 249 | input[type="tel"], 250 | input[type="url"], 251 | input[type="password"], 252 | textarea { 253 | -webkit-appearance: none; 254 | -moz-appearance: none; 255 | appearance: none; } 256 | textarea { 257 | min-height: 65px; 258 | padding-top: 6px; 259 | padding-bottom: 6px; } 260 | input[type="email"]:focus, 261 | input[type="number"]:focus, 262 | input[type="search"]:focus, 263 | input[type="text"]:focus, 264 | input[type="tel"]:focus, 265 | input[type="url"]:focus, 266 | input[type="password"]:focus, 267 | textarea:focus, 268 | select:focus { 269 | border: 1px solid #33C3F0; 270 | outline: 0; } 271 | label, 272 | legend { 273 | display: block; 274 | margin-bottom: .5rem; 275 | font-weight: 600; } 276 | fieldset { 277 | padding: 0; 278 | border-width: 0; } 279 | input[type="checkbox"], 280 | input[type="radio"] { 281 | display: inline; } 282 | label > .label-body { 283 | display: inline-block; 284 | margin-left: .5rem; 285 | font-weight: normal; } 286 | 287 | 288 | /* Lists 289 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 290 | ul { 291 | list-style: circle inside; } 292 | ol { 293 | list-style: decimal inside; } 294 | ol, ul { 295 | padding-left: 0; 296 | margin-top: 0; } 297 | ul ul, 298 | ul ol, 299 | ol ol, 300 | ol ul { 301 | margin: 1.5rem 0 1.5rem 3rem; 302 | font-size: 90%; } 303 | li { 304 | margin-bottom: 1rem; } 305 | 306 | 307 | /* Code 308 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 309 | code { 310 | padding: .2rem .5rem; 311 | margin: 0 .2rem; 312 | font-size: 90%; 313 | white-space: nowrap; 314 | background: #F1F1F1; 315 | border: 1px solid #E1E1E1; 316 | border-radius: 4px; } 317 | pre > code { 318 | display: block; 
319 | padding: 1rem 1.5rem; 320 | white-space: pre; } 321 | 322 | 323 | /* Tables 324 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 325 | th, 326 | td { 327 | padding: 12px 15px; 328 | text-align: left; 329 | border-bottom: 1px solid #E1E1E1; } 330 | th:first-child, 331 | td:first-child { 332 | padding-left: 0; } 333 | th:last-child, 334 | td:last-child { 335 | padding-right: 0; } 336 | 337 | 338 | /* Spacing 339 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 340 | button, 341 | .button { 342 | margin-bottom: 1rem; } 343 | input, 344 | textarea, 345 | select, 346 | fieldset { 347 | margin-bottom: 1.5rem; } 348 | pre, 349 | blockquote, 350 | dl, 351 | figure, 352 | table, 353 | p, 354 | ul, 355 | ol, 356 | form { 357 | margin-bottom: 2.5rem; } 358 | 359 | 360 | /* Utilities 361 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 362 | .u-full-width { 363 | width: 100%; 364 | box-sizing: border-box; } 365 | .u-max-full-width { 366 | max-width: 100%; 367 | box-sizing: border-box; } 368 | .u-pull-right { 369 | float: right; } 370 | .u-pull-left { 371 | float: left; } 372 | 373 | 374 | /* Misc 375 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 376 | hr { 377 | margin-top: 3rem; 378 | margin-bottom: 3.5rem; 379 | border-width: 0; 380 | border-top: 1px solid #E1E1E1; } 381 | 382 | 383 | /* Clearing 384 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 385 | 386 | /* Self Clearing Goodness */ 387 | .container:after, 388 | .row:after, 389 | .u-cf { 390 | content: ""; 391 | display: table; 392 | clear: both; } 393 | 394 | 395 | /* Media Queries 396 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 397 | /* 398 | Note: The best way to structure the use of media queries is to create the queries 399 | near the relevant code. For example, if you wanted to change the styles for buttons 400 | on small devices, paste the mobile query code up in the buttons section and style it 401 | there. 402 | */ 403 | 404 | 405 | /* Larger than mobile */ 406 | @media (min-width: 400px) {} 407 | 408 | /* Larger than phablet (also point when grid becomes active) */ 409 | @media (min-width: 550px) {} 410 | 411 | /* Larger than tablet */ 412 | @media (min-width: 750px) {} 413 | 414 | /* Larger than desktop */ 415 | @media (min-width: 1000px) {} 416 | 417 | /* Larger than Desktop HD */ 418 | @media (min-width: 1200px) {} 419 | -------------------------------------------------------------------------------- /docs/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/favicon.png -------------------------------------------------------------------------------- /docs/static/images/512logotipo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/docs/static/images/512logotipo.png -------------------------------------------------------------------------------- /dump_hparams_to_json.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Dump hyper parameters to json file. 4 | 5 | usage: dump_hparams_to_json.py [options] 6 | 7 | options: 8 | -h, --help Show help message. 
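example (illustrative; the output filename is arbitrary): python dump_hparams_to_json.py hparams_dump.json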
9 | """ 10 | from docopt import docopt 11 | 12 | import sys 13 | import os 14 | from os.path import dirname, join, basename, splitext 15 | 16 | import audio 17 | 18 | # The deepvoice3 model 19 | from deepvoice3_pytorch import frontend 20 | from hparams import hparams 21 | import json 22 | 23 | if __name__ == "__main__": 24 | args = docopt(__doc__) 25 | output_json_path = args[""] 26 | 27 | j = hparams.values() 28 | 29 | # for compat legacy 30 | for k in ["preset", "presets"]: 31 | if k in j: 32 | del j[k] 33 | 34 | with open(output_json_path, "w") as f: 35 | json.dump(j, f, indent=2) 36 | sys.exit(0) 37 | -------------------------------------------------------------------------------- /gentle_web_align.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 21 09:06:37 2018 4 | Phoneme alignment and conversion in HTK-style label file using Web-served Gentle 5 | This works on any type of english dataset. 6 | Unlike prepare_htk_alignments_vctk.py, this is Python3 and Windows(with Docker) compatible. 7 | Preliminary results show that gentle has better performance with noisy dataset 8 | (e.g. movie extracted audioclips) 9 | *This work was derived from vctk_preprocess/prepare_htk_alignments_vctk.py 10 | @author: engiecat(github) 11 | 12 | usage: 13 | gentle_web_align.py (-w wav_pattern) (-t text_pattern) [options] 14 | gentle_web_align.py (--nested-directories=) [options] 15 | 16 | options: 17 | -w --wav_pattern= Pattern of wav files to be aligned 18 | -t --txt_pattern= Pattern of txt transcript files to be aligned (same name required) 19 | --nested-directories= Process every wav/txt file in the subfolders of the given folder 20 | --server_addr= Server address that serves gentle. [default: localhost] 21 | --port= Server port that serves gentle. 
[default: 8567] 22 | --max_unalign= Maximum threshold for unalignment occurence (0.0 ~ 1.0) [default: 0.3] 23 | --skip-already-done Skips if there are preexisting .lab file 24 | -h --help show this help message and exit 25 | """ 26 | 27 | from docopt import docopt 28 | from glob import glob 29 | from tqdm import tqdm 30 | import os.path 31 | import requests 32 | import numpy as np 33 | 34 | def write_hts_label(labels, lab_path): 35 | lab = "" 36 | for s, e, l in labels: 37 | s, e = float(s) * 1e7, float(e) * 1e7 38 | s, e = int(s), int(e) 39 | lab += "{} {} {}\n".format(s, e, l) 40 | print(lab) 41 | with open(lab_path, "w", encoding='utf-8') as f: 42 | f.write(lab) 43 | 44 | 45 | def json2hts(data): 46 | emit_bos = False 47 | emit_eos = False 48 | 49 | phone_start = 0 50 | phone_end = None 51 | labels = [] 52 | failure_count = 0 53 | 54 | for word in data["words"]: 55 | case = word["case"] 56 | if case != "success": 57 | failure_count += 1 # instead of failing everything, 58 | #raise RuntimeError("Alignment failed") 59 | continue 60 | start = float(word["start"]) 61 | word_end = float(word["end"]) 62 | 63 | if not emit_bos: 64 | labels.append((phone_start, start, "silB")) 65 | emit_bos = True 66 | 67 | phone_start = start 68 | phone_end = None 69 | for phone in word["phones"]: 70 | ph = str(phone["phone"][:-2]) 71 | duration = float(phone["duration"]) 72 | phone_end = phone_start + duration 73 | labels.append((phone_start, phone_end, ph)) 74 | phone_start += duration 75 | assert np.allclose(phone_end, word_end) 76 | if not emit_eos: 77 | labels.append((phone_start, phone_end, "silE")) 78 | emit_eos = True 79 | unalign_ratio = float(failure_count) / len(data['words']) 80 | return unalign_ratio, labels 81 | 82 | 83 | def gentle_request(wav_path,txt_path, server_addr, port, debug=False): 84 | print('\n') 85 | response = None 86 | wav_name = os.path.basename(wav_path) 87 | txt_name = os.path.basename(txt_path) 88 | if os.path.splitext(wav_name)[0] != os.path.splitext(txt_name)[0]: 89 | print(' [!] wav name and transcript name does not match - exiting...') 90 | return response 91 | with open(txt_path, 'r', encoding='utf-8-sig') as txt_file: 92 | print('Transcript - '+''.join(txt_file.readlines())) 93 | with open(wav_path,'rb') as wav_file, open(txt_path, 'rb') as txt_file: 94 | params = (('async','false'),) 95 | files={'audio':(wav_name,wav_file), 96 | 'transcript':(txt_name,txt_file), 97 | } 98 | server_path = 'http://'+server_addr+':'+str(port)+'/transcriptions' 99 | response = requests.post(server_path, params=params,files=files) 100 | if response.status_code != 200: 101 | print(' [!] External server({}) returned bad response({})'.format(server_path, response.status_code)) 102 | if debug: 103 | print('Response') 104 | print(response.json()) 105 | return response 106 | 107 | if __name__ == '__main__': 108 | arguments = docopt(__doc__) 109 | server_addr = arguments['--server_addr'] 110 | port = int(arguments['--port']) 111 | max_unalign = float(arguments['--max_unalign']) 112 | if arguments['--nested-directories'] is None: 113 | wav_paths = sorted(glob(arguments['--wav_pattern'])) 114 | txt_paths = sorted(glob(arguments['--txt_pattern'])) 115 | else: 116 | # if this is multi-foldered environment 117 | # (e.g. 
DATASET/speaker1/blahblah.wav) 118 | wav_paths=[] 119 | txt_paths=[] 120 | topdir = arguments['--nested-directories'] 121 | subdirs = [f for f in os.listdir(topdir) if os.path.isdir(os.path.join(topdir, f))] 122 | for subdir in subdirs: 123 | wav_pattern_subdir = os.path.join(topdir, subdir, '*.wav') 124 | txt_pattern_subdir = os.path.join(topdir, subdir, '*.txt') 125 | wav_paths.extend(sorted(glob(wav_pattern_subdir))) 126 | txt_paths.extend(sorted(glob(txt_pattern_subdir))) 127 | 128 | t = tqdm(range(len(wav_paths))) 129 | for idx in t: 130 | try: 131 | t.set_description("Align via Gentle") 132 | wav_path = wav_paths[idx] 133 | txt_path = txt_paths[idx] 134 | lab_path = os.path.splitext(wav_path)[0]+'.lab' 135 | if os.path.exists(lab_path) and arguments['--skip-already-done']: 136 | print('[!] skipping because of pre-existing .lab file - {}'.format(lab_path)) 137 | continue 138 | res=gentle_request(wav_path,txt_path, server_addr, port) 139 | unalign_ratio, lab = json2hts(res.json()) 140 | print('[*] Unaligned Ratio - {}'.format(unalign_ratio)) 141 | if unalign_ratio > max_unalign: 142 | print('[!] skipping this due to bad alignment') 143 | continue 144 | write_hts_label(lab, lab_path) 145 | except: 146 | # if something goes wrong, skip this file 147 | import traceback 148 | tb = traceback.format_exc() 149 | print('[!] ERROR while processing {}'.format(wav_paths[idx])) 150 | print('[!] StackTrace - ') 151 | print(tb) 152 | 153 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | from deepvoice3_pytorch.tfcompat.hparam import HParams 2 | 3 | # NOTE: If you want full control over the model architecture, please take a look 4 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 5 | 6 | # Default hyperparameters: 7 | hparams = HParams( 8 | name="deepvoice3", 9 | 10 | # Text: 11 | # [en, jp] 12 | frontend='en', 13 | 14 | # Replace words with their pronunciations with a fixed probability. 15 | # e.g., 'hello' to 'HH AH0 L OW1' 16 | # [en, jp] 17 | # en: Word -> pronunciation using CMUDict 18 | # jp: Word -> pronunciation using MeCab 19 | # [0 ~ 1.0]: 0 means no replacement happens. 20 | replace_pronunciation_prob=0.5, 21 | 22 | # Convenient model builder 23 | # [deepvoice3, deepvoice3_multispeaker, nyanko] 24 | # Definitions can be found at deepvoice3_pytorch/builder.py 25 | # deepvoice3: DeepVoice3 https://arxiv.org/abs/1710.07654 26 | # deepvoice3_multispeaker: Multi-speaker version of DeepVoice3 27 | # nyanko: https://arxiv.org/abs/1710.08969 28 | builder="deepvoice3", 29 | 30 | # Must be configured depending on the dataset and model you use 31 | n_speakers=1, 32 | speaker_embed_dim=16, 33 | 34 | # Audio: 35 | num_mels=80, 36 | fmin=125, 37 | fmax=7600, 38 | fft_size=1024, 39 | hop_size=256, 40 | sample_rate=22050, 41 | preemphasis=0.97, 42 | min_level_db=-100, 43 | ref_level_db=20, 44 | # Whether to rescale the waveform or not. 45 | # Let x be an input waveform; the rescaled waveform y is given by: 46 | # y = x / np.abs(x).max() * rescaling_max 47 | rescaling=False, 48 | rescaling_max=0.999, 49 | # mel-spectrogram is normalized to [0, 1] for each utterance and clipping may 50 | # happen depending on min_level_db and ref_level_db, causing clipping noise. 51 | # If False, an assertion is added to ensure no clipping happens.
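# A rough sketch of the normalization implied above (this assumes the usual
# audio.py convention in this family of code and is not copied verbatim from it):
#   S_db = 20 * log10(max(|STFT(y)|, 1e-5)) - ref_level_db
#   S_norm = clip((S_db - min_level_db) / -min_level_db, 0, 1)
# so frames below min_level_db or above ref_level_db are the ones that get clipped.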
52 | allow_clipping_in_normalization=True, 53 | 54 | # Model: 55 | downsample_step=4, # must be 4 when builder="nyanko" 56 | outputs_per_step=1, # must be 1 when builder="nyanko" 57 | embedding_weight_std=0.1, 58 | speaker_embedding_weight_std=0.01, 59 | padding_idx=0, 60 | # Maximum length of input text 61 | # try setting a larger value if you want to feed very long text input 62 | max_positions=512, 63 | dropout=1 - 0.95, 64 | kernel_size=3, 65 | text_embed_dim=128, 66 | encoder_channels=256, 67 | decoder_channels=256, 68 | # Note: large converter channels require significant computational cost 69 | converter_channels=256, 70 | query_position_rate=1.0, 71 | # can be computed by `compute_timestamp_ratio.py`. 72 | key_position_rate=1.385, # 2.37 for jsut 73 | key_projection=False, 74 | value_projection=False, 75 | use_memory_mask=True, 76 | trainable_positional_encodings=False, 77 | freeze_embedding=False, 78 | # If True, use decoder's internal representation for postnet inputs, 79 | # otherwise use mel-spectrogram. 80 | use_decoder_state_for_postnet_input=True, 81 | 82 | # Data loader 83 | pin_memory=True, 84 | num_workers=2, # Set it to 1 when on Windows (MemoryError, THAllocator.c 0x5) 85 | 86 | # Loss 87 | masked_loss_weight=0.5, # (1-w)*loss + w * masked_loss 88 | priority_freq=3000, # heuristic: prioritize [0 ~ priority_freq] for linear loss 89 | priority_freq_weight=0.0, # (1-w)*linear_loss + w*priority_linear_loss 90 | # https://arxiv.org/pdf/1710.08969.pdf 91 | # Adding the divergence to the loss stabilizes training, especially for 92 | # very deep (> 10 layers) networks. 93 | # Binary div loss seems to have approx. 10x the scale of the L1 loss, so I choose 0.1. 94 | binary_divergence_weight=0.1, # set 0 to disable 95 | use_guided_attention=True, 96 | guided_attention_sigma=0.2, 97 | 98 | # Training: 99 | batch_size=16, 100 | adam_beta1=0.5, 101 | adam_beta2=0.9, 102 | adam_eps=1e-6, 103 | amsgrad=False, 104 | initial_learning_rate=5e-4, # 0.001, 105 | lr_schedule="noam_learning_rate_decay", 106 | lr_schedule_kwargs={}, 107 | nepochs=2000, 108 | weight_decay=0.0, 109 | clip_thresh=0.1, 110 | 111 | # Save 112 | checkpoint_interval=10000, 113 | eval_interval=10000, 114 | save_optimizer_state=True, 115 | 116 | # Eval: 117 | # this can be a list for multiple layers of attention 118 | # e.g., [True, False, False, False, True] 119 | force_monotonic_attention=True, 120 | # Attention constraint for incremental decoding 121 | window_ahead=3, 122 | # 0 tends to prevent word repetition, but sometimes causes skipped words 123 | window_backward=1, 124 | power=1.4, # Power to raise magnitudes to, prior to phase retrieval 125 | 126 | # GC: 127 | # Forced garbage collection probability 128 | # Use only when MemoryError continues on Windows (Disabled by default) 129 | #gc_probability = 0.001, 130 | 131 | # json_meta mode only 132 | # 0: "use all", 133 | # 1: "ignore only unmatched_alignment", 134 | # 2: "fully ignore recognition", 135 | ignore_recognition_level=2, 136 | # when dealing with a non-dedicated speech dataset (e.g. movie excerpts), setting min_text above 15 is desirable. Can be adjusted per dataset.
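# e.g. with min_text=20, utterances whose transcript is shorter than 20 characters
# are skipped at preprocessing time (see the len(text) checks in ljspeech.py and json_meta.py).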
137 | min_text=20, 138 | # if true, data without phoneme alignment file(.lab) will be ignored 139 | process_only_htk_aligned=False, 140 | ) 141 | 142 | 143 | def hparams_debug_string(): 144 | values = hparams.values() 145 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 146 | return 'Hyperparameters:\n' + '\n'.join(hp) 147 | -------------------------------------------------------------------------------- /json_meta.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Started in 1945h, Mar 10, 2018 3 | First done in 2103h, Mar 11, 2018 4 | Test done in 2324h, Mar 11, 2018 5 | Modified for HTK labeling in 1426h, Apr 21, 2018 6 | by engiecat(github) 7 | 8 | This makes r9y9/deepvoice3_pytorch compatible with json format of carpedm20/multi-speaker-tacotron-tensorflow and keithito/tacotron. 9 | The json file is given per speaker, generated in the format of 10 | (if completely aligned) 11 | (path-to-the-audio):aligned text 12 | 13 | (if partially aligned) 14 | (path-to-the-audio):[candidate sentence - not aligned,recognized words] 15 | 16 | (if non-aligned) 17 | (path-to-the-audio):[recognized words] 18 | is given per speaker. 19 | 20 | (e.g. python preprocess.py json_meta "./datasets/LJSpeech_1_0/alignment.json,./datasets/GoTBookRev/alignment.json" "./datasets/LJ+GoTBookRev" --preset=./presets/deepvoice3_vctk.json ) 21 | 22 | usage: 23 | python preprocess.py [option] 24 | 25 | 26 | options: 27 | --preset Path of preset parameters (json). 28 | -h --help show this help message and exit 29 | 30 | 31 | ''' 32 | 33 | from concurrent.futures import ProcessPoolExecutor 34 | from functools import partial 35 | import numpy as np 36 | import os 37 | import audio 38 | from nnmnkwii.io import hts 39 | from hparams import hparams 40 | from os.path import exists 41 | import librosa 42 | import json 43 | 44 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 45 | executor = ProcessPoolExecutor(max_workers=num_workers) 46 | futures = [] 47 | 48 | json_paths = in_dir.split(',') 49 | json_paths = [json_path.replace("'", "").replace('"',"") for json_path in json_paths] 50 | num_speakers = len(json_paths) 51 | is_aligned = {} 52 | 53 | speaker_id=0 54 | for json_path in json_paths: 55 | # Loads json metadata info 56 | if json_path.endswith("json"): 57 | with open(json_path, encoding='utf8') as f: 58 | content = f.read() 59 | info = json.loads(content) 60 | elif json_path.endswith("csv"): 61 | with open(json_path) as f: 62 | info = {} 63 | for line in f: 64 | path, text = line.strip().split('|') 65 | info[path] = text 66 | else: 67 | raise Exception(" [!] Unknown metadata format: {}".format(json_path)) 68 | 69 | print(" [*] Loaded - {}".format(json_path)) 70 | # check audio file existence 71 | base_dir = os.path.dirname(json_path) 72 | new_info = {} 73 | for path in info.keys(): 74 | if not os.path.exists(path): 75 | new_path = os.path.join(base_dir, path) 76 | if not os.path.exists(new_path): 77 | print(" [!] 
Audio not found: {}".format([path, new_path])) 78 | continue 79 | else: 80 | new_path = path 81 | 82 | new_info[new_path] = info[path] 83 | 84 | info = new_info 85 | 86 | # ignore_recognition_level check 87 | for path in info.keys(): 88 | is_aligned[path] = True 89 | if isinstance(info[path], list): 90 | if hparams.ignore_recognition_level == 1 and len(info[path]) == 1 or \ 91 | hparams.ignore_recognition_level == 2: 92 | # flag the path to be 'non-aligned' text 93 | is_aligned[path] = False 94 | info[path] = info[path][0] 95 | 96 | # Reserve for future processing 97 | queue_count = 0 98 | for audio_path, text in info.items(): 99 | if isinstance(text, list): 100 | if hparams.ignore_recognition_level == 0: 101 | text = text[-1] 102 | else: 103 | text = text[0] 104 | if hparams.ignore_recognition_level > 0 and not is_aligned[audio_path]: 105 | continue 106 | if hparams.min_text > len(text): 107 | continue 108 | if num_speakers == 1: 109 | # Single-speaker 110 | futures.append(executor.submit( 111 | partial(_process_utterance_single, out_dir, text, audio_path))) 112 | else: 113 | # Multi-speaker 114 | futures.append(executor.submit( 115 | partial(_process_utterance, out_dir, text, audio_path, speaker_id))) 116 | queue_count += 1 117 | print(" [*] Appended {} entries in the queue".format(queue_count)) 118 | 119 | # increase speaker_id 120 | speaker_id += 1 121 | 122 | # Show ignore_recognition_level description 123 | ignore_description = { 124 | 0: "use all", 125 | 1: "ignore only unmatched_alignment", 126 | 2: "fully ignore recognition", 127 | } 128 | print(" [!] Skip recognition level: {} ({})". \ 129 | format(hparams.ignore_recognition_level, 130 | ignore_description[hparams.ignore_recognition_level])) 131 | 132 | if num_speakers == 1: 133 | print(" [!] Single-speaker mode activated!") 134 | else: 135 | print(" [!] Multi-speaker({}) mode activated!".format(num_speakers)) 136 | 137 | # Now, Do the job! 
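# Each future resolves to the tuple that is later written to train.txt:
# (spectrogram_filename, mel_filename, n_frames, text) in single-speaker mode,
# with a trailing speaker_id appended in multi-speaker mode (see _process_utterance below).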
138 | results = [future.result() for future in tqdm(futures)] 139 | # Remove entries with None (That has been filtered due to bad htk alginment (if process_only_htk_aligned is enabled in hparams) 140 | results = [result for result in results if result != None] 141 | return results 142 | 143 | 144 | def start_at(labels): 145 | has_silence = labels[0][-1] == "pau" 146 | if not has_silence: 147 | return labels[0][0] 148 | for i in range(1, len(labels)): 149 | if labels[i][-1] != "pau": 150 | return labels[i][0] 151 | assert False 152 | 153 | 154 | def end_at(labels): 155 | has_silence = labels[-1][-1] == "pau" 156 | if not has_silence: 157 | return labels[-1][1] 158 | for i in range(len(labels) - 2, 0, -1): 159 | if labels[i][-1] != "pau": 160 | return labels[i][1] 161 | assert False 162 | 163 | 164 | def _process_utterance(out_dir, text, wav_path, speaker_id=None): 165 | 166 | # check whether singlespeaker_mode 167 | if speaker_id is None: 168 | return _process_utterance_single(out_dir,text,wav_path) 169 | # modified version of VCTK _process_utterance 170 | sr = hparams.sample_rate 171 | 172 | # Load the audio to a numpy array: 173 | wav = audio.load_wav(wav_path) 174 | 175 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") 176 | if not exists(lab_path): 177 | lab_path = os.path.splitext(wav_path)[0]+'.lab' 178 | 179 | # Trim silence from hts labels if available 180 | if exists(lab_path): 181 | labels = hts.load(lab_path) 182 | b = int(start_at(labels) * 1e-7 * sr) 183 | e = int(end_at(labels) * 1e-7 * sr) 184 | wav = wav[b:e] 185 | wav, _ = librosa.effects.trim(wav, top_db=25) 186 | else: 187 | if hparams.process_only_htk_aligned: 188 | return None 189 | wav, _ = librosa.effects.trim(wav, top_db=15) 190 | 191 | if hparams.rescaling: 192 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 193 | 194 | # Compute the linear-scale spectrogram from the wav: 195 | spectrogram = audio.spectrogram(wav).astype(np.float32) 196 | n_frames = spectrogram.shape[1] 197 | 198 | # Compute a mel-scale spectrogram from the wav: 199 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 200 | 201 | # Write the spectrograms to disk: 202 | # Get filename from wav_path 203 | wav_name = os.path.basename(wav_path) 204 | wav_name = os.path.splitext(wav_name)[0] 205 | 206 | # case if wave files across different speakers have the same naming format. 207 | # e.g. 
Recording0.wav 208 | spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name) 209 | mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name) 210 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 211 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 212 | # Return a tuple describing this training example: 213 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id) 214 | 215 | def _process_utterance_single(out_dir, text, wav_path): 216 | # modified version of LJSpeech _process_utterance 217 | 218 | # Load the audio to a numpy array: 219 | wav = audio.load_wav(wav_path) 220 | sr = hparams.sample_rate 221 | # Added from the multispeaker version 222 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") 223 | if not exists(lab_path): 224 | lab_path = os.path.splitext(wav_path)[0]+'.lab' 225 | 226 | # Trim silence from hts labels if available 227 | if exists(lab_path): 228 | labels = hts.load(lab_path) 229 | b = int(start_at(labels) * 1e-7 * sr) 230 | e = int(end_at(labels) * 1e-7 * sr) 231 | wav = wav[b:e] 232 | wav, _ = librosa.effects.trim(wav, top_db=25) 233 | else: 234 | if hparams.process_only_htk_aligned: 235 | return None 236 | wav, _ = librosa.effects.trim(wav, top_db=15) 237 | # End added from the multispeaker version 238 | 239 | if hparams.rescaling: 240 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 241 | 242 | # Compute the linear-scale spectrogram from the wav: 243 | spectrogram = audio.spectrogram(wav).astype(np.float32) 244 | n_frames = spectrogram.shape[1] 245 | 246 | # Compute a mel-scale spectrogram from the wav: 247 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 248 | 249 | # Write the spectrograms to disk: 250 | # Get filename from wav_path 251 | wav_name = os.path.basename(wav_path) 252 | wav_name = os.path.splitext(wav_name)[0] 253 | spectrogram_filename = 'spec-{}.npy'.format(wav_name) 254 | mel_filename = 'mel-{}.npy'.format(wav_name) 255 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 256 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 257 | 258 | # Return a tuple describing this training example: 259 | return (spectrogram_filename, mel_filename, n_frames, text) 260 | 261 | -------------------------------------------------------------------------------- /jsut.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import jsut 7 | from nnmnkwii.io import hts 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | 13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 14 | executor = ProcessPoolExecutor(max_workers=num_workers) 15 | futures = [] 16 | 17 | transcriptions = jsut.TranscriptionDataSource( 18 | in_dir, subsets=jsut.available_subsets).collect_files() 19 | wav_paths = jsut.WavFileDataSource( 20 | in_dir, subsets=jsut.available_subsets).collect_files() 21 | 22 | for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)): 23 | futures.append(executor.submit( 24 | partial(_process_utterance, out_dir, index + 1, wav_path, text))) 25 | return [future.result() for future in tqdm(futures)] 26 | 27 | 28 | def _process_utterance(out_dir, index, wav_path, text): 29 | sr = 
hparams.sample_rate 30 | 31 | # Load the audio to a numpy array: 32 | wav = audio.load_wav(wav_path) 33 | 34 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 35 | 36 | # Trim silence from hts labels if available 37 | if exists(lab_path): 38 | labels = hts.load(lab_path) 39 | assert labels[0][-1] == "silB" 40 | assert labels[-1][-1] == "silE" 41 | b = int(labels[0][1] * 1e-7 * sr) 42 | e = int(labels[-1][0] * 1e-7 * sr) 43 | wav = wav[b:e] 44 | else: 45 | wav, _ = librosa.effects.trim(wav, top_db=30) 46 | 47 | if hparams.rescaling: 48 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 49 | 50 | # Compute the linear-scale spectrogram from the wav: 51 | spectrogram = audio.spectrogram(wav).astype(np.float32) 52 | n_frames = spectrogram.shape[1] 53 | 54 | # Compute a mel-scale spectrogram from the wav: 55 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 56 | 57 | # Write the spectrograms to disk: 58 | spectrogram_filename = 'jsut-spec-%05d.npy' % index 59 | mel_filename = 'jsut-mel-%05d.npy' % index 60 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 61 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 62 | 63 | # Return a tuple describing this training example: 64 | return (spectrogram_filename, mel_filename, n_frames, text) 65 | -------------------------------------------------------------------------------- /ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from hparams import hparams 7 | 8 | 9 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 10 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 11 | 12 | Args: 13 | in_dir: The directory where you have downloaded the LJ Speech dataset 14 | out_dir: The directory to write the output into 15 | num_workers: Optional number of worker processes to parallelize across 16 | tqdm: You can optionally pass tqdm to get a nice progress bar 17 | 18 | Returns: 19 | A list of tuples describing the training examples. This should be written to train.txt 20 | ''' 21 | 22 | # We use ProcessPoolExecutor to parallize across processes. This is just an optimization and you 23 | # can omit it and just call _process_utterance on each input if you want. 24 | executor = ProcessPoolExecutor(max_workers=num_workers) 25 | futures = [] 26 | index = 1 27 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 28 | for line in f: 29 | parts = line.strip().split('|') 30 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 31 | text = parts[2] 32 | if len(text) < hparams.min_text: 33 | continue 34 | futures.append(executor.submit( 35 | partial(_process_utterance, out_dir, index, wav_path, text))) 36 | index += 1 37 | return [future.result() for future in tqdm(futures)] 38 | 39 | 40 | def _process_utterance(out_dir, index, wav_path, text): 41 | '''Preprocesses a single utterance audio/text pair. 42 | 43 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 44 | to the train.txt file. 45 | 46 | Args: 47 | out_dir: The directory to write the spectrograms into 48 | index: The numeric index to use in the spectrogram filenames. 
49 | wav_path: Path to the audio file containing the speech input 50 | text: The text spoken in the input audio file 51 | 52 | Returns: 53 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 54 | ''' 55 | 56 | # Load the audio to a numpy array: 57 | wav = audio.load_wav(wav_path) 58 | 59 | if hparams.rescaling: 60 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 61 | 62 | # Compute the linear-scale spectrogram from the wav: 63 | spectrogram = audio.spectrogram(wav).astype(np.float32) 64 | n_frames = spectrogram.shape[1] 65 | 66 | # Compute a mel-scale spectrogram from the wav: 67 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 68 | 69 | # Write the spectrograms to disk: 70 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index 71 | mel_filename = 'ljspeech-mel-%05d.npy' % index 72 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 73 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 74 | 75 | # Return a tuple describing this training example: 76 | return (spectrogram_filename, mel_filename, n_frames, text) 77 | -------------------------------------------------------------------------------- /lrschedule.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329 5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 6 | # Noam scheme from tensor2tensor: 7 | warmup_steps = float(warmup_steps) 8 | step = global_step + 1. 9 | lr = init_lr * warmup_steps**0.5 * np.minimum( 10 | step * warmup_steps**-1.5, step**-0.5) 11 | return lr 12 | 13 | 14 | def step_learning_rate_decay(init_lr, global_step, 15 | anneal_rate=0.98, 16 | anneal_interval=30000): 17 | return init_lr * anneal_rate ** (global_step // anneal_interval) 18 | 19 | 20 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 21 | """Cyclic cosine annealing 22 | 23 | https://arxiv.org/pdf/1704.00109.pdf 24 | 25 | Args: 26 | init_lr (float): Initial learning rate 27 | global_step (int): Current iteration number 28 | T (int): Total iteration number (i,e. nepoch) 29 | M (int): Number of ensembles we want 30 | 31 | Returns: 32 | float: Annealed learning rate 33 | """ 34 | TdivM = T // M 35 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) 36 | -------------------------------------------------------------------------------- /nikl_m.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | import re 7 | 8 | from hparams import hparams 9 | 10 | 11 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 12 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 13 | 14 | Args: 15 | in_dir: The directory where you have downloaded the LJ Speech dataset 16 | out_dir: The directory to write the output into 17 | num_workers: Optional number of worker processes to parallelize across 18 | tqdm: You can optionally pass tqdm to get a nice progress bar 19 | 20 | Returns: 21 | A list of tuples describing the training examples. This should be written to train.txt 22 | ''' 23 | 24 | # We use ProcessPoolExecutor to parallize across processes. 
This is just an optimization and you 25 | # can omit it and just call _process_utterance on each input if you want. 26 | 27 | # You will need to modify and format NIKL transcrption file will UTF-8 format 28 | # please check https://github.com/homink/deepspeech.pytorch.ko/blob/master/data/local/clean_corpus.sh 29 | 30 | executor = ProcessPoolExecutor(max_workers=num_workers) 31 | futures = [] 32 | 33 | spk_id = {} 34 | with open(in_dir + '/speaker.mid', encoding='utf-8') as f: 35 | for i, line in enumerate(f): 36 | spk_id[line.rstrip()] = i 37 | 38 | index = 1 39 | with open(in_dir + '/metadata.txt', encoding='utf-8') as f: 40 | for line in f: 41 | parts = line.strip().split('|') 42 | wav_path = parts[0] 43 | text = parts[1] 44 | uid = re.search(r'([a-z][a-z][0-9][0-9]_t)', wav_path) 45 | uid = uid.group(1).replace('_t', '') 46 | futures.append(executor.submit( 47 | partial(_process_utterance, out_dir, index + 1, spk_id[uid], wav_path, text))) 48 | index += 1 49 | return [future.result() for future in tqdm(futures)] 50 | 51 | 52 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 53 | '''Preprocesses a single utterance audio/text pair. 54 | 55 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 56 | to the train.txt file. 57 | 58 | Args: 59 | out_dir: The directory to write the spectrograms into 60 | index: The numeric index to use in the spectrogram filenames. 61 | wav_path: Path to the audio file containing the speech input 62 | text: The text spoken in the input audio file 63 | 64 | Returns: 65 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 66 | ''' 67 | 68 | # Load the audio to a numpy array: 69 | wav = audio.load_wav(wav_path) 70 | 71 | if hparams.rescaling: 72 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 73 | 74 | # Compute the linear-scale spectrogram from the wav: 75 | spectrogram = audio.spectrogram(wav).astype(np.float32) 76 | n_frames = spectrogram.shape[1] 77 | 78 | # Compute a mel-scale spectrogram from the wav: 79 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 80 | 81 | # Write the spectrograms to disk: 82 | spectrogram_filename = 'nikl-multi-spec-%05d.npy' % index 83 | mel_filename = 'nikl-multi-mel-%05d.npy' % index 84 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 85 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 86 | 87 | # Return a tuple describing this training example: 88 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id) 89 | -------------------------------------------------------------------------------- /nikl_preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Preparation for Korean speech 2 | 3 | ## Corpus 4 | https://github.com/homink/speech.ko 5 | 6 | ## Command 7 | 8 | ### Multi-speaker 9 | ``` 10 | cd nikl_preprocess 11 | python prepare_metadata.py --corpus ${corpus location} --trans_file ${corpus location}/trans.txt --spk_id ${corpus location}/speaker.mid 12 | ``` 13 | ### Single-speaker 14 | ``` 15 | cd nikl_preprocess 16 | python prepare_metadata.py --corpus ${corpus location} --trans_file ${corpus location}/trans.txt --spk_id ${corpus location}/speaker.sid 17 | ``` 18 | Default single speaker id is fv01. You can edit it by speaker id in [here](https://github.com/homink/speech.ko). 
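Once `metadata.txt` and the speaker id files are in place, preprocessing can proceed with the top-level `preprocess.py` (output directories below are illustrative):

```
# Multi-speaker
python preprocess.py nikl_m ${corpus location} ./data/nikl_m --preset=presets/deepvoice3_niklm.json

# Single-speaker
python preprocess.py nikl_s ${corpus location} ./data/nikl_s --preset=presets/deepvoice3_nikls.json
```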
19 | -------------------------------------------------------------------------------- /nikl_preprocess/prepare_metafile.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import subprocess,os,re 3 | 4 | def pwrap(args, shell=False): 5 | p = subprocess.Popen(args, shell=shell, stdout=subprocess.PIPE, 6 | stdin=subprocess.PIPE, stderr=subprocess.PIPE, 7 | universal_newlines=True) 8 | return p 9 | 10 | def execute(cmd, shell=False): 11 | popen = pwrap(cmd, shell=shell) 12 | for stdout_line in iter(popen.stdout.readline, ""): 13 | yield stdout_line 14 | 15 | popen.stdout.close() 16 | return_code = popen.wait() 17 | if return_code: 18 | raise subprocess.CalledProcessError(return_code, cmd) 19 | 20 | def pe(cmd, shell=False): 21 | """ 22 | Print and execute command on system 23 | """ 24 | ret = [] 25 | for line in execute(cmd, shell=shell): 26 | ret.append(line) 27 | print(line, end="") 28 | return ret 29 | 30 | 31 | if __name__ == "__main__": 32 | import argparse 33 | parser = argparse.ArgumentParser(description="Produce metafile where wav file path and its transcription are aligned", 34 | epilog="Example usage: python preprea_metadata $HOME/copora/NIKL") 35 | parser.add_argument("--corpus_dir", "-c", 36 | help="filepath for the root directory of corpus", 37 | required=True) 38 | 39 | parser.add_argument("--trans_file", "-t", 40 | help="Extracted transcription file obatained from extract_trans.py", 41 | required=True) 42 | 43 | parser.add_argument("--spk_id", "-sid", 44 | help="Speaker ID for single speaker such as fv01", 45 | required=False) 46 | args = parser.parse_args() 47 | 48 | print("Prepare metadata file for all speakers") 49 | pe("find %s -name %s | grep -v 'Bad\|Non\|Invalid' > %s/wav.lst" % (args.corpus_dir,"*.wav",args.corpus_dir),shell=True) 50 | 51 | trans={} 52 | with open(args.trans_file,"r") as f: 53 | for line in f: 54 | line = line.rstrip() 55 | line_split = line.split(" ") 56 | trans[line_split[0]] = " ".join(line_split[1:]) 57 | 58 | with open(args.corpus_dir+"/wav.lst", "r") as f: 59 | wavfiles = f.readlines() 60 | 61 | pe("rm -f %s/metadata.txt" % (args.corpus_dir),shell=True) 62 | for w in wavfiles: 63 | w = w.rstrip() 64 | tid = re.search(r'(t[0-9][0-9]_s[0-9][0-9])',w) 65 | if tid: 66 | tid_found = tid.group(1) 67 | pe('echo %s"|"%s >> %s/metadata.txt' % (w,trans.get(tid_found),args.corpus_dir),shell=True) 68 | 69 | print("Metadata files is created in %s/metadata.txt" % (args.corpus_dir)) 70 | pe("ls -d -- %s/*/ | grep -v 'Bad\|Non\|Invalid' | rev | cut -d'/' -f2 | rev > %s/speaker.mid" % (args.corpus_dir,args.corpus_dir),shell=True) 71 | pe("head -n 1 %s/speaker.mid > %s/speaker.sid" % (args.corpus_dir,args.corpus_dir),shell=True) 72 | -------------------------------------------------------------------------------- /nikl_s.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | import re 7 | 8 | from hparams import hparams 9 | 10 | 11 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 12 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 
13 | 14 | Args: 15 | in_dir: The directory where you have downloaded the LJ Speech dataset 16 | out_dir: The directory to write the output into 17 | num_workers: Optional number of worker processes to parallelize across 18 | tqdm: You can optionally pass tqdm to get a nice progress bar 19 | 20 | Returns: 21 | A list of tuples describing the training examples. This should be written to train.txt 22 | ''' 23 | 24 | # We use ProcessPoolExecutor to parallize across processes. This is just an optimization and you 25 | # can omit it and just call _process_utterance on each input if you want. 26 | 27 | # You will need to modify and format NIKL transcrption file will UTF-8 format 28 | # please check https://github.com/homink/deepspeech.pytorch.ko/blob/master/data/local/clean_corpus.sh 29 | 30 | executor = ProcessPoolExecutor(max_workers=num_workers) 31 | futures = [] 32 | 33 | with open(in_dir + '/speaker.sid', encoding='utf-8') as f: 34 | spk_id = f.readline().rstrip() 35 | 36 | index = 1 37 | with open(in_dir + '/metadata.txt', encoding='utf-8') as f: 38 | for line in f: 39 | if spk_id in line: 40 | parts = line.strip().split('|') 41 | wav_path = parts[0] 42 | text = parts[1] 43 | futures.append(executor.submit( 44 | partial(_process_utterance, out_dir, index + 1, wav_path, text))) 45 | index += 1 46 | return [future.result() for future in tqdm(futures)] 47 | 48 | 49 | def _process_utterance(out_dir, index, wav_path, text): 50 | '''Preprocesses a single utterance audio/text pair. 51 | 52 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 53 | to the train.txt file. 54 | 55 | Args: 56 | out_dir: The directory to write the spectrograms into 57 | index: The numeric index to use in the spectrogram filenames. 58 | wav_path: Path to the audio file containing the speech input 59 | text: The text spoken in the input audio file 60 | 61 | Returns: 62 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 63 | ''' 64 | 65 | # Load the audio to a numpy array: 66 | wav = audio.load_wav(wav_path) 67 | 68 | if hparams.rescaling: 69 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 70 | 71 | # Compute the linear-scale spectrogram from the wav: 72 | spectrogram = audio.spectrogram(wav).astype(np.float32) 73 | n_frames = spectrogram.shape[1] 74 | 75 | # Compute a mel-scale spectrogram from the wav: 76 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 77 | 78 | # Write the spectrograms to disk: 79 | spectrogram_filename = 'nikl-single-spec-%05d.npy' % index 80 | mel_filename = 'nikl-single-mel-%05d.npy' % index 81 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 82 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 83 | 84 | # Return a tuple describing this training example: 85 | return (spectrogram_filename, mel_filename, n_frames, text) 86 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Preprocess dataset 4 | 5 | usage: preprocess.py [options] 6 | 7 | options: 8 | --num_workers= Num workers. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | -h, --help Show help message. 
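example (illustrative paths; the dataset name must be one of the modules asserted below):
    python preprocess.py ljspeech ~/data/LJSpeech-1.1 ./data/ljspeech --preset=presets/deepvoice3_ljspeech.json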
12 | """ 13 | from docopt import docopt 14 | import os 15 | from multiprocessing import cpu_count 16 | from tqdm import tqdm 17 | import importlib 18 | from hparams import hparams, hparams_debug_string 19 | 20 | 21 | def preprocess(mod, in_dir, out_root, num_workers): 22 | os.makedirs(out_dir, exist_ok=True) 23 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm) 24 | write_metadata(metadata, out_dir) 25 | 26 | 27 | def write_metadata(metadata, out_dir): 28 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 29 | for m in metadata: 30 | f.write('|'.join([str(x) for x in m]) + '\n') 31 | frames = sum([m[2] for m in metadata]) 32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000 33 | hours = frames * frame_shift_ms / (3600 * 1000) 34 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours)) 35 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 36 | print('Max output length: %d' % max(m[2] for m in metadata)) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = docopt(__doc__) 41 | name = args[""] 42 | in_dir = args[""] 43 | out_dir = args[""] 44 | num_workers = args["--num_workers"] 45 | num_workers = cpu_count() if num_workers is None else int(num_workers) 46 | preset = args["--preset"] 47 | 48 | # Load preset if specified 49 | if preset is not None: 50 | with open(preset) as f: 51 | hparams.parse_json(f.read()) 52 | # Override hyper parameters 53 | hparams.parse(args["--hparams"]) 54 | assert hparams.name == "deepvoice3" 55 | print(hparams_debug_string()) 56 | 57 | assert name in ["jsut", "ljspeech", "vctk", "nikl_m", "nikl_s", "json_meta"] 58 | mod = importlib.import_module(name) 59 | preprocess(mod, in_dir, out_dir, num_workers) 60 | -------------------------------------------------------------------------------- /presets/deepvoice3_ljspeech.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "en", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3", 6 | "n_speakers": 1, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.01, 24 | "padding_idx": 0, 25 | "max_positions": 512, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 1.0, 33 | "key_position_rate": 1.385, 34 | "key_projection": true, 35 | "value_projection": true, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.2, 48 | "batch_size": 16, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": 
{}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } -------------------------------------------------------------------------------- /presets/deepvoice3_niklm.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "ko", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3_multispeaker", 6 | "n_speakers": 118, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.05, 24 | "padding_idx": 0, 25 | "max_positions": 1200, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 2.0, 33 | "key_position_rate": 7.6, 34 | "key_projection": true, 35 | "value_projection": true, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.4, 48 | "batch_size": 8, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /presets/deepvoice3_nikls.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "ko", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3", 6 | "n_speakers": 1, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.05, 24 | "padding_idx": 0, 25 | "max_positions": 512, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 2.0, 33 | "key_position_rate": 7.6, 34 | "key_projection": true, 35 | "value_projection": true, 36 | 
"use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.4, 48 | "batch_size": 8, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /presets/deepvoice3_vctk.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "en", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "deepvoice3_multispeaker", 6 | "n_speakers": 108, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | "ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.1, 23 | "speaker_embedding_weight_std": 0.05, 24 | "padding_idx": 0, 25 | "max_positions": 1024, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 256, 29 | "encoder_channels": 512, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 2.0, 33 | "key_position_rate": 7.6, 34 | "key_projection": true, 35 | "value_projection": true, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.4, 48 | "batch_size": 16, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /presets/nyanko_ljspeech.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "deepvoice3", 3 | "frontend": "en", 4 | "replace_pronunciation_prob": 0.5, 5 | "builder": "nyanko", 6 | "n_speakers": 1, 7 | "speaker_embed_dim": 16, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "sample_rate": 22050, 14 | "preemphasis": 0.97, 15 | "min_level_db": -100, 16 | 
"ref_level_db": 20, 17 | "rescaling": false, 18 | "rescaling_max": 0.999, 19 | "allow_clipping_in_normalization": true, 20 | "downsample_step": 4, 21 | "outputs_per_step": 1, 22 | "embedding_weight_std": 0.01, 23 | "speaker_embedding_weight_std": 0.01, 24 | "padding_idx": 0, 25 | "max_positions": 512, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "text_embed_dim": 128, 29 | "encoder_channels": 256, 30 | "decoder_channels": 256, 31 | "converter_channels": 256, 32 | "query_position_rate": 1.0, 33 | "key_position_rate": 1.385, 34 | "key_projection": false, 35 | "value_projection": false, 36 | "use_memory_mask": true, 37 | "trainable_positional_encodings": false, 38 | "freeze_embedding": false, 39 | "use_decoder_state_for_postnet_input": true, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "masked_loss_weight": 0.5, 43 | "priority_freq": 3000, 44 | "priority_freq_weight": 0.0, 45 | "binary_divergence_weight": 0.1, 46 | "use_guided_attention": true, 47 | "guided_attention_sigma": 0.2, 48 | "batch_size": 16, 49 | "adam_beta1": 0.5, 50 | "adam_beta2": 0.9, 51 | "adam_eps": 1e-06, 52 | "initial_learning_rate": 0.0005, 53 | "lr_schedule": "noam_learning_rate_decay", 54 | "lr_schedule_kwargs": {}, 55 | "nepochs": 2000, 56 | "weight_decay": 0.0, 57 | "clip_thresh": 0.1, 58 | "checkpoint_interval": 10000, 59 | "eval_interval": 10000, 60 | "save_optimizer_state": true, 61 | "force_monotonic_attention": true, 62 | "window_ahead": 3, 63 | "window_backward": 1, 64 | "power": 1.4 65 | } 66 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for Pypi release 4 | # 0. Make sure you are on git tag 5 | # 1. Run the script 6 | # 2. Upload sdist 7 | 8 | set -e 9 | 10 | script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd) 11 | cd $script_dir 12 | 13 | TAG=$(git describe --exact-match --tags HEAD) 14 | 15 | VERSION=${TAG/v/} 16 | 17 | DEEPVOICE3_PYTORCH_BUILD_VERSION=$VERSION python setup.py develop sdist 18 | echo "*** Ready to release! deepvoice3_pytorch $TAG ***" 19 | echo "Please make sure that release verion is correct." 
20 | cat deepvoice3_pytorch/version.py 21 | echo "Please run the following command manually:" 22 | echo twine upload dist/deepvoice3_pytorch-${VERSION}.tar.gz --repository-url https://upload.pypi.org/legacy/ 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import setuptools.command.develop 5 | import setuptools.command.build_py 6 | import os 7 | import subprocess 8 | from os.path import exists 9 | 10 | version = '0.1.1' 11 | 12 | # Adapted from https://github.com/pytorch/pytorch 13 | cwd = os.path.dirname(os.path.abspath(__file__)) 14 | if os.getenv('DEEPVOICE3_PYTORCH_BUILD_VERSION'): 15 | version = os.getenv('DEEPVOICE3_PYTORCH_BUILD_VERSION') 16 | else: 17 | try: 18 | sha = subprocess.check_output( 19 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 20 | version += '+' + sha[:7] 21 | except subprocess.CalledProcessError: 22 | pass 23 | except IOError: # FileNotFoundError for python 3 24 | pass 25 | 26 | 27 | class build_py(setuptools.command.build_py.build_py): 28 | 29 | def run(self): 30 | self.create_version_file() 31 | setuptools.command.build_py.build_py.run(self) 32 | 33 | @staticmethod 34 | def create_version_file(): 35 | global version, cwd 36 | print('-- Building version ' + version) 37 | version_path = os.path.join(cwd, 'deepvoice3_pytorch', 'version.py') 38 | with open(version_path, 'w') as f: 39 | f.write("__version__ = '{}'\n".format(version)) 40 | 41 | 42 | class develop(setuptools.command.develop.develop): 43 | 44 | def run(self): 45 | build_py.create_version_file() 46 | setuptools.command.develop.develop.run(self) 47 | 48 | 49 | def create_readme_rst(): 50 | global cwd 51 | try: 52 | subprocess.check_call( 53 | ["pandoc", "--from=markdown", "--to=rst", "--output=README.rst", 54 | "README.md"], cwd=cwd) 55 | print("Generated README.rst from README.md using pandoc.") 56 | except subprocess.CalledProcessError: 57 | pass 58 | except OSError: 59 | pass 60 | 61 | 62 | if not exists('README.rst'): 63 | create_readme_rst() 64 | 65 | if exists('README.rst'): 66 | README = open('README.rst', 'rb').read().decode("utf-8") 67 | else: 68 | README = '' 69 | 70 | setup(name='deepvoice3_pytorch', 71 | version=version, 72 | description='PyTorch implementation of convolutional networks-based text-to-speech synthesis models.', 73 | long_description=README, 74 | packages=find_packages(), 75 | cmdclass={ 76 | 'build_py': build_py, 77 | 'develop': develop, 78 | }, 79 | install_requires=[ 80 | "numpy", 81 | "scipy", 82 | "torch >= 1.0.0", 83 | "unidecode", 84 | "inflect", 85 | "librosa", 86 | "numba", 87 | "lws", 88 | "nltk", 89 | ], 90 | extras_require={ 91 | "bin": [ 92 | "docopt", 93 | "tqdm", 94 | "tensorboardX <= 1.2", 95 | "nnmnkwii >= 0.0.19", 96 | "requests", 97 | "matplotlib", 98 | ], 99 | "test": [ 100 | "nose", 101 | ], 102 | "jp": [ 103 | "jaconv", 104 | "mecab-python3", 105 | ], 106 | }) 107 | -------------------------------------------------------------------------------- /synthesis.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesize a waveform from a trained model. 4 | 5 | usage: synthesis.py [options] <checkpoint> <text_list_file> <dst_dir> 6 | 7 | options: 8 | --hparams=<params> Hyper parameters [default: ]. 9 | --preset=<json> Path of preset parameters (json). 10 | --checkpoint-seq2seq=<path> Load seq2seq model from checkpoint path.
11 | --checkpoint-postnet=<path> Load postnet model from checkpoint path. 12 | --file-name-suffix=<s> File name suffix [default: ]. 13 | --max-decoder-steps=<N> Max decoder steps [default: 500]. 14 | --replace_pronunciation_prob=<N> Probability of replacing a word with its pronunciation [default: 0.0]. 15 | --speaker_id=<id> Speaker ID (for multi-speaker model). 16 | --output-html Output html for blog post. 17 | -h, --help Show help message. 18 | """ 19 | from docopt import docopt 20 | 21 | import sys 22 | import os 23 | from os.path import dirname, join, basename, splitext 24 | 25 | import audio 26 | 27 | import torch 28 | import numpy as np 29 | import nltk 30 | 31 | # The deepvoice3 model 32 | from deepvoice3_pytorch import frontend 33 | from hparams import hparams, hparams_debug_string 34 | 35 | from tqdm import tqdm 36 | 37 | use_cuda = torch.cuda.is_available() 38 | device = torch.device("cuda" if use_cuda else "cpu") 39 | _frontend = None # to be set later 40 | 41 | 42 | def tts(model, text, p=0, speaker_id=None, fast=False): 43 | """Convert text to speech waveform given a deepvoice3 model. 44 | 45 | Args: 46 | text (str) : Input text to be synthesized 47 | p (float) : Probability of replacing a word with its pronunciation if p > 0. Default is 0. 48 | """ 49 | model = model.to(device) 50 | model.eval() 51 | if fast: 52 | model.make_generation_fast_() 53 | 54 | sequence = np.array(_frontend.text_to_sequence(text, p=p)) 55 | sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device) 56 | text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(device) 57 | speaker_ids = None if speaker_id is None else torch.LongTensor([speaker_id]).to(device) 58 | 59 | # Greedy decoding 60 | with torch.no_grad(): 61 | mel_outputs, linear_outputs, alignments, done = model( 62 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 63 | 64 | linear_output = linear_outputs[0].cpu().data.numpy() 65 | spectrogram = audio._denormalize(linear_output) 66 | alignment = alignments[0].cpu().data.numpy() 67 | mel = mel_outputs[0].cpu().data.numpy() 68 | mel = audio._denormalize(mel) 69 | 70 | # Predicted audio signal 71 | waveform = audio.inv_spectrogram(linear_output.T) 72 | 73 | return waveform, alignment, spectrogram, mel 74 | 75 | 76 | def _load(checkpoint_path): 77 | if use_cuda: 78 | checkpoint = torch.load(checkpoint_path) 79 | else: 80 | checkpoint = torch.load(checkpoint_path, 81 | map_location=lambda storage, loc: storage) 82 | return checkpoint 83 | 84 | 85 | if __name__ == "__main__": 86 | args = docopt(__doc__) 87 | print("Command line args:\n", args) 88 | checkpoint_path = args["<checkpoint>"] 89 | text_list_file_path = args["<text_list_file>"] 90 | dst_dir = args["<dst_dir>"] 91 | checkpoint_seq2seq_path = args["--checkpoint-seq2seq"] 92 | checkpoint_postnet_path = args["--checkpoint-postnet"] 93 | max_decoder_steps = int(args["--max-decoder-steps"]) 94 | file_name_suffix = args["--file-name-suffix"] 95 | replace_pronunciation_prob = float(args["--replace_pronunciation_prob"]) 96 | output_html = args["--output-html"] 97 | speaker_id = args["--speaker_id"] 98 | if speaker_id is not None: 99 | speaker_id = int(speaker_id) 100 | preset = args["--preset"] 101 | 102 | # Load preset if specified 103 | if preset is not None: 104 | with open(preset) as f: 105 | hparams.parse_json(f.read()) 106 | # Override hyper parameters 107 | hparams.parse(args["--hparams"]) 108 | assert hparams.name == "deepvoice3" 109 | 110 | _frontend = getattr(frontend, hparams.frontend) 111 | import train 112 | train._frontend = _frontend 113 | from train import plot_alignment, build_model 114 | 115 | # Model 116 | 
model = build_model() 117 | 118 | # Load checkpoints separately 119 | if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None: 120 | checkpoint = _load(checkpoint_seq2seq_path) 121 | model.seq2seq.load_state_dict(checkpoint["state_dict"]) 122 | checkpoint = _load(checkpoint_postnet_path) 123 | model.postnet.load_state_dict(checkpoint["state_dict"]) 124 | checkpoint_name = splitext(basename(checkpoint_seq2seq_path))[0] 125 | else: 126 | checkpoint = _load(checkpoint_path) 127 | model.load_state_dict(checkpoint["state_dict"]) 128 | checkpoint_name = splitext(basename(checkpoint_path))[0] 129 | 130 | model.seq2seq.decoder.max_decoder_steps = max_decoder_steps 131 | 132 | os.makedirs(dst_dir, exist_ok=True) 133 | with open(text_list_file_path, "rb") as f: 134 | lines = f.readlines() 135 | for idx, line in enumerate(lines): 136 | text = line.decode("utf-8")[:-1] 137 | words = nltk.word_tokenize(text) 138 | waveform, alignment, _, _ = tts( 139 | model, text, p=replace_pronunciation_prob, speaker_id=speaker_id, fast=True) 140 | dst_wav_path = join(dst_dir, "{}_{}{}.wav".format( 141 | idx, checkpoint_name, file_name_suffix)) 142 | dst_alignment_path = join( 143 | dst_dir, "{}_{}{}_alignment.png".format(idx, checkpoint_name, 144 | file_name_suffix)) 145 | plot_alignment(alignment.T, dst_alignment_path, 146 | info="{}, {}".format(hparams.builder, basename(checkpoint_path))) 147 | audio.save_wav(waveform, dst_wav_path) 148 | name = splitext(basename(text_list_file_path))[0] 149 | if output_html: 150 | print(""" 151 | {} 152 | 153 | ({} chars, {} words) 154 | 155 | 159 | 160 |
161 | """.format(text, len(text), len(words), 162 | hparams.builder, name, basename(dst_wav_path), 163 | hparams.builder, name, basename(dst_alignment_path))) 164 | else: 165 | print(idx, ": {}\n ({} chars, {} words)".format(text, len(text), len(words))) 166 | 167 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 168 | sys.exit(0) 169 | -------------------------------------------------------------------------------- /tests/data/ljspeech-mel-00001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r9y9/deepvoice3_pytorch/f90255c96177c344cd18b5a52651b420a4d8062d/tests/data/ljspeech-mel-00001.npy -------------------------------------------------------------------------------- /tests/test_audio.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join 6 | sys.path.insert(0, join(dirname(__file__), "..")) 7 | 8 | import numpy as np 9 | from nose.plugins.attrib import attr 10 | 11 | import logging 12 | logging.getLogger('tensorflow').disabled = True 13 | 14 | 15 | @attr("local_only") 16 | def test_amp_to_db(): 17 | import audio 18 | x = np.random.rand(10) 19 | x_hat = audio._db_to_amp(audio._amp_to_db(x)) 20 | assert np.allclose(x, x_hat) 21 | -------------------------------------------------------------------------------- /tests/test_conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | from deepvoice3_pytorch.conv import Conv1d 8 | 9 | 10 | def test_conv1d_incremental(): 11 | def __test(kernel_size, dilation, T, B, C, causual=True): 12 | dilation = (dilation,) 13 | 14 | # dilation = (4,) 15 | # causual 16 | assert causual 17 | if causual: 18 | padding = (kernel_size - 1) * dilation[0] 19 | else: 20 | padding = (kernel_size - 1) // 2 * dilation[0] 21 | 22 | # weight: (Cout, Cin, K) 23 | conv = nn.Conv1d( 24 | C, C * 2, kernel_size=kernel_size, padding=padding, 25 | dilation=dilation).eval() 26 | conv.weight.data.fill_(1.0) 27 | conv.bias.data.zero_() 28 | 29 | # weight: (K, Cin, Cout) 30 | # weight (linearized): (Cout*K, Cin) 31 | conv_online = Conv1d( 32 | C, C * 2, kernel_size=kernel_size, padding=padding, 33 | dilation=dilation).eval() 34 | conv_online.weight.data.fill_(1.0) 35 | conv_online.bias.data.zero_() 36 | 37 | # (B, C, T) 38 | bct = torch.zeros(B, C, T) + torch.arange(0, T).float() 39 | output_conv = conv(bct) 40 | 41 | # Remove future time stamps 42 | output_conv = output_conv[:, :, :T] 43 | 44 | output_conv_online = [] 45 | 46 | # B, T, C 47 | btc = bct.transpose(1, 2).contiguous() 48 | for t in range(btc.size(1)): 49 | input = btc[:, t, :].contiguous().view(B, -1, C) 50 | output = conv_online.incremental_forward(input) 51 | output_conv_online += [output] 52 | 53 | output_conv_online = torch.stack(output_conv_online).squeeze(2) 54 | output_conv_online = output_conv_online.transpose(0, 1).transpose(1, 2) 55 | 56 | assert (output_conv == output_conv_online).all() 57 | 58 | for B in [1, 4]: 59 | for T in [5, 10]: 60 | for C in [1, 2, 4]: 61 | for kernel_size in [2, 3]: 62 | for dilation in [1, 2, 3, 4, 5, 9, 27]: 63 | yield __test, kernel_size, dilation, T, B, C 64 | 
-------------------------------------------------------------------------------- /tests/test_deepvoice3.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join, exists 6 | 7 | from deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab 8 | 9 | import torch 10 | from torch import nn 11 | import numpy as np 12 | 13 | from nose.plugins.attrib import attr 14 | 15 | from deepvoice3_pytorch.builder import deepvoice3 16 | from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 17 | 18 | 19 | use_cuda = torch.cuda.is_available() and False 20 | torch.backends.cudnn.deterministic = True 21 | num_mels = 80 22 | num_freq = 513 23 | outputs_per_step = 4 24 | padding_idx = 0 25 | 26 | 27 | def _get_model(n_speakers=1, speaker_embed_dim=None, 28 | force_monotonic_attention=False, 29 | use_decoder_state_for_postnet_input=False, use_memory_mask=False): 30 | model = deepvoice3(n_vocab=n_vocab, 31 | embed_dim=32, 32 | mel_dim=num_mels, 33 | linear_dim=num_freq, 34 | r=outputs_per_step, 35 | padding_idx=padding_idx, 36 | n_speakers=n_speakers, 37 | speaker_embed_dim=speaker_embed_dim, 38 | dropout=1 - 0.95, 39 | kernel_size=5, 40 | encoder_channels=16, 41 | decoder_channels=32, 42 | converter_channels=32, 43 | force_monotonic_attention=force_monotonic_attention, 44 | use_decoder_state_for_postnet_input=use_decoder_state_for_postnet_input, 45 | use_memory_mask=use_memory_mask, 46 | ) 47 | return model 48 | 49 | 50 | def _pad(seq, max_len): 51 | return np.pad(seq, (0, max_len - len(seq)), 52 | mode='constant', constant_values=0) 53 | 54 | 55 | def _test_data(): 56 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 57 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 58 | input_lengths = np.array([len(s) for s in seqs]) 59 | max_len = np.max(input_lengths) 60 | seqs = np.array([_pad(s, max_len) for s in seqs]) 61 | 62 | # Test encoder 63 | x = torch.LongTensor(seqs) 64 | y = torch.rand(x.size(0), 12, 80) 65 | 66 | return x, y, input_lengths 67 | 68 | 69 | def _deepvoice3(n_vocab, embed_dim=256, mel_dim=80, 70 | linear_dim=4096, r=5, 71 | n_speakers=1, speaker_embed_dim=16, 72 | padding_idx=None, 73 | dropout=(1 - 0.95), dilation=1): 74 | 75 | from deepvoice3_pytorch.deepvoice3 import Encoder, Decoder, Converter 76 | h = 128 77 | encoder = Encoder( 78 | n_vocab, embed_dim, padding_idx=padding_idx, 79 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 80 | dropout=dropout, 81 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 82 | (h, 3, dilation), (h, 3, dilation)], 83 | ) 84 | 85 | h = 256 86 | decoder = Decoder( 87 | embed_dim, in_dim=mel_dim, r=r, padding_idx=padding_idx, 88 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 89 | dropout=dropout, 90 | preattention=[(h, 3, 1)], 91 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 92 | (h, 3, dilation), (h, 3, dilation)], 93 | attention=[True, False, False, False, True], 94 | force_monotonic_attention=False) 95 | 96 | seq2seq = AttentionSeq2Seq(encoder, decoder) 97 | 98 | in_dim = mel_dim 99 | h = 256 100 | converter = Converter(n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim, 101 | in_dim=in_dim, out_dim=linear_dim, dropout=dropout, 102 | convolutions=[(h, 3, dilation), (h, 3, dilation), (h, 3, dilation), 103 | (h, 3, dilation), (h, 3, dilation)]) 104 | 105 | 
model = MultiSpeakerTTSModel( 106 | seq2seq, converter, padding_idx=padding_idx, 107 | mel_dim=mel_dim, linear_dim=linear_dim, 108 | n_speakers=n_speakers, speaker_embed_dim=speaker_embed_dim) 109 | 110 | return model 111 | 112 | 113 | def test_single_speaker_deepvoice3(): 114 | x, y, lengths = _test_data() 115 | 116 | for v in [False, True]: 117 | model = _get_model(use_decoder_state_for_postnet_input=v) 118 | mel_outputs, linear_outputs, alignments, done = model(x, y, input_lengths=lengths) 119 | 120 | model = _get_model(use_memory_mask=True) 121 | mel_outputs, linear_outputs, alignments, done = model(x, y, input_lengths=lengths) 122 | 123 | 124 | def _pad_2d(x, max_len, b_pad=0): 125 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)], 126 | mode="constant", constant_values=0) 127 | return x 128 | 129 | 130 | def test_multi_speaker_deepvoice3(): 131 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 132 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 133 | input_lengths = np.array([len(s) for s in seqs]) 134 | max_len = np.max(input_lengths) 135 | seqs = np.array([_pad(s, max_len) for s in seqs]) 136 | 137 | # Test encoder 138 | x = torch.LongTensor(seqs) 139 | y = torch.rand(x.size(0), 4 * 33, 80) 140 | model = _get_model(n_speakers=32, speaker_embed_dim=16) 141 | speaker_ids = torch.LongTensor([1, 2, 3]) 142 | 143 | mel_outputs, linear_outputs, alignments, done = model(x, y, speaker_ids=speaker_ids) 144 | print("Input text:", x.size()) 145 | print("Input mel:", y.size()) 146 | print("Mel:", mel_outputs.size()) 147 | print("Linear:", linear_outputs.size()) 148 | print("Alignments:", alignments.size()) 149 | print("Done:", done.size()) 150 | 151 | 152 | @attr("issue38") 153 | def test_incremental_path_multiple_times(): 154 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 155 | seqs = np.array([text_to_sequence(t) for t in texts]) 156 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 157 | 158 | r = 4 159 | mel_dim = 80 160 | sequence = torch.LongTensor(seqs) 161 | text_positions = torch.LongTensor(text_positions) 162 | 163 | for model, speaker_ids in [ 164 | (_get_model(force_monotonic_attention=False), None), 165 | (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), torch.LongTensor([1]))]: 166 | model.eval() 167 | 168 | # first call 169 | mel_outputs, linear_outputs, alignments, done = model( 170 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 171 | 172 | # second call 173 | mel_outputs2, linear_outputs2, alignments2, done2 = model( 174 | sequence, text_positions=text_positions, speaker_ids=speaker_ids) 175 | 176 | # Should get same result 177 | c = (mel_outputs - mel_outputs2).abs() 178 | print(c.mean(), c.max()) 179 | 180 | assert np.allclose(mel_outputs.cpu().data.numpy(), 181 | mel_outputs2.cpu().data.numpy(), atol=1e-5) 182 | 183 | 184 | def test_incremental_correctness(): 185 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 186 | seqs = np.array([text_to_sequence(t) for t in texts]) 187 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 188 | 189 | mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy") 190 | mel = np.load(mel_path) 191 | max_target_len = mel.shape[0] 192 | r = 4 193 | mel_dim = 80 194 | if max_target_len % r != 0: 195 | max_target_len += r - max_target_len % r 196 | assert max_target_len % r == 0 197 | mel = _pad_2d(mel, 
max_target_len) 198 | mel = torch.from_numpy(mel) 199 | mel_reshaped = mel.contiguous().view(1, -1, mel_dim * r) 200 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 201 | 202 | x = torch.LongTensor(seqs) 203 | text_positions = torch.LongTensor(text_positions) 204 | frame_positions = torch.LongTensor(frame_positions) 205 | 206 | for model, speaker_ids in [ 207 | (_get_model(force_monotonic_attention=False), None), 208 | (_get_model(force_monotonic_attention=False, n_speakers=32, speaker_embed_dim=16), torch.LongTensor([1]))]: 209 | model.eval() 210 | 211 | if speaker_ids is not None: 212 | speaker_embed = model.embed_speakers(speaker_ids) 213 | else: 214 | speaker_embed = None 215 | 216 | # Encoder 217 | encoder_outs = model.seq2seq.encoder(x, speaker_embed=speaker_embed) 218 | 219 | # Off line decoding 220 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( 221 | encoder_outs, mel_reshaped, speaker_embed=speaker_embed, 222 | text_positions=text_positions, frame_positions=frame_positions) 223 | 224 | # Online decoding with test inputs 225 | model.seq2seq.decoder.start_fresh_sequence() 226 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( 227 | encoder_outs, text_positions, speaker_embed=speaker_embed, 228 | test_inputs=mel_reshaped) 229 | 230 | # Should get same result 231 | c = (mel_outputs_offline - mel_outputs_online).abs() 232 | print(c.mean(), c.max()) 233 | 234 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(), 235 | mel_outputs_online.cpu().data.numpy(), atol=1e-5) 236 | 237 | 238 | @attr("local_only") 239 | def test_incremental_forward(): 240 | checkpoint_path = join(dirname(__file__), "../test_whole/checkpoint_step000265000.pth") 241 | if not exists(checkpoint_path): 242 | return 243 | model = _get_model() 244 | 245 | use_cuda = False 246 | 247 | checkpoint = torch.load(checkpoint_path) 248 | model.load_state_dict(checkpoint["state_dict"]) 249 | model.make_generation_fast_() 250 | model = model.cuda() if use_cuda else model 251 | 252 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 253 | seqs = np.array([text_to_sequence(t) for t in texts]) 254 | input_lengths = [len(s) for s in seqs] 255 | 256 | use_manual_padding = False 257 | if use_manual_padding: 258 | max_input_len = np.max(input_lengths) + 10 # manuall padding 259 | seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=np.int) 260 | input_lengths = torch.LongTensor(input_lengths) 261 | input_lengths = input_lengths.cuda() if use_cuda else input_lengths 262 | else: 263 | input_lengths = None 264 | 265 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 266 | 267 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 268 | max_target_len = mel.shape[0] 269 | r = 4 270 | mel_dim = 80 271 | if max_target_len % r != 0: 272 | max_target_len += r - max_target_len % r 273 | assert max_target_len % r == 0 274 | mel = _pad_2d(mel, max_target_len) 275 | mel = torch.from_numpy(mel) 276 | mel_reshaped = mel.contiguous().view(1, -1, mel_dim * r) 277 | 278 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 279 | 280 | x = torch.LongTensor(seqs) 281 | text_positions = torch.LongTensor(text_positions) 282 | frame_positions = torch.LongTensor(frame_positions) 283 | 284 | if use_cuda: 285 | x = x.cuda() 286 | text_positions = text_positions.cuda() 287 | 
frame_positions = frame_positions.cuda() 288 | mel_reshaped = mel_reshaped.cuda() 289 | 290 | model.eval() 291 | 292 | def _plot(mel, mel_predicted, alignments): 293 | from matplotlib import pylab as plt 294 | plt.figure(figsize=(16, 10)) 295 | plt.subplot(3, 1, 1) 296 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", 297 | aspect="auto", cmap="magma") 298 | plt.colorbar() 299 | 300 | plt.subplot(3, 1, 2) 301 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, 302 | origin="lower bottom", aspect="auto", cmap="magma") 303 | plt.colorbar() 304 | 305 | plt.subplot(3, 1, 3) 306 | if alignments.dim() == 4: 307 | alignments = alignments.mean(0) 308 | plt.imshow(alignments[0].data.cpu( 309 | ).numpy().T, origin="lower bottom", aspect="auto") 310 | plt.colorbar() 311 | plt.show() 312 | 313 | # Encoder 314 | encoder_outs = model.seq2seq.encoder(x, lengths=input_lengths) 315 | 316 | # Off line decoding 317 | mel_output_offline, alignments_offline, done = model.seq2seq.decoder( 318 | encoder_outs, mel_reshaped, 319 | text_positions=text_positions, frame_positions=frame_positions, 320 | lengths=input_lengths) 321 | 322 | _plot(mel, mel_output_offline, alignments_offline) 323 | 324 | # Online decoding 325 | test_inputs = None 326 | # test_inputs = mel_reshaped 327 | model.seq2seq.decoder.start_fresh_sequence() 328 | mel_outputs, alignments, dones_online = model.seq2seq.decoder.incremental_forward( 329 | encoder_outs, text_positions, 330 | # initial_input=mel_reshaped[:, :1, :], 331 | test_inputs=test_inputs) 332 | 333 | if test_inputs is not None: 334 | c = (mel_output_offline - mel_outputs).abs() 335 | print(c.mean(), c.max()) 336 | _plot(mel, c, alignments) 337 | 338 | _plot(mel, mel_outputs, alignments) 339 | -------------------------------------------------------------------------------- /tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import torch 5 | from torch import nn 6 | from deepvoice3_pytorch.modules import SinusoidalEncoding, position_encoding_init 7 | import numpy as np 8 | 9 | 10 | def test_sinusoidal(): 11 | num_embedding = 512 12 | embedding_dim = 128 13 | 14 | for w in [1.0, 0.5, 2.0, 10.0, 20.0]: 15 | a = nn.Embedding(num_embedding, embedding_dim, padding_idx=0) 16 | a.weight.data = position_encoding_init( 17 | num_embedding, embedding_dim, position_rate=w) 18 | 19 | b = SinusoidalEncoding(num_embedding, embedding_dim) 20 | 21 | x = torch.arange(0, 128).long() 22 | ax = a(x).data.numpy() 23 | bx = b(x, w).data.numpy() 24 | 25 | print(w, np.abs(ax - bx).mean()) 26 | try: 27 | assert np.allclose(ax, bx) 28 | except: 29 | print("TODO: has little numerical errors?") 30 | assert np.abs(ax - bx).mean() < 1e-5 31 | -------------------------------------------------------------------------------- /tests/test_frontend.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | from deepvoice3_pytorch import frontend 5 | from nose.plugins.attrib import attr 6 | 7 | eos = 1 8 | 9 | 10 | def test_en(): 11 | f = getattr(frontend, "en") 12 | seq = f.text_to_sequence("hello world.") 13 | assert seq[-1] == eos 14 | t = f.sequence_to_text(seq) 15 | assert t == "hello world.~" 16 | 17 | 18 | @attr("local_only") 19 | def test_ja(): 20 | f = getattr(frontend, "jp") 21 | seq = 
f.text_to_sequence("こんにちわ") 22 | assert seq[-1] == eos 23 | t = f.sequence_to_text(seq) 24 | assert t[:-1] == "コンニチワ。" 25 | 26 | 27 | @attr("local_only") 28 | def test_en_lj(): 29 | f = getattr(frontend, "en") 30 | from nnmnkwii.datasets import ljspeech 31 | from tqdm import trange 32 | import jaconv 33 | 34 | d = ljspeech.TranscriptionDataSource("/home/ryuichi/data/LJSpeech-1.0") 35 | texts = d.collect_files() 36 | 37 | for p in [0.0, 0.5, 1.0]: 38 | for idx in trange(len(texts)): 39 | text = texts[idx] 40 | seq = f.text_to_sequence(text, p=p) 41 | assert seq[-1] == eos 42 | t = f.sequence_to_text(seq) 43 | 44 | if idx < 10: 45 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t)) 46 | 47 | 48 | @attr("local_only") 49 | def test_ja_jsut(): 50 | f = getattr(frontend, "jp") 51 | from nnmnkwii.datasets import jsut 52 | from tqdm import trange 53 | import jaconv 54 | 55 | d = jsut.TranscriptionDataSource("/home/ryuichi/data/jsut_ver1.1/", 56 | subsets=jsut.available_subsets) 57 | texts = d.collect_files() 58 | 59 | for p in [0.0, 0.5, 1.0]: 60 | for idx in trange(len(texts)): 61 | text = texts[idx] 62 | seq = f.text_to_sequence(text, p=p) 63 | assert seq[-1] == eos 64 | t = f.sequence_to_text(seq) 65 | 66 | if idx < 10: 67 | print("""{0}: {1}\n{0}: {2}\n""".format(idx, text, t)) 68 | -------------------------------------------------------------------------------- /tests/test_nyanko.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join, exists 6 | 7 | from deepvoice3_pytorch.frontend.en import text_to_sequence, n_vocab 8 | 9 | import torch 10 | from torch import nn 11 | import numpy as np 12 | 13 | from nose.plugins.attrib import attr 14 | 15 | from deepvoice3_pytorch.builder import nyanko 16 | from deepvoice3_pytorch import MultiSpeakerTTSModel, AttentionSeq2Seq 17 | 18 | use_cuda = torch.cuda.is_available() and False 19 | num_mels = 80 20 | num_freq = 513 21 | outputs_per_step = 4 22 | padding_idx = 0 23 | 24 | 25 | def _pad(seq, max_len): 26 | return np.pad(seq, (0, max_len - len(seq)), 27 | mode='constant', constant_values=0) 28 | 29 | 30 | def _test_data(): 31 | texts = ["Thank you very much.", "Hello.", "Deep voice 3."] 32 | seqs = [np.array(text_to_sequence(t), dtype=np.int) for t in texts] 33 | input_lengths = np.array([len(s) for s in seqs]) 34 | max_len = np.max(input_lengths) 35 | seqs = np.array([_pad(s, max_len) for s in seqs]) 36 | 37 | # Test encoder 38 | x = torch.LongTensor(seqs) 39 | y = torch.rand(x.size(0), 12, 80) 40 | 41 | return x, y 42 | 43 | 44 | def _pad_2d(x, max_len, b_pad=0): 45 | x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)], 46 | mode="constant", constant_values=0) 47 | return x 48 | 49 | 50 | def test_nyanko_basics(): 51 | x, y = _test_data() 52 | 53 | for v in [False, True]: 54 | model = nyanko(n_vocab, mel_dim=num_mels, linear_dim=num_freq, r=1, downsample_step=4, 55 | use_decoder_state_for_postnet_input=v) 56 | mel_outputs, linear_outputs, alignments, done = model(x, y) 57 | 58 | 59 | @attr("issue38") 60 | def test_incremental_path_multiple_times(): 61 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 62 | seqs = np.array([text_to_sequence(t) for t in texts]) 63 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 64 | 65 | r = 1 66 | mel_dim = 80 67 | 68 | sequence = torch.LongTensor(seqs) 69 | 
text_positions = torch.LongTensor(text_positions) 70 | 71 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, 72 | r=r, force_monotonic_attention=False) 73 | model.eval() 74 | 75 | # first call 76 | mel_outputs, linear_outputs, alignments, done = model( 77 | sequence, text_positions=text_positions, speaker_ids=None) 78 | 79 | # second call 80 | mel_outputs2, linear_outputs2, alignments2, done2 = model( 81 | sequence, text_positions=text_positions, speaker_ids=None) 82 | 83 | # Should get same result 84 | c = (mel_outputs - mel_outputs2).abs() 85 | print(c.mean(), c.max()) 86 | 87 | assert np.allclose(mel_outputs.cpu().data.numpy(), 88 | mel_outputs2.cpu().data.numpy(), atol=1e-5) 89 | 90 | 91 | def test_incremental_correctness(): 92 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 93 | seqs = np.array([text_to_sequence(t) for t in texts]) 94 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 95 | 96 | mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy") 97 | mel = np.load(mel_path)[::4] 98 | max_target_len = mel.shape[0] 99 | r = 1 100 | mel_dim = 80 101 | if max_target_len % r != 0: 102 | max_target_len += r - max_target_len % r 103 | assert max_target_len % r == 0 104 | mel = _pad_2d(mel, max_target_len) 105 | mel = torch.from_numpy(mel) 106 | mel_reshaped = mel.view(1, -1, mel_dim * r) 107 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 108 | 109 | x = torch.LongTensor(seqs) 110 | text_positions = torch.LongTensor(text_positions) 111 | frame_positions = torch.LongTensor(frame_positions) 112 | 113 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4, 114 | r=r, force_monotonic_attention=False) 115 | model.eval() 116 | 117 | # Encoder 118 | encoder_outs = model.seq2seq.encoder(x) 119 | 120 | # Off line decoding 121 | mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder( 122 | encoder_outs, mel_reshaped, 123 | text_positions=text_positions, frame_positions=frame_positions) 124 | 125 | # Online decoding with test inputs 126 | model.seq2seq.decoder.start_fresh_sequence() 127 | mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward( 128 | encoder_outs, text_positions, 129 | test_inputs=mel_reshaped) 130 | 131 | # Should get same result 132 | assert np.allclose(mel_outputs_offline.cpu().data.numpy(), 133 | mel_outputs_online.cpu().data.numpy()) 134 | 135 | 136 | @attr("local_only") 137 | def test_nyanko(): 138 | texts = ["they discarded this for a more completely Roman and far less beautiful letter."] 139 | seqs = np.array([text_to_sequence(t) for t in texts]) 140 | text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0])) 141 | 142 | mel = np.load("/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy") 143 | max_target_len = mel.shape[0] 144 | r = 1 145 | mel_dim = 80 146 | if max_target_len % r != 0: 147 | max_target_len += r - max_target_len % r 148 | assert max_target_len % r == 0 149 | mel = _pad_2d(mel, max_target_len) 150 | mel = torch.from_numpy(mel) 151 | mel_reshaped = mel.view(1, -1, mel_dim * r) 152 | frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1)) 153 | 154 | x = torch.LongTensor(seqs) 155 | text_positions = torch.LongTensor(text_positions) 156 | frame_positions = torch.LongTensor(frame_positions) 157 | 158 | model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, 
downsample_step=4, 159 | r=r, force_monotonic_attention=False) 160 | model.eval() 161 | 162 | def _plot(mel, mel_predicted, alignments): 163 | from matplotlib import pylab as plt 164 | plt.figure(figsize=(16, 10)) 165 | plt.subplot(3, 1, 1) 166 | plt.imshow(mel.data.cpu().numpy().T, origin="lower bottom", aspect="auto", cmap="magma") 167 | plt.colorbar() 168 | 169 | plt.subplot(3, 1, 2) 170 | plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T, 171 | origin="lower bottom", aspect="auto", cmap="magma") 172 | plt.colorbar() 173 | 174 | plt.subplot(3, 1, 3) 175 | if alignments.dim() == 4: 176 | alignments = alignments.mean(0) 177 | plt.imshow(alignments[0].data.cpu( 178 | ).numpy().T, origin="lower bottom", aspect="auto") 179 | plt.colorbar() 180 | plt.show() 181 | 182 | seq2seq = model.seq2seq 183 | 184 | # Encoder 185 | encoder_outs = seq2seq.encoder(x) 186 | 187 | # Off line decoding 188 | print("Offline decoding") 189 | mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder( 190 | encoder_outs, mel_reshaped, 191 | text_positions=text_positions, frame_positions=frame_positions) 192 | 193 | _plot(mel, mel_outputs_offline, alignments_offline) 194 | 195 | # Online decoding with test inputs 196 | print("Online decoding") 197 | seq2seq.decoder.start_fresh_sequence() 198 | mel_outputs_online, alignments, dones_online, _ = seq2seq.decoder.incremental_forward( 199 | encoder_outs, text_positions, 200 | test_inputs=mel_reshaped) 201 | 202 | a = mel_outputs_offline.cpu().data.numpy() 203 | b = mel_outputs_online.cpu().data.numpy() 204 | c = (mel_outputs_offline - mel_outputs_online).abs() 205 | print(c.mean(), c.max()) 206 | 207 | _plot(mel, mel_outputs_offline, alignments_offline) 208 | _plot(mel, mel_outputs_online, alignments) 209 | _plot(mel, c, alignments) 210 | 211 | # Should get same result 212 | assert np.allclose(a, b) 213 | 214 | postnet = model.postnet 215 | 216 | linear_outputs = postnet(mel_outputs_offline) 217 | print(linear_outputs.size()) 218 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E305,E402,E721,E741,F401,F403,F405,F821,F841,F999 4 | exclude = docs/,data,build,dist,notebooks,checkpoints*,legacy,vctk_preprocess,nikl_preprocess 5 | -------------------------------------------------------------------------------- /vctk.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import vctk 7 | from nnmnkwii.io import hts 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | 13 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 14 | executor = ProcessPoolExecutor(max_workers=num_workers) 15 | futures = [] 16 | 17 | speakers = vctk.available_speakers 18 | 19 | td = vctk.TranscriptionDataSource(in_dir, speakers=speakers) 20 | transcriptions = td.collect_files() 21 | speaker_ids = td.labels 22 | wav_paths = vctk.WavFileDataSource( 23 | in_dir, speakers=speakers).collect_files() 24 | 25 | for index, (speaker_id, text, wav_path) in enumerate( 26 | zip(speaker_ids, transcriptions, wav_paths)): 27 | futures.append(executor.submit( 28 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text))) 29 | return 
[future.result() for future in tqdm(futures)] 30 | 31 | 32 | def start_at(labels): 33 | has_silence = labels[0][-1] == "pau" 34 | if not has_silence: 35 | return labels[0][0] 36 | for i in range(1, len(labels)): 37 | if labels[i][-1] != "pau": 38 | return labels[i][0] 39 | assert False 40 | 41 | 42 | def end_at(labels): 43 | has_silence = labels[-1][-1] == "pau" 44 | if not has_silence: 45 | return labels[-1][1] 46 | for i in range(len(labels) - 2, 0, -1): 47 | if labels[i][-1] != "pau": 48 | return labels[i][1] 49 | assert False 50 | 51 | 52 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 53 | sr = hparams.sample_rate 54 | 55 | # Load the audio to a numpy array: 56 | wav = audio.load_wav(wav_path) 57 | 58 | lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab") 59 | 60 | # Trim silence from hts labels if available 61 | if exists(lab_path): 62 | labels = hts.load(lab_path) 63 | b = int(start_at(labels) * 1e-7 * sr) 64 | e = int(end_at(labels) * 1e-7 * sr) 65 | wav = wav[b:e] 66 | wav, _ = librosa.effects.trim(wav, top_db=25) 67 | else: 68 | wav, _ = librosa.effects.trim(wav, top_db=15) 69 | 70 | if hparams.rescaling: 71 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 72 | 73 | # Compute the linear-scale spectrogram from the wav: 74 | spectrogram = audio.spectrogram(wav).astype(np.float32) 75 | n_frames = spectrogram.shape[1] 76 | 77 | # Compute a mel-scale spectrogram from the wav: 78 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 79 | 80 | # Write the spectrograms to disk: 81 | spectrogram_filename = 'vctk-spec-%05d.npy' % index 82 | mel_filename = 'vctk-mel-%05d.npy' % index 83 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 84 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 85 | 86 | # Return a tuple describing this training example: 87 | return (spectrogram_filename, mel_filename, n_frames, text, speaker_id) 88 | -------------------------------------------------------------------------------- /vctk_preprocess/.gitignore: -------------------------------------------------------------------------------- 1 | latest_features 2 | tts_env.sh 3 | -------------------------------------------------------------------------------- /vctk_preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing for VCTK 2 | 3 | Wav files in VCTK contains lots of long silences, which affects training char-level seq2seq models. To deal with the problem, we will 4 | 5 | - **Prepare phoneme alignments for all utterances** (code in the directory) 6 | - Cut silences during preprocessing (code in the parent directory) 7 | 8 | ## Note 9 | 10 | Code in the directory heavily relies on https://gist.github.com/kastnerkyle/cc0ac48d34860c5bb3f9112f4d9a0300 (which is hard copied in the repo). If you have any issues, please make sure that you can successfully run the script. 11 | 12 | ## Steps 13 | 14 | 1. Download VCTK: http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html 15 | 2. Install HTK/speech_tools/festival/merlin and prepare `tts_env.sh`. If you don't have speech tools, you can install them by https://gist.github.com/kastnerkyle/001a58a58d090658ee5350cb6129f857. 
For reference, my `tts_env.sh` looks like this: 16 | ``` 17 | export ESTDIR=/home/ryuichi/Dropbox/sp/speech_tools/ 18 | export FESTDIR=/home/ryuichi/Dropbox/sp/festival/ 19 | export FESTVOXDIR=/home/ryuichi/Dropbox/sp/festvox/ 20 | export VCTKDIR=/home/ryuichi/data/VCTK-Corpus/ 21 | export HTKDIR=/usr/local/HTS-2.3/bin/ 22 | export SPTKDIR=/usr/local/bin/ 23 | export MERLINDIR=/home/ryuichi/Dropbox/sp/merlin_pr/ 24 | ``` 25 | 3. Run the script (takes ~24 hours) 26 | ``` 27 | python prepare_vctk_labels.py ${your_vctk_dir} ${dst_dir} 28 | ``` 29 | This will process all utterances of VCTK and copy HTK-style alignments to `${dst_dir}`. 30 | It is recommended to copy the alignments to the top of the VCTK corpus, i.e., 31 | ``` 32 | python prepare_vctk_labels.py ~/data/VCTK-Corpus ~/data/VCTK-Corpus/lab 33 | ``` 34 | 35 | After the above steps, you will get alignments as follows: 36 | 37 | ``` 38 | tree ~/data/VCTK-Corpus/lab/ | head /home/ryuichi/data/VCTK-Corpus/lab/ 39 | ├── p225 40 | │   ├── p225_001.lab 41 | │   ├── p225_002.lab 42 | │   ├── p225_003.lab 43 | │   ├── p225_004.lab 44 | │   ├── p225_005.lab 45 | │   ├── p225_006.lab 46 | │   ├── p225_007.lab 47 | │   ├── p225_008.lab 48 | ``` 49 | 50 | ``` 51 | cat ~/data/VCTK-Corpus/lab/p225/p225_001.lab 52 | 53 | 0 850000 pau 54 | 850000 2850000 pau 55 | 2850000 3600000 p 56 | 3600000 3900000 l 57 | 3900000 6000000 iy 58 | 6000000 8450000 z 59 | 8450000 8600000 k 60 | 8600000 11300000 ao 61 | 11300000 11450000 l 62 | 11450000 12800000 s 63 | 12800000 13099999 t 64 | 13099999 15800000 eh 65 | 15800000 16050000 l 66 | 16050000 17600000 ax 67 | 17600000 20400000 pau 68 | ``` 69 | 70 | ## Using Gentle? 71 | 72 | `prepare_htk_alignments_vctk.py` does the same thing as above using [Gentle](https://github.com/lowerquality/gentle), but it turned out not to work very well. The code is left here in case it can be improved in the future. 73 | -------------------------------------------------------------------------------- /vctk_preprocess/prepare_htk_alignments_vctk.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Script to do forced alignment with Gentle for VCTK. This script takes approximately 4 | 40 hours to finish. It processes all utterances in VCTK. 5 | 6 | NOTE: Must be run with Python 2, since Gentle doesn't work with Python 3. 7 | 8 | Usage: 9 | 1. Install https://github.com/lowerquality/gentle 10 | 2. 
Download VCTK http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html 11 | 12 | and then run the script by: 13 | 14 | python2 prepare_htk_alignments_vctk.py ${your_vctk_data_path} 15 | 16 | After running the script, you will see alignment files in `lab` directory as 17 | follows: 18 | 19 | > tree ~/data/VCTK-Corpus/ -d -L 20 | 21 | /home/ryuichi/data/VCTK-Corpus/ 22 | ├── lab 23 | ├── txt 24 | └── wav48 25 | """ 26 | import argparse 27 | import logging 28 | import multiprocessing 29 | import os 30 | import sys 31 | from tqdm import tqdm 32 | import json 33 | from os.path import join, basename, dirname, exists 34 | import numpy as np 35 | 36 | import gentle 37 | import librosa 38 | from nnmnkwii.datasets import vctk 39 | 40 | 41 | def on_progress(p): 42 | for k, v in p.items(): 43 | logging.debug("%s: %s" % (k, v)) 44 | 45 | 46 | def write_hts_label(labels, lab_path): 47 | lab = "" 48 | for s, e, l in labels: 49 | s, e = float(s) * 1e7, float(e) * 1e7 50 | s, e = int(s), int(e) 51 | lab += "{} {} {}\n".format(s, e, l) 52 | print(lab) 53 | with open(lab_path, "w") as f: 54 | f.write(lab) 55 | 56 | 57 | def json2hts(data): 58 | emit_bos = False 59 | emit_eos = False 60 | 61 | phone_start = 0 62 | phone_end = None 63 | labels = [] 64 | 65 | for word in data["words"]: 66 | case = word["case"] 67 | if case != "success": 68 | raise RuntimeError("Alignment failed") 69 | start = float(word["start"]) 70 | word_end = float(word["end"]) 71 | 72 | if not emit_bos: 73 | labels.append((phone_start, start, "silB")) 74 | emit_bos = True 75 | 76 | phone_start = start 77 | phone_end = None 78 | for phone in word["phones"]: 79 | ph = str(phone["phone"][:-2]) 80 | duration = float(phone["duration"]) 81 | phone_end = phone_start + duration 82 | labels.append((phone_start, phone_end, ph)) 83 | phone_start += duration 84 | assert np.allclose(phone_end, word_end) 85 | if not emit_eos: 86 | labels.append((phone_start, phone_end, "silE")) 87 | emit_eos = True 88 | 89 | return labels 90 | 91 | 92 | if __name__ == "__main__": 93 | parser = argparse.ArgumentParser( 94 | description='Do force alignment for VCTK and save HTK-style alignments') 95 | parser.add_argument( 96 | '--nthreads', default=multiprocessing.cpu_count(), type=int, 97 | help='number of alignment threads') 98 | parser.add_argument( 99 | '--conservative', dest='conservative', action='store_true', 100 | help='conservative alignment') 101 | parser.set_defaults(conservative=False) 102 | parser.add_argument( 103 | '--disfluency', dest='disfluency', action='store_true', 104 | help='include disfluencies (uh, um) in alignment') 105 | parser.set_defaults(disfluency=False) 106 | parser.add_argument( 107 | '--log', default="INFO", 108 | help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)') 109 | parser.add_argument('data_root', type=str, help='Data root') 110 | 111 | args = parser.parse_args() 112 | 113 | log_level = args.log.upper() 114 | logging.getLogger().setLevel(log_level) 115 | disfluencies = set(['uh', 'um']) 116 | 117 | data_root = args.data_root 118 | 119 | # Do for all speakers 120 | speakers = vctk.available_speakers 121 | 122 | # Collect all transcripts/wav files 123 | td = vctk.TranscriptionDataSource(data_root, speakers=speakers) 124 | transcriptions = td.collect_files() 125 | wav_paths = vctk.WavFileDataSource( 126 | data_root, speakers=speakers).collect_files() 127 | 128 | # Save dir 129 | save_dir = join(data_root, "lab") 130 | if not exists(save_dir): 131 | os.makedirs(save_dir) 132 | 133 | resources = gentle.Resources() 134 | 
135 | for idx in tqdm(range(len(wav_paths))): 136 | transcript = transcriptions[idx] 137 | audiofile = wav_paths[idx] 138 | lab_path = audiofile.replace("wav48/", "lab/").replace(".wav", ".lab") 139 | print(transcript) 140 | print(audiofile) 141 | print(lab_path) 142 | lab_dir = dirname(lab_path) 143 | if not exists(lab_dir): 144 | os.makedirs(lab_dir) 145 | 146 | logging.info("converting audio to 8K sampled wav") 147 | with gentle.resampled(audiofile) as wavfile: 148 | logging.info("starting alignment") 149 | aligner = gentle.ForcedAligner(resources, transcript, 150 | nthreads=args.nthreads, 151 | disfluency=args.disfluency, 152 | conservative=args.conservative, 153 | disfluencies=disfluencies) 154 | result = aligner.transcribe( 155 | wavfile, progress_cb=on_progress, logging=logging) 156 | 157 | # convert to HTK format 158 | a = json.loads(result.to_json()) 159 | try: 160 | labels = json2hts(a) 161 | except RuntimeError as e: 162 | from warnings import warn 163 | warn(str(e)) 164 | continue 165 | 166 | # Insert end time 167 | x, sr = librosa.load(wavfile, sr=8000) 168 | endtime = float(len(x)) / sr 169 | labels[-1] = (labels[-1][0], endtime, labels[-1][-1]) 170 | 171 | # write to file 172 | write_hts_label(labels, lab_path) 173 | -------------------------------------------------------------------------------- /vctk_preprocess/prepare_vctk_labels.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Prepare HTS alignments for VCTK. 4 | 5 | usage: prepare_vctk_labels.py [options] <data_root> <out_dir> 6 | 7 | options: 8 | -h, --help Show help message. 9 | """ 10 | from docopt import docopt 11 | import os 12 | from nnmnkwii.datasets import vctk 13 | from os.path import join, exists, splitext, basename 14 | import sys 15 | from glob import glob 16 | 17 | from subprocess import Popen, PIPE 18 | from tqdm import tqdm 19 | 20 | 21 | def do(cmd): 22 | print(cmd) 23 | p = Popen(cmd, shell=True) 24 | p.wait() 25 | 26 | 27 | if __name__ == "__main__": 28 | args = docopt(__doc__) 29 | data_root = args["<data_root>"] 30 | out_dir = args["<out_dir>"] 31 | 32 | for idx in tqdm(range(len(vctk.available_speakers))): 33 | speaker = vctk.available_speakers[idx] 34 | 35 | wav_root = join(data_root, "wav48/p{}".format(speaker)) 36 | txt_root = join(data_root, "txt/p{}".format(speaker)) 37 | assert exists(wav_root) 38 | assert exists(txt_root) 39 | print(wav_root, txt_root) 40 | 41 | # Do alignments 42 | cmd = "python ./extract_feats.py -w {} -t {}".format(wav_root, txt_root) 43 | do(cmd) 44 | 45 | # Copy 46 | lab_dir = join(out_dir, "p{}".format(speaker)) 47 | if not exists(lab_dir): 48 | os.makedirs(lab_dir) 49 | cmd = "cp ./latest_features/merlin/misc/scripts/alignment/phone_align/full-context-labels/mono/*.lab {}".format( 50 | lab_dir) 51 | do(cmd) 52 | 53 | # Remove 54 | do("rm -rf ./latest_features") 55 | 56 | sys.exit(0) 57 | --------------------------------------------------------------------------------
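The preset files and `synthesis.py` above are tied together by `hparams.parse_json`. For reference, here is a minimal programmatic sketch of that flow; it mirrors the `__main__` block of `synthesis.py`, the checkpoint path and output file name are placeholders, and it assumes the repository root is the working directory with the `bin` extras installed.

```
# Minimal sketch mirroring synthesis.py's __main__ (checkpoint path and output name are placeholders).
import audio
import synthesis
import train
from deepvoice3_pytorch import frontend
from hparams import hparams
from synthesis import tts, _load
from train import build_model

# 1. Load one of the shipped presets so hparams describes the model to build.
with open("presets/nyanko_ljspeech.json") as f:
    hparams.parse_json(f.read())

# 2. Resolve the text frontend named by the preset ("en" here) for both modules.
synthesis._frontend = getattr(frontend, hparams.frontend)
train._frontend = synthesis._frontend

# 3. Build the model and restore weights from a trained checkpoint (placeholder path).
model = build_model()
checkpoint = _load("checkpoints/checkpoint_step000585000.pth")
model.load_state_dict(checkpoint["state_dict"])

# 4. Greedy decoding; tts() returns (waveform, alignment, spectrogram, mel).
waveform, alignment, _, _ = tts(model, "Hello world.", p=0.0, speaker_id=None, fast=True)
audio.save_wav(waveform, "hello_world.wav")
```

The CLI in the docstring of `synthesis.py` does the same thing, with the preset passed through `--preset` and the checkpoint, text list file, and output directory given as positional arguments.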