├── code ├── examples ├── lib │ ├── tfbldr │ │ ├── test │ │ │ ├── __init__.py │ │ │ ├── test_import.py │ │ │ └── test_simple.py │ │ ├── .gitignore │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ ├── plot │ │ │ ├── __init__.py │ │ │ └── audio.py │ │ ├── datasets │ │ │ ├── text │ │ │ │ ├── cleaning │ │ │ │ │ ├── README │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── symbols.py │ │ │ │ │ ├── cmudict.py │ │ │ │ │ ├── numbers.py │ │ │ │ │ ├── cleaners.py │ │ │ │ │ └── number_to_words.py │ │ │ │ └── __init__.py │ │ │ ├── audio │ │ │ │ ├── __init__.py │ │ │ │ └── magrecnp.py │ │ │ ├── music │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── plotters.py │ │ ├── __init__.py │ │ ├── core │ │ │ └── __init__.py │ │ ├── misc_scripts │ │ │ └── speech_itr_test.py │ │ └── nodes │ │ │ └── __init__.py │ ├── tfbldr.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ ├── requires.txt │ │ ├── PKG-INFO │ │ └── SOURCES.txt │ ├── README.rst │ ├── examples │ │ └── unaligned_ljspeech_chars │ │ │ ├── wavenet_stuff │ │ │ ├── audio.py │ │ │ ├── hparams.py │ │ │ ├── train.py │ │ │ ├── synthesis.py │ │ │ ├── lrschedule.py │ │ │ ├── wavenet_vocoder_core │ │ │ ├── wavenet_vocoder │ │ │ │ ├── MANIFEST.in │ │ │ │ ├── tox.ini │ │ │ │ ├── wavenet_vocoder │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── util.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── conv.py │ │ │ │ │ ├── mixture.py │ │ │ │ │ └── modules.py │ │ │ │ ├── tests │ │ │ │ │ ├── test_audio.py │ │ │ │ │ ├── test_misc.py │ │ │ │ │ └── test_mixture.py │ │ │ │ ├── release.sh │ │ │ │ ├── tojson.py │ │ │ │ ├── appveyor.yml │ │ │ │ ├── .travis.yml │ │ │ │ ├── LICENSE.md │ │ │ │ ├── lrschedule.py │ │ │ │ ├── presets │ │ │ │ │ ├── ljspeech_mixture.json │ │ │ │ │ ├── cmu_arctic_8bit.json │ │ │ │ │ └── multispeaker_cmu_arctic_mixture.json │ │ │ │ ├── preprocess.py │ │ │ │ ├── setup.py │ │ │ │ ├── .gitignore │ │ │ │ ├── ljspeech.py │ │ │ │ ├── jsut.py │ │ │ │ ├── audio.py │ │ │ │ ├── cmu_arctic.py │ │ │ │ ├── hparams.py │ │ │ │ ├── librivox.py │ │ │ │ ├── evaluate.py │ │ │ │ └── synthesis.py │ │ │ ├── 20180510_mixture_lj_checkpoint_step000320000_ema.pth │ │ │ ├── 20180510_mixture_lj_checkpoint_step000320000_ema.json │ │ │ └── batch_synth.py │ │ │ ├── wiperesults.sh │ │ │ ├── basic_test.txt │ │ │ ├── norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz │ │ │ ├── quote_test.txt │ │ │ ├── taco_small_test.txt │ │ │ ├── sampleit.sh │ │ │ ├── taco_prosody_test.txt │ │ │ ├── full_test.txt │ │ │ └── rnn_unaligned_speech_ljspeech_nomask_blended_continue.py │ ├── continuous_integration │ │ ├── test_script.sh │ │ └── install.sh │ ├── setup.py │ ├── .travis.yml │ └── LICENSE └── README.md ├── pretrained ├── clean.sh ├── sample.sh ├── cmudict.json.gz ├── norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz ├── README.md ├── symbols.py ├── cmudict.py ├── numbers_rules.py ├── text.py ├── cleaners.py ├── representation_mixing_text_to_speech_demo_minimal.ipynb ├── number_to_words.py ├── transform_text.py └── cleaning.py ├── figures ├── white.png ├── single_mb_cropped.png ├── tbptt_mb_cropped.png ├── embedding_module_cropped.png └── network_diagram_cropped.png ├── LICENSE └── README.md /code/examples: -------------------------------------------------------------------------------- 1 | lib/examples -------------------------------------------------------------------------------- /code/lib/tfbldr/test/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | tfbldr 2 | -------------------------------------------------------------------------------- /code/lib/tfbldr/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.out 3 | *.npy 4 | -------------------------------------------------------------------------------- /pretrained/clean.sh: -------------------------------------------------------------------------------- 1 | rm *.png 2 | rm *.wav 3 | rm sample_*_mels.npz 4 | -------------------------------------------------------------------------------- /pretrained/sample.sh: -------------------------------------------------------------------------------- 1 | python sample_rnn_unaligned_speech_ljspeech.py 2 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tensorflow-gpu 4 | -------------------------------------------------------------------------------- /code/lib/tfbldr/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import next_experiment_path 2 | -------------------------------------------------------------------------------- /code/lib/README.rst: -------------------------------------------------------------------------------- 1 | Tensorflow tools and experiments 2 | 3 | Use at your own risk 4 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/audio.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/audio.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/hparams.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/hparams.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/train.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/train.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/synthesis.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/synthesis.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/lrschedule.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/lrschedule.py -------------------------------------------------------------------------------- /figures/white.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/white.png -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wiperesults.sh: -------------------------------------------------------------------------------- 1 | rm *.wav 2 | rm *.png 3 | rm -r sample_results 4 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder_core: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/wavenet_vocoder/ -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE.md 2 | -------------------------------------------------------------------------------- /pretrained/cmudict.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/pretrained/cmudict.json.gz -------------------------------------------------------------------------------- /figures/single_mb_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/single_mb_cropped.png -------------------------------------------------------------------------------- /figures/tbptt_mb_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/tbptt_mb_cropped.png -------------------------------------------------------------------------------- /figures/embedding_module_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/embedding_module_cropped.png -------------------------------------------------------------------------------- /figures/network_diagram_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/network_diagram_cropped.png -------------------------------------------------------------------------------- /code/lib/tfbldr/plot/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot import get_viridis 2 | from .plot import autoaspect 3 | from .audio import specgram 4 | from .audio import specplot 5 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/README: -------------------------------------------------------------------------------- 1 | text processing utils from Keith Ito 2 | replaced inflect engine with https://github.com/ianfieldhouse/number_to_words 3 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/20180510_mixture_lj_checkpoint_step000320000_ema.pth: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/20180510_mixture_lj_checkpoint_step000320000_ema.pth -------------------------------------------------------------------------------- 
/code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/20180510_mixture_lj_checkpoint_step000320000_ema.json: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/20180510_mixture_lj_checkpoint_step000320000_ema.json -------------------------------------------------------------------------------- /code/lib/tfbldr/test/test_import.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.getLogger('tensorflow').disabled = True 3 | 4 | # implicit test 5 | from tfbldr import * 6 | 7 | def test_import_all(): 8 | pass 9 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E305,E402,E704,E721,E741,F401,F403,F405,F821,F841,F999 4 | exclude = docs/,data,build,dist,notebooks,checkpoints*,legacy 5 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | #from .version import __version__ 5 | 6 | from .wavenet import receptive_field_size, WaveNet 7 | -------------------------------------------------------------------------------- /pretrained/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/pretrained/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/basic_test.txt: -------------------------------------------------------------------------------- 1 | i am learning english. 2 | thanks so much. 3 | i will be with you in a moment. 4 | the meeting is at eleven this morning. 5 | they will be gone for twenty eight days. 6 | i can help with that. 7 | this and that, these and those. 8 | they are a few sandwiches short of a picnic. 9 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/code/lib/examples/unaligned_ljspeech_chars/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/quote_test.txt: -------------------------------------------------------------------------------- 1 | sphinx of black quartz, judge my vow. 2 | the quick brown fox jumps over the lazy dog. 3 | pack my box with five dozen liquor jugs. 4 | we surely shall see the sun shine soon. 5 | lesser leather never weathered wetter weather better. 
6 | near an ear, a nearer ear, a nearly eerie ear. 7 | the sky above the port was the color of television, tuned to a dead channel. 8 | all this happened, more or less. 9 | -------------------------------------------------------------------------------- /code/lib/tfbldr/__init__.py: -------------------------------------------------------------------------------- 1 | floatX = "float32" 2 | intX = "int32" 3 | import os 4 | 5 | # fix logging during travis testing 6 | if os.environ.get('TRAVIS') != "true": 7 | import logging 8 | logging.getLogger('tensorflow').disabled = True 9 | 10 | from .core import get_logger 11 | from .core import scan 12 | from .core import dot 13 | from .core import get_params_dict 14 | from .core import run_loop 15 | from .nodes import make_numpy_weights 16 | from .nodes import make_numpy_biases 17 | 18 | -------------------------------------------------------------------------------- /code/lib/tfbldr/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import get_params_dict 2 | from .core import get_logger 3 | from .core import scan 4 | from .core import _get_name 5 | from .core import _get_shared 6 | from .core import _set_shared 7 | from .core import run_loop 8 | from .core import print_network 9 | from .core import _ndim 10 | from .core import _shape 11 | from .core import dot 12 | from .core import get_weight_norm_default 13 | from .core import get_strict_mode_default 14 | from .core import print_network 15 | from .core import download 16 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from .audio_tools import soundsc 2 | from .audio_tools import overlap 3 | from .audio_tools import stft 4 | from .audio_tools import iterate_invert_spectrogram 5 | from .audio_tools import mel_freq_weights 6 | from .audio_tools import linear_to_mel_weight_matrix 7 | from .audio_tools import mu_law_encode 8 | from .audio_tools import mu_law_decode 9 | from .audio_tools import mu_law_transform 10 | from .audio_tools import mu_law_inverse 11 | from .audio_tools import fetch_sample_speech_tapestry 12 | from .datasets import wavfile_caching_mel_tbptt_iterator 13 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tests/test_audio.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join 6 | sys.path.insert(0, join(dirname(__file__), "..")) 7 | 8 | import numpy as np 9 | from nose.plugins.attrib import attr 10 | 11 | import logging 12 | logging.getLogger('tensorflow').disabled = True 13 | 14 | 15 | @attr("local_only") 16 | def test_amp_to_db(): 17 | import audio 18 | x = np.random.rand(10) 19 | x_hat = audio._db_to_amp(audio._amp_to_db(x)) 20 | assert np.allclose(x, x_hat) 21 | -------------------------------------------------------------------------------- /code/lib/tfbldr/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def next_experiment_path(): 6 | """ 7 | creates paths for new experiment 8 | returns path for next experiment 9 | """ 10 | 11 | idx = 0 12 | path = os.path.join('summary', 
'experiment-{}') 13 | while os.path.exists(path.format(idx)): 14 | idx += 1 15 | path = path.format(idx) 16 | os.makedirs(os.path.join(path, 'models')) 17 | os.makedirs(os.path.join(path, 'backup')) 18 | for file in filter(lambda x: x.endswith('.py'), os.listdir('.')): 19 | shutil.copy2(file, os.path.join(path, 'backup')) 20 | return path 21 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: tfbldr 3 | Version: 0.0.1 4 | Summary: Deep Learning tools for Tensorflow 5 | Home-page: http://github.com/kastnerkyle/tfbldr/ 6 | Author: Kyle Kastner 7 | Author-email: kastnerkyle@gmail.com 8 | License: BSD 3-clause 9 | Description: Tensorflow tools and experiments 10 | 11 | Use at your own risk 12 | 13 | Platform: UNKNOWN 14 | Classifier: Development Status :: 3 - Alpha 15 | Classifier: Intended Audience :: Science/Research 16 | Classifier: License :: OSI Approved :: BSD License 17 | Classifier: Operating System :: OS Independent 18 | Classifier: Topic :: Scientific/Engineering 19 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def _assert_valid_input_type(s): 6 | assert s == "mulaw-quantize" or s == "mulaw" or s == "raw" 7 | 8 | 9 | def is_mulaw_quantize(s): 10 | _assert_valid_input_type(s) 11 | return s == "mulaw-quantize" 12 | 13 | 14 | def is_mulaw(s): 15 | _assert_valid_input_type(s) 16 | return s == "mulaw" 17 | 18 | 19 | def is_raw(s): 20 | _assert_valid_input_type(s) 21 | return s == "raw" 22 | 23 | 24 | def is_scalar_input(s): 25 | return is_raw(s) or is_mulaw(s) 26 | -------------------------------------------------------------------------------- /pretrained/README.md: -------------------------------------------------------------------------------- 1 | # Colab Notebook Links 2 | Full demo link: https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo.ipynb 3 | 4 | Minimal demo: https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo_minimal.ipynb 5 | 6 | # Notes 7 | Some files pulled and edited from tfbldr to enable standalone runtime 8 | 9 | Inspired by Colab example from Ryuichi Yamamoto (r9y9) https://r9y9.github.io/blog/2018/05/20/tacotron2/ 10 | 11 | Text processing utils from Keith Ito 12 | 13 | Replaced inflect engine with https://github.com/ianfieldhouse/number_to_words 14 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tests/test_misc.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | from wavenet_vocoder import receptive_field_size 5 | 6 | 7 | def test_receptive_field_size(): 8 | # Table 4 in https://arxiv.org/abs/1711.10433 9 | assert receptive_field_size(total_layers=30, num_cycles=3, kernel_size=3) == 6139 10 | assert receptive_field_size(total_layers=24, num_cycles=4, 
kernel_size=3) == 505 11 | assert receptive_field_size(total_layers=12, num_cycles=2, kernel_size=3) == 253 12 | assert receptive_field_size(total_layers=30, num_cycles=1, 13 | kernel_size=3, dilation=lambda x: 1) == 61 14 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for Pypi release 4 | # 0. Make sure you are on git tag 5 | # 1. Run the script 6 | # 2. Upload sdist 7 | 8 | set -e 9 | 10 | script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd) 11 | cd $script_dir 12 | 13 | TAG=$(git describe --exact-match --tags HEAD) 14 | 15 | VERSION=${TAG/v/} 16 | 17 | WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py develop sdist 18 | echo "*** Ready to release! wavenet_vocoder $TAG ***" 19 | echo "Please run the following command manually:" 20 | echo WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py sdist upload 21 | echo "Please make sure that the release version is correct." 22 | cat wavenet_vocoder/version.py 23 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | # WARNING 2 | This folder contains a NON-RUNNABLE code dump of my research library used for training the model. This is only for very, very interested people and for seeing the exact model definition and dirty details in code. 3 | 4 | The actual json files containing char and phone alignments and timing, used for training, can be directly downloaded from here https://www.dropbox.com/s/1m73uf2mslvq0t5/gentle_json.tar.gz?dl=0 5 | 6 | The gentle_json files were extracted using utilities from my repo https://github.com/kastnerkyle/raw_voice_cleanup/blob/master/alignment/align_many.py 7 | 8 | If you just want to hear sound, use the colab here https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo.ipynb 9 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tojson.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Dump hyper parameters to json file. 4 | 5 | usage: tojson.py [options] <output_json_path> 6 | 7 | options: 8 | -h, --help Show help message.
9 | """ 10 | from docopt import docopt 11 | 12 | import sys 13 | import os 14 | from os.path import dirname, join, basename, splitext 15 | import json 16 | 17 | from hparams import hparams 18 | 19 | if __name__ == "__main__": 20 | args = docopt(__doc__) 21 | output_json_path = args["<output_json_path>"] 22 | 23 | j = hparams.values() 24 | 25 | # for compat legacy 26 | for k in ["preset", "presets"]: 27 | if k in j: 28 | del j[k] 29 | 30 | with open(output_json_path, "w") as f: 31 | json.dump(j, f, indent=2) 32 | sys.exit(0) 33 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/music/__init__.py: -------------------------------------------------------------------------------- 1 | # music21 is an optional dep 2 | from ...core import get_logger 3 | logger = get_logger() 4 | 5 | try: 6 | from .music import pitch_and_duration_to_quantized 7 | from .music import pitches_and_durations_to_pretty_midi 8 | from .music import quantized_to_pretty_midi 9 | from .music import quantized_to_pitch_duration 10 | from .music import plot_pitches_and_durations 11 | from .music import music21_to_pitch_duration 12 | from .music import music21_to_quantized 13 | from .music import plot_piano_roll 14 | from .music import quantized_imlike_to_image_array 15 | from .analysis import midi_to_notes 16 | from .analysis import notes_to_midi 17 | from .loaders import fetch_jsb 18 | from .loaders import fetch_josquin 19 | except ImportError: 20 | logger.info("Unable to import music21 related utilities") 21 | -------------------------------------------------------------------------------- /code/lib/continuous_integration/test_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called by the "script" step defined in 3 | # .travis.yml. See http://docs.travis-ci.com/ for more details. 4 | # The behavior of the script is controlled by environment variables defined 5 | # in the .travis.yml in the top level folder of the project. 6 | 7 | # License: 3-clause BSD 8 | 9 | # still doesn't fix anything... 10 | export TF_CPP_MIN_LOG_LEVEL=3 11 | set -e 12 | 13 | python --version 14 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 15 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 16 | python -c "import tensorflow as tf; print('tensorflow %s' % tf.__version__)" 17 | 18 | # Do not use "make test" or "make test-coverage" as they enable verbose mode 19 | # which renders travis output too slow to display in a browser.
20 | if [[ "$COVERAGE" == "true" ]]; then 21 | nosetests -s --with-coverage tfbldr 22 | else 23 | nosetests -s tfbldr 24 | fi 25 | -------------------------------------------------------------------------------- /code/lib/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | setuptools.setup( 5 | name='tfbldr', 6 | version='0.0.1', 7 | packages=setuptools.find_packages(), 8 | author='Kyle Kastner', 9 | author_email='kastnerkyle@gmail.com', 10 | description='Deep Learning tools for Tensorflow', 11 | long_description=open(os.path.join(os.path.dirname( 12 | os.path.abspath(__file__)), 'README.rst')).read(), 13 | license='BSD 3-clause', 14 | url='http://github.com/kastnerkyle/tfbldr/', 15 | package_data={ 16 | 'pthbldr': ['datasets/data/*'] 17 | }, 18 | install_requires=['numpy', 19 | 'scipy', 20 | 'tensorflow-gpu'], 21 | classifiers=['Development Status :: 3 - Alpha', 22 | 'Intended Audience :: Science/Research', 23 | 'License :: OSI Approved :: BSD License', 24 | 'Operating System :: OS Independent', 25 | 'Topic :: Scientific/Engineering'], 26 | ) 27 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON_VERSION: "3.6" 4 | PYTHON_ARCH: "64" 5 | MINICONDA: C:\Miniconda36-x64 6 | 7 | branches: 8 | only: 9 | - master 10 | - /release-.*/ 11 | 12 | skip_commits: 13 | message: /\[av skip\]/ 14 | 15 | notifications: 16 | - provider: Email 17 | on_build_success: false 18 | on_build_failure: false 19 | on_build_status_changed: false 20 | 21 | init: 22 | - "ECHO %PYTHON_VERSION% %PYTHON_ARCH% %MINICONDA%" 23 | 24 | install: 25 | - "SET PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" 26 | - conda config --set always_yes yes --set changeps1 no 27 | - conda update -q conda 28 | - conda install -n root _license 29 | - conda info -a 30 | - "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy cython nose pytorch -c pytorch" 31 | - activate test-environment 32 | 33 | build_script: 34 | - pip install -e ".[test]" 35 | 36 | test_script: 37 | - nosetests -v -w tests/ -a "!local_only" 38 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/taco_small_test.txt: -------------------------------------------------------------------------------- 1 | Scientists at the CERN laboratory say they have discovered a new particle. 2 | There’s a way to measure the acute emotional intelligence that has never gone out of style. 3 | President Trump met with other leaders at the Group of 20 conference. 4 | The Senate’s bill to repeal and replace the Affordable Care Act is now imperiled. 5 | Generative adversarial network or variational auto-encoder. 6 | Basilar membrane and otolaryngology are not auto-correlations. 7 | He has read the whole thing. 8 | He reads books. 9 | Don’t desert me here in the desert! 10 | He thought it was time to present the present. 11 | Thisss isrealy awhsome. 12 | The buses aren't the problem, they actually provide a solution. 13 | The quick brown fox jumps over the lazy dog. 14 | Does the quick brown fox jump over the lazy dog? 15 | Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick? 16 | She sells sea-shells on the sea-shore. 
The shells she sells are sea-shells I’m sure. 17 | The Blue Lagoon is a nineteen eighty American romance adventure film. 18 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | notifications: 7 | email: false 8 | 9 | before_install: 10 | - sudo apt-get update 11 | - if [["$TRAVIS_PYTHON_VERSION" == "2.7"]]; then 12 | wget http://repo.continuum.io/miniconda/Miniconda-3.8.3-Linux-x86_64.sh -O miniconda.sh; 13 | else 14 | wget http://repo.continuum.io/miniconda/Miniconda3-3.8.3-Linux-x86_64.sh -O miniconda.sh; 15 | fi 16 | - bash miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda config --add channels pypi 23 | - conda info -a 24 | - deps='pip numpy scipy cython nose pytorch' 25 | - conda create -q -n test-environment "python=$TRAVIS_PYTHON_VERSION" $deps -c pytorch 26 | - source activate test-environment 27 | 28 | install: 29 | - pip install -e ".[test]" 30 | script: 31 | - nosetests -v -w tests/ -a '!local_only' 32 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/sampleit.sh: -------------------------------------------------------------------------------- 1 | if [ $# -eq 0 ]; then 2 | echo "Must pass model path (without .meta or other extensions) as an argument" 3 | exit 4 | fi 5 | 6 | mkdir -p sample_results 7 | for t in blend++ blend chars phones; do 8 | mkdir -p sample_results/ 9 | if [[ -z "$2" ]]; then 10 | python -u sample_rnn_unaligned_speech_ljspeech.py "$1" custom_test.txt taco_prosody_test.txt taco_small_test.txt quote_test.txt basic_test.txt valid --inp=$t --sonify=1000 2>&1 | tee /Tmp/kastner/sample_log.txt 11 | fi 12 | if [[ ! 
-z "$2" ]]; then 13 | python -u sample_rnn_unaligned_speech_ljspeech.py "$1" custom_test.txt taco_prosody_test.txt taco_small_test.txt quote_test.txt basic_test.txt valid "$2" --inp=$t --sonify=1000 2>&1 | tee /Tmp/kastner/sample_log.txt 14 | #python sample_rnn_unaligned_speech_ljspeech.py "$1" "$2" --inp=$t --test=$s --sonify=1000 2>&1 | tee sample_results/"$t"_"$s"/sample_log.txt 15 | fi 16 | #python sample_rnn_unaligned_speech_ljspeech.py "$1" --inp=$t --test=$s 2>&1 | tee sample_results/"$t"_"$s"/sample_log.txt 17 | mv *sampled_text_summary.txt sample_results/ 18 | mv /Tmp/kastner/sample_log.txt sample_results/ 19 | done 20 | 21 | mv *.wav sample_results/ 22 | mv *.png sample_results/ 23 | -------------------------------------------------------------------------------- /code/lib/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | # make it explicit that we favor the new container-based travis workers 3 | sudo: false 4 | addons: 5 | apt: 6 | packages: 7 | # Only used by the DISTRIB="ubuntu" setting 8 | - libatlas3gf-base 9 | - libatlas-dev 10 | - python-numpy 11 | - python-scipy 12 | env: 13 | matrix: 14 | - DISTRIB="conda" PYTHON_VERSION="2.7" 15 | NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.0" TF_VERSION="1.4.1" 16 | # This environment tests the newest supported anaconda env 17 | - DISTRIB="conda" PYTHON_VERSION="2.7" INSTALL_MKL="true" 18 | NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.0" TF_VERSION="1.4.1" 19 | # This environment tests the newest supported anaconda env 20 | #- DISTRIB="conda" PYTHON_VERSION="3.4" INSTALL_MKL="true" 21 | # NUMPY_VERSION="1.8.2" SCIPY_VERSION="0.14.0" 22 | 23 | install: source continuous_integration/install.sh 24 | script: bash continuous_integration/test_script.sh 25 | after_success: 26 | # Ignore coveralls failures as the coveralls server is not very reliable 27 | # but we don't want travis to report a failure in the github UI just 28 | # because the coverage report failed to be published. 
29 | - if [[ "$COVERAGE" == "true" ]]; then coveralls || echo "failed"; fi 30 | cache: apt 31 | -------------------------------------------------------------------------------- /code/lib/tfbldr/misc_scripts/speech_itr_test.py: -------------------------------------------------------------------------------- 1 | from tfbldr.datasets import tbptt_file_list_iterator 2 | import os 3 | import numpy as np 4 | 5 | files = os.listdir("/Tmp/kastner/lj_speech_hybrid_speakers/numpy_features/") 6 | files = ["/Tmp/kastner/lj_speech_hybrid_speakers/numpy_features/" + f for f in files] 7 | ljspeech_hybridset = [' ', '!', ',', '-', '.', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] 8 | hybrid_lookup = {v: k for k, v in enumerate(sorted(ljspeech_hybridset))} 9 | hybrid_inverse_lookup = {v: k for k, v in hybrid_lookup.items()} 10 | 11 | def file_access(f): 12 | d = np.load(f) 13 | text = d["text"] 14 | inds = [hybrid_lookup[t] for t in text.ravel()[0]] 15 | audio = d["audio_features"] 16 | return (audio, inds) 17 | 18 | random_state = np.random.RandomState(1442) 19 | batch_size = 8 20 | truncation_length = 256 21 | itr = tbptt_file_list_iterator(files, file_access, 22 | batch_size, 23 | truncation_length, 24 | other_one_hot_size=[len(ljspeech_hybridset)], 25 | random_state=random_state) 26 | for i in range(100000): 27 | print(i) 28 | r = itr.next_masked_batch() 29 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/LICENSE.md: -------------------------------------------------------------------------------- 1 | The wavenet_vocoder package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Ryuichi Yamamoto. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/lrschedule.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329 5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 6 | # Noam scheme from tensor2tensor: 7 | warmup_steps = float(warmup_steps) 8 | step = global_step + 1. 
9 | lr = init_lr * warmup_steps**0.5 * np.minimum( 10 | step * warmup_steps**-1.5, step**-0.5) 11 | return lr 12 | 13 | 14 | def step_learning_rate_decay(init_lr, global_step, 15 | anneal_rate=0.98, 16 | anneal_interval=30000): 17 | return init_lr * anneal_rate ** (global_step // anneal_interval) 18 | 19 | 20 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 21 | """Cyclic cosine annealing 22 | 23 | https://arxiv.org/pdf/1704.00109.pdf 24 | 25 | Args: 26 | init_lr (float): Initial learning rate 27 | global_step (int): Current iteration number 28 | T (int): Total iteration number (i,e. nepoch) 29 | M (int): Number of ensembles we want 30 | 31 | Returns: 32 | float: Annealed learning rate 33 | """ 34 | TdivM = T // M 35 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) 36 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.rst 2 | setup.py 3 | tfbldr/__init__.py 4 | tfbldr.egg-info/PKG-INFO 5 | tfbldr.egg-info/SOURCES.txt 6 | tfbldr.egg-info/dependency_links.txt 7 | tfbldr.egg-info/requires.txt 8 | tfbldr.egg-info/top_level.txt 9 | tfbldr/core/__init__.py 10 | tfbldr/core/core.py 11 | tfbldr/datasets/__init__.py 12 | tfbldr/datasets/iterators.py 13 | tfbldr/datasets/loaders.py 14 | tfbldr/datasets/plotters.py 15 | tfbldr/datasets/audio/__init__.py 16 | tfbldr/datasets/audio/audio_tools.py 17 | tfbldr/datasets/audio/datasets.py 18 | tfbldr/datasets/audio/magrecnp.py 19 | tfbldr/datasets/music/__init__.py 20 | tfbldr/datasets/music/analysis.py 21 | tfbldr/datasets/music/loaders.py 22 | tfbldr/datasets/music/music.py 23 | tfbldr/datasets/text/__init__.py 24 | tfbldr/datasets/text/cleaning/__init__.py 25 | tfbldr/datasets/text/cleaning/cleaners.py 26 | tfbldr/datasets/text/cleaning/cmudict.py 27 | tfbldr/datasets/text/cleaning/eng_rules.py 28 | tfbldr/datasets/text/cleaning/number_to_words.py 29 | tfbldr/datasets/text/cleaning/numbers.py 30 | tfbldr/datasets/text/cleaning/symbols.py 31 | tfbldr/nodes/__init__.py 32 | tfbldr/nodes/nodes.py 33 | tfbldr/plot/__init__.py 34 | tfbldr/plot/audio.py 35 | tfbldr/plot/plot.py 36 | tfbldr/test/__init__.py 37 | tfbldr/test/test_import.py 38 | tfbldr/test/test_simple.py 39 | tfbldr/utils/__init__.py 40 | tfbldr/utils/utils.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/taco_prosody_test.txt: -------------------------------------------------------------------------------- 1 | how do bureaucrats wrap presents? with lots of red tape. 2 | why are libraries so strict? they have to go by the book. 3 | why are fish so smart? because they hang out in schools so much. 4 | heaps of things. like fairy bread, how the surf is today and why magpies swoop. 5 | the past, the present, and the future walk into a bar. it was tense. 6 | i usually down a cup of java script. then i put on nature sounds and run a few strenuous searches to improve my speed. 7 | i don't have eyes, but i don't need them to know the vibe in here feels good. 8 | what time do you go to the dentist? at tooth-hurty! 9 | sweet dreams are made of these. friendly assistants who work hard to please 10 | you are what you eat. so i guess i'm a whole lot of data and a little bit of pizza recipes. 11 | men say they know many things; but lo! 
they have taken wings, the arts and sciences, And a thousand appliances; the wind that blows is all that any body knows. 12 | do you prefer chocolate or jelly? which would you like in your belly? you could make a good case, for a cool ice cream base, but I'd argue against vermicelli. 13 | halloween edition it is! remember to follow the moves as I say them. 14 | why are archaeologists so annoyed? they always have a bone to pick. 15 | that one sailed right over my head. 16 | wear your heart on your sleeve. it'll terrify people. 17 | -------------------------------------------------------------------------------- /code/lib/tfbldr/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import Linear 2 | from .nodes import ReLU 3 | from .nodes import Tanh 4 | from .nodes import Sigmoid 5 | from .nodes import OneHot 6 | from .nodes import Softmax 7 | from .nodes import Conv2d 8 | from .nodes import GatedMaskedConv2d 9 | from .nodes import ConvTranspose2d 10 | from .nodes import BatchNorm2d 11 | from .nodes import LayerNorm 12 | from .nodes import Embedding 13 | from .nodes import PositionalEncoding 14 | from .nodes import TransformerBlock 15 | from .nodes import MultiheadAttention 16 | from .nodes import Bilinear 17 | from .nodes import VqEmbedding 18 | from .nodes import VqSeqEmbedding 19 | from .nodes import SimpleRNNCell 20 | from .nodes import BiLSTMLayer 21 | from .nodes import SequenceConv1dStack 22 | from .nodes import LSTMCell 23 | from .nodes import GRUCell 24 | from .nodes import AdditiveGaussianNoise 25 | from .nodes import GaussianAttentionCell 26 | from .nodes import DiscreteMixtureOfLogistics 27 | from .nodes import DiscreteMixtureOfLogisticsCost 28 | from .nodes import BernoulliAndCorrelatedGMM 29 | from .nodes import BernoulliAndCorrelatedGMMCost 30 | from .nodes import BernoulliCrossEntropyCost 31 | from .nodes import CategoricalCrossEntropyCost 32 | from .nodes import CategoricalCrossEntropyIndexCost 33 | from .nodes import CategoricalCrossEntropyLinearIndexCost 34 | from .nodes import make_numpy_weights 35 | from .nodes import make_numpy_biases 36 | -------------------------------------------------------------------------------- /pretrained/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | 8 | import cmudict 9 | 10 | _pad = '_' 11 | _eos = '~' 12 | # PUT IT BACK!!! 13 | 14 | _phones = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh', ' '] 15 | special = [s for s in "!\',-.:?"] 16 | _pau_phones = _phones + [s for s in ["1","2","3","4"]] 17 | _phones = _phones + special 18 | 19 | _characters = 'abcdefghijklmnopqrstuvwxyz!\',-.:? ' 20 | _rules = 'abcdefghijklmnopqrstuvwxyz&^!\',-.:? ' 21 | 22 | #_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\',-.:? 
' 23 | 24 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 25 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 26 | 27 | # Export all symbols: 28 | char_symbols = [_pad, _eos] + list(_characters)# + _arpabet 29 | phone_symbols = [_pad, _eos] + list(_phones)# + _arpabet 30 | pau_phone_symbols = [_pad, _eos] + list(_pau_phones) 31 | rule_symbols = [_pad, _eos] + list(_rules)# + _arpabet 32 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | 8 | import cmudict 9 | 10 | _pad = '_' 11 | _eos = '~' 12 | # PUT IT BACK!!! 13 | 14 | _phones = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh', ' '] 15 | special = [s for s in "!\',-.:?"] 16 | _pau_phones = _phones + [s for s in ["1","2","3","4"]] 17 | _phones = _phones + special 18 | 19 | _characters = 'abcdefghijklmnopqrstuvwxyz!\',-.:? ' 20 | _rules = 'abcdefghijklmnopqrstuvwxyz&^!\',-.:? ' 21 | 22 | #_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\',-.:? ' 23 | 24 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 25 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 26 | 27 | # Export all symbols: 28 | char_symbols = [_pad, _eos] + list(_characters)# + _arpabet 29 | phone_symbols = [_pad, _eos] + list(_phones)# + _arpabet 30 | pau_phone_symbols = [_pad, _eos] + list(_pau_phones) 31 | rule_symbols = [_pad, _eos] + list(_rules)# + _arpabet 32 | -------------------------------------------------------------------------------- /code/lib/tfbldr/test/test_simple.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.getLogger('tensorflow').disabled = True 3 | import tensorflow as tf 4 | import numpy as np 5 | from tfbldr import make_numpy_weights, make_numpy_biases, dot, scan, get_params_dict 6 | from tfbldr.nodes import Linear, SimpleRNNCell 7 | 8 | n_batch = 64 9 | h_dim = 400 10 | random_state = np.random.RandomState(2145) 11 | 12 | inputs = tf.placeholder(tf.float32, [None, n_batch, 3], 13 | name="inputs") 14 | init_h = tf.placeholder(tf.float32, [n_batch, h_dim], 15 | name="init_h") 16 | 17 | def step(inp_t, h_tm1): 18 | output, state = SimpleRNNCell([inp_t], [3], h_tm1, h_dim, 20, random_state=random_state, 19 | name="l1") 20 | h = state[0] 21 | return output, h 22 | 23 | o = scan(step, [inputs], [None, init_h]) 24 | loss = tf.reduce_mean(o[0]) 25 | h_o = o[1] 26 | 27 | params_dict = get_params_dict() 28 | params = params_dict.values() 29 | grads = tf.gradients(loss, params) 30 | 31 | learning_rate = 0.0002 32 | opt = tf.train.AdamOptimizer(learning_rate=learning_rate, use_locking=True) 33 | updates = opt.apply_gradients(zip(grads, params)) 34 | 35 | inputs_np = random_state.randn(33, n_batch, 3) 36 | init_h_np = np.zeros((n_batch, h_dim)) 37 | with tf.Session() as sess: 38 | 
sess.run(tf.global_variables_initializer()) 39 | feed = {inputs: inputs_np, 40 | init_h: init_h_np} 41 | outs = [loss, updates, h_o] 42 | lop = sess.run(outs, feed) 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Kyle Kastner 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /code/lib/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Kyle Kastner 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .loaders import rsync_fetch 2 | from .loaders import fetch_iamondb 3 | from .loaders import fetch_ljspeech 4 | from .loaders import fetch_fruitspeech 5 | from .loaders import fetch_mnist 6 | from .loaders import fetch_fashion_mnist 7 | from .loaders import make_sinewaves 8 | from .loaders import get_tfbldr_dataset_dir 9 | from .loaders import fetch_norvig_words 10 | from .audio import wavfile_caching_mel_tbptt_iterator 11 | from .iterators import list_iterator 12 | from .iterators import ordered_list_iterator 13 | from .iterators import tbptt_list_iterator 14 | from .iterators import tbptt_file_list_iterator 15 | from .iterators import char_textfile_iterator 16 | 17 | from ..core import get_logger 18 | logger = get_logger() 19 | 20 | # music21 and PIL are optional deps 21 | try: 22 | from .music import fetch_jsb 23 | from .music import fetch_josquin 24 | from .music import pitch_and_duration_to_quantized 25 | from .music import pitches_and_durations_to_pretty_midi 26 | from .music import quantized_to_pretty_midi 27 | from .music import plot_pitches_and_durations 28 | from .music import music21_to_pitch_duration 29 | from .music import music21_to_quantized 30 | from .music import plot_piano_roll 31 | from .music import quantized_imlike_to_image_array 32 | from .music import midi_to_notes 33 | from .music import notes_to_midi 34 | from .music import quantized_to_pitch_duration 35 | except ImportError: 36 | logger.info("Unable to import music21 related utilities") 37 | 38 | try: 39 | from .plotters import save_image_array 40 | except ImportError: 41 | logger.info("Unable to import PIL related utilities") 42 | -------------------------------------------------------------------------------- /code/lib/tfbldr/plot/audio.py: -------------------------------------------------------------------------------- 1 | from ..datasets.audio import stft 2 | from .plot import get_viridis 3 | import numpy as np 4 | 5 | 6 | def specgram(arr, fftsize=512, step=16, mean_normalize=True, real=False, 7 | compute_onesided=True, min_value=-100, max_value=np.inf, axis=0): 8 | arr = np.array(arr) 9 | if len(arr.shape) != 1: 10 | raise ValueError("arr must be a 1D np array or list") 11 | 12 | if axis != 0: 13 | raise ValueError("Must have axis=0") 14 | 15 | Pxx = 20. 
* np.log10(np.abs(stft(arr, fftsize=fftsize, step=step, mean_normalize=mean_normalize, real=real, compute_onesided=compute_onesided))) 16 | return np.clip(Pxx, min_value, max_value) 17 | 18 | 19 | def specplot(arr, mplaxis, time_ratio=4, cmap="viridis"): 20 | """ 21 | assumes arr comes in with time on axis 0, frequency on axis 1 22 | """ 23 | import matplotlib.pyplot as plt 24 | if cmap == "viridis": 25 | cmap = get_viridis() 26 | # Transpose so time is X axis, and invert y axis so 27 | # frequency is low at bottom 28 | mag = arr.T[::-1, :] 29 | mplaxis.matshow(mag, cmap=cmap) 30 | x1 = mag.shape[0] 31 | y1 = mag.shape[1] 32 | 33 | def autoaspect(x_range, y_range): 34 | """ 35 | The aspect to make a plot square with ax.set_aspect in Matplotlib 36 | """ 37 | b = [x_range, y_range] 38 | mi = np.argmax(b) 39 | mx = b[mi] 40 | mn = b[1] if mi == 0 else b[0] 41 | ratio = time_ratio / 1. if mi == 0 else 1. / time_ratio 42 | if x_range <= y_range: 43 | return ratio * mx / float(mn) 44 | else: 45 | return ratio * mn / float(mx) 46 | asp = autoaspect(x1, y1) 47 | mplaxis.set_aspect(asp) 48 | mplaxis.xaxis.tick_bottom() 49 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/presets/ljspeech_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 22050, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/presets/cmu_arctic_8bit.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "mulaw-quantize", 5 | "quantize_channels": 256, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | 
"fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 256, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": false, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/presets/multispeaker_cmu_arctic_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": 16, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/builder.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def wavenet(out_channels=256, 6 | layers=20, 7 | stacks=2, 8 | residual_channels=512, 9 | gate_channels=512, 10 | skip_out_channels=512, 11 | cin_channels=-1, 12 | gin_channels=-1, 13 | weight_normalization=True, 14 | dropout=1 - 0.95, 15 | kernel_size=3, 16 | n_speakers=None, 17 | upsample_conditional_features=False, 18 | upsample_scales=[16, 16], 19 | freq_axis_kernel_size=3, 20 | scalar_input=False, 21 | use_speaker_embedding=True, 22 | legacy=True, 23 | ): 24 | from wavenet_vocoder_core import WaveNet 25 | 26 | model = WaveNet(out_channels=out_channels, layers=layers, stacks=stacks, 27 | residual_channels=residual_channels, 28 | gate_channels=gate_channels, 29 | skip_out_channels=skip_out_channels, 30 | kernel_size=kernel_size, dropout=dropout, 31 | weight_normalization=weight_normalization, 32 | cin_channels=cin_channels, gin_channels=gin_channels, 33 | n_speakers=n_speakers, 34 | upsample_conditional_features=upsample_conditional_features, 35 | upsample_scales=upsample_scales, 36 | freq_axis_kernel_size=freq_axis_kernel_size, 37 | scalar_input=scalar_input, 38 | use_speaker_embedding=use_speaker_embedding, 39 | legacy=legacy, 40 | ) 41 | 42 | return model 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Representation Mixing 2 | 3 | This repo has code and pretrained models in support of the paper [Representation Mixing for TTS Synthesis](https://arxiv.org/abs/1811.07240) 4 | 5 | Try the demo! https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo.ipynb 6 | 7 | Samples site: https://s3.amazonaws.com/representation-mixing-site/index.html 8 | 9 | # Abstract 10 | Recent character and phoneme-based parametric TTS systems using deep learning have shown strong performance in natural speech generation. However, the choice between character or phoneme input can create serious limitations for practical deployment, as direct control of pronunciation is crucial in certain cases. We demonstrate a simple method for combining multiple types of linguistic information in a single encoder, named representation mixing, enabling flexible choice between character, phoneme, or mixed representations during inference. Experiments and user studies on a public audiobook corpus show the efficacy of our approach. 11 | 12 | [(Taken from the paper)](https://arxiv.org/abs/1811.07240) 13 | 14 | # Architecture Diagram 15 |
16 | 17 | 18 |
19 |
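The embedding module pictured above can be summarized in a few lines. Below is a minimal, self-contained sketch of the mixing idea from the abstract; the vocabulary sizes, embedding width, helper names, and the per-word random switch are all illustrative assumptions, and this is not the training code shipped in `code/`:

```python
import numpy as np

rng = np.random.RandomState(0)

# Hypothetical sizes -- the real vocabularies and embedding width live in code/lib.
n_chars, n_phones, embed_dim = 40, 60, 16
char_embed = 0.1 * rng.randn(n_chars, embed_dim)    # character symbol embeddings
phone_embed = 0.1 * rng.randn(n_phones, embed_dim)  # phoneme symbol embeddings
type_embed = 0.1 * rng.randn(2, embed_dim)          # row 0: characters, row 1: phonemes

def embed_word(char_ids, phone_ids, use_phones):
    """Embed one word from either its character ids or its phoneme ids,
    adding a representation-type embedding so the encoder knows which
    view of the word it was given."""
    if use_phones:
        symbols = phone_embed[phone_ids]
        rep_type = type_embed[1]
    else:
        symbols = char_embed[char_ids]
        rep_type = type_embed[0]
    return symbols + rep_type  # shape: (word_length, embed_dim)

# During training each word's representation is picked at random, so a single
# encoder learns characters, phonemes, and any mixture of the two.
words = [([3, 7, 1], [12, 5]), ([9, 2, 4], [30, 8])]  # (char ids, phone ids) per word
mixed = [embed_word(c, p, use_phones=rng.rand() < 0.5) for c, p in words]
encoder_input = np.concatenate(mixed, axis=0)         # fed to the shared encoder
print(encoder_input.shape)
```

Because one encoder sees both views during training, inference can use characters, phonemes, or a per-word mixture of the two, which is what gives direct control over pronunciation.
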
20 | 21 | # More Info 22 | `pretrained/` contains some information and code for pretrained models, as well as a colab notebook for sampling from the pretrained model 23 | 24 | `code/` (will) contain a NON-RUNNABLE code dump of my research library used for training the model. This is only for very, very interested people and for seeing the model definition in code. If you just want sound, use the colab. 25 | 26 | -------------------------------------------------------------------------------- /pretrained/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | valid_symbols = [ 6 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 7 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 8 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 9 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 10 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 11 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 12 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 13 | ] 14 | 15 | _valid_symbol_set = set(valid_symbols) 16 | 17 | 18 | class CMUDict: 19 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | 31 | def __len__(self): 32 | return len(self._entries) 33 | 34 | 35 | def lookup(self, word): 36 | '''Returns list of ARPAbet pronunciations of the given word.''' 37 | return self._entries.get(word.upper()) 38 | 39 | 40 | 41 | _alt_re = re.compile(r'\([0-9]+\)') 42 | 43 | 44 | def _parse_cmudict(file): 45 | cmudict = {} 46 | for line in file: 47 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 48 | parts = line.split(' ') 49 | word = re.sub(_alt_re, '', parts[0]) 50 | pronunciation = _get_pronunciation(parts[1]) 51 | if pronunciation: 52 | if word in cmudict: 53 | cmudict[word].append(pronunciation) 54 | else: 55 | cmudict[word] = [pronunciation] 56 | return cmudict 57 | 58 | 59 | def _get_pronunciation(s): 60 | parts = s.strip().split(' ') 61 | for part in parts: 62 | if part not in _valid_symbol_set: 63 | return None 64 | return ' '.join(parts) 65 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | valid_symbols = [ 6 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 7 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 8 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 9 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 10 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 11 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 12 | 'UW0', 
'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 13 | ] 14 | 15 | _valid_symbol_set = set(valid_symbols) 16 | 17 | 18 | class CMUDict: 19 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | 31 | def __len__(self): 32 | return len(self._entries) 33 | 34 | 35 | def lookup(self, word): 36 | '''Returns list of ARPAbet pronunciations of the given word.''' 37 | return self._entries.get(word.upper()) 38 | 39 | 40 | 41 | _alt_re = re.compile(r'\([0-9]+\)') 42 | 43 | 44 | def _parse_cmudict(file): 45 | cmudict = {} 46 | for line in file: 47 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 48 | parts = line.split(' ') 49 | word = re.sub(_alt_re, '', parts[0]) 50 | pronunciation = _get_pronunciation(parts[1]) 51 | if pronunciation: 52 | if word in cmudict: 53 | cmudict[word].append(pronunciation) 54 | else: 55 | cmudict[word] = [pronunciation] 56 | return cmudict 57 | 58 | 59 | def _get_pronunciation(s): 60 | parts = s.strip().split(' ') 61 | for part in parts: 62 | if part not in _valid_symbol_set: 63 | return None 64 | return ' '.join(parts) 65 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Preprocess dataset 4 | 5 | usage: preprocess.py [options] 6 | 7 | options: 8 | --num_workers= Num workers. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | -h, --help Show help message. 
12 | """ 13 | from docopt import docopt 14 | import os 15 | from multiprocessing import cpu_count 16 | from tqdm import tqdm 17 | import importlib 18 | from hparams import hparams 19 | 20 | 21 | def preprocess(mod, in_dir, out_root, num_workers): 22 | os.makedirs(out_dir, exist_ok=True) 23 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm) 24 | write_metadata(metadata, out_dir) 25 | 26 | 27 | def write_metadata(metadata, out_dir): 28 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 29 | for m in metadata: 30 | f.write('|'.join([str(x) for x in m]) + '\n') 31 | frames = sum([m[2] for m in metadata]) 32 | sr = hparams.sample_rate 33 | hours = frames / sr / 3600 34 | print('Wrote %d utterances, %d time steps (%.2f hours)' % (len(metadata), frames, hours)) 35 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 36 | print('Max output length: %d' % max(m[2] for m in metadata)) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = docopt(__doc__) 41 | name = args[""] 42 | in_dir = args[""] 43 | out_dir = args[""] 44 | num_workers = args["--num_workers"] 45 | num_workers = cpu_count() if num_workers is None else int(num_workers) 46 | preset = args["--preset"] 47 | 48 | # Load preset if specified 49 | if preset is not None: 50 | with open(preset) as f: 51 | hparams.parse_json(f.read()) 52 | # Override hyper parameters 53 | hparams.parse(args["--hparams"]) 54 | assert hparams.name == "wavenet_vocoder" 55 | 56 | print("Sampling frequency: {}".format(hparams.sample_rate)) 57 | 58 | assert name in ["cmu_arctic", "ljspeech", "librivox", "jsut"] 59 | mod = importlib.import_module(name) 60 | preprocess(mod, in_dir, out_dir, num_workers) 61 | -------------------------------------------------------------------------------- /code/lib/continuous_integration/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called by the "install" step defined in 3 | # .travis.yml. See http://docs.travis-ci.com/ for more details. 4 | # The behavior of the script is controlled by environment variabled defined 5 | # in the .travis.yml in the top level folder of the project. 6 | 7 | # License: 3-clause BSD 8 | 9 | set -e 10 | 11 | # Fix the compilers to workaround avoid having the Python 3.4 build 12 | # lookup for g++44 unexpectedly. 13 | export CC=gcc 14 | export CXX=g++ 15 | 16 | echo 'List files from cached directories' 17 | echo 'pip:' 18 | ls $HOME/.cache/pip 19 | if [[ -d $HOME/download ]]; then 20 | echo 'download' 21 | ls $HOME/download 22 | fi 23 | 24 | # Deactivate the travis-provided virtual environment and setup a 25 | # conda-based environment instead 26 | deactivate 27 | 28 | # Use the miniconda installer for faster download / install of conda 29 | # itself 30 | pushd . 31 | cd 32 | mkdir -p download 33 | cd download 34 | echo "Cached in $HOME/download :" 35 | ls -l 36 | echo 37 | if [[ ! -f miniconda.sh ]] 38 | then 39 | wget https://repo.continuum.io/miniconda/Miniconda2-4.3.11-Linux-x86_64.sh \ 40 | -O miniconda.sh 41 | fi 42 | chmod +x miniconda.sh && ./miniconda.sh -b 43 | cd .. 
44 | echo $(ls /home/travis/m*) 45 | export PATH=/home/travis/miniconda2/bin:$PATH 46 | conda update --yes conda 47 | popd 48 | 49 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 50 | numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION tensorflow=$TF_VERSION 51 | source activate testenv 52 | 53 | if [[ "$INSTALL_MKL" == "true" ]]; then 54 | # Make sure that MKL is used 55 | conda install --yes mkl 56 | else 57 | # Make sure that MKL is not used 58 | conda remove --yes --features mkl || echo "MKL not installed" 59 | fi 60 | 61 | if [[ "$COVERAGE" == "true" ]]; then 62 | pip install coverage coveralls 63 | fi 64 | 65 | # Build scikit-learn in the install.sh script to collapse the verbose 66 | # build output in the travis output when it succeeds. 67 | python --version 68 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 69 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 70 | python -c "import tensorflow as tf; print('tensorflow %s' % tf.__version__)" 71 | python setup.py build_ext --inplace 72 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import setuptools.command.develop 5 | import setuptools.command.build_py 6 | import os 7 | import subprocess 8 | 9 | version = '0.1.1' 10 | 11 | # Adapted from https://github.com/pytorch/pytorch 12 | cwd = os.path.dirname(os.path.abspath(__file__)) 13 | if os.getenv('WAVENET_VOCODER_BUILD_VERSION'): 14 | version = os.getenv('WAVENET_VOCODER_BUILD_VERSION') 15 | else: 16 | try: 17 | sha = subprocess.check_output( 18 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 19 | version += '+' + sha[:7] 20 | except subprocess.CalledProcessError: 21 | pass 22 | except IOError: # FileNotFoundError for python 3 23 | pass 24 | 25 | 26 | class build_py(setuptools.command.build_py.build_py): 27 | 28 | def run(self): 29 | self.create_version_file() 30 | setuptools.command.build_py.build_py.run(self) 31 | 32 | @staticmethod 33 | def create_version_file(): 34 | global version, cwd 35 | print('-- Building version ' + version) 36 | version_path = os.path.join(cwd, 'wavenet_vocoder', 'version.py') 37 | with open(version_path, 'w') as f: 38 | f.write("__version__ = '{}'\n".format(version)) 39 | 40 | 41 | class develop(setuptools.command.develop.develop): 42 | 43 | def run(self): 44 | build_py.create_version_file() 45 | setuptools.command.develop.develop.run(self) 46 | 47 | 48 | setup(name='wavenet_vocoder', 49 | version=version, 50 | description='PyTorch implementation of WaveNet vocoder', 51 | packages=find_packages(), 52 | cmdclass={ 53 | 'build_py': build_py, 54 | 'develop': develop, 55 | }, 56 | install_requires=[ 57 | "numpy", 58 | "scipy", 59 | "torch >= 0.4.1", 60 | ], 61 | extras_require={ 62 | "train": [ 63 | "docopt", 64 | "tqdm", 65 | "tensorboardX", 66 | "nnmnkwii >= 0.0.11", 67 | "keras", 68 | "scikit-learn", 69 | "lws", 70 | ], 71 | "test": [ 72 | "nose", 73 | "pysptk >= 0.1.9", 74 | "librosa", 75 | "matplotlib", 76 | "tqdm", 77 | "nnmnkwii >= 0.0.11", 78 | ], 79 | }) 80 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tests/test_mixture.py: -------------------------------------------------------------------------------- 1 | # 
coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | import librosa 10 | import pysptk 11 | 12 | from wavenet_vocoder.mixture import discretized_mix_logistic_loss 13 | from wavenet_vocoder.mixture import sample_from_discretized_mix_logistic 14 | 15 | 16 | def log_prob_from_logits(x): 17 | """ numerically stable log_softmax implementation that prevents overflow """ 18 | # TF ordering 19 | axis = len(x.size()) - 1 20 | m, _ = torch.max(x, dim=-1, keepdim=True) 21 | return x - m - torch.log(torch.sum(torch.exp(x - m), dim=axis, keepdim=True)) 22 | 23 | 24 | def test_log_softmax(): 25 | x = torch.rand(2, 16000, 30) 26 | y = log_prob_from_logits(x) 27 | y_hat = F.log_softmax(x, -1) 28 | 29 | y = y.data.cpu().numpy() 30 | y_hat = y_hat.data.cpu().numpy() 31 | assert np.allclose(y, y_hat) 32 | 33 | 34 | def test_mixture(): 35 | np.random.seed(1234) 36 | 37 | x, sr = librosa.load(pysptk.util.example_audio_file(), sr=None) 38 | assert sr == 16000 39 | 40 | T = len(x) 41 | x = x.reshape(1, T, 1) 42 | y = torch.from_numpy(x).float() 43 | y_hat = torch.rand(1, 30, T).float() 44 | 45 | print(y.shape, y_hat.shape) 46 | 47 | loss = discretized_mix_logistic_loss(y_hat, y) 48 | print(loss) 49 | 50 | loss = discretized_mix_logistic_loss(y_hat, y, reduce=False) 51 | print(loss.size(), y.size()) 52 | assert loss.size() == y.size() 53 | 54 | y = sample_from_discretized_mix_logistic(y_hat) 55 | print(y.shape) 56 | 57 | 58 | def test_misc(): 59 | # https://en.wikipedia.org/wiki/Logistic_distribution 60 | # what i have learned 61 | # m = (x - mu) / s 62 | m = torch.rand(10, 10) 63 | log_pdf_mid1 = -2 * torch.log(torch.exp(m / 2) + torch.exp(-m / 2)) 64 | log_pdf_mid2 = m - 2 * F.softplus(m) 65 | assert np.allclose(log_pdf_mid1.data.numpy(), log_pdf_mid2.data.numpy()) 66 | 67 | # Edge case for 0 68 | plus_in = torch.rand(10, 10) 69 | log_cdf_plus1 = F.sigmoid(m).log() 70 | log_cdf_plus2 = m - F.softplus(m) 71 | assert np.allclose(log_cdf_plus1.data.numpy(), log_cdf_plus2.data.numpy()) 72 | 73 | # Edge case for 255 74 | min_in = torch.rand(10, 10) 75 | log_one_minus_cdf_min1 = (1 - F.sigmoid(min_in)).log() 76 | log_one_minus_cdf_min2 = -F.softplus(min_in) 77 | assert np.allclose(log_one_minus_cdf_min1.data.numpy(), log_one_minus_cdf_min2.data.numpy()) 78 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | class Conv1d(nn.Conv1d): 8 | """Extended nn.Conv1d for incremental dilated convolutions 9 | """ 10 | 11 | def __init__(self, *args, **kwargs): 12 | super().__init__(*args, **kwargs) 13 | self.clear_buffer() 14 | self._linearized_weight = None 15 | self.register_backward_hook(self._clear_linearized_weight) 16 | 17 | def incremental_forward(self, input): 18 | # input: (B, T, C) 19 | if self.training: 20 | raise RuntimeError('incremental_forward only supports eval mode') 21 | 22 | # run forward pre hooks (e.g., weight norm) 23 | for hook in self._forward_pre_hooks.values(): 24 | hook(self, input) 25 | 26 | # reshape weight 27 | weight = self._get_linearized_weight() 28 | kw = self.kernel_size[0] 29 | dilation = self.dilation[0] 30 | 31 | bsz = 
input.size(0) # input: bsz x len x dim 32 | if kw > 1: 33 | input = input.data 34 | if self.input_buffer is None: 35 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2)) 36 | self.input_buffer.zero_() 37 | else: 38 | # shift buffer 39 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 40 | # append next input 41 | self.input_buffer[:, -1, :] = input[:, -1, :] 42 | input = self.input_buffer 43 | if dilation > 1: 44 | input = input[:, 0::dilation, :].contiguous() 45 | output = F.linear(input.view(bsz, -1), weight, self.bias) 46 | return output.view(bsz, 1, -1) 47 | 48 | def clear_buffer(self): 49 | self.input_buffer = None 50 | 51 | def _get_linearized_weight(self): 52 | if self._linearized_weight is None: 53 | kw = self.kernel_size[0] 54 | # nn.Conv1d 55 | if self.weight.size() == (self.out_channels, self.in_channels, kw): 56 | weight = self.weight.transpose(1, 2).contiguous() 57 | else: 58 | # fairseq.modules.conv_tbc.ConvTBC 59 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 60 | assert weight.size() == (self.out_channels, kw, self.in_channels) 61 | self._linearized_weight = weight.view(self.out_channels, -1) 62 | return self._linearized_weight 63 | 64 | def _clear_linearized_weight(self, *args): 65 | self._linearized_weight = None 66 | -------------------------------------------------------------------------------- /pretrained/numbers_rules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ from https://github.com/keithito/tacotron """ 3 | 4 | import re 5 | from number_to_words import NumberToWords 6 | 7 | n2w = NumberToWords() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return n2w.convert(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | return n2w.convert(num) 52 | 53 | ''' 54 | _inflect = inflect.engine() 55 | def _expand_ordinal(m): 56 | return _inflect.number_to_words(m.group(0)) 57 | 58 | 59 | def _expand_number(m): 60 | num = int(m.group(0)) 61 | if num > 1000 and num < 3000: 62 | if num == 2000: 63 | return 'two thousand' 64 | elif num > 2000 and num < 2010: 65 | return 'two thousand ' + _inflect.number_to_words(num % 100) 66 | elif num % 100 == 0: 67 | return _inflect.number_to_words(num // 
100) + ' hundred' 68 | else: 69 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 70 | else: 71 | return _inflect.number_to_words(num, andword='') 72 | ''' 73 | 74 | def normalize_numbers(text): 75 | text = re.sub(_comma_number_re, _remove_commas, text) 76 | text = re.sub(_pounds_re, r'\1 pounds', text) 77 | text = re.sub(_dollars_re, _expand_dollars, text) 78 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 79 | text = re.sub(_ordinal_re, _expand_ordinal, text) 80 | text = re.sub(_number_re, _expand_number, text) 81 | return text 82 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ from https://github.com/keithito/tacotron """ 3 | 4 | import re 5 | from .number_to_words import NumberToWords 6 | 7 | n2w = NumberToWords() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return n2w.convert(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | return n2w.convert(num) 52 | 53 | ''' 54 | _inflect = inflect.engine() 55 | def _expand_ordinal(m): 56 | return _inflect.number_to_words(m.group(0)) 57 | 58 | 59 | def _expand_number(m): 60 | num = int(m.group(0)) 61 | if num > 1000 and num < 3000: 62 | if num == 2000: 63 | return 'two thousand' 64 | elif num > 2000 and num < 2010: 65 | return 'two thousand ' + _inflect.number_to_words(num % 100) 66 | elif num % 100 == 0: 67 | return _inflect.number_to_words(num // 100) + ' hundred' 68 | else: 69 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 70 | else: 71 | return _inflect.number_to_words(num, andword='') 72 | ''' 73 | 74 | def normalize_numbers(text): 75 | text = re.sub(_comma_number_re, _remove_commas, text) 76 | text = re.sub(_pounds_re, r'\1 pounds', text) 77 | text = re.sub(_dollars_re, _expand_dollars, text) 78 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 79 | text = re.sub(_ordinal_re, _expand_ordinal, text) 80 | text = re.sub(_number_re, _expand_number, text) 81 | return text 82 | -------------------------------------------------------------------------------- /pretrained/text.py: 
-------------------------------------------------------------------------------- 1 | from eng_rules import cmu_g2p, hybrid_g2p, rulebased_g2p 2 | from cleaners import english_cleaners 3 | import re 4 | 5 | def pronounce_chars(line, raw_line=None, cmu_only=False, int_timing_punct=True): 6 | # cleaners strip things... 7 | puncts = ["!",",",":","?","."] 8 | #puncts_timing = ["4","1","1","4", "4"] 9 | puncts_timing = [" "," "," "," ", " "] 10 | end_punct = [(ni, pi) for ni, pi in enumerate(puncts) if pi in line] 11 | if len(end_punct) > 0: 12 | # preserve the end punctuation... 13 | if end_punct[-1][1] == line[-1]: 14 | end_punct = end_punct[-1] 15 | else: 16 | end_punct = (0, " ") 17 | else: 18 | end_punct = (0, " ") 19 | line = english_cleaners(line) 20 | if cmu_only: 21 | r0 = cmu_g2p(line, raw_line) 22 | return r0 23 | 24 | r = hybrid_g2p(line) 25 | 26 | if any([p in line for p in puncts]): 27 | new = [] 28 | psym = r.strip().split(" ") 29 | lsym = line.strip().split(" ") 30 | for lss, pss in zip(lsym, psym): 31 | prev = [] 32 | for ssi in pss.strip().split("@")[1:]: 33 | which_specials = [p for p in puncts if p in lss] 34 | if any([p in lss for p in puncts]): 35 | prev.append(re.sub(re.escape("|".join(puncts)), "", ssi)) 36 | # ASSUME ONLY 1? 37 | else: 38 | prev.append(ssi) 39 | if len(which_specials) > 0: 40 | prev.append(which_specials[0]) 41 | new.append(prev) 42 | prev = [] 43 | 44 | merged = "" 45 | for ii, chunk in enumerate(new): 46 | if any([p in chunk for p in puncts]): 47 | mstr = "" 48 | for ci in chunk: 49 | if any([p in ci for p in puncts]): 50 | which_specials = [(n, p) for n, p in enumerate(puncts) if p in ci] 51 | else: 52 | mstr += "@" 53 | mstr += ci 54 | merged += mstr 55 | if ii < (len(new) - 1): 56 | if not int_timing_punct: 57 | merged += which_specials[0][1] 58 | else: 59 | merged += puncts_timing[which_specials[0][0]] 60 | else: 61 | merged += "@" 62 | merged += "@".join(chunk) 63 | if ii < (len(new) - 1): 64 | merged += " " 65 | if merged[-1] == " ": 66 | merged = merged[:-1] 67 | if not int_timing_punct: 68 | merged += end_punct[1] 69 | else: 70 | merged += puncts_timing[end_punct[0]] 71 | merged += "~" 72 | return merged 73 | else: 74 | return r 75 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/plotters.py: -------------------------------------------------------------------------------- 1 | try: 2 | from cStringIO import StringIO as BytesIO 3 | except: # Python 3 4 | from io import BytesIO 5 | import numpy as np 6 | import PIL.Image 7 | import shutil 8 | from math import sqrt 9 | from skimage.transform import rescale, resize, downscale_local_mean 10 | from skimage.exposure import adjust_gamma 11 | 12 | 13 | def save_image_array(img, filename, resize_multiplier=(1, 1), gamma_multiplier=1, rescale_values=True, flipud=True, flat_wide=False, flat_vert=False, fmt="png"): 14 | """ 15 | Expects a 4D image array of (n_images, height, width, channels) 16 | 17 | rescale will rescale 1 channel images to the maximum value available 18 | 19 | Modified from implementation by Kyle McDonald 20 | 21 | https://github.com/kylemcdonald/python-utils/blob/master/show_array.py 22 | """ 23 | 24 | if len(img.shape) != 4: 25 | raise ValueError("Expects a 4D image array of (n_images, height, width, channels)") 26 | 27 | if flipud: 28 | img = img[:, ::-1] 29 | 30 | n_ex, o_height, o_width, o_channels = img.shape 31 | 32 | if img.shape[0] != 1: 33 | n = len(img) 34 | side = int(sqrt(n)) 35 | side0 = side 36 | side1 = side 37 | 
shp = img.shape 38 | if flat_wide or flat_vert or (side * side) == n: 39 | pass 40 | else: 41 | raise ValueError("Need input length that can be reshaped to a square (4, 16, 25, 36, etc)") 42 | n,h,w,c = img.shape 43 | if flat_wide: 44 | assert flat_wide != flat_vert 45 | side0 = 1 46 | side1 = n_ex 47 | elif flat_vert: 48 | assert flat_wide != flat_vert 49 | side0 = n_ex 50 | side1 = 1 51 | img = img.reshape(side0, side1, h, w, c).swapaxes(1, 2).reshape(side0*h, side1*w, c) 52 | else: 53 | img = img[0] 54 | 55 | if rescale_values: 56 | """ 57 | img_max = np.max(img) 58 | img_min = np.min(img) 59 | # scale to 0, 1 60 | img = (img - img_min) / float(img_max - img_min) 61 | # scale 0, 1 to 0, 255 62 | """ 63 | img *= 255. 64 | 65 | if img.shape[-1] == 1: 66 | img = img[:, :, 0] 67 | 68 | img = np.uint8(np.clip(img, 0, 255)) 69 | if resize_multiplier != (1, 1): 70 | rs = resize(img, (img.shape[0] * resize_multiplier[0], img.shape[1] * resize_multiplier[1])) 71 | 72 | if gamma_multiplier != 1: 73 | rs = adjust_gamma(rs, gamma_multiplier) 74 | 75 | if resize_multiplier != (1, 1) or gamma_multiplier != 1: 76 | rs *= 255. 77 | img = np.uint8(np.clip(rs, 0, 255)) 78 | image_data = BytesIO() 79 | PIL.Image.fromarray(img).save(image_data, fmt) 80 | with open(filename, 'wb') as f: 81 | image_data.seek(0) 82 | shutil.copyfileobj(image_data, f) 83 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/__init__.py: -------------------------------------------------------------------------------- 1 | from cleaning.eng_rules import cmu_g2p, hybrid_g2p, rulebased_g2p 2 | from cleaning.cleaners import english_cleaners 3 | import re 4 | 5 | def pronounce_chars(line, raw_line=None, cmu_only=False, int_timing_punct=True): 6 | # cleaners strip things... 7 | puncts = ["!",",",":","?","."] 8 | #puncts_timing = ["4","1","1","4", "4"] 9 | puncts_timing = [" "," "," "," ", " "] 10 | end_punct = [(ni, pi) for ni, pi in enumerate(puncts) if pi in line] 11 | if len(end_punct) > 0: 12 | # preserve the end punctuation... 13 | if end_punct[-1][1] == line[-1]: 14 | end_punct = end_punct[-1] 15 | else: 16 | end_punct = (0, " ") 17 | else: 18 | end_punct = (0, " ") 19 | line = english_cleaners(line) 20 | if cmu_only: 21 | r0 = cmu_g2p(line, raw_line) 22 | return r0 23 | 24 | r = hybrid_g2p(line) 25 | 26 | if any([p in line for p in puncts]): 27 | new = [] 28 | psym = r.strip().split(" ") 29 | lsym = line.strip().split(" ") 30 | for lss, pss in zip(lsym, psym): 31 | prev = [] 32 | for ssi in pss.strip().split("@")[1:]: 33 | which_specials = [p for p in puncts if p in lss] 34 | if any([p in lss for p in puncts]): 35 | prev.append(re.sub(re.escape("|".join(puncts)), "", ssi)) 36 | # ASSUME ONLY 1? 
37 | else: 38 | prev.append(ssi) 39 | if len(which_specials) > 0: 40 | prev.append(which_specials[0]) 41 | new.append(prev) 42 | prev = [] 43 | 44 | merged = "" 45 | for ii, chunk in enumerate(new): 46 | if any([p in chunk for p in puncts]): 47 | mstr = "" 48 | for ci in chunk: 49 | if any([p in ci for p in puncts]): 50 | which_specials = [(n, p) for n, p in enumerate(puncts) if p in ci] 51 | else: 52 | mstr += "@" 53 | mstr += ci 54 | merged += mstr 55 | if ii < (len(new) - 1): 56 | if not int_timing_punct: 57 | merged += which_specials[0][1] 58 | else: 59 | merged += puncts_timing[which_specials[0][0]] 60 | else: 61 | merged += "@" 62 | merged += "@".join(chunk) 63 | if ii < (len(new) - 1): 64 | merged += " " 65 | if merged[-1] == " ": 66 | merged = merged[:-1] 67 | if not int_timing_punct: 68 | merged += end_punct[1] 69 | else: 70 | merged += puncts_timing[end_punct[0]] 71 | merged += "~" 72 | return merged 73 | else: 74 | return r 75 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/.gitignore: -------------------------------------------------------------------------------- 1 | foobar* 2 | pretrained_models 3 | notebooks 4 | wavenet_vocoder/version.py 5 | checkpoints* 6 | log 7 | generated 8 | data 9 | text 10 | 11 | # Created by https://www.gitignore.io 12 | 13 | ### Python ### 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | 70 | ### IPythonNotebook ### 71 | # Temporary data 72 | .ipynb_checkpoints/ 73 | 74 | 75 | ### SublimeText ### 76 | # cache files for sublime text 77 | *.tmlanguage.cache 78 | *.tmPreferences.cache 79 | *.stTheme.cache 80 | 81 | # workspace files are user-specific 82 | *.sublime-workspace 83 | 84 | # project files should be checked into the repository, unless a significant 85 | # proportion of contributors will probably not be using SublimeText 86 | # *.sublime-project 87 | 88 | # sftp configuration file 89 | sftp-config.json 90 | 91 | 92 | ### Emacs ### 93 | # -*- mode: gitignore; -*- 94 | *~ 95 | \#*\# 96 | /.emacs.desktop 97 | /.emacs.desktop.lock 98 | *.elc 99 | auto-save-list 100 | tramp 101 | .\#* 102 | 103 | # Org-mode 104 | .org-id-locations 105 | *_archive 106 | 107 | # flymake-mode 108 | *_flymake.* 109 | 110 | # eshell files 111 | /eshell/history 112 | /eshell/lastdir 113 | 114 | # elpa packages 115 | /elpa/ 116 | 117 | # reftex files 118 | *.rel 119 | 120 | # AUCTeX auto folder 121 | /auto/ 122 | 123 | # cask packages 124 | .cask/ 125 | 126 | 127 | ### Vim ### 128 | [._]*.s[a-w][a-z] 129 | [._]s[a-w][a-z] 130 | *.un~ 131 | Session.vim 132 | .netrwhist 133 | *~ 134 | 135 | 136 | ### C++ ### 137 | # Compiled Object files 138 | *.slo 139 | *.lo 140 | *.o 141 | *.obj 142 | 143 | # Precompiled Headers 144 | *.gch 145 | *.pch 146 | 147 | # Compiled Dynamic libraries 148 | *.so 149 | *.dylib 150 | *.dll 151 | 152 | # Fortran module files 153 | *.mod 154 | 155 | # Compiled Static libraries 156 | *.lai 157 | *.la 158 | *.a 159 | *.lib 160 | 161 | # Executables 162 | *.exe 163 | *.out 164 | *.app 165 | 166 | 167 | ### OSX ### 168 | .DS_Store 169 | .AppleDouble 170 | .LSOverride 171 | 172 | # Icon must end with two \r 173 | Icon 174 | 175 | 176 | # Thumbnails 177 | ._* 178 | 179 | # Files that might appear on external disk 180 | .Spotlight-V100 181 | .Trashes 182 | 183 | # Directories potentially created on remote AFP share 184 | .AppleDB 185 | .AppleDesktop 186 | Network Trash Folder 187 | Temporary Items 188 | .apdisk 189 | 190 | 191 | ### Linux ### 192 | *~ 193 | 194 | # KDE directory preferences 195 | .directory 196 | 197 | # Linux trash folder which might appear on any partition or disk 198 | .Trash-* 199 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | 15 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 16 | executor = ProcessPoolExecutor(max_workers=num_workers) 17 | futures = [] 18 | index = 1 19 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 20 | for line in f: 21 | 
parts = line.strip().split('|') 22 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 23 | text = parts[2] 24 | futures.append(executor.submit( 25 | partial(_process_utterance, out_dir, index, wav_path, text))) 26 | index += 1 27 | return [future.result() for future in tqdm(futures)] 28 | 29 | 30 | def _process_utterance(out_dir, index, wav_path, text): 31 | # Load the audio to a numpy array: 32 | wav = audio.load_wav(wav_path) 33 | 34 | if hparams.rescaling: 35 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 36 | 37 | # Mu-law quantize 38 | if is_mulaw_quantize(hparams.input_type): 39 | # [0, quantize_channels) 40 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 41 | 42 | # Trim silences 43 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 44 | wav = wav[start:end] 45 | out = out[start:end] 46 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 47 | out_dtype = np.int16 48 | elif is_mulaw(hparams.input_type): 49 | # [-1, 1] 50 | out = P.mulaw(wav, hparams.quantize_channels) 51 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 52 | out_dtype = np.float32 53 | else: 54 | # [-1, 1] 55 | out = wav 56 | constant_values = 0.0 57 | out_dtype = np.float32 58 | 59 | # Compute a mel-scale spectrogram from the trimmed wav: 60 | # (N, D) 61 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 62 | # lws pads zeros internally before performing stft 63 | # this is needed to adjust time resolution between audio and mel-spectrogram 64 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 65 | 66 | # zero pad for quantized signal 67 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 68 | N = mel_spectrogram.shape[0] 69 | assert len(out) >= N * audio.get_hop_size() 70 | 71 | # time resolution adjustment 72 | # ensure length of raw audio is multiple of hop_size so that we can use 73 | # transposed convolution to upsample 74 | out = out[:N * audio.get_hop_size()] 75 | assert len(out) % audio.get_hop_size() == 0 76 | 77 | timesteps = len(out) 78 | 79 | # Write the spectrograms to disk: 80 | audio_filename = 'ljspeech-audio-%05d.npy' % index 81 | mel_filename = 'ljspeech-mel-%05d.npy' % index 82 | np.save(os.path.join(out_dir, audio_filename), 83 | out.astype(out_dtype), allow_pickle=False) 84 | np.save(os.path.join(out_dir, mel_filename), 85 | mel_spectrogram.astype(np.float32), allow_pickle=False) 86 | 87 | # Return a tuple describing this training example: 88 | return (audio_filename, mel_filename, timesteps, text) 89 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/jsut.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import jsut 7 | from nnmnkwii.io import hts 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | 15 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 16 | executor = ProcessPoolExecutor(max_workers=num_workers) 17 | futures = [] 18 | 19 | transcriptions = jsut.TranscriptionDataSource( 20 | in_dir, subsets=jsut.available_subsets).collect_files() 21 | wav_paths = jsut.WavFileDataSource( 22 | in_dir, 
subsets=jsut.available_subsets).collect_files() 23 | 24 | for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)): 25 | futures.append(executor.submit( 26 | partial(_process_utterance, out_dir, index + 1, wav_path, text))) 27 | return [future.result() for future in tqdm(futures)] 28 | 29 | 30 | def _process_utterance(out_dir, index, wav_path, text): 31 | # Load the audio to a numpy array: 32 | wav = audio.load_wav(wav_path) 33 | sr = hparams.sample_rate 34 | 35 | if hparams.rescaling: 36 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 37 | 38 | # Trim silence from hts labels if available 39 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 40 | if exists(lab_path): 41 | labels = hts.load(lab_path) 42 | assert "sil" in labels[0][-1] 43 | assert "sil" in labels[-1][-1] 44 | b = int(labels[0][1] * 1e-7 * sr) 45 | e = int(labels[-1][0] * 1e-7 * sr) 46 | wav = wav[b:e] 47 | else: 48 | wav, _ = librosa.effects.trim(wav, top_db=30) 49 | 50 | # Mu-law quantize 51 | if is_mulaw_quantize(hparams.input_type): 52 | # [0, quantize_channels) 53 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 54 | 55 | # Trim silences 56 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 57 | wav = wav[start:end] 58 | out = out[start:end] 59 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 60 | out_dtype = np.int16 61 | elif is_mulaw(hparams.input_type): 62 | # [-1, 1] 63 | out = P.mulaw(wav, hparams.quantize_channels) 64 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 65 | out_dtype = np.float32 66 | else: 67 | # [-1, 1] 68 | out = wav 69 | constant_values = 0.0 70 | out_dtype = np.float32 71 | 72 | # Compute a mel-scale spectrogram from the trimmed wav: 73 | # (N, D) 74 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 75 | # lws pads zeros internally before performing stft 76 | # this is needed to adjust time resolution between audio and mel-spectrogram 77 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 78 | 79 | # zero pad for quantized signal 80 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 81 | N = mel_spectrogram.shape[0] 82 | assert len(out) >= N * audio.get_hop_size() 83 | 84 | # time resolution adjustment 85 | # ensure length of raw audio is multiple of hop_size so that we can use 86 | # transposed convolution to upsample 87 | out = out[:N * audio.get_hop_size()] 88 | assert len(out) % audio.get_hop_size() == 0 89 | 90 | timesteps = len(out) 91 | 92 | # Write the spectrograms to disk: 93 | audio_filename = 'jsut-audio-%05d.npy' % index 94 | mel_filename = 'jsut-mel-%05d.npy' % index 95 | np.save(os.path.join(out_dir, audio_filename), 96 | out.astype(out_dtype), allow_pickle=False) 97 | np.save(os.path.join(out_dir, mel_filename), 98 | mel_spectrogram.astype(np.float32), allow_pickle=False) 99 | 100 | # Return a tuple describing this training example: 101 | return (audio_filename, mel_filename, timesteps, text) 102 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/audio/magrecnp.py: -------------------------------------------------------------------------------- 1 | from tfbldr.datasets.audio import fetch_sample_speech_tapestry 2 | from tfbldr.datasets.audio import soundsc 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import tensorflow as tf 7 | import os 8 | import numpy as np 9 | from scipy.io import wavfile 10 | from 
tfbldr.datasets.audio import linear_to_mel_weight_matrix 11 | from tfbldr.datasets.audio import stft 12 | from tfbldr.datasets.audio import iterate_invert_spectrogram 13 | 14 | 15 | def sonify(spectrogram, samples, transform_op_fn, logscaled=True): 16 | graph = tf.Graph() 17 | with graph.as_default(): 18 | 19 | noise = tf.Variable(tf.random_normal([samples], stddev=1e-6)) 20 | 21 | x = transform_op_fn(noise) 22 | y = spectrogram 23 | 24 | if logscaled: 25 | x = tf.expm1(x) 26 | y = tf.expm1(y) 27 | 28 | x = tf.nn.l2_normalize(x) 29 | y = tf.nn.l2_normalize(y) 30 | tf.losses.mean_squared_error(x, y[-tf.shape(x)[0]:]) 31 | 32 | optimizer = tf.contrib.opt.ScipyOptimizerInterface( 33 | loss=tf.losses.get_total_loss(), 34 | var_list=[noise], 35 | tol=1e-16, 36 | method='L-BFGS-B', 37 | options={ 38 | 'maxiter': 1000, 39 | 'disp': True 40 | }) 41 | 42 | with tf.Session(graph=graph) as session: 43 | session.run(tf.global_variables_initializer()) 44 | optimizer.minimize(session) 45 | waveform = session.run(noise) 46 | 47 | return waveform 48 | 49 | fs, d = fetch_sample_speech_tapestry() 50 | 51 | sample_rate = fs 52 | window_size = 512 53 | step = 128 54 | n_mel = 80 55 | wav_scale = 2 ** 15 56 | waveform = d / float(wav_scale) 57 | 58 | def logmel(waveform): 59 | z = tf.contrib.signal.stft(waveform, window_size, step) 60 | magnitudes = tf.abs(z) 61 | filterbank = tf.contrib.signal.linear_to_mel_weight_matrix( 62 | num_mel_bins=n_mel, 63 | num_spectrogram_bins=magnitudes.shape[-1].value, 64 | sample_rate=sample_rate, 65 | lower_edge_hertz=125., 66 | upper_edge_hertz=7800.) 67 | melspectrogram = tf.tensordot(magnitudes, filterbank, 1) 68 | return tf.log1p(melspectrogram) 69 | 70 | 71 | def logmel2(waveform): 72 | res = np.abs(stft(waveform, windowsize=window_size, step=step, real=False, compute_onesided=True)) 73 | mels = linear_to_mel_weight_matrix( 74 | res.shape[1], 75 | sample_rate, 76 | lower_edge_hertz=125., 77 | upper_edge_hertz=7800., 78 | n_filts=n_mel, dtype=np.float64) 79 | mel_res = np.dot(res, mels) 80 | return np.log1p(mel_res) 81 | 82 | with tf.Session(): 83 | spectrogram = logmel(waveform).eval() 84 | 85 | spectrogram2 = logmel2(waveform) 86 | spectrogram = (spectrogram - spectrogram.min()) / float(spectrogram.max() - spectrogram.min()) 87 | spectrogram2 = (spectrogram2 - spectrogram2.min()) / float(spectrogram2.max() - spectrogram2.min()) 88 | 89 | f, axarr = plt.subplots(1, 2) 90 | axarr[0].imshow(spectrogram) 91 | axarr[1].imshow(spectrogram2) 92 | plt.savefig("tmpspec") 93 | 94 | reconstructed_waveform = sonify(spectrogram, len(waveform), logmel) 95 | wavfile.write("tmp.wav", sample_rate, soundsc(reconstructed_waveform)) 96 | reconstructed_waveform2 = sonify(spectrogram2, len(waveform), logmel) 97 | wavfile.write("tmp2.wav", sample_rate, soundsc(reconstructed_waveform2)) 98 | 99 | 100 | fftsize = 512 101 | substep = 32 102 | rw_s = np.abs(stft(reconstructed_waveform, fftsize=fftsize, step=substep, real=False, 103 | compute_onesided=False)) 104 | rw = iterate_invert_spectrogram(rw_s, fftsize, substep, n_iter=100, verbose=True) 105 | 106 | rw2_s = np.abs(stft(reconstructed_waveform2, fftsize=fftsize, step=substep, real=False, 107 | compute_onesided=False)) 108 | rw2 = iterate_invert_spectrogram(rw2_s, fftsize, substep, n_iter=100, verbose=True) 109 | 110 | d_s = np.abs(stft(waveform, fftsize=fftsize, step=substep, real=False, 111 | compute_onesided=False)) 112 | df = iterate_invert_spectrogram(d_s, fftsize, substep, n_iter=10, verbose=True) 113 | wavfile.write("tmpif.wav", 
sample_rate, soundsc(df)) 114 | wavfile.write("tmpf.wav", sample_rate, soundsc(rw)) 115 | wavfile.write("tmpf2.wav", sample_rate, soundsc(rw2)) 116 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def trim(quantized): 22 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 23 | return quantized[start:end] 24 | 25 | 26 | def adjust_time_resolution(quantized, mel): 27 | """Adjust time resolution by repeating features 28 | 29 | Args: 30 | quantized (ndarray): (T,) 31 | mel (ndarray): (N, D) 32 | 33 | Returns: 34 | tuple: Tuple of (T,) and (T, D) 35 | """ 36 | assert len(quantized.shape) == 1 37 | assert len(mel.shape) == 2 38 | 39 | upsample_factor = quantized.size // mel.shape[0] 40 | mel = np.repeat(mel, upsample_factor, axis=0) 41 | n_pad = quantized.size - mel.shape[0] 42 | if n_pad != 0: 43 | assert n_pad > 0 44 | mel = np.pad(mel, [(0, n_pad), (0, 0)], mode="constant", constant_values=0) 45 | 46 | # trim 47 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 48 | 49 | return quantized[start:end], mel[start:end, :] 50 | adjast_time_resolution = adjust_time_resolution # 'adjust' is correct spelling, this is for compatibility 51 | 52 | 53 | def start_and_end_indices(quantized, silence_threshold=2): 54 | for start in range(quantized.size): 55 | if abs(quantized[start] - 127) > silence_threshold: 56 | break 57 | for end in range(quantized.size - 1, 1, -1): 58 | if abs(quantized[end] - 127) > silence_threshold: 59 | break 60 | 61 | assert abs(quantized[start] - 127) > silence_threshold 62 | assert abs(quantized[end] - 127) > silence_threshold 63 | 64 | return start, end 65 | 66 | 67 | def melspectrogram(y): 68 | D = _lws_processor().stft(y).T 69 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 70 | if not hparams.allow_clipping_in_normalization: 71 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 72 | return _normalize(S) 73 | 74 | 75 | def get_hop_size(): 76 | hop_size = hparams.hop_size 77 | if hop_size is None: 78 | assert hparams.frame_shift_ms is not None 79 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 80 | return hop_size 81 | 82 | 83 | def _lws_processor(): 84 | return lws.lws(hparams.fft_size, get_hop_size(), mode="speech") 85 | 86 | 87 | def lws_num_frames(length, fsize, fshift): 88 | """Compute number of time frames of lws spectrogram 89 | """ 90 | pad = (fsize - fshift) 91 | if length % fshift == 0: 92 | M = (length + pad * 2 - fsize) // fshift + 1 93 | else: 94 | M = (length + pad * 2 - fsize) // fshift + 2 95 | return M 96 | 97 | 98 | def lws_pad_lr(x, fsize, fshift): 99 | """Compute left and right padding lws internally uses 100 | """ 101 | M = lws_num_frames(len(x), fsize, fshift) 102 | pad = (fsize - fshift) 103 | T = len(x) + 2 * pad 104 | r = (M - 1) * fshift + fsize - T 105 | return pad, pad + r 106 | 107 | # 
Conversions: 108 | 109 | 110 | _mel_basis = None 111 | 112 | 113 | def _linear_to_mel(spectrogram): 114 | global _mel_basis 115 | if _mel_basis is None: 116 | _mel_basis = _build_mel_basis() 117 | return np.dot(_mel_basis, spectrogram) 118 | 119 | 120 | def _build_mel_basis(): 121 | assert hparams.fmax <= hparams.sample_rate // 2 122 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, 123 | fmin=hparams.fmin, fmax=hparams.fmax, 124 | n_mels=hparams.num_mels) 125 | 126 | 127 | def _amp_to_db(x): 128 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 129 | return 20 * np.log10(np.maximum(min_level, x)) 130 | 131 | 132 | def _db_to_amp(x): 133 | return np.power(10.0, x * 0.05) 134 | 135 | 136 | def _normalize(S): 137 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 138 | 139 | 140 | def _denormalize(S): 141 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 142 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/cmu_arctic.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import cmu_arctic 7 | from nnmnkwii.io import hts 8 | from nnmnkwii import preprocessing as P 9 | from hparams import hparams 10 | from os.path import exists 11 | import librosa 12 | 13 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 14 | 15 | 16 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 17 | executor = ProcessPoolExecutor(max_workers=num_workers) 18 | futures = [] 19 | 20 | speakers = cmu_arctic.available_speakers 21 | 22 | wd = cmu_arctic.WavFileDataSource(in_dir, speakers=speakers) 23 | wav_paths = wd.collect_files() 24 | speaker_ids = wd.labels 25 | 26 | for index, (speaker_id, wav_path) in enumerate( 27 | zip(speaker_ids, wav_paths)): 28 | futures.append(executor.submit( 29 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, "N/A"))) 30 | return [future.result() for future in tqdm(futures)] 31 | 32 | 33 | def start_at(labels): 34 | has_silence = labels[0][-1] == "pau" 35 | if not has_silence: 36 | return labels[0][0] 37 | for i in range(1, len(labels)): 38 | if labels[i][-1] != "pau": 39 | return labels[i][0] 40 | assert False 41 | 42 | 43 | def end_at(labels): 44 | has_silence = labels[-1][-1] == "pau" 45 | if not has_silence: 46 | return labels[-1][1] 47 | for i in range(len(labels) - 2, 0, -1): 48 | if labels[i][-1] != "pau": 49 | return labels[i][1] 50 | assert False 51 | 52 | 53 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 54 | sr = hparams.sample_rate 55 | 56 | # Load the audio to a numpy array. 
Resampled if needed 57 | wav = audio.load_wav(wav_path) 58 | 59 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 60 | 61 | # Trim silence from hts labels if available 62 | # TODO 63 | if exists(lab_path) and False: 64 | labels = hts.load(lab_path) 65 | b = int(start_at(labels) * 1e-7 * sr) 66 | e = int(end_at(labels) * 1e-7 * sr) 67 | wav = wav[b:e] 68 | wav, _ = librosa.effects.trim(wav, top_db=20) 69 | else: 70 | wav, _ = librosa.effects.trim(wav, top_db=20) 71 | 72 | if hparams.rescaling: 73 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 74 | 75 | # Mu-law quantize 76 | if is_mulaw_quantize(hparams.input_type): 77 | # [0, quantize_channels) 78 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 79 | 80 | # Trim silences 81 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 82 | wav = wav[start:end] 83 | out = out[start:end] 84 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 85 | out_dtype = np.int16 86 | elif is_mulaw(hparams.input_type): 87 | # [-1, 1] 88 | out = P.mulaw(wav, hparams.quantize_channels) 89 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 90 | out_dtype = np.float32 91 | else: 92 | # [-1, 1] 93 | out = wav 94 | constant_values = 0.0 95 | out_dtype = np.float32 96 | 97 | # Compute a mel-scale spectrogram from the trimmed wav: 98 | # (N, D) 99 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 100 | # lws pads zeros internally before performing stft 101 | # this is needed to adjust time resolution between audio and mel-spectrogram 102 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 103 | 104 | # zero pad for quantized signal 105 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 106 | N = mel_spectrogram.shape[0] 107 | assert len(out) >= N * audio.get_hop_size() 108 | 109 | # time resolution adjustment 110 | # ensure length of raw audio is multiple of hop_size so that we can use 111 | # transposed convolution to upsample 112 | out = out[:N * audio.get_hop_size()] 113 | assert len(out) % audio.get_hop_size() == 0 114 | 115 | timesteps = len(out) 116 | 117 | # Write the spectrograms to disk: 118 | audio_filename = 'cmu_arctic-audio-%05d.npy' % index 119 | mel_filename = 'cmu_arctic-mel-%05d.npy' % index 120 | np.save(os.path.join(out_dir, audio_filename), 121 | out.astype(out_dtype), allow_pickle=False) 122 | np.save(os.path.join(out_dir, mel_filename), 123 | mel_spectrogram.astype(np.float32), allow_pickle=False) 124 | 125 | # Return a tuple describing this training example: 126 | return (audio_filename, mel_filename, timesteps, text, speaker_id) 127 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # NOTE: If you want full control for model architecture. please take a look 5 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 6 | 7 | # Default hyperparameters: 8 | hparams = tf.contrib.training.HParams( 9 | name="wavenet_vocoder", 10 | 11 | # Convenient model builder 12 | builder="wavenet", 13 | 14 | # Input type: 15 | # 1. raw [-1, 1] 16 | # 2. mulaw [-1, 1] 17 | # 3. 
mulaw-quantize [0, mu] 18 | # If input_type is raw or mulaw, network assumes scalar input and 19 | # discretized mixture of logistic distributions output, otherwise one-hot 20 | # input and softmax output are assumed. 21 | # **NOTE**: if you change one of the two parameters below, you need to 22 | # re-run preprocessing before training. 23 | input_type="raw", 24 | quantize_channels=65536, # 65536 or 256 25 | 26 | # Audio: 27 | sample_rate=22050, 28 | # this is only valid when mulaw is True 29 | silence_threshold=2, 30 | num_mels=80, 31 | fmin=125, 32 | fmax=7600, 33 | fft_size=1024, 34 | # shift can be specified by either hop_size or frame_shift_ms 35 | hop_size=256, 36 | frame_shift_ms=None, 37 | min_level_db=-100, 38 | ref_level_db=20, 39 | # whether to rescale waveform or not. 40 | # Let x be an input waveform; the rescaled waveform y is given by: 41 | # y = x / np.abs(x).max() * rescaling_max 42 | rescaling=True, 43 | rescaling_max=0.999, 44 | # mel-spectrogram is normalized to [0, 1] for each utterance and clipping may 45 | # happen depending on min_level_db and ref_level_db, causing clipping noise. 46 | # If False, an assertion is added to ensure no clipping happens. 47 | allow_clipping_in_normalization=True, 48 | 49 | # Mixture of logistic distributions: 50 | log_scale_min=float(np.log(1e-14)), 51 | 52 | # Model: 53 | # This should be equal to `quantize_channels` if mu-law quantize is enabled, 54 | # otherwise num_mixture * 3 (pi, mean, log_scale) 55 | out_channels=10 * 3, 56 | layers=24, 57 | stacks=4, 58 | residual_channels=512, 59 | gate_channels=512, # split into 2 groups internally for gated activation 60 | skip_out_channels=256, 61 | dropout=1 - 0.95, 62 | kernel_size=3, 63 | # If True, apply weight normalization the same as in DeepVoice3 64 | weight_normalization=True, 65 | # Use legacy code or not. Default is True since we already provided a model 66 | # based on the legacy code that can generate high-quality audio.
67 | # Ref: https://github.com/r9y9/wavenet_vocoder/pull/73 68 | legacy=True, 69 | 70 | # Local conditioning (set negative value to disable)) 71 | cin_channels=80, 72 | # If True, use transposed convolutions to upsample conditional features, 73 | # otherwise repeat features to adjust time resolution 74 | upsample_conditional_features=True, 75 | # should np.prod(upsample_scales) == hop_size 76 | upsample_scales=[4, 4, 4, 4], 77 | # Freq axis kernel size for upsampling network 78 | freq_axis_kernel_size=3, 79 | 80 | # Global conditioning (set negative value to disable) 81 | # currently limited for speaker embedding 82 | # this should only be enabled for multi-speaker dataset 83 | gin_channels=-1, # i.e., speaker embedding dim 84 | n_speakers=7, # 7 for CMU ARCTIC 85 | 86 | # Data loader 87 | pin_memory=True, 88 | num_workers=2, 89 | 90 | # train/test 91 | # test size can be specified as portion or num samples 92 | test_size=0.0441, # 50 for CMU ARCTIC single speaker 93 | test_num_samples=None, 94 | random_state=1234, 95 | 96 | # Loss 97 | 98 | # Training: 99 | batch_size=2, 100 | adam_beta1=0.9, 101 | adam_beta2=0.999, 102 | adam_eps=1e-8, 103 | amsgrad=False, 104 | initial_learning_rate=1e-3, 105 | # see lrschedule.py for available lr_schedule 106 | lr_schedule="noam_learning_rate_decay", 107 | lr_schedule_kwargs={}, # {"anneal_rate": 0.5, "anneal_interval": 50000}, 108 | nepochs=2000, 109 | weight_decay=0.0, 110 | clip_thresh=-1, 111 | # max time steps can either be specified as sec or steps 112 | # if both are None, then full audio samples are used in a batch 113 | max_time_sec=None, 114 | max_time_steps=8000, 115 | # Hold moving averaged parameters and use them for evaluation 116 | exponential_moving_average=True, 117 | # averaged = decay * averaged + (1 - decay) * x 118 | ema_decay=0.9999, 119 | 120 | # Save 121 | # per-step intervals 122 | checkpoint_interval=10000, 123 | train_eval_interval=10000, 124 | # per-epoch interval 125 | test_eval_epoch_interval=5, 126 | save_optimizer_state=True, 127 | 128 | # Eval: 129 | ) 130 | 131 | 132 | def hparams_debug_string(): 133 | values = hparams.values() 134 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 135 | return 'Hyperparameters:\n' + '\n'.join(hp) 136 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/librivox.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | 15 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 16 | executor = ProcessPoolExecutor(max_workers=num_workers) 17 | futures = [] 18 | index = 1 19 | 20 | # with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 21 | # for line in f: 22 | # parts = line.strip().split('|') 23 | # wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 24 | # text = parts[2] 25 | # futures.append(executor.submit( 26 | # partial(_process_utterance, out_dir, index, wav_path, text))) 27 | # index += 1 28 | 29 | valid_ext = '.ogg .wav .mp3'.split() 30 | for f in sorted(os.listdir(in_dir)): 31 | valid = sum([f.endswith(ext) for ext in valid_ext]) 
32 | if valid < 1: 33 | continue 34 | 35 | audio_filepath = os.path.join(in_dir, f) 36 | text = audio_filepath # Not very informative 37 | futures.append(executor.submit( 38 | partial(_process_utterance, out_dir, index, audio_filepath, text))) 39 | index += 1 40 | return [tup for future in tqdm(futures) for tup in future.result()] 41 | 42 | 43 | def _process_utterance(out_dir, index, audio_filepath, text): 44 | # Load the audio to a numpy array: 45 | wav_whole = audio.load_wav(audio_filepath) 46 | 47 | if hparams.rescaling: 48 | wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max 49 | 50 | # This is a librivox source, so the audio files are going to be v. long 51 | # compared to a typical 'utterance' : So split the wav into chunks 52 | 53 | tup_results = [] 54 | 55 | n_samples = int(8.0 * hparams.sample_rate) # All 8 second utterances 56 | n_chunks = wav_whole.shape[0] // n_samples 57 | 58 | for chunk_idx in range(n_chunks): 59 | chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples 60 | if chunk_idx == n_chunks - 1: # This is the last chunk - allow it to extend to the end of the file 61 | chunk_end = None 62 | wav = wav_whole[chunk_start: chunk_end] 63 | 64 | # Mu-law quantize 65 | if is_mulaw_quantize(hparams.input_type): 66 | # [0, quantize_channels) 67 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 68 | 69 | # Trim silences 70 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 71 | wav = wav[start:end] 72 | out = out[start:end] 73 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 74 | out_dtype = np.int16 75 | elif is_mulaw(hparams.input_type): 76 | # [-1, 1] 77 | out = P.mulaw(wav, hparams.quantize_channels) 78 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 79 | out_dtype = np.float32 80 | else: 81 | # [-1, 1] 82 | out = wav 83 | constant_values = 0.0 84 | out_dtype = np.float32 85 | 86 | # Compute a mel-scale spectrogram from the trimmed wav: 87 | # (N, D) 88 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 89 | # lws pads zeros internally before performing stft 90 | # this is needed to adjust time resolution between audio and mel-spectrogram 91 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 92 | 93 | # zero pad for quantized signal 94 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 95 | N = mel_spectrogram.shape[0] 96 | assert len(out) >= N * audio.get_hop_size() 97 | 98 | # time resolution adjustment 99 | # ensure length of raw audio is multiple of hop_size so that we can use 100 | # transposed convolution to upsample 101 | out = out[:N * audio.get_hop_size()] 102 | assert len(out) % audio.get_hop_size() == 0 103 | 104 | timesteps = len(out) 105 | 106 | # Write the spectrograms to disk: 107 | audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,) 108 | mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,) 109 | text_idx = '%s - %05d' % (text, chunk_idx,) 110 | np.save(os.path.join(out_dir, audio_filename), 111 | out.astype(out_dtype), allow_pickle=False) 112 | np.save(os.path.join(out_dir, mel_filename), 113 | mel_spectrogram.astype(np.float32), allow_pickle=False) 114 | 115 | # Add results tuple describing this training example: 116 | tup_results.append((audio_filename, mel_filename, timesteps, text_idx)) 117 | 118 | # Return all the audio results tuples (unpack in caller) 119 | return tup_results 120 | 
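Both _process_utterance implementations above (cmu_arctic.py and librivox.py) follow the same alignment contract: the signal is padded the way lws pads internally, then trimmed to exactly N * hop_size samples so the N mel frames can be upsampled back to audio rate by transposed convolution. Below is a minimal numpy-only sketch of that arithmetic, reusing the lws_pad_lr / lws_num_frames formulas from audio.py; the sizes are illustrative stand-ins, not values read from hparams:

import numpy as np

def pad_lr(length, fsize, fshift):
    # same padding rule as lws_num_frames / lws_pad_lr in audio.py above
    pad = fsize - fshift
    if length % fshift == 0:
        n_frames = (length + 2 * pad - fsize) // fshift + 1
    else:
        n_frames = (length + 2 * pad - fsize) // fshift + 2
    extra = (n_frames - 1) * fshift + fsize - (length + 2 * pad)
    return pad, pad + extra, n_frames

fft_size, hop_size = 1024, 256             # illustrative values
wav = np.zeros(10000, dtype=np.float32)    # stand-in for a loaded waveform
l, r, n_frames = pad_lr(len(wav), fft_size, hop_size)
out = np.pad(wav, (l, r), mode="constant")
assert len(out) >= n_frames * hop_size
out = out[:n_frames * hop_size]            # same trim as in _process_utterance
assert len(out) % hop_size == 0            # mels can now be upsampled by hop_size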
-------------------------------------------------------------------------------- /pretrained/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from numbers_rules import normalize_numbers 18 | 19 | 20 | _whitespace_re = re.compile(r'\s+') 21 | _apos_s_re = re.compile(r"'s") 22 | _single_re = re.compile(r'["]') 23 | _double_re = re.compile(r"[']") 24 | _semicolon_re = re.compile(r';') 25 | _paren_re = re.compile(r'[()]') 26 | _bracket_re = re.compile(r'[\[\]]') 27 | _dash_re = re.compile(r'--') 28 | _comma_re = re.compile(r' , ') 29 | _colon_re = re.compile(r':') 30 | _period_re = re.compile(r'\.$') 31 | _abbrev_re = re.compile(r'\.') 32 | _US_re = re.compile(r' US') 33 | _UK_re = re.compile(r' UK') 34 | _FBI_re = re.compile(r' FBI') 35 | _CIA_re = re.compile(r' CIA') 36 | _NSA_re = re.compile(r' NSA') 37 | _USA_re = re.compile(r' USA') 38 | _USSR_re = re.compile(r' USSR') 39 | 40 | # handle 22 -> 22nd??? 41 | 42 | # List of (regular expression, replacement) pairs for abbreviations: 43 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 44 | ('mrs', 'misess'), 45 | ('mr', 'mister'), 46 | ('dr', 'doctor'), 47 | ('st', 'saint'), 48 | ('co', 'company'), 49 | ('jr', 'junior'), 50 | ('maj', 'major'), 51 | ('gen', 'general'), 52 | ('drs', 'doctors'), 53 | ('rev', 'reverend'), 54 | ('lt', 'lieutenant'), 55 | ('hon', 'honorable'), 56 | ('sgt', 'sergeant'), 57 | ('capt', 'captain'), 58 | ('esq', 'esquire'), 59 | ('ltd', 'limited'), 60 | ('col', 'colonel'), 61 | ('ft', 'fort'), 62 | ]] 63 | 64 | 65 | def expand_abbreviations(text): 66 | for regex, replacement in _abbreviations: 67 | text = re.sub(regex, replacement, text) 68 | return text 69 | 70 | 71 | def expand_numbers(text): 72 | return normalize_numbers(text) 73 | 74 | 75 | def lowercase(text): 76 | text = re.sub(_USSR_re, ' U S S R', text) 77 | text = re.sub(_USA_re, ' U S A', text) 78 | text = re.sub(_US_re, ' U S', text) 79 | text = re.sub(_UK_re, ' U K', text) 80 | text = re.sub(_FBI_re, ' F B I', text) 81 | text = re.sub(_CIA_re, ' C I A', text) 82 | return text.lower() 83 | 84 | 85 | def collapse_whitespace(text): 86 | return re.sub(_whitespace_re, ' ', text) 87 | 88 | 89 | def convert_to_ascii(text): 90 | unicode_content = text.decode('utf-8') 91 | return unidecode(unicode_content) 92 | 93 | 94 | def collapse_spurious(text): 95 | text = re.sub(_apos_s_re, "-s", text) 96 | text = re.sub(_single_re, "", text) 97 | text = re.sub(_double_re, "", text) 98 | text = re.sub(_paren_re, "", text) 99 | text = re.sub(_semicolon_re, ",", text) 100 | text = re.sub(_dash_re, ",", text) 101 | text = re.sub(_colon_re, ", ", text) 102 | text = re.sub(_period_re, "", text) 103 | text = re.sub(_bracket_re, "", text) 104 | text = re.sub(_abbrev_re, " ", text) 105 | text = re.sub(_comma_re, ", ", text) 106 | text = re.sub(_comma_re, ", ", text) 107 | return text 108 | 109 | 110 | def basic_cleaners(text): 111 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 112 | text = lowercase(text) 113 | text = collapse_whitespace(text) 114 | return text 115 | 116 | 117 | def transliteration_cleaners(text): 118 | '''Pipeline for non-English text that transliterates to ASCII.''' 119 | text = convert_to_ascii(text) 120 | text = lowercase(text) 121 | text = collapse_whitespace(text) 122 | return text 123 | 124 | 125 | def rulebased_g2p_cleaners(text): 126 | text = convert_to_ascii(text) 127 | from eng_rules import rulebased_g2p 128 | r = rulebased_g2p(text) 129 | text = "^".join(["&".join(ri[1]).lower() for ri in r]) 130 | text = lowercase(text) 131 | return text 132 | 133 | 134 | def english_cleaners(text): 135 | '''Pipeline for English text, including number and abbreviation expansion.''' 136 | text = convert_to_ascii(text) 137 | text = lowercase(text) 138 | text = expand_numbers(text) 139 | text = expand_abbreviations(text) 140 | text = collapse_spurious(text) 141 | text = collapse_whitespace(text) 142 | return text 143 | 144 | 145 | def english_minimal_cleaners(text): 146 | '''Pipeline for English text, including number and abbreviation expansion.''' 147 | text = convert_to_ascii(text) 148 | text = lowercase(text) 149 | text = expand_numbers(text) 150 | text = collapse_whitespace(text) 151 | return text 152 | 153 | 154 | def english_phone_cleaners(text): 155 | '''Pipeline for English phones.''' 156 | return text 157 | 158 | def english_phone_pause_cleaners(text): 159 | '''Pipeline for English phones.''' 160 | return text 161 | -------------------------------------------------------------------------------- 
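A minimal usage sketch for the module above, assuming Python 2 (convert_to_ascii calls str.decode) and that it is run from the pretrained/ directory so numbers_rules and unidecode resolve; the exact expansions depend on those helpers:

import cleaners

# abbreviations such as "Dr." become "doctor", numbers are spelled out,
# and the result is ASCII, lowercased, with whitespace collapsed
print(cleaners.english_cleaners("Dr. Smith was born in 1984."))

# phone strings are passed through untouched by the phone cleaners
print(cleaners.english_phone_cleaners("@hh@ah@l@ow"))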
/code/lib/tfbldr/datasets/text/cleaning/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from .numbers import normalize_numbers 18 | from .eng_rules import rulebased_g2p 19 | 20 | 21 | _whitespace_re = re.compile(r'\s+') 22 | _apos_s_re = re.compile(r"'s") 23 | _single_re = re.compile(r'["]') 24 | _double_re = re.compile(r"[']") 25 | _semicolon_re = re.compile(r';') 26 | _paren_re = re.compile(r'[()]') 27 | _bracket_re = re.compile(r'[\[\]]') 28 | _dash_re = re.compile(r'--') 29 | _comma_re = re.compile(r' , ') 30 | _colon_re = re.compile(r':') 31 | _period_re = re.compile(r'\.$') 32 | _abbrev_re = re.compile(r'\.') 33 | _US_re = re.compile(r' US') 34 | _UK_re = re.compile(r' UK') 35 | _FBI_re = re.compile(r' FBI') 36 | _CIA_re = re.compile(r' CIA') 37 | _NSA_re = re.compile(r' NSA') 38 | _USA_re = re.compile(r' USA') 39 | _USSR_re = re.compile(r' USSR') 40 | 41 | # handle 22 -> 22nd??? 42 | 43 | # List of (regular expression, replacement) pairs for abbreviations: 44 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 45 | ('mrs', 'misess'), 46 | ('mr', 'mister'), 47 | ('dr', 'doctor'), 48 | ('st', 'saint'), 49 | ('co', 'company'), 50 | ('jr', 'junior'), 51 | ('maj', 'major'), 52 | ('gen', 'general'), 53 | ('drs', 'doctors'), 54 | ('rev', 'reverend'), 55 | ('lt', 'lieutenant'), 56 | ('hon', 'honorable'), 57 | ('sgt', 'sergeant'), 58 | ('capt', 'captain'), 59 | ('esq', 'esquire'), 60 | ('ltd', 'limited'), 61 | ('col', 'colonel'), 62 | ('ft', 'fort'), 63 | ]] 64 | 65 | 66 | def expand_abbreviations(text): 67 | for regex, replacement in _abbreviations: 68 | text = re.sub(regex, replacement, text) 69 | return text 70 | 71 | 72 | def expand_numbers(text): 73 | return normalize_numbers(text) 74 | 75 | 76 | def lowercase(text): 77 | text = re.sub(_USSR_re, ' U S S R', text) 78 | text = re.sub(_USA_re, ' U S A', text) 79 | text = re.sub(_US_re, ' U S', text) 80 | text = re.sub(_UK_re, ' U K', text) 81 | text = re.sub(_FBI_re, ' F B I', text) 82 | text = re.sub(_CIA_re, ' C I A', text) 83 | return text.lower() 84 | 85 | 86 | def collapse_whitespace(text): 87 | return re.sub(_whitespace_re, ' ', text) 88 | 89 | 90 | def convert_to_ascii(text): 91 | unicode_content = text.decode('utf-8') 92 | return unidecode(unicode_content) 93 | 94 | 95 | def collapse_spurious(text): 96 | text = re.sub(_apos_s_re, "-s", text) 97 | text = re.sub(_single_re, "", text) 98 | text = re.sub(_double_re, "", text) 99 | text = re.sub(_paren_re, "", text) 100 | text = re.sub(_semicolon_re, ",", text) 101 | text = re.sub(_dash_re, ",", text) 102 | text = re.sub(_colon_re, ", ", text) 103 | text = re.sub(_period_re, "", text) 104 | text = re.sub(_bracket_re, "", text) 105 | text = re.sub(_abbrev_re, " ", text) 106 | text = re.sub(_comma_re, ", ", text) 107 | text = re.sub(_comma_re, ", ", text) 108 | return text 109 | 110 | 111 | def basic_cleaners(text): 112 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 113 | text = lowercase(text) 114 | text = collapse_whitespace(text) 115 | return text 116 | 117 | 118 | def transliteration_cleaners(text): 119 | '''Pipeline for non-English text that transliterates to ASCII.''' 120 | text = convert_to_ascii(text) 121 | text = lowercase(text) 122 | text = collapse_whitespace(text) 123 | return text 124 | 125 | 126 | def rulebased_g2p_cleaners(text): 127 | text = convert_to_ascii(text) 128 | r = rulebased_g2p(text) 129 | text = "^".join(["&".join(ri[1]).lower() for ri in r]) 130 | text = lowercase(text) 131 | return text 132 | 133 | 134 | def english_cleaners(text): 135 | '''Pipeline for English text, including number and abbreviation expansion.''' 136 | text = convert_to_ascii(text) 137 | text = lowercase(text) 138 | text = expand_numbers(text) 139 | text = expand_abbreviations(text) 140 | text = collapse_spurious(text) 141 | text = collapse_whitespace(text) 142 | return text 143 | 144 | 145 | def english_minimal_cleaners(text): 146 | '''Pipeline for English text, including number and abbreviation expansion.''' 147 | text = convert_to_ascii(text) 148 | text = lowercase(text) 149 | text = expand_numbers(text) 150 | text = collapse_whitespace(text) 151 | return text 152 | 153 | 154 | def english_phone_cleaners(text): 155 | '''Pipeline for English phones.''' 156 | return text 157 | 158 | def english_phone_pause_cleaners(text): 159 | '''Pipeline for English phones.''' 160 | return text 161 | -------------------------------------------------------------------------------- 
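This copy of the cleaners differs from the pretrained/ one above only in its imports (the package-relative numbers module and a module-level eng_rules import). The format produced by rulebased_g2p_cleaners is worth spelling out: rulebased_g2p appears to return one entry per word whose second element is the phone list, and the cleaner joins phones within a word with "&" and words with "^". A standalone sketch of that joining step, using a made-up phone sequence in place of real eng_rules output:

# hypothetical rulebased_g2p output for the text "hello world"
r = [("hello", ["HH", "AH", "L", "OW"]), ("world", ["W", "ER", "L", "D"])]
text = "^".join(["&".join(ri[1]).lower() for ri in r])
print(text)  # hh&ah&l&ow^w&er&l&d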
/code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/mixture.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code is adapted from: 3 | # https://github.com/pclucas14/pixel-cnn-pp 4 | # https://github.com/openai/pixel-cnn 5 | 6 | from __future__ import with_statement, print_function, absolute_import 7 | 8 | import math 9 | import numpy as np 10 | 11 | import torch 12 | from torch import nn 13 | from torch.nn import functional as F 14 | 15 | 16 | def log_sum_exp(x): 17 | """ numerically stable log_sum_exp implementation that prevents overflow """ 18 | # TF ordering 19 | axis = len(x.size()) - 1 20 | m, _ = torch.max(x, dim=axis) 21 | m2, _ = torch.max(x, dim=axis, keepdim=True) 22 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 23 | 24 | 25 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, 26 | log_scale_min=-7.0, reduce=True): 27 | """Discretized mixture of logistic distributions loss 28 | 29 | Note that it is assumed that input is scaled to [-1, 1]. 30 | 31 | Args: 32 | y_hat (Tensor): Predicted output (B x C x T) 33 | y (Tensor): Target (B x T x 1). 34 | num_classes (int): Number of classes 35 | log_scale_min (float): Log scale minimum value 36 | reduce (bool): If True, the losses are averaged or summed for each 37 | minibatch. 38 | 39 | Returns 40 | Tensor: loss 41 | """ 42 | assert y_hat.dim() == 3 43 | assert y_hat.size(1) % 3 == 0 44 | nr_mix = y_hat.size(1) // 3 45 | 46 | # (B x T x C) 47 | y_hat = y_hat.transpose(1, 2) 48 | 49 | # unpack parameters. (B, T, num_mixtures) x 3 50 | logit_probs = y_hat[:, :, :nr_mix] 51 | means = y_hat[:, :, nr_mix:2 * nr_mix] 52 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 53 | 54 | # B x T x 1 -> B x T x num_mixtures 55 | y = y.expand_as(means) 56 | 57 | centered_y = y - means 58 | inv_stdv = torch.exp(-log_scales) 59 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 60 | cdf_plus = F.sigmoid(plus_in) 61 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 62 | cdf_min = F.sigmoid(min_in) 63 | 64 | # log probability for edge case of 0 (before scaling) 65 | # equivalent: torch.log(F.sigmoid(plus_in)) 66 | log_cdf_plus = plus_in - F.softplus(plus_in) 67 | 68 | # log probability for edge case of 255 (before scaling) 69 | # equivalent: (1 - F.sigmoid(min_in)).log() 70 | log_one_minus_cdf_min = -F.softplus(min_in) 71 | 72 | # probability for all other cases 73 | cdf_delta = cdf_plus - cdf_min 74 | 75 | mid_in = inv_stdv * centered_y 76 | # log probability in the center of the bin, to be used in extreme cases 77 | # (not actually used in our code) 78 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 79 | 80 | # tf equivalent 81 | """ 82 | log_probs = tf.where(x < -0.999, log_cdf_plus, 83 | tf.where(x > 0.999, log_one_minus_cdf_min, 84 | tf.where(cdf_delta > 1e-5, 85 | tf.log(tf.maximum(cdf_delta, 1e-12)), 86 | log_pdf_mid - np.log(127.5)))) 87 | """ 88 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 89 | # for num_classes=65536 case? 1e-7? not sure.. 90 | inner_inner_cond = (cdf_delta > 1e-5).float() 91 | 92 | inner_inner_out = inner_inner_cond * \ 93 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 94 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 95 | inner_cond = (y > 0.999).float() 96 | inner_out = inner_cond * log_one_minus_cdf_min + (1. 
- inner_cond) * inner_inner_out 97 | cond = (y < -0.999).float() 98 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 99 | 100 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 101 | 102 | if reduce: 103 | return -torch.sum(log_sum_exp(log_probs)) 104 | else: 105 | return -log_sum_exp(log_probs).unsqueeze(-1) 106 | 107 | 108 | def to_one_hot(tensor, n, fill_with=1.): 109 | # we perform one hot encore with respect to the last axis 110 | one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() 111 | if tensor.is_cuda: 112 | one_hot = one_hot.cuda() 113 | one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) 114 | return one_hot 115 | 116 | 117 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0): 118 | """ 119 | Sample from discretized mixture of logistic distributions 120 | 121 | Args: 122 | y (Tensor): B x C x T 123 | log_scale_min (float): Log scale minimum value 124 | 125 | Returns: 126 | Tensor: sample in range of [-1, 1]. 127 | """ 128 | assert y.size(1) % 3 == 0 129 | nr_mix = y.size(1) // 3 130 | 131 | # B x T x C 132 | y = y.transpose(1, 2) 133 | logit_probs = y[:, :, :nr_mix] 134 | 135 | # sample mixture indicator from softmax 136 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 137 | temp = logit_probs.data - torch.log(- torch.log(temp)) 138 | _, argmax = temp.max(dim=-1) 139 | 140 | # (B, T) -> (B, T, nr_mix) 141 | one_hot = to_one_hot(argmax, nr_mix) 142 | # select logistic parameters 143 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 144 | log_scales = torch.clamp(torch.sum( 145 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 146 | # sample from logistic & clip to interval 147 | # we don't actually round to the nearest 8bit value when sampling 148 | u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) 149 | x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) 150 | 151 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 
152 | 153 | return x 154 | -------------------------------------------------------------------------------- /pretrained/representation_mixing_text_to_speech_demo_minimal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Minimal demo of Representation Mixing", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [], 11 | "toc_visible": true 12 | }, 13 | "kernelspec": { 14 | "name": "python2", 15 | "display_name": "Python 2" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "metadata": { 22 | "id": "ub5RuaFnxo-O", 23 | "colab_type": "text" 24 | }, 25 | "cell_type": "markdown", 26 | "source": [ 27 | "## Minimal Demo\n", 28 | "\n", 29 | "This is a minimal demo of Representation Mixing, for more details see the [arxiv paper](https://arxiv.org/abs/1811.07240)\n", 30 | "\n", 31 | "Approximate runtime ~4 minutes" 32 | ] 33 | }, 34 | { 35 | "metadata": { 36 | "id": "m7R_1MpFc3Za", 37 | "colab_type": "text" 38 | }, 39 | "cell_type": "markdown", 40 | "source": [ 41 | "## Setup\n", 42 | "\n", 43 | "### Install dependencies" 44 | ] 45 | }, 46 | { 47 | "metadata": { 48 | "id": "NlLC7Q7Us8go", 49 | "colab_type": "code", 50 | "colab": {} 51 | }, 52 | "cell_type": "code", 53 | "source": [ 54 | "import os\n", 55 | "from os.path import exists, join, expanduser\n", 56 | "\n", 57 | "os.chdir(os.path.expanduser(\"~\"))\n", 58 | "\n", 59 | "representation_mixing_dir = \"representation_mixing\"\n", 60 | "if not os.path.exists(representation_mixing_dir):\n", 61 | " ! git clone https://github.com/kastnerkyle/$representation_mixing_dir\n" 62 | ], 63 | "execution_count": 0, 64 | "outputs": [] 65 | }, 66 | { 67 | "metadata": { 68 | "id": "KBFfji_Avluz", 69 | "colab_type": "code", 70 | "colab": {} 71 | }, 72 | "cell_type": "code", 73 | "source": [ 74 | "# Install dependencies\n", 75 | "! pip install -q --upgrade \"tensorflow<=1.6.0\"\n", 76 | "! 
pip install -q --upgrade \"unidecode\"" 77 | ], 78 | "execution_count": 0, 79 | "outputs": [] 80 | }, 81 | { 82 | "metadata": { 83 | "id": "iZsAP7srBBTe", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "cell_type": "code", 88 | "source": [ 89 | "os.chdir(representation_mixing_dir)\n", 90 | "os.chdir(\"pretrained\")" 91 | ], 92 | "execution_count": 0, 93 | "outputs": [] 94 | }, 95 | { 96 | "metadata": { 97 | "id": "km1SAASEcIL6", 98 | "colab_type": "text" 99 | }, 100 | "cell_type": "markdown", 101 | "source": [ 102 | "\n", 103 | "## Input texts to be synthesized\n", 104 | "\n", 105 | "Choose your favorite sentences :)" 106 | ] 107 | }, 108 | { 109 | "metadata": { 110 | "id": "qnHnJyc1v6U7", 111 | "colab_type": "code", 112 | "colab": {} 113 | }, 114 | "cell_type": "code", 115 | "source": [ 116 | "if os.path.exists(\"sample_lines.txt\"):\n", 117 | " os.remove(\"sample_lines.txt\")" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "metadata": { 124 | "id": "tU1lz6PcbXut", 125 | "colab_type": "code", 126 | "colab": {} 127 | }, 128 | "cell_type": "code", 129 | "source": [ 130 | "%%bash\n", 131 | "cat << EOS > sample_lines.txt\n", 132 | "The cat ate bread.\n", 133 | "That cat is not dead.\n", 134 | "EOS\n", 135 | "\n", 136 | "cat sample_lines.txt" 137 | ], 138 | "execution_count": 0, 139 | "outputs": [] 140 | }, 141 | { 142 | "metadata": { 143 | "id": "15p8phXx6nxe", 144 | "colab_type": "code", 145 | "colab": {} 146 | }, 147 | "cell_type": "code", 148 | "source": [ 149 | "! bash sample.sh" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "metadata": { 156 | "id": "rY_MfE0m8Ese", 157 | "colab_type": "code", 158 | "colab": {} 159 | }, 160 | "cell_type": "code", 161 | "source": [ 162 | "import IPython\n", 163 | "from IPython.display import Audio\n", 164 | "import numpy as np" 165 | ], 166 | "execution_count": 0, 167 | "outputs": [] 168 | }, 169 | { 170 | "metadata": { 171 | "id": "hNG8oI4OiJkJ", 172 | "colab_type": "text" 173 | }, 174 | "cell_type": "markdown", 175 | "source": [ 176 | "## Summary: audio samples" 177 | ] 178 | }, 179 | { 180 | "metadata": { 181 | "id": "OIyfhn0v9Ntg", 182 | "colab_type": "code", 183 | "colab": {} 184 | }, 185 | "cell_type": "code", 186 | "source": [ 187 | "with open(\"sample_lines.txt\", \"r\") as f:\n", 188 | " lines = f.readlines()\n", 189 | "lines = [l.strip() for l in lines]\n", 190 | "\n", 191 | "def sort(files):\n", 192 | " return sorted(files, key=lambda k: int(k.split(\"_\")[1]))\n", 193 | " \n", 194 | "mel_files = sort([f for f in os.listdir(\".\") if \"_mels.npz\" in f])\n", 195 | "audio_files = sort([f for f in os.listdir(\".\") if \"_post.wav\" in f]) \n", 196 | "maps = zip(lines, mel_files[:len(lines)], audio_files[:len(lines)])\n", 197 | "\n", 198 | "for idx, (text, mel, audio) in enumerate(maps):\n", 199 | " print(idx, text)\n", 200 | " IPython.display.display(Audio(audio, rate=22050))" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | } 205 | ] 206 | } 207 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/batch_synth.py: -------------------------------------------------------------------------------- 1 | # Setup WaveNet vocoder hparams 2 | import os 3 | os.environ["KERAS_BACKEND"] = "tensorflow" 4 | 5 | from hparams import hparams 6 | wn_preset = "20180510_mixture_lj_checkpoint_step000320000_ema.json" 7 | wn_checkpoint_path = 
"20180510_mixture_lj_checkpoint_step000320000_ema.pth" 8 | with open(wn_preset) as f: 9 | hparams.parse_json(f.read()) 10 | 11 | # Setup WaveNet vocoder 12 | from train import build_model 13 | from synthesis import wavegen 14 | import torch 15 | from scipy.io import wavfile 16 | 17 | from functools import partial 18 | import numpy as np 19 | import os 20 | import sys 21 | import audio 22 | from tqdm import tqdm 23 | 24 | from nnmnkwii import preprocessing as P 25 | from hparams import hparams 26 | from os.path import exists 27 | import librosa 28 | 29 | from wavenet_vocoder_core.util import is_mulaw_quantize, is_mulaw, is_raw 30 | 31 | if len(sys.argv) < 2: 32 | raise ValueError("Must pass directory of wav files as only argument") 33 | 34 | in_path = sys.argv[1] 35 | assert os.path.exists(in_path) 36 | 37 | def _process_utterance(wav_path, out_dir): 38 | fname = wav_path.split(os.sep)[-1].split(".")[0] 39 | audio_filename = '{}_resolved.npy'.format(fname) 40 | mel_filename = '{}_mel.npy'.format(fname) 41 | apth = os.path.join(out_dir, audio_filename) 42 | mpth = os.path.join(out_dir, mel_filename) 43 | if os.path.exists(apth) and os.path.exists(mpth): 44 | print("File {} already processed".format(wav_path)) 45 | return 46 | 47 | # Load the audio to a numpy array: 48 | wav = audio.load_wav(wav_path) 49 | 50 | if hparams.rescaling: 51 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 52 | 53 | # Mu-law quantize 54 | if is_mulaw_quantize(hparams.input_type): 55 | # [0, quantize_channels) 56 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 57 | 58 | # Trim silences 59 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 60 | wav = wav[start:end] 61 | out = out[start:end] 62 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 63 | out_dtype = np.int16 64 | elif is_mulaw(hparams.input_type): 65 | # [-1, 1] 66 | out = P.mulaw(wav, hparams.quantize_channels) 67 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 68 | out_dtype = np.float32 69 | else: 70 | # [-1, 1] 71 | out = wav 72 | constant_values = 0.0 73 | out_dtype = np.float32 74 | 75 | # Compute a mel-scale spectrogram from the trimmed wav: 76 | # (N, D) 77 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 78 | # lws pads zeros internally before performing stft 79 | # this is needed to adjust time resolution between audio and mel-spectrogram 80 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 81 | 82 | # zero pad for quantized signal 83 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 84 | N = mel_spectrogram.shape[0] 85 | assert len(out) >= N * audio.get_hop_size() 86 | 87 | # time resolution adjustment 88 | # ensure length of raw audio is multiple of hop_size so that we can use 89 | # transposed convolution to upsample 90 | out = out[:N * audio.get_hop_size()] 91 | assert len(out) % audio.get_hop_size() == 0 92 | 93 | timesteps = len(out) 94 | 95 | # Write the spectrograms to disk: 96 | np.save(apth, 97 | out.astype(out_dtype), allow_pickle=False) 98 | np.save(mpth, 99 | mel_spectrogram.astype(np.float32), allow_pickle=False) 100 | 101 | 102 | def soundsc(X, gain_scale=.9, copy=True): 103 | """ 104 | Approximate implementation of soundsc from MATLAB without the audio playing. 
105 | 106 | Parameters 107 | ---------- 108 | X : ndarray 109 | Signal to be rescaled 110 | 111 | gain_scale : float 112 | Gain multiplier, default .9 (90% of maximum representation) 113 | 114 | copy : bool, optional (default=True) 115 | Whether to make a copy of input signal or operate in place. 116 | 117 | Returns 118 | ------- 119 | X_sc : ndarray 120 | (-32767, 32767) scaled version of X as int16, suitable for writing 121 | with scipy.io.wavfile 122 | """ 123 | X = np.array(X, copy=copy) 124 | X = (X - X.min()) / (X.max() - X.min()) 125 | X = 2 * X - 1 126 | X = gain_scale * X 127 | X = X * 2 ** 15 128 | return X.astype('int16') 129 | 130 | 131 | use_cuda = torch.cuda.is_available() 132 | device = torch.device("cuda" if use_cuda else "cpu") 133 | 134 | print("Load checkpoint from {}".format(wn_checkpoint_path)) 135 | if use_cuda: 136 | checkpoint = torch.load(wn_checkpoint_path) 137 | else: 138 | checkpoint = torch.load(wn_checkpoint_path, map_location="cpu") 139 | 140 | if in_path[-1] == str(os.sep): 141 | in_path = in_path[:-1] 142 | 143 | model = build_model().to(device) 144 | model.load_state_dict(checkpoint["state_dict"]) 145 | 146 | wav_paths = [in_path + os.sep + "{}".format(fi) for fi in os.listdir(in_path) if ".wav" in fi] 147 | out_dir = in_path + "_mel" 148 | if not os.path.exists(out_dir): 149 | os.mkdir(out_dir) 150 | 151 | for wp in wav_paths: 152 | print("Saving mels for {}".format(wp)) 153 | _process_utterance(wp, out_dir) 154 | 155 | mel_dir = out_dir 156 | wav_out_dir = mel_dir + "_wavenet_render" 157 | if not os.path.exists(wav_out_dir): 158 | os.mkdir(wav_out_dir) 159 | sample_rate = 22050 160 | mel_paths = [mel_dir + os.sep + "{}".format(fi) for fi in os.listdir(mel_dir) if "mel" in fi] 161 | for mel_path in mel_paths: 162 | c = np.load(mel_path) 163 | if c.shape[1] != hparams.num_mels: 164 | c = np.swapaxes(c, 0, 1) 165 | waveform = wavegen(model, c=c, fast=True, tqdm=tqdm) 166 | fname = mel_path.split(os.sep)[-1].split(".")[0] 167 | fpath = wav_out_dir + str(os.sep) + '{}.wav'.format(fname) 168 | wavfile.write(fpath, sample_rate, waveform) 169 | print("Saved HD audio {}".format(fpath)) 170 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/evaluate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesis waveform for testset 4 | 5 | usage: evaluate.py [options] 6 | 7 | options: 8 | --data-root= Directory contains preprocessed features. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | --length= Steps to generate [default: 32000]. 12 | --speaker-id= Use specific speaker of data in case for multi-speaker datasets. 13 | --initial-value= Initial value for the WaveNet decoder. 14 | --file-name-suffix= File name suffix [default: ]. 15 | --output-html Output html for blog post. 16 | --num-utterances=<N> Generate N utterances per speaker [default: -1]. 17 | -h, --help Show help message.
18 | """ 19 | from docopt import docopt 20 | 21 | import sys 22 | import os 23 | from os.path import dirname, join, basename, splitext 24 | import torch 25 | import numpy as np 26 | from nnmnkwii import preprocessing as P 27 | from keras.utils import np_utils 28 | from tqdm import tqdm 29 | import librosa 30 | 31 | 32 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 33 | 34 | import audio 35 | from hparams import hparams 36 | 37 | 38 | use_cuda = torch.cuda.is_available() 39 | device = torch.device("cuda" if use_cuda else "cpu") 40 | 41 | if __name__ == "__main__": 42 | args = docopt(__doc__) 43 | print("Command line args:\n", args) 44 | data_root = args["--data-root"] 45 | if data_root is None: 46 | data_root = join(dirname(__file__), "data", "cmu_arctic") 47 | checkpoint_path = args[""] 48 | dst_dir = args[""] 49 | 50 | length = int(args["--length"]) 51 | # Note that speaker-id is used for filtering out unrelated-speaker from 52 | # multi-speaker dataset. 53 | speaker_id = args["--speaker-id"] 54 | speaker_id = int(speaker_id) if speaker_id is not None else None 55 | initial_value = args["--initial-value"] 56 | initial_value = None if initial_value is None else float(initial_value) 57 | file_name_suffix = args["--file-name-suffix"] 58 | output_html = args["--output-html"] 59 | num_utterances = int(args["--num-utterances"]) 60 | preset = args["--preset"] 61 | 62 | # Load preset if specified 63 | if preset is not None: 64 | with open(preset) as f: 65 | hparams.parse_json(f.read()) 66 | # Override hyper parameters 67 | hparams.parse(args["--hparams"]) 68 | assert hparams.name == "wavenet_vocoder" 69 | 70 | from train import build_model, get_data_loaders 71 | from synthesis import wavegen 72 | 73 | # Data 74 | # Use exactly same testset used in training script 75 | # disable shuffle for convenience 76 | test_data_loader = get_data_loaders(data_root, speaker_id, test_shuffle=False)["test"] 77 | test_dataset = test_data_loader.dataset 78 | 79 | # Model 80 | model = build_model().to(device) 81 | 82 | # Load checkpoint 83 | print("Load checkpoint from {}".format(checkpoint_path)) 84 | if use_cuda: 85 | checkpoint = torch.load(checkpoint_path) 86 | else: 87 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 88 | model.load_state_dict(checkpoint["state_dict"]) 89 | checkpoint_name = splitext(basename(checkpoint_path))[0] 90 | 91 | os.makedirs(dst_dir, exist_ok=True) 92 | dst_dir_name = basename(os.path.normpath(dst_dir)) 93 | 94 | generated_utterances = {} 95 | for idx, (x, c, g) in enumerate(test_dataset): 96 | target_audio_path = test_dataset.X.collected_files[idx][0] 97 | if g is None and num_utterances > 0 and idx > num_utterances: 98 | break 99 | if num_utterances > 0 and g is not None: 100 | try: 101 | generated_utterances[g] += 1 102 | if generated_utterances[g] > num_utterances: 103 | continue 104 | except KeyError: 105 | generated_utterances[g] = 1 106 | 107 | if output_html: 108 | def _tqdm(x): return x 109 | else: 110 | _tqdm = tqdm 111 | print("Target audio is {}".format(target_audio_path)) 112 | if c is not None: 113 | print("Local conditioned by {}".format(test_dataset.Mel.collected_files[idx][0])) 114 | if g is not None: 115 | print("Global conditioned by speaker id {}".format(g)) 116 | 117 | # Paths 118 | if g is None: 119 | dst_wav_path = join(dst_dir, "{}_{}{}_predicted.wav".format( 120 | idx, checkpoint_name, file_name_suffix)) 121 | target_wav_path = join(dst_dir, "{}_{}{}_target.wav".format( 122 | idx, checkpoint_name, 
file_name_suffix)) 123 | else: 124 | dst_wav_path = join(dst_dir, "speaker{}_{}_{}{}_predicted.wav".format( 125 | g, idx, checkpoint_name, file_name_suffix)) 126 | target_wav_path = join(dst_dir, "speaker{}_{}_{}{}_target.wav".format( 127 | g, idx, checkpoint_name, file_name_suffix)) 128 | 129 | # Generate 130 | waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value, 131 | fast=True, tqdm=_tqdm) 132 | 133 | # save 134 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 135 | if is_mulaw_quantize(hparams.input_type): 136 | x = P.inv_mulaw_quantize(x, hparams.quantize_channels) 137 | elif is_mulaw(hparams.input_type): 138 | x = P.inv_mulaw(x, hparams.quantize_channels) 139 | librosa.output.write_wav(target_wav_path, x, sr=hparams.sample_rate) 140 | 141 | # log 142 | if output_html: 143 | print(""" 144 | 148 | """.format(hparams.name, dst_dir_name, basename(dst_wav_path))) 149 | 150 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 151 | sys.exit(0) 152 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/full_test.txt: -------------------------------------------------------------------------------- 1 | a b c. 2 | x y z. 3 | hurry. 4 | warehouse. 5 | referendum. 6 | is it free? 7 | justifiable. 8 | environment. 9 | a debt runs. 10 | gravitational. 11 | cardboard film. 12 | person thinking. 13 | prepared killer. 14 | aircraft torture. 15 | allergic trouser. 16 | strategic conduct. 17 | worrying literature. 18 | christmas is coming 19 | a pet dilemma thinks. 20 | how was the math test? 21 | good to the last drop. 22 | an m b a agent listens. 23 | a compromise disappears. 24 | an axis of x y or z freezes. 25 | she did her best to help him. 26 | a backbone contests the chaos. 27 | two a greater than two n nine. 28 | don't step on the broken glass. 29 | a damned flips into the patient. 30 | a trade purges within the b b c. 31 | i'd rather be a bird than a fish. 32 | i hear that nancy is very pretty. 33 | i want more detailed information. 34 | please wait outside of the house. 35 | n a s a exposure tunes the waffle. 36 | a mist dictates within the monster. 37 | a sketch ropes the middle ceremony. 38 | every farewell explodes the career. 39 | she folded her handkerchief neatly. 40 | against the steam chooses the studio. 41 | rock music approaches at high velocity. 42 | nine adam baye study on the two pieces. 43 | an unfriendly decay conveys the outcome. 44 | abstraction is often one floor above you. 45 | a played lady ranks any publicized preview. 46 | he told us a very exciting adventure story. 47 | on august twenty eighth, mary plays the piano. 48 | into a controller beams a concrete terrorist. 49 | i often see the time eleven eleven on clocks. 50 | it was getting dark, and we weren't there yet. 51 | against every rhyme starves a choral apparatus. 52 | everyone was busy, so i went to the movie alone. 53 | i checked to make sure that he was still alive. 54 | a dominant vegetarian shies away from the g o p. 55 | joe made the sugar cookies, susan decorated them. 56 | i want to buy a onesie, but know it won't suit me. 57 | a former override of q w e r t y outside the pope. 58 | f b i says that c i a says, i'll stay way from it. 59 | any climbing dish listens to a cumbersome formula. 60 | she wrote him a long letter, but he didn't read it. 61 | dear, beauty is in the heat not physical, i love you. 62 | an appeal on january fifth duplicates a sharp queen. 
63 | a farewell solos on march twenty third shakes north. 64 | he ran out of money so he had to stop playing poker. 65 | for example, a newspaper has only regional distribution t. 66 | i currently have four windows open up, and i don't know why. 67 | next to my indirect vocal declines every unbearable academic. 68 | opposite her sounding bag is a m c's configured thoroughfare. 69 | from april eighth to the present, i only smoke four cigarettes. 70 | i will never be this young again, ever, oh damn, i just got older. 71 | a generous continuum of amazon dot com is the conflicting worker. 72 | she advised him to come back at once the wife lectures the blast. 73 | a song can make or ruin a person's day if they let it get to them. 74 | she did not cheat on the test, for it was not the right thing to do. 75 | he said he was not there yesterday, however, many people saw him there. 76 | should we start class now, or should we wait for everyone to get here? 77 | if purple people eaters are real, where do they find purple people to eat? 78 | on november eighteenth eighteen twenty one, a glittering gem is not enough. 79 | a rocket from space x interacts with the individual beneath the soft flaw. 80 | malls are great places to shop, i can find everything i need under one roof. 81 | i think i will buy the red car, or i will lease the blue one, the faith nests. 82 | italy is my favorite country, in fact, i plan to spend two weeks there next year. 83 | i would have gotten w w w w dot google dot com, but my attendance wasn't good enough. 84 | nineteen twenty is when we are unique together until we realise, we are all the same. 85 | my mum tries to be cool by saying h t t p colon slash slash w w w b a i d u dot com. 86 | he turned in the research paper on friday, otherwise, he emailed a s d f at yahoo dot org. 87 | she works two jobs to make ends meet, at least, that was her reason for no having time to join us. 88 | a remarkable well promotes the alphabet into the adjusted luck, the dress dodges across my assault. 89 | a b c d e f g h i j k l m n o p q r s t u v w x y z one two three four five six seven eight nine ten. 90 | across the waste persists the wrong pacifier, the washed passenger parades under the incorrect computer. 91 | if the easter bunny and the tooth fairy had babies would they take your teeth and leave chocolate for you? 92 | sometimes, all you need to do is completely make an ass of yourself and laugh it off to realise that life isn't so bad after all. 93 | she borrowed the book from him many years ago and hasn't yet returned it, why won't the distinguishing love jump with the juvenile? 94 | last friday in three week's time i saw a spotted striped blue worm shake hands with a legless lizard, the lake is a long way from here. 95 | i was very proud of my nickname throughout high school but today, i couldn't be any different to what my nickname was, the metal lusts, the ranging captain charters the link. 96 | i am happy to take your donation, any amount will be greatly appreciated, the waves were crashing on the shore, it was a lovely sight, the paradox sticks this bowl on top of a spontaneous tea. 97 | a purple pig and a green donkey flew a kite in the middle of the night and ended up sunburned, the contained error poses as a logical target, the divorce attacks near a missing doom, the opera fines the daily examiner into a murderer. 
98 | as the most famous singer-songwriter, jay chou gave a perfect performance in beijing on may twenty fourth, twenty fifth, and twenty sixth twenty three all the fans thought highly of him and took pride in him all the tickets were sold out. 99 | if you like tuna and tomato sauce, try combining the two, it's really not as bad as it sounds, the body may perhaps compensates for the loss of a true metaphysics, the clock within this blog and the clock on my laptop are one hour different from each other. 100 | someone i know recently combined maple syrup and buttered popcorn thinking it would taste like caramel popcornm, it didn't and they don't recommend anyone else do it either, the gentleman marches around the principal, the divorce attacks near a missing doom, the color misprints a circular worry across the controversy. 101 | -------------------------------------------------------------------------------- /pretrained/number_to_words.py: -------------------------------------------------------------------------------- 1 | # https://github.com/ianfieldhouse/number_to_words 2 | 3 | class NumberToWords(object): 4 | """ 5 | Class for converting positive integer values to a textual representation 6 | of the submitted number for value of 0 up to 999999999. 7 | 8 | Example: 9 | >>> from number_to_words import NumberToWords 10 | >>> n2w = NumberToWords() 11 | >>> n2w.convert(123) 12 | 'one hundred and twenty three' 13 | """ 14 | 15 | MAX = 999999999 16 | SMALL_NUMBERS = ['', 'one', 'two', 'three', 'four', 'five', 'six', 17 | 'seven', 'eight', 'nine', 'ten', 'eleven', 18 | 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 19 | 'seventeen', 'eighteen', 'nineteen'] 20 | TENS = ['', '', 'twenty', 'thirty', 'fourty', 'fifty', 'sixty', 'seventy', 21 | 'eighty', 'ninety'] 22 | LARGE_NUMBERS = ['', 'thousand', 'million'] 23 | EXCEPTION_STRING = "This method expects positive integer values between " \ 24 | + "0 and {0}".format(MAX) 25 | 26 | def convert(self, number): 27 | """ 28 | Take an integer and return it converted to a textual representation. 29 | 30 | Args: 31 | number (int): The number to be converted. 32 | 33 | Returns: 34 | sentence (string): The textual representation of `number`. 35 | 36 | Raises: 37 | ValueError: If `number` is not a positive integer or is greater 38 | than `MAX`. 39 | """ 40 | 41 | if not isinstance(number, int): 42 | raise ValueError(self.EXCEPTION_STRING) 43 | try: 44 | sentence = "" 45 | if number == 0: 46 | sentence = "zero" 47 | else: 48 | # split number into a list of strings where each list item is 49 | # at most 3 character in length. 
50 | groups = format(number, ',').split(',') 51 | 52 | # make sure each list item is exactly 3 characters long by 53 | # zero filling 54 | zero_filled_groups = [] 55 | for group in groups: 56 | zero_filled_groups.append(group.zfill(3)) 57 | 58 | # reverse the list of strings so that the list indexes of the 59 | # string representation of hundreds, thousands and million 60 | # match those of `LARGE_NUMBERS` 61 | zero_filled_groups.reverse() 62 | for group in zero_filled_groups: 63 | index = zero_filled_groups.index(group) 64 | suffix = self.LARGE_NUMBERS[index] 65 | is_and_required = False 66 | if index == 0 and len(zero_filled_groups) > 1: 67 | is_and_required = True 68 | number_as_words = " ".join( 69 | self._number_to_word_list(group, is_and_required, 70 | suffix)) 71 | if len(number_as_words) > 0: 72 | sentence = "{0} {1}".format(number_as_words, sentence) 73 | # set this group to None so as to not set a false `index` 74 | # for subsequent groups where `number` has multiple 75 | # identical groups 76 | zero_filled_groups[index] = None 77 | return sentence.rstrip() 78 | except (IndexError, ValueError): 79 | raise ValueError(self.EXCEPTION_STRING) 80 | 81 | def _number_to_word_list(self, number_string, is_and_required, 82 | suffix=None): 83 | """ 84 | Take a 3 digit string representation of an integer and convert it to a 85 | textual representation with an optional suffix. 86 | 87 | Args: 88 | number_string (str): The number to be converted as a string. 89 | is_and_required (bool): Whether the word and should be prefixed 90 | before tens and units when there is a zero 91 | in the hundreds column. 92 | suffix (Optional[str]): The string to append to the end of the 93 | words (default None) 94 | 95 | Returns: 96 | words (List[str]): A list of strings of the words that make up the 97 | textual representation of `number_string`.
98 | """ 99 | 100 | words = [] 101 | hundreds, tens, units = [int(n) for n in list(number_string)] 102 | total = sum([hundreds, tens, units]) 103 | if hundreds != 0: 104 | string = self.SMALL_NUMBERS[hundreds] 105 | words.append("{0} hundred".format(string)) 106 | if tens != 0 or units != 0: 107 | # KK: mod 108 | pass 109 | #words.append("and") 110 | elif hundreds == 0 and is_and_required and total != 0: 111 | # KK: mod 112 | pass 113 | #words.append("and") 114 | if tens == 1: 115 | string = self.SMALL_NUMBERS[int("{0}{1}".format(tens, units))] 116 | words.append("{0}".format(string)) 117 | else: 118 | if tens != 0: 119 | string = self.TENS[tens] 120 | words.append("{0}".format(string)) 121 | if units != 0: 122 | string = self.SMALL_NUMBERS[units] 123 | words.append("{0}".format(string)) 124 | 125 | if suffix and total != 0: 126 | words.append(suffix) 127 | 128 | return words 129 | 130 | if __name__ == "__main__": 131 | n2w = NumberToWords() 132 | unique = set() 133 | 134 | def fib(): 135 | x, y = 0, 1 136 | yield x 137 | yield y 138 | 139 | while True: 140 | x, y = y, x + y 141 | yield y 142 | 143 | for num in fib(): 144 | if num > n2w.MAX: 145 | break 146 | unique.add(num) 147 | 148 | print(n2w.__doc__) 149 | print(""" 150 | Some example conversions from number to words 151 | =============================================\n""") 152 | 153 | for num in sorted(list(unique)): 154 | print("{0} : {1}".format(format(num, ','), n2w.convert(num))) 155 | print("{0} : {1}".format(format(n2w.MAX, ','), n2w.convert(n2w.MAX))) 156 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/number_to_words.py: -------------------------------------------------------------------------------- 1 | # https://github.com/ianfieldhouse/number_to_words 2 | 3 | class NumberToWords(object): 4 | """ 5 | Class for converting positive integer values to a textual representation 6 | of the submitted number for value of 0 up to 999999999. 7 | 8 | Example: 9 | >>> from number_to_words import NumberToWords 10 | >>> n2w = NumberToWords() 11 | >>> n2w.convert(123) 12 | 'one hundred and twenty three' 13 | """ 14 | 15 | MAX = 999999999 16 | SMALL_NUMBERS = ['', 'one', 'two', 'three', 'four', 'five', 'six', 17 | 'seven', 'eight', 'nine', 'ten', 'eleven', 18 | 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 19 | 'seventeen', 'eighteen', 'nineteen'] 20 | TENS = ['', '', 'twenty', 'thirty', 'fourty', 'fifty', 'sixty', 'seventy', 21 | 'eighty', 'ninety'] 22 | LARGE_NUMBERS = ['', 'thousand', 'million'] 23 | EXCEPTION_STRING = "This method expects positive integer values between " \ 24 | + "0 and {0}".format(MAX) 25 | 26 | def convert(self, number): 27 | """ 28 | Take an integer and return it converted to a textual representation. 29 | 30 | Args: 31 | number (int): The number to be converted. 32 | 33 | Returns: 34 | sentence (string): The textual representation of `number`. 35 | 36 | Raises: 37 | ValueError: If `number` is not a positive integer or is greater 38 | than `MAX`. 39 | """ 40 | 41 | if not isinstance(number, int): 42 | raise ValueError(self.EXCEPTION_STRING) 43 | try: 44 | sentence = "" 45 | if number == 0: 46 | sentence = "zero" 47 | else: 48 | # split number into a list of strings where each list item is 49 | # at most 3 character in length. 
50 | groups = format(number, ',').split(',') 51 | 52 | # make sure each list item is exactly 3 characters long by 53 | # zero filling 54 | zero_filled_groups = [] 55 | for group in groups: 56 | zero_filled_groups.append(group.zfill(3)) 57 | 58 | # reverse the list of strings so that the list indexes of the 59 | # string representation of hundreds, thousands and million 60 | # match those of `LARGE_NUMBERS` 61 | zero_filled_groups.reverse() 62 | for group in zero_filled_groups: 63 | index = zero_filled_groups.index(group) 64 | suffix = self.LARGE_NUMBERS[index] 65 | is_and_required = False 66 | if index == 0 and len(zero_filled_groups) > 1: 67 | is_and_required = True 68 | number_as_words = " ".join( 69 | self._number_to_word_list(group, is_and_required, 70 | suffix)) 71 | if len(number_as_words) > 0: 72 | sentence = "{0} {1}".format(number_as_words, sentence) 73 | # set this group to None so as to not set a false `index` 74 | # for subsequent groups where `number` has multiple 75 | # identical groups 76 | zero_filled_groups[index] = None 77 | return sentence.rstrip() 78 | except (IndexError, ValueError): 79 | raise ValueError(self.EXCEPTION_STRING) 80 | 81 | def _number_to_word_list(self, number_string, is_and_required, 82 | suffix=None): 83 | """ 84 | Take a 3 digit string representation of an integer and convert it to a 85 | textual representation with an optional suffix. 86 | 87 | Args: 88 | number_string (str): The number to be converted as a string. 89 | is_and_required (bool): Whether the word "and" should be prefixed 90 | before tens and units when there is a zero 91 | in the hundreds column. 92 | suffix (Optional[str]): The string to append to the end of the 93 | words (default None) 94 | 95 | Returns: 96 | words (List[str]): A list of strings of the words that make up the 97 | textual representation of `number_string`.
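Example (note the "and" insertion is disabled in the body below): _number_to_word_list("305", False, None) -> ['three hundred', 'five'] and _number_to_word_list("017", False, "million") -> ['seventeen', 'million']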
98 | """ 99 | 100 | words = [] 101 | hundreds, tens, units = [int(n) for n in list(number_string)] 102 | total = sum([hundreds, tens, units]) 103 | if hundreds != 0: 104 | string = self.SMALL_NUMBERS[hundreds] 105 | words.append("{0} hundred".format(string)) 106 | if tens != 0 or units != 0: 107 | # KK: mod 108 | pass 109 | #words.append("and") 110 | elif hundreds == 0 and is_and_required and total != 0: 111 | # KK: mod 112 | pass 113 | #words.append("and") 114 | if tens == 1: 115 | string = self.SMALL_NUMBERS[int("{0}{1}".format(tens, units))] 116 | words.append("{0}".format(string)) 117 | else: 118 | if tens != 0: 119 | string = self.TENS[tens] 120 | words.append("{0}".format(string)) 121 | if units != 0: 122 | string = self.SMALL_NUMBERS[units] 123 | words.append("{0}".format(string)) 124 | 125 | if suffix and total != 0: 126 | words.append(suffix) 127 | 128 | return words 129 | 130 | if __name__ == "__main__": 131 | n2w = NumberToWords() 132 | unique = set() 133 | 134 | def fib(): 135 | x, y = 0, 1 136 | yield x 137 | yield y 138 | 139 | while True: 140 | x, y = y, x + y 141 | yield y 142 | 143 | for num in fib(): 144 | if num > n2w.MAX: 145 | break 146 | unique.add(num) 147 | 148 | print(n2w.__doc__) 149 | print(""" 150 | Some example conversions from number to words 151 | =============================================\n""") 152 | 153 | for num in sorted(list(unique)): 154 | print("{0} : {1}".format(format(num, ','), n2w.convert(num))) 155 | print("{0} : {1}".format(format(n2w.MAX, ','), n2w.convert(n2w.MAX))) 156 | -------------------------------------------------------------------------------- /pretrained/transform_text.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from text import pronounce_chars 3 | from cleaning import text_to_sequence 4 | from cleaning import sequence_to_text 5 | from cleaning import get_vocabulary_sizes 6 | import cleaners 7 | 8 | clean_names = ["english_cleaners", "english_phone_cleaners"] 9 | lcl_random_state = np.random.RandomState(4142) 10 | 11 | def transform_text(char_seq, auto_pronounce=True, phone_seq=None, force_char_spc=True, symbol_processing="blended_pref", random_state=None): 12 | """ 13 | chars format example: "i am learning english." 14 | phone_seq format example: "@ay @ae@m @l@er@n@ih@ng @ih@ng@g@l@ih@sh" 15 | 16 | chars_only 17 | phones_only 18 | blended_pref 19 | 20 | phone_seq formatting can be gotten from text, using the pronounce_chars function with 'from text import pronounce_chars' 21 | Uses cmudict to do pronunciation 22 | """ 23 | if random_state is None: 24 | random_state = lcl_random_state 25 | 26 | if phone_seq is None and auto_pronounce is False and symbol_processing != "chars_only": 27 | raise ValueError("phone_seq argument must be provided for iterator with self.symbol_processing != 'chars_only', currently '{}'".format(self.symbol_processing)) 28 | clean_char_seq = cleaners.english_cleaners(char_seq) 29 | char_seq_chunk = clean_char_seq.split(" ") 30 | dirty_seq_chunk = char_seq.split(" ") 31 | 32 | if auto_pronounce is True: 33 | if phone_seq is not None: 34 | raise ValueError("auto_pronounce set to True, but phone_seq was provided! Pass phone_seq=None for auto_pronounce=True") 35 | # take out specials then put them back... 36 | specials = "!?.,;:" 37 | puncts = "!?." 
38 | tsc = [] 39 | for n, csc in enumerate(char_seq_chunk): 40 | broke = False 41 | for s in specials: 42 | if s in csc: 43 | new = csc.replace(s, "") 44 | tsc.append(new) 45 | broke = True 46 | break 47 | if not broke: 48 | tsc.append(csc) 49 | 50 | if symbol_processing == "blended_pref": 51 | chunky_phone_seq_chunk = [pronounce_chars(w, raw_line=dirty_seq_chunk[ii], cmu_only=True) for ii, w in enumerate(tsc)] 52 | phone_seq_chunk = [cpsc[0] if cpsc != None else None for cpsc in chunky_phone_seq_chunk] 53 | else: 54 | phone_seq_chunk = [pronounce_chars(w) for w in tsc] 55 | for n, psc in enumerate(phone_seq_chunk): 56 | for s in specials: 57 | if char_seq_chunk[n][-1] == s and phone_seq_chunk[n] != None: 58 | phone_seq_chunk[n] += char_seq_chunk[n][-1] 59 | #if char_seq_chunk[n][-1] in puncts and n != (len(phone_seq_chunk) - 1): 60 | # # add eos 61 | # char_seq_chunk[n] += "~" 62 | # phone_seq_chunk[n] += "~" 63 | break 64 | else: 65 | raise ValueError("Non auto_pronounce setting not yet configured") 66 | 67 | if len(char_seq_chunk) != len(phone_seq_chunk): 68 | raise ValueError("Char and phone chunking resulted in different lengths {} and {}!\n{}\n{}".format(len(char_seq_chunk), len(phone_seq_chunk), char_seq_chunk, phone_seq_chunk)) 69 | 70 | if symbol_processing != "phones_only": 71 | spc = text_to_sequence(" ", [clean_names[0]])[0] 72 | else: 73 | spc = text_to_sequence(" ", [clean_names[1]])[0] 74 | 75 | int_char_chunks = [] 76 | int_phone_chunks = [] 77 | for n in range(len(char_seq_chunk)): 78 | int_char_chunks.append(text_to_sequence(char_seq_chunk[n], [clean_names[0]])[:-1]) 79 | if phone_seq_chunk[n] == None: 80 | int_phone_chunks.append([]) 81 | else: 82 | int_phone_chunks.append(text_to_sequence(phone_seq_chunk[n], [clean_names[1]])[:-2]) 83 | 84 | # check inverses 85 | # w = [sequence_to_text(int_char_chunks[i], [self.clean_names[0]]) for i in range(len(int_char_chunks))] 86 | # p = [sequence_to_text(int_phone_chunks[i], [self.clean_names[1]]) for i in range(len(int_phone_chunks))] 87 | 88 | # TODO: Unify the two functions? 89 | char_phone_mask = [0] * len(int_char_chunks) + [1] * len(int_phone_chunks) 90 | random_state.shuffle(char_phone_mask) 91 | char_phone_mask = char_phone_mask[:len(int_char_chunks)] 92 | # setting char_phone_mask to 0 will use chars, 1 will use phones 93 | # these if statements override the default for blended... (above) 94 | if symbol_processing == "blended_pref": 95 | char_phone_mask = [0 if len(int_phone_chunks[i]) == 0 else 1 for i in range(len(int_char_chunks))] 96 | elif symbol_processing == "phones_only": 97 | # set the mask to use only phones 98 | # all files should have phones because of earlier preproc... 
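# e.g. for "i am learning english." this gives mask [1, 1, 1, 1], so every word is emitted as phone ids rather than character ids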
99 | char_phone_mask = [1 for i in range(len(char_phone_mask))] 100 | elif symbol_processing == "chars_only": 101 | # only use chars 102 | char_phone_mask = [0 for i in range(len(char_phone_mask))] 103 | 104 | # if the phones entry is None, the word was OOV or not recognized 105 | char_phone_int_seq = [int_char_chunks[i] if (len(int_phone_chunks[i]) == 0 or char_phone_mask[i] == 0) else int_phone_chunks[i] for i in range(len(int_char_chunks))] 106 | # check the inverse is ok 107 | # char_phone_txt = [sequence_to_text(char_phone_int_seq[i], [self.clean_names[char_phone_mask[i]]]) for i in range(len(char_phone_int_seq))] 108 | # combine into 1 sequence 109 | cphi = char_phone_int_seq[0] 110 | cpm = [char_phone_mask[0]] * len(char_phone_int_seq[0]) 111 | if force_char_spc or self.symbol_processing != "phones_only": 112 | spc = text_to_sequence(" ", [clean_names[0]])[0] 113 | else: 114 | spc = text_to_sequence(" ", [clean_names[1]])[0] 115 | for i in range(len(char_phone_int_seq[1:])): 116 | # add space 117 | cphi += [spc] 118 | # always treat space as char unless in phones only mode 119 | if force_char_spc or self.symbol_processing != "phones_only": 120 | cpm += [0] 121 | else: 122 | cpm += [1] 123 | cphi += char_phone_int_seq[i + 1] 124 | cpm += [char_phone_mask[i + 1]] * len(char_phone_int_seq[i + 1]) 125 | # trailing space 126 | #cphi = cphi + [spc] 127 | # trailing eos 128 | cphi = cphi + [1] 129 | # add trailing symbol 130 | if symbol_processing != "phones_only": 131 | cpm += [0] 132 | else: 133 | cpm += [1] 134 | # check inverse 135 | #cpt = "".join([sequence_to_text([cphi[i]], [self.clean_names[cpm[i]]]) for i in range(len(cphi))]) 136 | #if None in phone_seq_chunk: 137 | #print("NUN") 138 | #print(cpt) 139 | #from IPython import embed; embed(); raise ValueError() 140 | return cphi, cpm 141 | 142 | def inverse_transform_text(int_seq, mask): 143 | """ 144 | mask set to zero will use chars, mask set to 1 will use phones 145 | 146 | should invert the transform_txt function 147 | """ 148 | cphi = int_seq 149 | cpm = mask 150 | cpt = "".join([sequence_to_text([cphi[i]], [clean_names[cpm[i]]]) for i in range(len(cphi))]) 151 | return cpt 152 | # setting char_phone_mask to 0 will use chars, 1 will use phones 153 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/synthesis.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesis waveform from trained WaveNet. 4 | 5 | usage: synthesis.py [options] 6 | 7 | options: 8 | --hparams= Hyper parameters [default: ]. 9 | --preset= Path of preset parameters (json). 10 | --length= Steps to generate [default: 32000]. 11 | --initial-value= Initial value for the WaveNet decoder. 12 | --conditional=

Conditional features path. 13 | --symmetric-mels Symmetric mel. 14 | --max-abs-value= Max abs value [default: -1]. 15 | --file-name-suffix= File name suffix [default: ]. 16 | --speaker-id= Speaker ID (for multi-speaker model). 17 | --output-html Output html for blog post. 18 | -h, --help Show help message. 19 | """ 20 | from docopt import docopt 21 | 22 | import sys 23 | import os 24 | from os.path import dirname, join, basename, splitext 25 | import torch 26 | import numpy as np 27 | from nnmnkwii import preprocessing as P 28 | from keras.utils import np_utils 29 | from tqdm import tqdm 30 | import librosa 31 | 32 | from wavenet_vocoder_core.util import is_mulaw_quantize, is_mulaw, is_raw 33 | 34 | import audio 35 | from hparams import hparams 36 | 37 | 38 | torch.set_num_threads(4) 39 | use_cuda = torch.cuda.is_available() 40 | device = torch.device("cuda" if use_cuda else "cpu") 41 | 42 | 43 | def _to_numpy(x): 44 | # this is ugly 45 | if x is None: 46 | return None 47 | if isinstance(x, np.ndarray) or np.isscalar(x): 48 | return x 49 | # remove batch axis 50 | if x.dim() == 3: 51 | x = x.squeeze(0) 52 | return x.numpy() 53 | 54 | 55 | def wavegen(model, length=None, c=None, g=None, initial_value=None, 56 | fast=False, tqdm=tqdm): 57 | """Generate waveform samples by WaveNet. 58 | 59 | Args: 60 | model (nn.Module) : WaveNet decoder 61 | length (int): Time steps to generate. If conditinlal features are given, 62 | then this is determined by the feature size. 63 | c (numpy.ndarray): Conditional features, of shape T x C 64 | g (scaler): Speaker ID 65 | initial_value (int) : initial_value for the WaveNet decoder. 66 | fast (Bool): Whether to remove weight normalization or not. 67 | tqdm (lambda): tqdm 68 | 69 | Returns: 70 | numpy.ndarray : Generated waveform samples 71 | """ 72 | from train import sanity_check 73 | sanity_check(model, c, g) 74 | 75 | c = _to_numpy(c) 76 | g = _to_numpy(g) 77 | 78 | model.eval() 79 | if fast: 80 | model.make_generation_fast_() 81 | 82 | if c is None: 83 | assert length is not None 84 | else: 85 | # (Tc, D) 86 | if c.ndim != 2: 87 | raise RuntimeError( 88 | "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape)) 89 | assert c.ndim == 2 90 | Tc = c.shape[0] 91 | upsample_factor = audio.get_hop_size() 92 | # Overwrite length according to feature size 93 | length = Tc * upsample_factor 94 | # (Tc, D) -> (Tc', D) 95 | # Repeat features before feeding it to the network 96 | if not hparams.upsample_conditional_features: 97 | c = np.repeat(c, upsample_factor, axis=0) 98 | 99 | # B x C x T 100 | c = torch.FloatTensor(c.T).unsqueeze(0) 101 | 102 | if initial_value is None: 103 | if is_mulaw_quantize(hparams.input_type): 104 | initial_value = P.mulaw_quantize(0, hparams.quantize_channels) 105 | else: 106 | initial_value = 0.0 107 | 108 | if is_mulaw_quantize(hparams.input_type): 109 | assert initial_value >= 0 and initial_value < hparams.quantize_channels 110 | initial_input = np_utils.to_categorical( 111 | initial_value, num_classes=hparams.quantize_channels).astype(np.float32) 112 | initial_input = torch.from_numpy(initial_input).view( 113 | 1, 1, hparams.quantize_channels) 114 | else: 115 | initial_input = torch.zeros(1, 1, 1).fill_(initial_value) 116 | 117 | g = None if g is None else torch.LongTensor([g]) 118 | 119 | # Transform data to GPU 120 | initial_input = initial_input.to(device) 121 | g = None if g is None else g.to(device) 122 | c = None if c is None else c.to(device) 123 | 124 | with 
torch.no_grad(): 125 | y_hat = model.incremental_forward( 126 | initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True, 127 | log_scale_min=hparams.log_scale_min) 128 | 129 | if is_mulaw_quantize(hparams.input_type): 130 | y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() 131 | y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) 132 | elif is_mulaw(hparams.input_type): 133 | y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) 134 | else: 135 | y_hat = y_hat.view(-1).cpu().data.numpy() 136 | 137 | return y_hat 138 | 139 | 140 | if __name__ == "__main__": 141 | args = docopt(__doc__) 142 | print("Command line args:\n", args) 143 | checkpoint_path = args[""] 144 | dst_dir = args[""] 145 | 146 | length = int(args["--length"]) 147 | initial_value = args["--initial-value"] 148 | initial_value = None if initial_value is None else float(initial_value) 149 | conditional_path = args["--conditional"] 150 | # From https://github.com/Rayhane-mamah/Tacotron-2 151 | symmetric_mels = args["--symmetric-mels"] 152 | max_abs_value = float(args["--max-abs-value"]) 153 | 154 | file_name_suffix = args["--file-name-suffix"] 155 | output_html = args["--output-html"] 156 | speaker_id = args["--speaker-id"] 157 | speaker_id = None if speaker_id is None else int(speaker_id) 158 | preset = args["--preset"] 159 | 160 | # Load preset if specified 161 | if preset is not None: 162 | with open(preset) as f: 163 | hparams.parse_json(f.read()) 164 | # Override hyper parameters 165 | hparams.parse(args["--hparams"]) 166 | assert hparams.name == "wavenet_vocoder" 167 | 168 | # Load conditional features 169 | if conditional_path is not None: 170 | c = np.load(conditional_path) 171 | if c.shape[1] != hparams.num_mels: 172 | c = np.swapaxes(c, 0, 1) 173 | if max_abs_value > 0: 174 | min_, max_ = 0, max_abs_value 175 | if symmetric_mels: 176 | min_ = -max_ 177 | print("Normalize features to desired range [0, 1] from [{}, {}]".format(min_, max_)) 178 | c = np.interp(c, (min_, max_), (0, 1)) 179 | else: 180 | c = None 181 | 182 | from train import build_model 183 | 184 | # Model 185 | model = build_model().to(device) 186 | 187 | # Load checkpoint 188 | print("Load checkpoint from {}".format(checkpoint_path)) 189 | if use_cuda: 190 | checkpoint = torch.load(checkpoint_path) 191 | else: 192 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 193 | model.load_state_dict(checkpoint["state_dict"]) 194 | checkpoint_name = splitext(basename(checkpoint_path))[0] 195 | 196 | os.makedirs(dst_dir, exist_ok=True) 197 | dst_wav_path = join(dst_dir, "{}{}.wav".format(checkpoint_name, file_name_suffix)) 198 | 199 | # DO generate 200 | waveform = wavegen(model, length, c=c, g=speaker_id, initial_value=initial_value, fast=True) 201 | 202 | # save 203 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 204 | 205 | print("Finished! 
Check out {} for generated audio samples.".format(dst_dir)) 206 | sys.exit(0) 207 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import numpy as np 6 | 7 | import torch 8 | from wavenet_vocoder_core import conv 9 | from torch import nn 10 | from torch.nn import functional as F 11 | 12 | 13 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 14 | m = conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs) 15 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 16 | m.weight.data.normal_(mean=0, std=std) 17 | m.bias.data.zero_() 18 | return nn.utils.weight_norm(m) 19 | 20 | 21 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 22 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 23 | m.weight.data.normal_(0, std) 24 | return m 25 | 26 | 27 | def ConvTranspose2d(in_channels, out_channels, kernel_size, 28 | weight_normalization=True, **kwargs): 29 | freq_axis_kernel_size = kernel_size[0] 30 | m = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, **kwargs) 31 | m.weight.data.fill_(1.0 / freq_axis_kernel_size) 32 | m.bias.data.zero_() 33 | if weight_normalization: 34 | return nn.utils.weight_norm(m) 35 | else: 36 | return m 37 | 38 | 39 | def Conv1d1x1(in_channels, out_channels, bias=True, weight_normalization=True): 40 | """1-by-1 convolution layer 41 | """ 42 | if weight_normalization: 43 | assert bias 44 | return Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 45 | dilation=1, bias=bias, std_mul=1.0) 46 | else: 47 | return conv.Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 48 | dilation=1, bias=bias) 49 | 50 | 51 | def _conv1x1_forward(conv, x, is_incremental): 52 | """Conv1x1 forward 53 | """ 54 | if is_incremental: 55 | x = conv.incremental_forward(x) 56 | else: 57 | x = conv(x) 58 | return x 59 | 60 | 61 | class ResidualConv1dGLU(nn.Module): 62 | """Residual dilated conv1d + Gated linear unit 63 | 64 | Args: 65 | residual_channels (int): Residual input / output channels 66 | gate_channels (int): Gated activation channels. 67 | kernel_size (int): Kernel size of convolution layers. 68 | skip_out_channels (int): Skip connection channels. If None, set to same 69 | as ``residual_channels``. 70 | cin_channels (int): Local conditioning channels. If negative value is 71 | set, local conditioning is disabled. 72 | gin_channels (int): Global conditioning channels. If negative value is 73 | set, global conditioning is disabled. 74 | dropout (float): Dropout probability. 75 | padding (int): Padding for convolution layers. If None, proper padding 76 | is computed depends on dilation and kernel_size. 77 | dilation (int): Dilation factor. 78 | weight_normalization (bool): If True, DeepVoice3-style weight 79 | normalization is applied. 
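causal (bool): If True, pad on the left so the convolution sees only current and past time steps (the look-ahead is trimmed in the forward pass).
bias (bool): If True, the convolutions use a learnable bias term.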
80 | """ 81 | 82 | def __init__(self, residual_channels, gate_channels, kernel_size, 83 | skip_out_channels=None, 84 | cin_channels=-1, gin_channels=-1, 85 | dropout=1 - 0.95, padding=None, dilation=1, causal=True, 86 | bias=True, weight_normalization=True, *args, **kwargs): 87 | super(ResidualConv1dGLU, self).__init__() 88 | self.dropout = dropout 89 | if skip_out_channels is None: 90 | skip_out_channels = residual_channels 91 | if padding is None: 92 | # no future time stamps available 93 | if causal: 94 | padding = (kernel_size - 1) * dilation 95 | else: 96 | padding = (kernel_size - 1) // 2 * dilation 97 | self.causal = causal 98 | 99 | if weight_normalization: 100 | assert bias 101 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 102 | padding=padding, dilation=dilation, 103 | bias=bias, std_mul=1.0, *args, **kwargs) 104 | else: 105 | self.conv = conv.Conv1d(residual_channels, gate_channels, kernel_size, 106 | padding=padding, dilation=dilation, 107 | bias=bias, *args, **kwargs) 108 | 109 | # local conditioning 110 | if cin_channels > 0: 111 | self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, 112 | bias=bias, 113 | weight_normalization=weight_normalization) 114 | else: 115 | self.conv1x1c = None 116 | 117 | # global conditioning 118 | if gin_channels > 0: 119 | self.conv1x1g = Conv1d1x1(gin_channels, gate_channels, bias=bias, 120 | weight_normalization=weight_normalization) 121 | else: 122 | self.conv1x1g = None 123 | 124 | # conv output is split into two groups 125 | gate_out_channels = gate_channels // 2 126 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias, 127 | weight_normalization=weight_normalization) 128 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, bias=bias, 129 | weight_normalization=weight_normalization) 130 | 131 | def forward(self, x, c=None, g=None): 132 | return self._forward(x, c, g, False) 133 | 134 | def incremental_forward(self, x, c=None, g=None): 135 | return self._forward(x, c, g, True) 136 | 137 | def _forward(self, x, c, g, is_incremental): 138 | """Forward 139 | 140 | Args: 141 | x (Tensor): B x C x T 142 | c (Tensor): B x C x T, Local conditioning features 143 | g (Tensor): B x C x T, Expanded global conditioning features 144 | is_incremental (Bool) : Whether incremental mode or not 145 | 146 | Returns: 147 | Tensor: output 148 | """ 149 | residual = x 150 | x = F.dropout(x, p=self.dropout, training=self.training) 151 | if is_incremental: 152 | splitdim = -1 153 | x = self.conv.incremental_forward(x) 154 | else: 155 | splitdim = 1 156 | x = self.conv(x) 157 | # remove future time steps 158 | x = x[:, :, :residual.size(-1)] if self.causal else x 159 | 160 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 161 | 162 | # local conditioning 163 | if c is not None: 164 | assert self.conv1x1c is not None 165 | c = _conv1x1_forward(self.conv1x1c, c, is_incremental) 166 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 167 | a, b = a + ca, b + cb 168 | 169 | # global conditioning 170 | if g is not None: 171 | assert self.conv1x1g is not None 172 | g = _conv1x1_forward(self.conv1x1g, g, is_incremental) 173 | ga, gb = g.split(g.size(splitdim) // 2, dim=splitdim) 174 | a, b = a + ga, b + gb 175 | 176 | x = torch.tanh(a) * torch.sigmoid(b) 177 | 178 | # For skip connection 179 | s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental) 180 | 181 | # For residual connection 182 | x = _conv1x1_forward(self.conv1x1_out, x, is_incremental) 183 | 184 | x = (x + residual) * math.sqrt(0.5) 
185 | return x, s 186 | 187 | def clear_buffer(self): 188 | for c in [self.conv, self.conv1x1_out, self.conv1x1_skip, 189 | self.conv1x1c, self.conv1x1g]: 190 | if c is not None: 191 | c.clear_buffer() 192 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/rnn_unaligned_speech_ljspeech_nomask_blended_continue.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import numpy as np 5 | import tensorflow as tf 6 | from collections import namedtuple 7 | 8 | import logging 9 | import shutil 10 | from tfbldr.datasets import rsync_fetch, fetch_ljspeech 11 | from tfbldr.datasets import wavfile_caching_mel_tbptt_iterator 12 | from tfbldr.utils import next_experiment_path 13 | from tfbldr import get_logger 14 | from tfbldr import run_loop 15 | from tfbldr.nodes import Linear 16 | from tfbldr.nodes import Linear 17 | from tfbldr.nodes import LSTMCell 18 | from tfbldr.nodes import BiLSTMLayer 19 | from tfbldr.nodes import SequenceConv1dStack 20 | from tfbldr.nodes import Embedding 21 | from tfbldr.nodes import GaussianAttentionCell 22 | from tfbldr.nodes import DiscreteMixtureOfLogistics 23 | from tfbldr.nodes import DiscreteMixtureOfLogisticsCost 24 | from tfbldr.nodes import AdditiveGaussianNoise 25 | from tfbldr import scan 26 | 27 | if len(sys.argv) < 2: 28 | raise ValueError("Continue script only for continuing training of a previous model") 29 | 30 | seq_len = 256 31 | batch_size = 64 32 | window_mixtures = 10 33 | cell_dropout = .925 34 | #noise_scale = 8. 35 | prenet_units = 128 36 | n_filts = 128 37 | n_stacks = 3 38 | enc_units = 128 39 | dec_units = 512 40 | emb_dim = 15 41 | truncation_len = seq_len 42 | cell_dropout_scale = cell_dropout 43 | epsilon = 1E-8 44 | forward_init = "truncated_normal" 45 | rnn_init = "truncated_normal" 46 | 47 | basedir = "/Tmp/kastner/lj_speech/LJSpeech-1.0/" 48 | ljspeech = rsync_fetch(fetch_ljspeech, "leto01") 49 | 50 | # THESE CANNOT BE PAIRED (SOME MISSING), ITERATOR PAIRS THEM UP BY NAME 51 | wavfiles = ljspeech["wavfiles"] 52 | jsonfiles = ljspeech["jsonfiles"] 53 | 54 | model_path = sys.argv[1] 55 | seed = int(abs(hash(model_path))) % (2 ** 32 - 1) 56 | 57 | # THESE HAVE TO BE THE SAME TO ENSURE SPLIT IS CORRECT 58 | train_random_state = np.random.RandomState(seed) 59 | valid_random_state = np.random.RandomState(seed) 60 | 61 | train_itr = wavfile_caching_mel_tbptt_iterator(wavfiles, jsonfiles, batch_size, seq_len, stop_index=.95, shuffle=True, random_state=train_random_state) 62 | valid_itr = wavfile_caching_mel_tbptt_iterator(wavfiles, jsonfiles, batch_size, seq_len, start_index=.95, shuffle=True, random_state=valid_random_state) 63 | 64 | """ 65 | for i in range(10000): 66 | print(i) 67 | mels, mel_mask, text, text_mask, mask, mask_mask, reset = train_itr.next_masked_batch() 68 | """ 69 | 70 | # STRONG CHECK TO ENSURE NO OVERLAP IN TRAIN/VALID 71 | for tai in train_itr.all_indices_: 72 | assert tai not in valid_itr.all_indices_ 73 | for vai in valid_itr.all_indices_: 74 | assert vai not in train_itr.all_indices_ 75 | 76 | random_state = np.random.RandomState(1442) 77 | # use the max of the two blended types...
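# (character and phone vocabularies can differ in size; taking the max gives a single vocabulary size that covers either symbol stream)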
78 | vocabulary_size = max(train_itr.vocabulary_sizes) 79 | output_size = train_itr.n_mel_filters 80 | 81 | att_w_init = np.zeros((batch_size, 2 * enc_units)) 82 | att_k_init = np.zeros((batch_size, window_mixtures)) 83 | att_h_init = np.zeros((batch_size, dec_units)) 84 | att_c_init = np.zeros((batch_size, dec_units)) 85 | h1_init = np.zeros((batch_size, dec_units)) 86 | c1_init = np.zeros((batch_size, dec_units)) 87 | h2_init = np.zeros((batch_size, dec_units)) 88 | c2_init = np.zeros((batch_size, dec_units)) 89 | 90 | stateful_args = [att_w_init, 91 | att_k_init, 92 | att_h_init, 93 | att_c_init, 94 | h1_init, 95 | c1_init, 96 | h2_init, 97 | c2_init] 98 | 99 | with tf.Session() as sess: 100 | saver = tf.train.import_meta_graph(model_path + '.meta') 101 | logger = get_logger() 102 | logger.info("CONTINUING TRAINING FROM MODEL PATH {}".format(model_path)) 103 | saver.restore(sess, model_path) 104 | graph = tf.get_default_graph() 105 | 106 | fields = ["mels", 107 | "mel_mask", 108 | "in_mels", 109 | "in_mel_mask", 110 | "out_mels", 111 | "out_mel_mask", 112 | "text", 113 | "text_mask", 114 | "mask", 115 | "mask_mask", 116 | "bias", 117 | "cell_dropout", 118 | "prenet_dropout", 119 | "bn_flag", 120 | "pred", 121 | #"mix", "means", "lins", 122 | "att_w_init", 123 | "att_k_init", 124 | "att_h_init", 125 | "att_c_init", 126 | "h1_init", 127 | "c1_init", 128 | "h2_init", 129 | "c2_init", 130 | "att_w", 131 | "att_k", 132 | "att_phi", 133 | "att_h", 134 | "att_c", 135 | "h1", 136 | "c1", 137 | "h2", 138 | "c2", 139 | "loss", 140 | "train_step", 141 | "learning_rate"] 142 | vs = namedtuple('Params', fields)( 143 | *[tf.get_collection(name)[0] for name in fields] 144 | ) 145 | 146 | step_count = 0 147 | def loop(sess, itr, extras, stateful_args): 148 | """ 149 | global step_count 150 | global noise_scale 151 | step_count += 1 152 | if step_count > 10000: 153 | step_count = 0 154 | if noise_scale == 2: 155 | noise_scale = 1. 156 | else: 157 | noise_scale = noise_scale - 2. 
158 | if noise_scale < .5: 159 | noise_scale = .5 160 | """ 161 | mels, mel_mask, text, text_mask, mask, mask_mask, reset = itr.next_masked_batch() 162 | in_m = mels[:-1] 163 | in_mel_mask = mel_mask[:-1] 164 | 165 | #noise_block = np.clip(random_state.randn(*in_m.shape), -6, 6) 166 | #in_m = in_m + noise_scale * noise_block 167 | 168 | out_m = mels[1:] 169 | out_mel_mask = mel_mask[1:] 170 | 171 | att_w_init = stateful_args[0] 172 | att_k_init = stateful_args[1] 173 | att_h_init = stateful_args[2] 174 | att_c_init = stateful_args[3] 175 | h1_init = stateful_args[4] 176 | c1_init = stateful_args[5] 177 | h2_init = stateful_args[6] 178 | c2_init = stateful_args[7] 179 | 180 | att_w_init *= reset 181 | att_k_init *= reset 182 | att_h_init *= reset 183 | att_c_init *= reset 184 | h1_init *= reset 185 | c1_init *= reset 186 | h2_init *= reset 187 | c2_init *= reset 188 | 189 | feed = { 190 | vs.in_mels: in_m, 191 | vs.in_mel_mask: in_mel_mask, 192 | vs.out_mels: out_m, 193 | vs.out_mel_mask: out_mel_mask, 194 | vs.bn_flag: 0., 195 | vs.text: text, 196 | vs.text_mask: text_mask, 197 | vs.mask: mask, 198 | vs.mask_mask: mask_mask, 199 | vs.att_w_init: att_w_init, 200 | vs.att_k_init: att_k_init, 201 | vs.att_h_init: att_h_init, 202 | vs.att_c_init: att_c_init, 203 | vs.h1_init: h1_init, 204 | vs.c1_init: c1_init, 205 | vs.h2_init: h2_init, 206 | vs.c2_init: c2_init} 207 | outs = [vs.att_w, vs.att_k, 208 | vs.att_h, vs.att_c, 209 | vs.h1, vs.c1, vs.h2, vs.c2, 210 | vs.att_phi, 211 | vs.loss, vs.train_step] 212 | 213 | r = sess.run(outs, feed_dict=feed) 214 | 215 | att_w_np = r[0] 216 | att_k_np = r[1] 217 | att_h_np = r[2] 218 | att_c_np = r[3] 219 | h1_np = r[4] 220 | c1_np = r[5] 221 | h2_np = r[6] 222 | c2_np = r[7] 223 | att_phi_np = r[8] 224 | l = r[-2] 225 | _ = r[-1] 226 | 227 | # set next inits 228 | att_w_init = att_w_np[-1] 229 | att_k_init = att_k_np[-1] 230 | att_h_init = att_h_np[-1] 231 | att_c_init = att_c_np[-1] 232 | h1_init = h1_np[-1] 233 | c1_init = c1_np[-1] 234 | h2_init = h2_np[-1] 235 | c2_init = c2_np[-1] 236 | 237 | stateful_args = [att_w_init, 238 | att_k_init, 239 | att_h_init, 240 | att_c_init, 241 | h1_init, 242 | c1_init, 243 | h2_init, 244 | c2_init] 245 | return l, None, stateful_args 246 | 247 | run_loop(sess, 248 | loop, train_itr, 249 | loop, train_itr, 250 | continue_training=True, 251 | n_steps=1000000, 252 | n_train_steps_per=1000, 253 | train_stateful_args=stateful_args, 254 | n_valid_steps_per=0, 255 | valid_stateful_args=stateful_args) 256 | -------------------------------------------------------------------------------- /pretrained/cleaning.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | import cleaners 4 | from symbols import char_symbols 5 | from symbols import phone_symbols 6 | from symbols import pau_phone_symbols 7 | from eng_rules import hybrid_g2p, rulebased_g2p 8 | 9 | 10 | # Mappings from symbol to numeric ID and vice versa: 11 | _char_symbol_to_id = {s: i for i, s in enumerate(char_symbols)} 12 | _id_to_char_symbol = {i: s for i, s in enumerate(char_symbols)} 13 | 14 | _phone_symbol_to_id = {s: i for i, s in enumerate(phone_symbols)} 15 | _id_to_phone_symbol = {i: s for i, s in enumerate(phone_symbols)} 16 | 17 | _pau_phone_symbol_to_id = {s: i for i, s in enumerate(pau_phone_symbols)} 18 | _id_to_pau_phone_symbol = {i: s for i, s in enumerate(pau_phone_symbols)} 19 | 20 | # Regular expression matching text enclosed in curly braces: 21 | 
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 22 | 23 | 24 | def get_vocabulary_sizes(cleaner_names): 25 | """ 26 | if pause in name, return pause phone size 27 | if phone in name, return phone size 28 | else return char size 29 | """ 30 | outs = [] 31 | for cn in cleaner_names: 32 | if "pause" in cn: 33 | outs.append(len(_pau_phone_symbol_to_id)) 34 | elif "phone" in cn: 35 | outs.append(len(_phone_symbol_to_id)) 36 | else: 37 | outs.append(len(_char_symbol_to_id)) 38 | # needed? 39 | if len(outs) == 1: 40 | outs = outs[0] 41 | return outs 42 | 43 | 44 | def text_to_sequence(text, cleaner_names): 45 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 46 | 47 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 48 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 49 | 50 | Args: 51 | text: string to convert to a sequence 52 | cleaner_names: names of the cleaner functions to run the text through 53 | 54 | Returns: 55 | List of integers corresponding to the symbols in the text 56 | ''' 57 | if any(["rule" in name for name in cleaner_names]): 58 | raise ValueError("IMPLEMENT RULE TRANFORM") 59 | sequence = [] 60 | # Check for curly braces and treat their contents as ARPAbet: 61 | while len(text): 62 | m = _curly_re.match(text) 63 | if not m: 64 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 65 | break 66 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 67 | sequence += _arpabet_to_sequence(m.group(2)) 68 | text = m.group(3) 69 | 70 | # Append EOS token 71 | sequence.append(_symbol_to_id['~']) 72 | return sequence 73 | elif any(["pause" in name for name in cleaner_names]): 74 | sequence = [] 75 | # Check for curly braces and treat their contents as ARPAbet: 76 | while len(text): 77 | m = _curly_re.match(text) 78 | if not m: 79 | sequence += _pau_phone_symbols_to_sequence(_clean_text(text, cleaner_names)) 80 | break 81 | sequence += _pau_phone_symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 82 | sequence += _arpabet_to_sequence(m.group(2)) 83 | text = m.group(3) 84 | # Append EOS token 85 | sequence.append(_phone_symbol_to_id['~']) 86 | return sequence 87 | elif any(["phone" in name for name in cleaner_names]): 88 | sequence = [] 89 | # Check for curly braces and treat their contents as ARPAbet: 90 | while len(text): 91 | m = _curly_re.match(text) 92 | if not m: 93 | sequence += _phone_symbols_to_sequence(_clean_text(text, cleaner_names)) 94 | break 95 | sequence += _phone_symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 96 | sequence += _arpabet_to_sequence(m.group(2)) 97 | text = m.group(3) 98 | # Append EOS token 99 | sequence.append(_phone_symbol_to_id['~']) 100 | return sequence 101 | else: 102 | sequence = [] 103 | # Check for curly braces and treat their contents as ARPAbet: 104 | while len(text): 105 | m = _curly_re.match(text) 106 | if not m: 107 | sequence += _char_symbols_to_sequence(_clean_text(text, cleaner_names)) 108 | break 109 | sequence += _char_symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 110 | sequence += _arpabet_to_sequence(m.group(2)) 111 | text = m.group(3) 112 | 113 | # Append EOS token 114 | sequence.append(_char_symbol_to_id['~']) 115 | return sequence 116 | 117 | 118 | def sequence_to_text(sequence, cleaner_names): 119 | '''Converts a sequence of IDs back to a string''' 120 | if any(["rule" in name for name in cleaner_names]): 121 | raise ValueError("IMPLEMENT RULE TRANFORM") 122 | elif 
any(["pause" in name for name in cleaner_names]): 123 | result = "" 124 | space_id = _pau_phone_symbol_to_id[" "] 125 | pad_id = _pau_phone_symbol_to_id["_"] 126 | eos_id = _pau_phone_symbol_to_id["~"] 127 | special_ids = [_pau_phone_symbol_to_id[special] for special in ["1","2","3","4"]] 128 | for symbol_id in sequence: 129 | if symbol_id in [space_id, pad_id, eos_id]: 130 | result += _id_to_pau_phone_symbol[symbol_id] 131 | elif symbol_id in special_ids: 132 | result += _id_to_pau_phone_symbol[symbol_id] 133 | else: 134 | result += "@" + _id_to_pau_phone_symbol[symbol_id] 135 | return result 136 | elif any(["phone" in name for name in cleaner_names]): 137 | result = "" 138 | space_id = _phone_symbol_to_id[" "] 139 | pad_id = _phone_symbol_to_id["_"] 140 | eos_id = _phone_symbol_to_id["~"] 141 | special_ids = [_phone_symbol_to_id[special] for special in "!,:?"] 142 | for symbol_id in sequence: 143 | if symbol_id in [space_id, pad_id, eos_id] + special_ids: 144 | result += _id_to_phone_symbol[symbol_id] 145 | else: 146 | result += "@" + _id_to_phone_symbol[symbol_id] 147 | return result 148 | else: 149 | result = '' 150 | for symbol_id in sequence: 151 | if symbol_id in _id_to_char_symbol: 152 | s = _id_to_char_symbol[symbol_id] 153 | # Enclose ARPAbet back in curly braces: 154 | if len(s) > 1 and s[0] == '@': 155 | s = '{%s}' % s[1:] 156 | result += s 157 | return result.replace('}{', ' ') 158 | 159 | 160 | def _clean_text(text, cleaner_names): 161 | for name in cleaner_names: 162 | cleaner = getattr(cleaners, name) 163 | if not cleaner: 164 | raise Exception('Unknown cleaner: %s' % name) 165 | text = cleaner(text) 166 | return text 167 | 168 | 169 | def _char_symbols_to_sequence(symbols): 170 | return [_char_symbol_to_id[s] for s in symbols if _char_should_keep_symbol(s)] 171 | 172 | def _pau_phone_symbols_to_sequence(symbols): 173 | new = [] 174 | specials = ["1", "2", "3", "4"] 175 | for ss in symbols.split(" "): 176 | if any([special in ss for special in specials]): 177 | all_special = [special for special in ss if special in specials] 178 | all_non_special = [nonspecial[1:] for nonspecial in ss if nonspecial not in specials] 179 | prev = [] 180 | for ssi in ss.strip().split("@")[1:]: 181 | if any([special in ssi for special in specials]): 182 | prev.append(re.sub("|".join(specials), "", ssi)) 183 | which_specials = [special for special in specials if special in ssi] 184 | for p in prev: 185 | new.append(p) 186 | # ASSUME ONLY 1? 
187 | new.append(which_specials[0]) 188 | prev = [] 189 | else: 190 | prev.append(ssi) 191 | else: 192 | for ssi in ss.strip().split("@")[1:] + [" "]: 193 | new.append(ssi) 194 | return [_pau_phone_symbol_to_id[s] for s in new if _pau_phone_should_keep_symbol(s)] 195 | 196 | def _phone_symbols_to_sequence(symbols): 197 | new = [] 198 | for ss in symbols.split(" "): 199 | if any([special in ss for special in "!,:?"]): 200 | # special symbols only at start or back of chunk 201 | if ss[0] in "!,:?": 202 | for ssi in [ss[0]] + ss[1:].strip().split("@")[1:] + [" "]: 203 | new.append(ssi) 204 | elif ss[-1] in "!,:?": 205 | for ssi in ss[:-1].strip().split("@")[1:] + [ss[-1]] + [" "]: 206 | new.append(ssi) 207 | else: 208 | for ssi in ss.strip().split("@")[1:] + [" "]: 209 | new.append(ssi) 210 | #new = [ssi for ss in symbols.split(" ") for ssi in ss.strip().split("@")[1:] + [" "]][:-1] 211 | return [_phone_symbol_to_id[s] for s in new if _phone_should_keep_symbol(s)] 212 | 213 | def _arpabet_to_sequence(text): 214 | return _symbols_to_sequence(['@' + s for s in text.split()]) 215 | 216 | def _char_should_keep_symbol(s): 217 | return s in _char_symbol_to_id and s != '_' and s != '~' 218 | 219 | def _pau_phone_should_keep_symbol(s): 220 | return s in _pau_phone_symbol_to_id and s != '_' and s != '~' 221 | 222 | def _phone_should_keep_symbol(s): 223 | return s in _phone_symbol_to_id and s != '_' and s != '~' 224 | --------------------------------------------------------------------------------