├── code ├── examples ├── lib │ ├── tfbldr │ │ ├── test │ │ │ ├── __init__.py │ │ │ ├── test_import.py │ │ │ └── test_simple.py │ │ ├── .gitignore │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ ├── plot │ │ │ ├── __init__.py │ │ │ └── audio.py │ │ ├── datasets │ │ │ ├── text │ │ │ │ ├── cleaning │ │ │ │ │ ├── README │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── symbols.py │ │ │ │ │ ├── cmudict.py │ │ │ │ │ ├── numbers.py │ │ │ │ │ ├── cleaners.py │ │ │ │ │ └── number_to_words.py │ │ │ │ └── __init__.py │ │ │ ├── audio │ │ │ │ ├── __init__.py │ │ │ │ └── magrecnp.py │ │ │ ├── music │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── plotters.py │ │ ├── __init__.py │ │ ├── core │ │ │ └── __init__.py │ │ ├── misc_scripts │ │ │ └── speech_itr_test.py │ │ └── nodes │ │ │ └── __init__.py │ ├── tfbldr.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ ├── requires.txt │ │ ├── PKG-INFO │ │ └── SOURCES.txt │ ├── README.rst │ ├── examples │ │ └── unaligned_ljspeech_chars │ │ │ ├── wavenet_stuff │ │ │ ├── audio.py │ │ │ ├── hparams.py │ │ │ ├── train.py │ │ │ ├── synthesis.py │ │ │ ├── lrschedule.py │ │ │ ├── wavenet_vocoder_core │ │ │ ├── wavenet_vocoder │ │ │ │ ├── MANIFEST.in │ │ │ │ ├── tox.ini │ │ │ │ ├── wavenet_vocoder │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── util.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── conv.py │ │ │ │ │ ├── mixture.py │ │ │ │ │ └── modules.py │ │ │ │ ├── tests │ │ │ │ │ ├── test_audio.py │ │ │ │ │ ├── test_misc.py │ │ │ │ │ └── test_mixture.py │ │ │ │ ├── release.sh │ │ │ │ ├── tojson.py │ │ │ │ ├── appveyor.yml │ │ │ │ ├── .travis.yml │ │ │ │ ├── LICENSE.md │ │ │ │ ├── lrschedule.py │ │ │ │ ├── presets │ │ │ │ │ ├── ljspeech_mixture.json │ │ │ │ │ ├── cmu_arctic_8bit.json │ │ │ │ │ └── multispeaker_cmu_arctic_mixture.json │ │ │ │ ├── preprocess.py │ │ │ │ ├── setup.py │ │ │ │ ├── .gitignore │ │ │ │ ├── ljspeech.py │ │ │ │ ├── jsut.py │ │ │ │ ├── audio.py │ │ │ │ ├── cmu_arctic.py │ │ │ │ ├── hparams.py │ │ │ │ ├── librivox.py │ │ │ │ ├── evaluate.py │ │ │ │ └── synthesis.py │ │ │ ├── 20180510_mixture_lj_checkpoint_step000320000_ema.pth │ │ │ ├── 20180510_mixture_lj_checkpoint_step000320000_ema.json │ │ │ └── batch_synth.py │ │ │ ├── wiperesults.sh │ │ │ ├── basic_test.txt │ │ │ ├── norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz │ │ │ ├── quote_test.txt │ │ │ ├── taco_small_test.txt │ │ │ ├── sampleit.sh │ │ │ ├── taco_prosody_test.txt │ │ │ ├── full_test.txt │ │ │ └── rnn_unaligned_speech_ljspeech_nomask_blended_continue.py │ ├── continuous_integration │ │ ├── test_script.sh │ │ └── install.sh │ ├── setup.py │ ├── .travis.yml │ └── LICENSE └── README.md ├── pretrained ├── clean.sh ├── sample.sh ├── cmudict.json.gz ├── norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz ├── README.md ├── symbols.py ├── cmudict.py ├── numbers_rules.py ├── text.py ├── cleaners.py ├── representation_mixing_text_to_speech_demo_minimal.ipynb ├── number_to_words.py ├── transform_text.py └── cleaning.py ├── figures ├── white.png ├── single_mb_cropped.png ├── tbptt_mb_cropped.png ├── embedding_module_cropped.png └── network_diagram_cropped.png ├── LICENSE └── README.md /code/examples: -------------------------------------------------------------------------------- 1 | lib/examples -------------------------------------------------------------------------------- /code/lib/tfbldr/test/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | tfbldr 2 | -------------------------------------------------------------------------------- /code/lib/tfbldr/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.out 3 | *.npy 4 | -------------------------------------------------------------------------------- /pretrained/clean.sh: -------------------------------------------------------------------------------- 1 | rm *.png 2 | rm *.wav 3 | rm sample_*_mels.npz 4 | -------------------------------------------------------------------------------- /pretrained/sample.sh: -------------------------------------------------------------------------------- 1 | python sample_rnn_unaligned_speech_ljspeech.py 2 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | tensorflow-gpu 4 | -------------------------------------------------------------------------------- /code/lib/tfbldr/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import next_experiment_path 2 | -------------------------------------------------------------------------------- /code/lib/README.rst: -------------------------------------------------------------------------------- 1 | Tensorflow tools and experiments 2 | 3 | Use at your own risk 4 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/audio.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/audio.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/hparams.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/hparams.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/train.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/train.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/synthesis.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/synthesis.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/lrschedule.py: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/lrschedule.py -------------------------------------------------------------------------------- /figures/white.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/white.png -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wiperesults.sh: -------------------------------------------------------------------------------- 1 | rm *.wav 2 | rm *.png 3 | rm -r sample_results 4 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder_core: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/wavenet_vocoder/ -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE.md 2 | -------------------------------------------------------------------------------- /pretrained/cmudict.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/pretrained/cmudict.json.gz -------------------------------------------------------------------------------- /figures/single_mb_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/single_mb_cropped.png -------------------------------------------------------------------------------- /figures/tbptt_mb_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/tbptt_mb_cropped.png -------------------------------------------------------------------------------- /figures/embedding_module_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/embedding_module_cropped.png -------------------------------------------------------------------------------- /figures/network_diagram_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/figures/network_diagram_cropped.png -------------------------------------------------------------------------------- /code/lib/tfbldr/plot/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot import get_viridis 2 | from .plot import autoaspect 3 | from .audio import specgram 4 | from .audio import specplot 5 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/README: -------------------------------------------------------------------------------- 1 | text processing utils from Keith Ito 2 | replaced inflect engine with https://github.com/ianfieldhouse/number_to_words 3 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/20180510_mixture_lj_checkpoint_step000320000_ema.pth: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/20180510_mixture_lj_checkpoint_step000320000_ema.pth -------------------------------------------------------------------------------- 
/code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/20180510_mixture_lj_checkpoint_step000320000_ema.json: -------------------------------------------------------------------------------- 1 | wavenet_vocoder/20180510_mixture_lj_checkpoint_step000320000_ema.json -------------------------------------------------------------------------------- /code/lib/tfbldr/test/test_import.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.getLogger('tensorflow').disabled = True 3 | 4 | # implicit test 5 | from tfbldr import * 6 | 7 | def test_import_all(): 8 | pass 9 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = E305,E402,E704,E721,E741,F401,F403,F405,F821,F841,F999 4 | exclude = docs/,data,build,dist,notebooks,checkpoints*,legacy 5 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | #from .version import __version__ 5 | 6 | from .wavenet import receptive_field_size, WaveNet 7 | -------------------------------------------------------------------------------- /pretrained/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/pretrained/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/basic_test.txt: -------------------------------------------------------------------------------- 1 | i am learning english. 2 | thanks so much. 3 | i will be with you in a moment. 4 | the meeting is at eleven this morning. 5 | they will be gone for twenty eight days. 6 | i can help with that. 7 | this and that, these and those. 8 | they are a few sandwiches short of a picnic. 9 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kastnerkyle/representation_mixing/HEAD/code/lib/examples/unaligned_ljspeech_chars/norm-mean-std-txt-cleanenglish_cleanersenglish_phone_cleaners-logmel-wsz512-wst128-leh125-ueh7800-nmel80.npz -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/quote_test.txt: -------------------------------------------------------------------------------- 1 | sphinx of black quartz, judge my vow. 2 | the quick brown fox jumps over the lazy dog. 3 | pack my box with five dozen liquor jugs. 4 | we surely shall see the sun shine soon. 5 | lesser leather never weathered wetter weather better. 
6 | near an ear, a nearer ear, a nearly eerie ear. 7 | the sky above the port was the color of television, tuned to a dead channel. 8 | all this happened, more or less. 9 | -------------------------------------------------------------------------------- /code/lib/tfbldr/__init__.py: -------------------------------------------------------------------------------- 1 | floatX = "float32" 2 | intX = "int32" 3 | import os 4 | 5 | # fix logging during travis testing 6 | if os.environ.get('TRAVIS') != "true": 7 | import logging 8 | logging.getLogger('tensorflow').disabled = True 9 | 10 | from .core import get_logger 11 | from .core import scan 12 | from .core import dot 13 | from .core import get_params_dict 14 | from .core import run_loop 15 | from .nodes import make_numpy_weights 16 | from .nodes import make_numpy_biases 17 | 18 | -------------------------------------------------------------------------------- /code/lib/tfbldr/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import get_params_dict 2 | from .core import get_logger 3 | from .core import scan 4 | from .core import _get_name 5 | from .core import _get_shared 6 | from .core import _set_shared 7 | from .core import run_loop 8 | from .core import print_network 9 | from .core import _ndim 10 | from .core import _shape 11 | from .core import dot 12 | from .core import get_weight_norm_default 13 | from .core import get_strict_mode_default 14 | from .core import print_network 15 | from .core import download 16 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from .audio_tools import soundsc 2 | from .audio_tools import overlap 3 | from .audio_tools import stft 4 | from .audio_tools import iterate_invert_spectrogram 5 | from .audio_tools import mel_freq_weights 6 | from .audio_tools import linear_to_mel_weight_matrix 7 | from .audio_tools import mu_law_encode 8 | from .audio_tools import mu_law_decode 9 | from .audio_tools import mu_law_transform 10 | from .audio_tools import mu_law_inverse 11 | from .audio_tools import fetch_sample_speech_tapestry 12 | from .datasets import wavfile_caching_mel_tbptt_iterator 13 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tests/test_audio.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import sys 5 | from os.path import dirname, join 6 | sys.path.insert(0, join(dirname(__file__), "..")) 7 | 8 | import numpy as np 9 | from nose.plugins.attrib import attr 10 | 11 | import logging 12 | logging.getLogger('tensorflow').disabled = True 13 | 14 | 15 | @attr("local_only") 16 | def test_amp_to_db(): 17 | import audio 18 | x = np.random.rand(10) 19 | x_hat = audio._db_to_amp(audio._amp_to_db(x)) 20 | assert np.allclose(x, x_hat) 21 | -------------------------------------------------------------------------------- /code/lib/tfbldr/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def next_experiment_path(): 6 | """ 7 | creates paths for new experiment 8 | returns path for next experiment 9 | """ 10 | 11 | idx = 0 12 | path = os.path.join('summary', 
'experiment-{}') 13 | while os.path.exists(path.format(idx)): 14 | idx += 1 15 | path = path.format(idx) 16 | os.makedirs(os.path.join(path, 'models')) 17 | os.makedirs(os.path.join(path, 'backup')) 18 | for file in filter(lambda x: x.endswith('.py'), os.listdir('.')): 19 | shutil.copy2(file, os.path.join(path, 'backup')) 20 | return path 21 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: tfbldr 3 | Version: 0.0.1 4 | Summary: Deep Learning tools for Tensorflow 5 | Home-page: http://github.com/kastnerkyle/tfbldr/ 6 | Author: Kyle Kastner 7 | Author-email: kastnerkyle@gmail.com 8 | License: BSD 3-clause 9 | Description: Tensorflow tools and experiments 10 | 11 | Use at your own risk 12 | 13 | Platform: UNKNOWN 14 | Classifier: Development Status :: 3 - Alpha 15 | Classifier: Intended Audience :: Science/Research 16 | Classifier: License :: OSI Approved :: BSD License 17 | Classifier: Operating System :: OS Independent 18 | Classifier: Topic :: Scientific/Engineering 19 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def _assert_valid_input_type(s): 6 | assert s == "mulaw-quantize" or s == "mulaw" or s == "raw" 7 | 8 | 9 | def is_mulaw_quantize(s): 10 | _assert_valid_input_type(s) 11 | return s == "mulaw-quantize" 12 | 13 | 14 | def is_mulaw(s): 15 | _assert_valid_input_type(s) 16 | return s == "mulaw" 17 | 18 | 19 | def is_raw(s): 20 | _assert_valid_input_type(s) 21 | return s == "raw" 22 | 23 | 24 | def is_scalar_input(s): 25 | return is_raw(s) or is_mulaw(s) 26 | -------------------------------------------------------------------------------- /pretrained/README.md: -------------------------------------------------------------------------------- 1 | # Colab Notebook Links 2 | Full demo link: https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo.ipynb 3 | 4 | Minimal demo: https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo_minimal.ipynb 5 | 6 | # Notes 7 | Some files pulled and edited from tfbldr to enable standalone runtime 8 | 9 | Inspired by Colab example from Ryuichi Yamamoto (r9y9) https://r9y9.github.io/blog/2018/05/20/tacotron2/ 10 | 11 | Text processing utils from Keith Ito 12 | 13 | Replaced inflect engine with https://github.com/ianfieldhouse/number_to_words 14 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tests/test_misc.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | from wavenet_vocoder import receptive_field_size 5 | 6 | 7 | def test_receptive_field_size(): 8 | # Table 4 in https://arxiv.org/abs/1711.10433 9 | assert receptive_field_size(total_layers=30, num_cycles=3, kernel_size=3) == 6139 10 | assert receptive_field_size(total_layers=24, num_cycles=4, 
kernel_size=3) == 505 11 | assert receptive_field_size(total_layers=12, num_cycles=2, kernel_size=3) == 253 12 | assert receptive_field_size(total_layers=30, num_cycles=1, 13 | kernel_size=3, dilation=lambda x: 1) == 61 14 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for Pypi release 4 | # 0. Make sure you are on git tag 5 | # 1. Run the script 6 | # 2. Upload sdist 7 | 8 | set -e 9 | 10 | script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd) 11 | cd $script_dir 12 | 13 | TAG=$(git describe --exact-match --tags HEAD) 14 | 15 | VERSION=${TAG/v/} 16 | 17 | WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py develop sdist 18 | echo "*** Ready to release! wavenet_vocoder $TAG ***" 19 | echo "Please run the following command manually:" 20 | echo WAVENET_VOCODER_BUILD_VERSION=$VERSION python setup.py sdist upload 21 | echo "Please make sure that the release version is correct." 22 | cat wavenet_vocoder/version.py 23 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | # WARNING 2 | This folder contains a NON-RUNNABLE code dump of my research library used for training the model. This is only for very, very interested people and for seeing the exact model definition and dirty details in code. 3 | 4 | The actual json files containing char and phone alignments and timing, used for training, can be directly downloaded from here https://www.dropbox.com/s/1m73uf2mslvq0t5/gentle_json.tar.gz?dl=0 5 | 6 | The gentle_json files were extracted using utilities from my repo https://github.com/kastnerkyle/raw_voice_cleanup/blob/master/alignment/align_many.py 7 | 8 | If you just want to hear sound, use the colab here https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo.ipynb 9 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tojson.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Dump hyper parameters to json file. 4 | 5 | usage: tojson.py [options] <output_json_path> 6 | 7 | options: 8 | -h, --help Show help message.
9 | """ 10 | from docopt import docopt 11 | 12 | import sys 13 | import os 14 | from os.path import dirname, join, basename, splitext 15 | import json 16 | 17 | from hparams import hparams 18 | 19 | if __name__ == "__main__": 20 | args = docopt(__doc__) 21 | output_json_path = args["<output_json_path>"] 22 | 23 | j = hparams.values() 24 | 25 | # for compat legacy 26 | for k in ["preset", "presets"]: 27 | if k in j: 28 | del j[k] 29 | 30 | with open(output_json_path, "w") as f: 31 | json.dump(j, f, indent=2) 32 | sys.exit(0) 33 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/music/__init__.py: -------------------------------------------------------------------------------- 1 | # music21 is an optional dep 2 | from ...core import get_logger 3 | logger = get_logger() 4 | 5 | try: 6 | from .music import pitch_and_duration_to_quantized 7 | from .music import pitches_and_durations_to_pretty_midi 8 | from .music import quantized_to_pretty_midi 9 | from .music import quantized_to_pitch_duration 10 | from .music import plot_pitches_and_durations 11 | from .music import music21_to_pitch_duration 12 | from .music import music21_to_quantized 13 | from .music import plot_piano_roll 14 | from .music import quantized_imlike_to_image_array 15 | from .analysis import midi_to_notes 16 | from .analysis import notes_to_midi 17 | from .loaders import fetch_jsb 18 | from .loaders import fetch_josquin 19 | except ImportError: 20 | logger.info("Unable to import music21 related utilities") 21 | -------------------------------------------------------------------------------- /code/lib/continuous_integration/test_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called by the "script" step defined in 3 | # .travis.yml. See http://docs.travis-ci.com/ for more details. 4 | # The behavior of the script is controlled by environment variables defined 5 | # in the .travis.yml in the top level folder of the project. 6 | 7 | # License: 3-clause BSD 8 | 9 | # still doesn't fix anything... 10 | export TF_CPP_MIN_LOG_LEVEL=3 11 | set -e 12 | 13 | python --version 14 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 15 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 16 | python -c "import tensorflow as tf; print('tensorflow %s' % tf.__version__)" 17 | 18 | # Do not use "make test" or "make test-coverage" as they enable verbose mode 19 | # which renders travis output too slow to display in a browser.
20 | if [[ "$COVERAGE" == "true" ]]; then 21 | nosetests -s --with-coverage tfbldr 22 | else 23 | nosetests -s tfbldr 24 | fi 25 | -------------------------------------------------------------------------------- /code/lib/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | setuptools.setup( 5 | name='tfbldr', 6 | version='0.0.1', 7 | packages=setuptools.find_packages(), 8 | author='Kyle Kastner', 9 | author_email='kastnerkyle@gmail.com', 10 | description='Deep Learning tools for Tensorflow', 11 | long_description=open(os.path.join(os.path.dirname( 12 | os.path.abspath(__file__)), 'README.rst')).read(), 13 | license='BSD 3-clause', 14 | url='http://github.com/kastnerkyle/tfbldr/', 15 | package_data={ 16 | 'pthbldr': ['datasets/data/*'] 17 | }, 18 | install_requires=['numpy', 19 | 'scipy', 20 | 'tensorflow-gpu'], 21 | classifiers=['Development Status :: 3 - Alpha', 22 | 'Intended Audience :: Science/Research', 23 | 'License :: OSI Approved :: BSD License', 24 | 'Operating System :: OS Independent', 25 | 'Topic :: Scientific/Engineering'], 26 | ) 27 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON_VERSION: "3.6" 4 | PYTHON_ARCH: "64" 5 | MINICONDA: C:\Miniconda36-x64 6 | 7 | branches: 8 | only: 9 | - master 10 | - /release-.*/ 11 | 12 | skip_commits: 13 | message: /\[av skip\]/ 14 | 15 | notifications: 16 | - provider: Email 17 | on_build_success: false 18 | on_build_failure: false 19 | on_build_status_changed: false 20 | 21 | init: 22 | - "ECHO %PYTHON_VERSION% %PYTHON_ARCH% %MINICONDA%" 23 | 24 | install: 25 | - "SET PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" 26 | - conda config --set always_yes yes --set changeps1 no 27 | - conda update -q conda 28 | - conda install -n root _license 29 | - conda info -a 30 | - "conda create -q -n test-environment python=%PYTHON_VERSION% numpy scipy cython nose pytorch -c pytorch" 31 | - activate test-environment 32 | 33 | build_script: 34 | - pip install -e ".[test]" 35 | 36 | test_script: 37 | - nosetests -v -w tests/ -a "!local_only" 38 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/taco_small_test.txt: -------------------------------------------------------------------------------- 1 | Scientists at the CERN laboratory say they have discovered a new particle. 2 | There’s a way to measure the acute emotional intelligence that has never gone out of style. 3 | President Trump met with other leaders at the Group of 20 conference. 4 | The Senate’s bill to repeal and replace the Affordable Care Act is now imperiled. 5 | Generative adversarial network or variational auto-encoder. 6 | Basilar membrane and otolaryngology are not auto-correlations. 7 | He has read the whole thing. 8 | He reads books. 9 | Don’t desert me here in the desert! 10 | He thought it was time to present the present. 11 | Thisss isrealy awhsome. 12 | The buses aren't the problem, they actually provide a solution. 13 | The quick brown fox jumps over the lazy dog. 14 | Does the quick brown fox jump over the lazy dog? 15 | Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick? 16 | She sells sea-shells on the sea-shore. 
The shells she sells are sea-shells I’m sure. 17 | The Blue Lagoon is a nineteen eighty American romance adventure film. 18 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | notifications: 7 | email: false 8 | 9 | before_install: 10 | - sudo apt-get update 11 | - if [["$TRAVIS_PYTHON_VERSION" == "2.7"]]; then 12 | wget http://repo.continuum.io/miniconda/Miniconda-3.8.3-Linux-x86_64.sh -O miniconda.sh; 13 | else 14 | wget http://repo.continuum.io/miniconda/Miniconda3-3.8.3-Linux-x86_64.sh -O miniconda.sh; 15 | fi 16 | - bash miniconda.sh -b -p $HOME/miniconda 17 | - export PATH="$HOME/miniconda/bin:$PATH" 18 | - hash -r 19 | - conda config --set always_yes yes --set changeps1 no 20 | - conda update -q conda 21 | # Useful for debugging any issues with conda 22 | - conda config --add channels pypi 23 | - conda info -a 24 | - deps='pip numpy scipy cython nose pytorch' 25 | - conda create -q -n test-environment "python=$TRAVIS_PYTHON_VERSION" $deps -c pytorch 26 | - source activate test-environment 27 | 28 | install: 29 | - pip install -e ".[test]" 30 | script: 31 | - nosetests -v -w tests/ -a '!local_only' 32 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/sampleit.sh: -------------------------------------------------------------------------------- 1 | if [ $# -eq 0 ]; then 2 | echo "Must pass model path (without .meta or other extensions) as an argument" 3 | exit 4 | fi 5 | 6 | mkdir -p sample_results 7 | for t in blend++ blend chars phones; do 8 | mkdir -p sample_results/ 9 | if [[ -z "$2" ]]; then 10 | python -u sample_rnn_unaligned_speech_ljspeech.py "$1" custom_test.txt taco_prosody_test.txt taco_small_test.txt quote_test.txt basic_test.txt valid --inp=$t --sonify=1000 2>&1 | tee /Tmp/kastner/sample_log.txt 11 | fi 12 | if [[ ! 
-z "$2" ]]; then 13 | python -u sample_rnn_unaligned_speech_ljspeech.py "$1" custom_test.txt taco_prosody_test.txt taco_small_test.txt quote_test.txt basic_test.txt valid "$2" --inp=$t --sonify=1000 2>&1 | tee /Tmp/kastner/sample_log.txt 14 | #python sample_rnn_unaligned_speech_ljspeech.py "$1" "$2" --inp=$t --test=$s --sonify=1000 2>&1 | tee sample_results/"$t"_"$s"/sample_log.txt 15 | fi 16 | #python sample_rnn_unaligned_speech_ljspeech.py "$1" --inp=$t --test=$s 2>&1 | tee sample_results/"$t"_"$s"/sample_log.txt 17 | mv *sampled_text_summary.txt sample_results/ 18 | mv /Tmp/kastner/sample_log.txt sample_results/ 19 | done 20 | 21 | mv *.wav sample_results/ 22 | mv *.png sample_results/ 23 | -------------------------------------------------------------------------------- /code/lib/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | # make it explicit that we favor the new container-based travis workers 3 | sudo: false 4 | addons: 5 | apt: 6 | packages: 7 | # Only used by the DISTRIB="ubuntu" setting 8 | - libatlas3gf-base 9 | - libatlas-dev 10 | - python-numpy 11 | - python-scipy 12 | env: 13 | matrix: 14 | - DISTRIB="conda" PYTHON_VERSION="2.7" 15 | NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.0" TF_VERSION="1.4.1" 16 | # This environment tests the newest supported anaconda env 17 | - DISTRIB="conda" PYTHON_VERSION="2.7" INSTALL_MKL="true" 18 | NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.0" TF_VERSION="1.4.1" 19 | # This environment tests the newest supported anaconda env 20 | #- DISTRIB="conda" PYTHON_VERSION="3.4" INSTALL_MKL="true" 21 | # NUMPY_VERSION="1.8.2" SCIPY_VERSION="0.14.0" 22 | 23 | install: source continuous_integration/install.sh 24 | script: bash continuous_integration/test_script.sh 25 | after_success: 26 | # Ignore coveralls failures as the coveralls server is not very reliable 27 | # but we don't want travis to report a failure in the github UI just 28 | # because the coverage report failed to be published. 
29 | - if [[ "$COVERAGE" == "true" ]]; then coveralls || echo "failed"; fi 30 | cache: apt 31 | -------------------------------------------------------------------------------- /code/lib/tfbldr/misc_scripts/speech_itr_test.py: -------------------------------------------------------------------------------- 1 | from tfbldr.datasets import tbptt_file_list_iterator 2 | import os 3 | import numpy as np 4 | 5 | files = os.listdir("/Tmp/kastner/lj_speech_hybrid_speakers/numpy_features/") 6 | files = ["/Tmp/kastner/lj_speech_hybrid_speakers/numpy_features/" + f for f in files] 7 | ljspeech_hybridset = [' ', '!', ',', '-', '.', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] 8 | hybrid_lookup = {v: k for k, v in enumerate(sorted(ljspeech_hybridset))} 9 | hybrid_inverse_lookup = {v: k for k, v in hybrid_lookup.items()} 10 | 11 | def file_access(f): 12 | d = np.load(f) 13 | text = d["text"] 14 | inds = [hybrid_lookup[t] for t in text.ravel()[0]] 15 | audio = d["audio_features"] 16 | return (audio, inds) 17 | 18 | random_state = np.random.RandomState(1442) 19 | batch_size = 8 20 | truncation_length = 256 21 | itr = tbptt_file_list_iterator(files, file_access, 22 | batch_size, 23 | truncation_length, 24 | other_one_hot_size=[len(ljspeech_hybridset)], 25 | random_state=random_state) 26 | for i in range(100000): 27 | print(i) 28 | r = itr.next_masked_batch() 29 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/LICENSE.md: -------------------------------------------------------------------------------- 1 | The wavenet_vocoder package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Ryuichi Yamamoto. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining 6 | > a copy of this software and associated documentation files (the 7 | > "Software"), to deal in the Software without restriction, including 8 | > without limitation the rights to use, copy, modify, merge, publish, 9 | > distribute, sublicense, and/or sell copies of the Software, and to 10 | > permit persons to whom the Software is furnished to do so, subject to 11 | > the following conditions: 12 | > 13 | > The above copyright notice and this permission notice shall be 14 | > included in all copies or substantial portions of the Software. 15 | > 16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | > IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | > CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | > TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | > SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/lrschedule.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # https://github.com/tensorflow/tensor2tensor/issues/280#issuecomment-339110329 5 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 6 | # Noam scheme from tensor2tensor: 7 | warmup_steps = float(warmup_steps) 8 | step = global_step + 1. 
9 | lr = init_lr * warmup_steps**0.5 * np.minimum( 10 | step * warmup_steps**-1.5, step**-0.5) 11 | return lr 12 | 13 | 14 | def step_learning_rate_decay(init_lr, global_step, 15 | anneal_rate=0.98, 16 | anneal_interval=30000): 17 | return init_lr * anneal_rate ** (global_step // anneal_interval) 18 | 19 | 20 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 21 | """Cyclic cosine annealing 22 | 23 | https://arxiv.org/pdf/1704.00109.pdf 24 | 25 | Args: 26 | init_lr (float): Initial learning rate 27 | global_step (int): Current iteration number 28 | T (int): Total iteration number (i,e. nepoch) 29 | M (int): Number of ensembles we want 30 | 31 | Returns: 32 | float: Annealed learning rate 33 | """ 34 | TdivM = T // M 35 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) 36 | -------------------------------------------------------------------------------- /code/lib/tfbldr.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.rst 2 | setup.py 3 | tfbldr/__init__.py 4 | tfbldr.egg-info/PKG-INFO 5 | tfbldr.egg-info/SOURCES.txt 6 | tfbldr.egg-info/dependency_links.txt 7 | tfbldr.egg-info/requires.txt 8 | tfbldr.egg-info/top_level.txt 9 | tfbldr/core/__init__.py 10 | tfbldr/core/core.py 11 | tfbldr/datasets/__init__.py 12 | tfbldr/datasets/iterators.py 13 | tfbldr/datasets/loaders.py 14 | tfbldr/datasets/plotters.py 15 | tfbldr/datasets/audio/__init__.py 16 | tfbldr/datasets/audio/audio_tools.py 17 | tfbldr/datasets/audio/datasets.py 18 | tfbldr/datasets/audio/magrecnp.py 19 | tfbldr/datasets/music/__init__.py 20 | tfbldr/datasets/music/analysis.py 21 | tfbldr/datasets/music/loaders.py 22 | tfbldr/datasets/music/music.py 23 | tfbldr/datasets/text/__init__.py 24 | tfbldr/datasets/text/cleaning/__init__.py 25 | tfbldr/datasets/text/cleaning/cleaners.py 26 | tfbldr/datasets/text/cleaning/cmudict.py 27 | tfbldr/datasets/text/cleaning/eng_rules.py 28 | tfbldr/datasets/text/cleaning/number_to_words.py 29 | tfbldr/datasets/text/cleaning/numbers.py 30 | tfbldr/datasets/text/cleaning/symbols.py 31 | tfbldr/nodes/__init__.py 32 | tfbldr/nodes/nodes.py 33 | tfbldr/plot/__init__.py 34 | tfbldr/plot/audio.py 35 | tfbldr/plot/plot.py 36 | tfbldr/test/__init__.py 37 | tfbldr/test/test_import.py 38 | tfbldr/test/test_simple.py 39 | tfbldr/utils/__init__.py 40 | tfbldr/utils/utils.py -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/taco_prosody_test.txt: -------------------------------------------------------------------------------- 1 | how do bureaucrats wrap presents? with lots of red tape. 2 | why are libraries so strict? they have to go by the book. 3 | why are fish so smart? because they hang out in schools so much. 4 | heaps of things. like fairy bread, how the surf is today and why magpies swoop. 5 | the past, the present, and the future walk into a bar. it was tense. 6 | i usually down a cup of java script. then i put on nature sounds and run a few strenuous searches to improve my speed. 7 | i don't have eyes, but i don't need them to know the vibe in here feels good. 8 | what time do you go to the dentist? at tooth-hurty! 9 | sweet dreams are made of these. friendly assistants who work hard to please 10 | you are what you eat. so i guess i'm a whole lot of data and a little bit of pizza recipes. 11 | men say they know many things; but lo! 
they have taken wings, the arts and sciences, And a thousand appliances; the wind that blows is all that any body knows. 12 | do you prefer chocolate or jelly? which would you like in your belly? you could make a good case, for a cool ice cream base, but I'd argue against vermicelli. 13 | halloween edition it is! remember to follow the moves as I say them. 14 | why are archaeologists so annoyed? they always have a bone to pick. 15 | that one sailed right over my head. 16 | wear your heart on your sleeve. it'll terrify people. 17 | -------------------------------------------------------------------------------- /code/lib/tfbldr/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | from .nodes import Linear 2 | from .nodes import ReLU 3 | from .nodes import Tanh 4 | from .nodes import Sigmoid 5 | from .nodes import OneHot 6 | from .nodes import Softmax 7 | from .nodes import Conv2d 8 | from .nodes import GatedMaskedConv2d 9 | from .nodes import ConvTranspose2d 10 | from .nodes import BatchNorm2d 11 | from .nodes import LayerNorm 12 | from .nodes import Embedding 13 | from .nodes import PositionalEncoding 14 | from .nodes import TransformerBlock 15 | from .nodes import MultiheadAttention 16 | from .nodes import Bilinear 17 | from .nodes import VqEmbedding 18 | from .nodes import VqSeqEmbedding 19 | from .nodes import SimpleRNNCell 20 | from .nodes import BiLSTMLayer 21 | from .nodes import SequenceConv1dStack 22 | from .nodes import LSTMCell 23 | from .nodes import GRUCell 24 | from .nodes import AdditiveGaussianNoise 25 | from .nodes import GaussianAttentionCell 26 | from .nodes import DiscreteMixtureOfLogistics 27 | from .nodes import DiscreteMixtureOfLogisticsCost 28 | from .nodes import BernoulliAndCorrelatedGMM 29 | from .nodes import BernoulliAndCorrelatedGMMCost 30 | from .nodes import BernoulliCrossEntropyCost 31 | from .nodes import CategoricalCrossEntropyCost 32 | from .nodes import CategoricalCrossEntropyIndexCost 33 | from .nodes import CategoricalCrossEntropyLinearIndexCost 34 | from .nodes import make_numpy_weights 35 | from .nodes import make_numpy_biases 36 | -------------------------------------------------------------------------------- /pretrained/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | 8 | import cmudict 9 | 10 | _pad = '_' 11 | _eos = '~' 12 | # PUT IT BACK!!! 13 | 14 | _phones = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh', ' '] 15 | special = [s for s in "!\',-.:?"] 16 | _pau_phones = _phones + [s for s in ["1","2","3","4"]] 17 | _phones = _phones + special 18 | 19 | _characters = 'abcdefghijklmnopqrstuvwxyz!\',-.:? ' 20 | _rules = 'abcdefghijklmnopqrstuvwxyz&^!\',-.:? ' 21 | 22 | #_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\',-.:? 
' 23 | 24 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 25 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 26 | 27 | # Export all symbols: 28 | char_symbols = [_pad, _eos] + list(_characters)# + _arpabet 29 | phone_symbols = [_pad, _eos] + list(_phones)# + _arpabet 30 | pau_phone_symbols = [_pad, _eos] + list(_pau_phones) 31 | rule_symbols = [_pad, _eos] + list(_rules)# + _arpabet 32 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 7 | 8 | import cmudict 9 | 10 | _pad = '_' 11 | _eos = '~' 12 | # PUT IT BACK!!! 13 | 14 | _phones = ['aa', 'ae', 'ah', 'ao', 'aw', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh', ' '] 15 | special = [s for s in "!\',-.:?"] 16 | _pau_phones = _phones + [s for s in ["1","2","3","4"]] 17 | _phones = _phones + special 18 | 19 | _characters = 'abcdefghijklmnopqrstuvwxyz!\',-.:? ' 20 | _rules = 'abcdefghijklmnopqrstuvwxyz&^!\',-.:? ' 21 | 22 | #_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\',-.:? ' 23 | 24 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 25 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 26 | 27 | # Export all symbols: 28 | char_symbols = [_pad, _eos] + list(_characters)# + _arpabet 29 | phone_symbols = [_pad, _eos] + list(_phones)# + _arpabet 30 | pau_phone_symbols = [_pad, _eos] + list(_pau_phones) 31 | rule_symbols = [_pad, _eos] + list(_rules)# + _arpabet 32 | -------------------------------------------------------------------------------- /code/lib/tfbldr/test/test_simple.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.getLogger('tensorflow').disabled = True 3 | import tensorflow as tf 4 | import numpy as np 5 | from tfbldr import make_numpy_weights, make_numpy_biases, dot, scan, get_params_dict 6 | from tfbldr.nodes import Linear, SimpleRNNCell 7 | 8 | n_batch = 64 9 | h_dim = 400 10 | random_state = np.random.RandomState(2145) 11 | 12 | inputs = tf.placeholder(tf.float32, [None, n_batch, 3], 13 | name="inputs") 14 | init_h = tf.placeholder(tf.float32, [n_batch, h_dim], 15 | name="init_h") 16 | 17 | def step(inp_t, h_tm1): 18 | output, state = SimpleRNNCell([inp_t], [3], h_tm1, h_dim, 20, random_state=random_state, 19 | name="l1") 20 | h = state[0] 21 | return output, h 22 | 23 | o = scan(step, [inputs], [None, init_h]) 24 | loss = tf.reduce_mean(o[0]) 25 | h_o = o[1] 26 | 27 | params_dict = get_params_dict() 28 | params = params_dict.values() 29 | grads = tf.gradients(loss, params) 30 | 31 | learning_rate = 0.0002 32 | opt = tf.train.AdamOptimizer(learning_rate=learning_rate, use_locking=True) 33 | updates = opt.apply_gradients(zip(grads, params)) 34 | 35 | inputs_np = random_state.randn(33, n_batch, 3) 36 | init_h_np = np.zeros((n_batch, h_dim)) 37 | with tf.Session() as sess: 38 | 
sess.run(tf.global_variables_initializer()) 39 | feed = {inputs: inputs_np, 40 | init_h: init_h_np} 41 | outs = [loss, updates, h_o] 42 | lop = sess.run(outs, feed) 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Kyle Kastner 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /code/lib/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Kyle Kastner 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .loaders import rsync_fetch 2 | from .loaders import fetch_iamondb 3 | from .loaders import fetch_ljspeech 4 | from .loaders import fetch_fruitspeech 5 | from .loaders import fetch_mnist 6 | from .loaders import fetch_fashion_mnist 7 | from .loaders import make_sinewaves 8 | from .loaders import get_tfbldr_dataset_dir 9 | from .loaders import fetch_norvig_words 10 | from .audio import wavfile_caching_mel_tbptt_iterator 11 | from .iterators import list_iterator 12 | from .iterators import ordered_list_iterator 13 | from .iterators import tbptt_list_iterator 14 | from .iterators import tbptt_file_list_iterator 15 | from .iterators import char_textfile_iterator 16 | 17 | from ..core import get_logger 18 | logger = get_logger() 19 | 20 | # music21 and PIL are optional deps 21 | try: 22 | from .music import fetch_jsb 23 | from .music import fetch_josquin 24 | from .music import pitch_and_duration_to_quantized 25 | from .music import pitches_and_durations_to_pretty_midi 26 | from .music import quantized_to_pretty_midi 27 | from .music import plot_pitches_and_durations 28 | from .music import music21_to_pitch_duration 29 | from .music import music21_to_quantized 30 | from .music import plot_piano_roll 31 | from .music import quantized_imlike_to_image_array 32 | from .music import midi_to_notes 33 | from .music import notes_to_midi 34 | from .music import quantized_to_pitch_duration 35 | except ImportError: 36 | logger.info("Unable to import music21 related utilities") 37 | 38 | try: 39 | from .plotters import save_image_array 40 | except ImportError: 41 | logger.info("Unable to import PIL related utilities") 42 | -------------------------------------------------------------------------------- /code/lib/tfbldr/plot/audio.py: -------------------------------------------------------------------------------- 1 | from ..datasets.audio import stft 2 | from .plot import get_viridis 3 | import numpy as np 4 | 5 | 6 | def specgram(arr, fftsize=512, step=16, mean_normalize=True, real=False, 7 | compute_onesided=True, min_value=-100, max_value=np.inf, axis=0): 8 | arr = np.array(arr) 9 | if len(arr.shape) != 1: 10 | raise ValueError("arr must be a 1D np array or list") 11 | 12 | if axis != 0: 13 | raise ValueError("Must have axis=0") 14 | 15 | Pxx = 20. 
* np.log10(np.abs(stft(arr, fftsize=fftsize, step=step, mean_normalize=mean_normalize, real=real, compute_onesided=compute_onesided))) 16 | return np.clip(Pxx, min_value, max_value) 17 | 18 | 19 | def specplot(arr, mplaxis, time_ratio=4, cmap="viridis"): 20 | """ 21 | assumes arr comes in with time on axis 0, frequency on axis 1 22 | """ 23 | import matplotlib.pyplot as plt 24 | if cmap == "viridis": 25 | cmap = get_viridis() 26 | # Transpose so time is X axis, and invert y axis so 27 | # frequency is low at bottom 28 | mag = arr.T[::-1, :] 29 | mplaxis.matshow(mag, cmap=cmap) 30 | x1 = mag.shape[0] 31 | y1 = mag.shape[1] 32 | 33 | def autoaspect(x_range, y_range): 34 | """ 35 | The aspect to make a plot square with ax.set_aspect in Matplotlib 36 | """ 37 | b = [x_range, y_range] 38 | mi = np.argmax(b) 39 | mx = b[mi] 40 | mn = b[1] if mi == 0 else b[0] 41 | ratio = time_ratio / 1. if mi == 0 else 1. / time_ratio 42 | if x_range <= y_range: 43 | return ratio * mx / float(mn) 44 | else: 45 | return ratio * mn / float(mx) 46 | asp = autoaspect(x1, y1) 47 | mplaxis.set_aspect(asp) 48 | mplaxis.xaxis.tick_bottom() 49 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/presets/ljspeech_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 22050, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/presets/cmu_arctic_8bit.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "mulaw-quantize", 5 | "quantize_channels": 256, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | 
"fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 256, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": -1, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": false, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/presets/multispeaker_cmu_arctic_mixture.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wavenet_vocoder", 3 | "builder": "wavenet", 4 | "input_type": "raw", 5 | "quantize_channels": 65536, 6 | "sample_rate": 16000, 7 | "silence_threshold": 2, 8 | "num_mels": 80, 9 | "fmin": 125, 10 | "fmax": 7600, 11 | "fft_size": 1024, 12 | "hop_size": 256, 13 | "frame_shift_ms": null, 14 | "min_level_db": -100, 15 | "ref_level_db": 20, 16 | "rescaling": true, 17 | "rescaling_max": 0.999, 18 | "allow_clipping_in_normalization": true, 19 | "log_scale_min": -32.23619130191664, 20 | "out_channels": 30, 21 | "layers": 24, 22 | "stacks": 4, 23 | "residual_channels": 512, 24 | "gate_channels": 512, 25 | "skip_out_channels": 256, 26 | "dropout": 0.050000000000000044, 27 | "kernel_size": 3, 28 | "weight_normalization": true, 29 | "cin_channels": 80, 30 | "upsample_conditional_features": true, 31 | "upsample_scales": [ 32 | 4, 33 | 4, 34 | 4, 35 | 4 36 | ], 37 | "freq_axis_kernel_size": 3, 38 | "gin_channels": 16, 39 | "n_speakers": 7, 40 | "pin_memory": true, 41 | "num_workers": 2, 42 | "test_size": 0.0441, 43 | "test_num_samples": null, 44 | "random_state": 1234, 45 | "batch_size": 2, 46 | "adam_beta1": 0.9, 47 | "adam_beta2": 0.999, 48 | "adam_eps": 1e-08, 49 | "initial_learning_rate": 0.001, 50 | "lr_schedule": "noam_learning_rate_decay", 51 | "lr_schedule_kwargs": {}, 52 | "nepochs": 2000, 53 | "weight_decay": 0.0, 54 | "clip_thresh": -1, 55 | "max_time_sec": null, 56 | "max_time_steps": 8000, 57 | "exponential_moving_average": true, 58 | "ema_decay": 0.9999, 59 | "checkpoint_interval": 10000, 60 | "train_eval_interval": 10000, 61 | "test_eval_epoch_interval": 5, 62 | "save_optimizer_state": true 63 | } -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/builder.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | 5 | def wavenet(out_channels=256, 6 | layers=20, 7 | stacks=2, 8 | residual_channels=512, 9 | gate_channels=512, 10 | skip_out_channels=512, 11 | cin_channels=-1, 12 | gin_channels=-1, 13 | weight_normalization=True, 14 | dropout=1 - 0.95, 15 | kernel_size=3, 16 | n_speakers=None, 17 | upsample_conditional_features=False, 18 | upsample_scales=[16, 16], 19 | freq_axis_kernel_size=3, 20 | scalar_input=False, 21 | use_speaker_embedding=True, 22 | legacy=True, 23 | ): 24 | from wavenet_vocoder_core import WaveNet 25 | 26 | model = WaveNet(out_channels=out_channels, layers=layers, stacks=stacks, 27 | residual_channels=residual_channels, 28 | gate_channels=gate_channels, 29 | skip_out_channels=skip_out_channels, 30 | kernel_size=kernel_size, dropout=dropout, 31 | weight_normalization=weight_normalization, 32 | cin_channels=cin_channels, gin_channels=gin_channels, 33 | n_speakers=n_speakers, 34 | upsample_conditional_features=upsample_conditional_features, 35 | upsample_scales=upsample_scales, 36 | freq_axis_kernel_size=freq_axis_kernel_size, 37 | scalar_input=scalar_input, 38 | use_speaker_embedding=use_speaker_embedding, 39 | legacy=legacy, 40 | ) 41 | 42 | return model 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Representation Mixing 2 | 3 | This repo has code and pretrained models in support of the paper [Representation Mixing for TTS Synthesis](https://arxiv.org/abs/1811.07240) 4 | 5 | Try the demo! https://colab.research.google.com/github/kastnerkyle/representation_mixing/blob/master/pretrained/representation_mixing_text_to_speech_demo.ipynb 6 | 7 | Samples site: https://s3.amazonaws.com/representation-mixing-site/index.html 8 | 9 | # Abstract 10 | Recent character and phoneme-based parametric TTS systems using deep learning have shown strong performance in natural speech generation. However, the choice between character or phoneme input can create serious limitations for practical deployment, as direct control of pronunciation is crucial in certain cases. We demonstrate a simple method for combining multiple types of linguistic information in a single encoder, named representation mixing, enabling flexible choice between character, phoneme, or mixed representations during inference. Experiments and user studies on a public audiobook corpus show the efficacy of our approach. 11 | 12 | [(Taken from the paper)](https://arxiv.org/abs/1811.07240) 13 | 14 | # Architecture Diagram 15 |
16 | 17 | 18 |
19 |
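The embedding module pictured above can be summarized in a few lines. Below is a minimal, self-contained sketch of the mixing idea from the abstract; the vocabulary sizes, embedding width, helper names, and the per-word random switch are all illustrative assumptions, and this is not the training code shipped in `code/`:

```python
import numpy as np

rng = np.random.RandomState(0)

# Hypothetical sizes -- the real vocabularies and embedding width live in code/lib.
n_chars, n_phones, embed_dim = 40, 60, 16
char_embed = 0.1 * rng.randn(n_chars, embed_dim)    # character symbol embeddings
phone_embed = 0.1 * rng.randn(n_phones, embed_dim)  # phoneme symbol embeddings
type_embed = 0.1 * rng.randn(2, embed_dim)          # row 0: characters, row 1: phonemes

def embed_word(char_ids, phone_ids, use_phones):
    """Embed one word from either its character ids or its phoneme ids,
    adding a representation-type embedding so the encoder knows which
    view of the word it was given."""
    if use_phones:
        symbols = phone_embed[phone_ids]
        rep_type = type_embed[1]
    else:
        symbols = char_embed[char_ids]
        rep_type = type_embed[0]
    return symbols + rep_type  # shape: (word_length, embed_dim)

# During training each word's representation is picked at random, so a single
# encoder learns characters, phonemes, and any mixture of the two.
words = [([3, 7, 1], [12, 5]), ([9, 2, 4], [30, 8])]  # (char ids, phone ids) per word
mixed = [embed_word(c, p, use_phones=rng.rand() < 0.5) for c, p in words]
encoder_input = np.concatenate(mixed, axis=0)         # fed to the shared encoder
print(encoder_input.shape)
```

Because one encoder sees both views during training, inference can use characters, phonemes, or a per-word mixture of the two, which is what gives direct control over pronunciation.
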
20 | 21 | # More Info 22 | `pretrained/` contains some information and code for pretrained models, as well as a colab notebook for sampling from the pretrained model 23 | 24 | `code/` (will) contain a NON-RUNNABLE code dump of my research library used for training the model. This is only for very, very interested people and for seeing the model definition in code. If you just want sound, use the colab. 25 | 26 | -------------------------------------------------------------------------------- /pretrained/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | valid_symbols = [ 6 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 7 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 8 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 9 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 10 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 11 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 12 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 13 | ] 14 | 15 | _valid_symbol_set = set(valid_symbols) 16 | 17 | 18 | class CMUDict: 19 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | 31 | def __len__(self): 32 | return len(self._entries) 33 | 34 | 35 | def lookup(self, word): 36 | '''Returns list of ARPAbet pronunciations of the given word.''' 37 | return self._entries.get(word.upper()) 38 | 39 | 40 | 41 | _alt_re = re.compile(r'\([0-9]+\)') 42 | 43 | 44 | def _parse_cmudict(file): 45 | cmudict = {} 46 | for line in file: 47 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 48 | parts = line.split(' ') 49 | word = re.sub(_alt_re, '', parts[0]) 50 | pronunciation = _get_pronunciation(parts[1]) 51 | if pronunciation: 52 | if word in cmudict: 53 | cmudict[word].append(pronunciation) 54 | else: 55 | cmudict[word] = [pronunciation] 56 | return cmudict 57 | 58 | 59 | def _get_pronunciation(s): 60 | parts = s.strip().split(' ') 61 | for part in parts: 62 | if part not in _valid_symbol_set: 63 | return None 64 | return ' '.join(parts) 65 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | valid_symbols = [ 6 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 7 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 8 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 9 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 10 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 11 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 12 | 'UW0', 
'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 13 | ] 14 | 15 | _valid_symbol_set = set(valid_symbols) 16 | 17 | 18 | class CMUDict: 19 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 20 | def __init__(self, file_or_path, keep_ambiguous=True): 21 | if isinstance(file_or_path, str): 22 | with open(file_or_path, encoding='latin-1') as f: 23 | entries = _parse_cmudict(f) 24 | else: 25 | entries = _parse_cmudict(file_or_path) 26 | if not keep_ambiguous: 27 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 28 | self._entries = entries 29 | 30 | 31 | def __len__(self): 32 | return len(self._entries) 33 | 34 | 35 | def lookup(self, word): 36 | '''Returns list of ARPAbet pronunciations of the given word.''' 37 | return self._entries.get(word.upper()) 38 | 39 | 40 | 41 | _alt_re = re.compile(r'\([0-9]+\)') 42 | 43 | 44 | def _parse_cmudict(file): 45 | cmudict = {} 46 | for line in file: 47 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 48 | parts = line.split(' ') 49 | word = re.sub(_alt_re, '', parts[0]) 50 | pronunciation = _get_pronunciation(parts[1]) 51 | if pronunciation: 52 | if word in cmudict: 53 | cmudict[word].append(pronunciation) 54 | else: 55 | cmudict[word] = [pronunciation] 56 | return cmudict 57 | 58 | 59 | def _get_pronunciation(s): 60 | parts = s.strip().split(' ') 61 | for part in parts: 62 | if part not in _valid_symbol_set: 63 | return None 64 | return ' '.join(parts) 65 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Preprocess dataset 4 | 5 | usage: preprocess.py [options] 6 | 7 | options: 8 | --num_workers= Num workers. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | -h, --help Show help message. 
12 | """ 13 | from docopt import docopt 14 | import os 15 | from multiprocessing import cpu_count 16 | from tqdm import tqdm 17 | import importlib 18 | from hparams import hparams 19 | 20 | 21 | def preprocess(mod, in_dir, out_root, num_workers): 22 | os.makedirs(out_dir, exist_ok=True) 23 | metadata = mod.build_from_path(in_dir, out_dir, num_workers, tqdm=tqdm) 24 | write_metadata(metadata, out_dir) 25 | 26 | 27 | def write_metadata(metadata, out_dir): 28 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 29 | for m in metadata: 30 | f.write('|'.join([str(x) for x in m]) + '\n') 31 | frames = sum([m[2] for m in metadata]) 32 | sr = hparams.sample_rate 33 | hours = frames / sr / 3600 34 | print('Wrote %d utterances, %d time steps (%.2f hours)' % (len(metadata), frames, hours)) 35 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 36 | print('Max output length: %d' % max(m[2] for m in metadata)) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = docopt(__doc__) 41 | name = args[""] 42 | in_dir = args[""] 43 | out_dir = args[""] 44 | num_workers = args["--num_workers"] 45 | num_workers = cpu_count() if num_workers is None else int(num_workers) 46 | preset = args["--preset"] 47 | 48 | # Load preset if specified 49 | if preset is not None: 50 | with open(preset) as f: 51 | hparams.parse_json(f.read()) 52 | # Override hyper parameters 53 | hparams.parse(args["--hparams"]) 54 | assert hparams.name == "wavenet_vocoder" 55 | 56 | print("Sampling frequency: {}".format(hparams.sample_rate)) 57 | 58 | assert name in ["cmu_arctic", "ljspeech", "librivox", "jsut"] 59 | mod = importlib.import_module(name) 60 | preprocess(mod, in_dir, out_dir, num_workers) 61 | -------------------------------------------------------------------------------- /code/lib/continuous_integration/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called by the "install" step defined in 3 | # .travis.yml. See http://docs.travis-ci.com/ for more details. 4 | # The behavior of the script is controlled by environment variabled defined 5 | # in the .travis.yml in the top level folder of the project. 6 | 7 | # License: 3-clause BSD 8 | 9 | set -e 10 | 11 | # Fix the compilers to workaround avoid having the Python 3.4 build 12 | # lookup for g++44 unexpectedly. 13 | export CC=gcc 14 | export CXX=g++ 15 | 16 | echo 'List files from cached directories' 17 | echo 'pip:' 18 | ls $HOME/.cache/pip 19 | if [[ -d $HOME/download ]]; then 20 | echo 'download' 21 | ls $HOME/download 22 | fi 23 | 24 | # Deactivate the travis-provided virtual environment and setup a 25 | # conda-based environment instead 26 | deactivate 27 | 28 | # Use the miniconda installer for faster download / install of conda 29 | # itself 30 | pushd . 31 | cd 32 | mkdir -p download 33 | cd download 34 | echo "Cached in $HOME/download :" 35 | ls -l 36 | echo 37 | if [[ ! -f miniconda.sh ]] 38 | then 39 | wget https://repo.continuum.io/miniconda/Miniconda2-4.3.11-Linux-x86_64.sh \ 40 | -O miniconda.sh 41 | fi 42 | chmod +x miniconda.sh && ./miniconda.sh -b 43 | cd .. 
44 | echo $(ls /home/travis/m*) 45 | export PATH=/home/travis/miniconda2/bin:$PATH 46 | conda update --yes conda 47 | popd 48 | 49 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 50 | numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION tensorflow=$TF_VERSION 51 | source activate testenv 52 | 53 | if [[ "$INSTALL_MKL" == "true" ]]; then 54 | # Make sure that MKL is used 55 | conda install --yes mkl 56 | else 57 | # Make sure that MKL is not used 58 | conda remove --yes --features mkl || echo "MKL not installed" 59 | fi 60 | 61 | if [[ "$COVERAGE" == "true" ]]; then 62 | pip install coverage coveralls 63 | fi 64 | 65 | # Build scikit-learn in the install.sh script to collapse the verbose 66 | # build output in the travis output when it succeeds. 67 | python --version 68 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 69 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 70 | python -c "import tensorflow as tf; print('tensorflow %s' % tf.__version__)" 71 | python setup.py build_ext --inplace 72 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import setuptools.command.develop 5 | import setuptools.command.build_py 6 | import os 7 | import subprocess 8 | 9 | version = '0.1.1' 10 | 11 | # Adapted from https://github.com/pytorch/pytorch 12 | cwd = os.path.dirname(os.path.abspath(__file__)) 13 | if os.getenv('WAVENET_VOCODER_BUILD_VERSION'): 14 | version = os.getenv('WAVENET_VOCODER_BUILD_VERSION') 15 | else: 16 | try: 17 | sha = subprocess.check_output( 18 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 19 | version += '+' + sha[:7] 20 | except subprocess.CalledProcessError: 21 | pass 22 | except IOError: # FileNotFoundError for python 3 23 | pass 24 | 25 | 26 | class build_py(setuptools.command.build_py.build_py): 27 | 28 | def run(self): 29 | self.create_version_file() 30 | setuptools.command.build_py.build_py.run(self) 31 | 32 | @staticmethod 33 | def create_version_file(): 34 | global version, cwd 35 | print('-- Building version ' + version) 36 | version_path = os.path.join(cwd, 'wavenet_vocoder', 'version.py') 37 | with open(version_path, 'w') as f: 38 | f.write("__version__ = '{}'\n".format(version)) 39 | 40 | 41 | class develop(setuptools.command.develop.develop): 42 | 43 | def run(self): 44 | build_py.create_version_file() 45 | setuptools.command.develop.develop.run(self) 46 | 47 | 48 | setup(name='wavenet_vocoder', 49 | version=version, 50 | description='PyTorch implementation of WaveNet vocoder', 51 | packages=find_packages(), 52 | cmdclass={ 53 | 'build_py': build_py, 54 | 'develop': develop, 55 | }, 56 | install_requires=[ 57 | "numpy", 58 | "scipy", 59 | "torch >= 0.4.1", 60 | ], 61 | extras_require={ 62 | "train": [ 63 | "docopt", 64 | "tqdm", 65 | "tensorboardX", 66 | "nnmnkwii >= 0.0.11", 67 | "keras", 68 | "scikit-learn", 69 | "lws", 70 | ], 71 | "test": [ 72 | "nose", 73 | "pysptk >= 0.1.9", 74 | "librosa", 75 | "matplotlib", 76 | "tqdm", 77 | "nnmnkwii >= 0.0.11", 78 | ], 79 | }) 80 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/tests/test_mixture.py: -------------------------------------------------------------------------------- 1 | # 
coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | import librosa 10 | import pysptk 11 | 12 | from wavenet_vocoder.mixture import discretized_mix_logistic_loss 13 | from wavenet_vocoder.mixture import sample_from_discretized_mix_logistic 14 | 15 | 16 | def log_prob_from_logits(x): 17 | """ numerically stable log_softmax implementation that prevents overflow """ 18 | # TF ordering 19 | axis = len(x.size()) - 1 20 | m, _ = torch.max(x, dim=-1, keepdim=True) 21 | return x - m - torch.log(torch.sum(torch.exp(x - m), dim=axis, keepdim=True)) 22 | 23 | 24 | def test_log_softmax(): 25 | x = torch.rand(2, 16000, 30) 26 | y = log_prob_from_logits(x) 27 | y_hat = F.log_softmax(x, -1) 28 | 29 | y = y.data.cpu().numpy() 30 | y_hat = y_hat.data.cpu().numpy() 31 | assert np.allclose(y, y_hat) 32 | 33 | 34 | def test_mixture(): 35 | np.random.seed(1234) 36 | 37 | x, sr = librosa.load(pysptk.util.example_audio_file(), sr=None) 38 | assert sr == 16000 39 | 40 | T = len(x) 41 | x = x.reshape(1, T, 1) 42 | y = torch.from_numpy(x).float() 43 | y_hat = torch.rand(1, 30, T).float() 44 | 45 | print(y.shape, y_hat.shape) 46 | 47 | loss = discretized_mix_logistic_loss(y_hat, y) 48 | print(loss) 49 | 50 | loss = discretized_mix_logistic_loss(y_hat, y, reduce=False) 51 | print(loss.size(), y.size()) 52 | assert loss.size() == y.size() 53 | 54 | y = sample_from_discretized_mix_logistic(y_hat) 55 | print(y.shape) 56 | 57 | 58 | def test_misc(): 59 | # https://en.wikipedia.org/wiki/Logistic_distribution 60 | # what i have learned 61 | # m = (x - mu) / s 62 | m = torch.rand(10, 10) 63 | log_pdf_mid1 = -2 * torch.log(torch.exp(m / 2) + torch.exp(-m / 2)) 64 | log_pdf_mid2 = m - 2 * F.softplus(m) 65 | assert np.allclose(log_pdf_mid1.data.numpy(), log_pdf_mid2.data.numpy()) 66 | 67 | # Edge case for 0 68 | plus_in = torch.rand(10, 10) 69 | log_cdf_plus1 = F.sigmoid(m).log() 70 | log_cdf_plus2 = m - F.softplus(m) 71 | assert np.allclose(log_cdf_plus1.data.numpy(), log_cdf_plus2.data.numpy()) 72 | 73 | # Edge case for 255 74 | min_in = torch.rand(10, 10) 75 | log_one_minus_cdf_min1 = (1 - F.sigmoid(min_in)).log() 76 | log_one_minus_cdf_min2 = -F.softplus(min_in) 77 | assert np.allclose(log_one_minus_cdf_min1.data.numpy(), log_one_minus_cdf_min2.data.numpy()) 78 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/conv.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | class Conv1d(nn.Conv1d): 8 | """Extended nn.Conv1d for incremental dilated convolutions 9 | """ 10 | 11 | def __init__(self, *args, **kwargs): 12 | super().__init__(*args, **kwargs) 13 | self.clear_buffer() 14 | self._linearized_weight = None 15 | self.register_backward_hook(self._clear_linearized_weight) 16 | 17 | def incremental_forward(self, input): 18 | # input: (B, T, C) 19 | if self.training: 20 | raise RuntimeError('incremental_forward only supports eval mode') 21 | 22 | # run forward pre hooks (e.g., weight norm) 23 | for hook in self._forward_pre_hooks.values(): 24 | hook(self, input) 25 | 26 | # reshape weight 27 | weight = self._get_linearized_weight() 28 | kw = self.kernel_size[0] 29 | dilation = self.dilation[0] 30 | 31 | bsz = 
input.size(0) # input: bsz x len x dim 32 | if kw > 1: 33 | input = input.data 34 | if self.input_buffer is None: 35 | self.input_buffer = input.new(bsz, kw + (kw - 1) * (dilation - 1), input.size(2)) 36 | self.input_buffer.zero_() 37 | else: 38 | # shift buffer 39 | self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone() 40 | # append next input 41 | self.input_buffer[:, -1, :] = input[:, -1, :] 42 | input = self.input_buffer 43 | if dilation > 1: 44 | input = input[:, 0::dilation, :].contiguous() 45 | output = F.linear(input.view(bsz, -1), weight, self.bias) 46 | return output.view(bsz, 1, -1) 47 | 48 | def clear_buffer(self): 49 | self.input_buffer = None 50 | 51 | def _get_linearized_weight(self): 52 | if self._linearized_weight is None: 53 | kw = self.kernel_size[0] 54 | # nn.Conv1d 55 | if self.weight.size() == (self.out_channels, self.in_channels, kw): 56 | weight = self.weight.transpose(1, 2).contiguous() 57 | else: 58 | # fairseq.modules.conv_tbc.ConvTBC 59 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 60 | assert weight.size() == (self.out_channels, kw, self.in_channels) 61 | self._linearized_weight = weight.view(self.out_channels, -1) 62 | return self._linearized_weight 63 | 64 | def _clear_linearized_weight(self, *args): 65 | self._linearized_weight = None 66 | -------------------------------------------------------------------------------- /pretrained/numbers_rules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ from https://github.com/keithito/tacotron """ 3 | 4 | import re 5 | from number_to_words import NumberToWords 6 | 7 | n2w = NumberToWords() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return n2w.convert(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | return n2w.convert(num) 52 | 53 | ''' 54 | _inflect = inflect.engine() 55 | def _expand_ordinal(m): 56 | return _inflect.number_to_words(m.group(0)) 57 | 58 | 59 | def _expand_number(m): 60 | num = int(m.group(0)) 61 | if num > 1000 and num < 3000: 62 | if num == 2000: 63 | return 'two thousand' 64 | elif num > 2000 and num < 2010: 65 | return 'two thousand ' + _inflect.number_to_words(num % 100) 66 | elif num % 100 == 0: 67 | return _inflect.number_to_words(num // 
100) + ' hundred' 68 | else: 69 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 70 | else: 71 | return _inflect.number_to_words(num, andword='') 72 | ''' 73 | 74 | def normalize_numbers(text): 75 | text = re.sub(_comma_number_re, _remove_commas, text) 76 | text = re.sub(_pounds_re, r'\1 pounds', text) 77 | text = re.sub(_dollars_re, _expand_dollars, text) 78 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 79 | text = re.sub(_ordinal_re, _expand_ordinal, text) 80 | text = re.sub(_number_re, _expand_number, text) 81 | return text 82 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/numbers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ from https://github.com/keithito/tacotron """ 3 | 4 | import re 5 | from .number_to_words import NumberToWords 6 | 7 | n2w = NumberToWords() 8 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 9 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 10 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 11 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 12 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 13 | _number_re = re.compile(r'[0-9]+') 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(',', '') 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace('.', ' point ') 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split('.') 27 | if len(parts) > 2: 28 | return match + ' dollars' # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 33 | cent_unit = 'cent' if cents == 1 else 'cents' 34 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 37 | return '%s %s' % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = 'cent' if cents == 1 else 'cents' 40 | return '%s %s' % (cents, cent_unit) 41 | else: 42 | return 'zero dollars' 43 | 44 | 45 | def _expand_ordinal(m): 46 | return n2w.convert(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | return n2w.convert(num) 52 | 53 | ''' 54 | _inflect = inflect.engine() 55 | def _expand_ordinal(m): 56 | return _inflect.number_to_words(m.group(0)) 57 | 58 | 59 | def _expand_number(m): 60 | num = int(m.group(0)) 61 | if num > 1000 and num < 3000: 62 | if num == 2000: 63 | return 'two thousand' 64 | elif num > 2000 and num < 2010: 65 | return 'two thousand ' + _inflect.number_to_words(num % 100) 66 | elif num % 100 == 0: 67 | return _inflect.number_to_words(num // 100) + ' hundred' 68 | else: 69 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 70 | else: 71 | return _inflect.number_to_words(num, andword='') 72 | ''' 73 | 74 | def normalize_numbers(text): 75 | text = re.sub(_comma_number_re, _remove_commas, text) 76 | text = re.sub(_pounds_re, r'\1 pounds', text) 77 | text = re.sub(_dollars_re, _expand_dollars, text) 78 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 79 | text = re.sub(_ordinal_re, _expand_ordinal, text) 80 | text = re.sub(_number_re, _expand_number, text) 81 | return text 82 | -------------------------------------------------------------------------------- /pretrained/text.py: 
-------------------------------------------------------------------------------- 1 | from eng_rules import cmu_g2p, hybrid_g2p, rulebased_g2p 2 | from cleaners import english_cleaners 3 | import re 4 | 5 | def pronounce_chars(line, raw_line=None, cmu_only=False, int_timing_punct=True): 6 | # cleaners strip things... 7 | puncts = ["!",",",":","?","."] 8 | #puncts_timing = ["4","1","1","4", "4"] 9 | puncts_timing = [" "," "," "," ", " "] 10 | end_punct = [(ni, pi) for ni, pi in enumerate(puncts) if pi in line] 11 | if len(end_punct) > 0: 12 | # preserve the end punctuation... 13 | if end_punct[-1][1] == line[-1]: 14 | end_punct = end_punct[-1] 15 | else: 16 | end_punct = (0, " ") 17 | else: 18 | end_punct = (0, " ") 19 | line = english_cleaners(line) 20 | if cmu_only: 21 | r0 = cmu_g2p(line, raw_line) 22 | return r0 23 | 24 | r = hybrid_g2p(line) 25 | 26 | if any([p in line for p in puncts]): 27 | new = [] 28 | psym = r.strip().split(" ") 29 | lsym = line.strip().split(" ") 30 | for lss, pss in zip(lsym, psym): 31 | prev = [] 32 | for ssi in pss.strip().split("@")[1:]: 33 | which_specials = [p for p in puncts if p in lss] 34 | if any([p in lss for p in puncts]): 35 | prev.append(re.sub(re.escape("|".join(puncts)), "", ssi)) 36 | # ASSUME ONLY 1? 37 | else: 38 | prev.append(ssi) 39 | if len(which_specials) > 0: 40 | prev.append(which_specials[0]) 41 | new.append(prev) 42 | prev = [] 43 | 44 | merged = "" 45 | for ii, chunk in enumerate(new): 46 | if any([p in chunk for p in puncts]): 47 | mstr = "" 48 | for ci in chunk: 49 | if any([p in ci for p in puncts]): 50 | which_specials = [(n, p) for n, p in enumerate(puncts) if p in ci] 51 | else: 52 | mstr += "@" 53 | mstr += ci 54 | merged += mstr 55 | if ii < (len(new) - 1): 56 | if not int_timing_punct: 57 | merged += which_specials[0][1] 58 | else: 59 | merged += puncts_timing[which_specials[0][0]] 60 | else: 61 | merged += "@" 62 | merged += "@".join(chunk) 63 | if ii < (len(new) - 1): 64 | merged += " " 65 | if merged[-1] == " ": 66 | merged = merged[:-1] 67 | if not int_timing_punct: 68 | merged += end_punct[1] 69 | else: 70 | merged += puncts_timing[end_punct[0]] 71 | merged += "~" 72 | return merged 73 | else: 74 | return r 75 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/plotters.py: -------------------------------------------------------------------------------- 1 | try: 2 | from cStringIO import StringIO as BytesIO 3 | except: # Python 3 4 | from io import BytesIO 5 | import numpy as np 6 | import PIL.Image 7 | import shutil 8 | from math import sqrt 9 | from skimage.transform import rescale, resize, downscale_local_mean 10 | from skimage.exposure import adjust_gamma 11 | 12 | 13 | def save_image_array(img, filename, resize_multiplier=(1, 1), gamma_multiplier=1, rescale_values=True, flipud=True, flat_wide=False, flat_vert=False, fmt="png"): 14 | """ 15 | Expects a 4D image array of (n_images, height, width, channels) 16 | 17 | rescale will rescale 1 channel images to the maximum value available 18 | 19 | Modified from implementation by Kyle McDonald 20 | 21 | https://github.com/kylemcdonald/python-utils/blob/master/show_array.py 22 | """ 23 | 24 | if len(img.shape) != 4: 25 | raise ValueError("Expects a 4D image array of (n_images, height, width, channels)") 26 | 27 | if flipud: 28 | img = img[:, ::-1] 29 | 30 | n_ex, o_height, o_width, o_channels = img.shape 31 | 32 | if img.shape[0] != 1: 33 | n = len(img) 34 | side = int(sqrt(n)) 35 | side0 = side 36 | side1 = side 37 | 
shp = img.shape 38 | if flat_wide or flat_vert or (side * side) == n: 39 | pass 40 | else: 41 | raise ValueError("Need input length that can be reshaped to a square (4, 16, 25, 36, etc)") 42 | n,h,w,c = img.shape 43 | if flat_wide: 44 | assert flat_wide != flat_vert 45 | side0 = 1 46 | side1 = n_ex 47 | elif flat_vert: 48 | assert flat_wide != flat_vert 49 | side0 = n_ex 50 | side1 = 1 51 | img = img.reshape(side0, side1, h, w, c).swapaxes(1, 2).reshape(side0*h, side1*w, c) 52 | else: 53 | img = img[0] 54 | 55 | if rescale_values: 56 | """ 57 | img_max = np.max(img) 58 | img_min = np.min(img) 59 | # scale to 0, 1 60 | img = (img - img_min) / float(img_max - img_min) 61 | # scale 0, 1 to 0, 255 62 | """ 63 | img *= 255. 64 | 65 | if img.shape[-1] == 1: 66 | img = img[:, :, 0] 67 | 68 | img = np.uint8(np.clip(img, 0, 255)) 69 | if resize_multiplier != (1, 1): 70 | rs = resize(img, (img.shape[0] * resize_multiplier[0], img.shape[1] * resize_multiplier[1])) 71 | 72 | if gamma_multiplier != 1: 73 | rs = adjust_gamma(rs, gamma_multiplier) 74 | 75 | if resize_multiplier != (1, 1) or gamma_multiplier != 1: 76 | rs *= 255. 77 | img = np.uint8(np.clip(rs, 0, 255)) 78 | image_data = BytesIO() 79 | PIL.Image.fromarray(img).save(image_data, fmt) 80 | with open(filename, 'wb') as f: 81 | image_data.seek(0) 82 | shutil.copyfileobj(image_data, f) 83 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/__init__.py: -------------------------------------------------------------------------------- 1 | from cleaning.eng_rules import cmu_g2p, hybrid_g2p, rulebased_g2p 2 | from cleaning.cleaners import english_cleaners 3 | import re 4 | 5 | def pronounce_chars(line, raw_line=None, cmu_only=False, int_timing_punct=True): 6 | # cleaners strip things... 7 | puncts = ["!",",",":","?","."] 8 | #puncts_timing = ["4","1","1","4", "4"] 9 | puncts_timing = [" "," "," "," ", " "] 10 | end_punct = [(ni, pi) for ni, pi in enumerate(puncts) if pi in line] 11 | if len(end_punct) > 0: 12 | # preserve the end punctuation... 13 | if end_punct[-1][1] == line[-1]: 14 | end_punct = end_punct[-1] 15 | else: 16 | end_punct = (0, " ") 17 | else: 18 | end_punct = (0, " ") 19 | line = english_cleaners(line) 20 | if cmu_only: 21 | r0 = cmu_g2p(line, raw_line) 22 | return r0 23 | 24 | r = hybrid_g2p(line) 25 | 26 | if any([p in line for p in puncts]): 27 | new = [] 28 | psym = r.strip().split(" ") 29 | lsym = line.strip().split(" ") 30 | for lss, pss in zip(lsym, psym): 31 | prev = [] 32 | for ssi in pss.strip().split("@")[1:]: 33 | which_specials = [p for p in puncts if p in lss] 34 | if any([p in lss for p in puncts]): 35 | prev.append(re.sub(re.escape("|".join(puncts)), "", ssi)) 36 | # ASSUME ONLY 1? 
37 | else: 38 | prev.append(ssi) 39 | if len(which_specials) > 0: 40 | prev.append(which_specials[0]) 41 | new.append(prev) 42 | prev = [] 43 | 44 | merged = "" 45 | for ii, chunk in enumerate(new): 46 | if any([p in chunk for p in puncts]): 47 | mstr = "" 48 | for ci in chunk: 49 | if any([p in ci for p in puncts]): 50 | which_specials = [(n, p) for n, p in enumerate(puncts) if p in ci] 51 | else: 52 | mstr += "@" 53 | mstr += ci 54 | merged += mstr 55 | if ii < (len(new) - 1): 56 | if not int_timing_punct: 57 | merged += which_specials[0][1] 58 | else: 59 | merged += puncts_timing[which_specials[0][0]] 60 | else: 61 | merged += "@" 62 | merged += "@".join(chunk) 63 | if ii < (len(new) - 1): 64 | merged += " " 65 | if merged[-1] == " ": 66 | merged = merged[:-1] 67 | if not int_timing_punct: 68 | merged += end_punct[1] 69 | else: 70 | merged += puncts_timing[end_punct[0]] 71 | merged += "~" 72 | return merged 73 | else: 74 | return r 75 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/.gitignore: -------------------------------------------------------------------------------- 1 | foobar* 2 | pretrained_models 3 | notebooks 4 | wavenet_vocoder/version.py 5 | checkpoints* 6 | log 7 | generated 8 | data 9 | text 10 | 11 | # Created by https://www.gitignore.io 12 | 13 | ### Python ### 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | 70 | ### IPythonNotebook ### 71 | # Temporary data 72 | .ipynb_checkpoints/ 73 | 74 | 75 | ### SublimeText ### 76 | # cache files for sublime text 77 | *.tmlanguage.cache 78 | *.tmPreferences.cache 79 | *.stTheme.cache 80 | 81 | # workspace files are user-specific 82 | *.sublime-workspace 83 | 84 | # project files should be checked into the repository, unless a significant 85 | # proportion of contributors will probably not be using SublimeText 86 | # *.sublime-project 87 | 88 | # sftp configuration file 89 | sftp-config.json 90 | 91 | 92 | ### Emacs ### 93 | # -*- mode: gitignore; -*- 94 | *~ 95 | \#*\# 96 | /.emacs.desktop 97 | /.emacs.desktop.lock 98 | *.elc 99 | auto-save-list 100 | tramp 101 | .\#* 102 | 103 | # Org-mode 104 | .org-id-locations 105 | *_archive 106 | 107 | # flymake-mode 108 | *_flymake.* 109 | 110 | # eshell files 111 | /eshell/history 112 | /eshell/lastdir 113 | 114 | # elpa packages 115 | /elpa/ 116 | 117 | # reftex files 118 | *.rel 119 | 120 | # AUCTeX auto folder 121 | /auto/ 122 | 123 | # cask packages 124 | .cask/ 125 | 126 | 127 | ### Vim ### 128 | [._]*.s[a-w][a-z] 129 | [._]s[a-w][a-z] 130 | *.un~ 131 | Session.vim 132 | .netrwhist 133 | *~ 134 | 135 | 136 | ### C++ ### 137 | # Compiled Object files 138 | *.slo 139 | *.lo 140 | *.o 141 | *.obj 142 | 143 | # Precompiled Headers 144 | *.gch 145 | *.pch 146 | 147 | # Compiled Dynamic libraries 148 | *.so 149 | *.dylib 150 | *.dll 151 | 152 | # Fortran module files 153 | *.mod 154 | 155 | # Compiled Static libraries 156 | *.lai 157 | *.la 158 | *.a 159 | *.lib 160 | 161 | # Executables 162 | *.exe 163 | *.out 164 | *.app 165 | 166 | 167 | ### OSX ### 168 | .DS_Store 169 | .AppleDouble 170 | .LSOverride 171 | 172 | # Icon must end with two \r 173 | Icon 174 | 175 | 176 | # Thumbnails 177 | ._* 178 | 179 | # Files that might appear on external disk 180 | .Spotlight-V100 181 | .Trashes 182 | 183 | # Directories potentially created on remote AFP share 184 | .AppleDB 185 | .AppleDesktop 186 | Network Trash Folder 187 | Temporary Items 188 | .apdisk 189 | 190 | 191 | ### Linux ### 192 | *~ 193 | 194 | # KDE directory preferences 195 | .directory 196 | 197 | # Linux trash folder which might appear on any partition or disk 198 | .Trash-* 199 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | 15 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 16 | executor = ProcessPoolExecutor(max_workers=num_workers) 17 | futures = [] 18 | index = 1 19 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 20 | for line in f: 21 | 
parts = line.strip().split('|') 22 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 23 | text = parts[2] 24 | futures.append(executor.submit( 25 | partial(_process_utterance, out_dir, index, wav_path, text))) 26 | index += 1 27 | return [future.result() for future in tqdm(futures)] 28 | 29 | 30 | def _process_utterance(out_dir, index, wav_path, text): 31 | # Load the audio to a numpy array: 32 | wav = audio.load_wav(wav_path) 33 | 34 | if hparams.rescaling: 35 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 36 | 37 | # Mu-law quantize 38 | if is_mulaw_quantize(hparams.input_type): 39 | # [0, quantize_channels) 40 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 41 | 42 | # Trim silences 43 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 44 | wav = wav[start:end] 45 | out = out[start:end] 46 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 47 | out_dtype = np.int16 48 | elif is_mulaw(hparams.input_type): 49 | # [-1, 1] 50 | out = P.mulaw(wav, hparams.quantize_channels) 51 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 52 | out_dtype = np.float32 53 | else: 54 | # [-1, 1] 55 | out = wav 56 | constant_values = 0.0 57 | out_dtype = np.float32 58 | 59 | # Compute a mel-scale spectrogram from the trimmed wav: 60 | # (N, D) 61 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 62 | # lws pads zeros internally before performing stft 63 | # this is needed to adjust time resolution between audio and mel-spectrogram 64 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 65 | 66 | # zero pad for quantized signal 67 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 68 | N = mel_spectrogram.shape[0] 69 | assert len(out) >= N * audio.get_hop_size() 70 | 71 | # time resolution adjustment 72 | # ensure length of raw audio is multiple of hop_size so that we can use 73 | # transposed convolution to upsample 74 | out = out[:N * audio.get_hop_size()] 75 | assert len(out) % audio.get_hop_size() == 0 76 | 77 | timesteps = len(out) 78 | 79 | # Write the spectrograms to disk: 80 | audio_filename = 'ljspeech-audio-%05d.npy' % index 81 | mel_filename = 'ljspeech-mel-%05d.npy' % index 82 | np.save(os.path.join(out_dir, audio_filename), 83 | out.astype(out_dtype), allow_pickle=False) 84 | np.save(os.path.join(out_dir, mel_filename), 85 | mel_spectrogram.astype(np.float32), allow_pickle=False) 86 | 87 | # Return a tuple describing this training example: 88 | return (audio_filename, mel_filename, timesteps, text) 89 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/jsut.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import jsut 7 | from nnmnkwii.io import hts 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | 15 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 16 | executor = ProcessPoolExecutor(max_workers=num_workers) 17 | futures = [] 18 | 19 | transcriptions = jsut.TranscriptionDataSource( 20 | in_dir, subsets=jsut.available_subsets).collect_files() 21 | wav_paths = jsut.WavFileDataSource( 22 | in_dir, 
subsets=jsut.available_subsets).collect_files() 23 | 24 | for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)): 25 | futures.append(executor.submit( 26 | partial(_process_utterance, out_dir, index + 1, wav_path, text))) 27 | return [future.result() for future in tqdm(futures)] 28 | 29 | 30 | def _process_utterance(out_dir, index, wav_path, text): 31 | # Load the audio to a numpy array: 32 | wav = audio.load_wav(wav_path) 33 | sr = hparams.sample_rate 34 | 35 | if hparams.rescaling: 36 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 37 | 38 | # Trim silence from hts labels if available 39 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 40 | if exists(lab_path): 41 | labels = hts.load(lab_path) 42 | assert "sil" in labels[0][-1] 43 | assert "sil" in labels[-1][-1] 44 | b = int(labels[0][1] * 1e-7 * sr) 45 | e = int(labels[-1][0] * 1e-7 * sr) 46 | wav = wav[b:e] 47 | else: 48 | wav, _ = librosa.effects.trim(wav, top_db=30) 49 | 50 | # Mu-law quantize 51 | if is_mulaw_quantize(hparams.input_type): 52 | # [0, quantize_channels) 53 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 54 | 55 | # Trim silences 56 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 57 | wav = wav[start:end] 58 | out = out[start:end] 59 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 60 | out_dtype = np.int16 61 | elif is_mulaw(hparams.input_type): 62 | # [-1, 1] 63 | out = P.mulaw(wav, hparams.quantize_channels) 64 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 65 | out_dtype = np.float32 66 | else: 67 | # [-1, 1] 68 | out = wav 69 | constant_values = 0.0 70 | out_dtype = np.float32 71 | 72 | # Compute a mel-scale spectrogram from the trimmed wav: 73 | # (N, D) 74 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 75 | # lws pads zeros internally before performing stft 76 | # this is needed to adjust time resolution between audio and mel-spectrogram 77 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 78 | 79 | # zero pad for quantized signal 80 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 81 | N = mel_spectrogram.shape[0] 82 | assert len(out) >= N * audio.get_hop_size() 83 | 84 | # time resolution adjustment 85 | # ensure length of raw audio is multiple of hop_size so that we can use 86 | # transposed convolution to upsample 87 | out = out[:N * audio.get_hop_size()] 88 | assert len(out) % audio.get_hop_size() == 0 89 | 90 | timesteps = len(out) 91 | 92 | # Write the spectrograms to disk: 93 | audio_filename = 'jsut-audio-%05d.npy' % index 94 | mel_filename = 'jsut-mel-%05d.npy' % index 95 | np.save(os.path.join(out_dir, audio_filename), 96 | out.astype(out_dtype), allow_pickle=False) 97 | np.save(os.path.join(out_dir, mel_filename), 98 | mel_spectrogram.astype(np.float32), allow_pickle=False) 99 | 100 | # Return a tuple describing this training example: 101 | return (audio_filename, mel_filename, timesteps, text) 102 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/audio/magrecnp.py: -------------------------------------------------------------------------------- 1 | from tfbldr.datasets.audio import fetch_sample_speech_tapestry 2 | from tfbldr.datasets.audio import soundsc 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import tensorflow as tf 7 | import os 8 | import numpy as np 9 | from scipy.io import wavfile 10 | from 
tfbldr.datasets.audio import linear_to_mel_weight_matrix 11 | from tfbldr.datasets.audio import stft 12 | from tfbldr.datasets.audio import iterate_invert_spectrogram 13 | 14 | 15 | def sonify(spectrogram, samples, transform_op_fn, logscaled=True): 16 | graph = tf.Graph() 17 | with graph.as_default(): 18 | 19 | noise = tf.Variable(tf.random_normal([samples], stddev=1e-6)) 20 | 21 | x = transform_op_fn(noise) 22 | y = spectrogram 23 | 24 | if logscaled: 25 | x = tf.expm1(x) 26 | y = tf.expm1(y) 27 | 28 | x = tf.nn.l2_normalize(x) 29 | y = tf.nn.l2_normalize(y) 30 | tf.losses.mean_squared_error(x, y[-tf.shape(x)[0]:]) 31 | 32 | optimizer = tf.contrib.opt.ScipyOptimizerInterface( 33 | loss=tf.losses.get_total_loss(), 34 | var_list=[noise], 35 | tol=1e-16, 36 | method='L-BFGS-B', 37 | options={ 38 | 'maxiter': 1000, 39 | 'disp': True 40 | }) 41 | 42 | with tf.Session(graph=graph) as session: 43 | session.run(tf.global_variables_initializer()) 44 | optimizer.minimize(session) 45 | waveform = session.run(noise) 46 | 47 | return waveform 48 | 49 | fs, d = fetch_sample_speech_tapestry() 50 | 51 | sample_rate = fs 52 | window_size = 512 53 | step = 128 54 | n_mel = 80 55 | wav_scale = 2 ** 15 56 | waveform = d / float(wav_scale) 57 | 58 | def logmel(waveform): 59 | z = tf.contrib.signal.stft(waveform, window_size, step) 60 | magnitudes = tf.abs(z) 61 | filterbank = tf.contrib.signal.linear_to_mel_weight_matrix( 62 | num_mel_bins=n_mel, 63 | num_spectrogram_bins=magnitudes.shape[-1].value, 64 | sample_rate=sample_rate, 65 | lower_edge_hertz=125., 66 | upper_edge_hertz=7800.) 67 | melspectrogram = tf.tensordot(magnitudes, filterbank, 1) 68 | return tf.log1p(melspectrogram) 69 | 70 | 71 | def logmel2(waveform): 72 | res = np.abs(stft(waveform, windowsize=window_size, step=step, real=False, compute_onesided=True)) 73 | mels = linear_to_mel_weight_matrix( 74 | res.shape[1], 75 | sample_rate, 76 | lower_edge_hertz=125., 77 | upper_edge_hertz=7800., 78 | n_filts=n_mel, dtype=np.float64) 79 | mel_res = np.dot(res, mels) 80 | return np.log1p(mel_res) 81 | 82 | with tf.Session(): 83 | spectrogram = logmel(waveform).eval() 84 | 85 | spectrogram2 = logmel2(waveform) 86 | spectrogram = (spectrogram - spectrogram.min()) / float(spectrogram.max() - spectrogram.min()) 87 | spectrogram2 = (spectrogram2 - spectrogram2.min()) / float(spectrogram2.max() - spectrogram2.min()) 88 | 89 | f, axarr = plt.subplots(1, 2) 90 | axarr[0].imshow(spectrogram) 91 | axarr[1].imshow(spectrogram2) 92 | plt.savefig("tmpspec") 93 | 94 | reconstructed_waveform = sonify(spectrogram, len(waveform), logmel) 95 | wavfile.write("tmp.wav", sample_rate, soundsc(reconstructed_waveform)) 96 | reconstructed_waveform2 = sonify(spectrogram2, len(waveform), logmel) 97 | wavfile.write("tmp2.wav", sample_rate, soundsc(reconstructed_waveform2)) 98 | 99 | 100 | fftsize = 512 101 | substep = 32 102 | rw_s = np.abs(stft(reconstructed_waveform, fftsize=fftsize, step=substep, real=False, 103 | compute_onesided=False)) 104 | rw = iterate_invert_spectrogram(rw_s, fftsize, substep, n_iter=100, verbose=True) 105 | 106 | rw2_s = np.abs(stft(reconstructed_waveform2, fftsize=fftsize, step=substep, real=False, 107 | compute_onesided=False)) 108 | rw2 = iterate_invert_spectrogram(rw2_s, fftsize, substep, n_iter=100, verbose=True) 109 | 110 | d_s = np.abs(stft(waveform, fftsize=fftsize, step=substep, real=False, 111 | compute_onesided=False)) 112 | df = iterate_invert_spectrogram(d_s, fftsize, substep, n_iter=10, verbose=True) 113 | wavfile.write("tmpif.wav", 
sample_rate, soundsc(df)) 114 | wavfile.write("tmpf.wav", sample_rate, soundsc(rw)) 115 | wavfile.write("tmpf2.wav", sample_rate, soundsc(rw2)) 116 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | from scipy import signal 6 | from hparams import hparams 7 | from scipy.io import wavfile 8 | 9 | import lws 10 | 11 | 12 | def load_wav(path): 13 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 14 | 15 | 16 | def save_wav(wav, path): 17 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 18 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 19 | 20 | 21 | def trim(quantized): 22 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 23 | return quantized[start:end] 24 | 25 | 26 | def adjust_time_resolution(quantized, mel): 27 | """Adjust time resolution by repeating features 28 | 29 | Args: 30 | quantized (ndarray): (T,) 31 | mel (ndarray): (N, D) 32 | 33 | Returns: 34 | tuple: Tuple of (T,) and (T, D) 35 | """ 36 | assert len(quantized.shape) == 1 37 | assert len(mel.shape) == 2 38 | 39 | upsample_factor = quantized.size // mel.shape[0] 40 | mel = np.repeat(mel, upsample_factor, axis=0) 41 | n_pad = quantized.size - mel.shape[0] 42 | if n_pad != 0: 43 | assert n_pad > 0 44 | mel = np.pad(mel, [(0, n_pad), (0, 0)], mode="constant", constant_values=0) 45 | 46 | # trim 47 | start, end = start_and_end_indices(quantized, hparams.silence_threshold) 48 | 49 | return quantized[start:end], mel[start:end, :] 50 | adjast_time_resolution = adjust_time_resolution # 'adjust' is correct spelling, this is for compatibility 51 | 52 | 53 | def start_and_end_indices(quantized, silence_threshold=2): 54 | for start in range(quantized.size): 55 | if abs(quantized[start] - 127) > silence_threshold: 56 | break 57 | for end in range(quantized.size - 1, 1, -1): 58 | if abs(quantized[end] - 127) > silence_threshold: 59 | break 60 | 61 | assert abs(quantized[start] - 127) > silence_threshold 62 | assert abs(quantized[end] - 127) > silence_threshold 63 | 64 | return start, end 65 | 66 | 67 | def melspectrogram(y): 68 | D = _lws_processor().stft(y).T 69 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 70 | if not hparams.allow_clipping_in_normalization: 71 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 72 | return _normalize(S) 73 | 74 | 75 | def get_hop_size(): 76 | hop_size = hparams.hop_size 77 | if hop_size is None: 78 | assert hparams.frame_shift_ms is not None 79 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 80 | return hop_size 81 | 82 | 83 | def _lws_processor(): 84 | return lws.lws(hparams.fft_size, get_hop_size(), mode="speech") 85 | 86 | 87 | def lws_num_frames(length, fsize, fshift): 88 | """Compute number of time frames of lws spectrogram 89 | """ 90 | pad = (fsize - fshift) 91 | if length % fshift == 0: 92 | M = (length + pad * 2 - fsize) // fshift + 1 93 | else: 94 | M = (length + pad * 2 - fsize) // fshift + 2 95 | return M 96 | 97 | 98 | def lws_pad_lr(x, fsize, fshift): 99 | """Compute left and right padding lws internally uses 100 | """ 101 | M = lws_num_frames(len(x), fsize, fshift) 102 | pad = (fsize - fshift) 103 | T = len(x) + 2 * pad 104 | r = (M - 1) * fshift + fsize - T 105 | return pad, pad + r 106 | 107 | # 
Conversions: 108 | 109 | 110 | _mel_basis = None 111 | 112 | 113 | def _linear_to_mel(spectrogram): 114 | global _mel_basis 115 | if _mel_basis is None: 116 | _mel_basis = _build_mel_basis() 117 | return np.dot(_mel_basis, spectrogram) 118 | 119 | 120 | def _build_mel_basis(): 121 | assert hparams.fmax <= hparams.sample_rate // 2 122 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, 123 | fmin=hparams.fmin, fmax=hparams.fmax, 124 | n_mels=hparams.num_mels) 125 | 126 | 127 | def _amp_to_db(x): 128 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 129 | return 20 * np.log10(np.maximum(min_level, x)) 130 | 131 | 132 | def _db_to_amp(x): 133 | return np.power(10.0, x * 0.05) 134 | 135 | 136 | def _normalize(S): 137 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 138 | 139 | 140 | def _denormalize(S): 141 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 142 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/cmu_arctic.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | from nnmnkwii.datasets import cmu_arctic 7 | from nnmnkwii.io import hts 8 | from nnmnkwii import preprocessing as P 9 | from hparams import hparams 10 | from os.path import exists 11 | import librosa 12 | 13 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 14 | 15 | 16 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 17 | executor = ProcessPoolExecutor(max_workers=num_workers) 18 | futures = [] 19 | 20 | speakers = cmu_arctic.available_speakers 21 | 22 | wd = cmu_arctic.WavFileDataSource(in_dir, speakers=speakers) 23 | wav_paths = wd.collect_files() 24 | speaker_ids = wd.labels 25 | 26 | for index, (speaker_id, wav_path) in enumerate( 27 | zip(speaker_ids, wav_paths)): 28 | futures.append(executor.submit( 29 | partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, "N/A"))) 30 | return [future.result() for future in tqdm(futures)] 31 | 32 | 33 | def start_at(labels): 34 | has_silence = labels[0][-1] == "pau" 35 | if not has_silence: 36 | return labels[0][0] 37 | for i in range(1, len(labels)): 38 | if labels[i][-1] != "pau": 39 | return labels[i][0] 40 | assert False 41 | 42 | 43 | def end_at(labels): 44 | has_silence = labels[-1][-1] == "pau" 45 | if not has_silence: 46 | return labels[-1][1] 47 | for i in range(len(labels) - 2, 0, -1): 48 | if labels[i][-1] != "pau": 49 | return labels[i][1] 50 | assert False 51 | 52 | 53 | def _process_utterance(out_dir, index, speaker_id, wav_path, text): 54 | sr = hparams.sample_rate 55 | 56 | # Load the audio to a numpy array. 
Resampled if needed 57 | wav = audio.load_wav(wav_path) 58 | 59 | lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab") 60 | 61 | # Trim silence from hts labels if available 62 | # TODO 63 | if exists(lab_path) and False: 64 | labels = hts.load(lab_path) 65 | b = int(start_at(labels) * 1e-7 * sr) 66 | e = int(end_at(labels) * 1e-7 * sr) 67 | wav = wav[b:e] 68 | wav, _ = librosa.effects.trim(wav, top_db=20) 69 | else: 70 | wav, _ = librosa.effects.trim(wav, top_db=20) 71 | 72 | if hparams.rescaling: 73 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 74 | 75 | # Mu-law quantize 76 | if is_mulaw_quantize(hparams.input_type): 77 | # [0, quantize_channels) 78 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 79 | 80 | # Trim silences 81 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 82 | wav = wav[start:end] 83 | out = out[start:end] 84 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 85 | out_dtype = np.int16 86 | elif is_mulaw(hparams.input_type): 87 | # [-1, 1] 88 | out = P.mulaw(wav, hparams.quantize_channels) 89 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 90 | out_dtype = np.float32 91 | else: 92 | # [-1, 1] 93 | out = wav 94 | constant_values = 0.0 95 | out_dtype = np.float32 96 | 97 | # Compute a mel-scale spectrogram from the trimmed wav: 98 | # (N, D) 99 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 100 | # lws pads zeros internally before performing stft 101 | # this is needed to adjust time resolution between audio and mel-spectrogram 102 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 103 | 104 | # zero pad for quantized signal 105 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 106 | N = mel_spectrogram.shape[0] 107 | assert len(out) >= N * audio.get_hop_size() 108 | 109 | # time resolution adjustment 110 | # ensure length of raw audio is multiple of hop_size so that we can use 111 | # transposed convolution to upsample 112 | out = out[:N * audio.get_hop_size()] 113 | assert len(out) % audio.get_hop_size() == 0 114 | 115 | timesteps = len(out) 116 | 117 | # Write the spectrograms to disk: 118 | audio_filename = 'cmu_arctic-audio-%05d.npy' % index 119 | mel_filename = 'cmu_arctic-mel-%05d.npy' % index 120 | np.save(os.path.join(out_dir, audio_filename), 121 | out.astype(out_dtype), allow_pickle=False) 122 | np.save(os.path.join(out_dir, mel_filename), 123 | mel_spectrogram.astype(np.float32), allow_pickle=False) 124 | 125 | # Return a tuple describing this training example: 126 | return (audio_filename, mel_filename, timesteps, text, speaker_id) 127 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | # NOTE: If you want full control for model architecture. please take a look 5 | # at the code and change whatever you want. Some hyper parameters are hardcoded. 6 | 7 | # Default hyperparameters: 8 | hparams = tf.contrib.training.HParams( 9 | name="wavenet_vocoder", 10 | 11 | # Convenient model builder 12 | builder="wavenet", 13 | 14 | # Input type: 15 | # 1. raw [-1, 1] 16 | # 2. mulaw [-1, 1] 17 | # 3. 
mulaw-quantize [0, mu] 18 | # If input_type is raw or mulaw, network assumes scalar input and 19 | # discretized mixture of logistic distributions output, otherwise one-hot 20 | # input and softmax output are assumed. 21 | # **NOTE**: if you change one of the two parameters below, you need to 22 | # re-run preprocessing before training. 23 | input_type="raw", 24 | quantize_channels=65536, # 65536 or 256 25 | 26 | # Audio: 27 | sample_rate=22050, 28 | # this is only valid when mulaw is True 29 | silence_threshold=2, 30 | num_mels=80, 31 | fmin=125, 32 | fmax=7600, 33 | fft_size=1024, 34 | # shift can be specified by either hop_size or frame_shift_ms 35 | hop_size=256, 36 | frame_shift_ms=None, 37 | min_level_db=-100, 38 | ref_level_db=20, 39 | # whether to rescale waveform or not. 40 | # Let x be an input waveform; the rescaled waveform y is given by: 41 | # y = x / np.abs(x).max() * rescaling_max 42 | rescaling=True, 43 | rescaling_max=0.999, 44 | # mel-spectrogram is normalized to [0, 1] for each utterance and clipping may 45 | # happen depending on min_level_db and ref_level_db, causing clipping noise. 46 | # If False, an assertion is added to ensure no clipping happens. 47 | allow_clipping_in_normalization=True, 48 | 49 | # Mixture of logistic distributions: 50 | log_scale_min=float(np.log(1e-14)), 51 | 52 | # Model: 53 | # This should be equal to `quantize_channels` if mu-law quantize is enabled, 54 | # otherwise num_mixture * 3 (pi, mean, log_scale) 55 | out_channels=10 * 3, 56 | layers=24, 57 | stacks=4, 58 | residual_channels=512, 59 | gate_channels=512, # split into 2 groups internally for gated activation 60 | skip_out_channels=256, 61 | dropout=1 - 0.95, 62 | kernel_size=3, 63 | # If True, apply weight normalization the same as in DeepVoice3 64 | weight_normalization=True, 65 | # Use legacy code or not. Default is True since we already provided a model 66 | # based on the legacy code that can generate high-quality audio.
67 | # Ref: https://github.com/r9y9/wavenet_vocoder/pull/73 68 | legacy=True, 69 | 70 | # Local conditioning (set negative value to disable)) 71 | cin_channels=80, 72 | # If True, use transposed convolutions to upsample conditional features, 73 | # otherwise repeat features to adjust time resolution 74 | upsample_conditional_features=True, 75 | # should np.prod(upsample_scales) == hop_size 76 | upsample_scales=[4, 4, 4, 4], 77 | # Freq axis kernel size for upsampling network 78 | freq_axis_kernel_size=3, 79 | 80 | # Global conditioning (set negative value to disable) 81 | # currently limited for speaker embedding 82 | # this should only be enabled for multi-speaker dataset 83 | gin_channels=-1, # i.e., speaker embedding dim 84 | n_speakers=7, # 7 for CMU ARCTIC 85 | 86 | # Data loader 87 | pin_memory=True, 88 | num_workers=2, 89 | 90 | # train/test 91 | # test size can be specified as portion or num samples 92 | test_size=0.0441, # 50 for CMU ARCTIC single speaker 93 | test_num_samples=None, 94 | random_state=1234, 95 | 96 | # Loss 97 | 98 | # Training: 99 | batch_size=2, 100 | adam_beta1=0.9, 101 | adam_beta2=0.999, 102 | adam_eps=1e-8, 103 | amsgrad=False, 104 | initial_learning_rate=1e-3, 105 | # see lrschedule.py for available lr_schedule 106 | lr_schedule="noam_learning_rate_decay", 107 | lr_schedule_kwargs={}, # {"anneal_rate": 0.5, "anneal_interval": 50000}, 108 | nepochs=2000, 109 | weight_decay=0.0, 110 | clip_thresh=-1, 111 | # max time steps can either be specified as sec or steps 112 | # if both are None, then full audio samples are used in a batch 113 | max_time_sec=None, 114 | max_time_steps=8000, 115 | # Hold moving averaged parameters and use them for evaluation 116 | exponential_moving_average=True, 117 | # averaged = decay * averaged + (1 - decay) * x 118 | ema_decay=0.9999, 119 | 120 | # Save 121 | # per-step intervals 122 | checkpoint_interval=10000, 123 | train_eval_interval=10000, 124 | # per-epoch interval 125 | test_eval_epoch_interval=5, 126 | save_optimizer_state=True, 127 | 128 | # Eval: 129 | ) 130 | 131 | 132 | def hparams_debug_string(): 133 | values = hparams.values() 134 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 135 | return 'Hyperparameters:\n' + '\n'.join(hp) 136 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/librivox.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import audio 6 | 7 | from nnmnkwii import preprocessing as P 8 | from hparams import hparams 9 | from os.path import exists 10 | import librosa 11 | 12 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 13 | 14 | 15 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 16 | executor = ProcessPoolExecutor(max_workers=num_workers) 17 | futures = [] 18 | index = 1 19 | 20 | # with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 21 | # for line in f: 22 | # parts = line.strip().split('|') 23 | # wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 24 | # text = parts[2] 25 | # futures.append(executor.submit( 26 | # partial(_process_utterance, out_dir, index, wav_path, text))) 27 | # index += 1 28 | 29 | valid_ext = '.ogg .wav .mp3'.split() 30 | for f in sorted(os.listdir(in_dir)): 31 | valid = sum([f.endswith(ext) for ext in valid_ext]) 
32 | if valid < 1: 33 | continue 34 | 35 | audio_filepath = os.path.join(in_dir, f) 36 | text = audio_filepath # Not very informative 37 | futures.append(executor.submit( 38 | partial(_process_utterance, out_dir, index, audio_filepath, text))) 39 | index += 1 40 | return [tup for future in tqdm(futures) for tup in future.result()] 41 | 42 | 43 | def _process_utterance(out_dir, index, audio_filepath, text): 44 | # Load the audio to a numpy array: 45 | wav_whole = audio.load_wav(audio_filepath) 46 | 47 | if hparams.rescaling: 48 | wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max 49 | 50 | # This is a librivox source, so the audio files are going to be v. long 51 | # compared to a typical 'utterance' : So split the wav into chunks 52 | 53 | tup_results = [] 54 | 55 | n_samples = int(8.0 * hparams.sample_rate) # All 8 second utterances 56 | n_chunks = wav_whole.shape[0] // n_samples 57 | 58 | for chunk_idx in range(n_chunks): 59 | chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples 60 | if chunk_idx == n_chunks - 1: # This is the last chunk - allow it to extend to the end of the file 61 | chunk_end = None 62 | wav = wav_whole[chunk_start: chunk_end] 63 | 64 | # Mu-law quantize 65 | if is_mulaw_quantize(hparams.input_type): 66 | # [0, quantize_channels) 67 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 68 | 69 | # Trim silences 70 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 71 | wav = wav[start:end] 72 | out = out[start:end] 73 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 74 | out_dtype = np.int16 75 | elif is_mulaw(hparams.input_type): 76 | # [-1, 1] 77 | out = P.mulaw(wav, hparams.quantize_channels) 78 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 79 | out_dtype = np.float32 80 | else: 81 | # [-1, 1] 82 | out = wav 83 | constant_values = 0.0 84 | out_dtype = np.float32 85 | 86 | # Compute a mel-scale spectrogram from the trimmed wav: 87 | # (N, D) 88 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 89 | # lws pads zeros internally before performing stft 90 | # this is needed to adjust time resolution between audio and mel-spectrogram 91 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 92 | 93 | # zero pad for quantized signal 94 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 95 | N = mel_spectrogram.shape[0] 96 | assert len(out) >= N * audio.get_hop_size() 97 | 98 | # time resolution adjustment 99 | # ensure length of raw audio is multiple of hop_size so that we can use 100 | # transposed convolution to upsample 101 | out = out[:N * audio.get_hop_size()] 102 | assert len(out) % audio.get_hop_size() == 0 103 | 104 | timesteps = len(out) 105 | 106 | # Write the spectrograms to disk: 107 | audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx,) 108 | mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx,) 109 | text_idx = '%s - %05d' % (text, chunk_idx,) 110 | np.save(os.path.join(out_dir, audio_filename), 111 | out.astype(out_dtype), allow_pickle=False) 112 | np.save(os.path.join(out_dir, mel_filename), 113 | mel_spectrogram.astype(np.float32), allow_pickle=False) 114 | 115 | # Add results tuple describing this training example: 116 | tup_results.append((audio_filename, mel_filename, timesteps, text_idx)) 117 | 118 | # Return all the audio results tuples (unpack in caller) 119 | return tup_results 120 | 
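Both _process_utterance implementations above (cmu_arctic.py and librivox.py) follow the same alignment contract: the signal is padded the way lws pads internally, then trimmed to exactly N * hop_size samples so the N mel frames can be upsampled back to audio rate by transposed convolution. Below is a minimal numpy-only sketch of that arithmetic, reusing the lws_pad_lr / lws_num_frames formulas from audio.py; the sizes are illustrative stand-ins, not values read from hparams:

import numpy as np

def pad_lr(length, fsize, fshift):
    # same padding rule as lws_num_frames / lws_pad_lr in audio.py above
    pad = fsize - fshift
    if length % fshift == 0:
        n_frames = (length + 2 * pad - fsize) // fshift + 1
    else:
        n_frames = (length + 2 * pad - fsize) // fshift + 2
    extra = (n_frames - 1) * fshift + fsize - (length + 2 * pad)
    return pad, pad + extra, n_frames

fft_size, hop_size = 1024, 256             # illustrative values
wav = np.zeros(10000, dtype=np.float32)    # stand-in for a loaded waveform
l, r, n_frames = pad_lr(len(wav), fft_size, hop_size)
out = np.pad(wav, (l, r), mode="constant")
assert len(out) >= n_frames * hop_size
out = out[:n_frames * hop_size]            # same trim as in _process_utterance
assert len(out) % hop_size == 0            # mels can now be upsampled by hop_size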
-------------------------------------------------------------------------------- /pretrained/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from numbers_rules import normalize_numbers 18 | 19 | 20 | _whitespace_re = re.compile(r'\s+') 21 | _apos_s_re = re.compile(r"'s") 22 | _single_re = re.compile(r'["]') 23 | _double_re = re.compile(r"[']") 24 | _semicolon_re = re.compile(r';') 25 | _paren_re = re.compile(r'[()]') 26 | _bracket_re = re.compile(r'[\[\]]') 27 | _dash_re = re.compile(r'--') 28 | _comma_re = re.compile(r' , ') 29 | _colon_re = re.compile(r':') 30 | _period_re = re.compile(r'\.$') 31 | _abbrev_re = re.compile(r'\.') 32 | _US_re = re.compile(r' US') 33 | _UK_re = re.compile(r' UK') 34 | _FBI_re = re.compile(r' FBI') 35 | _CIA_re = re.compile(r' CIA') 36 | _NSA_re = re.compile(r' NSA') 37 | _USA_re = re.compile(r' USA') 38 | _USSR_re = re.compile(r' USSR') 39 | 40 | # handle 22 -> 22nd??? 41 | 42 | # List of (regular expression, replacement) pairs for abbreviations: 43 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 44 | ('mrs', 'misess'), 45 | ('mr', 'mister'), 46 | ('dr', 'doctor'), 47 | ('st', 'saint'), 48 | ('co', 'company'), 49 | ('jr', 'junior'), 50 | ('maj', 'major'), 51 | ('gen', 'general'), 52 | ('drs', 'doctors'), 53 | ('rev', 'reverend'), 54 | ('lt', 'lieutenant'), 55 | ('hon', 'honorable'), 56 | ('sgt', 'sergeant'), 57 | ('capt', 'captain'), 58 | ('esq', 'esquire'), 59 | ('ltd', 'limited'), 60 | ('col', 'colonel'), 61 | ('ft', 'fort'), 62 | ]] 63 | 64 | 65 | def expand_abbreviations(text): 66 | for regex, replacement in _abbreviations: 67 | text = re.sub(regex, replacement, text) 68 | return text 69 | 70 | 71 | def expand_numbers(text): 72 | return normalize_numbers(text) 73 | 74 | 75 | def lowercase(text): 76 | text = re.sub(_USSR_re, ' U S S R', text) 77 | text = re.sub(_USA_re, ' U S A', text) 78 | text = re.sub(_US_re, ' U S', text) 79 | text = re.sub(_UK_re, ' U K', text) 80 | text = re.sub(_FBI_re, ' F B I', text) 81 | text = re.sub(_CIA_re, ' C I A', text) 82 | return text.lower() 83 | 84 | 85 | def collapse_whitespace(text): 86 | return re.sub(_whitespace_re, ' ', text) 87 | 88 | 89 | def convert_to_ascii(text): 90 | unicode_content = text.decode('utf-8') 91 | return unidecode(unicode_content) 92 | 93 | 94 | def collapse_spurious(text): 95 | text = re.sub(_apos_s_re, "-s", text) 96 | text = re.sub(_single_re, "", text) 97 | text = re.sub(_double_re, "", text) 98 | text = re.sub(_paren_re, "", text) 99 | text = re.sub(_semicolon_re, ",", text) 100 | text = re.sub(_dash_re, ",", text) 101 | text = re.sub(_colon_re, ", ", text) 102 | text = re.sub(_period_re, "", text) 103 | text = re.sub(_bracket_re, "", text) 104 | text = re.sub(_abbrev_re, " ", text) 105 | text = re.sub(_comma_re, ", ", text) 106 | text = re.sub(_comma_re, ", ", text) 107 | return text 108 | 109 | 110 | def basic_cleaners(text): 111 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 112 | text = lowercase(text) 113 | text = collapse_whitespace(text) 114 | return text 115 | 116 | 117 | def transliteration_cleaners(text): 118 | '''Pipeline for non-English text that transliterates to ASCII.''' 119 | text = convert_to_ascii(text) 120 | text = lowercase(text) 121 | text = collapse_whitespace(text) 122 | return text 123 | 124 | 125 | def rulebased_g2p_cleaners(text): 126 | text = convert_to_ascii(text) 127 | from eng_rules import rulebased_g2p 128 | r = rulebased_g2p(text) 129 | text = "^".join(["&".join(ri[1]).lower() for ri in r]) 130 | text = lowercase(text) 131 | return text 132 | 133 | 134 | def english_cleaners(text): 135 | '''Pipeline for English text, including number and abbreviation expansion.''' 136 | text = convert_to_ascii(text) 137 | text = lowercase(text) 138 | text = expand_numbers(text) 139 | text = expand_abbreviations(text) 140 | text = collapse_spurious(text) 141 | text = collapse_whitespace(text) 142 | return text 143 | 144 | 145 | def english_minimal_cleaners(text): 146 | '''Pipeline for English text, including number and abbreviation expansion.''' 147 | text = convert_to_ascii(text) 148 | text = lowercase(text) 149 | text = expand_numbers(text) 150 | text = collapse_whitespace(text) 151 | return text 152 | 153 | 154 | def english_phone_cleaners(text): 155 | '''Pipeline for English phones.''' 156 | return text 157 | 158 | def english_phone_pause_cleaners(text): 159 | '''Pipeline for English phones.''' 160 | return text 161 | -------------------------------------------------------------------------------- 
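A minimal usage sketch for the module above, assuming Python 2 (convert_to_ascii calls str.decode) and that it is run from the pretrained/ directory so numbers_rules and unidecode resolve; the exact expansions depend on those helpers:

import cleaners

# abbreviations such as "Dr." become "doctor", numbers are spelled out,
# and the result is ASCII, lowercased, with whitespace collapsed
print(cleaners.english_cleaners("Dr. Smith was born in 1984."))

# phone strings are passed through untouched by the phone cleaners
print(cleaners.english_phone_cleaners("@hh@ah@l@ow"))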
/code/lib/tfbldr/datasets/text/cleaning/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from .numbers import normalize_numbers 18 | from .eng_rules import rulebased_g2p 19 | 20 | 21 | _whitespace_re = re.compile(r'\s+') 22 | _apos_s_re = re.compile(r"'s") 23 | _single_re = re.compile(r'["]') 24 | _double_re = re.compile(r"[']") 25 | _semicolon_re = re.compile(r';') 26 | _paren_re = re.compile(r'[()]') 27 | _bracket_re = re.compile(r'[\[\]]') 28 | _dash_re = re.compile(r'--') 29 | _comma_re = re.compile(r' , ') 30 | _colon_re = re.compile(r':') 31 | _period_re = re.compile(r'\.$') 32 | _abbrev_re = re.compile(r'\.') 33 | _US_re = re.compile(r' US') 34 | _UK_re = re.compile(r' UK') 35 | _FBI_re = re.compile(r' FBI') 36 | _CIA_re = re.compile(r' CIA') 37 | _NSA_re = re.compile(r' NSA') 38 | _USA_re = re.compile(r' USA') 39 | _USSR_re = re.compile(r' USSR') 40 | 41 | # handle 22 -> 22nd??? 42 | 43 | # List of (regular expression, replacement) pairs for abbreviations: 44 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 45 | ('mrs', 'misess'), 46 | ('mr', 'mister'), 47 | ('dr', 'doctor'), 48 | ('st', 'saint'), 49 | ('co', 'company'), 50 | ('jr', 'junior'), 51 | ('maj', 'major'), 52 | ('gen', 'general'), 53 | ('drs', 'doctors'), 54 | ('rev', 'reverend'), 55 | ('lt', 'lieutenant'), 56 | ('hon', 'honorable'), 57 | ('sgt', 'sergeant'), 58 | ('capt', 'captain'), 59 | ('esq', 'esquire'), 60 | ('ltd', 'limited'), 61 | ('col', 'colonel'), 62 | ('ft', 'fort'), 63 | ]] 64 | 65 | 66 | def expand_abbreviations(text): 67 | for regex, replacement in _abbreviations: 68 | text = re.sub(regex, replacement, text) 69 | return text 70 | 71 | 72 | def expand_numbers(text): 73 | return normalize_numbers(text) 74 | 75 | 76 | def lowercase(text): 77 | text = re.sub(_USSR_re, ' U S S R', text) 78 | text = re.sub(_USA_re, ' U S A', text) 79 | text = re.sub(_US_re, ' U S', text) 80 | text = re.sub(_UK_re, ' U K', text) 81 | text = re.sub(_FBI_re, ' F B I', text) 82 | text = re.sub(_CIA_re, ' C I A', text) 83 | return text.lower() 84 | 85 | 86 | def collapse_whitespace(text): 87 | return re.sub(_whitespace_re, ' ', text) 88 | 89 | 90 | def convert_to_ascii(text): 91 | unicode_content = text.decode('utf-8') 92 | return unidecode(unicode_content) 93 | 94 | 95 | def collapse_spurious(text): 96 | text = re.sub(_apos_s_re, "-s", text) 97 | text = re.sub(_single_re, "", text) 98 | text = re.sub(_double_re, "", text) 99 | text = re.sub(_paren_re, "", text) 100 | text = re.sub(_semicolon_re, ",", text) 101 | text = re.sub(_dash_re, ",", text) 102 | text = re.sub(_colon_re, ", ", text) 103 | text = re.sub(_period_re, "", text) 104 | text = re.sub(_bracket_re, "", text) 105 | text = re.sub(_abbrev_re, " ", text) 106 | text = re.sub(_comma_re, ", ", text) 107 | text = re.sub(_comma_re, ", ", text) 108 | return text 109 | 110 | 111 | def basic_cleaners(text): 112 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 113 | text = lowercase(text) 114 | text = collapse_whitespace(text) 115 | return text 116 | 117 | 118 | def transliteration_cleaners(text): 119 | '''Pipeline for non-English text that transliterates to ASCII.''' 120 | text = convert_to_ascii(text) 121 | text = lowercase(text) 122 | text = collapse_whitespace(text) 123 | return text 124 | 125 | 126 | def rulebased_g2p_cleaners(text): 127 | text = convert_to_ascii(text) 128 | r = rulebased_g2p(text) 129 | text = "^".join(["&".join(ri[1]).lower() for ri in r]) 130 | text = lowercase(text) 131 | return text 132 | 133 | 134 | def english_cleaners(text): 135 | '''Pipeline for English text, including number and abbreviation expansion.''' 136 | text = convert_to_ascii(text) 137 | text = lowercase(text) 138 | text = expand_numbers(text) 139 | text = expand_abbreviations(text) 140 | text = collapse_spurious(text) 141 | text = collapse_whitespace(text) 142 | return text 143 | 144 | 145 | def english_minimal_cleaners(text): 146 | '''Pipeline for English text, including number and abbreviation expansion.''' 147 | text = convert_to_ascii(text) 148 | text = lowercase(text) 149 | text = expand_numbers(text) 150 | text = collapse_whitespace(text) 151 | return text 152 | 153 | 154 | def english_phone_cleaners(text): 155 | '''Pipeline for English phones.''' 156 | return text 157 | 158 | def english_phone_pause_cleaners(text): 159 | '''Pipeline for English phones.''' 160 | return text 161 | -------------------------------------------------------------------------------- 
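This copy of the cleaners differs from the pretrained/ one above only in its imports (the package-relative numbers module and a module-level eng_rules import). The format produced by rulebased_g2p_cleaners is worth spelling out: rulebased_g2p appears to return one entry per word whose second element is the phone list, and the cleaner joins phones within a word with "&" and words with "^". A standalone sketch of that joining step, using a made-up phone sequence in place of real eng_rules output:

# hypothetical rulebased_g2p output for the text "hello world"
r = [("hello", ["HH", "AH", "L", "OW"]), ("world", ["W", "ER", "L", "D"])]
text = "^".join(["&".join(ri[1]).lower() for ri in r])
print(text)  # hh&ah&l&ow^w&er&l&d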
/code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/mixture.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code is adapted from: 3 | # https://github.com/pclucas14/pixel-cnn-pp 4 | # https://github.com/openai/pixel-cnn 5 | 6 | from __future__ import with_statement, print_function, absolute_import 7 | 8 | import math 9 | import numpy as np 10 | 11 | import torch 12 | from torch import nn 13 | from torch.nn import functional as F 14 | 15 | 16 | def log_sum_exp(x): 17 | """ numerically stable log_sum_exp implementation that prevents overflow """ 18 | # TF ordering 19 | axis = len(x.size()) - 1 20 | m, _ = torch.max(x, dim=axis) 21 | m2, _ = torch.max(x, dim=axis, keepdim=True) 22 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 23 | 24 | 25 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, 26 | log_scale_min=-7.0, reduce=True): 27 | """Discretized mixture of logistic distributions loss 28 | 29 | Note that it is assumed that input is scaled to [-1, 1]. 30 | 31 | Args: 32 | y_hat (Tensor): Predicted output (B x C x T) 33 | y (Tensor): Target (B x T x 1). 34 | num_classes (int): Number of classes 35 | log_scale_min (float): Log scale minimum value 36 | reduce (bool): If True, the losses are averaged or summed for each 37 | minibatch. 38 | 39 | Returns 40 | Tensor: loss 41 | """ 42 | assert y_hat.dim() == 3 43 | assert y_hat.size(1) % 3 == 0 44 | nr_mix = y_hat.size(1) // 3 45 | 46 | # (B x T x C) 47 | y_hat = y_hat.transpose(1, 2) 48 | 49 | # unpack parameters. (B, T, num_mixtures) x 3 50 | logit_probs = y_hat[:, :, :nr_mix] 51 | means = y_hat[:, :, nr_mix:2 * nr_mix] 52 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 53 | 54 | # B x T x 1 -> B x T x num_mixtures 55 | y = y.expand_as(means) 56 | 57 | centered_y = y - means 58 | inv_stdv = torch.exp(-log_scales) 59 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 60 | cdf_plus = F.sigmoid(plus_in) 61 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 62 | cdf_min = F.sigmoid(min_in) 63 | 64 | # log probability for edge case of 0 (before scaling) 65 | # equivalent: torch.log(F.sigmoid(plus_in)) 66 | log_cdf_plus = plus_in - F.softplus(plus_in) 67 | 68 | # log probability for edge case of 255 (before scaling) 69 | # equivalent: (1 - F.sigmoid(min_in)).log() 70 | log_one_minus_cdf_min = -F.softplus(min_in) 71 | 72 | # probability for all other cases 73 | cdf_delta = cdf_plus - cdf_min 74 | 75 | mid_in = inv_stdv * centered_y 76 | # log probability in the center of the bin, to be used in extreme cases 77 | # (not actually used in our code) 78 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 79 | 80 | # tf equivalent 81 | """ 82 | log_probs = tf.where(x < -0.999, log_cdf_plus, 83 | tf.where(x > 0.999, log_one_minus_cdf_min, 84 | tf.where(cdf_delta > 1e-5, 85 | tf.log(tf.maximum(cdf_delta, 1e-12)), 86 | log_pdf_mid - np.log(127.5)))) 87 | """ 88 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 89 | # for num_classes=65536 case? 1e-7? not sure.. 90 | inner_inner_cond = (cdf_delta > 1e-5).float() 91 | 92 | inner_inner_out = inner_inner_cond * \ 93 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 94 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 95 | inner_cond = (y > 0.999).float() 96 | inner_out = inner_cond * log_one_minus_cdf_min + (1. 
- inner_cond) * inner_inner_out 97 | cond = (y < -0.999).float() 98 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 99 | 100 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 101 | 102 | if reduce: 103 | return -torch.sum(log_sum_exp(log_probs)) 104 | else: 105 | return -log_sum_exp(log_probs).unsqueeze(-1) 106 | 107 | 108 | def to_one_hot(tensor, n, fill_with=1.): 109 | # we perform one hot encore with respect to the last axis 110 | one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() 111 | if tensor.is_cuda: 112 | one_hot = one_hot.cuda() 113 | one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) 114 | return one_hot 115 | 116 | 117 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0): 118 | """ 119 | Sample from discretized mixture of logistic distributions 120 | 121 | Args: 122 | y (Tensor): B x C x T 123 | log_scale_min (float): Log scale minimum value 124 | 125 | Returns: 126 | Tensor: sample in range of [-1, 1]. 127 | """ 128 | assert y.size(1) % 3 == 0 129 | nr_mix = y.size(1) // 3 130 | 131 | # B x T x C 132 | y = y.transpose(1, 2) 133 | logit_probs = y[:, :, :nr_mix] 134 | 135 | # sample mixture indicator from softmax 136 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 137 | temp = logit_probs.data - torch.log(- torch.log(temp)) 138 | _, argmax = temp.max(dim=-1) 139 | 140 | # (B, T) -> (B, T, nr_mix) 141 | one_hot = to_one_hot(argmax, nr_mix) 142 | # select logistic parameters 143 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 144 | log_scales = torch.clamp(torch.sum( 145 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 146 | # sample from logistic & clip to interval 147 | # we don't actually round to the nearest 8bit value when sampling 148 | u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) 149 | x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) 150 | 151 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 
152 | 153 | return x 154 | -------------------------------------------------------------------------------- /pretrained/representation_mixing_text_to_speech_demo_minimal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Minimal demo of Representation Mixing", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "private_outputs": true, 10 | "collapsed_sections": [], 11 | "toc_visible": true 12 | }, 13 | "kernelspec": { 14 | "name": "python2", 15 | "display_name": "Python 2" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "metadata": { 22 | "id": "ub5RuaFnxo-O", 23 | "colab_type": "text" 24 | }, 25 | "cell_type": "markdown", 26 | "source": [ 27 | "## Minimal Demo\n", 28 | "\n", 29 | "This is a minimal demo of Representation Mixing, for more details see the [arxiv paper](https://arxiv.org/abs/1811.07240)\n", 30 | "\n", 31 | "Approximate runtime ~4 minutes" 32 | ] 33 | }, 34 | { 35 | "metadata": { 36 | "id": "m7R_1MpFc3Za", 37 | "colab_type": "text" 38 | }, 39 | "cell_type": "markdown", 40 | "source": [ 41 | "## Setup\n", 42 | "\n", 43 | "### Install dependencies" 44 | ] 45 | }, 46 | { 47 | "metadata": { 48 | "id": "NlLC7Q7Us8go", 49 | "colab_type": "code", 50 | "colab": {} 51 | }, 52 | "cell_type": "code", 53 | "source": [ 54 | "import os\n", 55 | "from os.path import exists, join, expanduser\n", 56 | "\n", 57 | "os.chdir(os.path.expanduser(\"~\"))\n", 58 | "\n", 59 | "representation_mixing_dir = \"representation_mixing\"\n", 60 | "if not os.path.exists(representation_mixing_dir):\n", 61 | " ! git clone https://github.com/kastnerkyle/$representation_mixing_dir\n" 62 | ], 63 | "execution_count": 0, 64 | "outputs": [] 65 | }, 66 | { 67 | "metadata": { 68 | "id": "KBFfji_Avluz", 69 | "colab_type": "code", 70 | "colab": {} 71 | }, 72 | "cell_type": "code", 73 | "source": [ 74 | "# Install dependencies\n", 75 | "! pip install -q --upgrade \"tensorflow<=1.6.0\"\n", 76 | "! 
pip install -q --upgrade \"unidecode\"" 77 | ], 78 | "execution_count": 0, 79 | "outputs": [] 80 | }, 81 | { 82 | "metadata": { 83 | "id": "iZsAP7srBBTe", 84 | "colab_type": "code", 85 | "colab": {} 86 | }, 87 | "cell_type": "code", 88 | "source": [ 89 | "os.chdir(representation_mixing_dir)\n", 90 | "os.chdir(\"pretrained\")" 91 | ], 92 | "execution_count": 0, 93 | "outputs": [] 94 | }, 95 | { 96 | "metadata": { 97 | "id": "km1SAASEcIL6", 98 | "colab_type": "text" 99 | }, 100 | "cell_type": "markdown", 101 | "source": [ 102 | "\n", 103 | "## Input texts to be synthesized\n", 104 | "\n", 105 | "Choose your favorite sentences :)" 106 | ] 107 | }, 108 | { 109 | "metadata": { 110 | "id": "qnHnJyc1v6U7", 111 | "colab_type": "code", 112 | "colab": {} 113 | }, 114 | "cell_type": "code", 115 | "source": [ 116 | "if os.path.exists(\"sample_lines.txt\"):\n", 117 | " os.remove(\"sample_lines.txt\")" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "metadata": { 124 | "id": "tU1lz6PcbXut", 125 | "colab_type": "code", 126 | "colab": {} 127 | }, 128 | "cell_type": "code", 129 | "source": [ 130 | "%%bash\n", 131 | "cat << EOS > sample_lines.txt\n", 132 | "The cat ate bread.\n", 133 | "That cat is not dead.\n", 134 | "EOS\n", 135 | "\n", 136 | "cat sample_lines.txt" 137 | ], 138 | "execution_count": 0, 139 | "outputs": [] 140 | }, 141 | { 142 | "metadata": { 143 | "id": "15p8phXx6nxe", 144 | "colab_type": "code", 145 | "colab": {} 146 | }, 147 | "cell_type": "code", 148 | "source": [ 149 | "! bash sample.sh" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "metadata": { 156 | "id": "rY_MfE0m8Ese", 157 | "colab_type": "code", 158 | "colab": {} 159 | }, 160 | "cell_type": "code", 161 | "source": [ 162 | "import IPython\n", 163 | "from IPython.display import Audio\n", 164 | "import numpy as np" 165 | ], 166 | "execution_count": 0, 167 | "outputs": [] 168 | }, 169 | { 170 | "metadata": { 171 | "id": "hNG8oI4OiJkJ", 172 | "colab_type": "text" 173 | }, 174 | "cell_type": "markdown", 175 | "source": [ 176 | "## Summary: audio samples" 177 | ] 178 | }, 179 | { 180 | "metadata": { 181 | "id": "OIyfhn0v9Ntg", 182 | "colab_type": "code", 183 | "colab": {} 184 | }, 185 | "cell_type": "code", 186 | "source": [ 187 | "with open(\"sample_lines.txt\", \"r\") as f:\n", 188 | " lines = f.readlines()\n", 189 | "lines = [l.strip() for l in lines]\n", 190 | "\n", 191 | "def sort(files):\n", 192 | " return sorted(files, key=lambda k: int(k.split(\"_\")[1]))\n", 193 | " \n", 194 | "mel_files = sort([f for f in os.listdir(\".\") if \"_mels.npz\" in f])\n", 195 | "audio_files = sort([f for f in os.listdir(\".\") if \"_post.wav\" in f]) \n", 196 | "maps = zip(lines, mel_files[:len(lines)], audio_files[:len(lines)])\n", 197 | "\n", 198 | "for idx, (text, mel, audio) in enumerate(maps):\n", 199 | " print(idx, text)\n", 200 | " IPython.display.display(Audio(audio, rate=22050))" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | } 205 | ] 206 | } 207 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/batch_synth.py: -------------------------------------------------------------------------------- 1 | # Setup WaveNet vocoder hparams 2 | import os 3 | os.environ["KERAS_BACKEND"] = "tensorflow" 4 | 5 | from hparams import hparams 6 | wn_preset = "20180510_mixture_lj_checkpoint_step000320000_ema.json" 7 | wn_checkpoint_path = 
"20180510_mixture_lj_checkpoint_step000320000_ema.pth" 8 | with open(wn_preset) as f: 9 | hparams.parse_json(f.read()) 10 | 11 | # Setup WaveNet vocoder 12 | from train import build_model 13 | from synthesis import wavegen 14 | import torch 15 | from scipy.io import wavfile 16 | 17 | from functools import partial 18 | import numpy as np 19 | import os 20 | import sys 21 | import audio 22 | from tqdm import tqdm 23 | 24 | from nnmnkwii import preprocessing as P 25 | from hparams import hparams 26 | from os.path import exists 27 | import librosa 28 | 29 | from wavenet_vocoder_core.util import is_mulaw_quantize, is_mulaw, is_raw 30 | 31 | if len(sys.argv) < 2: 32 | raise ValueError("Must pass directory of wav files as only argument") 33 | 34 | in_path = sys.argv[1] 35 | assert os.path.exists(in_path) 36 | 37 | def _process_utterance(wav_path, out_dir): 38 | fname = wav_path.split(os.sep)[-1].split(".")[0] 39 | audio_filename = '{}_resolved.npy'.format(fname) 40 | mel_filename = '{}_mel.npy'.format(fname) 41 | apth = os.path.join(out_dir, audio_filename) 42 | mpth = os.path.join(out_dir, mel_filename) 43 | if os.path.exists(apth) and os.path.exists(mpth): 44 | print("File {} already processed".format(wav_path)) 45 | return 46 | 47 | # Load the audio to a numpy array: 48 | wav = audio.load_wav(wav_path) 49 | 50 | if hparams.rescaling: 51 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 52 | 53 | # Mu-law quantize 54 | if is_mulaw_quantize(hparams.input_type): 55 | # [0, quantize_channels) 56 | out = P.mulaw_quantize(wav, hparams.quantize_channels) 57 | 58 | # Trim silences 59 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 60 | wav = wav[start:end] 61 | out = out[start:end] 62 | constant_values = P.mulaw_quantize(0, hparams.quantize_channels) 63 | out_dtype = np.int16 64 | elif is_mulaw(hparams.input_type): 65 | # [-1, 1] 66 | out = P.mulaw(wav, hparams.quantize_channels) 67 | constant_values = P.mulaw(0.0, hparams.quantize_channels) 68 | out_dtype = np.float32 69 | else: 70 | # [-1, 1] 71 | out = wav 72 | constant_values = 0.0 73 | out_dtype = np.float32 74 | 75 | # Compute a mel-scale spectrogram from the trimmed wav: 76 | # (N, D) 77 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T 78 | # lws pads zeros internally before performing stft 79 | # this is needed to adjust time resolution between audio and mel-spectrogram 80 | l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 81 | 82 | # zero pad for quantized signal 83 | out = np.pad(out, (l, r), mode="constant", constant_values=constant_values) 84 | N = mel_spectrogram.shape[0] 85 | assert len(out) >= N * audio.get_hop_size() 86 | 87 | # time resolution adjustment 88 | # ensure length of raw audio is multiple of hop_size so that we can use 89 | # transposed convolution to upsample 90 | out = out[:N * audio.get_hop_size()] 91 | assert len(out) % audio.get_hop_size() == 0 92 | 93 | timesteps = len(out) 94 | 95 | # Write the spectrograms to disk: 96 | np.save(apth, 97 | out.astype(out_dtype), allow_pickle=False) 98 | np.save(mpth, 99 | mel_spectrogram.astype(np.float32), allow_pickle=False) 100 | 101 | 102 | def soundsc(X, gain_scale=.9, copy=True): 103 | """ 104 | Approximate implementation of soundsc from MATLAB without the audio playing. 
105 | 106 | Parameters 107 | ---------- 108 | X : ndarray 109 | Signal to be rescaled 110 | 111 | gain_scale : float 112 | Gain multiplier, default .9 (90% of maximum representation) 113 | 114 | copy : bool, optional (default=True) 115 | Whether to make a copy of input signal or operate in place. 116 | 117 | Returns 118 | ------- 119 | X_sc : ndarray 120 | (-32767, 32767) scaled version of X as int16, suitable for writing 121 | with scipy.io.wavfile 122 | """ 123 | X = np.array(X, copy=copy) 124 | X = (X - X.min()) / (X.max() - X.min()) 125 | X = 2 * X - 1 126 | X = gain_scale * X 127 | X = X * 2 ** 15 128 | return X.astype('int16') 129 | 130 | 131 | use_cuda = torch.cuda.is_available() 132 | device = torch.device("cuda" if use_cuda else "cpu") 133 | 134 | print("Load checkpoint from {}".format(wn_checkpoint_path)) 135 | if use_cuda: 136 | checkpoint = torch.load(wn_checkpoint_path) 137 | else: 138 | checkpoint = torch.load(wn_checkpoint_path, map_location="cpu") 139 | 140 | if in_path[-1] == str(os.sep): 141 | in_path = in_path[:-1] 142 | 143 | model = build_model().to(device) 144 | model.load_state_dict(checkpoint["state_dict"]) 145 | 146 | wav_paths = [in_path + os.sep + "{}".format(fi) for fi in os.listdir(in_path) if ".wav" in fi] 147 | out_dir = in_path + "_mel" 148 | if not os.path.exists(out_dir): 149 | os.mkdir(out_dir) 150 | 151 | for wp in wav_paths: 152 | print("Saving mels for {}".format(wp)) 153 | _process_utterance(wp, out_dir) 154 | 155 | mel_dir = out_dir 156 | wav_out_dir = mel_dir + "_wavenet_render" 157 | if not os.path.exists(wav_out_dir): 158 | os.mkdir(wav_out_dir) 159 | sample_rate = 22050 160 | mel_paths = [mel_dir + os.sep + "{}".format(fi) for fi in os.listdir(mel_dir) if "mel" in fi] 161 | for mel_path in mel_paths: 162 | c = np.load(mel_path) 163 | if c.shape[1] != hparams.num_mels: 164 | c = np.swapaxes(c, 0, 1) 165 | waveform = wavegen(model, c=c, fast=True, tqdm=tqdm) 166 | fname = mel_path.split(os.sep)[-1].split(".")[0] 167 | fpath = wav_out_dir + str(os.sep) + '{}.wav'.format(fname) 168 | wavfile.write(fpath, sample_rate, waveform) 169 | print("Saved HD audio {}".format(fpath)) 170 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/evaluate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesis waveform for testset 4 | 5 | usage: evaluate.py [options] 6 | 7 | options: 8 | --data-root= Directory contains preprocessed features. 9 | --hparams= Hyper parameters [default: ]. 10 | --preset= Path of preset parameters (json). 11 | --length= Steps to generate [default: 32000]. 12 | --speaker-id= Use specific speaker of data in case for multi-speaker datasets. 13 | --initial-value= Initial value for the WaveNet decoder. 14 | --file-name-suffix= File name suffix [default: ]. 15 | --output-html Output html for blog post. 16 | --num-utterances=<N> Generate N utterances per speaker [default: -1]. 17 | -h, --help Show help message.
18 | """ 19 | from docopt import docopt 20 | 21 | import sys 22 | import os 23 | from os.path import dirname, join, basename, splitext 24 | import torch 25 | import numpy as np 26 | from nnmnkwii import preprocessing as P 27 | from keras.utils import np_utils 28 | from tqdm import tqdm 29 | import librosa 30 | 31 | 32 | from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw 33 | 34 | import audio 35 | from hparams import hparams 36 | 37 | 38 | use_cuda = torch.cuda.is_available() 39 | device = torch.device("cuda" if use_cuda else "cpu") 40 | 41 | if __name__ == "__main__": 42 | args = docopt(__doc__) 43 | print("Command line args:\n", args) 44 | data_root = args["--data-root"] 45 | if data_root is None: 46 | data_root = join(dirname(__file__), "data", "cmu_arctic") 47 | checkpoint_path = args[""] 48 | dst_dir = args[""] 49 | 50 | length = int(args["--length"]) 51 | # Note that speaker-id is used for filtering out unrelated-speaker from 52 | # multi-speaker dataset. 53 | speaker_id = args["--speaker-id"] 54 | speaker_id = int(speaker_id) if speaker_id is not None else None 55 | initial_value = args["--initial-value"] 56 | initial_value = None if initial_value is None else float(initial_value) 57 | file_name_suffix = args["--file-name-suffix"] 58 | output_html = args["--output-html"] 59 | num_utterances = int(args["--num-utterances"]) 60 | preset = args["--preset"] 61 | 62 | # Load preset if specified 63 | if preset is not None: 64 | with open(preset) as f: 65 | hparams.parse_json(f.read()) 66 | # Override hyper parameters 67 | hparams.parse(args["--hparams"]) 68 | assert hparams.name == "wavenet_vocoder" 69 | 70 | from train import build_model, get_data_loaders 71 | from synthesis import wavegen 72 | 73 | # Data 74 | # Use exactly same testset used in training script 75 | # disable shuffle for convenience 76 | test_data_loader = get_data_loaders(data_root, speaker_id, test_shuffle=False)["test"] 77 | test_dataset = test_data_loader.dataset 78 | 79 | # Model 80 | model = build_model().to(device) 81 | 82 | # Load checkpoint 83 | print("Load checkpoint from {}".format(checkpoint_path)) 84 | if use_cuda: 85 | checkpoint = torch.load(checkpoint_path) 86 | else: 87 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 88 | model.load_state_dict(checkpoint["state_dict"]) 89 | checkpoint_name = splitext(basename(checkpoint_path))[0] 90 | 91 | os.makedirs(dst_dir, exist_ok=True) 92 | dst_dir_name = basename(os.path.normpath(dst_dir)) 93 | 94 | generated_utterances = {} 95 | for idx, (x, c, g) in enumerate(test_dataset): 96 | target_audio_path = test_dataset.X.collected_files[idx][0] 97 | if g is None and num_utterances > 0 and idx > num_utterances: 98 | break 99 | if num_utterances > 0 and g is not None: 100 | try: 101 | generated_utterances[g] += 1 102 | if generated_utterances[g] > num_utterances: 103 | continue 104 | except KeyError: 105 | generated_utterances[g] = 1 106 | 107 | if output_html: 108 | def _tqdm(x): return x 109 | else: 110 | _tqdm = tqdm 111 | print("Target audio is {}".format(target_audio_path)) 112 | if c is not None: 113 | print("Local conditioned by {}".format(test_dataset.Mel.collected_files[idx][0])) 114 | if g is not None: 115 | print("Global conditioned by speaker id {}".format(g)) 116 | 117 | # Paths 118 | if g is None: 119 | dst_wav_path = join(dst_dir, "{}_{}{}_predicted.wav".format( 120 | idx, checkpoint_name, file_name_suffix)) 121 | target_wav_path = join(dst_dir, "{}_{}{}_target.wav".format( 122 | idx, checkpoint_name, 
file_name_suffix)) 123 | else: 124 | dst_wav_path = join(dst_dir, "speaker{}_{}_{}{}_predicted.wav".format( 125 | g, idx, checkpoint_name, file_name_suffix)) 126 | target_wav_path = join(dst_dir, "speaker{}_{}_{}{}_target.wav".format( 127 | g, idx, checkpoint_name, file_name_suffix)) 128 | 129 | # Generate 130 | waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value, 131 | fast=True, tqdm=_tqdm) 132 | 133 | # save 134 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 135 | if is_mulaw_quantize(hparams.input_type): 136 | x = P.inv_mulaw_quantize(x, hparams.quantize_channels) 137 | elif is_mulaw(hparams.input_type): 138 | x = P.inv_mulaw(x, hparams.quantize_channels) 139 | librosa.output.write_wav(target_wav_path, x, sr=hparams.sample_rate) 140 | 141 | # log 142 | if output_html: 143 | print(""" 144 | 148 | """.format(hparams.name, dst_dir_name, basename(dst_wav_path))) 149 | 150 | print("Finished! Check out {} for generated audio samples.".format(dst_dir)) 151 | sys.exit(0) 152 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/full_test.txt: -------------------------------------------------------------------------------- 1 | a b c. 2 | x y z. 3 | hurry. 4 | warehouse. 5 | referendum. 6 | is it free? 7 | justifiable. 8 | environment. 9 | a debt runs. 10 | gravitational. 11 | cardboard film. 12 | person thinking. 13 | prepared killer. 14 | aircraft torture. 15 | allergic trouser. 16 | strategic conduct. 17 | worrying literature. 18 | christmas is coming 19 | a pet dilemma thinks. 20 | how was the math test? 21 | good to the last drop. 22 | an m b a agent listens. 23 | a compromise disappears. 24 | an axis of x y or z freezes. 25 | she did her best to help him. 26 | a backbone contests the chaos. 27 | two a greater than two n nine. 28 | don't step on the broken glass. 29 | a damned flips into the patient. 30 | a trade purges within the b b c. 31 | i'd rather be a bird than a fish. 32 | i hear that nancy is very pretty. 33 | i want more detailed information. 34 | please wait outside of the house. 35 | n a s a exposure tunes the waffle. 36 | a mist dictates within the monster. 37 | a sketch ropes the middle ceremony. 38 | every farewell explodes the career. 39 | she folded her handkerchief neatly. 40 | against the steam chooses the studio. 41 | rock music approaches at high velocity. 42 | nine adam baye study on the two pieces. 43 | an unfriendly decay conveys the outcome. 44 | abstraction is often one floor above you. 45 | a played lady ranks any publicized preview. 46 | he told us a very exciting adventure story. 47 | on august twenty eighth, mary plays the piano. 48 | into a controller beams a concrete terrorist. 49 | i often see the time eleven eleven on clocks. 50 | it was getting dark, and we weren't there yet. 51 | against every rhyme starves a choral apparatus. 52 | everyone was busy, so i went to the movie alone. 53 | i checked to make sure that he was still alive. 54 | a dominant vegetarian shies away from the g o p. 55 | joe made the sugar cookies, susan decorated them. 56 | i want to buy a onesie, but know it won't suit me. 57 | a former override of q w e r t y outside the pope. 58 | f b i says that c i a says, i'll stay way from it. 59 | any climbing dish listens to a cumbersome formula. 60 | she wrote him a long letter, but he didn't read it. 61 | dear, beauty is in the heat not physical, i love you. 62 | an appeal on january fifth duplicates a sharp queen. 
63 | a farewell solos on march twenty third shakes north. 64 | he ran out of money so he had to stop playing poker. 65 | for example, a newspaper has only regional distribution t. 66 | i currently have four windows open up, and i don't know why. 67 | next to my indirect vocal declines every unbearable academic. 68 | opposite her sounding bag is a m c's configured thoroughfare. 69 | from april eighth to the present, i only smoke four cigarettes. 70 | i will never be this young again, ever, oh damn, i just got older. 71 | a generous continuum of amazon dot com is the conflicting worker. 72 | she advised him to come back at once the wife lectures the blast. 73 | a song can make or ruin a person's day if they let it get to them. 74 | she did not cheat on the test, for it was not the right thing to do. 75 | he said he was not there yesterday, however, many people saw him there. 76 | should we start class now, or should we wait for everyone to get here? 77 | if purple people eaters are real, where do they find purple people to eat? 78 | on november eighteenth eighteen twenty one, a glittering gem is not enough. 79 | a rocket from space x interacts with the individual beneath the soft flaw. 80 | malls are great places to shop, i can find everything i need under one roof. 81 | i think i will buy the red car, or i will lease the blue one, the faith nests. 82 | italy is my favorite country, in fact, i plan to spend two weeks there next year. 83 | i would have gotten w w w w dot google dot com, but my attendance wasn't good enough. 84 | nineteen twenty is when we are unique together until we realise, we are all the same. 85 | my mum tries to be cool by saying h t t p colon slash slash w w w b a i d u dot com. 86 | he turned in the research paper on friday, otherwise, he emailed a s d f at yahoo dot org. 87 | she works two jobs to make ends meet, at least, that was her reason for no having time to join us. 88 | a remarkable well promotes the alphabet into the adjusted luck, the dress dodges across my assault. 89 | a b c d e f g h i j k l m n o p q r s t u v w x y z one two three four five six seven eight nine ten. 90 | across the waste persists the wrong pacifier, the washed passenger parades under the incorrect computer. 91 | if the easter bunny and the tooth fairy had babies would they take your teeth and leave chocolate for you? 92 | sometimes, all you need to do is completely make an ass of yourself and laugh it off to realise that life isn't so bad after all. 93 | she borrowed the book from him many years ago and hasn't yet returned it, why won't the distinguishing love jump with the juvenile? 94 | last friday in three week's time i saw a spotted striped blue worm shake hands with a legless lizard, the lake is a long way from here. 95 | i was very proud of my nickname throughout high school but today, i couldn't be any different to what my nickname was, the metal lusts, the ranging captain charters the link. 96 | i am happy to take your donation, any amount will be greatly appreciated, the waves were crashing on the shore, it was a lovely sight, the paradox sticks this bowl on top of a spontaneous tea. 97 | a purple pig and a green donkey flew a kite in the middle of the night and ended up sunburned, the contained error poses as a logical target, the divorce attacks near a missing doom, the opera fines the daily examiner into a murderer. 
98 | as the most famous singer-songwriter, jay chou gave a perfect performance in beijing on may twenty fourth, twenty fifth, and twenty sixth twenty three all the fans thought highly of him and took pride in him all the tickets were sold out. 99 | if you like tuna and tomato sauce, try combining the two, it's really not as bad as it sounds, the body may perhaps compensates for the loss of a true metaphysics, the clock within this blog and the clock on my laptop are one hour different from each other. 100 | someone i know recently combined maple syrup and buttered popcorn thinking it would taste like caramel popcornm, it didn't and they don't recommend anyone else do it either, the gentleman marches around the principal, the divorce attacks near a missing doom, the color misprints a circular worry across the controversy. 101 | -------------------------------------------------------------------------------- /pretrained/number_to_words.py: -------------------------------------------------------------------------------- 1 | # https://github.com/ianfieldhouse/number_to_words 2 | 3 | class NumberToWords(object): 4 | """ 5 | Class for converting positive integer values to a textual representation 6 | of the submitted number for value of 0 up to 999999999. 7 | 8 | Example: 9 | >>> from number_to_words import NumberToWords 10 | >>> n2w = NumberToWords() 11 | >>> n2w.convert(123) 12 | 'one hundred and twenty three' 13 | """ 14 | 15 | MAX = 999999999 16 | SMALL_NUMBERS = ['', 'one', 'two', 'three', 'four', 'five', 'six', 17 | 'seven', 'eight', 'nine', 'ten', 'eleven', 18 | 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 19 | 'seventeen', 'eighteen', 'nineteen'] 20 | TENS = ['', '', 'twenty', 'thirty', 'fourty', 'fifty', 'sixty', 'seventy', 21 | 'eighty', 'ninety'] 22 | LARGE_NUMBERS = ['', 'thousand', 'million'] 23 | EXCEPTION_STRING = "This method expects positive integer values between " \ 24 | + "0 and {0}".format(MAX) 25 | 26 | def convert(self, number): 27 | """ 28 | Take an integer and return it converted to a textual representation. 29 | 30 | Args: 31 | number (int): The number to be converted. 32 | 33 | Returns: 34 | sentence (string): The textual representation of `number`. 35 | 36 | Raises: 37 | ValueError: If `number` is not a positive integer or is greater 38 | than `MAX`. 39 | """ 40 | 41 | if not isinstance(number, int): 42 | raise ValueError(self.EXCEPTION_STRING) 43 | try: 44 | sentence = "" 45 | if number == 0: 46 | sentence = "zero" 47 | else: 48 | # split number into a list of strings where each list item is 49 | # at most 3 character in length. 
50 | groups = format(number, ',').split(',') 51 | 52 | # make sure each list item is exactly 3 characters long by 53 | # zero filling 54 | zero_filled_groups = [] 55 | for group in groups: 56 | zero_filled_groups.append(group.zfill(3)) 57 | 58 | # reverse the list of strings so that the list indexes of the 59 | # string representation of hundreds, thousands and million 60 | # match those of `LARGE_NUMBERS` 61 | zero_filled_groups.reverse() 62 | for group in zero_filled_groups: 63 | index = zero_filled_groups.index(group) 64 | suffix = self.LARGE_NUMBERS[index] 65 | is_and_required = False 66 | if index == 0 and len(zero_filled_groups) > 1: 67 | is_and_required = True 68 | number_as_words = " ".join( 69 | self._number_to_word_list(group, is_and_required, 70 | suffix)) 71 | if len(number_as_words) > 0: 72 | sentence = "{0} {1}".format(number_as_words, sentence) 73 | # set this group to None so as to not set a false `index` 74 | # for subsequent groups where `number` has multiple 75 | # identical groups 76 | zero_filled_groups[index] = None 77 | return sentence.rstrip() 78 | except (IndexError, ValueError): 79 | raise ValueError(self.EXCEPTION_STRING) 80 | 81 | def _number_to_word_list(self, number_string, is_and_required, 82 | suffix=None): 83 | """ 84 | Take a 3 digit string representation of an integer and convert it to a 85 | textual representation with an optional suffix. 86 | 87 | Args: 88 | number_string (str): The number to be converted as a string. 89 | is_and_required (bool): Whether the word and should be prefixed 90 | before tens and units when there is a zero 91 | in the hundreds column. 92 | suffix (Optional[str]): The string to append to the end of the 93 | words (default None) 94 | 95 | Returns: 96 | words (List[str]): A list of strings of the words that make up the 97 | textual representation of `number_string`.
98 | """ 99 | 100 | words = [] 101 | hundreds, tens, units = [int(n) for n in list(number_string)] 102 | total = sum([hundreds, tens, units]) 103 | if hundreds != 0: 104 | string = self.SMALL_NUMBERS[hundreds] 105 | words.append("{0} hundred".format(string)) 106 | if tens != 0 or units != 0: 107 | # KK: mod 108 | pass 109 | #words.append("and") 110 | elif hundreds == 0 and is_and_required and total != 0: 111 | # KK: mod 112 | pass 113 | #words.append("and") 114 | if tens == 1: 115 | string = self.SMALL_NUMBERS[int("{0}{1}".format(tens, units))] 116 | words.append("{0}".format(string)) 117 | else: 118 | if tens != 0: 119 | string = self.TENS[tens] 120 | words.append("{0}".format(string)) 121 | if units != 0: 122 | string = self.SMALL_NUMBERS[units] 123 | words.append("{0}".format(string)) 124 | 125 | if suffix and total != 0: 126 | words.append(suffix) 127 | 128 | return words 129 | 130 | if __name__ == "__main__": 131 | n2w = NumberToWords() 132 | unique = set() 133 | 134 | def fib(): 135 | x, y = 0, 1 136 | yield x 137 | yield y 138 | 139 | while True: 140 | x, y = y, x + y 141 | yield y 142 | 143 | for num in fib(): 144 | if num > n2w.MAX: 145 | break 146 | unique.add(num) 147 | 148 | print(n2w.__doc__) 149 | print(""" 150 | Some example conversions from number to words 151 | =============================================\n""") 152 | 153 | for num in sorted(list(unique)): 154 | print("{0} : {1}".format(format(num, ','), n2w.convert(num))) 155 | print("{0} : {1}".format(format(n2w.MAX, ','), n2w.convert(n2w.MAX))) 156 | -------------------------------------------------------------------------------- /code/lib/tfbldr/datasets/text/cleaning/number_to_words.py: -------------------------------------------------------------------------------- 1 | # https://github.com/ianfieldhouse/number_to_words 2 | 3 | class NumberToWords(object): 4 | """ 5 | Class for converting positive integer values to a textual representation 6 | of the submitted number for value of 0 up to 999999999. 7 | 8 | Example: 9 | >>> from number_to_words import NumberToWords 10 | >>> n2w = NumberToWords() 11 | >>> n2w.convert(123) 12 | 'one hundred and twenty three' 13 | """ 14 | 15 | MAX = 999999999 16 | SMALL_NUMBERS = ['', 'one', 'two', 'three', 'four', 'five', 'six', 17 | 'seven', 'eight', 'nine', 'ten', 'eleven', 18 | 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 19 | 'seventeen', 'eighteen', 'nineteen'] 20 | TENS = ['', '', 'twenty', 'thirty', 'fourty', 'fifty', 'sixty', 'seventy', 21 | 'eighty', 'ninety'] 22 | LARGE_NUMBERS = ['', 'thousand', 'million'] 23 | EXCEPTION_STRING = "This method expects positive integer values between " \ 24 | + "0 and {0}".format(MAX) 25 | 26 | def convert(self, number): 27 | """ 28 | Take an integer and return it converted to a textual representation. 29 | 30 | Args: 31 | number (int): The number to be converted. 32 | 33 | Returns: 34 | sentence (string): The textual representation of `number`. 35 | 36 | Raises: 37 | ValueError: If `number` is not a positive integer or is greater 38 | than `MAX`. 39 | """ 40 | 41 | if not isinstance(number, int): 42 | raise ValueError(self.EXCEPTION_STRING) 43 | try: 44 | sentence = "" 45 | if number == 0: 46 | sentence = "zero" 47 | else: 48 | # split number into a list of strings where each list item is 49 | # at most 3 character in length. 
50 | groups = format(number, ',').split(',') 51 | 52 | # make sure each list item is exactly 3 characters long by 53 | # zero filling 54 | zero_filled_groups = [] 55 | for group in groups: 56 | zero_filled_groups.append(group.zfill(3)) 57 | 58 | # reverse the list of strings so that the list indexes of the 59 | # string representation of hundreds, thousands and million 60 | # match those of `LARGE_NUMBERS` 61 | zero_filled_groups.reverse() 62 | for group in zero_filled_groups: 63 | index = zero_filled_groups.index(group) 64 | suffix = self.LARGE_NUMBERS[index] 65 | is_and_required = False 66 | if index == 0 and len(zero_filled_groups) > 1: 67 | is_and_required = True 68 | number_as_words = " ".join( 69 | self._number_to_word_list(group, is_and_required, 70 | suffix)) 71 | if len(number_as_words) > 0: 72 | sentence = "{0} {1}".format(number_as_words, sentence) 73 | # set this group to None so as to not set a false `index` 74 | # for subsequent groups where `number` has multiple 75 | # identical groups 76 | zero_filled_groups[index] = None 77 | return sentence.rstrip() 78 | except (IndexError, ValueError): 79 | raise ValueError(self.EXCEPTION_STRING) 80 | 81 | def _number_to_word_list(self, number_string, is_and_required, 82 | suffix=None): 83 | """ 84 | Take a 3 digit string representation of an integer and convert it to a 85 | textual representation with an optional suffix. 86 | 87 | Args: 88 | number_string (str): The number to be converted as a string. 89 | is_and_required (bool): Whether the word "and" should be prefixed 90 | before tens and units when there is a zero 91 | in the hundreds column. 92 | suffix (Optional[str]): The string to append to the end of the 93 | words (default None) 94 | 95 | Returns: 96 | words (List[str]): A list of strings of the words that make up the 97 | textual representation of `number_string`.
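Example (note the "and" insertion is disabled in the body below): _number_to_word_list("305", False, None) -> ['three hundred', 'five'] and _number_to_word_list("017", False, "million") -> ['seventeen', 'million']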
98 | """ 99 | 100 | words = [] 101 | hundreds, tens, units = [int(n) for n in list(number_string)] 102 | total = sum([hundreds, tens, units]) 103 | if hundreds != 0: 104 | string = self.SMALL_NUMBERS[hundreds] 105 | words.append("{0} hundred".format(string)) 106 | if tens != 0 or units != 0: 107 | # KK: mod 108 | pass 109 | #words.append("and") 110 | elif hundreds == 0 and is_and_required and total != 0: 111 | # KK: mod 112 | pass 113 | #words.append("and") 114 | if tens == 1: 115 | string = self.SMALL_NUMBERS[int("{0}{1}".format(tens, units))] 116 | words.append("{0}".format(string)) 117 | else: 118 | if tens != 0: 119 | string = self.TENS[tens] 120 | words.append("{0}".format(string)) 121 | if units != 0: 122 | string = self.SMALL_NUMBERS[units] 123 | words.append("{0}".format(string)) 124 | 125 | if suffix and total != 0: 126 | words.append(suffix) 127 | 128 | return words 129 | 130 | if __name__ == "__main__": 131 | n2w = NumberToWords() 132 | unique = set() 133 | 134 | def fib(): 135 | x, y = 0, 1 136 | yield x 137 | yield y 138 | 139 | while True: 140 | x, y = y, x + y 141 | yield y 142 | 143 | for num in fib(): 144 | if num > n2w.MAX: 145 | break 146 | unique.add(num) 147 | 148 | print(n2w.__doc__) 149 | print(""" 150 | Some example conversions from number to words 151 | =============================================\n""") 152 | 153 | for num in sorted(list(unique)): 154 | print("{0} : {1}".format(format(num, ','), n2w.convert(num))) 155 | print("{0} : {1}".format(format(n2w.MAX, ','), n2w.convert(n2w.MAX))) 156 | -------------------------------------------------------------------------------- /pretrained/transform_text.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from text import pronounce_chars 3 | from cleaning import text_to_sequence 4 | from cleaning import sequence_to_text 5 | from cleaning import get_vocabulary_sizes 6 | import cleaners 7 | 8 | clean_names = ["english_cleaners", "english_phone_cleaners"] 9 | lcl_random_state = np.random.RandomState(4142) 10 | 11 | def transform_text(char_seq, auto_pronounce=True, phone_seq=None, force_char_spc=True, symbol_processing="blended_pref", random_state=None): 12 | """ 13 | chars format example: "i am learning english." 14 | phone_seq format example: "@ay @ae@m @l@er@n@ih@ng @ih@ng@g@l@ih@sh" 15 | 16 | chars_only 17 | phones_only 18 | blended_pref 19 | 20 | phone_seq formatting can be gotten from text, using the pronounce_chars function with 'from text import pronounce_chars' 21 | Uses cmudict to do pronunciation 22 | """ 23 | if random_state is None: 24 | random_state = lcl_random_state 25 | 26 | if phone_seq is None and auto_pronounce is False and symbol_processing != "chars_only": 27 | raise ValueError("phone_seq argument must be provided for iterator with self.symbol_processing != 'chars_only', currently '{}'".format(self.symbol_processing)) 28 | clean_char_seq = cleaners.english_cleaners(char_seq) 29 | char_seq_chunk = clean_char_seq.split(" ") 30 | dirty_seq_chunk = char_seq.split(" ") 31 | 32 | if auto_pronounce is True: 33 | if phone_seq is not None: 34 | raise ValueError("auto_pronounce set to True, but phone_seq was provided! Pass phone_seq=None for auto_pronounce=True") 35 | # take out specials then put them back... 36 | specials = "!?.,;:" 37 | puncts = "!?." 
38 | tsc = [] 39 | for n, csc in enumerate(char_seq_chunk): 40 | broke = False 41 | for s in specials: 42 | if s in csc: 43 | new = csc.replace(s, "") 44 | tsc.append(new) 45 | broke = True 46 | break 47 | if not broke: 48 | tsc.append(csc) 49 | 50 | if symbol_processing == "blended_pref": 51 | chunky_phone_seq_chunk = [pronounce_chars(w, raw_line=dirty_seq_chunk[ii], cmu_only=True) for ii, w in enumerate(tsc)] 52 | phone_seq_chunk = [cpsc[0] if cpsc != None else None for cpsc in chunky_phone_seq_chunk] 53 | else: 54 | phone_seq_chunk = [pronounce_chars(w) for w in tsc] 55 | for n, psc in enumerate(phone_seq_chunk): 56 | for s in specials: 57 | if char_seq_chunk[n][-1] == s and phone_seq_chunk[n] != None: 58 | phone_seq_chunk[n] += char_seq_chunk[n][-1] 59 | #if char_seq_chunk[n][-1] in puncts and n != (len(phone_seq_chunk) - 1): 60 | # # add eos 61 | # char_seq_chunk[n] += "~" 62 | # phone_seq_chunk[n] += "~" 63 | break 64 | else: 65 | raise ValueError("Non auto_pronounce setting not yet configured") 66 | 67 | if len(char_seq_chunk) != len(phone_seq_chunk): 68 | raise ValueError("Char and phone chunking resulted in different lengths {} and {}!\n{}\n{}".format(len(char_seq_chunk), len(phone_seq_chunk), char_seq_chunk, phone_seq_chunk)) 69 | 70 | if symbol_processing != "phones_only": 71 | spc = text_to_sequence(" ", [clean_names[0]])[0] 72 | else: 73 | spc = text_to_sequence(" ", [clean_names[1]])[0] 74 | 75 | int_char_chunks = [] 76 | int_phone_chunks = [] 77 | for n in range(len(char_seq_chunk)): 78 | int_char_chunks.append(text_to_sequence(char_seq_chunk[n], [clean_names[0]])[:-1]) 79 | if phone_seq_chunk[n] == None: 80 | int_phone_chunks.append([]) 81 | else: 82 | int_phone_chunks.append(text_to_sequence(phone_seq_chunk[n], [clean_names[1]])[:-2]) 83 | 84 | # check inverses 85 | # w = [sequence_to_text(int_char_chunks[i], [self.clean_names[0]]) for i in range(len(int_char_chunks))] 86 | # p = [sequence_to_text(int_phone_chunks[i], [self.clean_names[1]]) for i in range(len(int_phone_chunks))] 87 | 88 | # TODO: Unify the two functions? 89 | char_phone_mask = [0] * len(int_char_chunks) + [1] * len(int_phone_chunks) 90 | random_state.shuffle(char_phone_mask) 91 | char_phone_mask = char_phone_mask[:len(int_char_chunks)] 92 | # setting char_phone_mask to 0 will use chars, 1 will use phones 93 | # these if statements override the default for blended... (above) 94 | if symbol_processing == "blended_pref": 95 | char_phone_mask = [0 if len(int_phone_chunks[i]) == 0 else 1 for i in range(len(int_char_chunks))] 96 | elif symbol_processing == "phones_only": 97 | # set the mask to use only phones 98 | # all files should have phones because of earlier preproc... 
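# e.g. for "i am learning english." this gives mask [1, 1, 1, 1], so every word is emitted as phone ids rather than character ids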
99 | char_phone_mask = [1 for i in range(len(char_phone_mask))] 100 | elif symbol_processing == "chars_only": 101 | # only use chars 102 | char_phone_mask = [0 for i in range(len(char_phone_mask))] 103 | 104 | # if the phones entry is None, the word was OOV or not recognized 105 | char_phone_int_seq = [int_char_chunks[i] if (len(int_phone_chunks[i]) == 0 or char_phone_mask[i] == 0) else int_phone_chunks[i] for i in range(len(int_char_chunks))] 106 | # check the inverse is ok 107 | # char_phone_txt = [sequence_to_text(char_phone_int_seq[i], [self.clean_names[char_phone_mask[i]]]) for i in range(len(char_phone_int_seq))] 108 | # combine into 1 sequence 109 | cphi = char_phone_int_seq[0] 110 | cpm = [char_phone_mask[0]] * len(char_phone_int_seq[0]) 111 | if force_char_spc or self.symbol_processing != "phones_only": 112 | spc = text_to_sequence(" ", [clean_names[0]])[0] 113 | else: 114 | spc = text_to_sequence(" ", [clean_names[1]])[0] 115 | for i in range(len(char_phone_int_seq[1:])): 116 | # add space 117 | cphi += [spc] 118 | # always treat space as char unless in phones only mode 119 | if force_char_spc or self.symbol_processing != "phones_only": 120 | cpm += [0] 121 | else: 122 | cpm += [1] 123 | cphi += char_phone_int_seq[i + 1] 124 | cpm += [char_phone_mask[i + 1]] * len(char_phone_int_seq[i + 1]) 125 | # trailing space 126 | #cphi = cphi + [spc] 127 | # trailing eos 128 | cphi = cphi + [1] 129 | # add trailing symbol 130 | if symbol_processing != "phones_only": 131 | cpm += [0] 132 | else: 133 | cpm += [1] 134 | # check inverse 135 | #cpt = "".join([sequence_to_text([cphi[i]], [self.clean_names[cpm[i]]]) for i in range(len(cphi))]) 136 | #if None in phone_seq_chunk: 137 | #print("NUN") 138 | #print(cpt) 139 | #from IPython import embed; embed(); raise ValueError() 140 | return cphi, cpm 141 | 142 | def inverse_transform_text(int_seq, mask): 143 | """ 144 | mask set to zero will use chars, mask set to 1 will use phones 145 | 146 | should invert the transform_txt function 147 | """ 148 | cphi = int_seq 149 | cpm = mask 150 | cpt = "".join([sequence_to_text([cphi[i]], [clean_names[cpm[i]]]) for i in range(len(cphi))]) 151 | return cpt 152 | # setting char_phone_mask to 0 will use chars, 1 will use phones 153 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/synthesis.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Synthesis waveform from trained WaveNet. 4 | 5 | usage: synthesis.py [options] 6 | 7 | options: 8 | --hparams= Hyper parameters [default: ]. 9 | --preset= Path of preset parameters (json). 10 | --length= Steps to generate [default: 32000]. 11 | --initial-value= Initial value for the WaveNet decoder. 12 | --conditional=

Conditional features path. 13 | --symmetric-mels Symmetric mel. 14 | --max-abs-value= Max abs value [default: -1]. 15 | --file-name-suffix= File name suffix [default: ]. 16 | --speaker-id= Speaker ID (for multi-speaker model). 17 | --output-html Output html for blog post. 18 | -h, --help Show help message. 19 | """ 20 | from docopt import docopt 21 | 22 | import sys 23 | import os 24 | from os.path import dirname, join, basename, splitext 25 | import torch 26 | import numpy as np 27 | from nnmnkwii import preprocessing as P 28 | from keras.utils import np_utils 29 | from tqdm import tqdm 30 | import librosa 31 | 32 | from wavenet_vocoder_core.util import is_mulaw_quantize, is_mulaw, is_raw 33 | 34 | import audio 35 | from hparams import hparams 36 | 37 | 38 | torch.set_num_threads(4) 39 | use_cuda = torch.cuda.is_available() 40 | device = torch.device("cuda" if use_cuda else "cpu") 41 | 42 | 43 | def _to_numpy(x): 44 | # this is ugly 45 | if x is None: 46 | return None 47 | if isinstance(x, np.ndarray) or np.isscalar(x): 48 | return x 49 | # remove batch axis 50 | if x.dim() == 3: 51 | x = x.squeeze(0) 52 | return x.numpy() 53 | 54 | 55 | def wavegen(model, length=None, c=None, g=None, initial_value=None, 56 | fast=False, tqdm=tqdm): 57 | """Generate waveform samples by WaveNet. 58 | 59 | Args: 60 | model (nn.Module) : WaveNet decoder 61 | length (int): Time steps to generate. If conditinlal features are given, 62 | then this is determined by the feature size. 63 | c (numpy.ndarray): Conditional features, of shape T x C 64 | g (scaler): Speaker ID 65 | initial_value (int) : initial_value for the WaveNet decoder. 66 | fast (Bool): Whether to remove weight normalization or not. 67 | tqdm (lambda): tqdm 68 | 69 | Returns: 70 | numpy.ndarray : Generated waveform samples 71 | """ 72 | from train import sanity_check 73 | sanity_check(model, c, g) 74 | 75 | c = _to_numpy(c) 76 | g = _to_numpy(g) 77 | 78 | model.eval() 79 | if fast: 80 | model.make_generation_fast_() 81 | 82 | if c is None: 83 | assert length is not None 84 | else: 85 | # (Tc, D) 86 | if c.ndim != 2: 87 | raise RuntimeError( 88 | "Expected 2-dim shape (T, {}) for the conditional feature, but {} was actually given.".format(hparams.cin_channels, c.shape)) 89 | assert c.ndim == 2 90 | Tc = c.shape[0] 91 | upsample_factor = audio.get_hop_size() 92 | # Overwrite length according to feature size 93 | length = Tc * upsample_factor 94 | # (Tc, D) -> (Tc', D) 95 | # Repeat features before feeding it to the network 96 | if not hparams.upsample_conditional_features: 97 | c = np.repeat(c, upsample_factor, axis=0) 98 | 99 | # B x C x T 100 | c = torch.FloatTensor(c.T).unsqueeze(0) 101 | 102 | if initial_value is None: 103 | if is_mulaw_quantize(hparams.input_type): 104 | initial_value = P.mulaw_quantize(0, hparams.quantize_channels) 105 | else: 106 | initial_value = 0.0 107 | 108 | if is_mulaw_quantize(hparams.input_type): 109 | assert initial_value >= 0 and initial_value < hparams.quantize_channels 110 | initial_input = np_utils.to_categorical( 111 | initial_value, num_classes=hparams.quantize_channels).astype(np.float32) 112 | initial_input = torch.from_numpy(initial_input).view( 113 | 1, 1, hparams.quantize_channels) 114 | else: 115 | initial_input = torch.zeros(1, 1, 1).fill_(initial_value) 116 | 117 | g = None if g is None else torch.LongTensor([g]) 118 | 119 | # Transform data to GPU 120 | initial_input = initial_input.to(device) 121 | g = None if g is None else g.to(device) 122 | c = None if c is None else c.to(device) 123 | 124 | with 
torch.no_grad(): 125 | y_hat = model.incremental_forward( 126 | initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True, 127 | log_scale_min=hparams.log_scale_min) 128 | 129 | if is_mulaw_quantize(hparams.input_type): 130 | y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy() 131 | y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels) 132 | elif is_mulaw(hparams.input_type): 133 | y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels) 134 | else: 135 | y_hat = y_hat.view(-1).cpu().data.numpy() 136 | 137 | return y_hat 138 | 139 | 140 | if __name__ == "__main__": 141 | args = docopt(__doc__) 142 | print("Command line args:\n", args) 143 | checkpoint_path = args[""] 144 | dst_dir = args[""] 145 | 146 | length = int(args["--length"]) 147 | initial_value = args["--initial-value"] 148 | initial_value = None if initial_value is None else float(initial_value) 149 | conditional_path = args["--conditional"] 150 | # From https://github.com/Rayhane-mamah/Tacotron-2 151 | symmetric_mels = args["--symmetric-mels"] 152 | max_abs_value = float(args["--max-abs-value"]) 153 | 154 | file_name_suffix = args["--file-name-suffix"] 155 | output_html = args["--output-html"] 156 | speaker_id = args["--speaker-id"] 157 | speaker_id = None if speaker_id is None else int(speaker_id) 158 | preset = args["--preset"] 159 | 160 | # Load preset if specified 161 | if preset is not None: 162 | with open(preset) as f: 163 | hparams.parse_json(f.read()) 164 | # Override hyper parameters 165 | hparams.parse(args["--hparams"]) 166 | assert hparams.name == "wavenet_vocoder" 167 | 168 | # Load conditional features 169 | if conditional_path is not None: 170 | c = np.load(conditional_path) 171 | if c.shape[1] != hparams.num_mels: 172 | c = np.swapaxes(c, 0, 1) 173 | if max_abs_value > 0: 174 | min_, max_ = 0, max_abs_value 175 | if symmetric_mels: 176 | min_ = -max_ 177 | print("Normalize features to desired range [0, 1] from [{}, {}]".format(min_, max_)) 178 | c = np.interp(c, (min_, max_), (0, 1)) 179 | else: 180 | c = None 181 | 182 | from train import build_model 183 | 184 | # Model 185 | model = build_model().to(device) 186 | 187 | # Load checkpoint 188 | print("Load checkpoint from {}".format(checkpoint_path)) 189 | if use_cuda: 190 | checkpoint = torch.load(checkpoint_path) 191 | else: 192 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage) 193 | model.load_state_dict(checkpoint["state_dict"]) 194 | checkpoint_name = splitext(basename(checkpoint_path))[0] 195 | 196 | os.makedirs(dst_dir, exist_ok=True) 197 | dst_wav_path = join(dst_dir, "{}{}.wav".format(checkpoint_name, file_name_suffix)) 198 | 199 | # DO generate 200 | waveform = wavegen(model, length, c=c, g=speaker_id, initial_value=initial_value, fast=True) 201 | 202 | # save 203 | librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate) 204 | 205 | print("Finished! 
Check out {} for generated audio samples.".format(dst_dir)) 206 | sys.exit(0) 207 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/wavenet_stuff/wavenet_vocoder/wavenet_vocoder/modules.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import with_statement, print_function, absolute_import 3 | 4 | import math 5 | import numpy as np 6 | 7 | import torch 8 | from wavenet_vocoder_core import conv 9 | from torch import nn 10 | from torch.nn import functional as F 11 | 12 | 13 | def Conv1d(in_channels, out_channels, kernel_size, dropout=0, std_mul=4.0, **kwargs): 14 | m = conv.Conv1d(in_channels, out_channels, kernel_size, **kwargs) 15 | std = math.sqrt((std_mul * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) 16 | m.weight.data.normal_(mean=0, std=std) 17 | m.bias.data.zero_() 18 | return nn.utils.weight_norm(m) 19 | 20 | 21 | def Embedding(num_embeddings, embedding_dim, padding_idx, std=0.01): 22 | m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) 23 | m.weight.data.normal_(0, std) 24 | return m 25 | 26 | 27 | def ConvTranspose2d(in_channels, out_channels, kernel_size, 28 | weight_normalization=True, **kwargs): 29 | freq_axis_kernel_size = kernel_size[0] 30 | m = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, **kwargs) 31 | m.weight.data.fill_(1.0 / freq_axis_kernel_size) 32 | m.bias.data.zero_() 33 | if weight_normalization: 34 | return nn.utils.weight_norm(m) 35 | else: 36 | return m 37 | 38 | 39 | def Conv1d1x1(in_channels, out_channels, bias=True, weight_normalization=True): 40 | """1-by-1 convolution layer 41 | """ 42 | if weight_normalization: 43 | assert bias 44 | return Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 45 | dilation=1, bias=bias, std_mul=1.0) 46 | else: 47 | return conv.Conv1d(in_channels, out_channels, kernel_size=1, padding=0, 48 | dilation=1, bias=bias) 49 | 50 | 51 | def _conv1x1_forward(conv, x, is_incremental): 52 | """Conv1x1 forward 53 | """ 54 | if is_incremental: 55 | x = conv.incremental_forward(x) 56 | else: 57 | x = conv(x) 58 | return x 59 | 60 | 61 | class ResidualConv1dGLU(nn.Module): 62 | """Residual dilated conv1d + Gated linear unit 63 | 64 | Args: 65 | residual_channels (int): Residual input / output channels 66 | gate_channels (int): Gated activation channels. 67 | kernel_size (int): Kernel size of convolution layers. 68 | skip_out_channels (int): Skip connection channels. If None, set to same 69 | as ``residual_channels``. 70 | cin_channels (int): Local conditioning channels. If negative value is 71 | set, local conditioning is disabled. 72 | gin_channels (int): Global conditioning channels. If negative value is 73 | set, global conditioning is disabled. 74 | dropout (float): Dropout probability. 75 | padding (int): Padding for convolution layers. If None, proper padding 76 | is computed depends on dilation and kernel_size. 77 | dilation (int): Dilation factor. 78 | weight_normalization (bool): If True, DeepVoice3-style weight 79 | normalization is applied. 
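causal (bool): If True, pad on the left so the convolution sees only current and past time steps (the look-ahead is trimmed in the forward pass).
bias (bool): If True, the convolutions use a learnable bias term.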
80 | """ 81 | 82 | def __init__(self, residual_channels, gate_channels, kernel_size, 83 | skip_out_channels=None, 84 | cin_channels=-1, gin_channels=-1, 85 | dropout=1 - 0.95, padding=None, dilation=1, causal=True, 86 | bias=True, weight_normalization=True, *args, **kwargs): 87 | super(ResidualConv1dGLU, self).__init__() 88 | self.dropout = dropout 89 | if skip_out_channels is None: 90 | skip_out_channels = residual_channels 91 | if padding is None: 92 | # no future time stamps available 93 | if causal: 94 | padding = (kernel_size - 1) * dilation 95 | else: 96 | padding = (kernel_size - 1) // 2 * dilation 97 | self.causal = causal 98 | 99 | if weight_normalization: 100 | assert bias 101 | self.conv = Conv1d(residual_channels, gate_channels, kernel_size, 102 | padding=padding, dilation=dilation, 103 | bias=bias, std_mul=1.0, *args, **kwargs) 104 | else: 105 | self.conv = conv.Conv1d(residual_channels, gate_channels, kernel_size, 106 | padding=padding, dilation=dilation, 107 | bias=bias, *args, **kwargs) 108 | 109 | # local conditioning 110 | if cin_channels > 0: 111 | self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, 112 | bias=bias, 113 | weight_normalization=weight_normalization) 114 | else: 115 | self.conv1x1c = None 116 | 117 | # global conditioning 118 | if gin_channels > 0: 119 | self.conv1x1g = Conv1d1x1(gin_channels, gate_channels, bias=bias, 120 | weight_normalization=weight_normalization) 121 | else: 122 | self.conv1x1g = None 123 | 124 | # conv output is split into two groups 125 | gate_out_channels = gate_channels // 2 126 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias, 127 | weight_normalization=weight_normalization) 128 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, bias=bias, 129 | weight_normalization=weight_normalization) 130 | 131 | def forward(self, x, c=None, g=None): 132 | return self._forward(x, c, g, False) 133 | 134 | def incremental_forward(self, x, c=None, g=None): 135 | return self._forward(x, c, g, True) 136 | 137 | def _forward(self, x, c, g, is_incremental): 138 | """Forward 139 | 140 | Args: 141 | x (Tensor): B x C x T 142 | c (Tensor): B x C x T, Local conditioning features 143 | g (Tensor): B x C x T, Expanded global conditioning features 144 | is_incremental (Bool) : Whether incremental mode or not 145 | 146 | Returns: 147 | Tensor: output 148 | """ 149 | residual = x 150 | x = F.dropout(x, p=self.dropout, training=self.training) 151 | if is_incremental: 152 | splitdim = -1 153 | x = self.conv.incremental_forward(x) 154 | else: 155 | splitdim = 1 156 | x = self.conv(x) 157 | # remove future time steps 158 | x = x[:, :, :residual.size(-1)] if self.causal else x 159 | 160 | a, b = x.split(x.size(splitdim) // 2, dim=splitdim) 161 | 162 | # local conditioning 163 | if c is not None: 164 | assert self.conv1x1c is not None 165 | c = _conv1x1_forward(self.conv1x1c, c, is_incremental) 166 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 167 | a, b = a + ca, b + cb 168 | 169 | # global conditioning 170 | if g is not None: 171 | assert self.conv1x1g is not None 172 | g = _conv1x1_forward(self.conv1x1g, g, is_incremental) 173 | ga, gb = g.split(g.size(splitdim) // 2, dim=splitdim) 174 | a, b = a + ga, b + gb 175 | 176 | x = torch.tanh(a) * torch.sigmoid(b) 177 | 178 | # For skip connection 179 | s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental) 180 | 181 | # For residual connection 182 | x = _conv1x1_forward(self.conv1x1_out, x, is_incremental) 183 | 184 | x = (x + residual) * math.sqrt(0.5) 
185 | return x, s 186 | 187 | def clear_buffer(self): 188 | for c in [self.conv, self.conv1x1_out, self.conv1x1_skip, 189 | self.conv1x1c, self.conv1x1g]: 190 | if c is not None: 191 | c.clear_buffer() 192 | -------------------------------------------------------------------------------- /code/lib/examples/unaligned_ljspeech_chars/rnn_unaligned_speech_ljspeech_nomask_blended_continue.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import numpy as np 5 | import tensorflow as tf 6 | from collections import namedtuple 7 | 8 | import logging 9 | import shutil 10 | from tfbldr.datasets import rsync_fetch, fetch_ljspeech 11 | from tfbldr.datasets import wavfile_caching_mel_tbptt_iterator 12 | from tfbldr.utils import next_experiment_path 13 | from tfbldr import get_logger 14 | from tfbldr import run_loop 15 | from tfbldr.nodes import Linear 16 | from tfbldr.nodes import Linear 17 | from tfbldr.nodes import LSTMCell 18 | from tfbldr.nodes import BiLSTMLayer 19 | from tfbldr.nodes import SequenceConv1dStack 20 | from tfbldr.nodes import Embedding 21 | from tfbldr.nodes import GaussianAttentionCell 22 | from tfbldr.nodes import DiscreteMixtureOfLogistics 23 | from tfbldr.nodes import DiscreteMixtureOfLogisticsCost 24 | from tfbldr.nodes import AdditiveGaussianNoise 25 | from tfbldr import scan 26 | 27 | if len(sys.argv) < 2: 28 | raise ValueError("Continue script only for continuing training of a previous model") 29 | 30 | seq_len = 256 31 | batch_size = 64 32 | window_mixtures = 10 33 | cell_dropout = .925 34 | #noise_scale = 8. 35 | prenet_units = 128 36 | n_filts = 128 37 | n_stacks = 3 38 | enc_units = 128 39 | dec_units = 512 40 | emb_dim = 15 41 | truncation_len = seq_len 42 | cell_dropout_scale = cell_dropout 43 | epsilon = 1E-8 44 | forward_init = "truncated_normal" 45 | rnn_init = "truncated_normal" 46 | 47 | basedir = "/Tmp/kastner/lj_speech/LJSpeech-1.0/" 48 | ljspeech = rsync_fetch(fetch_ljspeech, "leto01") 49 | 50 | # THESE CANNOT BE PAIRED (SOME MISSING), ITERATOR PAIRS THEM UP BY NAME 51 | wavfiles = ljspeech["wavfiles"] 52 | jsonfiles = ljspeech["jsonfiles"] 53 | 54 | model_path = sys.argv[1] 55 | seed = int(abs(hash(model_path))) % (2 ** 32 - 1) 56 | 57 | # THESE HAVE TO BE THE SAME TO ENSURE SPLIT IS CORRECT 58 | train_random_state = np.random.RandomState(seed) 59 | valid_random_state = np.random.RandomState(seed) 60 | 61 | train_itr = wavfile_caching_mel_tbptt_iterator(wavfiles, jsonfiles, batch_size, seq_len, stop_index=.95, shuffle=True, random_state=train_random_state) 62 | valid_itr = wavfile_caching_mel_tbptt_iterator(wavfiles, jsonfiles, batch_size, seq_len, start_index=.95, shuffle=True, random_state=valid_random_state) 63 | 64 | """ 65 | for i in range(10000): 66 | print(i) 67 | mels, mel_mask, text, text_mask, mask, mask_mask, reset = train_itr.next_masked_batch() 68 | """ 69 | 70 | # STRONG CHECK TO ENSURE NO OVERLAP IN TRAIN/VALID 71 | for tai in train_itr.all_indices_: 72 | assert tai not in valid_itr.all_indices_ 73 | for vai in valid_itr.all_indices_: 74 | assert vai not in train_itr.all_indices_ 75 | 76 | random_state = np.random.RandomState(1442) 77 | # use the max of the two blended types...
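# (character and phone vocabularies can differ in size; taking the max gives a single vocabulary size that covers either symbol stream)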
78 | vocabulary_size = max(train_itr.vocabulary_sizes) 79 | output_size = train_itr.n_mel_filters 80 | 81 | att_w_init = np.zeros((batch_size, 2 * enc_units)) 82 | att_k_init = np.zeros((batch_size, window_mixtures)) 83 | att_h_init = np.zeros((batch_size, dec_units)) 84 | att_c_init = np.zeros((batch_size, dec_units)) 85 | h1_init = np.zeros((batch_size, dec_units)) 86 | c1_init = np.zeros((batch_size, dec_units)) 87 | h2_init = np.zeros((batch_size, dec_units)) 88 | c2_init = np.zeros((batch_size, dec_units)) 89 | 90 | stateful_args = [att_w_init, 91 | att_k_init, 92 | att_h_init, 93 | att_c_init, 94 | h1_init, 95 | c1_init, 96 | h2_init, 97 | c2_init] 98 | 99 | with tf.Session() as sess: 100 | saver = tf.train.import_meta_graph(model_path + '.meta') 101 | logger = get_logger() 102 | logger.info("CONTINUING TRAINING FROM MODEL PATH {}".format(model_path)) 103 | saver.restore(sess, model_path) 104 | graph = tf.get_default_graph() 105 | 106 | fields = ["mels", 107 | "mel_mask", 108 | "in_mels", 109 | "in_mel_mask", 110 | "out_mels", 111 | "out_mel_mask", 112 | "text", 113 | "text_mask", 114 | "mask", 115 | "mask_mask", 116 | "bias", 117 | "cell_dropout", 118 | "prenet_dropout", 119 | "bn_flag", 120 | "pred", 121 | #"mix", "means", "lins", 122 | "att_w_init", 123 | "att_k_init", 124 | "att_h_init", 125 | "att_c_init", 126 | "h1_init", 127 | "c1_init", 128 | "h2_init", 129 | "c2_init", 130 | "att_w", 131 | "att_k", 132 | "att_phi", 133 | "att_h", 134 | "att_c", 135 | "h1", 136 | "c1", 137 | "h2", 138 | "c2", 139 | "loss", 140 | "train_step", 141 | "learning_rate"] 142 | vs = namedtuple('Params', fields)( 143 | *[tf.get_collection(name)[0] for name in fields] 144 | ) 145 | 146 | step_count = 0 147 | def loop(sess, itr, extras, stateful_args): 148 | """ 149 | global step_count 150 | global noise_scale 151 | step_count += 1 152 | if step_count > 10000: 153 | step_count = 0 154 | if noise_scale == 2: 155 | noise_scale = 1. 156 | else: 157 | noise_scale = noise_scale - 2. 
158 | if noise_scale < .5: 159 | noise_scale = .5 160 | """ 161 | mels, mel_mask, text, text_mask, mask, mask_mask, reset = itr.next_masked_batch() 162 | in_m = mels[:-1] 163 | in_mel_mask = mel_mask[:-1] 164 | 165 | #noise_block = np.clip(random_state.randn(*in_m.shape), -6, 6) 166 | #in_m = in_m + noise_scale * noise_block 167 | 168 | out_m = mels[1:] 169 | out_mel_mask = mel_mask[1:] 170 | 171 | att_w_init = stateful_args[0] 172 | att_k_init = stateful_args[1] 173 | att_h_init = stateful_args[2] 174 | att_c_init = stateful_args[3] 175 | h1_init = stateful_args[4] 176 | c1_init = stateful_args[5] 177 | h2_init = stateful_args[6] 178 | c2_init = stateful_args[7] 179 | 180 | att_w_init *= reset 181 | att_k_init *= reset 182 | att_h_init *= reset 183 | att_c_init *= reset 184 | h1_init *= reset 185 | c1_init *= reset 186 | h2_init *= reset 187 | c2_init *= reset 188 | 189 | feed = { 190 | vs.in_mels: in_m, 191 | vs.in_mel_mask: in_mel_mask, 192 | vs.out_mels: out_m, 193 | vs.out_mel_mask: out_mel_mask, 194 | vs.bn_flag: 0., 195 | vs.text: text, 196 | vs.text_mask: text_mask, 197 | vs.mask: mask, 198 | vs.mask_mask: mask_mask, 199 | vs.att_w_init: att_w_init, 200 | vs.att_k_init: att_k_init, 201 | vs.att_h_init: att_h_init, 202 | vs.att_c_init: att_c_init, 203 | vs.h1_init: h1_init, 204 | vs.c1_init: c1_init, 205 | vs.h2_init: h2_init, 206 | vs.c2_init: c2_init} 207 | outs = [vs.att_w, vs.att_k, 208 | vs.att_h, vs.att_c, 209 | vs.h1, vs.c1, vs.h2, vs.c2, 210 | vs.att_phi, 211 | vs.loss, vs.train_step] 212 | 213 | r = sess.run(outs, feed_dict=feed) 214 | 215 | att_w_np = r[0] 216 | att_k_np = r[1] 217 | att_h_np = r[2] 218 | att_c_np = r[3] 219 | h1_np = r[4] 220 | c1_np = r[5] 221 | h2_np = r[6] 222 | c2_np = r[7] 223 | att_phi_np = r[8] 224 | l = r[-2] 225 | _ = r[-1] 226 | 227 | # set next inits 228 | att_w_init = att_w_np[-1] 229 | att_k_init = att_k_np[-1] 230 | att_h_init = att_h_np[-1] 231 | att_c_init = att_c_np[-1] 232 | h1_init = h1_np[-1] 233 | c1_init = c1_np[-1] 234 | h2_init = h2_np[-1] 235 | c2_init = c2_np[-1] 236 | 237 | stateful_args = [att_w_init, 238 | att_k_init, 239 | att_h_init, 240 | att_c_init, 241 | h1_init, 242 | c1_init, 243 | h2_init, 244 | c2_init] 245 | return l, None, stateful_args 246 | 247 | run_loop(sess, 248 | loop, train_itr, 249 | loop, train_itr, 250 | continue_training=True, 251 | n_steps=1000000, 252 | n_train_steps_per=1000, 253 | train_stateful_args=stateful_args, 254 | n_valid_steps_per=0, 255 | valid_stateful_args=stateful_args) 256 | -------------------------------------------------------------------------------- /pretrained/cleaning.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | import cleaners 4 | from symbols import char_symbols 5 | from symbols import phone_symbols 6 | from symbols import pau_phone_symbols 7 | from eng_rules import hybrid_g2p, rulebased_g2p 8 | 9 | 10 | # Mappings from symbol to numeric ID and vice versa: 11 | _char_symbol_to_id = {s: i for i, s in enumerate(char_symbols)} 12 | _id_to_char_symbol = {i: s for i, s in enumerate(char_symbols)} 13 | 14 | _phone_symbol_to_id = {s: i for i, s in enumerate(phone_symbols)} 15 | _id_to_phone_symbol = {i: s for i, s in enumerate(phone_symbols)} 16 | 17 | _pau_phone_symbol_to_id = {s: i for i, s in enumerate(pau_phone_symbols)} 18 | _id_to_pau_phone_symbol = {i: s for i, s in enumerate(pau_phone_symbols)} 19 | 20 | # Regular expression matching text enclosed in curly braces: 21 | 
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 22 | 23 | 24 | def get_vocabulary_sizes(cleaner_names): 25 | """ 26 | if pause in name, return pause phone size 27 | if phone in name, return phone size 28 | else return char size 29 | """ 30 | outs = [] 31 | for cn in cleaner_names: 32 | if "pause" in cn: 33 | outs.append(len(_pau_phone_symbol_to_id)) 34 | elif "phone" in cn: 35 | outs.append(len(_phone_symbol_to_id)) 36 | else: 37 | outs.append(len(_char_symbol_to_id)) 38 | # needed? 39 | if len(outs) == 1: 40 | outs = outs[0] 41 | return outs 42 | 43 | 44 | def text_to_sequence(text, cleaner_names): 45 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 46 | 47 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 48 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 49 | 50 | Args: 51 | text: string to convert to a sequence 52 | cleaner_names: names of the cleaner functions to run the text through 53 | 54 | Returns: 55 | List of integers corresponding to the symbols in the text 56 | ''' 57 | if any(["rule" in name for name in cleaner_names]): 58 | raise ValueError("IMPLEMENT RULE TRANFORM") 59 | sequence = [] 60 | # Check for curly braces and treat their contents as ARPAbet: 61 | while len(text): 62 | m = _curly_re.match(text) 63 | if not m: 64 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 65 | break 66 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 67 | sequence += _arpabet_to_sequence(m.group(2)) 68 | text = m.group(3) 69 | 70 | # Append EOS token 71 | sequence.append(_symbol_to_id['~']) 72 | return sequence 73 | elif any(["pause" in name for name in cleaner_names]): 74 | sequence = [] 75 | # Check for curly braces and treat their contents as ARPAbet: 76 | while len(text): 77 | m = _curly_re.match(text) 78 | if not m: 79 | sequence += _pau_phone_symbols_to_sequence(_clean_text(text, cleaner_names)) 80 | break 81 | sequence += _pau_phone_symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 82 | sequence += _arpabet_to_sequence(m.group(2)) 83 | text = m.group(3) 84 | # Append EOS token 85 | sequence.append(_phone_symbol_to_id['~']) 86 | return sequence 87 | elif any(["phone" in name for name in cleaner_names]): 88 | sequence = [] 89 | # Check for curly braces and treat their contents as ARPAbet: 90 | while len(text): 91 | m = _curly_re.match(text) 92 | if not m: 93 | sequence += _phone_symbols_to_sequence(_clean_text(text, cleaner_names)) 94 | break 95 | sequence += _phone_symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 96 | sequence += _arpabet_to_sequence(m.group(2)) 97 | text = m.group(3) 98 | # Append EOS token 99 | sequence.append(_phone_symbol_to_id['~']) 100 | return sequence 101 | else: 102 | sequence = [] 103 | # Check for curly braces and treat their contents as ARPAbet: 104 | while len(text): 105 | m = _curly_re.match(text) 106 | if not m: 107 | sequence += _char_symbols_to_sequence(_clean_text(text, cleaner_names)) 108 | break 109 | sequence += _char_symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 110 | sequence += _arpabet_to_sequence(m.group(2)) 111 | text = m.group(3) 112 | 113 | # Append EOS token 114 | sequence.append(_char_symbol_to_id['~']) 115 | return sequence 116 | 117 | 118 | def sequence_to_text(sequence, cleaner_names): 119 | '''Converts a sequence of IDs back to a string''' 120 | if any(["rule" in name for name in cleaner_names]): 121 | raise ValueError("IMPLEMENT RULE TRANFORM") 122 | elif 
any(["pause" in name for name in cleaner_names]): 123 | result = "" 124 | space_id = _pau_phone_symbol_to_id[" "] 125 | pad_id = _pau_phone_symbol_to_id["_"] 126 | eos_id = _pau_phone_symbol_to_id["~"] 127 | special_ids = [_pau_phone_symbol_to_id[special] for special in ["1","2","3","4"]] 128 | for symbol_id in sequence: 129 | if symbol_id in [space_id, pad_id, eos_id]: 130 | result += _id_to_pau_phone_symbol[symbol_id] 131 | elif symbol_id in special_ids: 132 | result += _id_to_pau_phone_symbol[symbol_id] 133 | else: 134 | result += "@" + _id_to_pau_phone_symbol[symbol_id] 135 | return result 136 | elif any(["phone" in name for name in cleaner_names]): 137 | result = "" 138 | space_id = _phone_symbol_to_id[" "] 139 | pad_id = _phone_symbol_to_id["_"] 140 | eos_id = _phone_symbol_to_id["~"] 141 | special_ids = [_phone_symbol_to_id[special] for special in "!,:?"] 142 | for symbol_id in sequence: 143 | if symbol_id in [space_id, pad_id, eos_id] + special_ids: 144 | result += _id_to_phone_symbol[symbol_id] 145 | else: 146 | result += "@" + _id_to_phone_symbol[symbol_id] 147 | return result 148 | else: 149 | result = '' 150 | for symbol_id in sequence: 151 | if symbol_id in _id_to_char_symbol: 152 | s = _id_to_char_symbol[symbol_id] 153 | # Enclose ARPAbet back in curly braces: 154 | if len(s) > 1 and s[0] == '@': 155 | s = '{%s}' % s[1:] 156 | result += s 157 | return result.replace('}{', ' ') 158 | 159 | 160 | def _clean_text(text, cleaner_names): 161 | for name in cleaner_names: 162 | cleaner = getattr(cleaners, name) 163 | if not cleaner: 164 | raise Exception('Unknown cleaner: %s' % name) 165 | text = cleaner(text) 166 | return text 167 | 168 | 169 | def _char_symbols_to_sequence(symbols): 170 | return [_char_symbol_to_id[s] for s in symbols if _char_should_keep_symbol(s)] 171 | 172 | def _pau_phone_symbols_to_sequence(symbols): 173 | new = [] 174 | specials = ["1", "2", "3", "4"] 175 | for ss in symbols.split(" "): 176 | if any([special in ss for special in specials]): 177 | all_special = [special for special in ss if special in specials] 178 | all_non_special = [nonspecial[1:] for nonspecial in ss if nonspecial not in specials] 179 | prev = [] 180 | for ssi in ss.strip().split("@")[1:]: 181 | if any([special in ssi for special in specials]): 182 | prev.append(re.sub("|".join(specials), "", ssi)) 183 | which_specials = [special for special in specials if special in ssi] 184 | for p in prev: 185 | new.append(p) 186 | # ASSUME ONLY 1? 
187 | new.append(which_specials[0]) 188 | prev = [] 189 | else: 190 | prev.append(ssi) 191 | else: 192 | for ssi in ss.strip().split("@")[1:] + [" "]: 193 | new.append(ssi) 194 | return [_pau_phone_symbol_to_id[s] for s in new if _pau_phone_should_keep_symbol(s)] 195 | 196 | def _phone_symbols_to_sequence(symbols): 197 | new = [] 198 | for ss in symbols.split(" "): 199 | if any([special in ss for special in "!,:?"]): 200 | # special symbols only at start or back of chunk 201 | if ss[0] in "!,:?": 202 | for ssi in [ss[0]] + ss[1:].strip().split("@")[1:] + [" "]: 203 | new.append(ssi) 204 | elif ss[-1] in "!,:?": 205 | for ssi in ss[:-1].strip().split("@")[1:] + [ss[-1]] + [" "]: 206 | new.append(ssi) 207 | else: 208 | for ssi in ss.strip().split("@")[1:] + [" "]: 209 | new.append(ssi) 210 | #new = [ssi for ss in symbols.split(" ") for ssi in ss.strip().split("@")[1:] + [" "]][:-1] 211 | return [_phone_symbol_to_id[s] for s in new if _phone_should_keep_symbol(s)] 212 | 213 | def _arpabet_to_sequence(text): 214 | return _symbols_to_sequence(['@' + s for s in text.split()]) 215 | 216 | def _char_should_keep_symbol(s): 217 | return s in _char_symbol_to_id and s != '_' and s != '~' 218 | 219 | def _pau_phone_should_keep_symbol(s): 220 | return s in _pau_phone_symbol_to_id and s != '_' and s != '~' 221 | 222 | def _phone_should_keep_symbol(s): 223 | return s in _phone_symbol_to_id and s != '_' and s != '~' 224 | --------------------------------------------------------------------------------