├── .gitignore
├── __init__.py
├── choose_emotion_samples.py
├── emogst_hparams.py
├── emogst_train.py
├── eval.py
├── models
│   ├── __init__.py
│   ├── base.py
│   ├── emogst_tacotron2.py
│   ├── sygst_tacotron2.py
│   └── tacotron2.py
├── modules
│   ├── __init__.py
│   ├── attention.py
│   ├── custom_functions.py
│   ├── custom_layers.py
│   ├── decoders.py
│   ├── encoders.py
│   └── losses.py
├── predict_attention.py
├── prepare_meta_from_tfr.py
├── sygst_hparams.py
├── sygst_train.py
├── synthesizer.py
├── text
│   ├── __init__.py
│   ├── cleaners.py
│   ├── cmudict.py
│   ├── numbers.py
│   └── symbols.py
├── tfr_dset.py
└── utils
    ├── __init__.py
    ├── audio.py
    ├── ce_loss_util.py
    ├── center_loss_util.py
    ├── data.py
    ├── data.py.bak-0707
    ├── debug.py
    ├── index.html
    ├── infolog.py
    ├── mmd_utils.py
    ├── ops.py
    ├── parameter.py
    ├── plot.py
    └── tool_wrappers.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *DS_Store*
2 | *__pycache__*
3 | *.swp*
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | import models
2 | import modules
3 | import taco2_hparams
4 | import utils
--------------------------------------------------------------------------------
/choose_emotion_samples.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import shutil
4 | import numpy as np
5 |
6 | meta_file = 'bc2013/full_meta.txt'
7 | wavs_path = 'bc2013/wavs'
8 | mels_path = 'bc2013/mels'
9 | specs_path = 'bc2013/specs'
10 |
11 |
12 | def save_topk_files(saved_wavs_path, saved_mels_path, meta_path, lines):
13 |     # copy wavs and mels
14 |     for i, line in enumerate(lines):
15 |         wav_name = line[6] + '.wav'
16 |         mel_name = os.path.basename(line[7])
17 |         source_wav_name = os.path.join(wavs_path, wav_name)
18 |         source_mel_name = os.path.join(mels_path, mel_name)
19 |         saved_wav_name = os.path.join(saved_wavs_path, wav_name)
20 |         saved_mel_name = os.path.join(saved_mels_path, f'{i:03d}-' + mel_name)  # add a sorted prefix
21 |         os.makedirs(os.path.dirname(saved_wav_name), exist_ok=True)
22 |         os.makedirs(os.path.dirname(saved_mel_name), exist_ok=True)
23 |         shutil.copy(source_wav_name, saved_wav_name)
24 |         shutil.copy(source_mel_name, saved_mel_name)
25 |
26 |     # save meta lines
27 |     os.makedirs(os.path.dirname(meta_path), exist_ok=True)
28 |     lines = ['{}|{}|{}|{}|{}|{}|{}|{}|{}\n'.format(*line) for line in lines]
29 |     with open(meta_path, 'w') as fw:
30 |         fw.writelines(lines)
31 |
32 |
33 | def choose_emo_samples(meta_lines, base_dir='tmp', top_k=100, min_chars=50, min_frames=200):
34 |     mels_path = os.path.join(base_dir, 'emo_mel_npys/emo{}')
35 |     wavs_path = os.path.join(base_dir, 'emo_wavs/emo{}')
36 |     meta_path = os.path.join(base_dir, 'emo_metas/emo{}.txt')
37 |     for i in range(4):
38 |         topk_lines = []
39 |         # line[0] is the 4 categories of soft label: [neutral, angry, happy, sad]
40 |         sorted_lines = sorted(meta_lines, key=lambda x: x[0][i], reverse=True)
41 |         for line in sorted_lines:
42 |             if line[3] >= min_chars and line[4] >= min_frames:
43 |                 if i == 0:
44 |                     topk_lines.append(line)
45 |                 # elif i == 1 and line[1][1] > 0.5 and line[2][0] > 0.5:  # angry
46 |                 elif i == 1 and line[1][1] > 0.5 and line[2][0] > 0.45:  # angry
47 |
topk_lines.append(line) 48 | elif i == 2 and line[1][1] > 0.5 and line[2][1] > 0.45: # happy 49 | topk_lines.append(line) 50 | # elif i == 3 and line[1][0] > 0.5 and line[2][0] > 0.5: # sad 51 | elif i == 3 and line[1][0] > 0.40 and line[2][0] > 0.45: # sad 52 | topk_lines.append(line) 53 | if len(topk_lines) == top_k: 54 | break 55 | save_topk_files(wavs_path.format(i), mels_path.format(i), meta_path.format(i), topk_lines) 56 | 57 | 58 | def choose_aro_samples(meta_lines, base_dir='tmp', top_k=100, min_chars=50, min_frames=200): 59 | mels_path = os.path.join(base_dir, 'emo2d_mel_npys/arousal{}') 60 | wavs_path = os.path.join(base_dir, 'emo2d_wavs/arousal{}') 61 | meta_path = os.path.join(base_dir, 'emo2d_metas/arousal{}.txt') 62 | for i in range(2): 63 | topk_lines = [] 64 | # line[1] is the arousal soft label 65 | sorted_lines = sorted(meta_lines, key=lambda x: x[1][i], reverse=True) 66 | for line in sorted_lines: 67 | if line[3] >= min_chars and line[4] >= min_frames: 68 | if i == 0 and np.argmax(line[0]) in [0, 3]: 69 | topk_lines.append(line) 70 | elif i == 1 and np.argmax(line[0]) in [1, 2]: 71 | topk_lines.append(line) 72 | if len(topk_lines) == top_k: 73 | break 74 | save_topk_files(wavs_path.format(i), mels_path.format(i), meta_path.format(i), topk_lines) 75 | 76 | 77 | def choose_val_samples(meta_lines, base_dir='tmp', top_k=100, min_chars=50, min_frames=200): 78 | mels_path = os.path.join(base_dir, 'emo2d_mel_npys/valence{}') 79 | wavs_path = os.path.join(base_dir, 'emo2d_wavs/valence{}') 80 | meta_path = os.path.join(base_dir, 'emo2d_metas/valence{}.txt') 81 | for i in range(2): 82 | topk_lines = [] 83 | # line[2] is the valence soft label 84 | sorted_lines = sorted(meta_lines, key=lambda x: x[2][i], reverse=True) 85 | for line in sorted_lines: 86 | if line[3] >= min_chars and line[4] >= min_frames: 87 | if i == 0 and np.argmax(line[0]) in [1, 3]: 88 | topk_lines.append(line) 89 | elif i == 1 and np.argmax(line[0]) in [0, 2]: 90 | topk_lines.append(line) 91 | if len(topk_lines) == top_k: 92 | break 93 | save_topk_files(wavs_path.format(i), mels_path.format(i), meta_path.format(i), topk_lines) 94 | 95 | 96 | def main(): 97 | with open(meta_file, 'r') as fr: 98 | meta_lines = fr.readlines() 99 | 100 | def parse_emostrs(line): 101 | line[0] = json.loads(line[0]) # emotion 4 categories label 102 | line[1] = json.loads(line[1]) # arousal label 103 | line[2] = json.loads(line[2]) # valence label 104 | line[3] = int(line[3]) # text length 105 | line[4] = int(line[4]) # mel frame number 106 | return line 107 | meta_lines = [line.strip().split('|') for line in meta_lines if line.strip()[0] != '#'] 108 | meta_lines = list(map(parse_emostrs, meta_lines)) 109 | 110 | """ 111 | choose_emo_samples(meta_lines, 'emogst_emo_data') 112 | choose_aro_samples(meta_lines, 'sygst_emo_data') 113 | choose_val_samples(meta_lines, 'sygst_emo_data') 114 | """ 115 | choose_emo_samples(meta_lines, top_k=200) 116 | choose_aro_samples(meta_lines, top_k=200) 117 | choose_val_samples(meta_lines, top_k=200) 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /emogst_hparams.py: -------------------------------------------------------------------------------- 1 | from utils.parameter import HParams 2 | 3 | hp = HParams( 4 | # text 5 | cleaners='english_cleaners', 6 | 7 | # audio 8 | num_mels=80, 9 | num_spec=1025, # n_fft / 2 + 1 only used when adding linear spectrograms post processing network 10 | sample_rate=16000, 11 | 
win_ms=50,    # 50 ms window (0.05 * sample_rate); at sample_rate=16000 this is 800 samples. (If None, win_size = n_fft)
12 |     hop_ms=12.5,  # 12.5 ms hop (0.0125 * sample_rate); at sample_rate=16000 this is 200 samples
13 |     n_fft=2048,
14 |     min_level_db=-100,
15 |     ref_level_db=20,
16 |     fmin=95,  # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
17 |     fmax=7600,  # To be increased/reduced depending on data.
18 |     preemphasis=0.97,  # filter coefficient.
19 |     griffin_lim_power=1.5,  # Only used in G&L inversion; values between 1.2 and 1.5 are usually a good choice.
20 |     griffin_lim_iters=60,  # Number of G&L iterations; typically 30 is enough, but we use 60 to ensure convergence.
21 |
22 |     # Tacotron
23 |     outputs_per_step=3,  # number of frames to generate at each decoding step (increasing it speeds up computation and allows a higher batch size, but decreases G&L audio quality)
24 |     feed_last_frame=True,  # whether to feed all r frames or only the last of the r frames
25 |     stop_at_any=True,  # Determines whether the decoder should stop when predicting <stop> for any frame or for all of them (True works pretty well)
26 |     clip_outputs=True,  # Whether to clip spectrograms to T2_output_range (even in loss computation), i.e. don't penalize the model for exceeding the output range; bring values back to the borders.
27 |     lower_bound_decay=0.0,  # Small regularizer for noise synthesis by adding a small range of penalty for silence regions. Set to 0 to clip in the Tacotron range.
28 |     clip_min=0,
29 |     clip_max=1,
30 |
31 |     # Input parameters
32 |     num_symbols=150,
33 |     embedding_dim=512,  # dimension of embedding space
34 |
35 |     # Encoder parameters
36 |     encoder_type='taco2',  # 'taco' selects the CBHG encoder
37 |     encoder_cnns=[3, 5, 512],  # num_layers, kernel_size, channels
38 |     encoder_rnns_units=256,  # number of lstm units for each direction (forward and backward)
39 |
40 |     # reference encoder parameters
41 |     reference_channels=[32, 32, 64, 64, 128, 128],
42 |     reference_rnn_units=128,
43 |
44 |     # gst parameters
45 |     gst_heads=4,
46 |     gst_tokens=10,
47 |     gst_units=256,
48 |     gst_atten_units=128,
49 |     gst_atten_type='mlp',  # attention type for the gst self-attention module (dot or mlp)
50 |     gst_activation=None,
51 |     gst_trainable=True,  # False in the nvidia gst code
52 |
53 |     # emotion parameters
54 |     emo_used=False,
55 |     emotion_embedding_units=128,
56 |
57 |     # Attention mechanism
58 |     smoothing=False,  # Whether to smooth the attention normalization function
59 |     attention_type='location',  # sma: stepwise monotonic; location: location sensitive
60 |     attention_units=128,  # dimension of attention space
61 |     attention_filters=32,  # number of attention convolution filters
62 |     attention_kernel_size=(31, ),  # kernel size of attention convolution
63 |     attention_sma_normalize=True,
64 |     attention_sma_sigmoid_noise=2.0,
65 |     attention_sma_sigmoid_noise_seed=None,
66 |     attention_sma_score_bias_init=3.5,
67 |     attention_sma_mode='parallel',
68 |
69 |     # Attention synthesis constraints
70 |     # "Monotonic" constraint forces the model to only look at the forward synthesis_win_size steps.
71 |     # "Window" allows the model to look at synthesis_win_size neighbors, both forward and backward steps.
72 |     synthesis_constraint=True,  # Whether to use attention window constraints in synthesis only (useful for long utterances)
73 |     # synthesis_constraint_type='window',  # can be in ('window', 'monotonic').
74 |     synthesis_win_size=7,  # Size of each side of the window; the current step does not count. If the mode is 'window' and synthesis_win_size is odd, the 1 extra step is given to the backward part of the window.
75 |     synthesis_softmax_temp=1.0,
76 |
77 |     # Decoder
78 |     prenet_units=[256, 256],  # number and sizes of prenet dense layers
79 |     attention_rnn_units=[1024, 1024],  # number and sizes of decoder lstm layers
80 |     decode_rnn_units=None,  # number of decoder lstm units on each layer
81 |     max_iters=2000,  # Max decoder steps during inference (just for safety from infinite-loop cases)
82 |     impute_finished=False,
83 |     frame_activation='relu',
84 |
85 |     # Residual postnet
86 |     postnet_cnns=[5, 5, 512],  # num_layers, kernel_size, channels
87 |
88 |     # CBHG mel->linear postnet
89 |     post_cbhg=True,
90 |     cbhg_kernels=8,  # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act as "K-grams"
91 |     cbhg_conv_channels=128,  # Channels of the convolution bank
92 |     cbhg_pool_size=2,  # pooling size of the CBHG
93 |     cbhg_projection=256,  # projection channels of the CBHG (1st projection; the 2nd is automatically set to num_mels)
94 |     cbhg_projection_kernel_size=3,  # kernel_size of the CBHG projections
95 |     cbhg_highway_nums=4,  # Number of HighwayNet layers
96 |     cbhg_highway_units=128,  # Number of units used in HighwayNet fully connected layers
97 |     cbhg_rnn_units=128,  # Number of GRU units used in the bidirectional RNN of the CBHG block. CBHG output is 2x rnn_units in shape
98 |
99 |     # Loss params
100 |     mask_encoder=True,  # whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence.
101 |     mask_decoder=False,  # set False for alignments converging faster
102 |     cross_entropy_pos_weight=20,  # Use class weights to reduce the stop-token class imbalance (by adding more penalty on False Negatives (FN)) (1 = disabled)
103 |     mel_loss='mae',
104 |     spec_loss='mae',
105 |
106 |
107 |     # Tacotron Training
108 |     # Reproduction seeds
109 |     random_seed=5339,  # Determines initial graph and operations (i.e. model) random state for reproducibility
110 |     # tacotron_data_random_state=1234,  # random state for train/test split repeatability
111 |
112 |     # performance parameters
113 |     tacotron_swap_with_cpu=False,  # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause major slowdowns! Only use when critical!)
114 |
115 |     # train/test split ratios, mini-batch sizes
116 |     batch_size=32,  # number of training samples on each training step
117 |     # Tacotron batch synthesis supports ~16x the training batch size (no gradients during testing).
118 |     # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times different from training. We thus recommend masking the encoder.
119 |     tacotron_synthesis_batch_size=1,  # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
120 |     tacotron_test_size=0.05,  # % of data to keep as test data; if None, tacotron_test_batches must not be None. (5% is enough to have a good idea about overfit)
121 |     tacotron_test_batches=None,  # number of test batches.
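    # Worked example for the split above: with batch_size=32 and
    # tacotron_test_size=0.05, a corpus of 10000 utterances would keep
    # 500 utterances for test, i.e. roughly 15 test batches (500 // 32).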
122 | 123 | # Learning rate schedule 124 | decay_learning_rate=True, # boolean, determines if the learning rate will follow an exponential decay 125 | start_decay=40000, # Step at which learning decay starts 126 | decay_steps=18000, # Determines the learning rate decay slope (UNDER TEST) 127 | decay_rate=0.5, # learning rate decay rate (UNDER TEST) 128 | # initial_learning_rate=1e-3, # starting learning rate 129 | initial_learning_rate=0.002, 130 | final_learning_rate=1e-4, # minimal learning rate 131 | 132 | # Optimization parameters 133 | adam_beta1=0.9, # AdamOptimizer beta1 parameter 134 | adam_beta2=0.999, # AdamOptimizer beta2 parameter 135 | adam_epsilon=1e-6, # AdamOptimizer Epsilon parameter 136 | 137 | # Regularization parameters 138 | # reg_weight=1e-6, # regularization weight (for L2 regularization) 139 | reg_weight=None, # regularization weight (for L2 regularization) 140 | scale_regularization=False, # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 141 | zoneout_rate=0.1, # zoneout rate for all LSTM cells in the network 142 | dropout_rate=0.5, # dropout rate for all convolutional layers + prenet 143 | clip_gradients=True, # whether to clip gradients 144 | ) 145 | -------------------------------------------------------------------------------- /emogst_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import time 4 | import argparse 5 | import traceback 6 | import numpy as np 7 | import tensorflow as tf 8 | from datetime import datetime 9 | 10 | 11 | from tfr_dset import TFDataSet 12 | from text import sequence_to_text 13 | from utils import audio, plot, infolog, ValueWindow # , debug 14 | 15 | from emogst_hparams import hp 16 | from models.emogst_tacotron2 import Tacotron2EMOGST 17 | 18 | log = infolog.log 19 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 20 | 21 | 22 | _max_step = 500000 23 | 24 | # spec_length max = 1116 25 | # text length max = 99 26 | 27 | 28 | def time_string(): 29 | return datetime.now().strftime('%Y-%m-%d %H:%M') 30 | 31 | 32 | def debug_data(batch=32, time_in=100, time_out=500): 33 | text_x = np.random.randint(0, 150, size=(batch, time_in), dtype=np.int32) 34 | mel = np.random.randn(batch, time_out, 80).astype(np.float32) 35 | spec = np.random.randn(batch, time_out, 1025).astype(np.float32) 36 | spec_len = np.random.randint(time_out // 2, time_out, size=batch, dtype=np.int32) 37 | aro_label = np.random.rand(batch, 2).astype(np.float32) 38 | val_label = np.random.rand(batch, 2).astype(np.float32) 39 | 40 | print('text_input:', text_x[0], 'spec_len:', spec_len, sep='\n') 41 | return text_x, mel, spec, spec_len, aro_label, val_label 42 | 43 | 44 | def train(log_dir, args): 45 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 46 | # input_path = os.path.join(args.base_dir, args.input) 47 | log(hp.to_string(), is_print=False) 48 | log('Loading training data from: %s' % args.tfr_dir) 49 | log('Checkpoint path: %s' % checkpoint_path) 50 | log('Using model: emogst tacotron2') 51 | 52 | tf_dset = TFDataSet(hp, args.tfr_dir) 53 | feats = tf_dset.get_train_next() 54 | # Set up model: 55 | global_step = tf.Variable(0, name='global_step', trainable=False) 56 | training = tf.placeholder_with_default(True, shape=(), name='training') 57 | with tf.name_scope('model'): 58 | model = Tacotron2EMOGST(hp) 59 | model(feats['inputs'], 60 | mel_inputs=feats['mel_targets'], 61 | 
spec_inputs=feats['linear_targets'], 62 | spec_lengths=feats['spec_lengths'], 63 | ref_inputs=feats['mel_targets'], 64 | ref_lengths=feats['spec_lengths'], 65 | emo_labels=feats['soft_emo_labels'], 66 | training=training) 67 | model.add_loss() 68 | model.add_optimizer(global_step) 69 | stats = model.add_stats() 70 | 71 | # Bookkeeping: 72 | step = 0 73 | time_window = ValueWindow(100) 74 | loss_window = ValueWindow(100) 75 | saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2) 76 | 77 | # Train! 78 | config = tf.ConfigProto(allow_soft_placement=True, 79 | gpu_options=tf.GPUOptions(allow_growth=True)) 80 | with tf.Session(config=config) as sess: 81 | try: 82 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 83 | sess.run(tf.global_variables_initializer()) 84 | if args.restore_step: 85 | # Restore from a checkpoint if the user requested it. 86 | restore_path = '%s-%s' % (checkpoint_path, args.restore_step) 87 | saver.restore(sess, restore_path) 88 | log('Resuming from checkpoint: %s' % restore_path, slack=True) 89 | else: 90 | log('Starting a new training run ...', slack=True) 91 | 92 | """ 93 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, 94 | model.spec_loss, model.stop_loss, model.emo_loss, model.mel_grad_norms_max, 95 | model.spec_grad_norms_max, model.stop_grad_norms_max, model.emo_grad_norms_max] 96 | """ 97 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, 98 | model.spec_loss, model.stop_loss, model.emo_loss] 99 | for _ in range(_max_step): 100 | start_time = time.time() 101 | # sess.run(debug.get_ops()) 102 | # step, _, loss, mel_loss, spec_loss, stop_loss, emo_loss, mel_g, spec_g, stop_g, emo_g = sess.run(fetches) 103 | step, _, loss, mel_loss, spec_loss, stop_loss, emo_loss = sess.run(fetches) 104 | time_window.append(time.time() - start_time) 105 | loss_window.append(loss) 106 | """ 107 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,el=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,eg=%.4f]' % ( 108 | step, time_window.average, mel_loss, spec_loss, stop_loss, emo_loss, mel_g, spec_g, stop_g, emo_g) 109 | """ 110 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,el=%.3f]' % ( 111 | step, time_window.average, mel_loss, spec_loss, stop_loss, emo_loss) 112 | log(message, slack=(step % args.checkpoint_interval == 0)) 113 | 114 | if loss > 100 or math.isnan(loss): 115 | log('Loss exploded to %.5f at step %d!' 
% (loss, step), slack=True) 116 | raise Exception('Loss Exploded') 117 | 118 | if step % args.summary_interval == 0: 119 | log('Writing summary at step: %d' % step) 120 | try: 121 | summary_writer.add_summary(sess.run(stats), step) 122 | except Exception as e: 123 | log(f'summary failed and ignored: {str(e)}') 124 | 125 | if step % args.checkpoint_interval == 0: 126 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 127 | saver.save(sess, checkpoint_path, global_step=step) 128 | log('Saving audio and alignment...') 129 | gt_mel, gt_spec, seq, mel, spec, align = sess.run([model.mel_targets[0], model.spec_targets[0], 130 | model.text_targets[0], model.mel_outputs[0], 131 | model.spec_outputs[0], model.alignment_outputs[0]]) 132 | text = sequence_to_text(seq) 133 | wav = audio.inv_spectrogram(hp, spec.T) 134 | wav_path = os.path.join(log_dir, 'step-%d-audio.wav' % step) 135 | mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step) 136 | spec_path = os.path.join(log_dir, 'step-%d-spec.png' % step) 137 | align_path = os.path.join(log_dir, 'step-%d-align.png' % step) 138 | info = '%s, %s, step=%d, loss=%.5f\n %s' % (args.model, time_string(), step, loss, text) 139 | plot.plot_alignment(align, align_path, info=info) 140 | plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel) 141 | plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec) 142 | audio.save_wav(hp, wav, wav_path) 143 | log('Input: %s' % text) 144 | 145 | except Exception as e: 146 | log('Exiting due to exception: %s' % e, slack=True) 147 | traceback.print_exc() 148 | 149 | 150 | def main(): 151 | parser = argparse.ArgumentParser() 152 | parser.add_argument('--gpu', default='0') 153 | parser.add_argument('--log', '-l', default='') 154 | parser.add_argument('--restore_step', '-r', default=None) 155 | parser.add_argument('--tfr_dir', default='bc2013/training/tfrs_with_emo_feature') 156 | args = parser.parse_args() 157 | 158 | args.model = 'emogst_taco2' 159 | args.summary_interval = 200 160 | args.checkpoint_interval = 5000 161 | # args.summary_interval = 2 162 | # args.checkpoint_interval = 5 163 | 164 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 165 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 166 | log_dir = 'emogst_logs' + ('_' + args.log if args.log else '') 167 | os.makedirs(log_dir, exist_ok=True) 168 | 169 | tf.set_random_seed(hp.random_seed) 170 | infolog.init(os.path.join(log_dir, 'train.log'), args.model) 171 | 172 | train(log_dir, args) 173 | 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thuhcsi/icassp2021-emotion-tts/45bc25405e7c0f51f45727cc91cd41573c05bc65/models/__init__.py -------------------------------------------------------------------------------- /models/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | # from tensorflow.keras import layers 5 | from tensorflow.keras import backend as K 6 | 7 | from utils.infolog import log 8 | # from utils.debug import debug_print 9 | from modules import custom_layers as cl 10 | from modules import custom_functions as cf 11 | from modules.losses import get_mel_loss, get_spec_loss, get_stop_loss 12 | 13 | 14 | class TacotronBase(keras.Model): 15 | def __init__(self, hp, 16 | encoder, 17 | decoder, 18 | 
postnet=None,
19 |                  postcbhg=None,
20 |                  name='TacoBase'):
21 |         super(TacotronBase, self).__init__(name=name)
22 |         self.hp = hp
23 |         self.encoder = encoder
24 |         self.decoder = decoder
25 |         self.postnet = postnet
26 |         self.postcbhg = postcbhg
27 |
28 |     def call(self, text_inputs, mel_inputs=None,
29 |              spec_inputs=None, spec_lengths=None, training=None):
30 |
31 |         self.training = K.learning_phase() if training is None else training
32 |         self.batch_size = tf.shape(text_inputs)[0]
33 |
34 |         # trim the inputs to a length that is a multiple of r
35 |         if mel_inputs is not None and spec_inputs is not None:
36 |             mel_inputs, spec_inputs, spec_lengths = cf.trim_inputs(
37 |                 self.hp.outputs_per_step, mel_inputs, spec_inputs, spec_lengths)
38 |
39 |         # encoder
40 |         encoder_outputs = self.encoder_call(text_inputs)
41 |
42 |         # set values for attention layer and text_input_shape for decoder_cell
43 |         self.atten_layer.set_values_keys(values=encoder_outputs)
44 |         self.decoder_cell.set_batch_timesteps(self.batch_size, tf.shape(encoder_outputs)[1])
45 |
46 |         # decoder
47 |         outputs = self.decoder_call(mel_inputs, spec_inputs, spec_lengths)
48 |         return outputs
49 |
50 |     def encoder_call(self, text_inputs):
51 |         # encoder: takes text (char ids) as input, outputs text embeddings (with mask)
52 |         # [batch, text_time, embedding_dim]
53 |         encoder_outputs = self.encoder(text_inputs, training=self.training)
54 |
55 |         self.text_targets = text_inputs
56 |         self.encoder_outputs = encoder_outputs
57 |
58 |         return encoder_outputs
59 |
60 |     def decoder_call(self, mel_inputs=None, spec_inputs=None, spec_lengths=None):
61 |         hp = self.hp
62 |         training = self.training
63 |         batch_size = self.batch_size
64 |
65 |         # decoder: takes mels as input, outputs mels, stop_tokens, alignments, and seq_lengths
66 |         # mel [batch, mel_time / r, mel_num * r], stop_token [batch, mel_time / r, r]
67 |         # alignments [batch, mel_time / r, text_time], seq_length [batch]
68 |         decoder_outputs = self.decoder(inputs=mel_inputs,
69 |                                        inputs_lengths=spec_lengths,
70 |                                        batch_size=batch_size,
71 |                                        training=training)
72 |         (decoder_mel, stop_outputs, alignments), _, seq_length_outputs = decoder_outputs
73 |
74 |         # output r frames of mel at each time step
75 |         # mel [batch, mel_time, mel_num], stop_token [batch, mel_time, 1]
76 |         decoder_mel = tf.reshape(decoder_mel, shape=[batch_size, -1, hp.num_mels])
77 |         stop_outputs = tf.reshape(stop_outputs, shape=[batch_size, -1])
78 |         if hp.clip_outputs:
79 |             c_min, c_max = hp.clip_min - hp.lower_bound_decay, hp.clip_max
80 |             decoder_mel = tf.clip_by_value(decoder_mel, c_min, c_max)
81 |
82 |         # Postnet
83 |         if self.postnet:
84 |             residual_mel = self.postnet(decoder_mel, training)
85 |         else:
86 |             residual_mel = 0.
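        # With the defaults above (clip_min=0, clip_max=1, lower_bound_decay=0.0),
        # the clipping below keeps mel values in [0, 1]: e.g. a predicted value of
        # -0.2 becomes 0.0 and 1.3 becomes 1.0, so the loss never penalizes the
        # model for exceeding the output range (see the clip_outputs hparam note).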
87 | 88 | # Mel outputs 89 | mel_outputs = decoder_mel + residual_mel 90 | if hp.clip_outputs: 91 | c_min, c_max = hp.clip_min - hp.lower_bound_decay, hp.clip_max 92 | # mel_outputs = mel_outputs - 0.05 93 | # mel_outputs = tf.where(mel_outputs < 0.3, mel_outputs - 0.05, mel_outputs) 94 | mel_outputs = tf.clip_by_value(mel_outputs, c_min, c_max) 95 | 96 | # CBHG convert mel to linear spectrum 97 | if self.postcbhg: 98 | self.spec_projection = cl.FrameProjection(hp.num_spec, hp.frame_activation) 99 | post_outputs = self.postcbhg(mel_outputs, training) 100 | spec_outputs = self.spec_projection(post_outputs) 101 | 102 | if hp.clip_outputs: 103 | c_min, c_max = hp.clip_min - hp.lower_bound_decay, hp.clip_max 104 | # spec_outputs = spec_outputs - 0.05 105 | # spec_outputs = tf.where(spec_outputs < 0.6, spec_outputs - 0.1, spec_outputs) 106 | spec_outputs = tf.clip_by_value(spec_outputs, c_min, c_max) 107 | self.spec_outputs = spec_outputs 108 | 109 | self.mel_targets = mel_inputs 110 | self.spec_targets = spec_inputs 111 | self.spec_length_targets = spec_lengths 112 | 113 | self.mel_outputs = mel_outputs 114 | self.mel_decoder_outputs = decoder_mel 115 | self.stop_outputs = stop_outputs 116 | self.alignment_outputs = tf.transpose(alignments, [0, 2, 1]) # [batch, encode_time, decode_time] 117 | self.seq_length_outputs = seq_length_outputs 118 | 119 | self.all_vars = tf.trainable_variables() 120 | 121 | is_print = True 122 | log(f'{self.name} Model Dimensions: ', is_print=is_print) 123 | log(' text embedding: %d' % self.encoder.embed_output.shape[-1], is_print=is_print) 124 | log(' encoder out: %d' % self.encoder_outputs.shape[-1], is_print=is_print) 125 | log(' decoder out: %d' % mel_outputs.shape[-1], is_print=is_print) 126 | log(' postcbhg out: %d' % post_outputs.shape[-1], is_print=is_print) 127 | log(' linear out: %d' % spec_outputs.shape[-1], is_print=is_print) 128 | log(' Model Parameters {:.3f} Million.'.format(np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000)) 129 | 130 | return mel_outputs, seq_length_outputs 131 | 132 | def add_loss(self, name='loss'): 133 | '''Adds loss to the model. Sets "loss" field. 
initialize must have been called.''' 134 | 135 | priority_freq_n = int(2000 / (self.hp.sample_rate * 0.5) * self.hp.num_spec) 136 | with tf.name_scope(name): 137 | mel_loss, spec_loss = self.hp.mel_loss, self.hp.spec_loss 138 | if self.hp.mask_decoder: 139 | self.before_mel_loss = get_mel_loss(self.mel_targets, self.mel_decoder_outputs, 140 | self.spec_length_targets, method=mel_loss) 141 | self.after_mel_loss = get_mel_loss(self.mel_targets, self.mel_outputs, 142 | self.spec_length_targets, method=mel_loss) 143 | self.spec_loss = get_spec_loss(self.spec_targets, self.spec_outputs, priority_freq_n, 144 | self.spec_length_targets, method=spec_loss) 145 | self.stop_loss = get_stop_loss(None, self.stop_outputs, self.hp.outputs_per_step, 146 | self.spec_length_targets, do_mask=True, 147 | pos_weight=self.hp.cross_entropy_pos_weight) 148 | else: 149 | self.before_mel_loss = get_mel_loss(self.mel_targets, self.mel_decoder_outputs, method=mel_loss) 150 | self.after_mel_loss = get_mel_loss(self.mel_targets, self.mel_outputs, method=mel_loss) 151 | self.spec_loss = get_spec_loss(self.spec_targets, self.spec_outputs, priority_freq_n, method=spec_loss) 152 | self.stop_loss = get_stop_loss(None, self.stop_outputs, self.hp.outputs_per_step, 153 | self.spec_length_targets) 154 | 155 | self.reg_loss = tf.constant(0.0) 156 | if self.hp.reg_weight is not None: 157 | """ 158 | self.reg_vars = [v for v in self.all_vars if not('bias' in v.name or 'atten_cell' in v.name 159 | or '_projection' in v.name or 'embeddings' in v.name 160 | or 'gru' in v.name or 'lstm' in v.name)] 161 | """ 162 | self.reg_vars = [v for v in self.all_vars if not('bias' in v.name or '_projection' in v.name or 'embeddings' in v.name)] 163 | self.reg_loss = self.hp.reg_weight * tf.add_n([tf.nn.l2_loss(v) for v in self.reg_vars], name='reg_loss') 164 | 165 | self.mel_loss = self.before_mel_loss + self.after_mel_loss 166 | self.loss = self.mel_loss + self.spec_loss + self.stop_loss + self.reg_loss 167 | 168 | def add_optimizer(self, global_step, update_step=True, name='optimizer'): 169 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 
170 |         # Arguments
171 |             global_step: int32 scalar Tensor representing current global step in training
172 |         '''
173 |         with tf.name_scope(name):
174 |             hp = self.hp
175 |             if hp.decay_learning_rate:
176 |                 self.learning_rate = self.add_learning_rate(hp.initial_learning_rate, global_step)
177 |             else:
178 |                 self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate)
179 |             optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
180 |             gradients, variables = zip(*optimizer.compute_gradients(self.loss))
181 |             clipped_gradients = [tf.clip_by_value(g, -1.0, 1.0) for g in gradients]  # gradient explosion happens frequently without this
182 |             clipped_gradients, global_norm = tf.clip_by_global_norm(clipped_gradients, 1.0)
183 |
184 |             with tf.control_dependencies(self.updates):
185 |                 self.optimize = optimizer.apply_gradients(
186 |                     zip(clipped_gradients, variables),
187 |                     global_step=global_step if update_step else None,
188 |                 )
189 |
190 |             self.optimizer = optimizer
191 |             self.gradients = gradients
192 |             self.global_gradient_norm = global_norm
193 |
194 |     def add_stats(self, name='stats'):
195 |         with tf.name_scope(name):
196 |             tf.summary.scalar('loss', self.loss)
197 |             tf.summary.scalar('reg_loss', self.reg_loss)
198 |             tf.summary.scalar('mel_loss', self.mel_loss)
199 |             tf.summary.scalar('spec_loss', self.spec_loss)
200 |             tf.summary.scalar('stop_loss', self.stop_loss)
201 |             tf.summary.scalar('learning_rate', self.learning_rate)
202 |             tf.summary.histogram('spec_outputs', self.spec_outputs)
203 |             tf.summary.histogram('spec_targets', self.spec_targets)
204 |             tf.summary.histogram('mel_outputs', self.mel_outputs)
205 |             tf.summary.histogram('mel_targets', self.mel_targets)
206 |
207 |             self.total_grad_norms = [tf.norm(g) for g in self.gradients]
208 |             self.reg_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.reg_loss) if g[0] is not None]
209 |             self.mel_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.mel_loss) if g[0] is not None]
210 |             self.spec_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.spec_loss) if g[0] is not None]
211 |             self.stop_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.stop_loss) if g[0] is not None]
212 |             self.total_grad_norms_max = tf.reduce_max(self.total_grad_norms)
213 |             self.reg_grad_norms_max = tf.reduce_max(self.reg_grad_norms)
214 |             self.mel_grad_norms_max = tf.reduce_max(self.mel_grad_norms)
215 |             self.spec_grad_norms_max = tf.reduce_max(self.spec_grad_norms)
216 |             self.stop_grad_norms_max = tf.reduce_max(self.stop_grad_norms)
217 |
218 |             tf.summary.scalar('global_grad_norm', self.global_gradient_norm)
219 |             tf.summary.scalar('total_grad_norms_max', self.total_grad_norms_max)
220 |             tf.summary.scalar('reg_grad_norms_max', self.reg_grad_norms_max)
221 |             tf.summary.scalar('mel_grad_norms_max', self.mel_grad_norms_max)
222 |             tf.summary.scalar('spec_grad_norms_max', self.spec_grad_norms_max)
223 |             tf.summary.scalar('stop_grad_norms_max', self.stop_grad_norms_max)
224 |
225 |             tf.summary.histogram('total_grad_norms', self.total_grad_norms)
226 |             tf.summary.histogram('reg_grad_norms', self.reg_grad_norms)
227 |             tf.summary.histogram('mel_grad_norms', self.mel_grad_norms)
228 |             tf.summary.histogram('spec_grad_norms', self.spec_grad_norms)
229 |             tf.summary.histogram('stop_grad_norms', self.stop_grad_norms)
230 |             return tf.summary.merge_all()
231 |
232 |     def add_learning_rate(self, init_lr, global_step):
233 |         # Noam scheme from tensor2tensor:
234 |         warmup_steps = 4000.0
235 |         step = tf.cast(global_step + 1, dtype=tf.float32)
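        # In closed form: for step < warmup_steps the term step * warmup_steps**-1.5
        # dominates the minimum (linear warmup); afterwards step**-0.5 dominates
        # (inverse-sqrt decay). E.g. with initial_learning_rate=0.002, the peak at
        # step 4000 is 0.002 * 4000**0.5 * 4000**-0.5 = 0.002.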
236 |         return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)
237 |
--------------------------------------------------------------------------------
/models/emogst_tacotron2.py:
--------------------------------------------------------------------------------
1 | # import numpy as np
2 | import tensorflow as tf
3 | from tensorflow import keras
4 | from tensorflow.keras import layers
5 | from tensorflow.keras import backend as K
6 |
7 | # from utils.debug import debug_print
8 |
9 | from modules import custom_layers as cl
10 | from modules import custom_functions as cf
11 | from modules.attention import GSTAttention
12 |
13 | from models.tacotron2 import Tacotron2
14 |
15 |
16 | class Tacotron2EMOGST(Tacotron2):
17 |     def __init__(self, hp, gta=False, name='taco2_emogst'):
18 |         super(Tacotron2EMOGST, self).__init__(hp, gta, name=name)
19 |         self.reference_encoder = cl.ReferenceEncoder(
20 |             channels=hp.reference_channels,
21 |             rnn_units=hp.reference_rnn_units,
22 |             output_units=hp.emotion_embedding_units)
23 |         self.gst_attention = GSTAttention(
24 |             num_heads=hp.gst_heads,
25 |             num_tokens=hp.gst_tokens,
26 |             gst_units=hp.gst_units,
27 |             attention_units=hp.gst_atten_units,
28 |             attention_type=hp.gst_atten_type,
29 |             activation=hp.gst_activation,
30 |             trainable=hp.gst_trainable)
31 |
32 |     def call(self,
33 |              text_inputs,
34 |              mel_inputs=None,
35 |              spec_inputs=None,
36 |              spec_lengths=None,
37 |              ref_inputs=None,
38 |              ref_lengths=None,
39 |              emo_labels=None,
40 |              atten_weights_ph=None,
41 |              training=None):
42 |
43 |         self.training = K.learning_phase() if training is None else training
44 |         self.batch_size = tf.shape(text_inputs)[0]
45 |
46 |         # trim the inputs to a length that is a multiple of r
47 |         if mel_inputs is not None and spec_inputs is not None:
48 |             mel_inputs, spec_inputs, spec_lengths = cf.trim_inputs(
49 |                 self.hp.outputs_per_step, mel_inputs, spec_inputs, spec_lengths)
50 |
51 |         # set reference to mel if it is not given
52 |         if ref_inputs is None:
53 |             ref_inputs = mel_inputs
54 |             ref_lengths = spec_lengths
55 |
56 |         # encoder
57 |         encoder_outputs = self.encoder_call(text_inputs)
58 |
59 |         # reference encoder
60 |         ref_outputs = None
61 |         if ref_inputs is not None:  # the training flag used to be omitted here, which caused NaN gradients in the BN parameters
62 |             ref_outputs = self.reference_encoder(ref_inputs, x_length=ref_lengths, training=training)
63 |         gst_outputs = self.gst_attention(ref_outputs, atten_weights_ph=atten_weights_ph, training=training)
64 |
65 |         self.ref_outputs = ref_outputs  # [N, ref_output_units]
66 |         self.gst_outputs = gst_outputs  # [N, 1, gst_units]
67 |         self.gst_weights = self.gst_attention.atten_weights  # [N, gst_heads, 1, gst_tokens]
68 |
69 |         self.add_emotion_task(self.gst_weights)
70 |         self.emo_labels = emo_labels
71 |
72 |         gst_outputs = tf.tile(gst_outputs, [1, tf.shape(encoder_outputs)[1], 1])
73 |         encoder_outputs = tf.concat([encoder_outputs, gst_outputs], axis=-1)
74 |
75 |         # set values for attention layer and batch_timesteps for decoder_cell
76 |         self.atten_layer.set_values_keys(values=encoder_outputs)
77 |         self.decoder_cell.set_batch_timesteps(self.batch_size, tf.shape(encoder_outputs)[1])
78 |
79 |         # decoder
80 |         outputs = self.decoder_call(mel_inputs, spec_inputs, spec_lengths)
81 |         return outputs
82 |
83 |     def add_emotion_task(self, gst_weights):
84 |         if self.hp.emo_used:
85 |             weights_dim = self.hp.gst_heads * self.hp.gst_tokens
86 |             gst_weights = tf.reshape(gst_weights, [-1, weights_dim])
87 |             self.emo_logits = layers.Dense(4, name='emo_dense')(gst_weights)
88 |         else:
89 |             self.emo_logits = None
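    # Shape walk-through for add_emotion_task, using the emogst_hparams values
    # (gst_heads=4, gst_tokens=10): gst_weights of shape [N, 4, 1, 10] is
    # reshaped to [N, 40] and projected by emo_dense to 4 logits, one per
    # soft emotion class [neutral, angry, happy, sad].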
90 |
91 |     def add_loss(self):
92 |         self.emo_loss = tf.constant(0.0)
93 |         if self.emo_logits is not None:
94 |             loss_fn = keras.losses.CategoricalCrossentropy(from_logits=True)
95 |             self.emo_loss = loss_fn(self.emo_labels, self.emo_logits)
96 |
97 |         super().add_loss()
98 |         self.loss = self.loss + self.emo_loss
99 |
100 |     def add_stats(self):
101 |         with tf.variable_scope('stats'):
102 |             if self.hp.emo_used:
103 |                 emo_grads = self.optimizer.compute_gradients(self.emo_loss)
104 |                 self.emo_grad_norms = [tf.norm(g[0]) for g in emo_grads if g[0] is not None]
105 |                 self.emo_grad_norms_max = tf.reduce_max(self.emo_grad_norms)
106 |                 tf.summary.scalar('emo_grad_norms_max', self.emo_grad_norms_max)
107 |                 tf.summary.histogram('emo_grad_norms', self.emo_grad_norms)
108 |             else:
109 |                 self.emo_grad_norms = tf.constant(0)
110 |                 self.emo_grad_norms_max = tf.constant(0)
111 |
112 |             tf.summary.scalar('emo_loss', self.emo_loss)
113 |             super().add_stats(name='base_stats')
114 |             return tf.summary.merge_all()
--------------------------------------------------------------------------------
/models/sygst_tacotron2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow import keras
4 | from tensorflow.keras import layers
5 | from tensorflow.keras import backend as K
6 |
7 | # from utils.debug import debug_print
8 |
9 | from modules import custom_layers as cl
10 | from modules import custom_functions as cf
11 | from modules.attention import GSTAttention
12 |
13 | from models.tacotron2 import Tacotron2
14 |
15 |
16 | class Tacotron2SYGST(Tacotron2):
17 |     def __init__(self, hp, gta=False, name='taco2_sygst'):
18 |         super(Tacotron2SYGST, self).__init__(hp, gta, name=name)
19 |         self.reference_encoder = cl.ReferenceEncoder(
20 |             channels=hp.reference_channels,
21 |             rnn_units=hp.reference_rnn_units,
22 |             output_units=hp.emotion_embedding_units)
23 |         self.gst_attention = GSTAttention(
24 |             num_heads=hp.gst_heads,
25 |             num_tokens=hp.gst_tokens,
26 |             gst_units=hp.gst_units,
27 |             attention_units=hp.gst_atten_units,
28 |             attention_type=hp.gst_atten_type,
29 |             activation=hp.gst_activation,
30 |             trainable=hp.gst_trainable)
31 |
32 |     def call(self,
33 |              text_inputs,
34 |              mel_inputs=None,
35 |              spec_inputs=None,
36 |              spec_lengths=None,
37 |              ref_inputs=None,
38 |              ref_lengths=None,
39 |              arousal_labels=None,
40 |              valence_labels=None,
41 |              atten_weights_ph=None,
42 |              training=None):
43 |
44 |         self.training = K.learning_phase() if training is None else training
45 |         self.batch_size = tf.shape(text_inputs)[0]
46 |
47 |         # trim the inputs to a length that is a multiple of r
48 |         if mel_inputs is not None and spec_inputs is not None:
49 |             mel_inputs, spec_inputs, spec_lengths = cf.trim_inputs(
50 |                 self.hp.outputs_per_step, mel_inputs, spec_inputs, spec_lengths)
51 |
52 |         # set reference to mel if it is not given
53 |         if ref_inputs is None:
54 |             ref_inputs = mel_inputs
55 |             ref_lengths = spec_lengths
56 |
57 |         # encoder
58 |         encoder_outputs = self.encoder_call(text_inputs)
59 |
60 |         # reference encoder
61 |         ref_outputs = None
62 |         if ref_inputs is not None:  # the training flag used to be omitted here, which caused NaN gradients in the BN parameters
63 |             ref_outputs = self.reference_encoder(ref_inputs, x_length=ref_lengths, training=training)
64 |         gst_outputs = self.gst_attention(ref_outputs, atten_weights_ph=atten_weights_ph, training=training)
65 |
66 |         self.ref_outputs = ref_outputs  # [N, ref_output_units]
67 |         self.gst_outputs =
gst_outputs # [N, 1, gst_units] 68 | self.gst_weights = self.gst_attention.atten_weights # [N, gst_heads, 1, gst_tokens] 69 | 70 | self.add_emotion_task(self.gst_weights) 71 | self.arousal_labels = arousal_labels 72 | self.valence_labels = valence_labels 73 | 74 | gst_outputs = tf.tile(gst_outputs, [1, tf.shape(encoder_outputs)[1], 1]) 75 | encoder_outputs = tf.concat([encoder_outputs, gst_outputs], axis=-1) 76 | 77 | # set values for attention layer and batch_timesteps for decoder_cell 78 | self.atten_layer.set_values_keys(values=encoder_outputs) 79 | self.decoder_cell.set_batch_timesteps(self.batch_size, tf.shape(encoder_outputs)[1]) 80 | 81 | # decoder 82 | outputs = self.decoder_call(mel_inputs, spec_inputs, spec_lengths) 83 | return outputs 84 | 85 | def add_emotion_task(self, gst_weights): 86 | units, emo_loss = self.hp.emo_output_units, self.hp.emo_loss 87 | units = 1 if emo_loss in ['mae', 'mse', 'sigmoid'] else units 88 | 89 | if self.hp.emo_used: 90 | arousal_weights, valence_weights = tf.split(gst_weights, 2, axis=1) 91 | weights_dim = np.prod(arousal_weights.shape[1:]) 92 | arousal_weights = tf.reshape(arousal_weights, [-1, weights_dim]) 93 | valence_weights = tf.reshape(valence_weights, [-1, weights_dim]) 94 | self.arousal_logits = layers.Dense(units, name='aro_dense')(arousal_weights) 95 | self.valence_logits = layers.Dense(units, name='val_dense')(valence_weights) 96 | else: 97 | self.arousal_logits = None 98 | self.valence_logits = None 99 | 100 | def add_loss(self): 101 | emo_loss = self.hp.emo_loss 102 | if emo_loss in ['mae', 'mse']: 103 | loss_fn = keras.losses.get(emo_loss) 104 | elif emo_loss == 'sigmoid': 105 | loss_fn = keras.losses.BinaryCrossentropy(from_logits=True) 106 | elif emo_loss == 'softmax': 107 | loss_fn = keras.losses.CategoricalCrossentropy(from_logits=True) 108 | else: 109 | raise ValueError(f'The emo_loss={emo_loss} is not valid') 110 | 111 | self.arousal_loss = tf.constant(0.0) 112 | self.valence_loss = tf.constant(0.0) 113 | if self.arousal_logits is not None: 114 | self.arousal_loss = loss_fn(self.arousal_labels, self.arousal_logits) 115 | self.valence_loss = loss_fn(self.valence_labels, self.valence_logits) 116 | self.emo_loss = self.arousal_loss + self.valence_loss 117 | 118 | super().add_loss() 119 | self.loss = self.loss + self.emo_loss 120 | 121 | def add_stats(self, name='stats'): 122 | with tf.name_scope(name): 123 | if self.hp.emo_used: 124 | aro_grads = self.optimizer.compute_gradients(self.arousal_loss) 125 | val_grads = self.optimizer.compute_gradients(self.valence_loss) 126 | self.aro_grad_norms = [tf.norm(g[0]) for g in aro_grads if g[0] is not None] 127 | self.val_grad_norms = [tf.norm(g[0]) for g in val_grads if g[0] is not None] 128 | self.aro_grad_norms_max = tf.reduce_max(self.aro_grad_norms) 129 | self.val_grad_norms_max = tf.reduce_max(self.val_grad_norms) 130 | 131 | tf.summary.scalar('aro_grad_norms_max', self.aro_grad_norms_max) 132 | tf.summary.scalar('val_grad_norms_max', self.val_grad_norms_max) 133 | tf.summary.histogram('aro_grad_norms', self.aro_grad_norms) 134 | tf.summary.histogram('val_grad_norms', self.val_grad_norms) 135 | else: 136 | self.aro_grad_norms = tf.constant(0) 137 | self.val_grad_norms = tf.constant(0) 138 | self.aro_grad_norms_max = tf.constant(0) 139 | self.val_grad_norms_max = tf.constant(0) 140 | 141 | tf.summary.scalar('aro_loss', self.arousal_loss) 142 | tf.summary.scalar('val_loss', self.valence_loss) 143 | super().add_stats(name='base_stats') 144 | return tf.summary.merge_all() 145 | 
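# Note: sygst_train.py is not part of this excerpt; Tacotron2SYGST is driven
# the same way Tacotron2EMOGST is in emogst_train.py, but with separate
# arousal/valence soft labels instead of emo_labels. A minimal sketch (the
# two feature keys are assumptions; everything else mirrors emogst_train.py):
#
#     model = Tacotron2SYGST(hp)
#     model(feats['inputs'],                        # feats from TFDataSet(hp, tfr_dir).get_train_next()
#           mel_inputs=feats['mel_targets'],
#           spec_inputs=feats['linear_targets'],
#           spec_lengths=feats['spec_lengths'],
#           ref_inputs=feats['mel_targets'],
#           ref_lengths=feats['spec_lengths'],
#           arousal_labels=feats['soft_aro_labels'],  # assumed key
#           valence_labels=feats['soft_val_labels'],  # assumed key
#           training=True)
#     model.add_loss()
#     model.add_optimizer(global_step)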
--------------------------------------------------------------------------------
/models/tacotron2.py:
--------------------------------------------------------------------------------
1 | # import tensorflow as tf
2 | from tensorflow import keras
3 | # from tensorflow.keras import layers
4 | # from tensorflow.keras import backend as K
5 |
6 | # from utils.debug import debug_print
7 | from modules import custom_layers as cl
8 | from modules.encoders import TacotronEncoder
9 | from modules.encoders import Tacotron2Encoder
10 | from modules.decoders import DecoderCell, Decoder
11 | from modules.attention import LocationSensitiveAttention
12 | from modules.attention import StepwiseMonotonicAttention
13 |
14 | from models.base import TacotronBase
15 |
16 |
17 | class Tacotron2(TacotronBase):
18 |     def __init__(self, hp, gta=False, name='Tacotron2'):
19 |         super(keras.Model, self).__init__(name=name)
20 |         self.hp = hp
21 |         self.gta = gta
22 |
23 |         if hp.encoder_type == 'taco2':  # tacotron2 encoder
24 |             self.encoder = Tacotron2Encoder(hp)
25 |         elif hp.encoder_type == 'taco':  # actually the cbhg encoder
26 |             self.encoder = TacotronEncoder(hp)
27 |         else:
28 |             raise ValueError('encoder_type must be in [taco2, taco]')
29 |
30 |         if hp.attention_type == 'location':  # best not to change the order in which these modules are defined
31 |             self.atten_layer = LocationSensitiveAttention(
32 |                 units=hp.attention_units,
33 |                 location_filters=hp.attention_filters,
34 |                 location_kernel_size=hp.attention_kernel_size,
35 |                 synthesis_constraint=hp.synthesis_constraint,
36 |                 synthesis_win_size=hp.synthesis_win_size,
37 |                 synthesis_softmax_temp=hp.synthesis_softmax_temp
38 |             )
39 |         elif hp.attention_type == 'sma':
40 |             self.atten_layer = StepwiseMonotonicAttention(
41 |                 units=hp.attention_units,
42 |                 normalize=hp.attention_sma_normalize,
43 |                 sigmoid_noise=hp.attention_sma_sigmoid_noise,
44 |                 sigmoid_noise_seed=hp.attention_sma_sigmoid_noise_seed,
45 |                 score_bias_init=hp.attention_sma_score_bias_init,
46 |                 mode=hp.attention_sma_mode
47 |             )
48 |         else:
49 |             raise ValueError('attention_type must be in [location, sma]')
50 |
51 |         self.decoder_prenet = cl.Prenet(units=hp.prenet_units, drop_rate=hp.dropout_rate)
52 |         self.atten_rnn_cell = cl.AttentionRNNCell(units=hp.attention_rnn_units,
53 |                                                   zone_rate=hp.zoneout_rate)
54 |         self.frame_projection = cl.FrameProjection(units=hp.outputs_per_step * hp.num_mels,
55 |                                                    activation=hp.frame_activation)
56 |         self.stop_projection = cl.StopProjection(units=hp.outputs_per_step)
57 |         self.decoder_cell = DecoderCell(prenet=self.decoder_prenet,
58 |                                         attention_rnn=self.atten_rnn_cell,
59 |                                         attention_layer=self.atten_layer,
60 |                                         frame_projection=self.frame_projection,
61 |                                         stop_projection=self.stop_projection)
62 |         self.decoder = Decoder(hp,
63 |                                self.decoder_cell,
64 |                                gta=gta,
65 |                                impute_finished=hp.impute_finished,
66 |                                maximum_steps=hp.max_iters)
67 |
68 |         self.postnet = cl.Postnet(num_layers=hp.postnet_cnns[0],
69 |                                   kernel_size=hp.postnet_cnns[1],
70 |                                   channels=hp.postnet_cnns[2],
71 |                                   drop_rate=hp.dropout_rate,
72 |                                   output_units=hp.num_mels,
73 |                                   output_activation=hp.frame_activation)
74 |
75 |         self.postcbhg = None
76 |         if hp.post_cbhg:
77 |             self.postcbhg = cl.CBHG(K=hp.cbhg_kernels,
78 |                                     conv_channels=hp.cbhg_conv_channels,
79 |                                     pool_size=hp.cbhg_pool_size,
80 |                                     projections=[hp.cbhg_projection, hp.num_mels],
81 |                                     highway_units=hp.cbhg_highway_units,
82 |                                     highway_nums=hp.cbhg_highway_nums,
83 |                                     rnn_units=hp.cbhg_rnn_units,
84 |                                     name='post_cbhg')
85 |
86 |         super(Tacotron2, self).__init__(hp=hp,
87 |                                         encoder=self.encoder,
88 |                                         decoder=self.decoder,
89 |                                         postnet=self.postnet,
90 |                                         postcbhg=self.postcbhg,
91 |                                         name=name)
--------------------------------------------------------------------------------
/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thuhcsi/icassp2021-emotion-tts/45bc25405e7c0f51f45727cc91cd41573c05bc65/modules/__init__.py
--------------------------------------------------------------------------------
/modules/custom_functions.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def trim_inputs(r, mel_inputs, spec_inputs, spec_lengths):
5 |     """Trim the inputs' lengths to the maximum multiple of r
6 |     """
7 |     r = tf.cast(r, tf.int32)
8 |     mel_inputs = tf.cast(mel_inputs, tf.float32)    # tf.cast returns the original tensor if its dtype already matches
9 |     spec_inputs = tf.cast(spec_inputs, tf.float32)  # for an np.ndarray it creates a tensor, and it can convert the dtype
10 |     spec_lengths = tf.cast(spec_lengths, tf.int32)  # tf.convert_to_tensor, by contrast, raises an error on dtype mismatch
11 |
12 |     max_len = tf.reduce_max(spec_lengths)
13 |     max_len = tf.cast(max_len / r, dtype=tf.int32) * r  # cast to int32 <=> floor
14 |
15 |     mel_inputs = mel_inputs[:, : max_len, :]
16 |     spec_inputs = spec_inputs[:, : max_len, :]
17 |     spec_lengths = tf.clip_by_value(spec_lengths, 0, max_len)
18 |     return mel_inputs, spec_inputs, spec_lengths
--------------------------------------------------------------------------------
/modules/custom_layers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | # from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.compat.v1.logging import warn
5 | from tensorflow.python.keras import backend as K
6 | from tensorflow.python.keras.utils import tf_utils
7 |
8 | # from warnings import warn
9 | # from utils.debug import debug_print
10 |
11 |
12 | class ConvBlock(layers.Layer):
13 |     def __init__(self,
14 |                  mode,
15 |                  conv_type,
16 |                  channels,
17 |                  kernel_size,
18 |                  dropout_rate=None,
19 |                  activation='relu',
20 |                  batch_norm=True,
21 |                  strides=1,
22 |                  padding='same',
23 |                  do_mask=False,
24 |                  name='conv_block'):
25 |         """stack conv(c), activation(a), batch norm(b), dropout(d) layers
26 |         # Arguments
27 |             mode: a str with chars in ['c', 'a', 'b', 'd'] denoting the order
28 |                 in which the corresponding layers are called
29 |             conv_type: '1D' '2D' '3D' for Conv1D, Conv2D, Conv3D
30 |         """
31 |         super(ConvBlock, self).__init__(name=name)
32 |         conv_map = {'1D': layers.Conv1D, '2D': layers.Conv2D, '3D': layers.Conv3D}
33 |         conv_layer = conv_map.get(conv_type)
34 |
35 |         self.conv = conv_layer(channels, kernel_size, strides, padding=padding, name='conv')
36 |         self.activation = layers.Activation(activation) if activation else None
37 |         self.bn = layers.BatchNormalization(name='bn') if batch_norm else None
38 |         self.dropout = layers.Dropout(dropout_rate, name='drop') if dropout_rate else None
39 |
40 |         self.do_mask = do_mask
41 |         self.block_mode = mode
42 |         self.supports_masking = True
43 |         self.conv.supports_masking = True
44 |
45 |     def call(self, x, training=None, mask=None):  # the mask argument makes this a mask-consuming layer
46 |         layer_map = {'c': self.conv, 'b': self.bn,
47 |                      'a': self.activation, 'd': self.dropout}
48 |
49 |         for c in self.block_mode:
50 |             layer = layer_map.get(c)
51 |             # assert layer is not None, 'Layer at conv block mode but not been built'
52 |             if layer is not None:
53 |                 if c in ['b', 'd']:
54 |                     x = layer(x, training=training)
55 |                 else:
56 |                     x = layer(x)
57 |             else:
58 |                 warn(RuntimeWarning(f'ConvBlock: {c} in mode str but layer not built'))
59 |
60 |         if self.do_mask and mask is not None:
61 |             mask = tf.cast(mask, 'float32')
62 |             x = x * tf.expand_dims(mask, axis=-1)  # [N, T_in, 1]
63 |         return x
64 |
65 |
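# Usage note for ConvBlock's mode string (illustrative): mode='cabd' applies
# conv -> activation -> batch norm -> dropout in that order. For example,
# Postnet below builds ConvBlock('cabd', '1D', 512, 5, 0.5, 'tanh') layers
# (conv1d + tanh + BN + dropout with the hparams values), while its last
# layer uses 'cbd' to skip the activation before the residual projection.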
66 | class ZoneoutLSTMCell(layers.LSTMCell):
67 |     def __init__(self,
68 |                  units,
69 |                  output_rate=0.,
70 |                  cell_rate=None,
71 |                  name='zolstm',
72 |                  **kwargs):
73 |         cell_rate = output_rate if cell_rate is None else cell_rate
74 |         assert output_rate >= 0 and output_rate <= 1
75 |         assert cell_rate >= 0 and cell_rate <= 1
76 |
77 |         super(ZoneoutLSTMCell, self).__init__(units, name=name, **kwargs)
78 |         self._cell_dropout = layers.Dropout(cell_rate, name='drop_cell')
79 |         self._output_dropout = layers.Dropout(output_rate, name='drop_output')
80 |         self._output_rate = output_rate
81 |         self._cell_rate = cell_rate
82 |
83 |     def call(self, x, state, training=None):
84 |         '''Runs the vanilla LSTM cell and applies zoneout.
85 |         '''
86 |         # print('DEBUG custom_layers: training', training)  # RNN() calls cell.call twice
87 |         output, new_state = super().call(x, state, training)  # <=> LSTMCell.call(self, x, state)
88 |         pre_c, pre_h = state
89 |         new_c, new_h = new_state
90 |
91 |         c = (1 - self._cell_rate) * self._cell_dropout(new_c - pre_c, training) + pre_c
92 |         h = (1 - self._output_rate) * self._output_dropout(new_h - pre_h, training) + pre_h
93 |
94 |         new_state = [c, h]
95 |         return output, new_state
96 |
97 |     def get_config(self):
98 |         config = {'output_rate': self._output_rate,
99 |                   'cell_rate': self._cell_rate}
100 |         super_config = super(ZoneoutLSTMCell, self).get_config()
101 |         config.update(super_config)
102 |         return config
103 |
104 |
105 | class AttentionRNNCell(layers.StackedRNNCells):
106 |     def __init__(self,
107 |                  units=[1024, 1024],
108 |                  zone_rate=0.1,
109 |                  name='atten_cell'):
110 |         self.units = units
111 |         self.zone_rate = zone_rate
112 |         self.lstm_cells = [ZoneoutLSTMCell(k, zone_rate) for k in units]
113 |         super(AttentionRNNCell, self).__init__(cells=self.lstm_cells, name=name)
114 |
115 |     def get_initial_state(self, inputs=None, batch_size=None, dtype=tf.float32):
116 |         return super().get_initial_state(inputs, batch_size, dtype=dtype)
117 |
118 |
119 | class Prenet(layers.Layer):
120 |     def __init__(self,
121 |                  units,
122 |                  drop_rate=0.5,
123 |                  activation='relu',
124 |                  name='prenet'):
125 |         super(Prenet, self).__init__(name=name)
126 |         self.units = units
127 |         self.drop_rate = drop_rate
128 |         self.activation = activation
129 |         self.dense_layers = [layers.Dense(k, activation, name='dense') for k in units]
130 |         self.drop_layers = [layers.Dropout(drop_rate, name='drop') for k in range(len(units))]
131 |
132 |     def call(self, x):
133 |         for dense, drop in zip(self.dense_layers, self.drop_layers):
134 |             x = dense(x)
135 |             x = drop(x, training=True)  # dropout stays on even at inference, as in the Tacotron 2 prenet
136 |         return x
137 |
138 |
139 | class StopProjection(layers.Layer):
140 |     """Projection to a scalar and through a sigmoid activation
141 |     """
142 |     def __init__(self,
143 |                  units=1,
144 |                  activation='sigmoid',
145 |                  name='stop_projection'):
146 |         super(StopProjection, self).__init__(name=name)
147 |         self.units = units
148 |         self.activation = activation
149 |         self.dense = layers.Dense(units, None, name='dense')
150 |         self.activation_layer = layers.Activation(activation)
151 |
152 |     def call(self, x, training=None):
153 |         if training is None:
154 |             training = K.learning_phase()
155 |         x = self.dense(x)
156 |         x = tf_utils.smart_cond(training,
157 |                                 true_fn=lambda: tf.identity(x),
158 |                                 false_fn=lambda: self.activation_layer(x))
159
| return x 160 | 161 | 162 | class FrameProjection(layers.Layer): 163 | """Projection layer to r * num_mels dimensions or num_mels dimensions 164 | """ 165 | def __init__(self, 166 | units=80, 167 | activation='relu', # our mel is normalized to [0, 1]; sygst is None 168 | # activation=None, # sygst and rayhame are None 169 | name='frame_projection'): 170 | super(FrameProjection, self).__init__(name=name) 171 | self.units = units 172 | self.activation = activation 173 | self.dense = layers.Dense(units, activation, name='dense') 174 | 175 | def call(self, x): 176 | x = self.dense(x) 177 | return x 178 | 179 | 180 | class Postnet(layers.Layer): 181 | def __init__(self, 182 | num_layers, 183 | channels, 184 | kernel_size, 185 | drop_rate, 186 | output_units, 187 | output_activation, 188 | name='postnet'): 189 | """ 190 | # Arguments 191 | output_units: it is usually the num_mels for mel residual connection 192 | """ 193 | super(Postnet, self).__init__(name=name) 194 | self.cnns = [ConvBlock('cabd', '1D', channels, kernel_size, drop_rate, 'tanh') 195 | for i in range(num_layers - 1)] 196 | self.last_cnn = ConvBlock('cbd', '1D', channels, kernel_size, drop_rate, None) 197 | self.dim_projection = FrameProjection(output_units, output_activation, name='postnet_proj') 198 | 199 | def call(self, x, training=None): 200 | for cnn in self.cnns: 201 | x = cnn(x, training) 202 | x = self.last_cnn(x, training) 203 | x = self.dim_projection(x) 204 | return x 205 | 206 | 207 | class HighwayNet(layers.Layer): 208 | def __init__(self, units, name='highway_net'): 209 | super(HighwayNet, self).__init__(name=name) 210 | self.units = units 211 | self.h_layer = layers.Dense(units, 'relu', name='H') 212 | self.t_layer = layers.Dense(units, 'sigmoid', name='T', 213 | bias_initializer=tf.constant_initializer(-1.)) 214 | 215 | self.supports_masking = True 216 | 217 | def call(self, x): 218 | h = self.h_layer(x) 219 | t = self.t_layer(x) 220 | x = h * t + x * (1. 
222 |
223 |
224 | class CBHG(layers.Layer):
225 | def __init__(self,
226 | K,
227 | conv_channels,
228 | pool_size,
229 | projections,
230 | highway_units,
231 | highway_nums,
232 | rnn_units,
233 | name='cbhg'):
234 | super(CBHG, self).__init__(name=name)
235 | self.K = K
236 | self.conv_channels = conv_channels
237 | self.pool_size = pool_size
238 | self.projections = projections
239 | self.highway_units = highway_units
240 | self.highway_nums = highway_nums
241 | self.rnn_units = rnn_units
242 |
243 | self.conv_banks = [ConvBlock('cab', '1D', conv_channels, k, name='conv_banks')
244 | for k in range(1, K + 1)]
245 | self.pool = layers.MaxPool1D(pool_size, strides=1, padding='same', name='pool')
246 | acts = ['relu'] * (len(projections) - 1) + [None]
247 | self.conv_projections = [ConvBlock('cab', '1D', c, 3, None, a, name='conv_projs')
248 | for c, a in zip(projections, acts)]
249 | self.highway_nets = [HighwayNet(highway_units, name='highway')
250 | for _ in range(highway_nums)]
251 | self.gru_layer = layers.Bidirectional(layers.GRU(rnn_units, return_sequences=True),
252 | name='bigru')
253 | self.dim_dense = layers.Dense(highway_units, name='dim_dense') # built once here; creating it inside call would make fresh, untracked weights on every invocation
254 | self.supports_masking = True
255 |
256 | def call(self, x, training=None):
257 | original_x = x
258 | # K conv banks: concat on the last axis to stack channels from all convs
259 | x = tf.concat([cnn(x, training=training) for cnn in self.conv_banks], axis=-1)
260 |
261 | # MaxPooling
262 | x = self.pool(x)
263 |
264 | # 2-layer conv projections
265 | for conv_proj in self.conv_projections:
266 | x = conv_proj(x, training)
267 |
268 | # Residual connection
269 | x = original_x + x
270 |
271 | # 4-layer HighwayNet
272 | if x.shape[2] != self.highway_units:
273 | x = self.dim_dense(x)
274 | for highway in self.highway_nets:
275 | x = highway(x)
276 |
277 | # 1-layer bidirectional GRU
278 | x = self.gru_layer(x, training=training)
279 | return x
280 |
281 |
282 | class ReferenceEncoder(layers.Layer):
283 | def __init__(self,
284 | channels,
285 | kernel_size=(3, 3),
286 | strides=(2, 2),
287 | rnn_units=128,
288 | output_units=128,
289 | dropout_rate=0.5,
290 | name='ref_encoder'):
291 | super(ReferenceEncoder, self).__init__(name=name)
292 | self.stride = strides[0]
293 | self.rnn_units = rnn_units
294 | self.output_units = output_units
295 |
296 | # 6-layer conv2d
297 | self.cnns = [ConvBlock('cabd', '2D', c, kernel_size, dropout_rate, strides=strides)
298 | for c in channels]
299 |
300 | # 1-layer bi-gru
301 | single_layer = layers.GRU(rnn_units, return_state=False)
302 | self.rnn = layers.Bidirectional(single_layer, name='bigru')
303 |
304 | # 1-layer dense
305 | self.dense = layers.Dense(output_units, 'tanh', name='output_dense')
306 |
307 | def call(self, x, x_length, training=None):
308 | if x.shape.ndims == 3:
309 | x = tf.expand_dims(x, axis=-1)
310 |
311 | # 6-layer conv2d
312 | for cnn in self.cnns:
313 | x = cnn(x, training)
314 | x_length = (x_length + self.stride - 1) // self.stride
315 |
316 | # fold the remaining mel bins into the conv channels
317 | x = tf.concat(tf.unstack(x, axis=2), axis=-1) # [N, time, mel_dim * channels]
318 |
319 | # 1-layer bigru with mask
320 | rnn_mask = tf.sequence_mask(x_length, maxlen=tf.shape(x)[1])
321 | x._keras_mask = rnn_mask # attach the mask directly so the bi-GRU skips padded frames
322 | x = self.rnn(x, training=training) # [N, rnn_units * 2]
323 |
324 | # 1-layer dense for final output
325 | x = self.dense(x) # [N, output_units]
326 | return x
327 |
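For orientation, here is the shape bookkeeping ReferenceEncoder performs, as a standalone sketch (the channel list matches the reference_channels used elsewhere in this repo; the 400-frame input is an arbitrary example): each stride-2 conv ceil-divides both the time and mel axes, and the surviving mel bins are folded into the final channel count before the bi-GRU.

num_mels, channels, stride = 80, [32, 32, 64, 64, 128, 128], 2
T, mel = 400, num_mels                 # e.g. a 400-frame reference mel
for c in channels:
    T = (T + stride - 1) // stride     # mirrors the x_length update above
    mel = (mel + stride - 1) // stride
print(T, mel * channels[-1])           # 7 time steps, 2 * 128 = 256 features per step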
-------------------------------------------------------------------------------- /modules/decoders.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | # from tensorflow import keras
3 | from tensorflow.keras import layers
4 |
5 | from tensorflow.python import rnn
6 | from tensorflow.python.util import nest
7 | from tensorflow.python.keras.utils import tf_utils
8 |
9 | from collections import namedtuple
10 |
11 | # from utils.debug import debug_print
12 |
13 | """
14 | # This class-based state does not support tf.convert_to_tensor()
15 | class DecoderCellState:
16 | def __init__(self,
17 | attention_rnn_state,
18 | attention_context,
19 | attention_alignments,
20 | attention_accum_alignments,
21 | decode_rnn_state):
22 | self.attention_rnn_state = attention_rnn_state
23 | self.attention_context = attention_context
24 | self.attention_alignments = attention_alignments
25 | self.attention_accum_alignments = attention_accum_alignments
26 | self.decode_rnn_state = decode_rnn_state
27 | """
28 |
29 | DecoderCellState = namedtuple('DecoderCellState',
30 | ['attention_state',
31 | 'attention_rnn_state',
32 | 'decode_rnn_state'])
33 |
34 |
35 | class DecoderCell(layers.Layer):
36 | def __init__(self,
37 | prenet,
38 | attention_rnn,
39 | attention_layer,
40 | frame_projection,
41 | stop_projection,
42 | decode_rnn=None,
43 | text_input_shape=None,
44 | name='decoder_cell'):
45 | """
46 | # Arguments
47 | text_input_shape: the shape of encoder outputs; must be obtained via tf.shape
48 | """
49 | super(DecoderCell, self).__init__(name=name)
50 | self.prenet = prenet
51 | self.attention_rnn = attention_rnn
52 | self.attention_layer = attention_layer
53 | self.frame_projection = frame_projection
54 | self.stop_projection = stop_projection
55 | self.decode_rnn = decode_rnn
56 |
57 | self._batch_size = None
58 | self._text_time_steps = None
59 |
60 | def call(self, x, state, training=None):
61 | # prenet
62 | prenet_output = self.prenet(x)
63 | atten_rnn_input = tf.concat([prenet_output,
64 | state.attention_state.context],
65 | axis=-1, name='prenet_concat')
66 | # attention rnn
67 | atten_rnn_output, atten_rnn_state = self.attention_rnn(atten_rnn_input,
68 | state.attention_rnn_state,
69 | training=training)
70 | # nvidia-torch -> drop(atten_rnn_output, 0.1)
71 |
72 | # attention computation
73 | atten_context, atten_state = self.attention_layer(atten_rnn_output,
74 | state.attention_state,
75 | training=training)
76 | atten_rnn_context_cat = tf.concat([atten_rnn_output, atten_context],
77 | axis=-1, name='atten_rnn_concat')
78 | if self.decode_rnn:
79 | decode_rnn_output, decode_rnn_state = self.decode_rnn(atten_rnn_context_cat,
80 | state.decode_rnn_state,
81 | training=training)
82 | # nvidia-torch -> drop(decode_rnn_output, 0.1)
83 | projection_input = tf.concat([decode_rnn_output, atten_context],
84 | axis=-1, name='decode_rnn_concat')
85 | else:
86 | decode_rnn_state = tf.zeros(tf.shape(x)[0]) # dummy state to keep the nest structure
87 | projection_input = atten_rnn_context_cat
88 |
89 | cell_outputs = self.frame_projection(projection_input)
90 | stop_tokens = self.stop_projection(projection_input)
91 |
92 | next_state = DecoderCellState(atten_state,
93 | atten_rnn_state,
94 | decode_rnn_state)
95 | return (cell_outputs, stop_tokens, atten_state.alignments), next_state
96 |
97 | def set_batch_timesteps(self, batch_size, text_time_steps):
98 | self.batch_size = batch_size
99 | self.text_time_steps = text_time_steps
100 |
101 | def get_init_state(self):
102 | assert self.batch_size is not None, 'batch_size is None, cannot get initial state'
103 |
104 | attention_state = self.attention_layer.init_state
105 | attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=self.batch_size)
106 | if self.decode_rnn:
107 | decode_rnn_state = self.decode_rnn.get_initial_state(batch_size=self.batch_size)
108 | else:
109 | decode_rnn_state = tf.zeros(self.batch_size) # must not be None: nest structure checks need a tensor
110 | return DecoderCellState(attention_state,
111 | attention_rnn_state,
112 | decode_rnn_state)
113 |
114 | @property
115 | def output_size(self):
116 | return (self.frame_projection.units,
117 | self.stop_projection.units,
118 | self.text_time_steps)
119 |
120 | @property
121 | def output_dtype(self):
122 | return (tf.float32, tf.float32, tf.float32)
123 |
124 | @property
125 | def batch_size(self):
126 | return self._batch_size
127 |
128 | @batch_size.setter
129 | def batch_size(self, batch_size):
130 | assert isinstance(batch_size, (tf.Tensor, int))
131 | self._batch_size = batch_size
132 |
133 | @property
134 | def text_time_steps(self):
135 | return self._text_time_steps
136 |
137 | @text_time_steps.setter
138 | def text_time_steps(self, text_time_steps):
139 | assert isinstance(text_time_steps, tf.Tensor)
140 | self._text_time_steps = text_time_steps
141 |
142 |
143 | class Decoder(layers.Layer):
144 | def __init__(self,
145 | hp,
146 | decoder_cell,
147 | gta=False, # ground truth alignment
148 | impute_finished=False,
149 | maximum_steps=2000,
150 | parallel_iterations=32,
151 | swap_memory=False,
152 | name='decoder'):
153 | super(Decoder, self).__init__(name=name)
154 | self.hp = hp
155 | self.cell = decoder_cell
156 | self.gta = gta
157 | self.impute_finished = impute_finished
158 | self.maximum_steps = maximum_steps
159 | self.parallel_iterations = parallel_iterations
160 | self.swap_memory = swap_memory
161 |
162 | self.r = hp.outputs_per_step
163 | self.feed_last_frame = hp.feed_last_frame
164 | self.input_dim = hp.num_mels * (1 if hp.feed_last_frame else self.r)
165 |
166 | def set_inputs(self, inputs, inputs_lengths):
167 | self._inputs, self._inputs_lengths = None, None
168 | if inputs is not None and inputs_lengths is not None:
169 | batch = tf.shape(inputs)[0]
170 | self._inputs_lengths = tf.cast(inputs_lengths / self.r, dtype=tf.int32)
171 | if self.feed_last_frame:
172 | self._inputs = inputs[:, self.r - 1::self.r, :]
173 | else:
174 | self._inputs = tf.reshape(inputs, [batch, -1, self.input_dim])
175 |
176 | def get_init_values(self):
177 | with tf.name_scope('while_loop_init_values'):
178 | init_time = tf.constant(0, dtype=tf.int32)
179 | init_state = self.cell.get_init_state()
180 | init_inputs = tf.zeros(shape=(self.batch_size, self.input_dim))
181 | init_finished = tf.tile([False], [self.batch_size])
182 | init_seq_lengths = tf.zeros(shape=self.batch_size, dtype=tf.int32)
183 |
184 | def _create_tensor_array(s, d):
185 | return tf.TensorArray(size=0, dtype=d,
186 | dynamic_size=True,
187 | # element_shape=(self.batch_size, s))
188 | element_shape=None)
189 | init_outputs_ta = nest.map_structure(_create_tensor_array,
190 | self.cell.output_size,
191 | self.cell.output_dtype)
192 |
193 | return init_time, init_outputs_ta, init_state, init_inputs, init_finished, init_seq_lengths
194 |
195 | def _next_inputs(self, time, outputs):
196 | mel_output, stop_token = outputs[:2] # [N, mel_num * r], [N, r]
197 |
198 | def _true_fn(): # training=True or gta=True
199 | next_inputs = self._inputs[:, time, :]
200 | finished = (time + 1 >= self._inputs_lengths) # whether the next time step is finished
201 | # finished = (time + 1 >= tf.shape(self._inputs)[1]) # guards against the batch max length being smaller than time_step
202 | # that version failed at graph-build time with a shape mismatch across loop iterations, probably because of tf.shape(self._inputs)[1]
203 | # the raw inputs were later trimmed to a multiple of r, so time >= self._inputs_lengths can be used directly
204 | return next_inputs, finished
205 |
206 | def _false_fn():
207 | next_inputs = mel_output[:, -self.input_dim:]
208 | finished = tf.cast(tf.round(stop_token), tf.bool) # >0.5->1, <=0.5->0
209 | finished = tf.reduce_any(finished, axis=1) # maximum_iterations is set at tf.while_loop
210 | return next_inputs, finished
211 | assert not (self.gta and self._inputs is None)
212 | pred = tf.logical_or(self.training, self.gta)
213 | return tf_utils.smart_cond(pred, _true_fn, _false_fn)
214 |
215 | def step(self, time, inputs, state):
216 | """
217 | # Arguments
218 | time: current time step
219 | inputs: current time step inputs
220 | state: previous time step state
221 | # Returns
222 | outputs: current time step outputs
223 | next_state: current time step state
224 | next_inputs: next time step inputs
225 | next_finished: whether the next time step is finished
226 | # ps: when at the i-th time step
227 | inputs[i] = targets[i - 1]
228 | param state = state[i - 1]
229 | outputs[i] = targets[i] = inputs[i + 1]
230 | next_state = state[i] = param state[i + 1]
231 | next_finished = finished[i + 1]
232 | """
233 | outputs, next_state = self.cell(inputs, state, self.training)
234 | next_inputs, next_finished = self._next_inputs(time, outputs)
235 | return outputs, next_state, next_inputs, next_finished
236 |
237 | def call(self, inputs=None, inputs_lengths=None, batch_size=None, training=None):
238 | """
239 | # Arguments
240 | inputs: mel spectrum inputs, with shape [N, frame_nums, num_mels]
241 | it can be None in the inference phase
242 | inputs_lengths: mel spectrum lengths
243 | batch_size: used in the inference phase, where inputs is None
244 | """
245 | assert inputs is not None or batch_size is not None
246 |
247 | self.training = training
248 | self.batch_size = batch_size if inputs is None else tf.shape(inputs)[0]
249 | self.set_inputs(inputs, inputs_lengths)
250 | init_values = self.get_init_values()
251 | zero_outputs = nest.map_structure(lambda s, d: tf.zeros((self.batch_size, s), d),
252 | self.cell.output_size,
253 | self.cell.output_dtype)
254 |
255 | def condition(unused_time, unused_outputs_ta, unused_state,
256 | unused_inputs, finished, unused_seq_lengths):
257 | return tf.logical_not(tf.reduce_all(finished))
258 |
259 | def body(time, outputs_ta, state, inputs, finished, seq_lengths):
260 | """
261 | # Arguments
262 | time: current time step
263 | outputs_ta: the tensor array outputs, for collecting outputs at each time step
264 | inputs: current time step inputs
265 | finished: whether the current time step is marked as finished
266 | seq_lengths: the actual lengths of each sample in the batch (only used at inference)
267 |
268 | # Returns
269 | final_outputs: a sequence of outputs from the decoder cell, i.e., (mel, stop, alignments)
270 | final_state: the last state from the decoder cell
271 | final_seq_lengths: the actual lengths of the output sequence (used at inference)
272 | """
273 | (outputs, next_state, next_inputs, next_finished) = self.step(time, inputs, state)
274 | next_finished = tf.logical_or(finished, next_finished)
275 | next_seq_lengths = seq_lengths + tf.cast(tf.logical_not(finished), dtype=tf.int32)
276 |
277 | nest.assert_same_structure(state, next_state)
278 | nest.assert_same_structure(outputs_ta, outputs)
279 | nest.assert_same_structure(inputs, next_inputs)
280 | # note: the following two lines use 'finished' instead of 'next_finished'
281 | # the output at time steps after finished will be zero
282 | if self.impute_finished: # output zero and copy the state
283 | emit = nest.map_structure(lambda zero, out: tf.where(finished, zero, out),
284 | zero_outputs, outputs)
285 | # the state at time steps after finished is copied from the last state
286 | next_state = nest.map_structure(lambda cur, new: tf.where(finished, cur, new),
287 | state, next_state)
288 | else:
289 | emit = outputs
290 | outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out),
291 | outputs_ta, emit)
292 | return (time + 1, outputs_ta, next_state, next_inputs, next_finished, next_seq_lengths)
293 |
294 | res = tf.while_loop(condition, body, loop_vars=init_values,
295 | parallel_iterations=self.parallel_iterations,
296 | maximum_iterations=self.maximum_steps,
297 | swap_memory=self.swap_memory)
298 | final_outputs_ta = res[1]
299 | final_state = res[2]
300 | final_seq_lengths = res[5]
301 |
302 | final_outputs = nest.map_structure(lambda ta: ta.stack(),
303 | final_outputs_ta)
304 | final_outputs = nest.map_structure(rnn._transpose_batch_time,
305 | final_outputs)
306 |
307 | return final_outputs, final_state, final_seq_lengths
308 |
-------------------------------------------------------------------------------- /modules/encoders.py: --------------------------------------------------------------------------------
1 | # import tensorflow as tf
2 | # from tensorflow import keras
3 | from tensorflow.keras import layers
4 |
5 |
6 | # from taco2_hparams import hp
7 | from modules import custom_layers as cs
8 |
9 |
10 | class Tacotron2Encoder(layers.Layer):
11 | def __init__(self, hp, name='taco2_encoder'):
12 | super(Tacotron2Encoder, self).__init__(name=name)
13 | # embedding layer
14 | self.embed_layer = layers.Embedding(hp.num_symbols, hp.embedding_dim, mask_zero=True)
15 |
16 | # 3-layer conv1d
17 | cnns_num, ksize, channels = hp.encoder_cnns
18 | self.cnns = [cs.ConvBlock('cabd', '1D', channels, ksize, hp.dropout_rate)
19 | for _ in range(cnns_num)]
20 |
21 | # 1-layer bi-lstm
22 | units, zo_rate = hp.encoder_rnns_units, hp.zoneout_rate
23 | single_layer = layers.RNN(cs.ZoneoutLSTMCell(units, zo_rate),
24 | return_sequences=True)
25 | # with mask, outputs zero for time steps where the mask is 0
26 | self.rnn = layers.Bidirectional(single_layer, name='bilstm')
27 |
28 | def call(self, x, training=None):
29 | x = self.embed_output = self.embed_layer(x) # x carries a Keras mask, i.e. x._keras_mask is not None
30 | for cnn in self.cnns:
31 | x = cnn(x, training=training) # has BN and dropout, so training must be passed
32 | x = self.rnn(x, training=training) # the bi-RNN requires keyword arguments here
33 | return x
34 |
35 |
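A usage sketch for the encoder above (the hparams values mirror sygst_hparams.py, but constructing a reduced HParams like this and the eager-style call are assumptions; treat it as illustrative, not repo code): padded id sequences go in, [N, T_in, 2 * encoder_rnns_units] features come out, and mask_zero=True lets downstream attention ignore the padding.

import tensorflow as tf
from utils.parameter import HParams
from modules.encoders import Tacotron2Encoder

hp = HParams(num_symbols=150, embedding_dim=512, dropout_rate=0.5,
             encoder_cnns=[3, 5, 512], encoder_rnns_units=256, zoneout_rate=0.1)
encoder = Tacotron2Encoder(hp)
text_ids = tf.ones([2, 40], dtype=tf.int32)   # a padded batch of symbol ids
outputs = encoder(text_ids, training=False)   # [2, 40, 512]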
36 | class TacotronEncoder(layers.Layer):
37 | def __init__(self, hp, name='taco_encoder'):
38 | super(TacotronEncoder, self).__init__(name=name)
39 |
40 | # embedding layer
41 | self.embed_layer = layers.Embedding(hp.num_symbols, hp.embedding_dim, mask_zero=True)
42 |
43 | # 2-dense-layer prenet
44 | # self.prenet = cs.Prenet(units=[256, 128], name='prenet')
45 | self.prenet = cs.Prenet(units=[256, 256], name='prenet')
46 |
47 | # cbhg block
48 | """
49 | self.cbhg = cs.CBHG(K=16,
50 | conv_channels=128,
51 | pool_size=2,
52 | projections=[128, 128],
53 | highway_units=128,
54 | highway_nums=4,
55 | rnn_units=128, # vanilla Tacotron uses 128
56 | name='cbhg')
57 | """
58 | self.cbhg = cs.CBHG(K=16,
59 | conv_channels=256,
60 | pool_size=2,
61 | projections=[256, 256],
62 | highway_units=256,
63 | highway_nums=4,
64 | rnn_units=256, # vanilla Tacotron uses 128
65 | name='cbhg')
66 |
67 | def call(self, x, training=None):
68 | x = self.embed_output = self.embed_layer(x)
69 | x = self.prenet(x)
70 | x = self.cbhg(x, training=training)
71 | return x
72 |
-------------------------------------------------------------------------------- /modules/losses.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def get_mel_loss(targets, outputs, spec_len=None, method='mse'):
5 | assert method in ['mse', 'mae'], f'loss method: {method} is not valid'
6 |
7 | norm_func = tf.square if method == 'mse' else tf.abs
8 | norm = norm_func(targets - outputs)
9 |
10 | if spec_len is not None: # mask loss
11 | time_step = tf.shape(outputs)[1]
12 | mask = tf.cast(tf.sequence_mask(spec_len, time_step), tf.float32)
13 | sum_n = tf.reduce_sum(mask) * tf.cast(tf.shape(outputs)[-1], tf.float32)
14 | loss = norm * tf.expand_dims(mask, axis=-1)
15 | else:
16 | sum_n = tf.cast(tf.reduce_prod(tf.shape(outputs)), tf.float32)
17 | loss = norm
18 |
19 | name = 'mel_{}{}_loss'.format('' if spec_len is None else 'mask_', method)
20 | loss = tf.truediv(tf.reduce_sum(loss), sum_n, name=name)
21 | return loss
22 |
23 |
24 | def get_spec_loss(targets, outputs, priority_freq_n, spec_len=None, method='mae'):
25 | assert method in ['mse', 'mae'], f'loss method: {method} is not valid'
26 |
27 | norm_func = tf.square if method == 'mse' else tf.abs
28 | norm = norm_func(targets - outputs)
29 | priority_freq_n = tf.cast(priority_freq_n, tf.int32)
30 |
31 | if spec_len is not None: # mask loss
32 | time_step = tf.shape(outputs)[1]
33 | mask = tf.cast(tf.sequence_mask(spec_len, time_step), tf.float32)
34 | sum_n = tf.reduce_sum(mask) * tf.cast(tf.shape(outputs)[-1], tf.float32)
35 | sum_m = tf.reduce_sum(mask) * tf.cast(priority_freq_n, tf.float32)
36 | loss = norm * tf.expand_dims(mask, axis=-1)
37 | else:
38 | sum_n = tf.cast(tf.reduce_prod(tf.shape(outputs)), tf.float32)
39 | sum_m = tf.cast(tf.reduce_prod(tf.shape(outputs)[: 2]) * priority_freq_n, tf.float32)
40 | loss = norm
41 |
42 | name = 'spec_{}{}_loss'.format('' if spec_len is None else 'mask_', method)
43 | # Prioritize loss for frequencies under 2000 Hz.
44 | loss_low = loss[:, :, 0: priority_freq_n]
45 | loss = [tf.reduce_sum(loss) / sum_n, tf.reduce_sum(loss_low) / sum_m]
46 | loss = tf.tensordot(loss, [0.5, 0.5], axes=1, name=name)
47 | return loss
48 |
49 |
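To make the masking convention above concrete, a hedged toy example (the values in the comment assume eager evaluation; in this repo's graph mode you would session.run the tensor): padded frames are zeroed out of the numerator and the divisor counts only real frames, so padding cannot dilute the loss.

import tensorflow as tf
from modules.losses import get_mel_loss

targets = tf.ones([2, 6, 80])    # two clips, padded to 6 frames
outputs = tf.zeros([2, 6, 80])
loss = get_mel_loss(targets, outputs, spec_len=[4, 6], method='mse')
# numerator: (4 + 6) real frames * 80 bins * 1.0; divisor: 10 * 80, so loss == 1.0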
50 | def get_stop_loss(targets, outputs, outputs_per_step=None, spec_len=None, do_mask=False, pos_weight=1.):
51 | time_step = tf.shape(outputs)[1]
52 | if targets is None:
53 | assert spec_len is not None, 'stop token targets and spec_len cannot both be None'
54 | pre_zero_len = spec_len - outputs_per_step
55 | pre_zero_mask = tf.cast(tf.sequence_mask(pre_zero_len, time_step), tf.float32)
56 | targets = tf.ones_like(outputs) - pre_zero_mask
57 |
58 | loss = tf.nn.weighted_cross_entropy_with_logits(labels=targets, logits=outputs, pos_weight=pos_weight)
59 |
60 | if do_mask:
61 | assert spec_len is not None, 'do_mask=True requires spec_len is not None'
62 | mask = tf.cast(tf.sequence_mask(spec_len, time_step), tf.float32)
63 | sum_n = tf.reduce_sum(mask)
64 | loss = loss * mask
65 | else:
66 | sum_n = tf.cast(tf.reduce_prod(tf.shape(outputs)), tf.float32)
67 |
68 | name = 'stop_{}loss'.format('mask_' if do_mask else '')
69 | loss = tf.truediv(tf.reduce_sum(loss), sum_n, name=name)
70 | return loss
71 |
-------------------------------------------------------------------------------- /predict_attention.py: --------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import argparse
4 | import numpy as np
5 |
6 | import tensorflow as tf
7 | from tensorflow.keras.layers import Input
8 |
9 | from text import text_to_sequence
10 |
11 | from sygst_hparams import hp as sygst_hp
12 | from embjoint_hparams import hp as embgst_joint_hp
13 | from ser.hparams import hp as ser_hp
14 | from emogst_hparams import hp as emogst_hp
15 | from models.sygst_tacotron2 import Tacotron2SYGST
16 | from models.embgst_tacotron2_joint import Tacotron2EMBGSTJoint
17 | from models.emogst_tacotron2 import Tacotron2EMOGST
18 |
19 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
20 |
21 | map_model = {'sygst': Tacotron2SYGST, 'embgst_joint': Tacotron2EMBGSTJoint, 'emogst': Tacotron2EMOGST}
22 | map_hp = {'sygst': sygst_hp, 'embgst_joint': embgst_joint_hp, 'emogst': emogst_hp}
23 |
24 |
25 | class AttentionPredictor:
26 | def __init__(self, model_name='sygst'):
27 | assert model_name in ['sygst', 'embgst', 'embgst_joint', 'emogst']
28 |
29 | self.hp = map_hp[model_name]
30 | self.model = map_model[model_name](self.hp) if model_name != 'embgst' else map_model[model_name](self.hp, ser_hp)
31 | self.model_name = model_name
32 | self.cleaner_names = [x.strip() for x in self.hp.cleaners.split(',')]
33 |
34 | # build model
35 | with tf.name_scope('model'): # must match the scope used at training time
36 | d = self.hp.emotion_embedding_units
37 | self.text_inputs = Input([None], dtype=tf.int32, name='text_inputs')
38 | self.mel_inputs = Input([None, self.hp.num_mels], dtype=tf.float32, name='mel_inputs')
39 | self.spec_lengths = Input([], dtype=tf.int32, name='spec_lengths')
40 | self.aro_embed = Input([d], dtype=tf.float32, name='aro_embed')
41 | self.val_embed = Input([d], dtype=tf.float32, name='val_embed')
42 |
43 | call_fn_kwargs = {'mel_inputs': self.mel_inputs,
44 | 'spec_lengths': self.spec_lengths,
45 | 'training': False}
46 | if model_name == 'embgst':
47 | call_fn_kwargs.update(arousal_embedding=self.aro_embed,
48 | valence_embedding=self.val_embed)
49 | self.model(self.text_inputs, **call_fn_kwargs)
50 |
51 | def load(self, ckpt_path):
52 | print('Loading checkpoint: %s' % ckpt_path)
53 | self.eval_step = re.search(r'ckpt-(\d+)', ckpt_path).group(1)
54 | self.session = tf.Session()
55 | saver = tf.train.Saver()
56 | saver.restore(self.session, ckpt_path)
57 |
58 | def predict(self, mel_inputs=None, spec_lengths=None, aro_embed=None, val_embed=None):
59 | seq = text_to_sequence('hello', self.cleaner_names)
60 | feed_dict = {self.text_inputs: [np.asarray(seq, dtype=np.int32)]}
61 | if mel_inputs is not None:
62 | assert spec_lengths is not None
63 | mel_inputs = np.expand_dims(mel_inputs, 0).astype(np.float32)
64 | spec_lengths = np.expand_dims(spec_lengths, 0).astype(np.int32)
65 | feed_dict.update({self.mel_inputs: mel_inputs, self.spec_lengths: spec_lengths})
66 | if self.model_name in ['sygst', 'emogst']:
67 | attention_outputs = self.model.gst_weights
68 | elif self.model_name == 'embgst_joint':
69 | attention_outputs = [self.model.aro_weights, self.model.val_weights]
70 | else:
71 | raise ValueError('when mel_inputs is not None, model must be sygst, emogst or embgst_joint')
72 | else:
73 | assert aro_embed is not None or val_embed is not None
74 | if aro_embed is not None:
75 | aro_embed = np.expand_dims(aro_embed, 0)
76 | feed_dict.update({self.aro_embed: aro_embed.astype(np.float32)})
77 | attention_outputs = self.model.aro_weights
78 | else:
79 | val_embed = np.expand_dims(val_embed, 0)
80 | feed_dict.update({self.val_embed: val_embed.astype(np.float32)})
81 | attention_outputs = self.model.val_weights
82 |
83 | attention_weights = self.session.run(attention_outputs, feed_dict=feed_dict)
84 | return attention_weights
85 |
86 |
87 | def process_fold(args, model, ref_path, output_path, emo_type='arousal', max_items=50):
88 | atten_list = []
89 | # ref_names = [os.path.join(ref_path, name) for name in sorted(os.listdir(ref_path))]
90 | ref_names = [os.path.join(ref_path, name) for name in os.listdir(ref_path)]
91 | ref_names = ref_names[:max_items] if max_items is not None else ref_names
92 |
93 | for ref_name in ref_names:
94 | ref_feature = np.load(ref_name)
95 | if args.model_name in ['sygst', 'embgst_joint', 'emogst']:
96 | ref_len = ref_feature.shape[0]
97 | # if ref_len < 250 or ref_len > 1000:
98 | # continue
99 | atten_weight = model.predict(mel_inputs=ref_feature, spec_lengths=ref_len)
100 | if args.model_name == 'embgst_joint':
101 | atten_weight = atten_weight[0] if emo_type == 'arousal' else atten_weight[1]
102 | else:
103 | assert emo_type in ['arousal', 'valence']
104 | if emo_type == 'arousal':
105 | atten_weight = model.predict(aro_embed=ref_feature)
106 | else:
107 | atten_weight = model.predict(val_embed=ref_feature)
108 | atten = np.squeeze(atten_weight, 0) # [num_heads, 1, num_tokens]
109 | atten_list.append(atten)
110 | atten_list_np = np.array(atten_list)
111 | avg_atten = np.mean(atten_list_np, axis=0)
112 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
113 | np.save(output_path, avg_atten)
114 | print(f'Process finished for {args.model_name} {emo_type} with shape: {atten_list_np.shape}')
115 |
116 |
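A hedged usage sketch (the checkpoint step and mel file below are placeholders following the path patterns in this repo): load a trained model and extract GST attention weights for a single reference mel, which is what process_fold above averages per emotion bucket.

import numpy as np

predictor = AttentionPredictor('sygst')
predictor.load('sygst_emo_data/ckpts/model.ckpt-100000')   # hypothetical step
mel = np.load('bc2013/mels/bc13-mel-000001.npy')
weights = predictor.predict(mel_inputs=mel, spec_lengths=mel.shape[0])
print(np.squeeze(weights, 0).shape)                        # (gst_heads, 1, gst_tokens)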
117 | def process_arousal(args, model):
118 | for i in range(2):
119 | ref_path = args.model_name + f'_emo_data/emo2d_mel_npys/arousal{i}'
120 | output_path = args.model_name + f'_emo_data/emo2d_mel_gst_weights/arousal{i}.npy'
121 | if args.model_name == 'embgst':
122 | ref_path = ref_path.replace('_mel_', '_embed_')
123 | output_path = output_path.replace('_mel_', '_embed_')
124 | process_fold(args, model, ref_path, output_path, 'arousal')
125 |
126 |
127 | def process_valence(args, model):
128 | for i in range(2):
129 | ref_path = args.model_name + f'_emo_data/emo2d_mel_npys/valence{i}'
130 | output_path = args.model_name + f'_emo_data/emo2d_mel_gst_weights/valence{i}.npy'
131 | if args.model_name == 'embgst':
132 | ref_path = ref_path.replace('_mel_', '_embed_')
133 | output_path = output_path.replace('_mel_', '_embed_')
134 | process_fold(args, model, ref_path, output_path, 'valence')
135 |
136 |
137 | def process_emotion(args, model):
138 | for i in range(4):
139 | ref_path = args.model_name + f'_emo_data/emo_mel_npys/emo{i}'
140 | output_path = args.model_name + f'_emo_data/emo_gst_weights/emo{i}.npy'
141 | process_fold(args, model, ref_path, output_path)
142 |
143 |
144 | def main():
145 | parser = argparse.ArgumentParser()
146 | parser.add_argument('--model_name', '-m', default='sygst')
147 | parser.add_argument('--ckpt_step', '-c', default=None)
148 | args = parser.parse_args()
149 |
150 | assert args.model_name in ['sygst', 'embgst', 'embgst_joint', 'emogst']
151 |
152 | ckpt_path = args.model_name + f'_emo_data/ckpts/model.ckpt-{args.ckpt_step}'
153 | model = AttentionPredictor(args.model_name)
154 | model.load(ckpt_path)
155 |
156 | if args.model_name == 'emogst':
157 | process_emotion(args, model)
158 | else:
159 | process_arousal(args, model)
160 | process_valence(args, model)
161 |
162 |
163 | if __name__ == '__main__':
164 | main()
165 |
-------------------------------------------------------------------------------- /prepare_meta_from_tfr.py: --------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import tensorflow as tf
4 | from tqdm import tqdm
5 | from text import sequence_to_text
6 | from tfr_dset import load_for_prepare_meta_from_tfr
7 |
8 |
9 | max_steps = 100000 # actual 95650
10 | tfr_dir = 'bc2013/training/tfrs_with_emo_feature'
11 | meta_path = 'bc2013/full_meta.txt'
12 | mel_path = 'bc2013/mels'
13 | spec_path = 'bc2013/specs'
14 |
15 |
16 | def main():
17 | tf_dset = load_for_prepare_meta_from_tfr(tfr_dir)
18 | feats = tf_dset.make_one_shot_iterator().get_next()
19 |
20 | i, lines, mels, specs = 1, [], [], []
21 | lines.append('# emo|aro|val|text_len|spec_len|text|uid|mel|spec\n')
22 |
23 | pbar = tqdm(total=max_steps)
24 | sess = tf.Session()
25 | try:
26 | while True:
27 | fetched_feats = sess.run(feats)
28 | # uid = fetched_feats['uid'].tobytes().decode('utf-8')
29 | uid = fetched_feats['uid'].decode('utf-8')
30 | text = sequence_to_text(fetched_feats['inputs'])
31 | text_lens = fetched_feats['input_lengths']
32 | mel = fetched_feats['mel_targets']
33 | spec = fetched_feats['linear_targets']
34 | spec_lens = fetched_feats['spec_lengths']
35 | emo = '[{:.5f}, {:.5f}, {:.5f}, {:.5f}]'.format(*fetched_feats['soft_emo_labels'])
36 | aro = '[{:.5f}, {:.5f}]'.format(*fetched_feats['soft_arousal_labels'])
37 | val = '[{:.5f}, {:.5f}]'.format(*fetched_feats['soft_valance_labels'])
38 | mel_name = os.path.join(mel_path, f'bc13-mel-{i:06d}.npy')
39 | spec_name = os.path.join(spec_path, f'bc13-spec-{i:06d}.npy')
40 | line = f'{emo}|{aro}|{val}|{text_lens}|{spec_lens}|{text}|{uid}|{mel_name}|{spec_name}\n'
41 | lines.append(line)
42 | mels.append([mel, mel_name])
43 | specs.append([spec, spec_name])
44 | pbar.update(1)
45 | i += 1
46 | except tf.errors.OutOfRangeError:
47 | print('sess.run finished!')
48 | finally:
49 | pbar.close()
50 | sess.close()
51 |
52 | os.makedirs(mel_path, exist_ok=True)
53 | os.makedirs(spec_path, exist_ok=True)
54 | for mel, mel_name in tqdm(mels):
55 | np.save(mel_name, mel)
56 | for spec, spec_name in tqdm(specs):
57 | np.save(spec_name, spec)
58 | with open(meta_path, 'w') as fw:
59 | fw.writelines(lines)
60 | print(f'total {i - 1} items finished!') # i was incremented once past the last item
61 |
62 |
63 | if __name__ == '__main__':
64 | main()
65 |
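For downstream scripts, reading full_meta.txt back follows the header line written above; a small sketch (not repo code):

import json

with open('bc2013/full_meta.txt') as fr:
    for raw in fr:
        if raw.startswith('#'):   # skip the header line
            continue
        emo, aro, val, text_len, spec_len, text, uid, mel, spec = raw.strip().split('|')
        emo, aro, val = json.loads(emo), json.loads(aro), json.loads(val)
        print(uid, int(text_len), int(spec_len))
        break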
-------------------------------------------------------------------------------- /sygst_hparams.py: --------------------------------------------------------------------------------
1 | from utils.parameter import HParams
2 |
3 | hp = HParams(
4 | # text
5 | cleaners='english_cleaners',
6 |
7 | # audio
8 | num_mels=80,
9 | num_spec=1025, # n_fft / 2 + 1; only used when adding the linear-spectrogram post-processing network
10 | sample_rate=16000,
11 | win_ms=50, # 50 ms window = 800 samples at 16 kHz (if None, win_size=n_fft)
12 | hop_ms=12.5, # 12.5 ms hop = 200 samples at 16 kHz
13 | n_fft=2048,
14 | min_level_db=-100,
15 | ref_level_db=20,
16 | fmin=95, # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
17 | fmax=7600, # To be increased/reduced depending on data.
18 | preemphasis=0.97, # filter coefficient.
19 | griffin_lim_power=1.5, # Only used in G&L inversion; values between 1.2 and 1.5 are usually a good choice.
20 | griffin_lim_iters=60, # Number of G&L iterations; typically 30 is enough but we use 60 to ensure convergence.
21 |
22 | # Tacotron
23 | outputs_per_step=3, # number of frames to generate at each decoding step (increase to speed up computation and allow a higher batch size; decreases G&L audio quality)
24 | feed_last_frame=True, # whether to feed all r frames or only the last of the r frames
25 | stop_at_any=True, # Determines whether the decoder should stop when the stop prediction fires for any frame or for all of them (True works pretty well)
26 | clip_outputs=True, # Whether to clip spectrograms to T2_output_range (even in loss computation), i.e. don't penalize the model for exceeding the output range; bring it back to the borders.
27 | lower_bound_decay=0.0, # Small regularizer for noise synthesis by adding a small range of penalty for silence regions. Set to 0 to clip in the Tacotron range.
28 | clip_min=0,
29 | clip_max=1,
30 |
31 | # Input parameters
32 | num_symbols=150,
33 | embedding_dim=512, # dimension of embedding space
34 |
35 | # Encoder parameters
36 | encoder_type='taco2', # ['taco2', 'taco']; the taco encoder is the CBHG encoder
37 | encoder_cnns=[3, 5, 512], # num_layers, kernel_size, channels
38 | encoder_rnns_units=256, # number of lstm units for each direction (forward and backward)
39 |
40 | # reference encoder parameters
41 | reference_channels=[32, 32, 64, 64, 128, 128],
42 | reference_rnn_units=128,
43 |
44 | # gst parameters
45 | # gst_heads=4,
46 | gst_heads=8,
47 | # gst_tokens=10,
48 | gst_tokens=16,
49 | # gst_units=256,
50 | gst_units=512,
51 | gst_atten_units=128,
52 | gst_atten_type='mlp', # attention type for the gst self-attention module (dot or mlp)
53 | gst_activation=None,
54 | gst_trainable=True, # False in the nvidia gst code
55 |
56 | # emotion parameters
57 | emo_used=True,
58 | emo_loss='softmax', # ['mae', 'mse', 'sigmoid', 'softmax']
59 | emo_output_units=2,
60 | emotion_embedding_units=128,
61 |
62 | # Attention mechanism
63 | smoothing=False, # Whether to smooth the attention normalization function
64 | attention_type='location', # sma: stepwise monotonic; location: location sensitive
65 | attention_units=128, # dimension of attention space
66 | attention_filters=32, # number of attention convolution filters
67 | attention_kernel_size=(31, ), # kernel size of attention convolution
68 | attention_sma_normalize=True,
69 | attention_sma_sigmoid_noise=2.0,
70 | attention_sma_sigmoid_noise_seed=None,
71 | attention_sma_score_bias_init=3.5,
72 | attention_sma_mode='parallel',
73 |
74 | # Attention synthesis constraints
75 | # "Monotonic" constraint forces the model to only look at the forward attention_win_size steps.
76 | # "Window" allows the model to look at attention_win_size neighbors, both forward and backward steps.
77 | synthesis_constraint=False, # Whether to use attention window constraints in synthesis only (useful for long-utterance synthesis)
78 | # synthesis_constraint_type='window', # can be in ('window', 'monotonic').
79 | synthesis_win_size=7, # Size of the window; the current step does not count. If mode is window and the size is odd, the extra step goes to the backward part of the window.
80 | synthesis_softmax_temp=1.0,
81 |
82 | # Decoder
83 | prenet_units=[256, 256], # number of layers and number of units of prenet
84 | attention_rnn_units=[1024, 1024], # number of decoder lstm layers
85 | decode_rnn_units=None, # number of decoder lstm units on each layer
86 | max_iters=2000, # Max decoder steps during inference (just for safety from infinite-loop cases)
87 | impute_finished=False,
88 | frame_activation='relu',
89 |
90 | # Residual postnet
91 | postnet_cnns=[5, 5, 512], # num_layers, kernel_size, channels
92 |
93 | # CBHG mel->linear postnet
94 | post_cbhg=True,
95 | cbhg_kernels=8, # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act as "K-grams"
96 | cbhg_conv_channels=128, # Channels of the convolution bank
97 | cbhg_pool_size=2, # pooling size of the CBHG
98 | cbhg_projection=256, # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
99 | cbhg_projection_kernel_size=3, # kernel_size of the CBHG projections
100 | cbhg_highway_nums=4, # Number of HighwayNet layers
101 | cbhg_highway_units=128, # Number of units used in HighwayNet fully connected layers
102 | cbhg_rnn_units=128, # Number of GRU units used in the bidirectional RNN of the CBHG block. CBHG output is 2x rnn_units in shape
103 |
104 | # Loss params
105 | mask_encoder=True, # whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence.
106 | mask_decoder=False, # set False so alignments converge faster
107 | cross_entropy_pos_weight=20, # Use class weights to reduce the stop-token class imbalance (by adding more penalty on False Negatives (FN)) (1=disabled)
108 | mel_loss='mae',
109 | spec_loss='mae',
110 |
111 |
112 | # Tacotron Training
113 | # Reproduction seeds
114 | random_seed=2020, # Determines the initial graph and operations (i.e. model) random state for reproducibility
115 | # tacotron_data_random_state=1234, # random state for train/test split repeatability
116 |
117 | # performance parameters
118 | tacotron_swap_with_cpu=False, # Whether to use the cpu as support for the gpu in decoder computation (Not recommended: may cause major slowdowns! Only use when critical!)
119 |
120 | # train/test split ratios, mini-batch sizes
121 | batch_size=32, # number of training samples on each training step
122 | # Tacotron batch synthesis supports ~16x the training batch size (no gradients during testing).
123 | # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times different from training. We thus recommend masking the encoder.
124 | tacotron_synthesis_batch_size=1, # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
125 | tacotron_test_size=0.05, # % of data to keep as test data; if None, tacotron_test_batches must not be None. (5% is enough to have a good idea about overfit)
126 | tacotron_test_batches=None, # number of test batches.
127 |
128 | # Learning rate schedule
129 | decay_learning_rate=True, # boolean, determines if the learning rate will follow an exponential decay
130 | start_decay=40000, # Step at which learning decay starts
131 | decay_steps=18000, # Determines the learning rate decay slope (UNDER TEST)
132 | decay_rate=0.5, # learning rate decay rate (UNDER TEST)
133 | # initial_learning_rate=1e-3, # starting learning rate
134 | initial_learning_rate=0.002,
135 | final_learning_rate=1e-4, # minimal learning rate
136 |
137 | # Optimization parameters
138 | adam_beta1=0.9, # AdamOptimizer beta1 parameter
139 | adam_beta2=0.999, # AdamOptimizer beta2 parameter
140 | adam_epsilon=1e-6, # AdamOptimizer Epsilon parameter
141 |
142 | # Regularization parameters
143 | # reg_weight=1e-6, # regularization weight (for L2 regularization)
144 | reg_weight=None, # regularization weight (for L2 regularization)
145 | scale_regularization=False, # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model)
146 | zoneout_rate=0.1, # zoneout rate for all LSTM cells in the network
147 | dropout_rate=0.5, # dropout rate for all convolutional layers + prenet
148 | clip_gradients=True, # whether to clip gradients
149 | )
150 |
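A few quantities derived from these values are worth keeping in mind when reading the rest of the code (simple arithmetic, shown as a sketch):

from sygst_hparams import hp

win = int(hp.win_ms / 1000 * hp.sample_rate)   # 50 ms -> 800 samples
hop = int(hp.hop_ms / 1000 * hp.sample_rate)   # 12.5 ms -> 200 samples
ms_per_step = hp.outputs_per_step * hp.hop_ms  # the decoder emits 37.5 ms of audio per step
print(win, hop, ms_per_step)                   # 800 200 37.5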
-------------------------------------------------------------------------------- /sygst_train.py: --------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import time
4 | import argparse
5 | import traceback
6 | import numpy as np
7 | import tensorflow as tf
8 | from datetime import datetime
9 |
10 |
11 | from tfr_dset import TFDataSet
12 | from text import sequence_to_text
13 | from utils import audio, plot, infolog, ValueWindow, debug
14 |
15 | from sygst_hparams import hp
16 | from models.sygst_tacotron2 import Tacotron2SYGST
17 |
18 | log = infolog.log
19 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
20 |
21 |
22 | _max_step = 500000
23 | hdfs_ckpts = 'hdfs://haruna/home/byte_speech_sv/user/caixiong/ckpts'
24 |
25 | # spec_length max = 1116
26 | # text length max = 99
27 |
28 |
29 | def time_string():
30 | return datetime.now().strftime('%Y-%m-%d %H:%M')
31 |
32 |
33 | def debug_data(batch=32, time_in=100, time_out=500):
34 | text_x = np.random.randint(0, 150, size=(batch, time_in), dtype=np.int32)
35 | mel = np.random.randn(batch, time_out, 80).astype(np.float32)
36 | spec = np.random.randn(batch, time_out, 1025).astype(np.float32)
37 | spec_len = np.random.randint(time_out // 2, time_out, size=batch, dtype=np.int32)
38 | aro_label = np.random.rand(batch, 2).astype(np.float32)
39 | val_label = np.random.rand(batch, 2).astype(np.float32)
40 |
41 | print('text_input:', text_x[0], 'spec_len:', spec_len, sep='\n')
42 | return text_x, mel, spec, spec_len, aro_label, val_label
43 |
44 |
45 | def train(log_dir, args):
46 | checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
47 | log(hp.to_string(), is_print=False)
48 | log('Loading training data from: %s' % args.tfr_dir)
49 | log('Checkpoint path: %s' % checkpoint_path)
50 | log('Using model: sygst tacotron2')
51 |
52 | tf_dset = TFDataSet(hp, args.tfr_dir)
53 | feats = tf_dset.get_train_next()
54 | # Set up model:
55 | global_step = tf.Variable(0, name='global_step', trainable=False)
56 | training = tf.placeholder_with_default(True, shape=(), name='training')
57 | with tf.name_scope('model'):
58 | model = Tacotron2SYGST(hp)
59 | model(feats['inputs'],
60 | mel_inputs=feats['mel_targets'],
61 | spec_inputs=feats['linear_targets'],
62 | spec_lengths=feats['spec_lengths'],
63 | ref_inputs=feats['mel_targets'],
64 | ref_lengths=feats['spec_lengths'],
65 | arousal_labels=feats['soft_arousal_labels'],
66 | valence_labels=feats['soft_valance_labels'],
67 | training=training)
68 | """
69 | text_x, mel_x, spec_x, spec_len, aro, val = debug_data(2, 5, 10)
70 | model(text_x, mel_x, spec_x, spec_len, mel_x, spec_len, aro, val, training=training)
71 | """
72 | model.add_loss()
73 | model.add_optimizer(global_step)
74 | stats = model.add_stats()
75 |
76 | # Bookkeeping:
77 | step = 0
78 | time_window = ValueWindow(100)
79 | loss_window = ValueWindow(100)
80 | saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)
81 |
82 | # Train!
83 | config = tf.ConfigProto(allow_soft_placement=True,
84 | gpu_options=tf.GPUOptions(allow_growth=True))
85 | with tf.Session(config=config) as sess:
86 | try:
87 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
88 | sess.run(tf.global_variables_initializer())
89 | if args.restore_step:
90 | # Restore from a checkpoint if the user requested it.
91 | restore_path = '%s-%s' % (checkpoint_path, args.restore_step) 92 | saver.restore(sess, restore_path) 93 | log('Resuming from checkpoint: %s' % restore_path, slack=True) 94 | else: 95 | log('Starting a new training run ...', slack=True) 96 | 97 | """ 98 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, model.spec_loss, 99 | model.stop_loss, model.arousal_loss, model.valence_loss, model.mel_grad_norms_max, 100 | model.spec_grad_norms_max, model.stop_grad_norms_max, model.aro_grad_norms_max, model.val_grad_norms_max] 101 | """ 102 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, model.spec_loss, 103 | model.stop_loss, model.arousal_loss, model.valence_loss] 104 | for _ in range(_max_step): 105 | start_time = time.time() 106 | sess.run(debug.get_ops()) 107 | # step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g = sess.run(fetches) 108 | step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss = sess.run(fetches) 109 | time_window.append(time.time() - start_time) 110 | loss_window.append(loss) 111 | """ 112 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,ag=%.4f,vg=%.4f]' % ( 113 | step, time_window.average, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g) 114 | """ 115 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % ( 116 | step, time_window.average, mel_loss, spec_loss, stop_loss, aro_loss, val_loss) 117 | log(message, slack=(step % args.checkpoint_interval == 0)) 118 | 119 | if loss > 100 or math.isnan(loss): 120 | log('Loss exploded to %.5f at step %d!' % (loss, step), slack=True) 121 | raise Exception('Loss Exploded') 122 | 123 | if step % args.summary_interval == 0: 124 | log('Writing summary at step: %d' % step) 125 | try: 126 | summary_writer.add_summary(sess.run(stats), step) 127 | except Exception as e: 128 | log(f'summary failed and ignored: {str(e)}') 129 | 130 | if step % args.checkpoint_interval == 0: 131 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 132 | saver.save(sess, checkpoint_path, global_step=step) 133 | log('Saving audio and alignment...') 134 | gt_mel, gt_spec, seq, mel, spec, align = sess.run([model.mel_targets[0], model.spec_targets[0], 135 | model.text_targets[0], model.mel_outputs[0], 136 | model.spec_outputs[0], model.alignment_outputs[0]]) 137 | text = sequence_to_text(seq) 138 | wav = audio.inv_spectrogram(hp, spec.T) 139 | wav_path = os.path.join(log_dir, 'step-%d-audio.wav' % step) 140 | mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step) 141 | spec_path = os.path.join(log_dir, 'step-%d-spec.png' % step) 142 | align_path = os.path.join(log_dir, 'step-%d-align.png' % step) 143 | info = '%s, %s, step=%d, loss=%.5f\n %s' % (args.model, time_string(), step, loss, text) 144 | plot.plot_alignment(align, align_path, info=info) 145 | plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel) 146 | plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec) 147 | audio.save_wav(hp, wav, wav_path) 148 | log('Input: %s' % text) 149 | 150 | except Exception as e: 151 | log('Exiting due to exception: %s' % e, slack=True) 152 | traceback.print_exc() 153 | 154 | 155 | def main(): 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--gpu', default='0') 158 | parser.add_argument('--log', '-l', default='') 159 | parser.add_argument('--restore_step', '-r', default=None) 160 | parser.add_argument('--tfr_dir', 
default='bc2013/training/tfrs_with_emo_feature')
161 | args = parser.parse_args()
162 |
163 | args.model = 'sygst_taco2'
164 | args.summary_interval = 200
165 | args.checkpoint_interval = 5000
166 | # args.summary_interval = 2
167 | # args.checkpoint_interval = 5
168 |
169 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
170 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
171 | log_dir = 'sygst_logs' + ('_' + args.log if args.log else '')
172 | os.makedirs(log_dir, exist_ok=True)
173 |
174 | tf.set_random_seed(hp.random_seed)
175 | infolog.init(os.path.join(log_dir, 'train.log'), args.model)
176 |
177 | train(log_dir, args)
178 |
179 |
180 | if __name__ == '__main__':
181 | main()
182 |
-------------------------------------------------------------------------------- /synthesizer.py: --------------------------------------------------------------------------------
1 | import re
2 | import textwrap
3 | from datetime import datetime
4 |
5 | import tensorflow as tf
6 | from tensorflow.keras.layers import Input
7 |
8 | from utils import audio, plot
9 |
10 | from ser.hparams import hp as ser_hp
11 | from taco2_hparams import hp as taco2_hp
12 | from sygst_hparams import hp as sygst_hp
13 | from emogst_hparams import hp as emogst_hp
14 | from embjoint_hparams import hp as embgst_joint_hp
15 |
16 | from models.tacotron2 import Tacotron2
17 | from models.sygst_tacotron2 import Tacotron2SYGST
18 | from models.emogst_tacotron2 import Tacotron2EMOGST
19 | from models.embgst_tacotron2_joint import Tacotron2EMBGSTJoint
20 |
21 | tf.compat.v1.logging.set_verbosity(40) # Only print errors
22 |
23 | map_model = {'taco2': Tacotron2, 'sygst': Tacotron2SYGST,
24 | 'emogst': Tacotron2EMOGST, 'embgst_joint': Tacotron2EMBGSTJoint}
25 | map_hp = {'taco2': taco2_hp, 'sygst': sygst_hp,
26 | 'emogst': emogst_hp, 'embgst_joint': embgst_joint_hp}
27 |
28 |
29 | class Synthesizer:
30 | def __init__(self, use_gta=False, use_ref=False, use_att=True, model_name='taco2'):
31 |
32 | assert model_name in ['taco2', 'sygst', 'embgst', 'emogst', 'embgst_joint']
33 |
34 | self.use_gta = use_gta # whether to use ground-truth alignment
35 | self.use_ref = use_ref # whether to use a reference mel
36 | self.use_att = use_att # whether to use attention weights
37 | self.hp = map_hp[model_name]
38 | self.model = map_model[model_name](self.hp, use_gta) if model_name != 'embgst_joint' else map_model[model_name](embgst_joint_hp, ser_hp, use_gta)
39 | self.model_name = model_name
40 |
41 | # build model
42 | with tf.name_scope('model'):
43 | h, t = self.hp.gst_heads, self.hp.gst_tokens
44 | self.text_inputs = Input([None], dtype=tf.int32, name='text_inputs')
45 | self.mel_inputs = Input([None, self.hp.num_mels], dtype=tf.float32, name='mel_inputs')
46 | self.mel_lengths = Input([], dtype=tf.int32, name='mel_lengths')
47 | self.ref_inputs = Input([None, self.hp.num_mels], dtype=tf.float32, name='ref_inputs')
48 | self.ref_lengths = Input([], dtype=tf.int32, name='ref_lengths')
49 | self.aro_weights_ph = Input([h, 1, t], dtype=tf.float32, name='arousal_weight_ph')
50 | self.val_weights_ph = Input([h, 1, t], dtype=tf.float32, name='valence_weight_ph')
51 | self.atten_weights_ph = Input([h, 1, t], dtype=tf.float32, name='attention_weights_ph')
52 |
53 | call_fn_kwargs = {}
54 | if use_gta:
55 | assert not use_ref
56 | call_fn_kwargs.update(mel_inputs=self.mel_inputs,
57 | spec_lengths=self.mel_lengths)
58 | if use_ref:
59 | assert not use_att and model_name != 'taco2'
60 | call_fn_kwargs.update(ref_inputs=self.ref_inputs,
61 | ref_lengths=self.ref_lengths)
62 | if use_att:
63 | if model_name in ['sygst', 'emogst']:
64 | call_fn_kwargs.update(atten_weights_ph=self.atten_weights_ph)
65 | elif model_name in ['embgst', 'embgst_joint']:
66 | call_fn_kwargs.update(aro_weights_ph=self.aro_weights_ph,
67 | val_weights_ph=self.val_weights_ph)
68 | self.model_call_fn_kwargs = call_fn_kwargs
69 | self.model(self.text_inputs, training=False, **call_fn_kwargs)
70 |
71 | # outputs
72 | model = self.model if self.model_name != 'embgst_joint' else self.model.tts_model
73 | self.seq_length_outputs = model.seq_length_outputs
74 | self.mel_outputs = model.mel_outputs
75 | self.spec_outputs = model.spec_outputs
76 | self.wav_outputs = audio.inv_spectrogram_tensorflow(self.hp, self.spec_outputs)
77 | self.alignment_outputs = model.alignment_outputs
78 |
79 | def load(self, ckpt_path):
80 | self.eval_step = re.search(r'ckpt-(\d+)', ckpt_path).group(1)
81 | self.session = tf.Session()
82 | saver = tf.train.Saver()
83 | saver.restore(self.session, ckpt_path)
84 |
85 | def synthesize(self, text_seqs, texts, output_path,
86 | mel_inputs=None, mel_lengths=None,
87 | ref_inputs=None, ref_lengths=None,
88 | atten_weights=None, aro_weights=None, val_weights=None):
89 |
90 | feed_dict = {self.text_inputs: text_seqs}
91 | if mel_inputs is not None:
92 | feed_dict.update({self.mel_inputs: mel_inputs,
93 | self.mel_lengths: mel_lengths})
94 | if ref_inputs is not None:
95 | feed_dict.update({self.ref_inputs: ref_inputs,
96 | self.ref_lengths: ref_lengths})
97 | if aro_weights is not None and val_weights is not None:
98 | feed_dict.update({self.aro_weights_ph: aro_weights,
99 | self.val_weights_ph: val_weights})
100 | if atten_weights is not None:
101 | feed_dict.update({self.atten_weights_ph: atten_weights})
102 |
103 | self.now_time = datetime.now().strftime('%Y-%m-%d %H:%M')
104 | lens, wavs, mels, specs, aligns = self.session.run([self.seq_length_outputs,
105 | self.wav_outputs,
106 | self.mel_outputs,
107 | self.spec_outputs,
108 | self.alignment_outputs],
109 | feed_dict=feed_dict)
110 | self.post_process(output_path, texts, lens, wavs, mels, specs, aligns)
111 |
112 | def post_process(self, output_path, texts, lens, wavs, mels, specs, aligns):
113 |
114 | zipped_inputs = zip(output_path, texts, lens, wavs, mels, specs, aligns)
115 | for path, text, mel_len, wav, mel, spec, align in zipped_inputs:
116 | wav = audio.inv_preemphasis(self.hp, wav)
117 | end_point = audio.find_endpoint(self.hp, wav)
118 | # end_point = wav.shape[0]
119 | # end_point = int((mel_len * self.hp.hop_ms / 1000) * self.hp.sample_rate)
120 | wav = wav[:end_point]
121 | mel_len = int(end_point / (self.hp.hop_ms / 1000 * self.hp.sample_rate)) + 1
122 | paths = [path + suffix for suffix in ['.wav', '-mel.png', '-spec.png', '-align.png']]
123 | wav_path, mel_path, spec_path, align_path = paths
124 | title = f'{self.model_name}, {self.eval_step}, {self.now_time}'
125 | info = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
126 | plot.plot_alignment(align[:, : mel_len], align_path, info, title)
127 | plot.plot_mel(mel[: mel_len, :], mel_path, info, title)
128 | plot.plot_mel(spec[: mel_len, :], spec_path, info, title)
129 | audio.save_wav(self.hp, wav, wav_path)
130 |
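An end-to-end usage sketch (the checkpoint path and output prefix are hypothetical; the weights file is of the kind predict_attention.py saves): synthesize one utterance while steering prosody with precomputed GST attention weights.

import numpy as np
from text import text_to_sequence
from synthesizer import Synthesizer

synth = Synthesizer(use_att=True, model_name='sygst')
synth.load('sygst_emo_data/ckpts/model.ckpt-100000')
text = 'Hello world.'
seq = text_to_sequence(text, ['english_cleaners'])
weights = np.load('sygst_emo_data/emo2d_mel_gst_weights/arousal1.npy')
synth.synthesize([seq], [text], ['outputs/demo'], atten_weights=[weights])
# writes outputs/demo.wav, -mel.png, -spec.png and -align.png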
-------------------------------------------------------------------------------- /text/__init__.py: --------------------------------------------------------------------------------
1 | import re
2 | from text import cleaners
3 | from text.symbols import symbols
4 |
5 |
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 |
10 | # Regular expression matching text enclosed in curly braces:
11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
12 |
13 |
14 | def text_to_sequence(text, cleaner_names):
15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16 |
17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded
18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
19 |
20 | Args:
21 | text: string to convert to a sequence
22 | cleaner_names: names of the cleaner functions to run the text through
23 |
24 | Returns:
25 | List of integers corresponding to the symbols in the text
26 | '''
27 | sequence = []
28 |
29 | # Check for curly braces and treat their contents as ARPAbet:
30 | while len(text):
31 | m = _curly_re.match(text)
32 | if not m:
33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
34 | break
35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
36 | sequence += _arpabet_to_sequence(m.group(2))
37 | text = m.group(3)
38 |
39 | # Append EOS token
40 | sequence.append(_symbol_to_id['~'])
41 | return sequence
42 |
43 |
44 | def sequence_to_text(sequence):
45 | '''Converts a sequence of IDs back to a string'''
46 | result = ''
47 | for symbol_id in sequence:
48 | if symbol_id in _id_to_symbol:
49 | s = _id_to_symbol[symbol_id]
50 | # Enclose ARPAbet back in curly braces:
51 | if len(s) > 1 and s[0] == '@':
52 | s = '{%s}' % s[1:]
53 | result += s
54 | return result.replace('}{', ' ')
55 |
56 |
57 | def _clean_text(text, cleaner_names):
58 | for name in cleaner_names:
59 | cleaner = getattr(cleaners, name, None)
60 | if not cleaner:
61 | raise Exception('Unknown cleaner: %s' % name)
62 | text = cleaner(text)
63 | return text
64 |
65 |
66 | def _symbols_to_sequence(symbols):
67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
68 |
69 |
70 | def _arpabet_to_sequence(text):
71 | return _symbols_to_sequence(['@' + s for s in text.split()])
72 |
73 |
74 | def _should_keep_symbol(s):
75 | return s in _symbol_to_id and s != '_' and s != '~' # '!=' rather than 'is not': identity checks on string literals are unreliable
76 |
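A quick round trip (illustrative): curly-brace ARPAbet is mapped to phoneme ids, the EOS symbol '~' is appended, and sequence_to_text restores the braces.

from text import text_to_sequence, sequence_to_text

seq = text_to_sequence('Turn left on {HH AW1 S} Street.', ['english_cleaners'])
print(sequence_to_text(seq))   # "turn left on {HH AW1 S} street.~" (EOS included)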
-------------------------------------------------------------------------------- /text/cleaners.py: --------------------------------------------------------------------------------
1 | '''
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 |
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 | 1. "english_cleaners" for English text
7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 | the symbols in symbols.py to match your data).
11 | '''
12 |
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 |
17 | # Regular expression matching whitespace:
18 | _whitespace_re = re.compile(r'\s+')
19 |
20 | # List of (regular expression, replacement) pairs for abbreviations:
21 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
22 | ('mrs', 'misess'),
23 | ('mr', 'mister'),
24 | ('dr', 'doctor'),
25 | ('st', 'saint'),
26 | ('co', 'company'),
27 | ('jr', 'junior'),
28 | ('maj', 'major'),
29 | ('gen', 'general'),
30 | ('drs', 'doctors'),
31 | ('rev', 'reverend'),
32 | ('lt', 'lieutenant'),
33 | ('hon', 'honorable'),
34 | ('sgt', 'sergeant'),
35 | ('capt', 'captain'),
36 | ('esq', 'esquire'),
37 | ('ltd', 'limited'),
38 | ('col', 'colonel'),
39 | ('ft', 'fort'),
40 | ]]
41 |
42 |
43 | def expand_abbreviations(text):
44 | for regex, replacement in _abbreviations:
45 | text = re.sub(regex, replacement, text)
46 | return text
47 |
48 |
49 | def expand_numbers(text):
50 | return normalize_numbers(text)
51 |
52 |
53 | def lowercase(text):
54 | return text.lower()
55 |
56 |
57 | def collapse_whitespace(text):
58 | return re.sub(_whitespace_re, ' ', text)
59 |
60 |
61 | def convert_to_ascii(text):
62 | return unidecode(text)
63 |
64 |
65 | def basic_cleaners(text):
66 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
67 | text = lowercase(text)
68 | text = collapse_whitespace(text)
69 | return text
70 |
71 |
72 | def transliteration_cleaners(text):
73 | '''Pipeline for non-English text that transliterates to ASCII.'''
74 | text = convert_to_ascii(text)
75 | text = lowercase(text)
76 | text = collapse_whitespace(text)
77 | return text
78 |
79 |
80 | def english_cleaners(text):
81 | '''Pipeline for English text, including number and abbreviation expansion.'''
82 | text = convert_to_ascii(text)
83 | text = lowercase(text)
84 | text = expand_numbers(text)
85 | text = expand_abbreviations(text)
86 | text = collapse_whitespace(text)
87 | return text
88 |
-------------------------------------------------------------------------------- /text/cmudict.py: --------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | valid_symbols = [
5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
12 | ]
13 |
14 | _valid_symbol_set = set(valid_symbols)
15 |
16 |
17 | class CMUDict:
18 | '''Thin wrapper around CMUDict data.
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | def _parse_cmudict(file): 40 | cmudict = {} 41 | for line in file: 42 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 43 | parts = line.split(' ') 44 | word = parts[0] 45 | pronunciation = _get_pronunciation(parts[1]) 46 | if pronunciation: 47 | if word in cmudict: 48 | cmudict[word].append(pronunciation) 49 | else: 50 | cmudict[word] = [pronunciation] 51 | return cmudict 52 | 53 | 54 | def _get_pronunciation(s): 55 | parts = s.strip().split(' ') 56 | for part in parts: 57 | if part not in _valid_symbol_set: 58 | return None 59 | return ' '.join(parts) 60 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | _inflect = inflect.engine() 5 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 6 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 7 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 8 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 9 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 10 | _number_re = re.compile(r'[0-9]+') 11 | 12 | 13 | def _remove_commas(m): 14 | return m.group(1).replace(',', '') 15 | 16 | 17 | def _expand_decimal_point(m): 18 | return m.group(1).replace('.', ' point ') 19 | 20 | 21 | def _expand_dollars(m): 22 | match = m.group(1) 23 | parts = match.split('.') 24 | if len(parts) > 2: 25 | return match + ' dollars' # Unexpected format 26 | dollars = int(parts[0]) if parts[0] else 0 27 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 28 | if dollars and cents: 29 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 30 | cent_unit = 'cent' if cents == 1 else 'cents' 31 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 32 | elif dollars: 33 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 34 | return '%s %s' % (dollars, dollar_unit) 35 | elif cents: 36 | cent_unit = 'cent' if cents == 1 else 'cents' 37 | return '%s %s' % (cents, cent_unit) 38 | else: 39 | return 'zero dollars' 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if num > 1000 and num < 3000: 49 | if num == 2000: 50 | return 'two thousand' 51 | elif num > 2000 and num < 2010: 52 | return 'two thousand ' + _inflect.number_to_words(num % 100) 53 | elif num % 100 == 0: 54 | return _inflect.number_to_words(num // 100) + ' hundred' 55 | else: 56 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 57 | else: 58 | return _inflect.number_to_words(num, andword='') 59 | 60 | 61 | def normalize_numbers(text): 62 | text = re.sub(_comma_number_re, _remove_commas, text) 63 | text = re.sub(_pounds_re, r'\1 pounds', text) 64 | text = re.sub(_dollars_re, _expand_dollars, text) 65 | text 
= re.sub(_decimal_number_re, _expand_decimal_point, text) 66 | text = re.sub(_ordinal_re, _expand_ordinal, text) 67 | text = re.sub(_number_re, _expand_number, text) 68 | return text 69 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from text import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = '"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | -------------------------------------------------------------------------------- /tfr_dset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import tensorflow as tf 4 | 5 | # from taco2_hparams import hp 6 | # from hparams import hparams as hp 7 | # cpu_num = os.cpu_count() 8 | 9 | _pad = 0. 10 | _pad_emo = 0.25 11 | _pad_bemo = 0.5 12 | _pad_token = 1. # stop token pad 1. for marking sequences finished 13 | 14 | 15 | def parse_single_example(example_proto): 16 | features = {'uid': tf.FixedLenFeature([], tf.string), 17 | 'inputs': tf.FixedLenFeature([], tf.string), 18 | 'input_lengths': tf.FixedLenFeature([], tf.int64), 19 | 'mel_targets': tf.FixedLenFeature([], tf.string), 20 | 'linear_targets': tf.FixedLenFeature([], tf.string), 21 | 'spec_lengths': tf.FixedLenFeature([], tf.int64), 22 | 'soft_emo_labels': tf.FixedLenFeature([], tf.string), 23 | 'soft_arousal_labels': tf.FixedLenFeature([], tf.string), 24 | 'soft_valance_labels': tf.FixedLenFeature([], tf.string), 25 | 'arousal_embedding': tf.FixedLenFeature([], tf.string), 26 | 'valance_embedding': tf.FixedLenFeature([], tf.string)} 27 | parsed = tf.parse_single_example(example_proto, features=features) 28 | inputs = tf.decode_raw(parsed['inputs'], tf.int32) 29 | input_lengths = tf.cast(parsed['input_lengths'], tf.int32) 30 | spec_lengths = tf.cast(parsed['spec_lengths'], tf.int32) 31 | mel_targets = tf.reshape(tf.decode_raw(parsed['mel_targets'], tf.float32), [spec_lengths, -1]) 32 | linear_targets = tf.reshape(tf.decode_raw(parsed['linear_targets'], tf.float32), [spec_lengths, -1]) 33 | soft_emo_labels = tf.decode_raw(parsed['soft_emo_labels'], tf.float32) 34 | soft_arousal_labels = tf.decode_raw(parsed['soft_arousal_labels'], tf.float32) 35 | soft_valance_labels = tf.decode_raw(parsed['soft_valance_labels'], tf.float32) 36 | arousal_embedding = tf.decode_raw(parsed['arousal_embedding'], tf.float32) 37 | valance_embedding = tf.decode_raw(parsed['valance_embedding'], tf.float32) 38 | return {'uid': parsed['uid'], 39 | 'inputs': inputs, 40 | 'input_lengths': input_lengths, 41 | 'mel_targets': mel_targets, 42 | 'linear_targets': linear_targets, 43 | 'spec_lengths': spec_lengths, 44 | 'soft_emo_labels': soft_emo_labels, 45 | 'soft_arousal_labels': soft_arousal_labels, 46 | 'soft_valance_labels': soft_valance_labels, 47 | 'arousal_embedding': arousal_embedding, 48 | 'valance_embedding': valance_embedding} 49 | 50 | 51 | def 
parse_single_example_for_merge_emo_feature(example_proto): 52 | features = {'uid': tf.FixedLenFeature([], tf.string), 53 | 'inputs': tf.FixedLenFeature([], tf.string), 54 | 'input_lengths': tf.FixedLenFeature([], tf.int64), 55 | 'mel_targets': tf.FixedLenFeature([], tf.string), 56 | 'linear_targets': tf.FixedLenFeature([], tf.string), 57 | 'spec_lengths': tf.FixedLenFeature([], tf.int64), 58 | 'soft_emo_labels': tf.FixedLenFeature([], tf.string)} 59 | parsed = tf.parse_single_example(example_proto, features=features) 60 | return parsed 61 | 62 | 63 | def load_for_merge_emo_features(tfr_dir): 64 | file_pattern = os.path.join(tfr_dir, '*.tfr') 65 | tfrecord_files_num = len(glob.glob(file_pattern)) 66 | tfrecord_files = tf.data.Dataset.list_files(file_pattern, shuffle=True) 67 | dataset = tfrecord_files.apply( 68 | tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset, 69 | cycle_length=min( 70 | tfrecord_files_num, 71 | 128), 72 | block_length=1)) 73 | dataset = dataset.map(parse_single_example_for_merge_emo_feature, 74 | num_parallel_calls=os.cpu_count()) 75 | return dataset 76 | 77 | 78 | def load_for_prepare_meta_from_tfr(tfr_dir): 79 | file_pattern = os.path.join(tfr_dir, '*.tfr') 80 | tfrecord_files_num = len(glob.glob(file_pattern)) 81 | tfrecord_files = tf.data.Dataset.list_files(file_pattern, shuffle=True) 82 | dataset = tfrecord_files.apply( 83 | tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset, 84 | cycle_length=min( 85 | tfrecord_files_num, 86 | 128), 87 | block_length=1)) 88 | dataset = dataset.map(parse_single_example, 89 | num_parallel_calls=os.cpu_count()) 90 | return dataset 91 | 92 | 93 | class TFDataSet(object): 94 | def __init__(self, 95 | hp, 96 | tfr_dir, 97 | cache_path=None, 98 | valid_batches=None, 99 | load_for_rayhame=False): 100 | """Load the bc2013 dataset as a tf.data.Dataset object for training TTS models 101 | # Arguments 102 | tfr_dir: the path of the tf record files for bc2013 103 | batch_size: the batch size used for training and evaluating 104 | valid_batches: split the first 'valid_batches' batches off as the validation 105 | set; if None, the whole dataset is returned as the training set 106 | outputs_per_step: emit this number of frames at each tacotron 107 | decoder time step; it is used to trim the spectrogram lengths 108 | load_for_rayhame: whether to load data for training Rayhame's Tacotron2 109 | model; if True, we will trim down the spectrograms and compute the 110 | stop token targets 111 | """ 112 | 113 | self.tfr_dir = tfr_dir 114 | self.cache_path = cache_path 115 | self.num_mels = hp.num_mels 116 | self.num_spec = hp.num_spec 117 | self.batch_size = hp.batch_size 118 | self.outputs_per_step = hp.outputs_per_step 119 | self.valid_batches = valid_batches 120 | self.load_for_rayhame = load_for_rayhame 121 | 122 | def load(self): 123 | 124 | # Load the tf record files to a tf.data.Dataset object 125 | auto_tune = tf.data.experimental.AUTOTUNE 126 | file_pattern = os.path.join(self.tfr_dir, '*.tfr') 127 | tfrecord_files_num = len(glob.glob(file_pattern)) 128 | tfrecord_files = tf.data.Dataset.list_files(file_pattern, shuffle=True) 129 | dataset = tfrecord_files.apply( 130 | tf.data.experimental.parallel_interleave( 131 | tf.data.TFRecordDataset, 132 | cycle_length=min(tfrecord_files_num, 128), 133 | block_length=1 134 | ) 135 | ) 136 | 137 | # Deserialize each tf record example to a dict of Tensors 138 | dataset = dataset.map(lambda x: parse_single_example(x), 139 | num_parallel_calls=auto_tune) 140 |
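# A sketch of the filter below (illustrative numbers, mirroring the code): an
# example with spec_lengths = 60 (< 80 frames) or spec_lengths = 900 (> 800
# frames) is dropped, and so is a short-text/long-audio outlier with
# input_lengths < 70 but spec_lengths > 700, since such a pair is likely a
# mismatched transcript.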
141 | # Filter samples by spectrogram length to remove mismatched text-audio pairs 142 | def len_filter(x): 143 | return tf.logical_not( 144 | tf.logical_or( 145 | tf.logical_or(x['spec_lengths'] < 80, x['spec_lengths'] > 800), 146 | tf.logical_and(x['input_lengths'] < 70, x['spec_lengths'] > 700) 147 | ) 148 | ) 149 | dataset = dataset.filter(len_filter) 150 | 151 | def trim_down_lengths(x, prepare_stop_targets=True): 152 | r = self.outputs_per_step 153 | spec_len = x['spec_lengths'] 154 | trim_len = tf.cast(spec_len / r, dtype=tf.int32) * r 155 | x['mel_targets'] = x['mel_targets'][: trim_len] 156 | x['linear_targets'] = x['linear_targets'][: trim_len] 157 | x['spec_lengths'] = trim_len 158 | if prepare_stop_targets: 159 | x['token_targets'] = tf.concat([tf.zeros(trim_len - r), tf.ones(r)], axis=0) 160 | return x 161 | # Load for Rayhame's Tacotron2 model 162 | if self.load_for_rayhame: 163 | assert self.valid_batches is not None 164 | assert self.outputs_per_step is not None 165 | dataset = dataset.map(trim_down_lengths, auto_tune) 166 | 167 | # Maybe split the valid dataset and training dataset 168 | valid_dataset = None 169 | if self.valid_batches: 170 | valid_size = self.valid_batches * self.batch_size 171 | dataset = dataset.shuffle(buffer_size=10000) 172 | valid_dataset = dataset.take(valid_size) # validation set 173 | dataset = dataset.skip(valid_size) # training set 174 | 175 | # Perform a bucket and padded batch transform for training set 176 | bucket_num = 10 177 | bucket_batch_sizes = [self.batch_size] * bucket_num 178 | bucket_boundaries = [25, 40, 55, 70, 85, 100, 135, 170, 220] 179 | padded_shapes = {'uid': [], 180 | 'inputs': [None], 181 | 'input_lengths': [], 182 | 'mel_targets': [None, self.num_mels], 183 | 'linear_targets': [None, self.num_spec], 184 | 'spec_lengths': [], 185 | 'soft_emo_labels': [None], 186 | 'soft_arousal_labels': [None], 187 | 'soft_valance_labels': [None], 188 | 'arousal_embedding': [256], 189 | 'valance_embedding': [256]} 190 | padded_values = {'uid': '\0', 191 | 'inputs': 0, 192 | 'input_lengths': 0, 193 | 'mel_targets': _pad, 194 | 'linear_targets': _pad, 195 | 'spec_lengths': 0, 196 | 'soft_emo_labels': _pad_emo, 197 | 'soft_arousal_labels': _pad_bemo, 198 | 'soft_valance_labels': _pad_bemo, 199 | 'arousal_embedding': _pad, 200 | 'valance_embedding': _pad} 201 | 202 | if self.load_for_rayhame: 203 | padded_shapes.update({'token_targets': [None]}) 204 | padded_values.update({'token_targets': _pad_token}) 205 | 206 | dataset = dataset.apply( 207 | tf.data.experimental.bucket_by_sequence_length( 208 | lambda x: x['input_lengths'], 209 | bucket_boundaries=bucket_boundaries, 210 | bucket_batch_sizes=bucket_batch_sizes, 211 | padded_shapes=padded_shapes, 212 | padding_values=padded_values, 213 | pad_to_bucket_boundary=False, 214 | no_padding=False 215 | ) 216 | ) 217 | 218 | # Shuffle and repeat infinitely and prefetch 10 batches 219 | dataset = dataset.apply( 220 | tf.data.experimental.shuffle_and_repeat(buffer_size=128)) 221 | dataset = dataset.prefetch(buffer_size=10) # Prefetch 10 batches of samples 222 | if self.cache_path: # not None and not '' 223 | # assert os.path.isdir(self.cache_path) 224 | # dataset = dataset.cache(os.path.join(self.cache_path, 'cached_bc2013')) 225 | pass # caching is broken here: it quickly exhausts both disk and memory 226 | 227 | # Perform padded batch transform for validation dataset 228 | if valid_dataset is not None: 229 | valid_dataset = valid_dataset.apply( 230 | tf.data.experimental.shuffle_and_repeat(self.valid_batches, count=1)) 231 | valid_dataset = valid_dataset.padded_batch( 232 |
self.batch_size, padded_shapes, padded_values) 233 | valid_dataset = valid_dataset.cache() 234 | 235 | self.dataset = dataset 236 | self.valid_dataset = valid_dataset 237 | 238 | def get_train_next(self): 239 | if not hasattr(self, 'dataset'): 240 | self.load() 241 | train_next = self.dataset.make_one_shot_iterator().get_next() 242 | return train_next 243 | 244 | def get_valid_iter_and_next(self): 245 | assert self.valid_batches is not None 246 | if not hasattr(self, 'valid_dataset'): 247 | self.load() 248 | init_iter = self.valid_dataset.make_initializable_iterator() 249 | return init_iter.initializer, init_iter.get_next() 250 | 251 | 252 | def test(): 253 | pass 254 | 255 | 256 | if __name__ == '__main__': 257 | test() 258 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import tensorflow as tf 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | 7 | 8 | _mel_basis = None 9 | 10 | 11 | def load_wav(hp, path): 12 | wav, sr = librosa.core.load(path, sr=hp.sample_rate) 13 | wav = wav / np.abs(wav).max() * 0.999 14 | return wav, sr 15 | 16 | 17 | def save_wav(hp, wav, path): 18 | wav /= max(0.01, np.max(np.abs(wav))) 19 | wavfile.write(path, hp.sample_rate, (wav * 32766).astype(np.int16)) 20 | 21 | 22 | def preemphasis(hp, x): 23 | return signal.lfilter([1, -hp.preemphasis], [1], x) 24 | 25 | 26 | def inv_preemphasis(hp, x): 27 | return signal.lfilter([1], [1, -hp.preemphasis], x) 28 | 29 | 30 | def spectrogram(hp, y): 31 | if hp.preemphasis is None: 32 | D = _stft(hp, y) 33 | else: 34 | D = _stft(hp, preemphasis(hp, y)) 35 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 36 | return _normalize(hp, S) 37 | 38 | 39 | def inv_spectrogram(hp, spectrogram): 40 | '''Converts spectrogram to waveform using librosa''' 41 | S = _db_to_amp(_denormalize(hp, spectrogram) + hp.ref_level_db) # Convert back to linear 42 | return inv_preemphasis(hp, _griffin_lim(hp, S ** hp.griffin_lim_power)) # Reconstruct phase 43 | 44 | 45 | def inv_spectrogram_tensorflow(hp, spectrogram): 46 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 47 | 48 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 49 | inv_preemphasis on the output after running the graph. 
50 | ''' 51 | with tf.name_scope('griffin_lim'): 52 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(hp, spectrogram) + hp.ref_level_db) 53 | return _griffin_lim_tensorflow(hp, tf.pow(S, hp.griffin_lim_power)) 54 | 55 | 56 | def melspectrogram(hp, y): 57 | if hp.preemphasis is None: 58 | D = _stft(hp, y) 59 | else: 60 | D = _stft(hp, preemphasis(hp, y)) 61 | S = _amp_to_db(_linear_to_mel(hp, np.abs(D))) - hp.ref_level_db 62 | return _normalize(hp, S) 63 | 64 | 65 | def mfcc(hp, y): 66 | pass 67 | 68 | 69 | def find_endpoint(hp, wav, threshold_db=-40, min_silence_sec=0.8): 70 | window_length = int(hp.sample_rate * min_silence_sec) 71 | hop_length = int(window_length / 4) 72 | threshold = _db_to_amp(threshold_db) 73 | for x in range(hop_length, len(wav) - window_length, hop_length): 74 | if np.max(wav[x:x + window_length]) < threshold: 75 | return x + hop_length 76 | return len(wav) 77 | 78 | 79 | def _griffin_lim(hp, S): 80 | '''librosa implementation of Griffin-Lim 81 | Based on https://github.com/librosa/librosa/issues/434 82 | ''' 83 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 84 | S_complex = np.abs(S).astype(np.complex) 85 | y = _istft(hp, S_complex * angles) 86 | for i in range(hp.griffin_lim_iters): 87 | angles = np.exp(1j * np.angle(_stft(hp, y))) 88 | y = _istft(hp, S_complex * angles) 89 | return y # reconstructed wav 90 | 91 | 92 | def _griffin_lim_tensorflow(hp, S): 93 | '''TensorFlow implementation of Griffin-Lim 94 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 95 | ''' 96 | with tf.variable_scope('griffinlim'): 97 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 98 | S = tf.expand_dims(S, 0) 99 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 100 | y = _istft_tensorflow(hp, S_complex) 101 | for i in range(hp.griffin_lim_iters): 102 | est = _stft_tensorflow(hp, y) 103 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 104 | y = _istft_tensorflow(hp, S_complex * angles) 105 | return tf.squeeze(y, 0) 106 | 107 | 108 | def _stft(hp, y): 109 | n_fft, hop_length, win_length = _stft_parameters(hp) 110 | # shape (1 + n_fft/2, n_frames) 111 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 112 | 113 | 114 | def _istft(hp, y): 115 | _, hop_length, win_length = _stft_parameters(hp) 116 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 117 | 118 | 119 | def _stft_tensorflow(hp, signals): 120 | n_fft, hop_length, win_length = _stft_parameters(hp) 121 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 122 | 123 | 124 | def _istft_tensorflow(hp, stfts): 125 | n_fft, hop_length, win_length = _stft_parameters(hp) 126 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 127 | 128 | 129 | def _stft_parameters(hp): 130 | n_fft = hp.n_fft 131 | hop_length = int(hp.hop_ms / 1000 * hp.sample_rate) 132 | win_length = int(hp.win_ms / 1000 * hp.sample_rate) 133 | return n_fft, hop_length, win_length 134 | 135 | 136 | def _linear_to_mel(hp, spectrogram): 137 | global _mel_basis 138 | if _mel_basis is None: 139 | _mel_basis = _build_mel_basis(hp) 140 | return np.dot(_mel_basis, spectrogram) 141 | 142 | 143 | def _build_mel_basis(hp): 144 | n_fft = hp.n_fft 145 | return librosa.filters.mel(hp.sample_rate, n_fft, n_mels=hp.num_mels) 146 | 147 | 148 | def _amp_to_db(x): 149 | # return 20 * np.log10(np.maximum(1e-5, x)) 150 | return 20 * 
np.log10(np.maximum(1e-4, x)) # floor of -80 dB, since ref_level_db is subtracted afterwards 151 | 152 | 153 | def _db_to_amp(x): 154 | return np.power(10.0, x * 0.05) 155 | 156 | 157 | def _db_to_amp_tensorflow(x): 158 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 159 | 160 | 161 | def _normalize(hp, S): 162 | # This is questionable: any S > 0 gets clipped, i.e. everything louder 163 | # than ref_level_db is flattened down to ref_level_db 164 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 165 | 166 | 167 | def _denormalize(hp, S): 168 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 169 | 170 | 171 | def _denormalize_tensorflow(hp, S): 172 | return (tf.clip_by_value(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 173 | -------------------------------------------------------------------------------- /utils/ce_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def ce_loss(soft_labels, logits): 5 | probs = tf.clip_by_value(tf.nn.softmax(logits, axis=-1), 1e-10, 10) 6 | ce = -tf.reduce_mean(tf.reduce_sum(soft_labels * tf.log(probs), axis=-1)) 7 | return ce 8 |
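A small usage sketch for ce_loss (the values and the import path are illustrative assumptions, not code from this repo):

import tensorflow as tf
from utils.ce_loss_util import ce_loss  # assumed import path

# Hypothetical batch of 2 samples with 4-way soft emotion labels.
soft_labels = tf.constant([[0.7, 0.1, 0.1, 0.1],
                           [0.25, 0.25, 0.25, 0.25]], tf.float32)
logits = tf.constant([[2.0, 0.0, 0.0, 0.0],
                      [0.5, 0.5, 0.5, 0.5]], tf.float32)
loss = ce_loss(soft_labels, logits)  # scalar soft-label cross entropy
with tf.Session() as sess:
    print(sess.run(loss))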
-------------------------------------------------------------------------------- /utils/center_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def calc_center_loss(features, centers, emo_labels, is_l1=True): 5 | """ 6 | Args: 7 | features: [batch_size, dim] 8 | centers: [emo_num, dim] 9 | emo_labels: [batch_size, emo_num] 10 | Returns: 11 | the scalar center loss, weighted by the soft emotion labels 12 | """ 13 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 14 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim] 15 | diff = features_ - centers_ # [batch_size, emo_num, dim] 16 | dist = tf.reduce_sum(tf.square(diff), axis=-1) # [batch_size, emo_num] 17 | if is_l1: 18 | dist = tf.sqrt(dist) 19 | loss = tf.reduce_mean(tf.reduce_sum(dist * emo_labels, axis=-1)) 20 | return loss 21 | 22 | 23 | def update_center(features, centers, emo_labels, alpha): 24 | """ 25 | Args: 26 | features: [batch_size, dim] 27 | centers: [emo_num, dim] 28 | emo_labels: [batch_size, emo_num] 29 | Returns: the op that moves the centers towards the weighted feature means 30 | """ 31 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 32 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim] 33 | emo_labels_ = tf.expand_dims(emo_labels, -1) # [batch_size, emo_num, 1] 34 | diff = features_ - centers_ # [batch_size, emo_num, dim] 35 | weighted_emo_diff = diff * emo_labels_ # [batch_size, emo_num, dim] 36 | sum_emo_diff = tf.reduce_sum(weighted_emo_diff, axis=0) # [emo_num, dim] 37 | emo_sum = tf.clip_by_value(tf.reduce_sum(emo_labels_, axis=0), 0.001, 100) # [emo_num, 1] 38 | alpha = tf.clip_by_value(alpha, 0., 1.) 39 | c_diff = tf.math.divide(sum_emo_diff, emo_sum) # [emo_num, dim] 40 | alpha_c_diff = alpha * c_diff 41 | update_center_op = tf.assign_add(centers, alpha_c_diff) 42 | return update_center_op 43 | 44 | 45 | def test_calc_center_loss(): 46 | with tf.Session() as sess: 47 | centers = tf.Variable([[1, -1], [0, 1]], trainable=False, dtype=tf.float32) 48 | features = tf.constant([[-1, 0], [-1, -1], [2, 0]], dtype=tf.float32) 49 | emo_labels = tf.constant([[0, 1.0], [0, 1.0], [0, 1.0]], dtype=tf.float32) 50 | sess.run(tf.global_variables_initializer()) 51 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 52 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 53 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 54 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 55 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 56 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 57 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 58 | print(sess.run(centers)) 59 | # loss = sess.run(calc_center_loss(features, centers, emo_labels, is_l1=True)) 60 | # print(loss) 61 | 62 | 63 | if __name__ == '__main__': 64 | test_calc_center_loss() 65 |
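A sketch of how these two helpers are typically wired into a training step (the variable names, sizes and the control-dependency pattern are assumptions for illustration, not code from this repo):

import tensorflow as tf
from utils.center_loss_util import calc_center_loss, update_center  # assumed path

emo_num, dim = 4, 256
features = tf.placeholder(tf.float32, [None, dim])
emo_labels = tf.placeholder(tf.float32, [None, emo_num])
centers = tf.Variable(tf.zeros([emo_num, dim]), trainable=False)

center_loss = calc_center_loss(features, centers, emo_labels, is_l1=True)
# Ensure the centers are updated whenever the loss is evaluated.
with tf.control_dependencies([update_center(features, centers, emo_labels, alpha=0.5)]):
    center_loss = tf.identity(center_loss)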
-------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tqdm import tqdm 4 | from functools import partial 5 | from concurrent.futures import ProcessPoolExecutor 6 | 7 | 8 | def bytes_feature(value): 9 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 10 | 11 | 12 | def float_feature(value): 13 | return tf.train.Feature(float_list=tf.train.FloatList(value=[value])) 14 | 15 | 16 | def int64_feature(value): 17 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 18 | 19 | 20 | def data_process_pipeline(hp_or_obj, meta_file, process_one_line_fun=None, 21 | postprocess_fun=None, max_workers=None, **kwargs): 22 | """This func reads the meta file and runs custom funcs to get the final data 23 | 24 | This func performs the following data preprocessing pipeline: 25 | 01 read the meta file and run the meta_fun func to parse each line in the meta file; the meta_fun 26 | must return a tuple of two elements: data samples (e.g., wavs or images) and labels dicts 27 | 02 pass the obtained data samples to the feature_fun func to get feature data 28 | 03 the list of feature samples is passed to the postprocess_fun to do some postprocessing, 29 | e.g., normalizations, fixed length padding 30 | 31 | # Arguments 32 | hp: the hyper parameter object of type 'HParams', or another type of object whose attributes 33 | expose all hyper parameters 34 | meta_file: the meta file where each line generally contains the path of a data sample and its labels 35 | meta_fun: this func takes a line in meta_file and the hp object as inputs and returns a tuple of 36 | data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature 37 | of meta_fun is: def fun_name(hp, line), and its return value is: ([data samples], [labels dicts]), 38 | or (None, None) if no sample is returned. Note: even if only one sample is returned, it must 39 | still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}. 40 | Note: this can return (None, None) because some samples may be filtered out, e.g., when their 41 | length does not meet the requirements 42 | feature_fun: this func takes a single data sample returned by meta_fun and the hp object as inputs, 43 | and returns a single feature sample. The signature is: def feature_fun(hp, sample) 44 | postprocess_fun: this func takes the list of all feature samples and hp as inputs and returns the 45 | postprocessed list of feature samples, with the same length. Generally, feature normalizations, 46 | fixed length padding and sorting the samples by length are performed in this func. The signature 47 | is: def postprocess_fun(hp, features) 48 | kwargs: extra keyword arguments that will be passed through to all the funcs. 49 | 50 | # Returns 51 | A tuple of length 2: the first element is the list of all postprocessed features, and the second element 52 | is the list of all labels (each label is a list converted from the labels dict). For example, the return 53 | value can be: ([sample1, .., samplen], [[label1_1, label2_2], [label2_1, label_22], .., [labeln_1, labeln_2]]) 54 | 55 | # Exceptions 56 | TypeError: if either value returned by meta_fun is not a list 57 | """ 58 | if process_one_line_fun is None and not hasattr(hp_or_obj, 'process_one_line'): 59 | raise ValueError('hp_or_obj without process_one_line method and process_one_line_fun is None') 60 | if postprocess_fun is None and not hasattr(hp_or_obj, 'postprocess'): 61 | raise ValueError('hp_or_obj without postprocess method and postprocess_fun is None') 62 | 63 | with open(meta_file) as fr: 64 | lines = [line for line in fr if line.strip() and line[0] != '#'] 65 | 66 | if hasattr(hp_or_obj, 'process_one_line'): 67 | process_one_line_fun = type(hp_or_obj).process_one_line 68 | if hasattr(hp_or_obj, 'postprocess'): 69 | postprocess_fun = type(hp_or_obj).postprocess 70 | 71 | # Process the meta lines to get samples and labels 72 | print(' step 1: parsing meta and getting features ...') 73 | num = len(lines) 74 | hps = [hp_or_obj] * num 75 | features, labels = [], [] 76 | with ProcessPoolExecutor(max_workers) as p: 77 | for r in tqdm(p.map(partial(process_one_line_fun, **kwargs), hps, lines), total=num): 78 | ds, ls = r 79 | if (ds, ls) != (None, None): 80 | if type(ds) != list or type(ls) != list: 81 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 82 | features += ds 83 | labels += ls 84 | ''' 85 | for line in tqdm(lines): 86 | ds, ls = process_one_line_fun(hp_or_obj, line) 87 | if (ds, ls) != (None, None): 88 | if type(ds) != list or type(ls) != list: 89 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 90 | features += ds 91 | labels += ls 92 | ''' 93 | 94 | # np.save('recola_wav01_mel_nonorm_before_post.npy', features[0]) 95 | # print('DEBUG before post', features[0].shape, labels[0]) 96 | # Postprocess 97 | print(' step 2: postprocessing for features and labels ...') 98 | features, labels = postprocess_fun(hp_or_obj, features, labels, **kwargs) 99 | return features, labels 100 | 101 |
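A minimal sketch of the two hooks this pipeline expects (the meta format, feature choice and threshold below are illustrative assumptions):

import numpy as np

def process_one_line(hp, line, **kwargs):
    # assumed meta line format: "mel_npy_path|label"
    path, label = line.strip().split('|')
    feature = np.load(path)        # e.g., a precomputed mel spectrogram
    if len(feature) < 10:          # too short -> filter out with (None, None)
        return None, None
    return [feature], [{'L1': int(label)}]

def postprocess(hp, features, labels, **kwargs):
    # e.g., sort samples by length so that batches are homogeneous
    order = np.argsort([len(f) for f in features])
    return [features[i] for i in order], [labels[i] for i in order]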
102 | def get_class_weights(class_nums, type=0, power=1): 103 | if type == 0: 104 | return [1.] * len(class_nums) 105 | # Recompute class_nums and the total according to power 106 | total, class_ws = 0, class_nums.copy() 107 | for cls in range(len(class_ws)): 108 | class_ws[cls] = class_ws[cls] ** power 109 | total += class_ws[cls] 110 | # Invert the weights, then divide by their mean so they average to 1 (following Lao Dai's approach) 111 | if type == 1: 112 | wsum = 0 113 | for cls in range(len(class_ws)): 114 | class_ws[cls] = 1 / class_ws[cls] 115 | wsum += class_ws[cls] 116 | wmean = wsum / len(class_nums) 117 | class_ws = [w / wmean for w in class_ws] 118 | # Invert and multiply by total / 2, see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data 119 | elif type == 2: 120 | class_ws = [0.5 * total / w for w in class_ws] 121 | return class_ws 122 |
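A worked example of the three weighting modes (hypothetical counts of 100 and 300 samples, power=1):

print(get_class_weights([100, 300], type=0))  # [1.0, 1.0]     (no weighting)
print(get_class_weights([100, 300], type=1))  # [1.5, 0.5]     (inverse counts, normalized to mean 1)
print(get_class_weights([100, 300], type=2))  # [2.0, 0.666..] (0.5 * total / count)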
-------------------------------------------------------------------------------- /utils/data.py.bak-0707: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from functools import partial 3 | from concurrent.futures import ProcessPoolExecutor 4 | 5 | 6 | def data_process_pipeline(hp, meta_file, meta_fun, feature_fun, postprocess_fun, 7 | max_workers=None, **kwargs): 8 | """This func reads the meta file and runs three custom funcs to get the final data 9 | 10 | This func performs the following data preprocessing pipeline: 11 | 01 read the meta file and run the meta_fun func to parse each line in the meta file; the meta_fun 12 | must return a tuple of two elements: data samples (e.g., wavs or images) and labels dicts 13 | 02 pass the obtained data samples to the feature_fun func to get feature data 14 | 03 the list of feature samples is passed to the postprocess_fun to do some postprocessing, 15 | e.g., normalizations, fixed length padding 16 | 17 | # Arguments 18 | hp: the hyper parameter object of type 'HParams', or another type of object whose attributes 19 | expose all hyper parameters 20 | meta_file: the meta file where each line generally contains the path of a data sample and its labels 21 | meta_fun: this func takes a line in meta_file and the hp object as inputs and returns a tuple of 22 | data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature 23 | of meta_fun is: def fun_name(hp, line), and its return value is: ([data samples], [labels dicts]), 24 | or (None, None) if no sample is returned. Note: even if only one sample is returned, it must 25 | still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}. 26 | Note: this can return (None, None) because some samples may be filtered out, e.g., when their 27 | length does not meet the requirements 28 | feature_fun: this func takes a single data sample returned by meta_fun and the hp object as inputs, 29 | and returns a single feature sample. The signature is: def feature_fun(hp, sample) 30 | postprocess_fun: this func takes the list of all feature samples and hp as inputs and returns the 31 | postprocessed list of feature samples, with the same length. Generally, feature normalizations, 32 | fixed length padding and sorting the samples by length are performed in this func. The signature 33 | is: def postprocess_fun(hp, features) 34 | kwargs: extra keyword arguments that will be passed to all three funcs. 35 | 36 | # Returns 37 | A tuple of length 2: the first element is the list of all postprocessed features, and the second element 38 | is the list of all labels (each label is a list converted from the labels dict). For example, the return 39 | value can be: ([sample1, .., samplen], [[label1_1, label2_2], [label2_1, label_22], .., [labeln_1, labeln_2]]) 40 | 41 | # Exceptions 42 | TypeError: if either value returned by meta_fun is not a list 43 | """ 44 | with open(meta_file) as fr: 45 | lines = fr.readlines() 46 | 47 | # Process the meta lines to get samples and labels 48 | print('\n[Beginning to process data ...]') 49 | print('step 1: parsing meta and loading original data samples ...') 50 | datas, labels = [], [] 51 | if max_workers == -1: 52 | for line in tqdm(lines): 53 | ds, ls, info = meta_fun(hp, line, **kwargs) 54 | if (ds, ls) != (None, None): 55 | if type(ds) != list or type(ls) != list: 56 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 57 | datas += ds 58 | labels += ls 59 | else: 60 | num = len(lines) 61 | hps = [hp] * num 62 | with ProcessPoolExecutor(max_workers) as p: 63 | for r in tqdm(p.map(partial(meta_fun, **kwargs), hps, lines), total=num): 64 | ds, ls, info = r 65 | if (ds, ls) != (None, None): 66 | if type(ds) != list or type(ls) != list: 67 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 68 | datas += ds 69 | labels += ls 70 | hp.sr = info['sr'] 71 | 72 | # Process the samples to get features 73 | print('step 2: getting features from original data samples ...') 74 | if max_workers == -1: 75 | datas = [feature_fun(hp, x, **kwargs) for x in tqdm(datas)] 76 | else: 77 | num = len(datas) 78 | hps = [hp] * num 79 | with ProcessPoolExecutor(max_workers) as p: 80 | datas = [r for r in tqdm(p.map(partial(feature_fun, **kwargs), hps, datas), total=num)] 81 | 82 | # Postprocess 83 | print('step 3: postprocessing for features ...') 84 | datas = postprocess_fun(hp, datas, **kwargs) 85 | 86 | for i in range(len(labels)): 87 | labels[i] = list(labels[i].values()) # a list is better here; tuples cannot be modified 88 | return datas, labels 89 | -------------------------------------------------------------------------------- /utils/debug.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def debug_print(*args, **kwargs): 5 | print_op = tf.print(*args, **kwargs) 6 | tf.add_to_collection('print_ops', print_op) 7 | 8 | 9 | def get_ops(): 10 | return tf.get_collection('print_ops') 11 |
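A sketch of how these two helpers are meant to be used in a TF1 graph (the tensors here are illustrative):

import tensorflow as tf
from utils.debug import debug_print, get_ops  # assumed import path

x = tf.constant([1.0, 2.0, 3.0])
y = tf.reduce_sum(x)
debug_print('x =', x, 'sum =', y)  # registers a tf.print op in the collection

with tf.Session() as sess:
    # run the registered print ops alongside the real fetches
    sess.run([y] + get_ops())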
-------------------------------------------------------------------------------- /utils/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False, is_print=True): 26 | if is_print: 27 | print(msg) 28 | if _file is not None: 29 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 30 | if slack and _slack_url is not None: 31 | Thread(target=_send_slack, args=(msg,)).start() 32 | 33 | 34 | def _close_logfile(): 35 | global _file 36 | if _file is not None: 37 | _file.close() 38 | _file = None 39 | 40 | 41 | def _send_slack(msg): 42 | req = Request(_slack_url) 43 | req.add_header('Content-Type', 'application/json') 44 | urlopen(req, json.dumps({ 45 | 'username': 'tacotron', 46 | 'icon_emoji': ':taco:', 47 | 'text': '*%s*: %s' % (_run_name, msg) 48 | }).encode()) 49 | 50 | 51 | atexit.register(_close_logfile) 52 | -------------------------------------------------------------------------------- /utils/ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def shape_list(x): 5 | """Return list of dims, statically where possible.""" 6 | x = tf.convert_to_tensor(x) 7 | 8 | # If unknown rank, return dynamic shape 9 | if x.get_shape().dims is None: 10 | return tf.shape(x) 11 | 12 | static = x.get_shape().as_list() 13 | shape = tf.shape(x) 14 | 15 | ret = [] 16 | for i in range(len(static)): 17 | dim = static[i] 18 | if dim is None: 19 | dim = shape[i] 20 | ret.append(dim) 21 | return ret 22 |
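Example use of shape_list: it mixes static Python ints with dynamic Tensors, which keeps shape-dependent ops readable (a sketch; the placeholder shape is illustrative):

import tensorflow as tf
from utils.ops import shape_list  # assumed import path

x = tf.placeholder(tf.float32, [None, 80])
dims = shape_list(x)                 # [<dynamic batch Tensor>, 80]
y = tf.reshape(x, [dims[0], 40, 2])  # static dims stay plain ints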
-------------------------------------------------------------------------------- /utils/parameter.py: -------------------------------------------------------------------------------- 1 | import six 2 | import json 3 | 4 | 5 | # hyper parameter util class 6 | class HParams: 7 | def __init__(self, **kwargs): 8 | """A simple alternative implementation for tf.contrib.training.HParams 9 | 10 | # Arguments 11 | kwargs: all keyword parameters, which will be added as instance attributes 12 | and used as hyper parameters 13 | """ 14 | for k, v in six.iteritems(kwargs): 15 | self.add_hparam(k, v) 16 | 17 | def add_hparam(self, name, value): 18 | """add a new hyperparameter given a name and value 19 | 20 | if name is an existing hyperparameter, its value is 21 | updated to the new value 22 | 23 | # Arguments 24 | name: str name of the new hyperparameter to be added 25 | value: the value of the new hyperparameter 26 | """ 27 | setattr(self, name, value) 28 | 29 | def del_hparam(self, name): 30 | """delete the hyperparameter named name 31 | 32 | # Arguments 33 | name: str name of the hyperparameter to be deleted 34 | """ 35 | delattr(self, name) 36 | 37 | def update(self, D, **kwargs): 38 | """update or add hyper parameters 39 | 40 | # Arguments 41 | D: an object that has a keys() method, or that can be iterated 42 | as for k, v in D 43 | kwargs: extra keyword arguments for updating hyper parameters 44 | """ 45 | self.__dict__.update(D, **kwargs) 46 | 47 | def parse(self, values): 48 | """parse a str delimited by ';' and update the attributes from it 49 | 50 | Note: we use ';' as the delimiter, not ',' as in tf.contrib.training.HParams, 51 | because ',' would conflict with the ',' delimiter inside lists and dicts 52 | 53 | # Arguments 54 | values: a str containing hyper parameters, separated by ';' 55 | and paired with '=', e.g., 'epochs=20;learning_rate=0.001' 56 | """ 57 | pairs = values.split(";") 58 | pairs = [x.strip().split("=") for x in pairs if x.strip() and '=' in x] 59 | dict_pairs = dict(pairs) 60 | for k in dict_pairs: 61 | if k not in self.__dict__: 62 | raise KeyError('cannot parse a non-existing hyperparameter:"{}"'.format(k)) 63 | # self.__dict__[k] = type(self.__dict__[k])(dict_pairs[k]) # cannot parse dict or list elements this way 64 | try: 65 | v = json.loads(dict_pairs[k]) # note: if a value is a dict, its keys must be strings (a JSON requirement) 66 | except json.JSONDecodeError: 67 | v = json.loads('"' + dict_pairs[k] + '"') # parsing a bare string like hello fails; it must be quoted as "hello" 68 | self.__dict__[k] = v 69 | return self 70 | 71 | def print(self): 72 | """this func prints all hyper parameters""" 73 | print('\n\n') 74 | print('--------------------------------------------------') 75 | print('All Hyper Parameters:') 76 | print('--------------------------------------------------') 77 | hps = self.__dict__ 78 | for hp in hps: 79 | print(' {}={}'.format(hp, hps[hp])) 80 | print('--------------------------------------------------') 81 | print('\n\n') 82 | 83 | def to_string(self): 84 | hp = '\n' 85 | hp += '--------------------------------------------------\n' 86 | hp += 'All Hyper Parameters:\n' 87 | hp += '--------------------------------------------------\n' 88 | hps = self.__dict__ 89 | for k in hps: 90 | hp += ' {}={}\n'.format(k, hps[k]) 91 | hp += '--------------------------------------------------\n' 92 | hp += '\n' 93 | return hp 94 |
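Usage sketch for HParams.parse (note the ';' delimiter; values are decoded as JSON where possible):

from utils.parameter import HParams  # assumed import path

hp = HParams(epochs=20, learning_rate=0.001, dims=[256, 256])
hp.parse('epochs=50; learning_rate=0.0005; dims=[128, 128]')
print(hp.epochs, hp.learning_rate, hp.dims)  # 50 0.0005 [128, 128]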
-------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def plot_alignment(alignment, path, info=None, title=None, text=None): 8 | """ 9 | # Arguments 10 | text: the text str where each char is used to draw yticks 11 | """ 12 | if text is None: 13 | figsize = None 14 | yticks = None 15 | ytick_labels = None 16 | else: 17 | yticks = np.arange(len(text)) 18 | ytick_labels = list(text) 19 | ytick_labels[-1] = len(text) 20 | figsize = (0.02 * alignment.shape[1], 0.10 * len(text)) 21 | 22 | fig, ax = plt.subplots(figsize=figsize) 23 | im = ax.imshow(alignment, 24 | aspect='auto', 25 | origin='lower', 26 | interpolation='none') 27 | fig.colorbar(im, ax=ax) 28 | xlabel = 'Decoder timestep' 29 | if info is not None: 30 | xlabel += '\n\n' + info 31 | plt.xlabel(xlabel) 32 | plt.ylabel('Encoder timestep') 33 | plt.yticks(yticks, ytick_labels) 34 | plt.title(title) 35 | plt.tight_layout() 36 | plt.savefig(path, format='png') 37 | plt.close('all') 38 | 39 | 40 | def plot_mel(mel, path, info=None, title=None, gt_mel=None): 41 | nrows = 1 if gt_mel is None else 2 42 | fig, ax = plt.subplots(nrows, squeeze=False) 43 | 44 | def plot(mel, ax, y_label='pred_freq'): 45 | im = ax.imshow(mel, 46 | aspect='auto', 47 | origin='lower', 48 | interpolation='none') 49 | ax.set_ylabel(y_label) 50 | fig.colorbar(im, ax=ax) 51 | 52 | plot(mel.T, ax[0][0]) # mel shape [time_step, num_mels] 53 | ax[0][0].set_title(title) 54 | 55 | if gt_mel is not None: 56 | plot(gt_mel.T, ax[1][0], y_label='truth_freq') 57 | 58 | plt.xlabel(info or 'time step') 59 | plt.tight_layout() 60 | plt.savefig(path, format='png') 61 | plt.close('all') 62 | -------------------------------------------------------------------------------- /utils/tool_wrappers.py: -------------------------------------------------------------------------------- 1 | from tensorflow import keras 2 | 3 | 4 | def get_loss(loss): 5 | if type(loss) == str or type(loss) == dict: 6 | loss_name, kwargs = loss, {} 7 | if type(loss) == dict: 8 | loss_name = loss['loss_name'] 9 | kwargs = {k: v for k, v in loss.items() if k != 'loss_name'} 10 | kwargs.setdefault('name', loss_name) 11 | 12 | if loss_name == 'scce': 13 | kwargs.setdefault('from_logits', True) 14 | return keras.losses.SparseCategoricalCrossentropy(**kwargs) 15 | elif loss_name == 'cce': 16 | kwargs.setdefault('from_logits', True) 17 | return keras.losses.CategoricalCrossentropy(**kwargs) 18 | elif loss_name == 'bce': 19 | kwargs.setdefault('from_logits', True) 20 | return keras.losses.BinaryCrossentropy(**kwargs) 21 | elif loss_name == 'mae': 22 | return keras.losses.MeanAbsoluteError(**kwargs) 23 | elif loss_name == 'mse': 24 | return keras.losses.MeanSquaredError(**kwargs) 25 | elif loss_name == 'focal': 26 | raise NotImplementedError('focal loss is not implemented yet') 27 | else: 28 | raise ValueError('{} is an unsupported loss for now'.format(loss)) 29 | if callable(loss): 30 | return loss 31 | raise TypeError('type of loss must be str, dict or callable, but {} is found'.format(type(loss))) 32 | 33 | 34 | def get_metric(metric): 35 | if type(metric) == str or type(metric) == dict: 36 | metric_name, kwargs = metric, {} 37 | if type(metric) == dict: 38 | metric_name = metric['metric_name'] 39 | kwargs = {k: v for k, v in metric.items() if k != 'metric_name'} 40 | metric_name = metric_name.lower() 41 | kwargs.setdefault('name', metric_name) 42 | 43 | if metric_name == 'sca': 44 | return keras.metrics.SparseCategoricalAccuracy(**kwargs) 45 | elif metric_name == 'ca': 46 | return keras.metrics.CategoricalAccuracy(**kwargs) 47 | elif metric_name == 'ba': 48 | return keras.metrics.BinaryAccuracy(**kwargs) 49 | elif metric_name == 'recall': 50 | return keras.metrics.Recall(**kwargs) 51 | elif metric_name == 'precision': 52 | return keras.metrics.Precision(**kwargs) 53 | elif metric_name == 'mae': 54 | return keras.metrics.MeanAbsoluteError(**kwargs) 55 | elif metric_name == 'mse': 56 | return keras.metrics.MeanSquaredError(**kwargs) 57 | else: 58 | raise ValueError('{} is an unsupported metric for now'.format(metric)) 59 | if callable(metric): 60 | return metric 61 | raise TypeError('type of metric must be str, dict or callable, but {} is found'.format(type(metric))) 62 | 63 | 64 | def get_optimizer(optimizer, lr_schedule=0.001, **kwargs): 65 | if type(optimizer) == str: 66 | optimizer = optimizer.lower() 67 | if optimizer == 'adam': 68 | return keras.optimizers.Adam(learning_rate=lr_schedule, **kwargs) 69 | elif optimizer == 'sgd': 70 | return keras.optimizers.SGD(learning_rate=lr_schedule, **kwargs) 71 | elif optimizer == 'rmsprop': 72 | return keras.optimizers.RMSprop(learning_rate=lr_schedule, **kwargs) 73 | else: 74 | raise ValueError('{} is an unsupported optimizer for now'.format(optimizer)) 75 | if isinstance(optimizer, keras.optimizers.Optimizer): 76 | return optimizer 77 | raise TypeError('type of optimizer must be str or a keras.optimizers.Optimizer, but {} is found'.format(type(optimizer))) 78 | 79 | 80 | def get_regularizer(reg): 81 | if reg is None: 82 | return None 83 | if type(reg) == str: 84 | return keras.regularizers.get(reg.lower()) 85 | if type(reg) == dict: 86 | if 'l1' in reg and 'l2' in reg: 87 | return keras.regularizers.l1_l2(l1=reg['l1'], l2=reg['l2']) 88 | elif 'l1' in reg: 89 | return keras.regularizers.l1(reg['l1']) 90 | elif 'l2' in reg: 91 | return keras.regularizers.l2(reg['l2']) 92 | else: 93 | raise ValueError('the dict keys for regularizer must be "l1", "l2", or both of them') 94 | if callable(reg): 95 | return reg 96 | raise TypeError(f'type of regularizer must be None, str, dict or callable, but {type(reg)} is found') 97 |
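A short usage sketch for these wrappers in a Keras compile call (the model here is a stand-in):

from tensorflow import keras
from utils.tool_wrappers import get_loss, get_metric, get_optimizer  # assumed path

model = keras.Sequential([keras.layers.Dense(4)])
model.compile(optimizer=get_optimizer('adam', lr_schedule=0.001),
              loss=get_loss({'loss_name': 'cce', 'from_logits': True}),
              metrics=[get_metric('ca')])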
return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /utils/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import tensorflow as tf 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | 7 | 8 | _mel_basis = None 9 | 10 | 11 | def load_wav(hp, path): 12 | wav, sr = librosa.core.load(path, sr=hp.sample_rate) 13 | wav = wav / np.abs(wav).max() * 0.999 14 | return wav, sr 15 | 16 | 17 | def save_wav(hp, wav, path): 18 | wav /= max(0.01, np.max(np.abs(wav))) 19 | wavfile.write(path, hp.sample_rate, (wav * 32766).astype(np.int16)) 20 | 21 | 22 | def preemphasis(hp, x): 23 | return signal.lfilter([1, -hp.preemphasis], [1], x) 24 | 25 | 26 | def inv_preemphasis(hp, x): 27 | return signal.lfilter([1], [1, -hp.preemphasis], x) 28 | 29 | 30 | def spectrogram(hp, y): 31 | if hp.preemphasis is None: 32 | D = _stft(hp, y) 33 | else: 34 | D = _stft(hp, preemphasis(hp, y)) 35 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 36 | return _normalize(hp, S) 37 | 38 | 39 | def inv_spectrogram(hp, spectrogram): 40 | '''Converts spectrogram to waveform using librosa''' 41 | S = _db_to_amp(_denormalize(hp, spectrogram) + hp.ref_level_db) # Convert back to linear 42 | return inv_preemphasis(hp, _griffin_lim(hp, S ** hp.griffin_lim_power)) # Reconstruct phase 43 | 44 | 45 | def inv_spectrogram_tensorflow(hp, spectrogram): 46 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 47 | 48 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 49 | inv_preemphasis on the output after running the graph. 
50 | ''' 51 | with tf.name_scope('griffin_lim'): 52 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(hp, spectrogram) + hp.ref_level_db) 53 | return _griffin_lim_tensorflow(hp, tf.pow(S, hp.griffin_lim_power)) 54 | 55 | 56 | def melspectrogram(hp, y): 57 | if hp.preemphasis is None: 58 | D = _stft(hp, y) 59 | else: 60 | D = _stft(hp, preemphasis(hp, y)) 61 | S = _amp_to_db(_linear_to_mel(hp, np.abs(D))) - hp.ref_level_db 62 | return _normalize(hp, S) 63 | 64 | 65 | def mfcc(hp, y): 66 | pass 67 | 68 | 69 | def find_endpoint(hp, wav, threshold_db=-40, min_silence_sec=0.8): 70 | window_length = int(hp.sample_rate * min_silence_sec) 71 | hop_length = int(window_length / 4) 72 | threshold = _db_to_amp(threshold_db) 73 | for x in range(hop_length, len(wav) - window_length, hop_length): 74 | if np.max(wav[x:x + window_length]) < threshold: 75 | return x + hop_length 76 | return len(wav) 77 | 78 | 79 | def _griffin_lim(hp, S): 80 | '''librosa implementation of Griffin-Lim 81 | Based on https://github.com/librosa/librosa/issues/434 82 | ''' 83 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 84 | S_complex = np.abs(S).astype(np.complex) 85 | y = _istft(hp, S_complex * angles) 86 | for i in range(hp.griffin_lim_iters): 87 | angles = np.exp(1j * np.angle(_stft(hp, y))) 88 | y = _istft(hp, S_complex * angles) 89 | return y # reconstructed wav 90 | 91 | 92 | def _griffin_lim_tensorflow(hp, S): 93 | '''TensorFlow implementation of Griffin-Lim 94 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 95 | ''' 96 | with tf.variable_scope('griffinlim'): 97 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 98 | S = tf.expand_dims(S, 0) 99 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 100 | y = _istft_tensorflow(hp, S_complex) 101 | for i in range(hp.griffin_lim_iters): 102 | est = _stft_tensorflow(hp, y) 103 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 104 | y = _istft_tensorflow(hp, S_complex * angles) 105 | return tf.squeeze(y, 0) 106 | 107 | 108 | def _stft(hp, y): 109 | n_fft, hop_length, win_length = _stft_parameters(hp) 110 | # shape (1 + n_fft/2, n_frames) 111 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 112 | 113 | 114 | def _istft(hp, y): 115 | _, hop_length, win_length = _stft_parameters(hp) 116 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 117 | 118 | 119 | def _stft_tensorflow(hp, signals): 120 | n_fft, hop_length, win_length = _stft_parameters(hp) 121 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 122 | 123 | 124 | def _istft_tensorflow(hp, stfts): 125 | n_fft, hop_length, win_length = _stft_parameters(hp) 126 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 127 | 128 | 129 | def _stft_parameters(hp): 130 | n_fft = hp.n_fft 131 | hop_length = int(hp.hop_ms / 1000 * hp.sample_rate) 132 | win_length = int(hp.win_ms / 1000 * hp.sample_rate) 133 | return n_fft, hop_length, win_length 134 | 135 | 136 | def _linear_to_mel(hp, spectrogram): 137 | global _mel_basis 138 | if _mel_basis is None: 139 | _mel_basis = _build_mel_basis(hp) 140 | return np.dot(_mel_basis, spectrogram) 141 | 142 | 143 | def _build_mel_basis(hp): 144 | n_fft = hp.n_fft 145 | return librosa.filters.mel(hp.sample_rate, n_fft, n_mels=hp.num_mels) 146 | 147 | 148 | def _amp_to_db(x): 149 | # return 20 * np.log10(np.maximum(1e-5, x)) 150 | return 20 * 
np.log10(np.maximum(1e-4, x)) # 最小为-80dB, 因为还有减去ref_dB 151 | 152 | 153 | def _db_to_amp(x): 154 | return np.power(10.0, x * 0.05) 155 | 156 | 157 | def _db_to_amp_tensorflow(x): 158 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 159 | 160 | 161 | def _normalize(hp, S): 162 | # 这个做法存疑, 因为S>0时, 都会被截断成0, 即如果S>ref_db, 都会 163 | # 变成ref_db 164 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 165 | 166 | 167 | def _denormalize(hp, S): 168 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 169 | 170 | 171 | def _denormalize_tensorflow(hp, S): 172 | return (tf.clip_by_value(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 173 | -------------------------------------------------------------------------------- /utils/utils/ce_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def ce_loss(soft_labels, logits): 5 | probs = tf.clip_by_value(tf.nn.softmax(logits, axis=-1), 1e-10, 10) 6 | ce = -tf.reduce_mean(tf.reduce_sum(soft_labels * tf.log(probs), axis=-1)) 7 | return ce 8 | -------------------------------------------------------------------------------- /utils/utils/center_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def calc_center_loss(features, centers, emo_labels, is_l1=True): 5 | """ 6 | Args: 7 | features: [batch_size, dim] 8 | centers: [emo_num, dim] 9 | emo_labels: [batch_size, emo_num] 10 | Returns: 11 | 12 | """ 13 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 14 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim[ 15 | diff = features_ - centers_ # [batch_size, emo_num, dim] 16 | dist = tf.reduce_sum(tf.square(diff), axis=-1) # [batch_size, emo_num] 17 | if is_l1: 18 | dist = tf.sqrt(dist) 19 | loss = tf.reduce_mean(tf.reduce_sum(dist * emo_labels, axis=-1)) 20 | return loss 21 | 22 | 23 | def update_center(features, centers, emo_labels, alpha): 24 | """ 25 | Args: 26 | features: [batch_size, dim] 27 | centers: [emo_num, dim] 28 | emo_labels: [batch_size, emo_num] 29 | Returns: 30 | """ 31 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 32 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim] 33 | emo_labels_ = tf.expand_dims(emo_labels, -1) # [batch_size, emo_num, 1] 34 | diff = features_ - centers_ # [batch_size, emo_num, dim] 35 | weighted_emo_diff = diff * emo_labels_ # [batch_size, emo_num, dim] 36 | sum_emo_diff = tf.reduce_sum(weighted_emo_diff, axis=0) # [emo_num, dim] 37 | emo_sum = tf.clip_by_value(tf.reduce_sum(emo_labels_, axis=0), 0.001, 100) # [emo_num, 1] 38 | alpha = tf.clip_by_value(alpha, 0., 1.) 
39 | c_diff = tf.math.divide(sum_emo_diff, emo_sum) # [emo_num, dim] 40 | alpha_c_diff = alpha * c_diff 41 | update_center_op = tf.assign_add(centers, alpha_c_diff) 42 | return update_center_op 43 | 44 | 45 | def test_calc_center_loss(): 46 | with tf.Session() as sess: 47 | centers = tf.Variable([[1, -1], [0, 1]], trainable=False, dtype=tf.float32) 48 | features = tf.constant([[-1, 0], [-1, -1], [2, 0]], dtype=tf.float32) 49 | emo_labels = tf.constant([[0, 1.0], [0, 1.0], [0, 1.0]], dtype=tf.float32) 50 | sess.run(tf.global_variables_initializer()) 51 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 52 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 53 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 54 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 55 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 56 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 57 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 58 | print(sess.run(centers)) 59 | # loss = sess.run(calc_center_loss(features, centers, emo_labels, is_l1=True)) 60 | # print(loss) 61 | 62 | 63 | if __name__ == '__main__': 64 | test_calc_center_loss() 65 | -------------------------------------------------------------------------------- /utils/utils/data.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tqdm import tqdm 4 | from functools import partial 5 | from concurrent.futures import ProcessPoolExecutor 6 | 7 | 8 | def bytes_feature(value): 9 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 10 | 11 | 12 | def float_feature(value): 13 | return tf.train.Feature(float_list=tf.train.FloatList(value=[value])) 14 | 15 | 16 | def int64_feature(value): 17 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 18 | 19 | 20 | def data_process_pipeline(hp_or_obj, meta_file, process_one_line_fun=None, 21 | postprocess_fun=None, max_workers=None, **kwargs): 22 | """This func reads meta file and runs three custom funcs to get the final data 23 | 24 | This func performs the following data preprocessing pipeline: 25 | 01 read the meta file and run the meta_fun func to parse each line in meta file, the meta_fun 26 | must return a tuple of two elements: data sample(e.g., a wav or image) and labels dict 27 | 02 parse the geted data sample to the feature_fun func to get a feature data 28 | 03 the list of feature samples is passed to the postprocess_fun to do some post preproceesing 29 | e.g., normalizations, fixed length padding 30 | 31 | # Arguments 32 | hp: the hyper parameter object with type 'Hparams' or a other type object where all hyper parameters 33 | can be accessed as it's attributes 34 | meta_file: the meta file where each line generally contains the path of data sample and its labels 35 | meta_fun: this func takes a line in meta_file and the hp object as inputs and returns a pair of 36 | tuple of data samples(often a list arrays) and its labels dict(a list of dicts). The signature 37 | of meta_fun is: def fun_name(hp, line), and its return values are:([data samples], [labels dicts]) 38 | or (None, None) if no sample returned. Note, even there is only one sample returned, it also must 39 | be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}. 
-------------------------------------------------------------------------------- /utils/utils/data.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | from tqdm import tqdm
4 | from functools import partial
5 | from concurrent.futures import ProcessPoolExecutor
6 | 
7 | 
8 | def bytes_feature(value):
9 |     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
10 | 
11 | 
12 | def float_feature(value):
13 |     return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
14 | 
15 | 
16 | def int64_feature(value):
17 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
18 | 
19 | 
20 | def data_process_pipeline(hp_or_obj, meta_file, process_one_line_fun=None,
21 |                           postprocess_fun=None, max_workers=None, **kwargs):
22 |     """Reads the meta file and runs two custom funcs to get the final data.
23 | 
24 |     This func performs the following data preprocessing pipeline:
25 |     01 read the meta file and run process_one_line_fun on each line; it must return a tuple
26 |        of two elements: data samples (e.g., wavs or images) and labels dicts
27 |     02 feature extraction for each sample also happens inside process_one_line_fun
28 |     03 the list of feature samples is passed to postprocess_fun for post-processing,
29 |        e.g., normalization and fixed-length padding
30 | 
31 |     # Arguments
32 |         hp_or_obj: the hyperparameter object of type 'HParams', or any other object whose
33 |            attributes hold the hyperparameters; it may also provide process_one_line and
34 |            postprocess methods, which then take precedence over the function arguments below
35 |         meta_file: the meta file where each line generally contains the path of a data sample and its labels
36 |         process_one_line_fun: takes the hp object and a line of meta_file as inputs and returns a tuple of
37 |            data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature
38 |            is: def fun_name(hp, line), and the return value is ([data samples], [labels dicts]),
39 |            or (None, None) if no sample is returned. Note: even if only one sample is returned, it must
40 |            still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}.
41 |            (None, None) is returned when a sample is filtered out, e.g., because its length
42 |            does not meet the requirements
43 |         postprocess_fun: takes the list of all feature samples, the labels and hp as inputs, and returns
44 |            postprocessed features and labels of the same length. Generally, feature normalization,
45 |            fixed-length padding and sorting the samples by length are performed here. The signature
46 |            is: def postprocess_fun(hp, features, labels)
47 |         kwargs: extra keyword arguments passed through to both funcs.
48 | 
49 |     # Returns
50 |         A tuple of length 2: the first element is the list of all postprocessed features, the second
51 |         is the list of all labels. For example, the return value can be:
52 |         ([sample1, .., samplen], [[label1_1, label1_2], [label2_1, label2_2], .., [labeln_1, labeln_2]])
53 | 
54 |     # Exceptions
55 |         TypeError: if either element of process_one_line_fun's return value is not a list
56 |     """
57 |     if process_one_line_fun is None and not hasattr(hp_or_obj, 'process_one_line'):
58 |         raise ValueError('hp_or_obj without process_one_line method and process_one_line_fun is None')
59 |     if postprocess_fun is None and not hasattr(hp_or_obj, 'postprocess'):
60 |         raise ValueError('hp_or_obj without postprocess method and postprocess_fun is None')
61 | 
62 |     with open(meta_file) as fr:
63 |         lines = [line for line in fr if line.strip() and line[0] != '#']
64 | 
65 |     if hasattr(hp_or_obj, 'process_one_line'):
66 |         process_one_line_fun = type(hp_or_obj).process_one_line
67 |     if hasattr(hp_or_obj, 'postprocess'):
68 |         postprocess_fun = type(hp_or_obj).postprocess
69 | 
70 |     # parse the meta lines to get samples and labels
71 |     print(' step 1: parsing meta and getting features ...')
72 |     num = len(lines)
73 |     hps = [hp_or_obj] * num
74 |     features, labels = [], []
75 |     with ProcessPoolExecutor(max_workers) as p:
76 |         for r in tqdm(p.map(partial(process_one_line_fun, **kwargs), hps, lines), total=num):
77 |             ds, ls = r
78 |             if (ds, ls) != (None, None):
79 |                 if type(ds) != list or type(ls) != list:
80 |                     raise TypeError('process_one_line_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
81 |                 features += ds
82 |                 labels += ls
83 |     '''
84 |     # serial fallback, handy for debugging:
85 |     for line in tqdm(lines):
86 |         ds, ls = process_one_line_fun(hp_or_obj, line)
87 |         if (ds, ls) != (None, None):
88 |             if type(ds) != list or type(ls) != list:
89 |                 raise TypeError('process_one_line_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
90 |             features += ds
91 |             labels += ls
92 |     '''
93 | 
94 |     # np.save('recola_wav01_mel_nonorm_before_post.npy', features[0])
95 |     # print('DEBUG before post', features[0].shape, labels[0])
96 |     # post-processing
97 |     print(' step 2: postprocessing for features and labels ...')
98 |     features, labels = postprocess_fun(hp_or_obj, features, labels, **kwargs)
99 |     return features, labels
100 | 
101 | 
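# Usage sketch: a minimal object protocol accepted by data_process_pipeline above.
# The class name, meta format and fields are hypothetical, not from this repo.
#
#     import numpy as np
#
#     class MelPipeline:
#         num_mels = 80                                  # hyperparameters live as attributes
#
#         def process_one_line(self, line):
#             mel_path, text = line.strip().split('|')[:2]
#             return [np.load(mel_path)], [{'text_len': len(text)}]
#
#         def postprocess(self, features, labels):
#             return features, labels                    # e.g. normalize / pad here
#
#     features, labels = data_process_pipeline(MelPipeline(), 'meta.txt')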
102 | def get_class_weights(class_nums, type=0, power=1):
103 |     if type == 0:
104 |         return [1.] * len(class_nums)
105 |     # recompute class_nums and total according to power
106 |     total, class_ws = 0, class_nums.copy()
107 |     for cls in range(len(class_ws)):
108 |         class_ws[cls] = class_ws[cls] ** power
109 |         total += class_ws[cls]
110 |     # take the reciprocal of each weight, then divide by the mean of all weights
111 |     # (so they average to 1), following the older codebase's approach
112 |     if type == 1:
113 |         wsum = 0
114 |         for cls in range(len(class_ws)):
115 |             class_ws[cls] = 1 / class_ws[cls]
116 |             wsum += class_ws[cls]
117 |         wmean = wsum / len(class_nums)
118 |         class_ws = [w / wmean for w in class_ws]
119 |     # reciprocal times total/2, see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
120 |     elif type == 2:
121 |         class_ws = [0.5 * total / w for w in class_ws]
122 |     return class_ws
123 | 
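A worked example of the three weighting modes, with get_class_weights in scope (power=1; values computed by hand):

    get_class_weights([100, 300], type=0)  # -> [1.0, 1.0]
    get_class_weights([100, 300], type=1)  # -> [1.5, 0.5]   (reciprocals 0.01, 0.0033 divided by their mean)
    get_class_weights([100, 300], type=2)  # -> [2.0, 0.667] (0.5 * 400 / n_c)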
-------------------------------------------------------------------------------- /utils/utils/data.py.bak-0707: --------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | from functools import partial
3 | from concurrent.futures import ProcessPoolExecutor
4 | 
5 | 
6 | def data_process_pipeline(hp, meta_file, meta_fun, feature_fun, postprocess_fun,
7 |                           max_workers=None, **kwargs):
8 |     """Reads the meta file and runs three custom funcs to get the final data.
9 | 
10 |     This func performs the following data preprocessing pipeline:
11 |     01 read the meta file and run meta_fun on each line; meta_fun must return a tuple
12 |        of two elements: data samples (e.g., wavs or images) and labels dicts
13 |     02 pass each data sample to feature_fun to get a feature
14 |     03 the list of feature samples is passed to postprocess_fun for post-processing,
15 |        e.g., normalization and fixed-length padding
16 | 
17 |     # Arguments
18 |         hp: the hyperparameter object of type 'HParams', or any other object whose attributes
19 |            hold the hyperparameters
20 |         meta_file: the meta file where each line generally contains the path of a data sample and its labels
21 |         meta_fun: takes the hp object and a line of meta_file as inputs and returns a tuple of
22 |            data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature
23 |            is: def fun_name(hp, line), and the return value is ([data samples], [labels dicts]),
24 |            or (None, None) if no sample is returned. Note: even if only one sample is returned, it must
25 |            still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}.
26 |            (None, None) is returned when a sample is filtered out, e.g., because its length
27 |            does not meet the requirements
28 |         feature_fun: takes the hp object and a single data sample returned by meta_fun as inputs,
29 |            and returns a single feature sample. The signature is: def feature_fun(hp, sample)
30 |         postprocess_fun: takes the list of all feature samples and hp as inputs and returns the
31 |            list of postprocessed feature samples with the same length. Generally, feature normalization,
32 |            fixed-length padding and sorting the samples by length are performed here. The signature
33 |            is: def postprocess_fun(hp, features)
34 |         kwargs: extra keyword arguments passed through to all three funcs.
35 | 
36 |     # Returns
37 |         A tuple of length 2: the first element is the list of all postprocessed features, the second
38 |         is the list of all labels (each label is a list converted from the labels dict). For example,
39 |         the return value can be: ([sample1, .., samplen], [[label1_1, label1_2], .., [labeln_1, labeln_2]])
40 | 
41 |     # Exceptions
42 |         TypeError: if either element of meta_fun's return value is not a list
43 |     """
44 |     with open(meta_file) as fr:
45 |         lines = fr.readlines()
46 | 
47 |     # parse the meta lines to get samples and labels
48 |     print('\n[Begin processing data ...]')
49 |     print('step 1: parsing meta and loading original data samples ...')
50 |     datas, labels = [], []
51 |     if max_workers == -1:
52 |         for line in tqdm(lines):
53 |             ds, ls, info = meta_fun(hp, line, **kwargs)
54 |             if (ds, ls) != (None, None):
55 |                 if type(ds) != list or type(ls) != list:
56 |                     raise TypeError('meta_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
57 |                 datas += ds
58 |                 labels += ls
59 |     else:
60 |         num = len(lines)
61 |         hps = [hp] * num
62 |         with ProcessPoolExecutor(max_workers) as p:
63 |             for r in tqdm(p.map(partial(meta_fun, **kwargs), hps, lines), total=num):
64 |                 ds, ls, info = r
65 |                 if (ds, ls) != (None, None):
66 |                     if type(ds) != list or type(ls) != list:
67 |                         raise TypeError('meta_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
68 |                     datas += ds
69 |                     labels += ls
70 |         hp.sr = info['sr']
71 | 
72 |     # process the samples to get features
73 |     print('step 2: getting features from original data samples ...')
74 |     if max_workers == -1:
75 |         datas = [feature_fun(hp, x, **kwargs) for x in tqdm(datas)]
76 |     else:
77 |         num = len(datas)
78 |         hps = [hp] * num
79 |         with ProcessPoolExecutor(max_workers) as p:
80 |             datas = [r for r in tqdm(p.map(partial(feature_fun, **kwargs), hps, datas), total=num)]
81 | 
82 |     # post-processing
83 |     print('step 3: postprocessing for features ...')
84 |     datas = postprocess_fun(hp, datas, **kwargs)
85 | 
86 |     for i in range(len(labels)):
87 |         labels[i] = list(labels[i].values())  # a list is better here; tuples are immutable
88 |     return datas, labels
89 | 
-------------------------------------------------------------------------------- /utils/utils/debug.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | def debug_print(*args, **kwargs):
5 |     print_op = tf.print(*args, **kwargs)
6 |     tf.add_to_collection('print_ops', print_op)
7 | 
8 | 
9 | def get_ops():
10 |     return tf.get_collection('print_ops')
11 | 
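Usage sketch for the print-op collection (illustrative; mel_loss, train_op and sess are assumed to already exist in the caller's TF1 graph and session):

    debug_print('step mel loss:', mel_loss)   # registers a tf.print op
    sess.run([train_op] + get_ops())          # prints fire together with the train step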
-------------------------------------------------------------------------------- /utils/utils/index.html: --------------------------------------------------------------------------------
[stray HTML page: an HTTP directory listing titled "Directory listing for /home/tiger/v2_tacotron2/utils/utils/"; its markup and per-file links were lost in extraction]
-------------------------------------------------------------------------------- /utils/utils/infolog.py: --------------------------------------------------------------------------------
1 | import atexit
2 | from datetime import datetime
3 | import json
4 | from threading import Thread
5 | from urllib.request import Request, urlopen
6 | 
7 | 
8 | _format = '%Y-%m-%d %H:%M:%S.%f'
9 | _file = None
10 | _run_name = None
11 | _slack_url = None
12 | 
13 | 
14 | def init(filename, run_name, slack_url=None):
15 |     global _file, _run_name, _slack_url
16 |     _close_logfile()
17 |     _file = open(filename, 'a')
18 |     _file.write('\n-----------------------------------------------------------------\n')
19 |     _file.write('Starting new training run\n')
20 |     _file.write('-----------------------------------------------------------------\n')
21 |     _run_name = run_name
22 |     _slack_url = slack_url
23 | 
24 | 
25 | def log(msg, slack=False, is_print=True):
26 |     if is_print:
27 |         print(msg)
28 |     if _file is not None:
29 |         _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg))
30 |     if slack and _slack_url is not None:
31 |         Thread(target=_send_slack, args=(msg,)).start()
32 | 
33 | 
34 | def _close_logfile():
35 |     global _file
36 |     if _file is not None:
37 |         _file.close()
38 |         _file = None
39 | 
40 | 
41 | def _send_slack(msg):
42 |     req = Request(_slack_url)
43 |     req.add_header('Content-Type', 'application/json')
44 |     urlopen(req, json.dumps({
45 |         'username': 'tacotron',
46 |         'icon_emoji': ':taco:',
47 |         'text': '*%s*: %s' % (_run_name, msg)
48 |     }).encode())
49 | 
50 | 
51 | atexit.register(_close_logfile)
52 | 
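A short usage sketch (file path and run name are illustrative):

    from utils.utils import infolog

    infolog.init('logs/train.log', run_name='sygst_run1')
    infolog.log('step 100: mel_loss=0.123')  # printed and appended to the log file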
43 | """ 44 | 45 | if not len(x.get_shape()) == len(y.get_shape()) == 2: 46 | raise ValueError('Both inputs should be matrices.') 47 | 48 | if x.get_shape().as_list()[1] != y.get_shape().as_list()[1]: 49 | raise ValueError('The number of features should be the same.') 50 | 51 | norm = lambda x: tf.reduce_sum(tf.square(x), 1) 52 | 53 | # By making the `inner' dimensions of the two matrices equal to 1 using 54 | # broadcasting then we are essentially substracting every pair of rows 55 | # of x and y. 56 | # x will be num_samples x num_features x 1, 57 | # and y will be 1 x num_features x num_samples (after broadcasting). 58 | # After the substraction we will get a 59 | # num_x_samples x num_features x num_y_samples matrix. 60 | # The resulting dist will be of shape num_y_samples x num_x_samples. 61 | # and thus we need to transpose it again. 62 | return tf.transpose(norm(tf.expand_dims(x, 2) - tf.transpose(y))) 63 | 64 | 65 | def gaussian_kernel_matrix(x, y, sigmas): 66 | r"""Computes a Guassian Radial Basis Kernel between the samples of x and y. 67 | 68 | We create a sum of multiple gaussian kernels each having a width sigma_i. 69 | 70 | Args: 71 | x: a tensor of shape [num_samples, num_features] 72 | y: a tensor of shape [num_samples, num_features] 73 | sigmas: a tensor of floats which denote the widths of each of the 74 | gaussians in the kernel. 75 | Returns: 76 | A tensor of shape [num_samples{x}, num_samples{y}] with the RBF kernel. 77 | """ 78 | beta = 1. / (2. * (tf.expand_dims(sigmas, 1))) 79 | 80 | dist = compute_pairwise_distances(x, y) 81 | 82 | s = tf.matmul(beta, tf.reshape(dist, (1, -1))) 83 | 84 | return tf.reshape(tf.reduce_sum(tf.exp(-s), 0), tf.shape(dist)) 85 | 86 | 87 | def maximum_mean_discrepancy(x, y, kernel=gaussian_kernel_matrix): 88 | r"""Computes the Maximum Mean Discrepancy (MMD) of two samples: x and y. 89 | 90 | Maximum Mean Discrepancy (MMD) is a distance-measure between the samples of 91 | the distributions of x and y. Here we use the kernel two sample estimate 92 | using the empirical mean of the two distributions. 93 | 94 | MMD^2(P, Q) = || \E{\phi(x)} - \E{\phi(y)} ||^2 95 | = \E{ K(x, x) } + \E{ K(y, y) } - 2 \E{ K(x, y) }, 96 | 97 | where K = <\phi(x), \phi(y)>, 98 | is the desired kernel function, in this case a radial basis kernel. 99 | 100 | Args: 101 | x: a tensor of shape [num_samples, num_features] 102 | y: a tensor of shape [num_samples, num_features] 103 | kernel: a function which computes the kernel in MMD. Defaults to the 104 | GaussianKernelMatrix. 105 | 106 | Returns: 107 | a scalar denoting the squared maximum mean discrepancy loss. 108 | """ 109 | with tf.name_scope('MaximumMeanDiscrepancy'): 110 | # \E{ K(x, x) } + \E{ K(y, y) } - 2 \E{ K(x, y) } 111 | cost = tf.reduce_mean(kernel(x, x)) 112 | cost += tf.reduce_mean(kernel(y, y)) 113 | cost -= 2 * tf.reduce_mean(kernel(x, y)) 114 | 115 | # We do not allow the loss to become negative. 116 | cost = tf.where(cost > 0, cost, 0, name='value') 117 | return cost 118 | 119 | 120 | def mmd_loss(source_samples, target_samples, weight): 121 | """Adds a similarity loss term, the MMD between two representations. 122 | 123 | This Maximum Mean Discrepancy (MMD) loss is calculated with a number of 124 | different Gaussian kernels. 125 | 126 | Args: 127 | source_samples: a tensor of shape [num_samples, num_features]. 128 | target_samples: a tensor of shape [num_samples, num_features]. 129 | weight: the weight of the MMD loss. 130 | 131 | Returns: 132 | a scalar tensor representing the MMD loss value. 
133 | """ 134 | sigmas = [ 135 | 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 15, 20, 25, 30, 35, 100, 136 | 1e3, 1e4, 1e5, 1e6 137 | ] 138 | gaussian_kernel = partial( 139 | gaussian_kernel_matrix, sigmas=tf.constant(sigmas)) 140 | 141 | loss_value = maximum_mean_discrepancy( 142 | source_samples, target_samples, kernel=gaussian_kernel) 143 | loss_value = tf.maximum(1e-4, loss_value) * weight 144 | # assert_op = tf.Assert(tf.is_finite(loss_value), [loss_value]) 145 | # with tf.control_dependencies([assert_op]): 146 | # tag = 'MMD Loss' 147 | # if scope: 148 | # tag = scope + tag 149 | # tf.summary.scalar(tag, loss_value) 150 | # tf.losses.add_loss(loss_value) 151 | 152 | return loss_value 153 | 154 | 155 | def correlation_loss(source_samples, target_samples, weight, scope=None): 156 | """Adds a similarity loss term, the correlation between two representations. 157 | 158 | Args: 159 | source_samples: a tensor of shape [num_samples, num_features] 160 | target_samples: a tensor of shape [num_samples, num_features] 161 | weight: a scalar weight for the loss. 162 | scope: optional name scope for summary tags. 163 | 164 | Returns: 165 | a scalar tensor representing the correlation loss value. 166 | """ 167 | with tf.name_scope('corr_loss'): 168 | source_samples -= tf.reduce_mean(source_samples, 0) 169 | target_samples -= tf.reduce_mean(target_samples, 0) 170 | 171 | source_samples = tf.nn.l2_normalize(source_samples, 1) 172 | target_samples = tf.nn.l2_normalize(target_samples, 1) 173 | 174 | source_cov = tf.matmul(tf.transpose(source_samples), source_samples) 175 | target_cov = tf.matmul(tf.transpose(target_samples), target_samples) 176 | 177 | corr_loss = tf.reduce_mean(tf.square(source_cov - target_cov)) * weight 178 | 179 | assert_op = tf.Assert(tf.is_finite(corr_loss), [corr_loss]) 180 | with tf.control_dependencies([assert_op]): 181 | tag = 'Correlation Loss' 182 | if scope: 183 | tag = scope + tag 184 | tf.summary.scalar(tag, corr_loss) 185 | tf.losses.add_loss(corr_loss) 186 | 187 | return corr_loss 188 | -------------------------------------------------------------------------------- /utils/utils/ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def shape_list(x): 5 | """Return list of dims, statically where possible.""" 6 | x = tf.convert_to_tensor(x) 7 | 8 | # If unknown rank, return dynamic shape 9 | if x.get_shape().dims is None: 10 | return tf.shape(x) 11 | 12 | static = x.get_shape().as_list() 13 | shape = tf.shape(x) 14 | 15 | ret = [] 16 | for i in range(len(static)): 17 | dim = static[i] 18 | if dim is None: 19 | dim = shape[i] 20 | ret.append(dim) 21 | return ret 22 | -------------------------------------------------------------------------------- /utils/utils/parameter.py: -------------------------------------------------------------------------------- 1 | import six 2 | import json 3 | 4 | 5 | # hyper parameter util class 6 | class HParams: 7 | def __init__(self, **kwargs): 8 | """A simple alternative implementation for tf.contrib.training.HParams 9 | 10 | # Arguments 11 | kwargs: all key word parameters which will be added as instance atrributes 12 | used as hyper parameters 13 | """ 14 | for k, v in six.iteritems(kwargs): 15 | self.add_hparam(k, v) 16 | 17 | def add_hparam(self, name, value): 18 | """add a new hyperparameter given a name and value 19 | 20 | if name is an existed hyperparameter, then it's value is 21 | updated as the new value 22 | 23 | # Arguments 24 | name: 
-------------------------------------------------------------------------------- /utils/utils/parameter.py: --------------------------------------------------------------------------------
1 | import six
2 | import json
3 | 
4 | 
5 | # hyper parameter util class
6 | class HParams:
7 |     def __init__(self, **kwargs):
8 |         """A simple alternative implementation for tf.contrib.training.HParams
9 | 
10 |         # Arguments
11 |             kwargs: all keyword parameters; they are added as instance attributes
12 |                 and used as hyperparameters
13 |         """
14 |         for k, v in six.iteritems(kwargs):
15 |             self.add_hparam(k, v)
16 | 
17 |     def add_hparam(self, name, value):
18 |         """add a new hyperparameter given a name and value
19 | 
20 |         if name is an existing hyperparameter, its value is
21 |         updated to the new value
22 | 
23 |         # Arguments
24 |             name: str name of the new hyperparameter to be added
25 |             value: the value of the new hyperparameter
26 |         """
27 |         setattr(self, name, value)
28 | 
29 |     def del_hparam(self, name):
30 |         """delete the hyperparameter named name
31 | 
32 |         # Arguments
33 |             name: str name of the hyperparameter to be deleted
34 |         """
35 |         delattr(self, name)
36 | 
37 |     def update(self, D, **kwargs):
38 |         """update or add hyperparameters
39 | 
40 |         # Arguments
41 |             D: an object with a keys() method, or an iterable of
42 |                 (k, v) pairs
43 |             kwargs: extra keyword arguments for updating hyperparameters
44 |         """
45 |         self.__dict__.update(D, **kwargs)
46 | 
47 |     def parse(self, values):
48 |         """parse a str delimited by ';' and update the pairs into attributes
49 | 
50 |         Note: we use ';' as the delimiter, not ',' as in tf.contrib.training.HParams,
51 |         because ',' would conflict with the ',' inside list and dict values
52 | 
53 |         # Arguments
54 |             values: a str of hyperparameters delimited by ';' and
55 |                 paired with '=', e.g., 'epochs=20;learning_rate=0.001'
56 |         """
57 |         pairs = values.split(";")
58 |         pairs = [x.strip().split("=") for x in pairs if x.strip() and '=' in x]
59 |         dict_pairs = dict(pairs)
60 |         for k in dict_pairs:
61 |             if k not in self.__dict__:
62 |                 raise KeyError('cannot parse a non-existing hyperparameter: "{}"'.format(k))
63 |             # self.__dict__[k] = type(self.__dict__[k])(dict_pairs[k])  # cannot parse dict or list values
64 |             try:
65 |                 v = json.loads(dict_pairs[k])  # note: if the value is a dict, its keys must be strings (JSON requirement)
66 |             except json.JSONDecodeError:
67 |                 v = json.loads('"' + dict_pairs[k] + '"')  # parsing a bare string like hello fails; it must be quoted as "hello"
68 |             self.__dict__[k] = v
69 |         return self
70 | 
71 |     def print(self):
72 |         """this func prints all hyperparameters"""
73 |         print('\n\n')
74 |         print('--------------------------------------------------')
75 |         print('All Hyper Parameters:')
76 |         print('--------------------------------------------------')
77 |         hps = self.__dict__
78 |         for hp in hps:
79 |             print('    {}={}'.format(hp, hps[hp]))
80 |         print('--------------------------------------------------')
81 |         print('\n\n')
82 | 
83 |     def to_string(self):
84 |         hp = '\n'
85 |         hp += '--------------------------------------------------\n'
86 |         hp += 'All Hyper Parameters:\n'
87 |         hp += '--------------------------------------------------\n'
88 |         hps = self.__dict__
89 |         for k in hps:
90 |             hp += '    {}={}\n'.format(k, hps[k])
91 |         hp += '--------------------------------------------------\n'
92 |         hp += '\n'
93 |         return hp
94 | 
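A quick illustration of parse (values go through json.loads, so numbers and lists round-trip; the names are arbitrary):

    hp = HParams(epochs=20, learning_rate=0.001, dims=[256, 128])
    hp.parse('epochs=50;dims=[512,256]')
    assert hp.epochs == 50 and hp.dims == [512, 256]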
-------------------------------------------------------------------------------- /utils/utils/plot.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib
3 | matplotlib.use('Agg')
4 | import matplotlib.pyplot as plt
5 | 
6 | 
7 | def plot_alignment(alignment, path, info=None, title=None, text=None):
8 |     """
9 |     # Arguments
10 |         text: the text str whose chars are used to draw the yticks
11 |     """
12 |     if text is None:
13 |         figsize = None
14 |         yticks = None
15 |         ytick_labels = None
16 |     else:
17 |         yticks = np.arange(len(text))
18 |         ytick_labels = list(text)
19 |         ytick_labels[-1] = len(text)  # show the total text length at the last tick
20 |         figsize = (0.02 * alignment.shape[1], 0.10 * len(text))
21 | 
22 |     fig, ax = plt.subplots(figsize=figsize)
23 |     im = ax.imshow(alignment,
24 |                    aspect='auto',
25 |                    origin='lower',
26 |                    interpolation='none')
27 |     fig.colorbar(im, ax=ax)
28 |     xlabel = 'Decoder timestep'
29 |     if info is not None:
30 |         xlabel += '\n\n' + info
31 |     plt.xlabel(xlabel)
32 |     plt.ylabel('Encoder timestep')
33 |     plt.yticks(yticks, ytick_labels)
34 |     plt.title(title)
35 |     plt.tight_layout()
36 |     plt.savefig(path, format='png')
37 |     plt.close('all')
38 | 
39 | 
40 | def plot_mel(mel, path, info=None, title=None, gt_mel=None):
41 |     nrows = 1 if gt_mel is None else 2
42 |     fig, ax = plt.subplots(nrows, squeeze=False)
43 | 
44 |     def plot(mel, ax):
45 |         im = ax.imshow(mel,
46 |                        aspect='auto',
47 |                        origin='lower',
48 |                        interpolation='none')
49 |         ax.set_ylabel('freq')
50 |         fig.colorbar(im, ax=ax)
51 | 
52 |     plot(mel.T, ax[0][0])  # mel shape [time_step, num_mels]
53 |     ax[0][0].set_title(title)
54 | 
55 |     if gt_mel is not None:
56 |         plot(gt_mel.T, ax[1][0])
57 | 
58 |     plt.xlabel(info or 'time step')
59 |     plt.tight_layout()
60 |     plt.savefig(path, format='png')
61 |     plt.close('all')
62 | 
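For instance (random data and file name are illustrative; the text length must match the alignment's first axis):

    text = 'an example sentence'
    align = np.random.rand(len(text), 200)  # [encoder_steps, decoder_steps]
    plot_alignment(align, 'align.png', info='step 1000', text=text)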
-------------------------------------------------------------------------------- /utils/utils/tool_wrappers.py: --------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | 
3 | 
4 | def get_loss(loss):
5 |     if type(loss) == str or type(loss) == dict:
6 |         loss_name, kwargs = loss, {}
7 |         if type(loss) == dict:
8 |             loss_name = loss.pop('loss_name')
9 |             kwargs = loss  # the remaining entries are passed through as keyword arguments
10 |         kwargs.setdefault('name', loss_name)
11 | 
12 |         if loss_name == 'scce':
13 |             kwargs.setdefault('from_logits', True)
14 |             return keras.losses.SparseCategoricalCrossentropy(**kwargs)
15 |         elif loss_name == 'cce':
16 |             kwargs.setdefault('from_logits', True)
17 |             return keras.losses.CategoricalCrossentropy(**kwargs)
18 |         elif loss_name == 'bce':
19 |             kwargs.setdefault('from_logits', True)
20 |             return keras.losses.BinaryCrossentropy(**kwargs)
21 |         elif loss_name == 'mae':
22 |             return keras.losses.MeanAbsoluteError(**kwargs)
23 |         elif loss_name == 'mse':
24 |             return keras.losses.MeanSquaredError(**kwargs)
25 |         elif loss_name == 'focal':
26 |             raise NotImplementedError('focal loss is not implemented yet')
27 |         else:
28 |             raise ValueError('{} is unsupported loss now'.format(loss))
29 |     if callable(loss):
30 |         return loss
31 |     raise TypeError('type of loss must be str, dict or callable, but {} is found'.format(type(loss)))
32 | 
33 | 
34 | def get_metric(metric):
35 |     if type(metric) == str or type(metric) == dict:
36 |         metric_name, kwargs = metric, {}
37 |         if type(metric) == dict:
38 |             metric_name = metric.pop('metric_name')
39 |             kwargs = metric  # the remaining entries are passed through as keyword arguments
40 |         metric_name = metric_name.lower()
41 |         kwargs.setdefault('name', metric_name)
42 | 
43 |         if metric_name == 'sca':
44 |             return keras.metrics.SparseCategoricalAccuracy(**kwargs)
45 |         elif metric_name == 'ca':
46 |             return keras.metrics.CategoricalAccuracy(**kwargs)
47 |         elif metric_name == 'ba':
48 |             return keras.metrics.BinaryAccuracy(**kwargs)
49 |         elif metric_name == 'recall':
50 |             return keras.metrics.Recall(**kwargs)
51 |         elif metric_name == 'precision':
52 |             return keras.metrics.Precision(**kwargs)
53 |         elif metric_name == 'mae':
54 |             return keras.metrics.MeanAbsoluteError(**kwargs)
55 |         elif metric_name == 'mse':
56 |             return keras.metrics.MeanSquaredError(**kwargs)
57 |         else:
58 |             raise ValueError('{} is unsupported metric now'.format(metric))
59 |     if callable(metric):
60 |         return metric
61 |     raise TypeError('type of metric must be str, dict or callable, but {} is found'.format(type(metric)))
62 | 
63 | 
64 | def get_optimizer(optimizer, lr_schedule=0.001, **kwargs):
65 |     if type(optimizer) == str:
66 |         optimizer = optimizer.lower()
67 |         if optimizer == 'adam':
68 |             return keras.optimizers.Adam(learning_rate=lr_schedule, **kwargs)
69 |         elif optimizer == 'sgd':
70 |             return keras.optimizers.SGD(learning_rate=lr_schedule, **kwargs)
71 |         elif optimizer == 'rmsprop':
72 |             return keras.optimizers.RMSprop(learning_rate=lr_schedule, **kwargs)
73 |         else:
74 |             raise ValueError('{} is unsupported optimizer now'.format(optimizer))
75 |     if isinstance(optimizer, keras.optimizers.Optimizer):
76 |         return optimizer
77 |     raise TypeError('type of optimizer must be str or a keras Optimizer, but {} is found'.format(type(optimizer)))
78 | 
79 | 
80 | def get_regularizer(reg):
81 |     if reg is None:
82 |         return None
83 |     if type(reg) == str:
84 |         return keras.regularizers.get(reg.lower())
85 |     if type(reg) == dict:
86 |         if 'l1' in reg and 'l2' in reg:
87 |             return keras.regularizers.l1_l2(l1=reg['l1'], l2=reg['l2'])
88 |         elif 'l1' in reg:
89 |             return keras.regularizers.l1(reg['l1'])
90 |         elif 'l2' in reg:
91 |             return keras.regularizers.l2(reg['l2'])
92 |         else:
93 |             raise ValueError('the dict keys for regularizer must be "l1", "l2", or both of them')
94 |     if callable(reg):
95 |         return reg
96 |     raise TypeError(f'type of regularizer must be None, str, dict or callable, but {type(reg)} is found')
97 | 
--------------------------------------------------------------------------------
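A hedged end-to-end sketch of the wrappers above (tf.keras assumed; the dict forms pass their remaining keys through as constructor kwargs):

    loss = get_loss({'loss_name': 'cce', 'label_smoothing': 0.1})  # CategoricalCrossentropy(from_logits=True, ...)
    metric = get_metric('ca')                                      # CategoricalAccuracy(name='ca')
    opt = get_optimizer('adam', lr_schedule=0.001)
    reg = get_regularizer({'l1': 1e-5, 'l2': 1e-4})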