├── .gitignore
├── __init__.py
├── choose_emotion_samples.py
├── emogst_hparams.py
├── emogst_train.py
├── eval.py
├── models
│   ├── __init__.py
│   ├── base.py
│   ├── emogst_tacotron2.py
│   ├── sygst_tacotron2.py
│   └── tacotron2.py
├── modules
│   ├── __init__.py
│   ├── attention.py
│   ├── custom_functions.py
│   ├── custom_layers.py
│   ├── decoders.py
│   ├── encoders.py
│   └── losses.py
├── predict_attention.py
├── prepare_meta_from_tfr.py
├── sygst_hparams.py
├── sygst_train.py
├── synthesizer.py
├── text
│   ├── __init__.py
│   ├── cleaners.py
│   ├── cmudict.py
│   ├── numbers.py
│   └── symbols.py
├── tfr_dset.py
└── utils
    ├── __init__.py
    ├── audio.py
    ├── ce_loss_util.py
    ├── center_loss_util.py
    ├── data.py
    ├── data.py.bak-0707
    ├── debug.py
    ├── index.html
    ├── infolog.py
    ├── mmd_utils.py
    ├── ops.py
    ├── parameter.py
    ├── plot.py
    └── tool_wrappers.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *DS_Store*
2 | *__pycache__*
3 | *.swp*
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | import models
2 | import modules
3 | import taco2_hparams
4 | import utils
--------------------------------------------------------------------------------
/choose_emotion_samples.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import shutil
4 | import numpy as np
5 |
6 | meta_file = 'bc2013/full_meta.txt'
7 | wavs_path = 'bc2013/wavs'
8 | mels_path = 'bc2013/mels'
9 | specs_path = 'bc2013/specs'
10 |
11 |
12 | def save_topk_files(saved_wavs_path, saved_mels_path, meta_path, lines):
13 |     # copy wavs and mels
14 |     for i, line in enumerate(lines):
15 |         wav_name = line[6] + '.wav'
16 |         mel_name = os.path.basename(line[7])
17 |         source_wav_name = os.path.join(wavs_path, wav_name)
18 |         source_mel_name = os.path.join(mels_path, mel_name)
19 |         saved_wav_name = os.path.join(saved_wavs_path, wav_name)
20 |         saved_mel_name = os.path.join(saved_mels_path, f'{i:03d}-' + mel_name)  # add a sorted prefix
21 |         os.makedirs(os.path.dirname(saved_wav_name), exist_ok=True)
22 |         os.makedirs(os.path.dirname(saved_mel_name), exist_ok=True)
23 |         shutil.copy(source_wav_name, saved_wav_name)
24 |         shutil.copy(source_mel_name, saved_mel_name)
25 |
26 |     # save meta lines
27 |     os.makedirs(os.path.dirname(meta_path), exist_ok=True)
28 |     lines = ['{}|{}|{}|{}|{}|{}|{}|{}|{}\n'.format(*line) for line in lines]
29 |     with open(meta_path, 'w') as fw:
30 |         fw.writelines(lines)
31 |
32 |
33 | def choose_emo_samples(meta_lines, base_dir='tmp', top_k=100, min_chars=50, min_frames=200):
34 |     mels_path = os.path.join(base_dir, 'emo_mel_npys/emo{}')
35 |     wavs_path = os.path.join(base_dir, 'emo_wavs/emo{}')
36 |     meta_path = os.path.join(base_dir, 'emo_metas/emo{}.txt')
37 |     for i in range(4):
38 |         topk_lines = []
39 |         # line[0] is the 4 categories of soft label: [neutral, angry, happy, sad]
40 |         sorted_lines = sorted(meta_lines, key=lambda x: x[0][i], reverse=True)
41 |         for line in sorted_lines:
42 |             if line[3] >= min_chars and line[4] >= min_frames:
43 |                 if i == 0:
44 |                     topk_lines.append(line)
45 |                 # elif i == 1 and line[1][1] > 0.5 and line[2][0] > 0.5:  # angry
46 |                 elif i == 1 and line[1][1] > 0.5 and line[2][0] > 0.45:  # angry
47 |
topk_lines.append(line) 48 | elif i == 2 and line[1][1] > 0.5 and line[2][1] > 0.45: # happy 49 | topk_lines.append(line) 50 | # elif i == 3 and line[1][0] > 0.5 and line[2][0] > 0.5: # sad 51 | elif i == 3 and line[1][0] > 0.40 and line[2][0] > 0.45: # sad 52 | topk_lines.append(line) 53 | if len(topk_lines) == top_k: 54 | break 55 | save_topk_files(wavs_path.format(i), mels_path.format(i), meta_path.format(i), topk_lines) 56 | 57 | 58 | def choose_aro_samples(meta_lines, base_dir='tmp', top_k=100, min_chars=50, min_frames=200): 59 | mels_path = os.path.join(base_dir, 'emo2d_mel_npys/arousal{}') 60 | wavs_path = os.path.join(base_dir, 'emo2d_wavs/arousal{}') 61 | meta_path = os.path.join(base_dir, 'emo2d_metas/arousal{}.txt') 62 | for i in range(2): 63 | topk_lines = [] 64 | # line[1] is the arousal soft label 65 | sorted_lines = sorted(meta_lines, key=lambda x: x[1][i], reverse=True) 66 | for line in sorted_lines: 67 | if line[3] >= min_chars and line[4] >= min_frames: 68 | if i == 0 and np.argmax(line[0]) in [0, 3]: 69 | topk_lines.append(line) 70 | elif i == 1 and np.argmax(line[0]) in [1, 2]: 71 | topk_lines.append(line) 72 | if len(topk_lines) == top_k: 73 | break 74 | save_topk_files(wavs_path.format(i), mels_path.format(i), meta_path.format(i), topk_lines) 75 | 76 | 77 | def choose_val_samples(meta_lines, base_dir='tmp', top_k=100, min_chars=50, min_frames=200): 78 | mels_path = os.path.join(base_dir, 'emo2d_mel_npys/valence{}') 79 | wavs_path = os.path.join(base_dir, 'emo2d_wavs/valence{}') 80 | meta_path = os.path.join(base_dir, 'emo2d_metas/valence{}.txt') 81 | for i in range(2): 82 | topk_lines = [] 83 | # line[2] is the valence soft label 84 | sorted_lines = sorted(meta_lines, key=lambda x: x[2][i], reverse=True) 85 | for line in sorted_lines: 86 | if line[3] >= min_chars and line[4] >= min_frames: 87 | if i == 0 and np.argmax(line[0]) in [1, 3]: 88 | topk_lines.append(line) 89 | elif i == 1 and np.argmax(line[0]) in [0, 2]: 90 | topk_lines.append(line) 91 | if len(topk_lines) == top_k: 92 | break 93 | save_topk_files(wavs_path.format(i), mels_path.format(i), meta_path.format(i), topk_lines) 94 | 95 | 96 | def main(): 97 | with open(meta_file, 'r') as fr: 98 | meta_lines = fr.readlines() 99 | 100 | def parse_emostrs(line): 101 | line[0] = json.loads(line[0]) # emotion 4 categories label 102 | line[1] = json.loads(line[1]) # arousal label 103 | line[2] = json.loads(line[2]) # valence label 104 | line[3] = int(line[3]) # text length 105 | line[4] = int(line[4]) # mel frame number 106 | return line 107 | meta_lines = [line.strip().split('|') for line in meta_lines if line.strip()[0] != '#'] 108 | meta_lines = list(map(parse_emostrs, meta_lines)) 109 | 110 | """ 111 | choose_emo_samples(meta_lines, 'emogst_emo_data') 112 | choose_aro_samples(meta_lines, 'sygst_emo_data') 113 | choose_val_samples(meta_lines, 'sygst_emo_data') 114 | """ 115 | choose_emo_samples(meta_lines, top_k=200) 116 | choose_aro_samples(meta_lines, top_k=200) 117 | choose_val_samples(meta_lines, top_k=200) 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /emogst_hparams.py: -------------------------------------------------------------------------------- 1 | from utils.parameter import HParams 2 | 3 | hp = HParams( 4 | # text 5 | cleaners='english_cleaners', 6 | 7 | # audio 8 | num_mels=80, 9 | num_spec=1025, # n_fft / 2 + 1 only used when adding linear spectrograms post processing network 10 | sample_rate=16000, 11 | 
win_ms=50,    # 50 ms window (0.05 * sample_rate); at sample_rate=16000 this is 800 samples. (If None, win_size = n_fft)
12 |     hop_ms=12.5,  # 12.5 ms hop (0.0125 * sample_rate); at sample_rate=16000 this is 200 samples
13 |     n_fft=2048,
14 |     min_level_db=-100,
15 |     ref_level_db=20,
16 |     fmin=95,  # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
17 |     fmax=7600,  # To be increased/reduced depending on data.
18 |     preemphasis=0.97,  # filter coefficient.
19 |     griffin_lim_power=1.5,  # Only used in G&L inversion; values between 1.2 and 1.5 are usually a good choice.
20 |     griffin_lim_iters=60,  # Number of G&L iterations; typically 30 is enough, but we use 60 to ensure convergence.
21 |
22 |     # Tacotron
23 |     outputs_per_step=3,  # number of frames to generate at each decoding step (increasing it speeds up computation and allows a higher batch size, but decreases G&L audio quality)
24 |     feed_last_frame=True,  # whether to feed all r frames or only the last of the r frames
25 |     stop_at_any=True,  # Determines whether the decoder should stop when predicting <stop> for any frame or for all of them (True works pretty well)
26 |     clip_outputs=True,  # Whether to clip spectrograms to T2_output_range (even in loss computation), i.e. don't penalize the model for exceeding the output range; bring values back to the borders.
27 |     lower_bound_decay=0.0,  # Small regularizer for noise synthesis by adding a small range of penalty for silence regions. Set to 0 to clip in the Tacotron range.
28 |     clip_min=0,
29 |     clip_max=1,
30 |
31 |     # Input parameters
32 |     num_symbols=150,
33 |     embedding_dim=512,  # dimension of embedding space
34 |
35 |     # Encoder parameters
36 |     encoder_type='taco2',  # 'taco' selects the CBHG encoder
37 |     encoder_cnns=[3, 5, 512],  # num_layers, kernel_size, channels
38 |     encoder_rnns_units=256,  # number of lstm units for each direction (forward and backward)
39 |
40 |     # reference encoder parameters
41 |     reference_channels=[32, 32, 64, 64, 128, 128],
42 |     reference_rnn_units=128,
43 |
44 |     # gst parameters
45 |     gst_heads=4,
46 |     gst_tokens=10,
47 |     gst_units=256,
48 |     gst_atten_units=128,
49 |     gst_atten_type='mlp',  # attention type for the gst self-attention module (dot or mlp)
50 |     gst_activation=None,
51 |     gst_trainable=True,  # False in the nvidia gst code
52 |
53 |     # emotion parameters
54 |     emo_used=False,
55 |     emotion_embedding_units=128,
56 |
57 |     # Attention mechanism
58 |     smoothing=False,  # Whether to smooth the attention normalization function
59 |     attention_type='location',  # sma: stepwise monotonic; location: location sensitive
60 |     attention_units=128,  # dimension of attention space
61 |     attention_filters=32,  # number of attention convolution filters
62 |     attention_kernel_size=(31, ),  # kernel size of attention convolution
63 |     attention_sma_normalize=True,
64 |     attention_sma_sigmoid_noise=2.0,
65 |     attention_sma_sigmoid_noise_seed=None,
66 |     attention_sma_score_bias_init=3.5,
67 |     attention_sma_mode='parallel',
68 |
69 |     # Attention synthesis constraints
70 |     # "Monotonic" constraint forces the model to only look at the forward synthesis_win_size steps.
71 |     # "Window" allows the model to look at synthesis_win_size neighbors, both forward and backward steps.
72 |     synthesis_constraint=True,  # Whether to use attention window constraints in synthesis only (useful for long utterances)
73 |     # synthesis_constraint_type='window',  # can be in ('window', 'monotonic').
74 |     synthesis_win_size=7,  # Size of each side of the window; the current step does not count. If the mode is 'window' and synthesis_win_size is odd, the 1 extra step is given to the backward part of the window.
75 |     synthesis_softmax_temp=1.0,
76 |
77 |     # Decoder
78 |     prenet_units=[256, 256],  # number and sizes of prenet dense layers
79 |     attention_rnn_units=[1024, 1024],  # number and sizes of decoder lstm layers
80 |     decode_rnn_units=None,  # number of decoder lstm units on each layer
81 |     max_iters=2000,  # Max decoder steps during inference (just for safety from infinite-loop cases)
82 |     impute_finished=False,
83 |     frame_activation='relu',
84 |
85 |     # Residual postnet
86 |     postnet_cnns=[5, 5, 512],  # num_layers, kernel_size, channels
87 |
88 |     # CBHG mel->linear postnet
89 |     post_cbhg=True,
90 |     cbhg_kernels=8,  # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act as "K-grams"
91 |     cbhg_conv_channels=128,  # Channels of the convolution bank
92 |     cbhg_pool_size=2,  # pooling size of the CBHG
93 |     cbhg_projection=256,  # projection channels of the CBHG (1st projection; the 2nd is automatically set to num_mels)
94 |     cbhg_projection_kernel_size=3,  # kernel_size of the CBHG projections
95 |     cbhg_highway_nums=4,  # Number of HighwayNet layers
96 |     cbhg_highway_units=128,  # Number of units used in HighwayNet fully connected layers
97 |     cbhg_rnn_units=128,  # Number of GRU units used in the bidirectional RNN of the CBHG block. CBHG output is 2x rnn_units in shape
98 |
99 |     # Loss params
100 |     mask_encoder=True,  # whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence.
101 |     mask_decoder=False,  # set False for alignments converging faster
102 |     cross_entropy_pos_weight=20,  # Use class weights to reduce the stop-token class imbalance (by adding more penalty on False Negatives (FN)) (1 = disabled)
103 |     mel_loss='mae',
104 |     spec_loss='mae',
105 |
106 |
107 |     # Tacotron Training
108 |     # Reproduction seeds
109 |     random_seed=5339,  # Determines initial graph and operations (i.e. model) random state for reproducibility
110 |     # tacotron_data_random_state=1234,  # random state for train/test split repeatability
111 |
112 |     # performance parameters
113 |     tacotron_swap_with_cpu=False,  # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause major slowdowns! Only use when critical!)
114 |
115 |     # train/test split ratios, mini-batch sizes
116 |     batch_size=32,  # number of training samples on each training step
117 |     # Tacotron batch synthesis supports ~16x the training batch size (no gradients during testing).
118 |     # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times different from training. We thus recommend masking the encoder.
119 |     tacotron_synthesis_batch_size=1,  # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
120 |     tacotron_test_size=0.05,  # % of data to keep as test data; if None, tacotron_test_batches must not be None. (5% is enough to have a good idea about overfit)
121 |     tacotron_test_batches=None,  # number of test batches.
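    # Worked example for the split above: with batch_size=32 and
    # tacotron_test_size=0.05, a corpus of 10000 utterances would keep
    # 500 utterances for test, i.e. roughly 15 test batches (500 // 32).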
122 | 123 | # Learning rate schedule 124 | decay_learning_rate=True, # boolean, determines if the learning rate will follow an exponential decay 125 | start_decay=40000, # Step at which learning decay starts 126 | decay_steps=18000, # Determines the learning rate decay slope (UNDER TEST) 127 | decay_rate=0.5, # learning rate decay rate (UNDER TEST) 128 | # initial_learning_rate=1e-3, # starting learning rate 129 | initial_learning_rate=0.002, 130 | final_learning_rate=1e-4, # minimal learning rate 131 | 132 | # Optimization parameters 133 | adam_beta1=0.9, # AdamOptimizer beta1 parameter 134 | adam_beta2=0.999, # AdamOptimizer beta2 parameter 135 | adam_epsilon=1e-6, # AdamOptimizer Epsilon parameter 136 | 137 | # Regularization parameters 138 | # reg_weight=1e-6, # regularization weight (for L2 regularization) 139 | reg_weight=None, # regularization weight (for L2 regularization) 140 | scale_regularization=False, # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 141 | zoneout_rate=0.1, # zoneout rate for all LSTM cells in the network 142 | dropout_rate=0.5, # dropout rate for all convolutional layers + prenet 143 | clip_gradients=True, # whether to clip gradients 144 | ) 145 | -------------------------------------------------------------------------------- /emogst_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import time 4 | import argparse 5 | import traceback 6 | import numpy as np 7 | import tensorflow as tf 8 | from datetime import datetime 9 | 10 | 11 | from tfr_dset import TFDataSet 12 | from text import sequence_to_text 13 | from utils import audio, plot, infolog, ValueWindow # , debug 14 | 15 | from emogst_hparams import hp 16 | from models.emogst_tacotron2 import Tacotron2EMOGST 17 | 18 | log = infolog.log 19 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 20 | 21 | 22 | _max_step = 500000 23 | 24 | # spec_length max = 1116 25 | # text length max = 99 26 | 27 | 28 | def time_string(): 29 | return datetime.now().strftime('%Y-%m-%d %H:%M') 30 | 31 | 32 | def debug_data(batch=32, time_in=100, time_out=500): 33 | text_x = np.random.randint(0, 150, size=(batch, time_in), dtype=np.int32) 34 | mel = np.random.randn(batch, time_out, 80).astype(np.float32) 35 | spec = np.random.randn(batch, time_out, 1025).astype(np.float32) 36 | spec_len = np.random.randint(time_out // 2, time_out, size=batch, dtype=np.int32) 37 | aro_label = np.random.rand(batch, 2).astype(np.float32) 38 | val_label = np.random.rand(batch, 2).astype(np.float32) 39 | 40 | print('text_input:', text_x[0], 'spec_len:', spec_len, sep='\n') 41 | return text_x, mel, spec, spec_len, aro_label, val_label 42 | 43 | 44 | def train(log_dir, args): 45 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 46 | # input_path = os.path.join(args.base_dir, args.input) 47 | log(hp.to_string(), is_print=False) 48 | log('Loading training data from: %s' % args.tfr_dir) 49 | log('Checkpoint path: %s' % checkpoint_path) 50 | log('Using model: emogst tacotron2') 51 | 52 | tf_dset = TFDataSet(hp, args.tfr_dir) 53 | feats = tf_dset.get_train_next() 54 | # Set up model: 55 | global_step = tf.Variable(0, name='global_step', trainable=False) 56 | training = tf.placeholder_with_default(True, shape=(), name='training') 57 | with tf.name_scope('model'): 58 | model = Tacotron2EMOGST(hp) 59 | model(feats['inputs'], 60 | mel_inputs=feats['mel_targets'], 61 | 
spec_inputs=feats['linear_targets'], 62 | spec_lengths=feats['spec_lengths'], 63 | ref_inputs=feats['mel_targets'], 64 | ref_lengths=feats['spec_lengths'], 65 | emo_labels=feats['soft_emo_labels'], 66 | training=training) 67 | model.add_loss() 68 | model.add_optimizer(global_step) 69 | stats = model.add_stats() 70 | 71 | # Bookkeeping: 72 | step = 0 73 | time_window = ValueWindow(100) 74 | loss_window = ValueWindow(100) 75 | saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2) 76 | 77 | # Train! 78 | config = tf.ConfigProto(allow_soft_placement=True, 79 | gpu_options=tf.GPUOptions(allow_growth=True)) 80 | with tf.Session(config=config) as sess: 81 | try: 82 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 83 | sess.run(tf.global_variables_initializer()) 84 | if args.restore_step: 85 | # Restore from a checkpoint if the user requested it. 86 | restore_path = '%s-%s' % (checkpoint_path, args.restore_step) 87 | saver.restore(sess, restore_path) 88 | log('Resuming from checkpoint: %s' % restore_path, slack=True) 89 | else: 90 | log('Starting a new training run ...', slack=True) 91 | 92 | """ 93 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, 94 | model.spec_loss, model.stop_loss, model.emo_loss, model.mel_grad_norms_max, 95 | model.spec_grad_norms_max, model.stop_grad_norms_max, model.emo_grad_norms_max] 96 | """ 97 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, 98 | model.spec_loss, model.stop_loss, model.emo_loss] 99 | for _ in range(_max_step): 100 | start_time = time.time() 101 | # sess.run(debug.get_ops()) 102 | # step, _, loss, mel_loss, spec_loss, stop_loss, emo_loss, mel_g, spec_g, stop_g, emo_g = sess.run(fetches) 103 | step, _, loss, mel_loss, spec_loss, stop_loss, emo_loss = sess.run(fetches) 104 | time_window.append(time.time() - start_time) 105 | loss_window.append(loss) 106 | """ 107 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,el=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,eg=%.4f]' % ( 108 | step, time_window.average, mel_loss, spec_loss, stop_loss, emo_loss, mel_g, spec_g, stop_g, emo_g) 109 | """ 110 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,el=%.3f]' % ( 111 | step, time_window.average, mel_loss, spec_loss, stop_loss, emo_loss) 112 | log(message, slack=(step % args.checkpoint_interval == 0)) 113 | 114 | if loss > 100 or math.isnan(loss): 115 | log('Loss exploded to %.5f at step %d!' 
% (loss, step), slack=True) 116 | raise Exception('Loss Exploded') 117 | 118 | if step % args.summary_interval == 0: 119 | log('Writing summary at step: %d' % step) 120 | try: 121 | summary_writer.add_summary(sess.run(stats), step) 122 | except Exception as e: 123 | log(f'summary failed and ignored: {str(e)}') 124 | 125 | if step % args.checkpoint_interval == 0: 126 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 127 | saver.save(sess, checkpoint_path, global_step=step) 128 | log('Saving audio and alignment...') 129 | gt_mel, gt_spec, seq, mel, spec, align = sess.run([model.mel_targets[0], model.spec_targets[0], 130 | model.text_targets[0], model.mel_outputs[0], 131 | model.spec_outputs[0], model.alignment_outputs[0]]) 132 | text = sequence_to_text(seq) 133 | wav = audio.inv_spectrogram(hp, spec.T) 134 | wav_path = os.path.join(log_dir, 'step-%d-audio.wav' % step) 135 | mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step) 136 | spec_path = os.path.join(log_dir, 'step-%d-spec.png' % step) 137 | align_path = os.path.join(log_dir, 'step-%d-align.png' % step) 138 | info = '%s, %s, step=%d, loss=%.5f\n %s' % (args.model, time_string(), step, loss, text) 139 | plot.plot_alignment(align, align_path, info=info) 140 | plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel) 141 | plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec) 142 | audio.save_wav(hp, wav, wav_path) 143 | log('Input: %s' % text) 144 | 145 | except Exception as e: 146 | log('Exiting due to exception: %s' % e, slack=True) 147 | traceback.print_exc() 148 | 149 | 150 | def main(): 151 | parser = argparse.ArgumentParser() 152 | parser.add_argument('--gpu', default='0') 153 | parser.add_argument('--log', '-l', default='') 154 | parser.add_argument('--restore_step', '-r', default=None) 155 | parser.add_argument('--tfr_dir', default='bc2013/training/tfrs_with_emo_feature') 156 | args = parser.parse_args() 157 | 158 | args.model = 'emogst_taco2' 159 | args.summary_interval = 200 160 | args.checkpoint_interval = 5000 161 | # args.summary_interval = 2 162 | # args.checkpoint_interval = 5 163 | 164 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 165 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 166 | log_dir = 'emogst_logs' + ('_' + args.log if args.log else '') 167 | os.makedirs(log_dir, exist_ok=True) 168 | 169 | tf.set_random_seed(hp.random_seed) 170 | infolog.init(os.path.join(log_dir, 'train.log'), args.model) 171 | 172 | train(log_dir, args) 173 | 174 | 175 | if __name__ == '__main__': 176 | main() 177 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thuhcsi/icassp2021-emotion-tts/45bc25405e7c0f51f45727cc91cd41573c05bc65/models/__init__.py -------------------------------------------------------------------------------- /models/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | # from tensorflow.keras import layers 5 | from tensorflow.keras import backend as K 6 | 7 | from utils.infolog import log 8 | # from utils.debug import debug_print 9 | from modules import custom_layers as cl 10 | from modules import custom_functions as cf 11 | from modules.losses import get_mel_loss, get_spec_loss, get_stop_loss 12 | 13 | 14 | class TacotronBase(keras.Model): 15 | def __init__(self, hp, 16 | encoder, 17 | decoder, 18 | 
postnet=None,
19 |                  postcbhg=None,
20 |                  name='TacoBase'):
21 |         super(TacotronBase, self).__init__(name=name)
22 |         self.hp = hp
23 |         self.encoder = encoder
24 |         self.decoder = decoder
25 |         self.postnet = postnet
26 |         self.postcbhg = postcbhg
27 |
28 |     def call(self, text_inputs, mel_inputs=None,
29 |              spec_inputs=None, spec_lengths=None, training=None):
30 |
31 |         self.training = K.learning_phase() if training is None else training
32 |         self.batch_size = tf.shape(text_inputs)[0]
33 |
34 |         # trim the inputs to a length that is a multiple of r
35 |         if mel_inputs is not None and spec_inputs is not None:
36 |             mel_inputs, spec_inputs, spec_lengths = cf.trim_inputs(
37 |                 self.hp.outputs_per_step, mel_inputs, spec_inputs, spec_lengths)
38 |
39 |         # encoder
40 |         encoder_outputs = self.encoder_call(text_inputs)
41 |
42 |         # set values for attention layer and text_input_shape for decoder_cell
43 |         self.atten_layer.set_values_keys(values=encoder_outputs)
44 |         self.decoder_cell.set_batch_timesteps(self.batch_size, tf.shape(encoder_outputs)[1])
45 |
46 |         # decoder
47 |         outputs = self.decoder_call(mel_inputs, spec_inputs, spec_lengths)
48 |         return outputs
49 |
50 |     def encoder_call(self, text_inputs):
51 |         # encoder: takes text (char ids) as input, outputs text embeddings (with mask)
52 |         # [batch, text_time, embedding_dim]
53 |         encoder_outputs = self.encoder(text_inputs, training=self.training)
54 |
55 |         self.text_targets = text_inputs
56 |         self.encoder_outputs = encoder_outputs
57 |
58 |         return encoder_outputs
59 |
60 |     def decoder_call(self, mel_inputs=None, spec_inputs=None, spec_lengths=None):
61 |         hp = self.hp
62 |         training = self.training
63 |         batch_size = self.batch_size
64 |
65 |         # decoder: takes mels as input, outputs mels, stop_tokens, alignments, and seq_lengths
66 |         # mel [batch, mel_time / r, mel_num * r], stop_token [batch, mel_time / r, r]
67 |         # alignments [batch, mel_time / r, text_time], seq_length [batch]
68 |         decoder_outputs = self.decoder(inputs=mel_inputs,
69 |                                        inputs_lengths=spec_lengths,
70 |                                        batch_size=batch_size,
71 |                                        training=training)
72 |         (decoder_mel, stop_outputs, alignments), _, seq_length_outputs = decoder_outputs
73 |
74 |         # output r frames of mel at each time step
75 |         # mel [batch, mel_time, mel_num], stop_token [batch, mel_time, 1]
76 |         decoder_mel = tf.reshape(decoder_mel, shape=[batch_size, -1, hp.num_mels])
77 |         stop_outputs = tf.reshape(stop_outputs, shape=[batch_size, -1])
78 |         if hp.clip_outputs:
79 |             c_min, c_max = hp.clip_min - hp.lower_bound_decay, hp.clip_max
80 |             decoder_mel = tf.clip_by_value(decoder_mel, c_min, c_max)
81 |
82 |         # Postnet
83 |         if self.postnet:
84 |             residual_mel = self.postnet(decoder_mel, training)
85 |         else:
86 |             residual_mel = 0.
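        # With the defaults above (clip_min=0, clip_max=1, lower_bound_decay=0.0),
        # the clipping below keeps mel values in [0, 1]: e.g. a predicted value of
        # -0.2 becomes 0.0 and 1.3 becomes 1.0, so the loss never penalizes the
        # model for exceeding the output range (see the clip_outputs hparam note).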
87 | 88 | # Mel outputs 89 | mel_outputs = decoder_mel + residual_mel 90 | if hp.clip_outputs: 91 | c_min, c_max = hp.clip_min - hp.lower_bound_decay, hp.clip_max 92 | # mel_outputs = mel_outputs - 0.05 93 | # mel_outputs = tf.where(mel_outputs < 0.3, mel_outputs - 0.05, mel_outputs) 94 | mel_outputs = tf.clip_by_value(mel_outputs, c_min, c_max) 95 | 96 | # CBHG convert mel to linear spectrum 97 | if self.postcbhg: 98 | self.spec_projection = cl.FrameProjection(hp.num_spec, hp.frame_activation) 99 | post_outputs = self.postcbhg(mel_outputs, training) 100 | spec_outputs = self.spec_projection(post_outputs) 101 | 102 | if hp.clip_outputs: 103 | c_min, c_max = hp.clip_min - hp.lower_bound_decay, hp.clip_max 104 | # spec_outputs = spec_outputs - 0.05 105 | # spec_outputs = tf.where(spec_outputs < 0.6, spec_outputs - 0.1, spec_outputs) 106 | spec_outputs = tf.clip_by_value(spec_outputs, c_min, c_max) 107 | self.spec_outputs = spec_outputs 108 | 109 | self.mel_targets = mel_inputs 110 | self.spec_targets = spec_inputs 111 | self.spec_length_targets = spec_lengths 112 | 113 | self.mel_outputs = mel_outputs 114 | self.mel_decoder_outputs = decoder_mel 115 | self.stop_outputs = stop_outputs 116 | self.alignment_outputs = tf.transpose(alignments, [0, 2, 1]) # [batch, encode_time, decode_time] 117 | self.seq_length_outputs = seq_length_outputs 118 | 119 | self.all_vars = tf.trainable_variables() 120 | 121 | is_print = True 122 | log(f'{self.name} Model Dimensions: ', is_print=is_print) 123 | log(' text embedding: %d' % self.encoder.embed_output.shape[-1], is_print=is_print) 124 | log(' encoder out: %d' % self.encoder_outputs.shape[-1], is_print=is_print) 125 | log(' decoder out: %d' % mel_outputs.shape[-1], is_print=is_print) 126 | log(' postcbhg out: %d' % post_outputs.shape[-1], is_print=is_print) 127 | log(' linear out: %d' % spec_outputs.shape[-1], is_print=is_print) 128 | log(' Model Parameters {:.3f} Million.'.format(np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000)) 129 | 130 | return mel_outputs, seq_length_outputs 131 | 132 | def add_loss(self, name='loss'): 133 | '''Adds loss to the model. Sets "loss" field. 
initialize must have been called.''' 134 | 135 | priority_freq_n = int(2000 / (self.hp.sample_rate * 0.5) * self.hp.num_spec) 136 | with tf.name_scope(name): 137 | mel_loss, spec_loss = self.hp.mel_loss, self.hp.spec_loss 138 | if self.hp.mask_decoder: 139 | self.before_mel_loss = get_mel_loss(self.mel_targets, self.mel_decoder_outputs, 140 | self.spec_length_targets, method=mel_loss) 141 | self.after_mel_loss = get_mel_loss(self.mel_targets, self.mel_outputs, 142 | self.spec_length_targets, method=mel_loss) 143 | self.spec_loss = get_spec_loss(self.spec_targets, self.spec_outputs, priority_freq_n, 144 | self.spec_length_targets, method=spec_loss) 145 | self.stop_loss = get_stop_loss(None, self.stop_outputs, self.hp.outputs_per_step, 146 | self.spec_length_targets, do_mask=True, 147 | pos_weight=self.hp.cross_entropy_pos_weight) 148 | else: 149 | self.before_mel_loss = get_mel_loss(self.mel_targets, self.mel_decoder_outputs, method=mel_loss) 150 | self.after_mel_loss = get_mel_loss(self.mel_targets, self.mel_outputs, method=mel_loss) 151 | self.spec_loss = get_spec_loss(self.spec_targets, self.spec_outputs, priority_freq_n, method=spec_loss) 152 | self.stop_loss = get_stop_loss(None, self.stop_outputs, self.hp.outputs_per_step, 153 | self.spec_length_targets) 154 | 155 | self.reg_loss = tf.constant(0.0) 156 | if self.hp.reg_weight is not None: 157 | """ 158 | self.reg_vars = [v for v in self.all_vars if not('bias' in v.name or 'atten_cell' in v.name 159 | or '_projection' in v.name or 'embeddings' in v.name 160 | or 'gru' in v.name or 'lstm' in v.name)] 161 | """ 162 | self.reg_vars = [v for v in self.all_vars if not('bias' in v.name or '_projection' in v.name or 'embeddings' in v.name)] 163 | self.reg_loss = self.hp.reg_weight * tf.add_n([tf.nn.l2_loss(v) for v in self.reg_vars], name='reg_loss') 164 | 165 | self.mel_loss = self.before_mel_loss + self.after_mel_loss 166 | self.loss = self.mel_loss + self.spec_loss + self.stop_loss + self.reg_loss 167 | 168 | def add_optimizer(self, global_step, update_step=True, name='optimizer'): 169 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 
170 |         # Arguments
171 |             global_step: int32 scalar Tensor representing current global step in training
172 |         '''
173 |         with tf.name_scope(name):
174 |             hp = self.hp
175 |             if hp.decay_learning_rate:
176 |                 self.learning_rate = self.add_learning_rate(hp.initial_learning_rate, global_step)
177 |             else:
178 |                 self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate)
179 |             optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
180 |             gradients, variables = zip(*optimizer.compute_gradients(self.loss))
181 |             clipped_gradients = [tf.clip_by_value(g, -1.0, 1.0) for g in gradients]  # gradient explosion happens frequently without this
182 |             clipped_gradients, global_norm = tf.clip_by_global_norm(clipped_gradients, 1.0)
183 |
184 |             with tf.control_dependencies(self.updates):
185 |                 self.optimize = optimizer.apply_gradients(
186 |                     zip(clipped_gradients, variables),
187 |                     global_step=global_step if update_step else None,
188 |                 )
189 |
190 |             self.optimizer = optimizer
191 |             self.gradients = gradients
192 |             self.global_gradient_norm = global_norm
193 |
194 |     def add_stats(self, name='stats'):
195 |         with tf.name_scope(name):
196 |             tf.summary.scalar('loss', self.loss)
197 |             tf.summary.scalar('reg_loss', self.reg_loss)
198 |             tf.summary.scalar('mel_loss', self.mel_loss)
199 |             tf.summary.scalar('spec_loss', self.spec_loss)
200 |             tf.summary.scalar('stop_loss', self.stop_loss)
201 |             tf.summary.scalar('learning_rate', self.learning_rate)
202 |             tf.summary.histogram('spec_outputs', self.spec_outputs)
203 |             tf.summary.histogram('spec_targets', self.spec_targets)
204 |             tf.summary.histogram('mel_outputs', self.mel_outputs)
205 |             tf.summary.histogram('mel_targets', self.mel_targets)
206 |
207 |             self.total_grad_norms = [tf.norm(g) for g in self.gradients]
208 |             self.reg_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.reg_loss) if g[0] is not None]
209 |             self.mel_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.mel_loss) if g[0] is not None]
210 |             self.spec_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.spec_loss) if g[0] is not None]
211 |             self.stop_grad_norms = [tf.norm(g[0]) for g in self.optimizer.compute_gradients(self.stop_loss) if g[0] is not None]
212 |             self.total_grad_norms_max = tf.reduce_max(self.total_grad_norms)
213 |             self.reg_grad_norms_max = tf.reduce_max(self.reg_grad_norms)
214 |             self.mel_grad_norms_max = tf.reduce_max(self.mel_grad_norms)
215 |             self.spec_grad_norms_max = tf.reduce_max(self.spec_grad_norms)
216 |             self.stop_grad_norms_max = tf.reduce_max(self.stop_grad_norms)
217 |
218 |             tf.summary.scalar('global_grad_norm', self.global_gradient_norm)
219 |             tf.summary.scalar('total_grad_norms_max', self.total_grad_norms_max)
220 |             tf.summary.scalar('reg_grad_norms_max', self.reg_grad_norms_max)
221 |             tf.summary.scalar('mel_grad_norms_max', self.mel_grad_norms_max)
222 |             tf.summary.scalar('spec_grad_norms_max', self.spec_grad_norms_max)
223 |             tf.summary.scalar('stop_grad_norms_max', self.stop_grad_norms_max)
224 |
225 |             tf.summary.histogram('total_grad_norms', self.total_grad_norms)
226 |             tf.summary.histogram('reg_grad_norms', self.reg_grad_norms)
227 |             tf.summary.histogram('mel_grad_norms', self.mel_grad_norms)
228 |             tf.summary.histogram('spec_grad_norms', self.spec_grad_norms)
229 |             tf.summary.histogram('stop_grad_norms', self.stop_grad_norms)
230 |             return tf.summary.merge_all()
231 |
232 |     def add_learning_rate(self, init_lr, global_step):
233 |         # Noam scheme from tensor2tensor:
234 |         warmup_steps = 4000.0
235 |         step = tf.cast(global_step + 1, dtype=tf.float32)
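        # In closed form: for step < warmup_steps the term step * warmup_steps**-1.5
        # dominates the minimum (linear warmup); afterwards step**-0.5 dominates
        # (inverse-sqrt decay). E.g. with initial_learning_rate=0.002, the peak at
        # step 4000 is 0.002 * 4000**0.5 * 4000**-0.5 = 0.002.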
236 |         return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)
237 |
--------------------------------------------------------------------------------
/models/emogst_tacotron2.py:
--------------------------------------------------------------------------------
1 | # import numpy as np
2 | import tensorflow as tf
3 | from tensorflow import keras
4 | from tensorflow.keras import layers
5 | from tensorflow.keras import backend as K
6 |
7 | # from utils.debug import debug_print
8 |
9 | from modules import custom_layers as cl
10 | from modules import custom_functions as cf
11 | from modules.attention import GSTAttention
12 |
13 | from models.tacotron2 import Tacotron2
14 |
15 |
16 | class Tacotron2EMOGST(Tacotron2):
17 |     def __init__(self, hp, gta=False, name='taco2_emogst'):
18 |         super(Tacotron2EMOGST, self).__init__(hp, gta, name=name)
19 |         self.reference_encoder = cl.ReferenceEncoder(
20 |             channels=hp.reference_channels,
21 |             rnn_units=hp.reference_rnn_units,
22 |             output_units=hp.emotion_embedding_units)
23 |         self.gst_attention = GSTAttention(
24 |             num_heads=hp.gst_heads,
25 |             num_tokens=hp.gst_tokens,
26 |             gst_units=hp.gst_units,
27 |             attention_units=hp.gst_atten_units,
28 |             attention_type=hp.gst_atten_type,
29 |             activation=hp.gst_activation,
30 |             trainable=hp.gst_trainable)
31 |
32 |     def call(self,
33 |              text_inputs,
34 |              mel_inputs=None,
35 |              spec_inputs=None,
36 |              spec_lengths=None,
37 |              ref_inputs=None,
38 |              ref_lengths=None,
39 |              emo_labels=None,
40 |              atten_weights_ph=None,
41 |              training=None):
42 |
43 |         self.training = K.learning_phase() if training is None else training
44 |         self.batch_size = tf.shape(text_inputs)[0]
45 |
46 |         # trim the inputs to a length that is a multiple of r
47 |         if mel_inputs is not None and spec_inputs is not None:
48 |             mel_inputs, spec_inputs, spec_lengths = cf.trim_inputs(
49 |                 self.hp.outputs_per_step, mel_inputs, spec_inputs, spec_lengths)
50 |
51 |         # set reference to mel if it is not given
52 |         if ref_inputs is None:
53 |             ref_inputs = mel_inputs
54 |             ref_lengths = spec_lengths
55 |
56 |         # encoder
57 |         encoder_outputs = self.encoder_call(text_inputs)
58 |
59 |         # reference encoder
60 |         ref_outputs = None
61 |         if ref_inputs is not None:  # the training flag used to be omitted here, which caused NaN gradients in the BN parameters
62 |             ref_outputs = self.reference_encoder(ref_inputs, x_length=ref_lengths, training=training)
63 |         gst_outputs = self.gst_attention(ref_outputs, atten_weights_ph=atten_weights_ph, training=training)
64 |
65 |         self.ref_outputs = ref_outputs  # [N, ref_output_units]
66 |         self.gst_outputs = gst_outputs  # [N, 1, gst_units]
67 |         self.gst_weights = self.gst_attention.atten_weights  # [N, gst_heads, 1, gst_tokens]
68 |
69 |         self.add_emotion_task(self.gst_weights)
70 |         self.emo_labels = emo_labels
71 |
72 |         gst_outputs = tf.tile(gst_outputs, [1, tf.shape(encoder_outputs)[1], 1])
73 |         encoder_outputs = tf.concat([encoder_outputs, gst_outputs], axis=-1)
74 |
75 |         # set values for attention layer and batch_timesteps for decoder_cell
76 |         self.atten_layer.set_values_keys(values=encoder_outputs)
77 |         self.decoder_cell.set_batch_timesteps(self.batch_size, tf.shape(encoder_outputs)[1])
78 |
79 |         # decoder
80 |         outputs = self.decoder_call(mel_inputs, spec_inputs, spec_lengths)
81 |         return outputs
82 |
83 |     def add_emotion_task(self, gst_weights):
84 |         if self.hp.emo_used:
85 |             weights_dim = self.hp.gst_heads * self.hp.gst_tokens
86 |             gst_weights = tf.reshape(gst_weights, [-1, weights_dim])
87 |             self.emo_logits = layers.Dense(4, name='emo_dense')(gst_weights)
88 |         else:
89 |             self.emo_logits = None
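    # Shape walk-through for add_emotion_task, using the emogst_hparams values
    # (gst_heads=4, gst_tokens=10): gst_weights of shape [N, 4, 1, 10] is
    # reshaped to [N, 40] and projected by emo_dense to 4 logits, one per
    # soft emotion class [neutral, angry, happy, sad].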
90 |
91 |     def add_loss(self):
92 |         self.emo_loss = tf.constant(0.0)
93 |         if self.emo_logits is not None:
94 |             loss_fn = keras.losses.CategoricalCrossentropy(from_logits=True)
95 |             self.emo_loss = loss_fn(self.emo_labels, self.emo_logits)
96 |
97 |         super().add_loss()
98 |         self.loss = self.loss + self.emo_loss
99 |
100 |     def add_stats(self):
101 |         with tf.variable_scope('stats'):
102 |             if self.hp.emo_used:
103 |                 emo_grads = self.optimizer.compute_gradients(self.emo_loss)
104 |                 self.emo_grad_norms = [tf.norm(g[0]) for g in emo_grads if g[0] is not None]
105 |                 self.emo_grad_norms_max = tf.reduce_max(self.emo_grad_norms)
106 |                 tf.summary.scalar('emo_grad_norms_max', self.emo_grad_norms_max)
107 |                 tf.summary.histogram('emo_grad_norms', self.emo_grad_norms)
108 |             else:
109 |                 self.emo_grad_norms = tf.constant(0)
110 |                 self.emo_grad_norms_max = tf.constant(0)
111 |
112 |             tf.summary.scalar('emo_loss', self.emo_loss)
113 |             super().add_stats(name='base_stats')
114 |             return tf.summary.merge_all()
--------------------------------------------------------------------------------
/models/sygst_tacotron2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow import keras
4 | from tensorflow.keras import layers
5 | from tensorflow.keras import backend as K
6 |
7 | # from utils.debug import debug_print
8 |
9 | from modules import custom_layers as cl
10 | from modules import custom_functions as cf
11 | from modules.attention import GSTAttention
12 |
13 | from models.tacotron2 import Tacotron2
14 |
15 |
16 | class Tacotron2SYGST(Tacotron2):
17 |     def __init__(self, hp, gta=False, name='taco2_sygst'):
18 |         super(Tacotron2SYGST, self).__init__(hp, gta, name=name)
19 |         self.reference_encoder = cl.ReferenceEncoder(
20 |             channels=hp.reference_channels,
21 |             rnn_units=hp.reference_rnn_units,
22 |             output_units=hp.emotion_embedding_units)
23 |         self.gst_attention = GSTAttention(
24 |             num_heads=hp.gst_heads,
25 |             num_tokens=hp.gst_tokens,
26 |             gst_units=hp.gst_units,
27 |             attention_units=hp.gst_atten_units,
28 |             attention_type=hp.gst_atten_type,
29 |             activation=hp.gst_activation,
30 |             trainable=hp.gst_trainable)
31 |
32 |     def call(self,
33 |              text_inputs,
34 |              mel_inputs=None,
35 |              spec_inputs=None,
36 |              spec_lengths=None,
37 |              ref_inputs=None,
38 |              ref_lengths=None,
39 |              arousal_labels=None,
40 |              valence_labels=None,
41 |              atten_weights_ph=None,
42 |              training=None):
43 |
44 |         self.training = K.learning_phase() if training is None else training
45 |         self.batch_size = tf.shape(text_inputs)[0]
46 |
47 |         # trim the inputs to a length that is a multiple of r
48 |         if mel_inputs is not None and spec_inputs is not None:
49 |             mel_inputs, spec_inputs, spec_lengths = cf.trim_inputs(
50 |                 self.hp.outputs_per_step, mel_inputs, spec_inputs, spec_lengths)
51 |
52 |         # set reference to mel if it is not given
53 |         if ref_inputs is None:
54 |             ref_inputs = mel_inputs
55 |             ref_lengths = spec_lengths
56 |
57 |         # encoder
58 |         encoder_outputs = self.encoder_call(text_inputs)
59 |
60 |         # reference encoder
61 |         ref_outputs = None
62 |         if ref_inputs is not None:  # the training flag used to be omitted here, which caused NaN gradients in the BN parameters
63 |             ref_outputs = self.reference_encoder(ref_inputs, x_length=ref_lengths, training=training)
64 |         gst_outputs = self.gst_attention(ref_outputs, atten_weights_ph=atten_weights_ph, training=training)
65 |
66 |         self.ref_outputs = ref_outputs  # [N, ref_output_units]
67 |         self.gst_outputs =
gst_outputs # [N, 1, gst_units] 68 | self.gst_weights = self.gst_attention.atten_weights # [N, gst_heads, 1, gst_tokens] 69 | 70 | self.add_emotion_task(self.gst_weights) 71 | self.arousal_labels = arousal_labels 72 | self.valence_labels = valence_labels 73 | 74 | gst_outputs = tf.tile(gst_outputs, [1, tf.shape(encoder_outputs)[1], 1]) 75 | encoder_outputs = tf.concat([encoder_outputs, gst_outputs], axis=-1) 76 | 77 | # set values for attention layer and batch_timesteps for decoder_cell 78 | self.atten_layer.set_values_keys(values=encoder_outputs) 79 | self.decoder_cell.set_batch_timesteps(self.batch_size, tf.shape(encoder_outputs)[1]) 80 | 81 | # decoder 82 | outputs = self.decoder_call(mel_inputs, spec_inputs, spec_lengths) 83 | return outputs 84 | 85 | def add_emotion_task(self, gst_weights): 86 | units, emo_loss = self.hp.emo_output_units, self.hp.emo_loss 87 | units = 1 if emo_loss in ['mae', 'mse', 'sigmoid'] else units 88 | 89 | if self.hp.emo_used: 90 | arousal_weights, valence_weights = tf.split(gst_weights, 2, axis=1) 91 | weights_dim = np.prod(arousal_weights.shape[1:]) 92 | arousal_weights = tf.reshape(arousal_weights, [-1, weights_dim]) 93 | valence_weights = tf.reshape(valence_weights, [-1, weights_dim]) 94 | self.arousal_logits = layers.Dense(units, name='aro_dense')(arousal_weights) 95 | self.valence_logits = layers.Dense(units, name='val_dense')(valence_weights) 96 | else: 97 | self.arousal_logits = None 98 | self.valence_logits = None 99 | 100 | def add_loss(self): 101 | emo_loss = self.hp.emo_loss 102 | if emo_loss in ['mae', 'mse']: 103 | loss_fn = keras.losses.get(emo_loss) 104 | elif emo_loss == 'sigmoid': 105 | loss_fn = keras.losses.BinaryCrossentropy(from_logits=True) 106 | elif emo_loss == 'softmax': 107 | loss_fn = keras.losses.CategoricalCrossentropy(from_logits=True) 108 | else: 109 | raise ValueError(f'The emo_loss={emo_loss} is not valid') 110 | 111 | self.arousal_loss = tf.constant(0.0) 112 | self.valence_loss = tf.constant(0.0) 113 | if self.arousal_logits is not None: 114 | self.arousal_loss = loss_fn(self.arousal_labels, self.arousal_logits) 115 | self.valence_loss = loss_fn(self.valence_labels, self.valence_logits) 116 | self.emo_loss = self.arousal_loss + self.valence_loss 117 | 118 | super().add_loss() 119 | self.loss = self.loss + self.emo_loss 120 | 121 | def add_stats(self, name='stats'): 122 | with tf.name_scope(name): 123 | if self.hp.emo_used: 124 | aro_grads = self.optimizer.compute_gradients(self.arousal_loss) 125 | val_grads = self.optimizer.compute_gradients(self.valence_loss) 126 | self.aro_grad_norms = [tf.norm(g[0]) for g in aro_grads if g[0] is not None] 127 | self.val_grad_norms = [tf.norm(g[0]) for g in val_grads if g[0] is not None] 128 | self.aro_grad_norms_max = tf.reduce_max(self.aro_grad_norms) 129 | self.val_grad_norms_max = tf.reduce_max(self.val_grad_norms) 130 | 131 | tf.summary.scalar('aro_grad_norms_max', self.aro_grad_norms_max) 132 | tf.summary.scalar('val_grad_norms_max', self.val_grad_norms_max) 133 | tf.summary.histogram('aro_grad_norms', self.aro_grad_norms) 134 | tf.summary.histogram('val_grad_norms', self.val_grad_norms) 135 | else: 136 | self.aro_grad_norms = tf.constant(0) 137 | self.val_grad_norms = tf.constant(0) 138 | self.aro_grad_norms_max = tf.constant(0) 139 | self.val_grad_norms_max = tf.constant(0) 140 | 141 | tf.summary.scalar('aro_loss', self.arousal_loss) 142 | tf.summary.scalar('val_loss', self.valence_loss) 143 | super().add_stats(name='base_stats') 144 | return tf.summary.merge_all() 145 | 
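# Note: sygst_train.py is not part of this excerpt; Tacotron2SYGST is driven
# the same way Tacotron2EMOGST is in emogst_train.py, but with separate
# arousal/valence soft labels instead of emo_labels. A minimal sketch (the
# two feature keys are assumptions; everything else mirrors emogst_train.py):
#
#     model = Tacotron2SYGST(hp)
#     model(feats['inputs'],                        # feats from TFDataSet(hp, tfr_dir).get_train_next()
#           mel_inputs=feats['mel_targets'],
#           spec_inputs=feats['linear_targets'],
#           spec_lengths=feats['spec_lengths'],
#           ref_inputs=feats['mel_targets'],
#           ref_lengths=feats['spec_lengths'],
#           arousal_labels=feats['soft_aro_labels'],  # assumed key
#           valence_labels=feats['soft_val_labels'],  # assumed key
#           training=True)
#     model.add_loss()
#     model.add_optimizer(global_step)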
--------------------------------------------------------------------------------
/models/tacotron2.py:
--------------------------------------------------------------------------------
1 | # import tensorflow as tf
2 | from tensorflow import keras
3 | # from tensorflow.keras import layers
4 | # from tensorflow.keras import backend as K
5 |
6 | # from utils.debug import debug_print
7 | from modules import custom_layers as cl
8 | from modules.encoders import TacotronEncoder
9 | from modules.encoders import Tacotron2Encoder
10 | from modules.decoders import DecoderCell, Decoder
11 | from modules.attention import LocationSensitiveAttention
12 | from modules.attention import StepwiseMonotonicAttention
13 |
14 | from models.base import TacotronBase
15 |
16 |
17 | class Tacotron2(TacotronBase):
18 |     def __init__(self, hp, gta=False, name='Tacotron2'):
19 |         super(keras.Model, self).__init__(name=name)
20 |         self.hp = hp
21 |         self.gta = gta
22 |
23 |         if hp.encoder_type == 'taco2':  # tacotron2 encoder
24 |             self.encoder = Tacotron2Encoder(hp)
25 |         elif hp.encoder_type == 'taco':  # actually the cbhg encoder
26 |             self.encoder = TacotronEncoder(hp)
27 |         else:
28 |             raise ValueError('encoder_type must be in [taco2, taco]')
29 |
30 |         if hp.attention_type == 'location':  # best not to change the order in which these modules are defined
31 |             self.atten_layer = LocationSensitiveAttention(
32 |                 units=hp.attention_units,
33 |                 location_filters=hp.attention_filters,
34 |                 location_kernel_size=hp.attention_kernel_size,
35 |                 synthesis_constraint=hp.synthesis_constraint,
36 |                 synthesis_win_size=hp.synthesis_win_size,
37 |                 synthesis_softmax_temp=hp.synthesis_softmax_temp
38 |             )
39 |         elif hp.attention_type == 'sma':
40 |             self.atten_layer = StepwiseMonotonicAttention(
41 |                 units=hp.attention_units,
42 |                 normalize=hp.attention_sma_normalize,
43 |                 sigmoid_noise=hp.attention_sma_sigmoid_noise,
44 |                 sigmoid_noise_seed=hp.attention_sma_sigmoid_noise_seed,
45 |                 score_bias_init=hp.attention_sma_score_bias_init,
46 |                 mode=hp.attention_sma_mode
47 |             )
48 |         else:
49 |             raise ValueError('attention_type must be in [location, sma]')
50 |
51 |         self.decoder_prenet = cl.Prenet(units=hp.prenet_units, drop_rate=hp.dropout_rate)
52 |         self.atten_rnn_cell = cl.AttentionRNNCell(units=hp.attention_rnn_units,
53 |                                                   zone_rate=hp.zoneout_rate)
54 |         self.frame_projection = cl.FrameProjection(units=hp.outputs_per_step * hp.num_mels,
55 |                                                    activation=hp.frame_activation)
56 |         self.stop_projection = cl.StopProjection(units=hp.outputs_per_step)
57 |         self.decoder_cell = DecoderCell(prenet=self.decoder_prenet,
58 |                                         attention_rnn=self.atten_rnn_cell,
59 |                                         attention_layer=self.atten_layer,
60 |                                         frame_projection=self.frame_projection,
61 |                                         stop_projection=self.stop_projection)
62 |         self.decoder = Decoder(hp,
63 |                                self.decoder_cell,
64 |                                gta=gta,
65 |                                impute_finished=hp.impute_finished,
66 |                                maximum_steps=hp.max_iters)
67 |
68 |         self.postnet = cl.Postnet(num_layers=hp.postnet_cnns[0],
69 |                                   kernel_size=hp.postnet_cnns[1],
70 |                                   channels=hp.postnet_cnns[2],
71 |                                   drop_rate=hp.dropout_rate,
72 |                                   output_units=hp.num_mels,
73 |                                   output_activation=hp.frame_activation)
74 |
75 |         self.postcbhg = None
76 |         if hp.post_cbhg:
77 |             self.postcbhg = cl.CBHG(K=hp.cbhg_kernels,
78 |                                     conv_channels=hp.cbhg_conv_channels,
79 |                                     pool_size=hp.cbhg_pool_size,
80 |                                     projections=[hp.cbhg_projection, hp.num_mels],
81 |                                     highway_units=hp.cbhg_highway_units,
82 |                                     highway_nums=hp.cbhg_highway_nums,
83 |                                     rnn_units=hp.cbhg_rnn_units,
84 |                                     name='post_cbhg')
85 |
86 |         super(Tacotron2, self).__init__(hp=hp,
87 |                                         encoder=self.encoder,
88 |                                         decoder=self.decoder,
89 |                                         postnet=self.postnet,
90 |                                         postcbhg=self.postcbhg,
91 |                                         name=name)
--------------------------------------------------------------------------------
/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thuhcsi/icassp2021-emotion-tts/45bc25405e7c0f51f45727cc91cd41573c05bc65/modules/__init__.py
--------------------------------------------------------------------------------
/modules/custom_functions.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def trim_inputs(r, mel_inputs, spec_inputs, spec_lengths):
5 |     """Trim the inputs' lengths to the maximum multiple of r
6 |     """
7 |     r = tf.cast(r, tf.int32)
8 |     mel_inputs = tf.cast(mel_inputs, tf.float32)    # tf.cast returns the original tensor if its dtype already matches
9 |     spec_inputs = tf.cast(spec_inputs, tf.float32)  # for an np.ndarray it creates a tensor, and it can convert the dtype
10 |     spec_lengths = tf.cast(spec_lengths, tf.int32)  # tf.convert_to_tensor, by contrast, raises an error on dtype mismatch
11 |
12 |     max_len = tf.reduce_max(spec_lengths)
13 |     max_len = tf.cast(max_len / r, dtype=tf.int32) * r  # cast to int32 <=> floor
14 |
15 |     mel_inputs = mel_inputs[:, : max_len, :]
16 |     spec_inputs = spec_inputs[:, : max_len, :]
17 |     spec_lengths = tf.clip_by_value(spec_lengths, 0, max_len)
18 |     return mel_inputs, spec_inputs, spec_lengths
--------------------------------------------------------------------------------
/modules/custom_layers.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | # from tensorflow import keras
3 | from tensorflow.keras import layers
4 | from tensorflow.compat.v1.logging import warn
5 | from tensorflow.python.keras import backend as K
6 | from tensorflow.python.keras.utils import tf_utils
7 |
8 | # from warnings import warn
9 | # from utils.debug import debug_print
10 |
11 |
12 | class ConvBlock(layers.Layer):
13 |     def __init__(self,
14 |                  mode,
15 |                  conv_type,
16 |                  channels,
17 |                  kernel_size,
18 |                  dropout_rate=None,
19 |                  activation='relu',
20 |                  batch_norm=True,
21 |                  strides=1,
22 |                  padding='same',
23 |                  do_mask=False,
24 |                  name='conv_block'):
25 |         """stack conv(c), activation(a), batch norm(b), dropout(d) layers
26 |         # Arguments
27 |             mode: a str with chars in ['c', 'a', 'b', 'd'] denoting the order
28 |                 in which the corresponding layers are called
29 |             conv_type: '1D' '2D' '3D' for Conv1D, Conv2D, Conv3D
30 |         """
31 |         super(ConvBlock, self).__init__(name=name)
32 |         conv_map = {'1D': layers.Conv1D, '2D': layers.Conv2D, '3D': layers.Conv3D}
33 |         conv_layer = conv_map.get(conv_type)
34 |
35 |         self.conv = conv_layer(channels, kernel_size, strides, padding=padding, name='conv')
36 |         self.activation = layers.Activation(activation) if activation else None
37 |         self.bn = layers.BatchNormalization(name='bn') if batch_norm else None
38 |         self.dropout = layers.Dropout(dropout_rate, name='drop') if dropout_rate else None
39 |
40 |         self.do_mask = do_mask
41 |         self.block_mode = mode
42 |         self.supports_masking = True
43 |         self.conv.supports_masking = True
44 |
45 |     def call(self, x, training=None, mask=None):  # the mask argument makes this a mask-consuming layer
46 |         layer_map = {'c': self.conv, 'b': self.bn,
47 |                      'a': self.activation, 'd': self.dropout}
48 |
49 |         for c in self.block_mode:
50 |             layer = layer_map.get(c)
51 |             # assert layer is not None, 'Layer at conv block mode but not been built'
52 |             if layer is not None:
53 |                 if c in ['b', 'd']:
54 |                     x = layer(x, training=training)
55 |                 else:
56 |                     x = layer(x)
57 |             else:
58 |                 warn(RuntimeWarning(f'ConvBlock: {c} in mode str but layer not built'))
59 |
60 |         if self.do_mask and mask is not None:
61 |             mask = tf.cast(mask, 'float32')
62 |             x = x * tf.expand_dims(mask, axis=-1)  # [N, T_in, 1]
63 |         return x
64 |
65 |
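# Usage note for ConvBlock's mode string (illustrative): mode='cabd' applies
# conv -> activation -> batch norm -> dropout in that order. For example,
# Postnet below builds ConvBlock('cabd', '1D', 512, 5, 0.5, 'tanh') layers
# (conv1d + tanh + BN + dropout with the hparams values), while its last
# layer uses 'cbd' to skip the activation before the residual projection.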
66 | class ZoneoutLSTMCell(layers.LSTMCell):
67 |     def __init__(self,
68 |                  units,
69 |                  output_rate=0.,
70 |                  cell_rate=None,
71 |                  name='zolstm',
72 |                  **kwargs):
73 |         cell_rate = output_rate if cell_rate is None else cell_rate
74 |         assert output_rate >= 0 and output_rate <= 1
75 |         assert cell_rate >= 0 and cell_rate <= 1
76 |
77 |         super(ZoneoutLSTMCell, self).__init__(units, name=name, **kwargs)
78 |         self._cell_dropout = layers.Dropout(cell_rate, name='drop_cell')
79 |         self._output_dropout = layers.Dropout(output_rate, name='drop_output')
80 |         self._output_rate = output_rate
81 |         self._cell_rate = cell_rate
82 |
83 |     def call(self, x, state, training=None):
84 |         '''Runs the vanilla LSTM cell and applies zoneout.
85 |         '''
86 |         # print('DEBUG custom_layers: training', training)  # RNN() calls cell.call twice
87 |         output, new_state = super().call(x, state, training)  # <=> LSTMCell.call(self, x, state)
88 |         pre_c, pre_h = state
89 |         new_c, new_h = new_state
90 |
91 |         c = (1 - self._cell_rate) * self._cell_dropout(new_c - pre_c, training) + pre_c
92 |         h = (1 - self._output_rate) * self._output_dropout(new_h - pre_h, training) + pre_h
93 |
94 |         new_state = [c, h]
95 |         return output, new_state
96 |
97 |     def get_config(self):
98 |         config = {'output_rate': self._output_rate,
99 |                   'cell_rate': self._cell_rate}
100 |         super_config = super(ZoneoutLSTMCell, self).get_config()
101 |         config.update(super_config)
102 |         return config
103 |
104 |
105 | class AttentionRNNCell(layers.StackedRNNCells):
106 |     def __init__(self,
107 |                  units=[1024, 1024],
108 |                  zone_rate=0.1,
109 |                  name='atten_cell'):
110 |         self.units = units
111 |         self.zone_rate = zone_rate
112 |         self.lstm_cells = [ZoneoutLSTMCell(k, zone_rate) for k in units]
113 |         super(AttentionRNNCell, self).__init__(cells=self.lstm_cells, name=name)
114 |
115 |     def get_initial_state(self, inputs=None, batch_size=None, dtype=tf.float32):
116 |         return super().get_initial_state(inputs, batch_size, dtype=dtype)
117 |
118 |
119 | class Prenet(layers.Layer):
120 |     def __init__(self,
121 |                  units,
122 |                  drop_rate=0.5,
123 |                  activation='relu',
124 |                  name='prenet'):
125 |         super(Prenet, self).__init__(name=name)
126 |         self.units = units
127 |         self.drop_rate = drop_rate
128 |         self.activation = activation
129 |         self.dense_layers = [layers.Dense(k, activation, name='dense') for k in units]
130 |         self.drop_layers = [layers.Dropout(drop_rate, name='drop') for k in range(len(units))]
131 |
132 |     def call(self, x):
133 |         for dense, drop in zip(self.dense_layers, self.drop_layers):
134 |             x = dense(x)
135 |             x = drop(x, training=True)  # dropout stays on even at inference, as in the Tacotron 2 prenet
136 |         return x
137 |
138 |
139 | class StopProjection(layers.Layer):
140 |     """Projection to a scalar and through a sigmoid activation
141 |     """
142 |     def __init__(self,
143 |                  units=1,
144 |                  activation='sigmoid',
145 |                  name='stop_projection'):
146 |         super(StopProjection, self).__init__(name=name)
147 |         self.units = units
148 |         self.activation = activation
149 |         self.dense = layers.Dense(units, None, name='dense')
150 |         self.activation_layer = layers.Activation(activation)
151 |
152 |     def call(self, x, training=None):
153 |         if training is None:
154 |             training = K.learning_phase()
155 |         x = self.dense(x)
156 |         x = tf_utils.smart_cond(training,
157 |                                 true_fn=lambda: tf.identity(x),
158 |                                 false_fn=lambda: self.activation_layer(x))
159
| return x 160 | 161 | 162 | class FrameProjection(layers.Layer): 163 | """Projection layer to r * num_mels dimensions or num_mels dimensions 164 | """ 165 | def __init__(self, 166 | units=80, 167 | activation='relu', # our mel is normalized to [0, 1]; sygst is None 168 | # activation=None, # sygst and rayhame are None 169 | name='frame_projection'): 170 | super(FrameProjection, self).__init__(name=name) 171 | self.units = units 172 | self.activation = activation 173 | self.dense = layers.Dense(units, activation, name='dense') 174 | 175 | def call(self, x): 176 | x = self.dense(x) 177 | return x 178 | 179 | 180 | class Postnet(layers.Layer): 181 | def __init__(self, 182 | num_layers, 183 | channels, 184 | kernel_size, 185 | drop_rate, 186 | output_units, 187 | output_activation, 188 | name='postnet'): 189 | """ 190 | # Arguments 191 | output_units: it is usually the num_mels for mel residual connection 192 | """ 193 | super(Postnet, self).__init__(name=name) 194 | self.cnns = [ConvBlock('cabd', '1D', channels, kernel_size, drop_rate, 'tanh') 195 | for i in range(num_layers - 1)] 196 | self.last_cnn = ConvBlock('cbd', '1D', channels, kernel_size, drop_rate, None) 197 | self.dim_projection = FrameProjection(output_units, output_activation, name='postnet_proj') 198 | 199 | def call(self, x, training=None): 200 | for cnn in self.cnns: 201 | x = cnn(x, training) 202 | x = self.last_cnn(x, training) 203 | x = self.dim_projection(x) 204 | return x 205 | 206 | 207 | class HighwayNet(layers.Layer): 208 | def __init__(self, units, name='highway_net'): 209 | super(HighwayNet, self).__init__(name=name) 210 | self.units = units 211 | self.h_layer = layers.Dense(units, 'relu', name='H') 212 | self.t_layer = layers.Dense(units, 'sigmoid', name='T', 213 | bias_initializer=tf.constant_initializer(-1.)) 214 | 215 | self.supports_masking = True 216 | 217 | def call(self, x): 218 | h = self.h_layer(x) 219 | t = self.t_layer(x) 220 | x = h * t + x * (1. 
222 |
223 |
224 | class CBHG(layers.Layer):
225 | def __init__(self,
226 | K,
227 | conv_channels,
228 | pool_size,
229 | projections,
230 | highway_units,
231 | highway_nums,
232 | rnn_units,
233 | name='cbhg'):
234 | super(CBHG, self).__init__(name=name)
235 | self.K = K
236 | self.conv_channels = conv_channels
237 | self.pool_size = pool_size
238 | self.projections = projections
239 | self.highway_units = highway_units
240 | self.highway_nums = highway_nums
241 | self.rnn_units = rnn_units
242 |
243 | self.conv_banks = [ConvBlock('cab', '1D', conv_channels, k, name='conv_banks')
244 | for k in range(1, K + 1)]
245 | self.pool = layers.MaxPool1D(pool_size, strides=1, padding='same', name='pool')
246 | acts = ['relu'] * (len(projections) - 1) + [None]
247 | self.conv_projections = [ConvBlock('cab', '1D', c, 3, None, a, name='conv_projs')
248 | for c, a in zip(projections, acts)]
249 | self.highway_nets = [HighwayNet(highway_units, name='highway')
250 | for _ in range(highway_nums)]
251 | self.gru_layer = layers.Bidirectional(layers.GRU(rnn_units, return_sequences=True),
252 | name='bigru')
253 | self.dim_dense = layers.Dense(highway_units, name='dim_dense') # built once here; creating it inside call would make fresh, untracked weights on every invocation
254 | self.supports_masking = True
255 |
256 | def call(self, x, training=None):
257 | original_x = x
258 | # K conv banks: concat on the last axis to stack channels from all convs
259 | x = tf.concat([cnn(x, training=training) for cnn in self.conv_banks], axis=-1)
260 |
261 | # MaxPooling
262 | x = self.pool(x)
263 |
264 | # 2-layer conv projections
265 | for conv_proj in self.conv_projections:
266 | x = conv_proj(x, training)
267 |
268 | # Residual connection
269 | x = original_x + x
270 |
271 | # 4-layer HighwayNet
272 | if x.shape[2] != self.highway_units:
273 | x = self.dim_dense(x)
274 | for highway in self.highway_nets:
275 | x = highway(x)
276 |
277 | # 1-layer bidirectional GRU
278 | x = self.gru_layer(x, training=training)
279 | return x
280 |
281 |
282 | class ReferenceEncoder(layers.Layer):
283 | def __init__(self,
284 | channels,
285 | kernel_size=(3, 3),
286 | strides=(2, 2),
287 | rnn_units=128,
288 | output_units=128,
289 | dropout_rate=0.5,
290 | name='ref_encoder'):
291 | super(ReferenceEncoder, self).__init__(name=name)
292 | self.stride = strides[0]
293 | self.rnn_units = rnn_units
294 | self.output_units = output_units
295 |
296 | # 6-layer conv2d
297 | self.cnns = [ConvBlock('cabd', '2D', c, kernel_size, dropout_rate, strides=strides)
298 | for c in channels]
299 |
300 | # 1-layer bi-gru
301 | single_layer = layers.GRU(rnn_units, return_state=False)
302 | self.rnn = layers.Bidirectional(single_layer, name='bigru')
303 |
304 | # 1-layer dense
305 | self.dense = layers.Dense(output_units, 'tanh', name='output_dense')
306 |
307 | def call(self, x, x_length, training=None):
308 | if x.shape.ndims == 3:
309 | x = tf.expand_dims(x, axis=-1)
310 |
311 | # 6-layer conv2d
312 | for cnn in self.cnns:
313 | x = cnn(x, training)
314 | x_length = (x_length + self.stride - 1) // self.stride
315 |
316 | # fold the remaining mel bins into the conv channels
317 | x = tf.concat(tf.unstack(x, axis=2), axis=-1) # [N, time, mel_dim * channels]
318 |
319 | # 1-layer bigru with mask
320 | rnn_mask = tf.sequence_mask(x_length, maxlen=tf.shape(x)[1])
321 | x._keras_mask = rnn_mask # attach the mask directly so the bi-GRU skips padded frames
322 | x = self.rnn(x, training=training) # [N, rnn_units * 2]
323 |
324 | # 1-layer dense for final output
325 | x = self.dense(x) # [N, output_units]
326 | return x
327 |
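For orientation, here is the shape bookkeeping ReferenceEncoder performs, as a standalone sketch (the channel list matches the reference_channels used elsewhere in this repo; the 400-frame input is an arbitrary example): each stride-2 conv ceil-divides both the time and mel axes, and the surviving mel bins are folded into the final channel count before the bi-GRU.

num_mels, channels, stride = 80, [32, 32, 64, 64, 128, 128], 2
T, mel = 400, num_mels                 # e.g. a 400-frame reference mel
for c in channels:
    T = (T + stride - 1) // stride     # mirrors the x_length update above
    mel = (mel + stride - 1) // stride
print(T, mel * channels[-1])           # 7 time steps, 2 * 128 = 256 features per step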
-------------------------------------------------------------------------------- /modules/decoders.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | # from tensorflow import keras
3 | from tensorflow.keras import layers
4 |
5 | from tensorflow.python import rnn
6 | from tensorflow.python.util import nest
7 | from tensorflow.python.keras.utils import tf_utils
8 |
9 | from collections import namedtuple
10 |
11 | # from utils.debug import debug_print
12 |
13 | """
14 | # This class-based state does not support tf.convert_to_tensor()
15 | class DecoderCellState:
16 | def __init__(self,
17 | attention_rnn_state,
18 | attention_context,
19 | attention_alignments,
20 | attention_accum_alignments,
21 | decode_rnn_state):
22 | self.attention_rnn_state = attention_rnn_state
23 | self.attention_context = attention_context
24 | self.attention_alignments = attention_alignments
25 | self.attention_accum_alignments = attention_accum_alignments
26 | self.decode_rnn_state = decode_rnn_state
27 | """
28 |
29 | DecoderCellState = namedtuple('DecoderCellState',
30 | ['attention_state',
31 | 'attention_rnn_state',
32 | 'decode_rnn_state'])
33 |
34 |
35 | class DecoderCell(layers.Layer):
36 | def __init__(self,
37 | prenet,
38 | attention_rnn,
39 | attention_layer,
40 | frame_projection,
41 | stop_projection,
42 | decode_rnn=None,
43 | text_input_shape=None,
44 | name='decoder_cell'):
45 | """
46 | # Arguments
47 | text_input_shape: the shape of encoder outputs; must be obtained via tf.shape
48 | """
49 | super(DecoderCell, self).__init__(name=name)
50 | self.prenet = prenet
51 | self.attention_rnn = attention_rnn
52 | self.attention_layer = attention_layer
53 | self.frame_projection = frame_projection
54 | self.stop_projection = stop_projection
55 | self.decode_rnn = decode_rnn
56 |
57 | self._batch_size = None
58 | self._text_time_steps = None
59 |
60 | def call(self, x, state, training=None):
61 | # prenet
62 | prenet_output = self.prenet(x)
63 | atten_rnn_input = tf.concat([prenet_output,
64 | state.attention_state.context],
65 | axis=-1, name='prenet_concat')
66 | # attention rnn
67 | atten_rnn_output, atten_rnn_state = self.attention_rnn(atten_rnn_input,
68 | state.attention_rnn_state,
69 | training=training)
70 | # nvidia-torch -> drop(atten_rnn_output, 0.1)
71 |
72 | # attention computation
73 | atten_context, atten_state = self.attention_layer(atten_rnn_output,
74 | state.attention_state,
75 | training=training)
76 | atten_rnn_context_cat = tf.concat([atten_rnn_output, atten_context],
77 | axis=-1, name='atten_rnn_concat')
78 | if self.decode_rnn:
79 | decode_rnn_output, decode_rnn_state = self.decode_rnn(atten_rnn_context_cat,
80 | state.decode_rnn_state,
81 | training=training)
82 | # nvidia-torch -> drop(decode_rnn_output, 0.1)
83 | projection_input = tf.concat([decode_rnn_output, atten_context],
84 | axis=-1, name='decode_rnn_concat')
85 | else:
86 | decode_rnn_state = tf.zeros(tf.shape(x)[0]) # dummy state to keep the nest structure
87 | projection_input = atten_rnn_context_cat
88 |
89 | cell_outputs = self.frame_projection(projection_input)
90 | stop_tokens = self.stop_projection(projection_input)
91 |
92 | next_state = DecoderCellState(atten_state,
93 | atten_rnn_state,
94 | decode_rnn_state)
95 | return (cell_outputs, stop_tokens, atten_state.alignments), next_state
96 |
97 | def set_batch_timesteps(self, batch_size, text_time_steps):
98 | self.batch_size = batch_size
99 | self.text_time_steps = text_time_steps
100 |
101 | def get_init_state(self):
102 | assert self.batch_size is not None, 'batch_size is None, cannot get initial state'
103 |
104 | attention_state = self.attention_layer.init_state
105 | attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=self.batch_size)
106 | if self.decode_rnn:
107 | decode_rnn_state = self.decode_rnn.get_initial_state(batch_size=self.batch_size)
108 | else:
109 | decode_rnn_state = tf.zeros(self.batch_size) # must not be None: nest structure checks need a tensor
110 | return DecoderCellState(attention_state,
111 | attention_rnn_state,
112 | decode_rnn_state)
113 |
114 | @property
115 | def output_size(self):
116 | return (self.frame_projection.units,
117 | self.stop_projection.units,
118 | self.text_time_steps)
119 |
120 | @property
121 | def output_dtype(self):
122 | return (tf.float32, tf.float32, tf.float32)
123 |
124 | @property
125 | def batch_size(self):
126 | return self._batch_size
127 |
128 | @batch_size.setter
129 | def batch_size(self, batch_size):
130 | assert isinstance(batch_size, (tf.Tensor, int))
131 | self._batch_size = batch_size
132 |
133 | @property
134 | def text_time_steps(self):
135 | return self._text_time_steps
136 |
137 | @text_time_steps.setter
138 | def text_time_steps(self, text_time_steps):
139 | assert isinstance(text_time_steps, tf.Tensor)
140 | self._text_time_steps = text_time_steps
141 |
142 |
143 | class Decoder(layers.Layer):
144 | def __init__(self,
145 | hp,
146 | decoder_cell,
147 | gta=False, # ground truth alignment
148 | impute_finished=False,
149 | maximum_steps=2000,
150 | parallel_iterations=32,
151 | swap_memory=False,
152 | name='decoder'):
153 | super(Decoder, self).__init__(name=name)
154 | self.hp = hp
155 | self.cell = decoder_cell
156 | self.gta = gta
157 | self.impute_finished = impute_finished
158 | self.maximum_steps = maximum_steps
159 | self.parallel_iterations = parallel_iterations
160 | self.swap_memory = swap_memory
161 |
162 | self.r = hp.outputs_per_step
163 | self.feed_last_frame = hp.feed_last_frame
164 | self.input_dim = hp.num_mels * (1 if hp.feed_last_frame else self.r)
165 |
166 | def set_inputs(self, inputs, inputs_lengths):
167 | self._inputs, self._inputs_lengths = None, None
168 | if inputs is not None and inputs_lengths is not None:
169 | batch = tf.shape(inputs)[0]
170 | self._inputs_lengths = tf.cast(inputs_lengths / self.r, dtype=tf.int32)
171 | if self.feed_last_frame:
172 | self._inputs = inputs[:, self.r - 1::self.r, :]
173 | else:
174 | self._inputs = tf.reshape(inputs, [batch, -1, self.input_dim])
175 |
176 | def get_init_values(self):
177 | with tf.name_scope('while_loop_init_values'):
178 | init_time = tf.constant(0, dtype=tf.int32)
179 | init_state = self.cell.get_init_state()
180 | init_inputs = tf.zeros(shape=(self.batch_size, self.input_dim))
181 | init_finished = tf.tile([False], [self.batch_size])
182 | init_seq_lengths = tf.zeros(shape=self.batch_size, dtype=tf.int32)
183 |
184 | def _create_tensor_array(s, d):
185 | return tf.TensorArray(size=0, dtype=d,
186 | dynamic_size=True,
187 | # element_shape=(self.batch_size, s))
188 | element_shape=None)
189 | init_outputs_ta = nest.map_structure(_create_tensor_array,
190 | self.cell.output_size,
191 | self.cell.output_dtype)
192 |
193 | return init_time, init_outputs_ta, init_state, init_inputs, init_finished, init_seq_lengths
194 |
195 | def _next_inputs(self, time, outputs):
196 | mel_output, stop_token = outputs[:2] # [N, mel_num * r], [N, r]
197 |
198 | def _true_fn(): # training=True or gta=True
199 | next_inputs = self._inputs[:, time, :]
200 | finished = (time + 1 >= self._inputs_lengths) # whether the next time step is finished
201 | # finished = (time + 1 >= tf.shape(self._inputs)[1]) # guards against the batch max length being smaller than time_step
202 | # that version failed at graph-build time with a shape mismatch across loop iterations, probably because of tf.shape(self._inputs)[1]
203 | # the raw inputs were later trimmed to a multiple of r, so time >= self._inputs_lengths can be used directly
204 | return next_inputs, finished
205 |
206 | def _false_fn():
207 | next_inputs = mel_output[:, -self.input_dim:]
208 | finished = tf.cast(tf.round(stop_token), tf.bool) # >0.5->1, <=0.5->0
209 | finished = tf.reduce_any(finished, axis=1) # maximum_iterations is set at tf.while_loop
210 | return next_inputs, finished
211 | assert not (self.gta and self._inputs is None)
212 | pred = tf.logical_or(self.training, self.gta)
213 | return tf_utils.smart_cond(pred, _true_fn, _false_fn)
214 |
215 | def step(self, time, inputs, state):
216 | """
217 | # Arguments
218 | time: current time step
219 | inputs: current time step inputs
220 | state: previous time step state
221 | # Returns
222 | outputs: current time step outputs
223 | next_state: current time step state
224 | next_inputs: next time step inputs
225 | next_finished: whether the next time step is finished
226 | # ps: when at the i-th time step
227 | inputs[i] = targets[i - 1]
228 | param state = state[i - 1]
229 | outputs[i] = targets[i] = inputs[i + 1]
230 | next_state = state[i] = param state[i + 1]
231 | next_finished = finished[i + 1]
232 | """
233 | outputs, next_state = self.cell(inputs, state, self.training)
234 | next_inputs, next_finished = self._next_inputs(time, outputs)
235 | return outputs, next_state, next_inputs, next_finished
236 |
237 | def call(self, inputs=None, inputs_lengths=None, batch_size=None, training=None):
238 | """
239 | # Arguments
240 | inputs: mel spectrum inputs, with shape [N, frame_nums, num_mels]
241 | it can be None in the inference phase
242 | inputs_lengths: mel spectrum lengths
243 | batch_size: used in the inference phase, where inputs is None
244 | """
245 | assert inputs is not None or batch_size is not None
246 |
247 | self.training = training
248 | self.batch_size = batch_size if inputs is None else tf.shape(inputs)[0]
249 | self.set_inputs(inputs, inputs_lengths)
250 | init_values = self.get_init_values()
251 | zero_outputs = nest.map_structure(lambda s, d: tf.zeros((self.batch_size, s), d),
252 | self.cell.output_size,
253 | self.cell.output_dtype)
254 |
255 | def condition(unused_time, unused_outputs_ta, unused_state,
256 | unused_inputs, finished, unused_seq_lengths):
257 | return tf.logical_not(tf.reduce_all(finished))
258 |
259 | def body(time, outputs_ta, state, inputs, finished, seq_lengths):
260 | """
261 | # Arguments
262 | time: current time step
263 | outputs_ta: the tensor array outputs, for collecting outputs at each time step
264 | inputs: current time step inputs
265 | finished: whether the current time step is marked as finished
266 | seq_lengths: the actual lengths of each sample in the batch (only used at inference)
267 |
268 | # Returns
269 | final_outputs: a sequence of outputs from the decoder cell, i.e., (mel, stop, alignments)
270 | final_state: the last state from the decoder cell
271 | final_seq_lengths: the actual lengths of the output sequence (used at inference)
272 | """
273 | (outputs, next_state, next_inputs, next_finished) = self.step(time, inputs, state)
274 | next_finished = tf.logical_or(finished, next_finished)
275 | next_seq_lengths = seq_lengths + tf.cast(tf.logical_not(finished), dtype=tf.int32)
276 |
277 | nest.assert_same_structure(state, next_state)
278 | nest.assert_same_structure(outputs_ta, outputs)
279 | nest.assert_same_structure(inputs, next_inputs)
280 | # note: the following two lines use 'finished' instead of 'next_finished'
281 | # the output at time steps after finished will be zero
282 | if self.impute_finished: # output zero and copy the state
283 | emit = nest.map_structure(lambda zero, out: tf.where(finished, zero, out),
284 | zero_outputs, outputs)
285 | # the state at time steps after finished is copied from the last state
286 | next_state = nest.map_structure(lambda cur, new: tf.where(finished, cur, new),
287 | state, next_state)
288 | else:
289 | emit = outputs
290 | outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out),
291 | outputs_ta, emit)
292 | return (time + 1, outputs_ta, next_state, next_inputs, next_finished, next_seq_lengths)
293 |
294 | res = tf.while_loop(condition, body, loop_vars=init_values,
295 | parallel_iterations=self.parallel_iterations,
296 | maximum_iterations=self.maximum_steps,
297 | swap_memory=self.swap_memory)
298 | final_outputs_ta = res[1]
299 | final_state = res[2]
300 | final_seq_lengths = res[5]
301 |
302 | final_outputs = nest.map_structure(lambda ta: ta.stack(),
303 | final_outputs_ta)
304 | final_outputs = nest.map_structure(rnn._transpose_batch_time,
305 | final_outputs)
306 |
307 | return final_outputs, final_state, final_seq_lengths
308 |
-------------------------------------------------------------------------------- /modules/encoders.py: --------------------------------------------------------------------------------
1 | # import tensorflow as tf
2 | # from tensorflow import keras
3 | from tensorflow.keras import layers
4 |
5 |
6 | # from taco2_hparams import hp
7 | from modules import custom_layers as cs
8 |
9 |
10 | class Tacotron2Encoder(layers.Layer):
11 | def __init__(self, hp, name='taco2_encoder'):
12 | super(Tacotron2Encoder, self).__init__(name=name)
13 | # embedding layer
14 | self.embed_layer = layers.Embedding(hp.num_symbols, hp.embedding_dim, mask_zero=True)
15 |
16 | # 3-layer conv1d
17 | cnns_num, ksize, channels = hp.encoder_cnns
18 | self.cnns = [cs.ConvBlock('cabd', '1D', channels, ksize, hp.dropout_rate)
19 | for _ in range(cnns_num)]
20 |
21 | # 1-layer bi-lstm
22 | units, zo_rate = hp.encoder_rnns_units, hp.zoneout_rate
23 | single_layer = layers.RNN(cs.ZoneoutLSTMCell(units, zo_rate),
24 | return_sequences=True)
25 | # with mask, outputs zero for time steps where the mask is 0
26 | self.rnn = layers.Bidirectional(single_layer, name='bilstm')
27 |
28 | def call(self, x, training=None):
29 | x = self.embed_output = self.embed_layer(x) # x carries a Keras mask, i.e. x._keras_mask is not None
30 | for cnn in self.cnns:
31 | x = cnn(x, training=training) # has BN and dropout, so training must be passed
32 | x = self.rnn(x, training=training) # the bi-RNN requires keyword arguments here
33 | return x
34 |
35 |
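A usage sketch for the encoder above (the hparams values mirror sygst_hparams.py, but constructing a reduced HParams like this and the eager-style call are assumptions; treat it as illustrative, not repo code): padded id sequences go in, [N, T_in, 2 * encoder_rnns_units] features come out, and mask_zero=True lets downstream attention ignore the padding.

import tensorflow as tf
from utils.parameter import HParams
from modules.encoders import Tacotron2Encoder

hp = HParams(num_symbols=150, embedding_dim=512, dropout_rate=0.5,
             encoder_cnns=[3, 5, 512], encoder_rnns_units=256, zoneout_rate=0.1)
encoder = Tacotron2Encoder(hp)
text_ids = tf.ones([2, 40], dtype=tf.int32)   # a padded batch of symbol ids
outputs = encoder(text_ids, training=False)   # [2, 40, 512]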
36 | class TacotronEncoder(layers.Layer):
37 | def __init__(self, hp, name='taco_encoder'):
38 | super(TacotronEncoder, self).__init__(name=name)
39 |
40 | # embedding layer
41 | self.embed_layer = layers.Embedding(hp.num_symbols, hp.embedding_dim, mask_zero=True)
42 |
43 | # 2-dense-layer prenet
44 | # self.prenet = cs.Prenet(units=[256, 128], name='prenet')
45 | self.prenet = cs.Prenet(units=[256, 256], name='prenet')
46 |
47 | # cbhg block
48 | """
49 | self.cbhg = cs.CBHG(K=16,
50 | conv_channels=128,
51 | pool_size=2,
52 | projections=[128, 128],
53 | highway_units=128,
54 | highway_nums=4,
55 | rnn_units=128, # vanilla Tacotron uses 128
56 | name='cbhg')
57 | """
58 | self.cbhg = cs.CBHG(K=16,
59 | conv_channels=256,
60 | pool_size=2,
61 | projections=[256, 256],
62 | highway_units=256,
63 | highway_nums=4,
64 | rnn_units=256, # vanilla Tacotron uses 128
65 | name='cbhg')
66 |
67 | def call(self, x, training=None):
68 | x = self.embed_output = self.embed_layer(x)
69 | x = self.prenet(x)
70 | x = self.cbhg(x, training=training)
71 | return x
72 |
-------------------------------------------------------------------------------- /modules/losses.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def get_mel_loss(targets, outputs, spec_len=None, method='mse'):
5 | assert method in ['mse', 'mae'], f'loss method: {method} is not valid'
6 |
7 | norm_func = tf.square if method == 'mse' else tf.abs
8 | norm = norm_func(targets - outputs)
9 |
10 | if spec_len is not None: # mask loss
11 | time_step = tf.shape(outputs)[1]
12 | mask = tf.cast(tf.sequence_mask(spec_len, time_step), tf.float32)
13 | sum_n = tf.reduce_sum(mask) * tf.cast(tf.shape(outputs)[-1], tf.float32)
14 | loss = norm * tf.expand_dims(mask, axis=-1)
15 | else:
16 | sum_n = tf.cast(tf.reduce_prod(tf.shape(outputs)), tf.float32)
17 | loss = norm
18 |
19 | name = 'mel_{}{}_loss'.format('' if spec_len is None else 'mask_', method)
20 | loss = tf.truediv(tf.reduce_sum(loss), sum_n, name=name)
21 | return loss
22 |
23 |
24 | def get_spec_loss(targets, outputs, priority_freq_n, spec_len=None, method='mae'):
25 | assert method in ['mse', 'mae'], f'loss method: {method} is not valid'
26 |
27 | norm_func = tf.square if method == 'mse' else tf.abs
28 | norm = norm_func(targets - outputs)
29 | priority_freq_n = tf.cast(priority_freq_n, tf.int32)
30 |
31 | if spec_len is not None: # mask loss
32 | time_step = tf.shape(outputs)[1]
33 | mask = tf.cast(tf.sequence_mask(spec_len, time_step), tf.float32)
34 | sum_n = tf.reduce_sum(mask) * tf.cast(tf.shape(outputs)[-1], tf.float32)
35 | sum_m = tf.reduce_sum(mask) * tf.cast(priority_freq_n, tf.float32)
36 | loss = norm * tf.expand_dims(mask, axis=-1)
37 | else:
38 | sum_n = tf.cast(tf.reduce_prod(tf.shape(outputs)), tf.float32)
39 | sum_m = tf.cast(tf.reduce_prod(tf.shape(outputs)[: 2]) * priority_freq_n, tf.float32)
40 | loss = norm
41 |
42 | name = 'spec_{}{}_loss'.format('' if spec_len is None else 'mask_', method)
43 | # Prioritize loss for frequencies under 2000 Hz.
44 | loss_low = loss[:, :, 0: priority_freq_n]
45 | loss = [tf.reduce_sum(loss) / sum_n, tf.reduce_sum(loss_low) / sum_m]
46 | loss = tf.tensordot(loss, [0.5, 0.5], axes=1, name=name)
47 | return loss
48 |
49 |
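To make the masking convention above concrete, a hedged toy example (the values in the comment assume eager evaluation; in this repo's graph mode you would session.run the tensor): padded frames are zeroed out of the numerator and the divisor counts only real frames, so padding cannot dilute the loss.

import tensorflow as tf
from modules.losses import get_mel_loss

targets = tf.ones([2, 6, 80])    # two clips, padded to 6 frames
outputs = tf.zeros([2, 6, 80])
loss = get_mel_loss(targets, outputs, spec_len=[4, 6], method='mse')
# numerator: (4 + 6) real frames * 80 bins * 1.0; divisor: 10 * 80, so loss == 1.0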
50 | def get_stop_loss(targets, outputs, outputs_per_step=None, spec_len=None, do_mask=False, pos_weight=1.):
51 | time_step = tf.shape(outputs)[1]
52 | if targets is None:
53 | assert spec_len is not None, 'stop token targets and spec_len cannot both be None'
54 | pre_zero_len = spec_len - outputs_per_step
55 | pre_zero_mask = tf.cast(tf.sequence_mask(pre_zero_len, time_step), tf.float32)
56 | targets = tf.ones_like(outputs) - pre_zero_mask
57 |
58 | loss = tf.nn.weighted_cross_entropy_with_logits(labels=targets, logits=outputs, pos_weight=pos_weight)
59 |
60 | if do_mask:
61 | assert spec_len is not None, 'do_mask=True requires spec_len is not None'
62 | mask = tf.cast(tf.sequence_mask(spec_len, time_step), tf.float32)
63 | sum_n = tf.reduce_sum(mask)
64 | loss = loss * mask
65 | else:
66 | sum_n = tf.cast(tf.reduce_prod(tf.shape(outputs)), tf.float32)
67 |
68 | name = 'stop_{}loss'.format('mask_' if do_mask else '')
69 | loss = tf.truediv(tf.reduce_sum(loss), sum_n, name=name)
70 | return loss
71 |
-------------------------------------------------------------------------------- /predict_attention.py: --------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import argparse
4 | import numpy as np
5 |
6 | import tensorflow as tf
7 | from tensorflow.keras.layers import Input
8 |
9 | from text import text_to_sequence
10 |
11 | from sygst_hparams import hp as sygst_hp
12 | from embjoint_hparams import hp as embgst_joint_hp
13 | from ser.hparams import hp as ser_hp
14 | from emogst_hparams import hp as emogst_hp
15 | from models.sygst_tacotron2 import Tacotron2SYGST
16 | from models.embgst_tacotron2_joint import Tacotron2EMBGSTJoint
17 | from models.emogst_tacotron2 import Tacotron2EMOGST
18 |
19 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
20 |
21 | map_model = {'sygst': Tacotron2SYGST, 'embgst_joint': Tacotron2EMBGSTJoint, 'emogst': Tacotron2EMOGST}
22 | map_hp = {'sygst': sygst_hp, 'embgst_joint': embgst_joint_hp, 'emogst': emogst_hp}
23 |
24 |
25 | class AttentionPredictor:
26 | def __init__(self, model_name='sygst'):
27 | assert model_name in ['sygst', 'embgst', 'embgst_joint', 'emogst']
28 |
29 | self.hp = map_hp[model_name]
30 | self.model = map_model[model_name](self.hp) if model_name != 'embgst' else map_model[model_name](self.hp, ser_hp)
31 | self.model_name = model_name
32 | self.cleaner_names = [x.strip() for x in self.hp.cleaners.split(',')]
33 |
34 | # build model
35 | with tf.name_scope('model'): # must match the scope used at training time
36 | d = self.hp.emotion_embedding_units
37 | self.text_inputs = Input([None], dtype=tf.int32, name='text_inputs')
38 | self.mel_inputs = Input([None, self.hp.num_mels], dtype=tf.float32, name='mel_inputs')
39 | self.spec_lengths = Input([], dtype=tf.int32, name='spec_lengths')
40 | self.aro_embed = Input([d], dtype=tf.float32, name='aro_embed')
41 | self.val_embed = Input([d], dtype=tf.float32, name='val_embed')
42 |
43 | call_fn_kwargs = {'mel_inputs': self.mel_inputs,
44 | 'spec_lengths': self.spec_lengths,
45 | 'training': False}
46 | if model_name == 'embgst':
47 | call_fn_kwargs.update(arousal_embedding=self.aro_embed,
48 | valence_embedding=self.val_embed)
49 | self.model(self.text_inputs, **call_fn_kwargs)
50 |
51 | def load(self, ckpt_path):
52 | print('Loading checkpoint: %s' % ckpt_path)
53 | self.eval_step = re.search(r'ckpt-(\d+)', ckpt_path).group(1)
54 | self.session = tf.Session()
55 | saver = tf.train.Saver()
56 | saver.restore(self.session, ckpt_path)
57 |
58 | def predict(self, mel_inputs=None, spec_lengths=None, aro_embed=None, val_embed=None):
59 | seq = text_to_sequence('hello', self.cleaner_names)
60 | feed_dict = {self.text_inputs: [np.asarray(seq, dtype=np.int32)]}
61 | if mel_inputs is not None:
62 | assert spec_lengths is not None
63 | mel_inputs = np.expand_dims(mel_inputs, 0).astype(np.float32)
64 | spec_lengths = np.expand_dims(spec_lengths, 0).astype(np.int32)
65 | feed_dict.update({self.mel_inputs: mel_inputs, self.spec_lengths: spec_lengths})
66 | if self.model_name in ['sygst', 'emogst']:
67 | attention_outputs = self.model.gst_weights
68 | elif self.model_name == 'embgst_joint':
69 | attention_outputs = [self.model.aro_weights, self.model.val_weights]
70 | else:
71 | raise ValueError('when mel_inputs is not None, model must be sygst, emogst or embgst_joint')
72 | else:
73 | assert aro_embed is not None or val_embed is not None
74 | if aro_embed is not None:
75 | aro_embed = np.expand_dims(aro_embed, 0)
76 | feed_dict.update({self.aro_embed: aro_embed.astype(np.float32)})
77 | attention_outputs = self.model.aro_weights
78 | else:
79 | val_embed = np.expand_dims(val_embed, 0)
80 | feed_dict.update({self.val_embed: val_embed.astype(np.float32)})
81 | attention_outputs = self.model.val_weights
82 |
83 | attention_weights = self.session.run(attention_outputs, feed_dict=feed_dict)
84 | return attention_weights
85 |
86 |
87 | def process_fold(args, model, ref_path, output_path, emo_type='arousal', max_items=50):
88 | atten_list = []
89 | # ref_names = [os.path.join(ref_path, name) for name in sorted(os.listdir(ref_path))]
90 | ref_names = [os.path.join(ref_path, name) for name in os.listdir(ref_path)]
91 | ref_names = ref_names[:max_items] if max_items is not None else ref_names
92 |
93 | for ref_name in ref_names:
94 | ref_feature = np.load(ref_name)
95 | if args.model_name in ['sygst', 'embgst_joint', 'emogst']:
96 | ref_len = ref_feature.shape[0]
97 | # if ref_len < 250 or ref_len > 1000:
98 | # continue
99 | atten_weight = model.predict(mel_inputs=ref_feature, spec_lengths=ref_len)
100 | if args.model_name == 'embgst_joint':
101 | atten_weight = atten_weight[0] if emo_type == 'arousal' else atten_weight[1]
102 | else:
103 | assert emo_type in ['arousal', 'valence']
104 | if emo_type == 'arousal':
105 | atten_weight = model.predict(aro_embed=ref_feature)
106 | else:
107 | atten_weight = model.predict(val_embed=ref_feature)
108 | atten = np.squeeze(atten_weight, 0) # [num_heads, 1, num_tokens]
109 | atten_list.append(atten)
110 | atten_list_np = np.array(atten_list)
111 | avg_atten = np.mean(atten_list_np, axis=0)
112 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
113 | np.save(output_path, avg_atten)
114 | print(f'Process finished for {args.model_name} {emo_type} with shape: {atten_list_np.shape}')
115 |
116 |
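A hedged usage sketch (the checkpoint step and mel file below are placeholders following the path patterns in this repo): load a trained model and extract GST attention weights for a single reference mel, which is what process_fold above averages per emotion bucket.

import numpy as np

predictor = AttentionPredictor('sygst')
predictor.load('sygst_emo_data/ckpts/model.ckpt-100000')   # hypothetical step
mel = np.load('bc2013/mels/bc13-mel-000001.npy')
weights = predictor.predict(mel_inputs=mel, spec_lengths=mel.shape[0])
print(np.squeeze(weights, 0).shape)                        # (gst_heads, 1, gst_tokens)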
117 | def process_arousal(args, model):
118 | for i in range(2):
119 | ref_path = args.model_name + f'_emo_data/emo2d_mel_npys/arousal{i}'
120 | output_path = args.model_name + f'_emo_data/emo2d_mel_gst_weights/arousal{i}.npy'
121 | if args.model_name == 'embgst':
122 | ref_path = ref_path.replace('_mel_', '_embed_')
123 | output_path = output_path.replace('_mel_', '_embed_')
124 | process_fold(args, model, ref_path, output_path, 'arousal')
125 |
126 |
127 | def process_valence(args, model):
128 | for i in range(2):
129 | ref_path = args.model_name + f'_emo_data/emo2d_mel_npys/valence{i}'
130 | output_path = args.model_name + f'_emo_data/emo2d_mel_gst_weights/valence{i}.npy'
131 | if args.model_name == 'embgst':
132 | ref_path = ref_path.replace('_mel_', '_embed_')
133 | output_path = output_path.replace('_mel_', '_embed_')
134 | process_fold(args, model, ref_path, output_path, 'valence')
135 |
136 |
137 | def process_emotion(args, model):
138 | for i in range(4):
139 | ref_path = args.model_name + f'_emo_data/emo_mel_npys/emo{i}'
140 | output_path = args.model_name + f'_emo_data/emo_gst_weights/emo{i}.npy'
141 | process_fold(args, model, ref_path, output_path)
142 |
143 |
144 | def main():
145 | parser = argparse.ArgumentParser()
146 | parser.add_argument('--model_name', '-m', default='sygst')
147 | parser.add_argument('--ckpt_step', '-c', default=None)
148 | args = parser.parse_args()
149 |
150 | assert args.model_name in ['sygst', 'embgst', 'embgst_joint', 'emogst']
151 |
152 | ckpt_path = args.model_name + f'_emo_data/ckpts/model.ckpt-{args.ckpt_step}'
153 | model = AttentionPredictor(args.model_name)
154 | model.load(ckpt_path)
155 |
156 | if args.model_name == 'emogst':
157 | process_emotion(args, model)
158 | else:
159 | process_arousal(args, model)
160 | process_valence(args, model)
161 |
162 |
163 | if __name__ == '__main__':
164 | main()
165 |
-------------------------------------------------------------------------------- /prepare_meta_from_tfr.py: --------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import tensorflow as tf
4 | from tqdm import tqdm
5 | from text import sequence_to_text
6 | from tfr_dset import load_for_prepare_meta_from_tfr
7 |
8 |
9 | max_steps = 100000 # actual 95650
10 | tfr_dir = 'bc2013/training/tfrs_with_emo_feature'
11 | meta_path = 'bc2013/full_meta.txt'
12 | mel_path = 'bc2013/mels'
13 | spec_path = 'bc2013/specs'
14 |
15 |
16 | def main():
17 | tf_dset = load_for_prepare_meta_from_tfr(tfr_dir)
18 | feats = tf_dset.make_one_shot_iterator().get_next()
19 |
20 | i, lines, mels, specs = 1, [], [], []
21 | lines.append('# emo|aro|val|text_len|spec_len|text|uid|mel|spec\n')
22 |
23 | pbar = tqdm(total=max_steps)
24 | sess = tf.Session()
25 | try:
26 | while True:
27 | fetched_feats = sess.run(feats)
28 | # uid = fetched_feats['uid'].tobytes().decode('utf-8')
29 | uid = fetched_feats['uid'].decode('utf-8')
30 | text = sequence_to_text(fetched_feats['inputs'])
31 | text_lens = fetched_feats['input_lengths']
32 | mel = fetched_feats['mel_targets']
33 | spec = fetched_feats['linear_targets']
34 | spec_lens = fetched_feats['spec_lengths']
35 | emo = '[{:.5f}, {:.5f}, {:.5f}, {:.5f}]'.format(*fetched_feats['soft_emo_labels'])
36 | aro = '[{:.5f}, {:.5f}]'.format(*fetched_feats['soft_arousal_labels'])
37 | val = '[{:.5f}, {:.5f}]'.format(*fetched_feats['soft_valance_labels'])
38 | mel_name = os.path.join(mel_path, f'bc13-mel-{i:06d}.npy')
39 | spec_name = os.path.join(spec_path, f'bc13-spec-{i:06d}.npy')
40 | line = f'{emo}|{aro}|{val}|{text_lens}|{spec_lens}|{text}|{uid}|{mel_name}|{spec_name}\n'
41 | lines.append(line)
42 | mels.append([mel, mel_name])
43 | specs.append([spec, spec_name])
44 | pbar.update(1)
45 | i += 1
46 | except tf.errors.OutOfRangeError:
47 | print('sess.run finished!')
48 | finally:
49 | pbar.close()
50 | sess.close()
51 |
52 | os.makedirs(mel_path, exist_ok=True)
53 | os.makedirs(spec_path, exist_ok=True)
54 | for mel, mel_name in tqdm(mels):
55 | np.save(mel_name, mel)
56 | for spec, spec_name in tqdm(specs):
57 | np.save(spec_name, spec)
58 | with open(meta_path, 'w') as fw:
59 | fw.writelines(lines)
60 | print(f'total {i - 1} items finished!') # i was incremented once past the last item
61 |
62 |
63 | if __name__ == '__main__':
64 | main()
65 |
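For downstream scripts, reading full_meta.txt back follows the header line written above; a small sketch (not repo code):

import json

with open('bc2013/full_meta.txt') as fr:
    for raw in fr:
        if raw.startswith('#'):   # skip the header line
            continue
        emo, aro, val, text_len, spec_len, text, uid, mel, spec = raw.strip().split('|')
        emo, aro, val = json.loads(emo), json.loads(aro), json.loads(val)
        print(uid, int(text_len), int(spec_len))
        break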
-------------------------------------------------------------------------------- /sygst_hparams.py: --------------------------------------------------------------------------------
1 | from utils.parameter import HParams
2 |
3 | hp = HParams(
4 | # text
5 | cleaners='english_cleaners',
6 |
7 | # audio
8 | num_mels=80,
9 | num_spec=1025, # n_fft / 2 + 1; only used when adding the linear-spectrogram post-processing network
10 | sample_rate=16000,
11 | win_ms=50, # 50 ms window = 800 samples at 16 kHz (if None, win_size=n_fft)
12 | hop_ms=12.5, # 12.5 ms hop = 200 samples at 16 kHz
13 | n_fft=2048,
14 | min_level_db=-100,
15 | ref_level_db=20,
16 | fmin=95, # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
17 | fmax=7600, # To be increased/reduced depending on data.
18 | preemphasis=0.97, # filter coefficient.
19 | griffin_lim_power=1.5, # Only used in G&L inversion; values between 1.2 and 1.5 are usually a good choice.
20 | griffin_lim_iters=60, # Number of G&L iterations; typically 30 is enough but we use 60 to ensure convergence.
21 |
22 | # Tacotron
23 | outputs_per_step=3, # number of frames to generate at each decoding step (increase to speed up computation and allow a higher batch size; decreases G&L audio quality)
24 | feed_last_frame=True, # whether to feed all r frames or only the last of the r frames
25 | stop_at_any=True, # Determines whether the decoder should stop when the stop prediction fires for any frame or for all of them (True works pretty well)
26 | clip_outputs=True, # Whether to clip spectrograms to T2_output_range (even in loss computation), i.e. don't penalize the model for exceeding the output range; bring it back to the borders.
27 | lower_bound_decay=0.0, # Small regularizer for noise synthesis by adding a small range of penalty for silence regions. Set to 0 to clip in the Tacotron range.
28 | clip_min=0,
29 | clip_max=1,
30 |
31 | # Input parameters
32 | num_symbols=150,
33 | embedding_dim=512, # dimension of embedding space
34 |
35 | # Encoder parameters
36 | encoder_type='taco2', # ['taco2', 'taco']; the taco encoder is the CBHG encoder
37 | encoder_cnns=[3, 5, 512], # num_layers, kernel_size, channels
38 | encoder_rnns_units=256, # number of lstm units for each direction (forward and backward)
39 |
40 | # reference encoder parameters
41 | reference_channels=[32, 32, 64, 64, 128, 128],
42 | reference_rnn_units=128,
43 |
44 | # gst parameters
45 | # gst_heads=4,
46 | gst_heads=8,
47 | # gst_tokens=10,
48 | gst_tokens=16,
49 | # gst_units=256,
50 | gst_units=512,
51 | gst_atten_units=128,
52 | gst_atten_type='mlp', # attention type for the gst self-attention module (dot or mlp)
53 | gst_activation=None,
54 | gst_trainable=True, # False in the nvidia gst code
55 |
56 | # emotion parameters
57 | emo_used=True,
58 | emo_loss='softmax', # ['mae', 'mse', 'sigmoid', 'softmax']
59 | emo_output_units=2,
60 | emotion_embedding_units=128,
61 |
62 | # Attention mechanism
63 | smoothing=False, # Whether to smooth the attention normalization function
64 | attention_type='location', # sma: stepwise monotonic; location: location sensitive
65 | attention_units=128, # dimension of attention space
66 | attention_filters=32, # number of attention convolution filters
67 | attention_kernel_size=(31, ), # kernel size of attention convolution
68 | attention_sma_normalize=True,
69 | attention_sma_sigmoid_noise=2.0,
70 | attention_sma_sigmoid_noise_seed=None,
71 | attention_sma_score_bias_init=3.5,
72 | attention_sma_mode='parallel',
73 |
74 | # Attention synthesis constraints
75 | # "Monotonic" constraint forces the model to only look at the forward attention_win_size steps.
76 | # "Window" allows the model to look at attention_win_size neighbors, both forward and backward steps.
77 | synthesis_constraint=False, # Whether to use attention window constraints in synthesis only (useful for long-utterance synthesis)
78 | # synthesis_constraint_type='window', # can be in ('window', 'monotonic').
79 | synthesis_win_size=7, # Size of the window; the current step does not count. If mode is window and the size is odd, the extra step goes to the backward part of the window.
80 | synthesis_softmax_temp=1.0,
81 |
82 | # Decoder
83 | prenet_units=[256, 256], # number of layers and number of units of prenet
84 | attention_rnn_units=[1024, 1024], # number of decoder lstm layers
85 | decode_rnn_units=None, # number of decoder lstm units on each layer
86 | max_iters=2000, # Max decoder steps during inference (just for safety from infinite-loop cases)
87 | impute_finished=False,
88 | frame_activation='relu',
89 |
90 | # Residual postnet
91 | postnet_cnns=[5, 5, 512], # num_layers, kernel_size, channels
92 |
93 | # CBHG mel->linear postnet
94 | post_cbhg=True,
95 | cbhg_kernels=8, # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act as "K-grams"
96 | cbhg_conv_channels=128, # Channels of the convolution bank
97 | cbhg_pool_size=2, # pooling size of the CBHG
98 | cbhg_projection=256, # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
99 | cbhg_projection_kernel_size=3, # kernel_size of the CBHG projections
100 | cbhg_highway_nums=4, # Number of HighwayNet layers
101 | cbhg_highway_units=128, # Number of units used in HighwayNet fully connected layers
102 | cbhg_rnn_units=128, # Number of GRU units used in the bidirectional RNN of the CBHG block. CBHG output is 2x rnn_units in shape
103 |
104 | # Loss params
105 | mask_encoder=True, # whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence.
106 | mask_decoder=False, # set False so alignments converge faster
107 | cross_entropy_pos_weight=20, # Use class weights to reduce the stop-token class imbalance (by adding more penalty on False Negatives (FN)) (1=disabled)
108 | mel_loss='mae',
109 | spec_loss='mae',
110 |
111 |
112 | # Tacotron Training
113 | # Reproduction seeds
114 | random_seed=2020, # Determines the initial graph and operations (i.e. model) random state for reproducibility
115 | # tacotron_data_random_state=1234, # random state for train/test split repeatability
116 |
117 | # performance parameters
118 | tacotron_swap_with_cpu=False, # Whether to use the cpu as support for the gpu in decoder computation (Not recommended: may cause major slowdowns! Only use when critical!)
119 |
120 | # train/test split ratios, mini-batch sizes
121 | batch_size=32, # number of training samples on each training step
122 | # Tacotron batch synthesis supports ~16x the training batch size (no gradients during testing).
123 | # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times different from training. We thus recommend masking the encoder.
124 | tacotron_synthesis_batch_size=1, # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
125 | tacotron_test_size=0.05, # % of data to keep as test data; if None, tacotron_test_batches must not be None. (5% is enough to have a good idea about overfit)
126 | tacotron_test_batches=None, # number of test batches.
127 |
128 | # Learning rate schedule
129 | decay_learning_rate=True, # boolean, determines if the learning rate will follow an exponential decay
130 | start_decay=40000, # Step at which learning decay starts
131 | decay_steps=18000, # Determines the learning rate decay slope (UNDER TEST)
132 | decay_rate=0.5, # learning rate decay rate (UNDER TEST)
133 | # initial_learning_rate=1e-3, # starting learning rate
134 | initial_learning_rate=0.002,
135 | final_learning_rate=1e-4, # minimal learning rate
136 |
137 | # Optimization parameters
138 | adam_beta1=0.9, # AdamOptimizer beta1 parameter
139 | adam_beta2=0.999, # AdamOptimizer beta2 parameter
140 | adam_epsilon=1e-6, # AdamOptimizer Epsilon parameter
141 |
142 | # Regularization parameters
143 | # reg_weight=1e-6, # regularization weight (for L2 regularization)
144 | reg_weight=None, # regularization weight (for L2 regularization)
145 | scale_regularization=False, # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model)
146 | zoneout_rate=0.1, # zoneout rate for all LSTM cells in the network
147 | dropout_rate=0.5, # dropout rate for all convolutional layers + prenet
148 | clip_gradients=True, # whether to clip gradients
149 | )
150 |
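A few quantities derived from these values are worth keeping in mind when reading the rest of the code (simple arithmetic, shown as a sketch):

from sygst_hparams import hp

win = int(hp.win_ms / 1000 * hp.sample_rate)   # 50 ms -> 800 samples
hop = int(hp.hop_ms / 1000 * hp.sample_rate)   # 12.5 ms -> 200 samples
ms_per_step = hp.outputs_per_step * hp.hop_ms  # the decoder emits 37.5 ms of audio per step
print(win, hop, ms_per_step)                   # 800 200 37.5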
-------------------------------------------------------------------------------- /sygst_train.py: --------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import time
4 | import argparse
5 | import traceback
6 | import numpy as np
7 | import tensorflow as tf
8 | from datetime import datetime
9 |
10 |
11 | from tfr_dset import TFDataSet
12 | from text import sequence_to_text
13 | from utils import audio, plot, infolog, ValueWindow, debug
14 |
15 | from sygst_hparams import hp
16 | from models.sygst_tacotron2 import Tacotron2SYGST
17 |
18 | log = infolog.log
19 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
20 |
21 |
22 | _max_step = 500000
23 | hdfs_ckpts = 'hdfs://haruna/home/byte_speech_sv/user/caixiong/ckpts'
24 |
25 | # spec_length max = 1116
26 | # text length max = 99
27 |
28 |
29 | def time_string():
30 | return datetime.now().strftime('%Y-%m-%d %H:%M')
31 |
32 |
33 | def debug_data(batch=32, time_in=100, time_out=500):
34 | text_x = np.random.randint(0, 150, size=(batch, time_in), dtype=np.int32)
35 | mel = np.random.randn(batch, time_out, 80).astype(np.float32)
36 | spec = np.random.randn(batch, time_out, 1025).astype(np.float32)
37 | spec_len = np.random.randint(time_out // 2, time_out, size=batch, dtype=np.int32)
38 | aro_label = np.random.rand(batch, 2).astype(np.float32)
39 | val_label = np.random.rand(batch, 2).astype(np.float32)
40 |
41 | print('text_input:', text_x[0], 'spec_len:', spec_len, sep='\n')
42 | return text_x, mel, spec, spec_len, aro_label, val_label
43 |
44 |
45 | def train(log_dir, args):
46 | checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
47 | log(hp.to_string(), is_print=False)
48 | log('Loading training data from: %s' % args.tfr_dir)
49 | log('Checkpoint path: %s' % checkpoint_path)
50 | log('Using model: sygst tacotron2')
51 |
52 | tf_dset = TFDataSet(hp, args.tfr_dir)
53 | feats = tf_dset.get_train_next()
54 | # Set up model:
55 | global_step = tf.Variable(0, name='global_step', trainable=False)
56 | training = tf.placeholder_with_default(True, shape=(), name='training')
57 | with tf.name_scope('model'):
58 | model = Tacotron2SYGST(hp)
59 | model(feats['inputs'],
60 | mel_inputs=feats['mel_targets'],
61 | spec_inputs=feats['linear_targets'],
62 | spec_lengths=feats['spec_lengths'],
63 | ref_inputs=feats['mel_targets'],
64 | ref_lengths=feats['spec_lengths'],
65 | arousal_labels=feats['soft_arousal_labels'],
66 | valence_labels=feats['soft_valance_labels'],
67 | training=training)
68 | """
69 | text_x, mel_x, spec_x, spec_len, aro, val = debug_data(2, 5, 10)
70 | model(text_x, mel_x, spec_x, spec_len, mel_x, spec_len, aro, val, training=training)
71 | """
72 | model.add_loss()
73 | model.add_optimizer(global_step)
74 | stats = model.add_stats()
75 |
76 | # Bookkeeping:
77 | step = 0
78 | time_window = ValueWindow(100)
79 | loss_window = ValueWindow(100)
80 | saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)
81 |
82 | # Train!
83 | config = tf.ConfigProto(allow_soft_placement=True,
84 | gpu_options=tf.GPUOptions(allow_growth=True))
85 | with tf.Session(config=config) as sess:
86 | try:
87 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
88 | sess.run(tf.global_variables_initializer())
89 | if args.restore_step:
90 | # Restore from a checkpoint if the user requested it.
91 | restore_path = '%s-%s' % (checkpoint_path, args.restore_step) 92 | saver.restore(sess, restore_path) 93 | log('Resuming from checkpoint: %s' % restore_path, slack=True) 94 | else: 95 | log('Starting a new training run ...', slack=True) 96 | 97 | """ 98 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, model.spec_loss, 99 | model.stop_loss, model.arousal_loss, model.valence_loss, model.mel_grad_norms_max, 100 | model.spec_grad_norms_max, model.stop_grad_norms_max, model.aro_grad_norms_max, model.val_grad_norms_max] 101 | """ 102 | fetches = [global_step, model.optimize, model.loss, model.mel_loss, model.spec_loss, 103 | model.stop_loss, model.arousal_loss, model.valence_loss] 104 | for _ in range(_max_step): 105 | start_time = time.time() 106 | sess.run(debug.get_ops()) 107 | # step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g = sess.run(fetches) 108 | step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss = sess.run(fetches) 109 | time_window.append(time.time() - start_time) 110 | loss_window.append(loss) 111 | """ 112 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,ag=%.4f,vg=%.4f]' % ( 113 | step, time_window.average, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g) 114 | """ 115 | message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % ( 116 | step, time_window.average, mel_loss, spec_loss, stop_loss, aro_loss, val_loss) 117 | log(message, slack=(step % args.checkpoint_interval == 0)) 118 | 119 | if loss > 100 or math.isnan(loss): 120 | log('Loss exploded to %.5f at step %d!' % (loss, step), slack=True) 121 | raise Exception('Loss Exploded') 122 | 123 | if step % args.summary_interval == 0: 124 | log('Writing summary at step: %d' % step) 125 | try: 126 | summary_writer.add_summary(sess.run(stats), step) 127 | except Exception as e: 128 | log(f'summary failed and ignored: {str(e)}') 129 | 130 | if step % args.checkpoint_interval == 0: 131 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 132 | saver.save(sess, checkpoint_path, global_step=step) 133 | log('Saving audio and alignment...') 134 | gt_mel, gt_spec, seq, mel, spec, align = sess.run([model.mel_targets[0], model.spec_targets[0], 135 | model.text_targets[0], model.mel_outputs[0], 136 | model.spec_outputs[0], model.alignment_outputs[0]]) 137 | text = sequence_to_text(seq) 138 | wav = audio.inv_spectrogram(hp, spec.T) 139 | wav_path = os.path.join(log_dir, 'step-%d-audio.wav' % step) 140 | mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step) 141 | spec_path = os.path.join(log_dir, 'step-%d-spec.png' % step) 142 | align_path = os.path.join(log_dir, 'step-%d-align.png' % step) 143 | info = '%s, %s, step=%d, loss=%.5f\n %s' % (args.model, time_string(), step, loss, text) 144 | plot.plot_alignment(align, align_path, info=info) 145 | plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel) 146 | plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec) 147 | audio.save_wav(hp, wav, wav_path) 148 | log('Input: %s' % text) 149 | 150 | except Exception as e: 151 | log('Exiting due to exception: %s' % e, slack=True) 152 | traceback.print_exc() 153 | 154 | 155 | def main(): 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--gpu', default='0') 158 | parser.add_argument('--log', '-l', default='') 159 | parser.add_argument('--restore_step', '-r', default=None) 160 | parser.add_argument('--tfr_dir', 
default='bc2013/training/tfrs_with_emo_feature')
161 | args = parser.parse_args()
162 |
163 | args.model = 'sygst_taco2'
164 | args.summary_interval = 200
165 | args.checkpoint_interval = 5000
166 | # args.summary_interval = 2
167 | # args.checkpoint_interval = 5
168 |
169 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
170 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
171 | log_dir = 'sygst_logs' + ('_' + args.log if args.log else '')
172 | os.makedirs(log_dir, exist_ok=True)
173 |
174 | tf.set_random_seed(hp.random_seed)
175 | infolog.init(os.path.join(log_dir, 'train.log'), args.model)
176 |
177 | train(log_dir, args)
178 |
179 |
180 | if __name__ == '__main__':
181 | main()
182 |
-------------------------------------------------------------------------------- /synthesizer.py: --------------------------------------------------------------------------------
1 | import re
2 | import textwrap
3 | from datetime import datetime
4 |
5 | import tensorflow as tf
6 | from tensorflow.keras.layers import Input
7 |
8 | from utils import audio, plot
9 |
10 | from ser.hparams import hp as ser_hp
11 | from taco2_hparams import hp as taco2_hp
12 | from sygst_hparams import hp as sygst_hp
13 | from emogst_hparams import hp as emogst_hp
14 | from embjoint_hparams import hp as embgst_joint_hp
15 |
16 | from models.tacotron2 import Tacotron2
17 | from models.sygst_tacotron2 import Tacotron2SYGST
18 | from models.emogst_tacotron2 import Tacotron2EMOGST
19 | from models.embgst_tacotron2_joint import Tacotron2EMBGSTJoint
20 |
21 | tf.compat.v1.logging.set_verbosity(40) # Only print errors
22 |
23 | map_model = {'taco2': Tacotron2, 'sygst': Tacotron2SYGST,
24 | 'emogst': Tacotron2EMOGST, 'embgst_joint': Tacotron2EMBGSTJoint}
25 | map_hp = {'taco2': taco2_hp, 'sygst': sygst_hp,
26 | 'emogst': emogst_hp, 'embgst_joint': embgst_joint_hp}
27 |
28 |
29 | class Synthesizer:
30 | def __init__(self, use_gta=False, use_ref=False, use_att=True, model_name='taco2'):
31 |
32 | assert model_name in ['taco2', 'sygst', 'embgst', 'emogst', 'embgst_joint']
33 |
34 | self.use_gta = use_gta # whether to use ground-truth alignment
35 | self.use_ref = use_ref # whether to use a reference mel
36 | self.use_att = use_att # whether to use attention weights
37 | self.hp = map_hp[model_name]
38 | self.model = map_model[model_name](self.hp, use_gta) if model_name != 'embgst_joint' else map_model[model_name](embgst_joint_hp, ser_hp, use_gta)
39 | self.model_name = model_name
40 |
41 | # build model
42 | with tf.name_scope('model'):
43 | h, t = self.hp.gst_heads, self.hp.gst_tokens
44 | self.text_inputs = Input([None], dtype=tf.int32, name='text_inputs')
45 | self.mel_inputs = Input([None, self.hp.num_mels], dtype=tf.float32, name='mel_inputs')
46 | self.mel_lengths = Input([], dtype=tf.int32, name='mel_lengths')
47 | self.ref_inputs = Input([None, self.hp.num_mels], dtype=tf.float32, name='ref_inputs')
48 | self.ref_lengths = Input([], dtype=tf.int32, name='ref_lengths')
49 | self.aro_weights_ph = Input([h, 1, t], dtype=tf.float32, name='arousal_weight_ph')
50 | self.val_weights_ph = Input([h, 1, t], dtype=tf.float32, name='valence_weight_ph')
51 | self.atten_weights_ph = Input([h, 1, t], dtype=tf.float32, name='attention_weights_ph')
52 |
53 | call_fn_kwargs = {}
54 | if use_gta:
55 | assert not use_ref
56 | call_fn_kwargs.update(mel_inputs=self.mel_inputs,
57 | spec_lengths=self.mel_lengths)
58 | if use_ref:
59 | assert not use_att and model_name != 'taco2'
60 | call_fn_kwargs.update(ref_inputs=self.ref_inputs,
61 | ref_lengths=self.ref_lengths)
62 | if use_att:
63 | if model_name in ['sygst', 'emogst']:
64 | call_fn_kwargs.update(atten_weights_ph=self.atten_weights_ph)
65 | elif model_name in ['embgst', 'embgst_joint']:
66 | call_fn_kwargs.update(aro_weights_ph=self.aro_weights_ph,
67 | val_weights_ph=self.val_weights_ph)
68 | self.model_call_fn_kwargs = call_fn_kwargs
69 | self.model(self.text_inputs, training=False, **call_fn_kwargs)
70 |
71 | # outputs
72 | model = self.model if self.model_name != 'embgst_joint' else self.model.tts_model
73 | self.seq_length_outputs = model.seq_length_outputs
74 | self.mel_outputs = model.mel_outputs
75 | self.spec_outputs = model.spec_outputs
76 | self.wav_outputs = audio.inv_spectrogram_tensorflow(self.hp, self.spec_outputs)
77 | self.alignment_outputs = model.alignment_outputs
78 |
79 | def load(self, ckpt_path):
80 | self.eval_step = re.search(r'ckpt-(\d+)', ckpt_path).group(1)
81 | self.session = tf.Session()
82 | saver = tf.train.Saver()
83 | saver.restore(self.session, ckpt_path)
84 |
85 | def synthesize(self, text_seqs, texts, output_path,
86 | mel_inputs=None, mel_lengths=None,
87 | ref_inputs=None, ref_lengths=None,
88 | atten_weights=None, aro_weights=None, val_weights=None):
89 |
90 | feed_dict = {self.text_inputs: text_seqs}
91 | if mel_inputs is not None:
92 | feed_dict.update({self.mel_inputs: mel_inputs,
93 | self.mel_lengths: mel_lengths})
94 | if ref_inputs is not None:
95 | feed_dict.update({self.ref_inputs: ref_inputs,
96 | self.ref_lengths: ref_lengths})
97 | if aro_weights is not None and val_weights is not None:
98 | feed_dict.update({self.aro_weights_ph: aro_weights,
99 | self.val_weights_ph: val_weights})
100 | if atten_weights is not None:
101 | feed_dict.update({self.atten_weights_ph: atten_weights})
102 |
103 | self.now_time = datetime.now().strftime('%Y-%m-%d %H:%M')
104 | lens, wavs, mels, specs, aligns = self.session.run([self.seq_length_outputs,
105 | self.wav_outputs,
106 | self.mel_outputs,
107 | self.spec_outputs,
108 | self.alignment_outputs],
109 | feed_dict=feed_dict)
110 | self.post_process(output_path, texts, lens, wavs, mels, specs, aligns)
111 |
112 | def post_process(self, output_path, texts, lens, wavs, mels, specs, aligns):
113 |
114 | zipped_inputs = zip(output_path, texts, lens, wavs, mels, specs, aligns)
115 | for path, text, mel_len, wav, mel, spec, align in zipped_inputs:
116 | wav = audio.inv_preemphasis(self.hp, wav)
117 | end_point = audio.find_endpoint(self.hp, wav)
118 | # end_point = wav.shape[0]
119 | # end_point = int((mel_len * self.hp.hop_ms / 1000) * self.hp.sample_rate)
120 | wav = wav[:end_point]
121 | mel_len = int(end_point / (self.hp.hop_ms / 1000 * self.hp.sample_rate)) + 1
122 | paths = [path + suffix for suffix in ['.wav', '-mel.png', '-spec.png', '-align.png']]
123 | wav_path, mel_path, spec_path, align_path = paths
124 | title = f'{self.model_name}, {self.eval_step}, {self.now_time}'
125 | info = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
126 | plot.plot_alignment(align[:, : mel_len], align_path, info, title)
127 | plot.plot_mel(mel[: mel_len, :], mel_path, info, title)
128 | plot.plot_mel(spec[: mel_len, :], spec_path, info, title)
129 | audio.save_wav(self.hp, wav, wav_path)
130 |
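An end-to-end usage sketch (the checkpoint path and output prefix are hypothetical; the weights file is of the kind predict_attention.py saves): synthesize one utterance while steering prosody with precomputed GST attention weights.

import numpy as np
from text import text_to_sequence
from synthesizer import Synthesizer

synth = Synthesizer(use_att=True, model_name='sygst')
synth.load('sygst_emo_data/ckpts/model.ckpt-100000')
text = 'Hello world.'
seq = text_to_sequence(text, ['english_cleaners'])
weights = np.load('sygst_emo_data/emo2d_mel_gst_weights/arousal1.npy')
synth.synthesize([seq], [text], ['outputs/demo'], atten_weights=[weights])
# writes outputs/demo.wav, -mel.png, -spec.png and -align.png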
-------------------------------------------------------------------------------- /text/__init__.py: --------------------------------------------------------------------------------
1 | import re
2 | from text import cleaners
3 | from text.symbols import symbols
4 |
5 |
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 |
10 | # Regular expression matching text enclosed in curly braces:
11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
12 |
13 |
14 | def text_to_sequence(text, cleaner_names):
15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16 |
17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded
18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
19 |
20 | Args:
21 | text: string to convert to a sequence
22 | cleaner_names: names of the cleaner functions to run the text through
23 |
24 | Returns:
25 | List of integers corresponding to the symbols in the text
26 | '''
27 | sequence = []
28 |
29 | # Check for curly braces and treat their contents as ARPAbet:
30 | while len(text):
31 | m = _curly_re.match(text)
32 | if not m:
33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
34 | break
35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
36 | sequence += _arpabet_to_sequence(m.group(2))
37 | text = m.group(3)
38 |
39 | # Append EOS token
40 | sequence.append(_symbol_to_id['~'])
41 | return sequence
42 |
43 |
44 | def sequence_to_text(sequence):
45 | '''Converts a sequence of IDs back to a string'''
46 | result = ''
47 | for symbol_id in sequence:
48 | if symbol_id in _id_to_symbol:
49 | s = _id_to_symbol[symbol_id]
50 | # Enclose ARPAbet back in curly braces:
51 | if len(s) > 1 and s[0] == '@':
52 | s = '{%s}' % s[1:]
53 | result += s
54 | return result.replace('}{', ' ')
55 |
56 |
57 | def _clean_text(text, cleaner_names):
58 | for name in cleaner_names:
59 | cleaner = getattr(cleaners, name, None)
60 | if not cleaner:
61 | raise Exception('Unknown cleaner: %s' % name)
62 | text = cleaner(text)
63 | return text
64 |
65 |
66 | def _symbols_to_sequence(symbols):
67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
68 |
69 |
70 | def _arpabet_to_sequence(text):
71 | return _symbols_to_sequence(['@' + s for s in text.split()])
72 |
73 |
74 | def _should_keep_symbol(s):
75 | return s in _symbol_to_id and s != '_' and s != '~' # '!=' rather than 'is not': identity checks on string literals are unreliable
76 |
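A quick round trip (illustrative): curly-brace ARPAbet is mapped to phoneme ids, the EOS symbol '~' is appended, and sequence_to_text restores the braces.

from text import text_to_sequence, sequence_to_text

seq = text_to_sequence('Turn left on {HH AW1 S} Street.', ['english_cleaners'])
print(sequence_to_text(seq))   # "turn left on {HH AW1 S} street.~" (EOS included)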
-------------------------------------------------------------------------------- /text/cleaners.py: --------------------------------------------------------------------------------
1 | '''
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 |
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 | 1. "english_cleaners" for English text
7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 | the symbols in symbols.py to match your data).
11 | '''
12 |
13 | import re
14 | from unidecode import unidecode
15 | from .numbers import normalize_numbers
16 |
17 | # Regular expression matching whitespace:
18 | _whitespace_re = re.compile(r'\s+')
19 |
20 | # List of (regular expression, replacement) pairs for abbreviations:
21 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
22 | ('mrs', 'misess'),
23 | ('mr', 'mister'),
24 | ('dr', 'doctor'),
25 | ('st', 'saint'),
26 | ('co', 'company'),
27 | ('jr', 'junior'),
28 | ('maj', 'major'),
29 | ('gen', 'general'),
30 | ('drs', 'doctors'),
31 | ('rev', 'reverend'),
32 | ('lt', 'lieutenant'),
33 | ('hon', 'honorable'),
34 | ('sgt', 'sergeant'),
35 | ('capt', 'captain'),
36 | ('esq', 'esquire'),
37 | ('ltd', 'limited'),
38 | ('col', 'colonel'),
39 | ('ft', 'fort'),
40 | ]]
41 |
42 |
43 | def expand_abbreviations(text):
44 | for regex, replacement in _abbreviations:
45 | text = re.sub(regex, replacement, text)
46 | return text
47 |
48 |
49 | def expand_numbers(text):
50 | return normalize_numbers(text)
51 |
52 |
53 | def lowercase(text):
54 | return text.lower()
55 |
56 |
57 | def collapse_whitespace(text):
58 | return re.sub(_whitespace_re, ' ', text)
59 |
60 |
61 | def convert_to_ascii(text):
62 | return unidecode(text)
63 |
64 |
65 | def basic_cleaners(text):
66 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
67 | text = lowercase(text)
68 | text = collapse_whitespace(text)
69 | return text
70 |
71 |
72 | def transliteration_cleaners(text):
73 | '''Pipeline for non-English text that transliterates to ASCII.'''
74 | text = convert_to_ascii(text)
75 | text = lowercase(text)
76 | text = collapse_whitespace(text)
77 | return text
78 |
79 |
80 | def english_cleaners(text):
81 | '''Pipeline for English text, including number and abbreviation expansion.'''
82 | text = convert_to_ascii(text)
83 | text = lowercase(text)
84 | text = expand_numbers(text)
85 | text = expand_abbreviations(text)
86 | text = collapse_whitespace(text)
87 | return text
88 |
-------------------------------------------------------------------------------- /text/cmudict.py: --------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | valid_symbols = [
5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
12 | ]
13 |
14 | _valid_symbol_set = set(valid_symbols)
15 |
16 |
17 | class CMUDict:
18 | '''Thin wrapper around CMUDict data.
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | def _parse_cmudict(file): 40 | cmudict = {} 41 | for line in file: 42 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 43 | parts = line.split(' ') 44 | word = parts[0] 45 | pronunciation = _get_pronunciation(parts[1]) 46 | if pronunciation: 47 | if word in cmudict: 48 | cmudict[word].append(pronunciation) 49 | else: 50 | cmudict[word] = [pronunciation] 51 | return cmudict 52 | 53 | 54 | def _get_pronunciation(s): 55 | parts = s.strip().split(' ') 56 | for part in parts: 57 | if part not in _valid_symbol_set: 58 | return None 59 | return ' '.join(parts) 60 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | _inflect = inflect.engine() 5 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 6 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 7 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 8 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 9 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 10 | _number_re = re.compile(r'[0-9]+') 11 | 12 | 13 | def _remove_commas(m): 14 | return m.group(1).replace(',', '') 15 | 16 | 17 | def _expand_decimal_point(m): 18 | return m.group(1).replace('.', ' point ') 19 | 20 | 21 | def _expand_dollars(m): 22 | match = m.group(1) 23 | parts = match.split('.') 24 | if len(parts) > 2: 25 | return match + ' dollars' # Unexpected format 26 | dollars = int(parts[0]) if parts[0] else 0 27 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 28 | if dollars and cents: 29 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 30 | cent_unit = 'cent' if cents == 1 else 'cents' 31 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 32 | elif dollars: 33 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 34 | return '%s %s' % (dollars, dollar_unit) 35 | elif cents: 36 | cent_unit = 'cent' if cents == 1 else 'cents' 37 | return '%s %s' % (cents, cent_unit) 38 | else: 39 | return 'zero dollars' 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if num > 1000 and num < 3000: 49 | if num == 2000: 50 | return 'two thousand' 51 | elif num > 2000 and num < 2010: 52 | return 'two thousand ' + _inflect.number_to_words(num % 100) 53 | elif num % 100 == 0: 54 | return _inflect.number_to_words(num // 100) + ' hundred' 55 | else: 56 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 57 | else: 58 | return _inflect.number_to_words(num, andword='') 59 | 60 | 61 | def normalize_numbers(text): 62 | text = re.sub(_comma_number_re, _remove_commas, text) 63 | text = re.sub(_pounds_re, r'\1 pounds', text) 64 | text = re.sub(_dollars_re, _expand_dollars, text) 65 | text 
= re.sub(_decimal_number_re, _expand_decimal_point, text) 66 | text = re.sub(_ordinal_re, _expand_ordinal, text) 67 | text = re.sub(_number_re, _expand_number, text) 68 | return text 69 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from text import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = '"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | -------------------------------------------------------------------------------- /tfr_dset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import tensorflow as tf 4 | 5 | # from taco2_hparams import hp 6 | # from hparams import hparams as hp 7 | # cpu_num = os.cpu_count() 8 | 9 | _pad = 0. 10 | _pad_emo = 0.25 11 | _pad_bemo = 0.5 12 | _pad_token = 1. # stop token pad 1. for marking sequences finished 13 | 14 | 15 | def parse_single_example(example_proto): 16 | features = {'uid': tf.FixedLenFeature([], tf.string), 17 | 'inputs': tf.FixedLenFeature([], tf.string), 18 | 'input_lengths': tf.FixedLenFeature([], tf.int64), 19 | 'mel_targets': tf.FixedLenFeature([], tf.string), 20 | 'linear_targets': tf.FixedLenFeature([], tf.string), 21 | 'spec_lengths': tf.FixedLenFeature([], tf.int64), 22 | 'soft_emo_labels': tf.FixedLenFeature([], tf.string), 23 | 'soft_arousal_labels': tf.FixedLenFeature([], tf.string), 24 | 'soft_valance_labels': tf.FixedLenFeature([], tf.string), 25 | 'arousal_embedding': tf.FixedLenFeature([], tf.string), 26 | 'valance_embedding': tf.FixedLenFeature([], tf.string)} 27 | parsed = tf.parse_single_example(example_proto, features=features) 28 | inputs = tf.decode_raw(parsed['inputs'], tf.int32) 29 | input_lengths = tf.cast(parsed['input_lengths'], tf.int32) 30 | spec_lengths = tf.cast(parsed['spec_lengths'], tf.int32) 31 | mel_targets = tf.reshape(tf.decode_raw(parsed['mel_targets'], tf.float32), [spec_lengths, -1]) 32 | linear_targets = tf.reshape(tf.decode_raw(parsed['linear_targets'], tf.float32), [spec_lengths, -1]) 33 | soft_emo_labels = tf.decode_raw(parsed['soft_emo_labels'], tf.float32) 34 | soft_arousal_labels = tf.decode_raw(parsed['soft_arousal_labels'], tf.float32) 35 | soft_valance_labels = tf.decode_raw(parsed['soft_valance_labels'], tf.float32) 36 | arousal_embedding = tf.decode_raw(parsed['arousal_embedding'], tf.float32) 37 | valance_embedding = tf.decode_raw(parsed['valance_embedding'], tf.float32) 38 | return {'uid': parsed['uid'], 39 | 'inputs': inputs, 40 | 'input_lengths': input_lengths, 41 | 'mel_targets': mel_targets, 42 | 'linear_targets': linear_targets, 43 | 'spec_lengths': spec_lengths, 44 | 'soft_emo_labels': soft_emo_labels, 45 | 'soft_arousal_labels': soft_arousal_labels, 46 | 'soft_valance_labels': soft_valance_labels, 47 | 'arousal_embedding': arousal_embedding, 48 | 'valance_embedding': valance_embedding} 49 | 50 | 51 | def 
parse_single_example_for_merge_emo_feature(example_proto): 52 | features = {'uid': tf.FixedLenFeature([], tf.string), 53 | 'inputs': tf.FixedLenFeature([], tf.string), 54 | 'input_lengths': tf.FixedLenFeature([], tf.int64), 55 | 'mel_targets': tf.FixedLenFeature([], tf.string), 56 | 'linear_targets': tf.FixedLenFeature([], tf.string), 57 | 'spec_lengths': tf.FixedLenFeature([], tf.int64), 58 | 'soft_emo_labels': tf.FixedLenFeature([], tf.string)} 59 | parsed = tf.parse_single_example(example_proto, features=features) 60 | return parsed 61 | 62 | 63 | def load_for_merge_emo_features(tfr_dir): 64 | file_pattern = os.path.join(tfr_dir, '*.tfr') 65 | tfrecord_files_num = len(glob.glob(file_pattern)) 66 | tfrecord_files = tf.data.Dataset.list_files(file_pattern, shuffle=True) 67 | dataset = tfrecord_files.apply( 68 | tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset, 69 | cycle_length=min( 70 | tfrecord_files_num, 71 | 128), 72 | block_length=1)) 73 | dataset = dataset.map(parse_single_example_for_merge_emo_feature, 74 | num_parallel_calls=os.cpu_count()) 75 | return dataset 76 | 77 | 78 | def load_for_prepare_meta_from_tfr(tfr_dir): 79 | file_pattern = os.path.join(tfr_dir, '*.tfr') 80 | tfrecord_files_num = len(glob.glob(file_pattern)) 81 | tfrecord_files = tf.data.Dataset.list_files(file_pattern, shuffle=True) 82 | dataset = tfrecord_files.apply( 83 | tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset, 84 | cycle_length=min( 85 | tfrecord_files_num, 86 | 128), 87 | block_length=1)) 88 | dataset = dataset.map(parse_single_example, 89 | num_parallel_calls=os.cpu_count()) 90 | return dataset 91 | 92 | 93 | class TFDataSet(object): 94 | def __init__(self, 95 | hp, 96 | tfr_dir, 97 | cache_path=None, 98 | valid_batches=None, 99 | load_for_rayhame=False): 100 | """Load the bc2013 dataset as a tf.data.Dataset object for training TTS models 101 | # Arguments 102 | tfr_dir: the path of the tf record files for bc2013 103 | batch_size: the batch size used for training and evaluating 104 | valid_batches: split the first 'valid_batches' batches off as the validation 105 | set; if None, the whole dataset is returned as the training set 106 | outputs_per_step: emit this number of frames at each tacotron 107 | decoder time step; it is used to trim the spectrogram lengths 108 | load_for_rayhame: whether to load data for training Rayhame's Tacotron2 109 | model; if True, we will trim down the spectrograms and compute the 110 | stop token targets 111 | """ 112 | 113 | self.tfr_dir = tfr_dir 114 | self.cache_path = cache_path 115 | self.num_mels = hp.num_mels 116 | self.num_spec = hp.num_spec 117 | self.batch_size = hp.batch_size 118 | self.outputs_per_step = hp.outputs_per_step 119 | self.valid_batches = valid_batches 120 | self.load_for_rayhame = load_for_rayhame 121 | 122 | def load(self): 123 | 124 | # Load the tf record files to a tf.data.Dataset object 125 | auto_tune = tf.data.experimental.AUTOTUNE 126 | file_pattern = os.path.join(self.tfr_dir, '*.tfr') 127 | tfrecord_files_num = len(glob.glob(file_pattern)) 128 | tfrecord_files = tf.data.Dataset.list_files(file_pattern, shuffle=True) 129 | dataset = tfrecord_files.apply( 130 | tf.data.experimental.parallel_interleave( 131 | tf.data.TFRecordDataset, 132 | cycle_length=min(tfrecord_files_num, 128), 133 | block_length=1 134 | ) 135 | ) 136 | 137 | # Deserialize each tf record example to a dict of Tensors 138 | dataset = dataset.map(lambda x: parse_single_example(x), 139 | num_parallel_calls=auto_tune) 140 |
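# A sketch of the filter below (illustrative numbers, mirroring the code): an
# example with spec_lengths = 60 (< 80 frames) or spec_lengths = 900 (> 800
# frames) is dropped, and so is a short-text/long-audio outlier with
# input_lengths < 70 but spec_lengths > 700, since such a pair is likely a
# mismatched transcript.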
141 | # Filter samples by spectrogram length to remove mismatched text-audio pairs 142 | def len_filter(x): 143 | return tf.logical_not( 144 | tf.logical_or( 145 | tf.logical_or(x['spec_lengths'] < 80, x['spec_lengths'] > 800), 146 | tf.logical_and(x['input_lengths'] < 70, x['spec_lengths'] > 700) 147 | ) 148 | ) 149 | dataset = dataset.filter(len_filter) 150 | 151 | def trim_down_lengths(x, prepare_stop_targets=True): 152 | r = self.outputs_per_step 153 | spec_len = x['spec_lengths'] 154 | trim_len = tf.cast(spec_len / r, dtype=tf.int32) * r 155 | x['mel_targets'] = x['mel_targets'][: trim_len] 156 | x['linear_targets'] = x['linear_targets'][: trim_len] 157 | x['spec_lengths'] = trim_len 158 | if prepare_stop_targets: 159 | x['token_targets'] = tf.concat([tf.zeros(trim_len - r), tf.ones(r)], axis=0) 160 | return x 161 | # Load for Rayhame's Tacotron2 model 162 | if self.load_for_rayhame: 163 | assert self.valid_batches is not None 164 | assert self.outputs_per_step is not None 165 | dataset = dataset.map(trim_down_lengths, auto_tune) 166 | 167 | # Maybe split the valid dataset and training dataset 168 | valid_dataset = None 169 | if self.valid_batches: 170 | valid_size = self.valid_batches * self.batch_size 171 | dataset = dataset.shuffle(buffer_size=10000) 172 | valid_dataset = dataset.take(valid_size) # validation set 173 | dataset = dataset.skip(valid_size) # training set 174 | 175 | # Perform a bucket and padded batch transform for training set 176 | bucket_num = 10 177 | bucket_batch_sizes = [self.batch_size] * bucket_num 178 | bucket_boundaries = [25, 40, 55, 70, 85, 100, 135, 170, 220] 179 | padded_shapes = {'uid': [], 180 | 'inputs': [None], 181 | 'input_lengths': [], 182 | 'mel_targets': [None, self.num_mels], 183 | 'linear_targets': [None, self.num_spec], 184 | 'spec_lengths': [], 185 | 'soft_emo_labels': [None], 186 | 'soft_arousal_labels': [None], 187 | 'soft_valance_labels': [None], 188 | 'arousal_embedding': [256], 189 | 'valance_embedding': [256]} 190 | padded_values = {'uid': '\0', 191 | 'inputs': 0, 192 | 'input_lengths': 0, 193 | 'mel_targets': _pad, 194 | 'linear_targets': _pad, 195 | 'spec_lengths': 0, 196 | 'soft_emo_labels': _pad_emo, 197 | 'soft_arousal_labels': _pad_bemo, 198 | 'soft_valance_labels': _pad_bemo, 199 | 'arousal_embedding': _pad, 200 | 'valance_embedding': _pad} 201 | 202 | if self.load_for_rayhame: 203 | padded_shapes.update({'token_targets': [None]}) 204 | padded_values.update({'token_targets': _pad_token}) 205 | 206 | dataset = dataset.apply( 207 | tf.data.experimental.bucket_by_sequence_length( 208 | lambda x: x['input_lengths'], 209 | bucket_boundaries=bucket_boundaries, 210 | bucket_batch_sizes=bucket_batch_sizes, 211 | padded_shapes=padded_shapes, 212 | padding_values=padded_values, 213 | pad_to_bucket_boundary=False, 214 | no_padding=False 215 | ) 216 | ) 217 | 218 | # Shuffle and repeat infinitely and prefetch 10 batches 219 | dataset = dataset.apply( 220 | tf.data.experimental.shuffle_and_repeat(buffer_size=128)) 221 | dataset = dataset.prefetch(buffer_size=10) # Prefetch 10 batches of samples 222 | if self.cache_path: # not None and not '' 223 | # assert os.path.isdir(self.cache_path) 224 | # dataset = dataset.cache(os.path.join(self.cache_path, 'cached_bc2013')) 225 | pass # caching is broken here: it quickly exhausts both disk and memory 226 | 227 | # Perform padded batch transform for validation dataset 228 | if valid_dataset is not None: 229 | valid_dataset = valid_dataset.apply( 230 | tf.data.experimental.shuffle_and_repeat(self.valid_batches, count=1)) 231 | valid_dataset = valid_dataset.padded_batch( 232 |
self.batch_size, padded_shapes, padded_values) 233 | valid_dataset = valid_dataset.cache() 234 | 235 | self.dataset = dataset 236 | self.valid_dataset = valid_dataset 237 | 238 | def get_train_next(self): 239 | if not hasattr(self, 'dataset'): 240 | self.load() 241 | train_next = self.dataset.make_one_shot_iterator().get_next() 242 | return train_next 243 | 244 | def get_valid_iter_and_next(self): 245 | assert self.valid_batches is not None 246 | if not hasattr(self, 'valid_dataset'): 247 | self.load() 248 | init_iter = self.valid_dataset.make_initializable_iterator() 249 | return init_iter.initializer, init_iter.get_next() 250 | 251 | 252 | def test(): 253 | pass 254 | 255 | 256 | if __name__ == '__main__': 257 | test() 258 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import tensorflow as tf 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | 7 | 8 | _mel_basis = None 9 | 10 | 11 | def load_wav(hp, path): 12 | wav, sr = librosa.core.load(path, sr=hp.sample_rate) 13 | wav = wav / np.abs(wav).max() * 0.999 14 | return wav, sr 15 | 16 | 17 | def save_wav(hp, wav, path): 18 | wav /= max(0.01, np.max(np.abs(wav))) 19 | wavfile.write(path, hp.sample_rate, (wav * 32766).astype(np.int16)) 20 | 21 | 22 | def preemphasis(hp, x): 23 | return signal.lfilter([1, -hp.preemphasis], [1], x) 24 | 25 | 26 | def inv_preemphasis(hp, x): 27 | return signal.lfilter([1], [1, -hp.preemphasis], x) 28 | 29 | 30 | def spectrogram(hp, y): 31 | if hp.preemphasis is None: 32 | D = _stft(hp, y) 33 | else: 34 | D = _stft(hp, preemphasis(hp, y)) 35 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 36 | return _normalize(hp, S) 37 | 38 | 39 | def inv_spectrogram(hp, spectrogram): 40 | '''Converts spectrogram to waveform using librosa''' 41 | S = _db_to_amp(_denormalize(hp, spectrogram) + hp.ref_level_db) # Convert back to linear 42 | return inv_preemphasis(hp, _griffin_lim(hp, S ** hp.griffin_lim_power)) # Reconstruct phase 43 | 44 | 45 | def inv_spectrogram_tensorflow(hp, spectrogram): 46 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 47 | 48 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 49 | inv_preemphasis on the output after running the graph. 
50 | ''' 51 | with tf.name_scope('griffin_lim'): 52 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(hp, spectrogram) + hp.ref_level_db) 53 | return _griffin_lim_tensorflow(hp, tf.pow(S, hp.griffin_lim_power)) 54 | 55 | 56 | def melspectrogram(hp, y): 57 | if hp.preemphasis is None: 58 | D = _stft(hp, y) 59 | else: 60 | D = _stft(hp, preemphasis(hp, y)) 61 | S = _amp_to_db(_linear_to_mel(hp, np.abs(D))) - hp.ref_level_db 62 | return _normalize(hp, S) 63 | 64 | 65 | def mfcc(hp, y): 66 | pass 67 | 68 | 69 | def find_endpoint(hp, wav, threshold_db=-40, min_silence_sec=0.8): 70 | window_length = int(hp.sample_rate * min_silence_sec) 71 | hop_length = int(window_length / 4) 72 | threshold = _db_to_amp(threshold_db) 73 | for x in range(hop_length, len(wav) - window_length, hop_length): 74 | if np.max(wav[x:x + window_length]) < threshold: 75 | return x + hop_length 76 | return len(wav) 77 | 78 | 79 | def _griffin_lim(hp, S): 80 | '''librosa implementation of Griffin-Lim 81 | Based on https://github.com/librosa/librosa/issues/434 82 | ''' 83 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 84 | S_complex = np.abs(S).astype(np.complex) 85 | y = _istft(hp, S_complex * angles) 86 | for i in range(hp.griffin_lim_iters): 87 | angles = np.exp(1j * np.angle(_stft(hp, y))) 88 | y = _istft(hp, S_complex * angles) 89 | return y # reconstructed wav 90 | 91 | 92 | def _griffin_lim_tensorflow(hp, S): 93 | '''TensorFlow implementation of Griffin-Lim 94 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 95 | ''' 96 | with tf.variable_scope('griffinlim'): 97 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 98 | S = tf.expand_dims(S, 0) 99 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 100 | y = _istft_tensorflow(hp, S_complex) 101 | for i in range(hp.griffin_lim_iters): 102 | est = _stft_tensorflow(hp, y) 103 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 104 | y = _istft_tensorflow(hp, S_complex * angles) 105 | return tf.squeeze(y, 0) 106 | 107 | 108 | def _stft(hp, y): 109 | n_fft, hop_length, win_length = _stft_parameters(hp) 110 | # shape (1 + n_fft/2, n_frames) 111 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 112 | 113 | 114 | def _istft(hp, y): 115 | _, hop_length, win_length = _stft_parameters(hp) 116 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 117 | 118 | 119 | def _stft_tensorflow(hp, signals): 120 | n_fft, hop_length, win_length = _stft_parameters(hp) 121 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 122 | 123 | 124 | def _istft_tensorflow(hp, stfts): 125 | n_fft, hop_length, win_length = _stft_parameters(hp) 126 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 127 | 128 | 129 | def _stft_parameters(hp): 130 | n_fft = hp.n_fft 131 | hop_length = int(hp.hop_ms / 1000 * hp.sample_rate) 132 | win_length = int(hp.win_ms / 1000 * hp.sample_rate) 133 | return n_fft, hop_length, win_length 134 | 135 | 136 | def _linear_to_mel(hp, spectrogram): 137 | global _mel_basis 138 | if _mel_basis is None: 139 | _mel_basis = _build_mel_basis(hp) 140 | return np.dot(_mel_basis, spectrogram) 141 | 142 | 143 | def _build_mel_basis(hp): 144 | n_fft = hp.n_fft 145 | return librosa.filters.mel(hp.sample_rate, n_fft, n_mels=hp.num_mels) 146 | 147 | 148 | def _amp_to_db(x): 149 | # return 20 * np.log10(np.maximum(1e-5, x)) 150 | return 20 * 
np.log10(np.maximum(1e-4, x)) # floor of -80 dB, since ref_level_db is subtracted afterwards 151 | 152 | 153 | def _db_to_amp(x): 154 | return np.power(10.0, x * 0.05) 155 | 156 | 157 | def _db_to_amp_tensorflow(x): 158 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 159 | 160 | 161 | def _normalize(hp, S): 162 | # This is questionable: any S > 0 gets clipped, i.e. everything louder 163 | # than ref_level_db is flattened down to ref_level_db 164 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 165 | 166 | 167 | def _denormalize(hp, S): 168 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 169 | 170 | 171 | def _denormalize_tensorflow(hp, S): 172 | return (tf.clip_by_value(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 173 | -------------------------------------------------------------------------------- /utils/ce_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def ce_loss(soft_labels, logits): 5 | probs = tf.clip_by_value(tf.nn.softmax(logits, axis=-1), 1e-10, 10) 6 | ce = -tf.reduce_mean(tf.reduce_sum(soft_labels * tf.log(probs), axis=-1)) 7 | return ce 8 |
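A small usage sketch for ce_loss (the values and the import path are illustrative assumptions, not code from this repo):

import tensorflow as tf
from utils.ce_loss_util import ce_loss  # assumed import path

# Hypothetical batch of 2 samples with 4-way soft emotion labels.
soft_labels = tf.constant([[0.7, 0.1, 0.1, 0.1],
                           [0.25, 0.25, 0.25, 0.25]], tf.float32)
logits = tf.constant([[2.0, 0.0, 0.0, 0.0],
                      [0.5, 0.5, 0.5, 0.5]], tf.float32)
loss = ce_loss(soft_labels, logits)  # scalar soft-label cross entropy
with tf.Session() as sess:
    print(sess.run(loss))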
-------------------------------------------------------------------------------- /utils/center_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def calc_center_loss(features, centers, emo_labels, is_l1=True): 5 | """ 6 | Args: 7 | features: [batch_size, dim] 8 | centers: [emo_num, dim] 9 | emo_labels: [batch_size, emo_num] 10 | Returns: 11 | the scalar center loss, weighted by the soft emotion labels 12 | """ 13 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 14 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim] 15 | diff = features_ - centers_ # [batch_size, emo_num, dim] 16 | dist = tf.reduce_sum(tf.square(diff), axis=-1) # [batch_size, emo_num] 17 | if is_l1: 18 | dist = tf.sqrt(dist) 19 | loss = tf.reduce_mean(tf.reduce_sum(dist * emo_labels, axis=-1)) 20 | return loss 21 | 22 | 23 | def update_center(features, centers, emo_labels, alpha): 24 | """ 25 | Args: 26 | features: [batch_size, dim] 27 | centers: [emo_num, dim] 28 | emo_labels: [batch_size, emo_num] 29 | Returns: the op that moves the centers towards the weighted feature means 30 | """ 31 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 32 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim] 33 | emo_labels_ = tf.expand_dims(emo_labels, -1) # [batch_size, emo_num, 1] 34 | diff = features_ - centers_ # [batch_size, emo_num, dim] 35 | weighted_emo_diff = diff * emo_labels_ # [batch_size, emo_num, dim] 36 | sum_emo_diff = tf.reduce_sum(weighted_emo_diff, axis=0) # [emo_num, dim] 37 | emo_sum = tf.clip_by_value(tf.reduce_sum(emo_labels_, axis=0), 0.001, 100) # [emo_num, 1] 38 | alpha = tf.clip_by_value(alpha, 0., 1.) 39 | c_diff = tf.math.divide(sum_emo_diff, emo_sum) # [emo_num, dim] 40 | alpha_c_diff = alpha * c_diff 41 | update_center_op = tf.assign_add(centers, alpha_c_diff) 42 | return update_center_op 43 | 44 | 45 | def test_calc_center_loss(): 46 | with tf.Session() as sess: 47 | centers = tf.Variable([[1, -1], [0, 1]], trainable=False, dtype=tf.float32) 48 | features = tf.constant([[-1, 0], [-1, -1], [2, 0]], dtype=tf.float32) 49 | emo_labels = tf.constant([[0, 1.0], [0, 1.0], [0, 1.0]], dtype=tf.float32) 50 | sess.run(tf.global_variables_initializer()) 51 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 52 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 53 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 54 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 55 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 56 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 57 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 58 | print(sess.run(centers)) 59 | # loss = sess.run(calc_center_loss(features, centers, emo_labels, is_l1=True)) 60 | # print(loss) 61 | 62 | 63 | if __name__ == '__main__': 64 | test_calc_center_loss() 65 |
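A sketch of how these two helpers are typically wired into a training step (the variable names, sizes and the control-dependency pattern are assumptions for illustration, not code from this repo):

import tensorflow as tf
from utils.center_loss_util import calc_center_loss, update_center  # assumed path

emo_num, dim = 4, 256
features = tf.placeholder(tf.float32, [None, dim])
emo_labels = tf.placeholder(tf.float32, [None, emo_num])
centers = tf.Variable(tf.zeros([emo_num, dim]), trainable=False)

center_loss = calc_center_loss(features, centers, emo_labels, is_l1=True)
# Ensure the centers are updated whenever the loss is evaluated.
with tf.control_dependencies([update_center(features, centers, emo_labels, alpha=0.5)]):
    center_loss = tf.identity(center_loss)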
-------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tqdm import tqdm 4 | from functools import partial 5 | from concurrent.futures import ProcessPoolExecutor 6 | 7 | 8 | def bytes_feature(value): 9 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 10 | 11 | 12 | def float_feature(value): 13 | return tf.train.Feature(float_list=tf.train.FloatList(value=[value])) 14 | 15 | 16 | def int64_feature(value): 17 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 18 | 19 | 20 | def data_process_pipeline(hp_or_obj, meta_file, process_one_line_fun=None, 21 | postprocess_fun=None, max_workers=None, **kwargs): 22 | """This func reads the meta file and runs custom funcs to get the final data 23 | 24 | This func performs the following data preprocessing pipeline: 25 | 01 read the meta file and run the meta_fun func to parse each line in the meta file; the meta_fun 26 | must return a tuple of two elements: data samples (e.g., wavs or images) and labels dicts 27 | 02 pass the obtained data samples to the feature_fun func to get feature data 28 | 03 the list of feature samples is passed to the postprocess_fun to do some postprocessing, 29 | e.g., normalizations, fixed length padding 30 | 31 | # Arguments 32 | hp: the hyper parameter object of type 'HParams', or another type of object whose attributes 33 | expose all hyper parameters 34 | meta_file: the meta file where each line generally contains the path of a data sample and its labels 35 | meta_fun: this func takes a line in meta_file and the hp object as inputs and returns a tuple of 36 | data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature 37 | of meta_fun is: def fun_name(hp, line), and its return value is: ([data samples], [labels dicts]), 38 | or (None, None) if no sample is returned. Note: even if only one sample is returned, it must 39 | still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}. 40 | Note: this can return (None, None) because some samples may be filtered out, e.g., when their 41 | length does not meet the requirements 42 | feature_fun: this func takes a single data sample returned by meta_fun and the hp object as inputs, 43 | and returns a single feature sample. The signature is: def feature_fun(hp, sample) 44 | postprocess_fun: this func takes the list of all feature samples and hp as inputs and returns the 45 | postprocessed list of feature samples, with the same length. Generally, feature normalizations, 46 | fixed length padding and sorting the samples by length are performed in this func. The signature 47 | is: def postprocess_fun(hp, features) 48 | kwargs: extra keyword arguments that will be passed through to all the funcs. 49 | 50 | # Returns 51 | A tuple of length 2: the first element is the list of all postprocessed features, and the second element 52 | is the list of all labels (each label is a list converted from the labels dict). For example, the return 53 | value can be: ([sample1, .., samplen], [[label1_1, label2_2], [label2_1, label_22], .., [labeln_1, labeln_2]]) 54 | 55 | # Exceptions 56 | TypeError: if either value returned by meta_fun is not a list 57 | """ 58 | if process_one_line_fun is None and not hasattr(hp_or_obj, 'process_one_line'): 59 | raise ValueError('hp_or_obj without process_one_line method and process_one_line_fun is None') 60 | if postprocess_fun is None and not hasattr(hp_or_obj, 'postprocess'): 61 | raise ValueError('hp_or_obj without postprocess method and postprocess_fun is None') 62 | 63 | with open(meta_file) as fr: 64 | lines = [line for line in fr if line.strip() and line[0] != '#'] 65 | 66 | if hasattr(hp_or_obj, 'process_one_line'): 67 | process_one_line_fun = type(hp_or_obj).process_one_line 68 | if hasattr(hp_or_obj, 'postprocess'): 69 | postprocess_fun = type(hp_or_obj).postprocess 70 | 71 | # Process the meta lines to get samples and labels 72 | print(' step 1: parsing meta and getting features ...') 73 | num = len(lines) 74 | hps = [hp_or_obj] * num 75 | features, labels = [], [] 76 | with ProcessPoolExecutor(max_workers) as p: 77 | for r in tqdm(p.map(partial(process_one_line_fun, **kwargs), hps, lines), total=num): 78 | ds, ls = r 79 | if (ds, ls) != (None, None): 80 | if type(ds) != list or type(ls) != list: 81 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 82 | features += ds 83 | labels += ls 84 | ''' 85 | for line in tqdm(lines): 86 | ds, ls = process_one_line_fun(hp_or_obj, line) 87 | if (ds, ls) != (None, None): 88 | if type(ds) != list or type(ls) != list: 89 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 90 | features += ds 91 | labels += ls 92 | ''' 93 | 94 | # np.save('recola_wav01_mel_nonorm_before_post.npy', features[0]) 95 | # print('DEBUG before post', features[0].shape, labels[0]) 96 | # Postprocess 97 | print(' step 2: postprocessing for features and labels ...') 98 | features, labels = postprocess_fun(hp_or_obj, features, labels, **kwargs) 99 | return features, labels 100 | 101 |
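A minimal sketch of the two hooks this pipeline expects (the meta format, feature choice and threshold below are illustrative assumptions):

import numpy as np

def process_one_line(hp, line, **kwargs):
    # assumed meta line format: "mel_npy_path|label"
    path, label = line.strip().split('|')
    feature = np.load(path)        # e.g., a precomputed mel spectrogram
    if len(feature) < 10:          # too short -> filter out with (None, None)
        return None, None
    return [feature], [{'L1': int(label)}]

def postprocess(hp, features, labels, **kwargs):
    # e.g., sort samples by length so that batches are homogeneous
    order = np.argsort([len(f) for f in features])
    return [features[i] for i in order], [labels[i] for i in order]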
102 | def get_class_weights(class_nums, type=0, power=1): 103 | if type == 0: 104 | return [1.] * len(class_nums) 105 | # Recompute class_nums and the total according to power 106 | total, class_ws = 0, class_nums.copy() 107 | for cls in range(len(class_ws)): 108 | class_ws[cls] = class_ws[cls] ** power 109 | total += class_ws[cls] 110 | # Invert the weights, then divide by their mean so they average to 1 (following Lao Dai's approach) 111 | if type == 1: 112 | wsum = 0 113 | for cls in range(len(class_ws)): 114 | class_ws[cls] = 1 / class_ws[cls] 115 | wsum += class_ws[cls] 116 | wmean = wsum / len(class_nums) 117 | class_ws = [w / wmean for w in class_ws] 118 | # Invert and multiply by total / 2, see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data 119 | elif type == 2: 120 | class_ws = [0.5 * total / w for w in class_ws] 121 | return class_ws 122 |
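A worked example of the three weighting modes (hypothetical counts of 100 and 300 samples, power=1):

print(get_class_weights([100, 300], type=0))  # [1.0, 1.0]     (no weighting)
print(get_class_weights([100, 300], type=1))  # [1.5, 0.5]     (inverse counts, normalized to mean 1)
print(get_class_weights([100, 300], type=2))  # [2.0, 0.666..] (0.5 * total / count)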
-------------------------------------------------------------------------------- /utils/data.py.bak-0707: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from functools import partial 3 | from concurrent.futures import ProcessPoolExecutor 4 | 5 | 6 | def data_process_pipeline(hp, meta_file, meta_fun, feature_fun, postprocess_fun, 7 | max_workers=None, **kwargs): 8 | """This func reads the meta file and runs three custom funcs to get the final data 9 | 10 | This func performs the following data preprocessing pipeline: 11 | 01 read the meta file and run the meta_fun func to parse each line in the meta file; the meta_fun 12 | must return a tuple of two elements: data samples (e.g., wavs or images) and labels dicts 13 | 02 pass the obtained data samples to the feature_fun func to get feature data 14 | 03 the list of feature samples is passed to the postprocess_fun to do some postprocessing, 15 | e.g., normalizations, fixed length padding 16 | 17 | # Arguments 18 | hp: the hyper parameter object of type 'HParams', or another type of object whose attributes 19 | expose all hyper parameters 20 | meta_file: the meta file where each line generally contains the path of a data sample and its labels 21 | meta_fun: this func takes a line in meta_file and the hp object as inputs and returns a tuple of 22 | data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature 23 | of meta_fun is: def fun_name(hp, line), and its return value is: ([data samples], [labels dicts]), 24 | or (None, None) if no sample is returned. Note: even if only one sample is returned, it must 25 | still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}. 26 | Note: this can return (None, None) because some samples may be filtered out, e.g., when their 27 | length does not meet the requirements 28 | feature_fun: this func takes a single data sample returned by meta_fun and the hp object as inputs, 29 | and returns a single feature sample. The signature is: def feature_fun(hp, sample) 30 | postprocess_fun: this func takes the list of all feature samples and hp as inputs and returns the 31 | postprocessed list of feature samples, with the same length. Generally, feature normalizations, 32 | fixed length padding and sorting the samples by length are performed in this func. The signature 33 | is: def postprocess_fun(hp, features) 34 | kwargs: extra keyword arguments that will be passed to all three funcs. 35 | 36 | # Returns 37 | A tuple of length 2: the first element is the list of all postprocessed features, and the second element 38 | is the list of all labels (each label is a list converted from the labels dict). For example, the return 39 | value can be: ([sample1, .., samplen], [[label1_1, label2_2], [label2_1, label_22], .., [labeln_1, labeln_2]]) 40 | 41 | # Exceptions 42 | TypeError: if either value returned by meta_fun is not a list 43 | """ 44 | with open(meta_file) as fr: 45 | lines = fr.readlines() 46 | 47 | # Process the meta lines to get samples and labels 48 | print('\n[Beginning to process data ...]') 49 | print('step 1: parsing meta and loading original data samples ...') 50 | datas, labels = [], [] 51 | if max_workers == -1: 52 | for line in tqdm(lines): 53 | ds, ls, info = meta_fun(hp, line, **kwargs) 54 | if (ds, ls) != (None, None): 55 | if type(ds) != list or type(ls) != list: 56 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 57 | datas += ds 58 | labels += ls 59 | else: 60 | num = len(lines) 61 | hps = [hp] * num 62 | with ProcessPoolExecutor(max_workers) as p: 63 | for r in tqdm(p.map(partial(meta_fun, **kwargs), hps, lines), total=num): 64 | ds, ls, info = r 65 | if (ds, ls) != (None, None): 66 | if type(ds) != list or type(ls) != list: 67 | raise TypeError('meta_fun func must return a tuple of "list", not {} or {}'.format(type(ds), type(ls))) 68 | datas += ds 69 | labels += ls 70 | hp.sr = info['sr'] 71 | 72 | # Process the samples to get features 73 | print('step 2: getting features from original data samples ...') 74 | if max_workers == -1: 75 | datas = [feature_fun(hp, x, **kwargs) for x in tqdm(datas)] 76 | else: 77 | num = len(datas) 78 | hps = [hp] * num 79 | with ProcessPoolExecutor(max_workers) as p: 80 | datas = [r for r in tqdm(p.map(partial(feature_fun, **kwargs), hps, datas), total=num)] 81 | 82 | # Postprocess 83 | print('step 3: postprocessing for features ...') 84 | datas = postprocess_fun(hp, datas, **kwargs) 85 | 86 | for i in range(len(labels)): 87 | labels[i] = list(labels[i].values()) # a list is better here; tuples cannot be modified 88 | return datas, labels 89 | -------------------------------------------------------------------------------- /utils/debug.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def debug_print(*args, **kwargs): 5 | print_op = tf.print(*args, **kwargs) 6 | tf.add_to_collection('print_ops', print_op) 7 | 8 | 9 | def get_ops(): 10 | return tf.get_collection('print_ops') 11 |
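A sketch of how these two helpers are meant to be used in a TF1 graph (the tensors here are illustrative):

import tensorflow as tf
from utils.debug import debug_print, get_ops  # assumed import path

x = tf.constant([1.0, 2.0, 3.0])
y = tf.reduce_sum(x)
debug_print('x =', x, 'sum =', y)  # registers a tf.print op in the collection

with tf.Session() as sess:
    # run the registered print ops alongside the real fetches
    sess.run([y] + get_ops())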
-------------------------------------------------------------------------------- /utils/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False, is_print=True): 26 | if is_print: 27 | print(msg) 28 | if _file is not None: 29 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 30 | if slack and _slack_url is not None: 31 | Thread(target=_send_slack, args=(msg,)).start() 32 | 33 | 34 | def _close_logfile(): 35 | global _file 36 | if _file is not None: 37 | _file.close() 38 | _file = None 39 | 40 | 41 | def _send_slack(msg): 42 | req = Request(_slack_url) 43 | req.add_header('Content-Type', 'application/json') 44 | urlopen(req, json.dumps({ 45 | 'username': 'tacotron', 46 | 'icon_emoji': ':taco:', 47 | 'text': '*%s*: %s' % (_run_name, msg) 48 | }).encode()) 49 | 50 | 51 | atexit.register(_close_logfile) 52 | -------------------------------------------------------------------------------- /utils/ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def shape_list(x): 5 | """Return list of dims, statically where possible.""" 6 | x = tf.convert_to_tensor(x) 7 | 8 | # If unknown rank, return dynamic shape 9 | if x.get_shape().dims is None: 10 | return tf.shape(x) 11 | 12 | static = x.get_shape().as_list() 13 | shape = tf.shape(x) 14 | 15 | ret = [] 16 | for i in range(len(static)): 17 | dim = static[i] 18 | if dim is None: 19 | dim = shape[i] 20 | ret.append(dim) 21 | return ret 22 |
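Example use of shape_list: it mixes static Python ints with dynamic Tensors, which keeps shape-dependent ops readable (a sketch; the placeholder shape is illustrative):

import tensorflow as tf
from utils.ops import shape_list  # assumed import path

x = tf.placeholder(tf.float32, [None, 80])
dims = shape_list(x)                 # [<dynamic batch Tensor>, 80]
y = tf.reshape(x, [dims[0], 40, 2])  # static dims stay plain ints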
-------------------------------------------------------------------------------- /utils/parameter.py: -------------------------------------------------------------------------------- 1 | import six 2 | import json 3 | 4 | 5 | # hyper parameter util class 6 | class HParams: 7 | def __init__(self, **kwargs): 8 | """A simple alternative implementation for tf.contrib.training.HParams 9 | 10 | # Arguments 11 | kwargs: all keyword parameters, which will be added as instance attributes 12 | and used as hyper parameters 13 | """ 14 | for k, v in six.iteritems(kwargs): 15 | self.add_hparam(k, v) 16 | 17 | def add_hparam(self, name, value): 18 | """add a new hyperparameter given a name and value 19 | 20 | if name is an existing hyperparameter, its value is 21 | updated to the new value 22 | 23 | # Arguments 24 | name: str name of the new hyperparameter to be added 25 | value: the value of the new hyperparameter 26 | """ 27 | setattr(self, name, value) 28 | 29 | def del_hparam(self, name): 30 | """delete the hyperparameter named name 31 | 32 | # Arguments 33 | name: str name of the hyperparameter to be deleted 34 | """ 35 | delattr(self, name) 36 | 37 | def update(self, D, **kwargs): 38 | """update or add hyper parameters 39 | 40 | # Arguments 41 | D: an object that has a keys() method, or that can be iterated 42 | as for k, v in D 43 | kwargs: extra keyword arguments for updating hyper parameters 44 | """ 45 | self.__dict__.update(D, **kwargs) 46 | 47 | def parse(self, values): 48 | """parse a str delimited by ';' and update the attributes from it 49 | 50 | Note: we use ';' as the delimiter, not ',' as in tf.contrib.training.HParams, 51 | because ',' would conflict with the ',' delimiter inside lists and dicts 52 | 53 | # Arguments 54 | values: a str containing hyper parameters, separated by ';' 55 | and paired with '=', e.g., 'epochs=20;learning_rate=0.001' 56 | """ 57 | pairs = values.split(";") 58 | pairs = [x.strip().split("=") for x in pairs if x.strip() and '=' in x] 59 | dict_pairs = dict(pairs) 60 | for k in dict_pairs: 61 | if k not in self.__dict__: 62 | raise KeyError('cannot parse a non-existing hyperparameter:"{}"'.format(k)) 63 | # self.__dict__[k] = type(self.__dict__[k])(dict_pairs[k]) # cannot parse dict or list elements this way 64 | try: 65 | v = json.loads(dict_pairs[k]) # note: if a value is a dict, its keys must be strings (a JSON requirement) 66 | except json.JSONDecodeError: 67 | v = json.loads('"' + dict_pairs[k] + '"') # parsing a bare string like hello fails; it must be quoted as "hello" 68 | self.__dict__[k] = v 69 | return self 70 | 71 | def print(self): 72 | """this func prints all hyper parameters""" 73 | print('\n\n') 74 | print('--------------------------------------------------') 75 | print('All Hyper Parameters:') 76 | print('--------------------------------------------------') 77 | hps = self.__dict__ 78 | for hp in hps: 79 | print(' {}={}'.format(hp, hps[hp])) 80 | print('--------------------------------------------------') 81 | print('\n\n') 82 | 83 | def to_string(self): 84 | hp = '\n' 85 | hp += '--------------------------------------------------\n' 86 | hp += 'All Hyper Parameters:\n' 87 | hp += '--------------------------------------------------\n' 88 | hps = self.__dict__ 89 | for k in hps: 90 | hp += ' {}={}\n'.format(k, hps[k]) 91 | hp += '--------------------------------------------------\n' 92 | hp += '\n' 93 | return hp 94 |
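Usage sketch for HParams.parse (note the ';' delimiter; values are decoded as JSON where possible):

from utils.parameter import HParams  # assumed import path

hp = HParams(epochs=20, learning_rate=0.001, dims=[256, 256])
hp.parse('epochs=50; learning_rate=0.0005; dims=[128, 128]')
print(hp.epochs, hp.learning_rate, hp.dims)  # 50 0.0005 [128, 128]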
-------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def plot_alignment(alignment, path, info=None, title=None, text=None): 8 | """ 9 | # Arguments 10 | text: the text str where each char is used to draw yticks 11 | """ 12 | if text is None: 13 | figsize = None 14 | yticks = None 15 | ytick_labels = None 16 | else: 17 | yticks = np.arange(len(text)) 18 | ytick_labels = list(text) 19 | ytick_labels[-1] = len(text) 20 | figsize = (0.02 * alignment.shape[1], 0.10 * len(text)) 21 | 22 | fig, ax = plt.subplots(figsize=figsize) 23 | im = ax.imshow(alignment, 24 | aspect='auto', 25 | origin='lower', 26 | interpolation='none') 27 | fig.colorbar(im, ax=ax) 28 | xlabel = 'Decoder timestep' 29 | if info is not None: 30 | xlabel += '\n\n' + info 31 | plt.xlabel(xlabel) 32 | plt.ylabel('Encoder timestep') 33 | plt.yticks(yticks, ytick_labels) 34 | plt.title(title) 35 | plt.tight_layout() 36 | plt.savefig(path, format='png') 37 | plt.close('all') 38 | 39 | 40 | def plot_mel(mel, path, info=None, title=None, gt_mel=None): 41 | nrows = 1 if gt_mel is None else 2 42 | fig, ax = plt.subplots(nrows, squeeze=False) 43 | 44 | def plot(mel, ax, y_label='pred_freq'): 45 | im = ax.imshow(mel, 46 | aspect='auto', 47 | origin='lower', 48 | interpolation='none') 49 | ax.set_ylabel(y_label) 50 | fig.colorbar(im, ax=ax) 51 | 52 | plot(mel.T, ax[0][0]) # mel shape [time_step, num_mels] 53 | ax[0][0].set_title(title) 54 | 55 | if gt_mel is not None: 56 | plot(gt_mel.T, ax[1][0], y_label='truth_freq') 57 | 58 | plt.xlabel(info or 'time step') 59 | plt.tight_layout() 60 | plt.savefig(path, format='png') 61 | plt.close('all') 62 | -------------------------------------------------------------------------------- /utils/tool_wrappers.py: -------------------------------------------------------------------------------- 1 | from tensorflow import keras 2 | 3 | 4 | def get_loss(loss): 5 | if type(loss) == str or type(loss) == dict: 6 | loss_name, kwargs = loss, {} 7 | if type(loss) == dict: 8 | loss_name = loss['loss_name'] 9 | kwargs = {k: v for k, v in loss.items() if k != 'loss_name'} 10 | kwargs.setdefault('name', loss_name) 11 | 12 | if loss_name == 'scce': 13 | kwargs.setdefault('from_logits', True) 14 | return keras.losses.SparseCategoricalCrossentropy(**kwargs) 15 | elif loss_name == 'cce': 16 | kwargs.setdefault('from_logits', True) 17 | return keras.losses.CategoricalCrossentropy(**kwargs) 18 | elif loss_name == 'bce': 19 | kwargs.setdefault('from_logits', True) 20 | return keras.losses.BinaryCrossentropy(**kwargs) 21 | elif loss_name == 'mae': 22 | return keras.losses.MeanAbsoluteError(**kwargs) 23 | elif loss_name == 'mse': 24 | return keras.losses.MeanSquaredError(**kwargs) 25 | elif loss_name == 'focal': 26 | raise NotImplementedError('focal loss is not implemented yet') 27 | else: 28 | raise ValueError('{} is an unsupported loss for now'.format(loss)) 29 | if callable(loss): 30 | return loss 31 | raise TypeError('type of loss must be str, dict or callable, but {} is found'.format(type(loss))) 32 | 33 | 34 | def get_metric(metric): 35 | if type(metric) == str or type(metric) == dict: 36 | metric_name, kwargs = metric, {} 37 | if type(metric) == dict: 38 | metric_name = metric['metric_name'] 39 | kwargs = {k: v for k, v in metric.items() if k != 'metric_name'} 40 | metric_name = metric_name.lower() 41 | kwargs.setdefault('name', metric_name) 42 | 43 | if metric_name == 'sca': 44 | return keras.metrics.SparseCategoricalAccuracy(**kwargs) 45 | elif metric_name == 'ca': 46 | return keras.metrics.CategoricalAccuracy(**kwargs) 47 | elif metric_name == 'ba': 48 | return keras.metrics.BinaryAccuracy(**kwargs) 49 | elif metric_name == 'recall': 50 | return keras.metrics.Recall(**kwargs) 51 | elif metric_name == 'precision': 52 | return keras.metrics.Precision(**kwargs) 53 | elif metric_name == 'mae': 54 | return keras.metrics.MeanAbsoluteError(**kwargs) 55 | elif metric_name == 'mse': 56 | return keras.metrics.MeanSquaredError(**kwargs) 57 | else: 58 | raise ValueError('{} is an unsupported metric for now'.format(metric)) 59 | if callable(metric): 60 | return metric 61 | raise TypeError('type of metric must be str, dict or callable, but {} is found'.format(type(metric))) 62 | 63 | 64 | def get_optimizer(optimizer, lr_schedule=0.001, **kwargs): 65 | if type(optimizer) == str: 66 | optimizer = optimizer.lower() 67 | if optimizer == 'adam': 68 | return keras.optimizers.Adam(learning_rate=lr_schedule, **kwargs) 69 | elif optimizer == 'sgd': 70 | return keras.optimizers.SGD(learning_rate=lr_schedule, **kwargs) 71 | elif optimizer == 'rmsprop': 72 | return keras.optimizers.RMSprop(learning_rate=lr_schedule, **kwargs) 73 | else: 74 | raise ValueError('{} is an unsupported optimizer for now'.format(optimizer)) 75 | if isinstance(optimizer, keras.optimizers.Optimizer): 76 | return optimizer 77 | raise TypeError('type of optimizer must be str or a keras.optimizers.Optimizer, but {} is found'.format(type(optimizer))) 78 | 79 | 80 | def get_regularizer(reg): 81 | if reg is None: 82 | return None 83 | if type(reg) == str: 84 | return keras.regularizers.get(reg.lower()) 85 | if type(reg) == dict: 86 | if 'l1' in reg and 'l2' in reg: 87 | return keras.regularizers.l1_l2(l1=reg['l1'], l2=reg['l2']) 88 | elif 'l1' in reg: 89 | return keras.regularizers.l1(reg['l1']) 90 | elif 'l2' in reg: 91 | return keras.regularizers.l2(reg['l2']) 92 | else: 93 | raise ValueError('the dict keys for regularizer must be "l1", "l2", or both of them') 94 | if callable(reg): 95 | return reg 96 | raise TypeError(f'type of regularizer must be None, str, dict or callable, but {type(reg)} is found') 97 |
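A short usage sketch for these wrappers in a Keras compile call (the model here is a stand-in):

from tensorflow import keras
from utils.tool_wrappers import get_loss, get_metric, get_optimizer  # assumed path

model = keras.Sequential([keras.layers.Dense(4)])
model.compile(optimizer=get_optimizer('adam', lr_schedule=0.001),
              loss=get_loss({'loss_name': 'cce', 'from_logits': True}),
              metrics=[get_metric('ca')])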
return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /utils/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import tensorflow as tf 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | 7 | 8 | _mel_basis = None 9 | 10 | 11 | def load_wav(hp, path): 12 | wav, sr = librosa.core.load(path, sr=hp.sample_rate) 13 | wav = wav / np.abs(wav).max() * 0.999 14 | return wav, sr 15 | 16 | 17 | def save_wav(hp, wav, path): 18 | wav /= max(0.01, np.max(np.abs(wav))) 19 | wavfile.write(path, hp.sample_rate, (wav * 32766).astype(np.int16)) 20 | 21 | 22 | def preemphasis(hp, x): 23 | return signal.lfilter([1, -hp.preemphasis], [1], x) 24 | 25 | 26 | def inv_preemphasis(hp, x): 27 | return signal.lfilter([1], [1, -hp.preemphasis], x) 28 | 29 | 30 | def spectrogram(hp, y): 31 | if hp.preemphasis is None: 32 | D = _stft(hp, y) 33 | else: 34 | D = _stft(hp, preemphasis(hp, y)) 35 | S = _amp_to_db(np.abs(D)) - hp.ref_level_db 36 | return _normalize(hp, S) 37 | 38 | 39 | def inv_spectrogram(hp, spectrogram): 40 | '''Converts spectrogram to waveform using librosa''' 41 | S = _db_to_amp(_denormalize(hp, spectrogram) + hp.ref_level_db) # Convert back to linear 42 | return inv_preemphasis(hp, _griffin_lim(hp, S ** hp.griffin_lim_power)) # Reconstruct phase 43 | 44 | 45 | def inv_spectrogram_tensorflow(hp, spectrogram): 46 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 47 | 48 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 49 | inv_preemphasis on the output after running the graph. 
50 | ''' 51 | with tf.name_scope('griffin_lim'): 52 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(hp, spectrogram) + hp.ref_level_db) 53 | return _griffin_lim_tensorflow(hp, tf.pow(S, hp.griffin_lim_power)) 54 | 55 | 56 | def melspectrogram(hp, y): 57 | if hp.preemphasis is None: 58 | D = _stft(hp, y) 59 | else: 60 | D = _stft(hp, preemphasis(hp, y)) 61 | S = _amp_to_db(_linear_to_mel(hp, np.abs(D))) - hp.ref_level_db 62 | return _normalize(hp, S) 63 | 64 | 65 | def mfcc(hp, y): 66 | pass 67 | 68 | 69 | def find_endpoint(hp, wav, threshold_db=-40, min_silence_sec=0.8): 70 | window_length = int(hp.sample_rate * min_silence_sec) 71 | hop_length = int(window_length / 4) 72 | threshold = _db_to_amp(threshold_db) 73 | for x in range(hop_length, len(wav) - window_length, hop_length): 74 | if np.max(wav[x:x + window_length]) < threshold: 75 | return x + hop_length 76 | return len(wav) 77 | 78 | 79 | def _griffin_lim(hp, S): 80 | '''librosa implementation of Griffin-Lim 81 | Based on https://github.com/librosa/librosa/issues/434 82 | ''' 83 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 84 | S_complex = np.abs(S).astype(np.complex) 85 | y = _istft(hp, S_complex * angles) 86 | for i in range(hp.griffin_lim_iters): 87 | angles = np.exp(1j * np.angle(_stft(hp, y))) 88 | y = _istft(hp, S_complex * angles) 89 | return y # reconstructed wav 90 | 91 | 92 | def _griffin_lim_tensorflow(hp, S): 93 | '''TensorFlow implementation of Griffin-Lim 94 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 95 | ''' 96 | with tf.variable_scope('griffinlim'): 97 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 98 | S = tf.expand_dims(S, 0) 99 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 100 | y = _istft_tensorflow(hp, S_complex) 101 | for i in range(hp.griffin_lim_iters): 102 | est = _stft_tensorflow(hp, y) 103 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 104 | y = _istft_tensorflow(hp, S_complex * angles) 105 | return tf.squeeze(y, 0) 106 | 107 | 108 | def _stft(hp, y): 109 | n_fft, hop_length, win_length = _stft_parameters(hp) 110 | # shape (1 + n_fft/2, n_frames) 111 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 112 | 113 | 114 | def _istft(hp, y): 115 | _, hop_length, win_length = _stft_parameters(hp) 116 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 117 | 118 | 119 | def _stft_tensorflow(hp, signals): 120 | n_fft, hop_length, win_length = _stft_parameters(hp) 121 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 122 | 123 | 124 | def _istft_tensorflow(hp, stfts): 125 | n_fft, hop_length, win_length = _stft_parameters(hp) 126 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 127 | 128 | 129 | def _stft_parameters(hp): 130 | n_fft = hp.n_fft 131 | hop_length = int(hp.hop_ms / 1000 * hp.sample_rate) 132 | win_length = int(hp.win_ms / 1000 * hp.sample_rate) 133 | return n_fft, hop_length, win_length 134 | 135 | 136 | def _linear_to_mel(hp, spectrogram): 137 | global _mel_basis 138 | if _mel_basis is None: 139 | _mel_basis = _build_mel_basis(hp) 140 | return np.dot(_mel_basis, spectrogram) 141 | 142 | 143 | def _build_mel_basis(hp): 144 | n_fft = hp.n_fft 145 | return librosa.filters.mel(hp.sample_rate, n_fft, n_mels=hp.num_mels) 146 | 147 | 148 | def _amp_to_db(x): 149 | # return 20 * np.log10(np.maximum(1e-5, x)) 150 | return 20 * 
np.log10(np.maximum(1e-4, x)) # 最小为-80dB, 因为还有减去ref_dB 151 | 152 | 153 | def _db_to_amp(x): 154 | return np.power(10.0, x * 0.05) 155 | 156 | 157 | def _db_to_amp_tensorflow(x): 158 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 159 | 160 | 161 | def _normalize(hp, S): 162 | # 这个做法存疑, 因为S>0时, 都会被截断成0, 即如果S>ref_db, 都会 163 | # 变成ref_db 164 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 165 | 166 | 167 | def _denormalize(hp, S): 168 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 169 | 170 | 171 | def _denormalize_tensorflow(hp, S): 172 | return (tf.clip_by_value(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 173 | -------------------------------------------------------------------------------- /utils/utils/ce_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def ce_loss(soft_labels, logits): 5 | probs = tf.clip_by_value(tf.nn.softmax(logits, axis=-1), 1e-10, 10) 6 | ce = -tf.reduce_mean(tf.reduce_sum(soft_labels * tf.log(probs), axis=-1)) 7 | return ce 8 | -------------------------------------------------------------------------------- /utils/utils/center_loss_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def calc_center_loss(features, centers, emo_labels, is_l1=True): 5 | """ 6 | Args: 7 | features: [batch_size, dim] 8 | centers: [emo_num, dim] 9 | emo_labels: [batch_size, emo_num] 10 | Returns: 11 | 12 | """ 13 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 14 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim[ 15 | diff = features_ - centers_ # [batch_size, emo_num, dim] 16 | dist = tf.reduce_sum(tf.square(diff), axis=-1) # [batch_size, emo_num] 17 | if is_l1: 18 | dist = tf.sqrt(dist) 19 | loss = tf.reduce_mean(tf.reduce_sum(dist * emo_labels, axis=-1)) 20 | return loss 21 | 22 | 23 | def update_center(features, centers, emo_labels, alpha): 24 | """ 25 | Args: 26 | features: [batch_size, dim] 27 | centers: [emo_num, dim] 28 | emo_labels: [batch_size, emo_num] 29 | Returns: 30 | """ 31 | features_ = tf.expand_dims(features, 1) # [batch_size, 1, dim] 32 | centers_ = tf.expand_dims(centers, 0) # [1, emo_num, dim] 33 | emo_labels_ = tf.expand_dims(emo_labels, -1) # [batch_size, emo_num, 1] 34 | diff = features_ - centers_ # [batch_size, emo_num, dim] 35 | weighted_emo_diff = diff * emo_labels_ # [batch_size, emo_num, dim] 36 | sum_emo_diff = tf.reduce_sum(weighted_emo_diff, axis=0) # [emo_num, dim] 37 | emo_sum = tf.clip_by_value(tf.reduce_sum(emo_labels_, axis=0), 0.001, 100) # [emo_num, 1] 38 | alpha = tf.clip_by_value(alpha, 0., 1.) 
39 | c_diff = tf.math.divide(sum_emo_diff, emo_sum) # [emo_num, dim] 40 | alpha_c_diff = alpha * c_diff 41 | update_center_op = tf.assign_add(centers, alpha_c_diff) 42 | return update_center_op 43 | 44 | 45 | def test_calc_center_loss(): 46 | with tf.Session() as sess: 47 | centers = tf.Variable([[1, -1], [0, 1]], trainable=False, dtype=tf.float32) 48 | features = tf.constant([[-1, 0], [-1, -1], [2, 0]], dtype=tf.float32) 49 | emo_labels = tf.constant([[0, 1.0], [0, 1.0], [0, 1.0]], dtype=tf.float32) 50 | sess.run(tf.global_variables_initializer()) 51 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 52 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 53 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 54 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 55 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 56 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 57 | print(sess.run(update_center(features, centers, emo_labels, alpha=0.5))) 58 | print(sess.run(centers)) 59 | # loss = sess.run(calc_center_loss(features, centers, emo_labels, is_l1=True)) 60 | # print(loss) 61 | 62 | 63 | if __name__ == '__main__': 64 | test_calc_center_loss() 65 | -------------------------------------------------------------------------------- /utils/utils/data.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tqdm import tqdm 4 | from functools import partial 5 | from concurrent.futures import ProcessPoolExecutor 6 | 7 | 8 | def bytes_feature(value): 9 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 10 | 11 | 12 | def float_feature(value): 13 | return tf.train.Feature(float_list=tf.train.FloatList(value=[value])) 14 | 15 | 16 | def int64_feature(value): 17 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 18 | 19 | 20 | def data_process_pipeline(hp_or_obj, meta_file, process_one_line_fun=None, 21 | postprocess_fun=None, max_workers=None, **kwargs): 22 | """This func reads meta file and runs three custom funcs to get the final data 23 | 24 | This func performs the following data preprocessing pipeline: 25 | 01 read the meta file and run the meta_fun func to parse each line in meta file, the meta_fun 26 | must return a tuple of two elements: data sample(e.g., a wav or image) and labels dict 27 | 02 parse the geted data sample to the feature_fun func to get a feature data 28 | 03 the list of feature samples is passed to the postprocess_fun to do some post preproceesing 29 | e.g., normalizations, fixed length padding 30 | 31 | # Arguments 32 | hp: the hyper parameter object with type 'Hparams' or a other type object where all hyper parameters 33 | can be accessed as it's attributes 34 | meta_file: the meta file where each line generally contains the path of data sample and its labels 35 | meta_fun: this func takes a line in meta_file and the hp object as inputs and returns a pair of 36 | tuple of data samples(often a list arrays) and its labels dict(a list of dicts). The signature 37 | of meta_fun is: def fun_name(hp, line), and its return values are:([data samples], [labels dicts]) 38 | or (None, None) if no sample returned. Note, even there is only one sample returned, it also must 39 | be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}. 
-------------------------------------------------------------------------------- /utils/utils/data.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | from tqdm import tqdm
4 | from functools import partial
5 | from concurrent.futures import ProcessPoolExecutor
6 | 
7 | 
8 | def bytes_feature(value):
9 |     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
10 | 
11 | 
12 | def float_feature(value):
13 |     return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
14 | 
15 | 
16 | def int64_feature(value):
17 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
18 | 
19 | 
20 | def data_process_pipeline(hp_or_obj, meta_file, process_one_line_fun=None,
21 |                           postprocess_fun=None, max_workers=None, **kwargs):
22 |     """Reads the meta file and runs two custom funcs to get the final data.
23 | 
24 |     This func performs the following data preprocessing pipeline:
25 |     01 read the meta file and run process_one_line_fun on each line; it must return a tuple
26 |        of two elements: data samples (e.g., wavs or images) and labels dicts
27 |     02 feature extraction for each sample also happens inside process_one_line_fun
28 |     03 the list of feature samples is passed to postprocess_fun for post-processing,
29 |        e.g., normalization and fixed-length padding
30 | 
31 |     # Arguments
32 |         hp_or_obj: the hyperparameter object of type 'HParams', or any other object whose
33 |            attributes hold the hyperparameters; it may also provide process_one_line and
34 |            postprocess methods, which then take precedence over the function arguments below
35 |         meta_file: the meta file where each line generally contains the path of a data sample and its labels
36 |         process_one_line_fun: takes the hp object and a line of meta_file as inputs and returns a tuple of
37 |            data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature
38 |            is: def fun_name(hp, line), and the return value is ([data samples], [labels dicts]),
39 |            or (None, None) if no sample is returned. Note: even if only one sample is returned, it must
40 |            still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}.
41 |            (None, None) is returned when a sample is filtered out, e.g., because its length
42 |            does not meet the requirements
43 |         postprocess_fun: takes the list of all feature samples, the labels and hp as inputs, and returns
44 |            postprocessed features and labels of the same length. Generally, feature normalization,
45 |            fixed-length padding and sorting the samples by length are performed here. The signature
46 |            is: def postprocess_fun(hp, features, labels)
47 |         kwargs: extra keyword arguments passed through to both funcs.
48 | 
49 |     # Returns
50 |         A tuple of length 2: the first element is the list of all postprocessed features, the second
51 |         is the list of all labels. For example, the return value can be:
52 |         ([sample1, .., samplen], [[label1_1, label1_2], [label2_1, label2_2], .., [labeln_1, labeln_2]])
53 | 
54 |     # Exceptions
55 |         TypeError: if either element of process_one_line_fun's return value is not a list
56 |     """
57 |     if process_one_line_fun is None and not hasattr(hp_or_obj, 'process_one_line'):
58 |         raise ValueError('hp_or_obj without process_one_line method and process_one_line_fun is None')
59 |     if postprocess_fun is None and not hasattr(hp_or_obj, 'postprocess'):
60 |         raise ValueError('hp_or_obj without postprocess method and postprocess_fun is None')
61 | 
62 |     with open(meta_file) as fr:
63 |         lines = [line for line in fr if line.strip() and line[0] != '#']
64 | 
65 |     if hasattr(hp_or_obj, 'process_one_line'):
66 |         process_one_line_fun = type(hp_or_obj).process_one_line
67 |     if hasattr(hp_or_obj, 'postprocess'):
68 |         postprocess_fun = type(hp_or_obj).postprocess
69 | 
70 |     # parse the meta lines to get samples and labels
71 |     print(' step 1: parsing meta and getting features ...')
72 |     num = len(lines)
73 |     hps = [hp_or_obj] * num
74 |     features, labels = [], []
75 |     with ProcessPoolExecutor(max_workers) as p:
76 |         for r in tqdm(p.map(partial(process_one_line_fun, **kwargs), hps, lines), total=num):
77 |             ds, ls = r
78 |             if (ds, ls) != (None, None):
79 |                 if type(ds) != list or type(ls) != list:
80 |                     raise TypeError('process_one_line_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
81 |                 features += ds
82 |                 labels += ls
83 |     '''
84 |     # serial fallback, handy for debugging:
85 |     for line in tqdm(lines):
86 |         ds, ls = process_one_line_fun(hp_or_obj, line)
87 |         if (ds, ls) != (None, None):
88 |             if type(ds) != list or type(ls) != list:
89 |                 raise TypeError('process_one_line_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
90 |             features += ds
91 |             labels += ls
92 |     '''
93 | 
94 |     # np.save('recola_wav01_mel_nonorm_before_post.npy', features[0])
95 |     # print('DEBUG before post', features[0].shape, labels[0])
96 |     # post-processing
97 |     print(' step 2: postprocessing for features and labels ...')
98 |     features, labels = postprocess_fun(hp_or_obj, features, labels, **kwargs)
99 |     return features, labels
100 | 
101 | 
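# Usage sketch: a minimal object protocol accepted by data_process_pipeline above.
# The class name, meta format and fields are hypothetical, not from this repo.
#
#     import numpy as np
#
#     class MelPipeline:
#         num_mels = 80                                  # hyperparameters live as attributes
#
#         def process_one_line(self, line):
#             mel_path, text = line.strip().split('|')[:2]
#             return [np.load(mel_path)], [{'text_len': len(text)}]
#
#         def postprocess(self, features, labels):
#             return features, labels                    # e.g. normalize / pad here
#
#     features, labels = data_process_pipeline(MelPipeline(), 'meta.txt')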
102 | def get_class_weights(class_nums, type=0, power=1):
103 |     if type == 0:
104 |         return [1.] * len(class_nums)
105 |     # recompute class_nums and total according to power
106 |     total, class_ws = 0, class_nums.copy()
107 |     for cls in range(len(class_ws)):
108 |         class_ws[cls] = class_ws[cls] ** power
109 |         total += class_ws[cls]
110 |     # take the reciprocal of each weight, then divide by the mean of all weights
111 |     # (so they average to 1), following the older codebase's approach
112 |     if type == 1:
113 |         wsum = 0
114 |         for cls in range(len(class_ws)):
115 |             class_ws[cls] = 1 / class_ws[cls]
116 |             wsum += class_ws[cls]
117 |         wmean = wsum / len(class_nums)
118 |         class_ws = [w / wmean for w in class_ws]
119 |     # reciprocal times total/2, see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
120 |     elif type == 2:
121 |         class_ws = [0.5 * total / w for w in class_ws]
122 |     return class_ws
123 | 
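A worked example of the three weighting modes, with get_class_weights in scope (power=1; values computed by hand):

    get_class_weights([100, 300], type=0)  # -> [1.0, 1.0]
    get_class_weights([100, 300], type=1)  # -> [1.5, 0.5]   (reciprocals 0.01, 0.0033 divided by their mean)
    get_class_weights([100, 300], type=2)  # -> [2.0, 0.667] (0.5 * 400 / n_c)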
-------------------------------------------------------------------------------- /utils/utils/data.py.bak-0707: --------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | from functools import partial
3 | from concurrent.futures import ProcessPoolExecutor
4 | 
5 | 
6 | def data_process_pipeline(hp, meta_file, meta_fun, feature_fun, postprocess_fun,
7 |                           max_workers=None, **kwargs):
8 |     """Reads the meta file and runs three custom funcs to get the final data.
9 | 
10 |     This func performs the following data preprocessing pipeline:
11 |     01 read the meta file and run meta_fun on each line; meta_fun must return a tuple
12 |        of two elements: data samples (e.g., wavs or images) and labels dicts
13 |     02 pass each data sample to feature_fun to get a feature
14 |     03 the list of feature samples is passed to postprocess_fun for post-processing,
15 |        e.g., normalization and fixed-length padding
16 | 
17 |     # Arguments
18 |         hp: the hyperparameter object of type 'HParams', or any other object whose attributes
19 |            hold the hyperparameters
20 |         meta_file: the meta file where each line generally contains the path of a data sample and its labels
21 |         meta_fun: takes the hp object and a line of meta_file as inputs and returns a tuple of
22 |            data samples (often a list of arrays) and their labels dicts (a list of dicts). The signature
23 |            is: def fun_name(hp, line), and the return value is ([data samples], [labels dicts]),
24 |            or (None, None) if no sample is returned. Note: even if only one sample is returned, it must
25 |            still be a list of length 1. A labels dict example is {'L1': label_1, 'L2': label_2}.
26 |            (None, None) is returned when a sample is filtered out, e.g., because its length
27 |            does not meet the requirements
28 |         feature_fun: takes the hp object and a single data sample returned by meta_fun as inputs,
29 |            and returns a single feature sample. The signature is: def feature_fun(hp, sample)
30 |         postprocess_fun: takes the list of all feature samples and hp as inputs and returns the
31 |            list of postprocessed feature samples with the same length. Generally, feature normalization,
32 |            fixed-length padding and sorting the samples by length are performed here. The signature
33 |            is: def postprocess_fun(hp, features)
34 |         kwargs: extra keyword arguments passed through to all three funcs.
35 | 
36 |     # Returns
37 |         A tuple of length 2: the first element is the list of all postprocessed features, the second
38 |         is the list of all labels (each label is a list converted from the labels dict). For example,
39 |         the return value can be: ([sample1, .., samplen], [[label1_1, label1_2], .., [labeln_1, labeln_2]])
40 | 
41 |     # Exceptions
42 |         TypeError: if either element of meta_fun's return value is not a list
43 |     """
44 |     with open(meta_file) as fr:
45 |         lines = fr.readlines()
46 | 
47 |     # parse the meta lines to get samples and labels
48 |     print('\n[Begin processing data ...]')
49 |     print('step 1: parsing meta and loading original data samples ...')
50 |     datas, labels = [], []
51 |     if max_workers == -1:
52 |         for line in tqdm(lines):
53 |             ds, ls, info = meta_fun(hp, line, **kwargs)
54 |             if (ds, ls) != (None, None):
55 |                 if type(ds) != list or type(ls) != list:
56 |                     raise TypeError('meta_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
57 |                 datas += ds
58 |                 labels += ls
59 |     else:
60 |         num = len(lines)
61 |         hps = [hp] * num
62 |         with ProcessPoolExecutor(max_workers) as p:
63 |             for r in tqdm(p.map(partial(meta_fun, **kwargs), hps, lines), total=num):
64 |                 ds, ls, info = r
65 |                 if (ds, ls) != (None, None):
66 |                     if type(ds) != list or type(ls) != list:
67 |                         raise TypeError('meta_fun must return a tuple of "list", not {} or {}'.format(type(ds), type(ls)))
68 |                     datas += ds
69 |                     labels += ls
70 |         hp.sr = info['sr']
71 | 
72 |     # process the samples to get features
73 |     print('step 2: getting features from original data samples ...')
74 |     if max_workers == -1:
75 |         datas = [feature_fun(hp, x, **kwargs) for x in tqdm(datas)]
76 |     else:
77 |         num = len(datas)
78 |         hps = [hp] * num
79 |         with ProcessPoolExecutor(max_workers) as p:
80 |             datas = [r for r in tqdm(p.map(partial(feature_fun, **kwargs), hps, datas), total=num)]
81 | 
82 |     # post-processing
83 |     print('step 3: postprocessing for features ...')
84 |     datas = postprocess_fun(hp, datas, **kwargs)
85 | 
86 |     for i in range(len(labels)):
87 |         labels[i] = list(labels[i].values())  # a list is better here; tuples are immutable
88 |     return datas, labels
89 | 
-------------------------------------------------------------------------------- /utils/utils/debug.py: --------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | 
4 | def debug_print(*args, **kwargs):
5 |     print_op = tf.print(*args, **kwargs)
6 |     tf.add_to_collection('print_ops', print_op)
7 | 
8 | 
9 | def get_ops():
10 |     return tf.get_collection('print_ops')
11 | 
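Usage sketch for the print-op collection (illustrative; mel_loss, train_op and sess are assumed to already exist in the caller's TF1 graph and session):

    debug_print('step mel loss:', mel_loss)   # registers a tf.print op
    sess.run([train_op] + get_ops())          # prints fire together with the train step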
-------------------------------------------------------------------------------- /utils/utils/index.html: --------------------------------------------------------------------------------
[stray HTML page: an HTTP directory listing titled "Directory listing for /home/tiger/v2_tacotron2/utils/utils/"; its markup and per-file links were lost in extraction]
-------------------------------------------------------------------------------- /utils/utils/infolog.py: --------------------------------------------------------------------------------
1 | import atexit
2 | from datetime import datetime
3 | import json
4 | from threading import Thread
5 | from urllib.request import Request, urlopen
6 | 
7 | 
8 | _format = '%Y-%m-%d %H:%M:%S.%f'
9 | _file = None
10 | _run_name = None
11 | _slack_url = None
12 | 
13 | 
14 | def init(filename, run_name, slack_url=None):
15 |     global _file, _run_name, _slack_url
16 |     _close_logfile()
17 |     _file = open(filename, 'a')
18 |     _file.write('\n-----------------------------------------------------------------\n')
19 |     _file.write('Starting new training run\n')
20 |     _file.write('-----------------------------------------------------------------\n')
21 |     _run_name = run_name
22 |     _slack_url = slack_url
23 | 
24 | 
25 | def log(msg, slack=False, is_print=True):
26 |     if is_print:
27 |         print(msg)
28 |     if _file is not None:
29 |         _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg))
30 |     if slack and _slack_url is not None:
31 |         Thread(target=_send_slack, args=(msg,)).start()
32 | 
33 | 
34 | def _close_logfile():
35 |     global _file
36 |     if _file is not None:
37 |         _file.close()
38 |         _file = None
39 | 
40 | 
41 | def _send_slack(msg):
42 |     req = Request(_slack_url)
43 |     req.add_header('Content-Type', 'application/json')
44 |     urlopen(req, json.dumps({
45 |         'username': 'tacotron',
46 |         'icon_emoji': ':taco:',
47 |         'text': '*%s*: %s' % (_run_name, msg)
48 |     }).encode())
49 | 
50 | 
51 | atexit.register(_close_logfile)
52 | 
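A short usage sketch (file path and run name are illustrative):

    from utils.utils import infolog

    infolog.init('logs/train.log', run_name='sygst_run1')
    infolog.log('step 100: mel_loss=0.123')  # printed and appended to the log file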
43 | """ 44 | 45 | if not len(x.get_shape()) == len(y.get_shape()) == 2: 46 | raise ValueError('Both inputs should be matrices.') 47 | 48 | if x.get_shape().as_list()[1] != y.get_shape().as_list()[1]: 49 | raise ValueError('The number of features should be the same.') 50 | 51 | norm = lambda x: tf.reduce_sum(tf.square(x), 1) 52 | 53 | # By making the `inner' dimensions of the two matrices equal to 1 using 54 | # broadcasting then we are essentially substracting every pair of rows 55 | # of x and y. 56 | # x will be num_samples x num_features x 1, 57 | # and y will be 1 x num_features x num_samples (after broadcasting). 58 | # After the substraction we will get a 59 | # num_x_samples x num_features x num_y_samples matrix. 60 | # The resulting dist will be of shape num_y_samples x num_x_samples. 61 | # and thus we need to transpose it again. 62 | return tf.transpose(norm(tf.expand_dims(x, 2) - tf.transpose(y))) 63 | 64 | 65 | def gaussian_kernel_matrix(x, y, sigmas): 66 | r"""Computes a Guassian Radial Basis Kernel between the samples of x and y. 67 | 68 | We create a sum of multiple gaussian kernels each having a width sigma_i. 69 | 70 | Args: 71 | x: a tensor of shape [num_samples, num_features] 72 | y: a tensor of shape [num_samples, num_features] 73 | sigmas: a tensor of floats which denote the widths of each of the 74 | gaussians in the kernel. 75 | Returns: 76 | A tensor of shape [num_samples{x}, num_samples{y}] with the RBF kernel. 77 | """ 78 | beta = 1. / (2. * (tf.expand_dims(sigmas, 1))) 79 | 80 | dist = compute_pairwise_distances(x, y) 81 | 82 | s = tf.matmul(beta, tf.reshape(dist, (1, -1))) 83 | 84 | return tf.reshape(tf.reduce_sum(tf.exp(-s), 0), tf.shape(dist)) 85 | 86 | 87 | def maximum_mean_discrepancy(x, y, kernel=gaussian_kernel_matrix): 88 | r"""Computes the Maximum Mean Discrepancy (MMD) of two samples: x and y. 89 | 90 | Maximum Mean Discrepancy (MMD) is a distance-measure between the samples of 91 | the distributions of x and y. Here we use the kernel two sample estimate 92 | using the empirical mean of the two distributions. 93 | 94 | MMD^2(P, Q) = || \E{\phi(x)} - \E{\phi(y)} ||^2 95 | = \E{ K(x, x) } + \E{ K(y, y) } - 2 \E{ K(x, y) }, 96 | 97 | where K = <\phi(x), \phi(y)>, 98 | is the desired kernel function, in this case a radial basis kernel. 99 | 100 | Args: 101 | x: a tensor of shape [num_samples, num_features] 102 | y: a tensor of shape [num_samples, num_features] 103 | kernel: a function which computes the kernel in MMD. Defaults to the 104 | GaussianKernelMatrix. 105 | 106 | Returns: 107 | a scalar denoting the squared maximum mean discrepancy loss. 108 | """ 109 | with tf.name_scope('MaximumMeanDiscrepancy'): 110 | # \E{ K(x, x) } + \E{ K(y, y) } - 2 \E{ K(x, y) } 111 | cost = tf.reduce_mean(kernel(x, x)) 112 | cost += tf.reduce_mean(kernel(y, y)) 113 | cost -= 2 * tf.reduce_mean(kernel(x, y)) 114 | 115 | # We do not allow the loss to become negative. 116 | cost = tf.where(cost > 0, cost, 0, name='value') 117 | return cost 118 | 119 | 120 | def mmd_loss(source_samples, target_samples, weight): 121 | """Adds a similarity loss term, the MMD between two representations. 122 | 123 | This Maximum Mean Discrepancy (MMD) loss is calculated with a number of 124 | different Gaussian kernels. 125 | 126 | Args: 127 | source_samples: a tensor of shape [num_samples, num_features]. 128 | target_samples: a tensor of shape [num_samples, num_features]. 129 | weight: the weight of the MMD loss. 130 | 131 | Returns: 132 | a scalar tensor representing the MMD loss value. 
133 | """ 134 | sigmas = [ 135 | 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 15, 20, 25, 30, 35, 100, 136 | 1e3, 1e4, 1e5, 1e6 137 | ] 138 | gaussian_kernel = partial( 139 | gaussian_kernel_matrix, sigmas=tf.constant(sigmas)) 140 | 141 | loss_value = maximum_mean_discrepancy( 142 | source_samples, target_samples, kernel=gaussian_kernel) 143 | loss_value = tf.maximum(1e-4, loss_value) * weight 144 | # assert_op = tf.Assert(tf.is_finite(loss_value), [loss_value]) 145 | # with tf.control_dependencies([assert_op]): 146 | # tag = 'MMD Loss' 147 | # if scope: 148 | # tag = scope + tag 149 | # tf.summary.scalar(tag, loss_value) 150 | # tf.losses.add_loss(loss_value) 151 | 152 | return loss_value 153 | 154 | 155 | def correlation_loss(source_samples, target_samples, weight, scope=None): 156 | """Adds a similarity loss term, the correlation between two representations. 157 | 158 | Args: 159 | source_samples: a tensor of shape [num_samples, num_features] 160 | target_samples: a tensor of shape [num_samples, num_features] 161 | weight: a scalar weight for the loss. 162 | scope: optional name scope for summary tags. 163 | 164 | Returns: 165 | a scalar tensor representing the correlation loss value. 166 | """ 167 | with tf.name_scope('corr_loss'): 168 | source_samples -= tf.reduce_mean(source_samples, 0) 169 | target_samples -= tf.reduce_mean(target_samples, 0) 170 | 171 | source_samples = tf.nn.l2_normalize(source_samples, 1) 172 | target_samples = tf.nn.l2_normalize(target_samples, 1) 173 | 174 | source_cov = tf.matmul(tf.transpose(source_samples), source_samples) 175 | target_cov = tf.matmul(tf.transpose(target_samples), target_samples) 176 | 177 | corr_loss = tf.reduce_mean(tf.square(source_cov - target_cov)) * weight 178 | 179 | assert_op = tf.Assert(tf.is_finite(corr_loss), [corr_loss]) 180 | with tf.control_dependencies([assert_op]): 181 | tag = 'Correlation Loss' 182 | if scope: 183 | tag = scope + tag 184 | tf.summary.scalar(tag, corr_loss) 185 | tf.losses.add_loss(corr_loss) 186 | 187 | return corr_loss 188 | -------------------------------------------------------------------------------- /utils/utils/ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def shape_list(x): 5 | """Return list of dims, statically where possible.""" 6 | x = tf.convert_to_tensor(x) 7 | 8 | # If unknown rank, return dynamic shape 9 | if x.get_shape().dims is None: 10 | return tf.shape(x) 11 | 12 | static = x.get_shape().as_list() 13 | shape = tf.shape(x) 14 | 15 | ret = [] 16 | for i in range(len(static)): 17 | dim = static[i] 18 | if dim is None: 19 | dim = shape[i] 20 | ret.append(dim) 21 | return ret 22 | -------------------------------------------------------------------------------- /utils/utils/parameter.py: -------------------------------------------------------------------------------- 1 | import six 2 | import json 3 | 4 | 5 | # hyper parameter util class 6 | class HParams: 7 | def __init__(self, **kwargs): 8 | """A simple alternative implementation for tf.contrib.training.HParams 9 | 10 | # Arguments 11 | kwargs: all key word parameters which will be added as instance atrributes 12 | used as hyper parameters 13 | """ 14 | for k, v in six.iteritems(kwargs): 15 | self.add_hparam(k, v) 16 | 17 | def add_hparam(self, name, value): 18 | """add a new hyperparameter given a name and value 19 | 20 | if name is an existed hyperparameter, then it's value is 21 | updated as the new value 22 | 23 | # Arguments 24 | name: 
-------------------------------------------------------------------------------- /utils/utils/parameter.py: --------------------------------------------------------------------------------
1 | import six
2 | import json
3 | 
4 | 
5 | # hyper parameter util class
6 | class HParams:
7 |     def __init__(self, **kwargs):
8 |         """A simple alternative implementation for tf.contrib.training.HParams
9 | 
10 |         # Arguments
11 |             kwargs: all keyword parameters; they are added as instance attributes
12 |                 and used as hyperparameters
13 |         """
14 |         for k, v in six.iteritems(kwargs):
15 |             self.add_hparam(k, v)
16 | 
17 |     def add_hparam(self, name, value):
18 |         """add a new hyperparameter given a name and value
19 | 
20 |         if name is an existing hyperparameter, its value is
21 |         updated to the new value
22 | 
23 |         # Arguments
24 |             name: str name of the new hyperparameter to be added
25 |             value: the value of the new hyperparameter
26 |         """
27 |         setattr(self, name, value)
28 | 
29 |     def del_hparam(self, name):
30 |         """delete the hyperparameter named name
31 | 
32 |         # Arguments
33 |             name: str name of the hyperparameter to be deleted
34 |         """
35 |         delattr(self, name)
36 | 
37 |     def update(self, D, **kwargs):
38 |         """update or add hyperparameters
39 | 
40 |         # Arguments
41 |             D: an object with a keys() method, or an iterable of
42 |                 (k, v) pairs
43 |             kwargs: extra keyword arguments for updating hyperparameters
44 |         """
45 |         self.__dict__.update(D, **kwargs)
46 | 
47 |     def parse(self, values):
48 |         """parse a str delimited by ';' and update the pairs into attributes
49 | 
50 |         Note: we use ';' as the delimiter, not ',' as in tf.contrib.training.HParams,
51 |         because ',' would conflict with the ',' inside list and dict values
52 | 
53 |         # Arguments
54 |             values: a str of hyperparameters delimited by ';' and
55 |                 paired with '=', e.g., 'epochs=20;learning_rate=0.001'
56 |         """
57 |         pairs = values.split(";")
58 |         pairs = [x.strip().split("=") for x in pairs if x.strip() and '=' in x]
59 |         dict_pairs = dict(pairs)
60 |         for k in dict_pairs:
61 |             if k not in self.__dict__:
62 |                 raise KeyError('cannot parse a non-existing hyperparameter: "{}"'.format(k))
63 |             # self.__dict__[k] = type(self.__dict__[k])(dict_pairs[k])  # cannot parse dict or list values
64 |             try:
65 |                 v = json.loads(dict_pairs[k])  # note: if the value is a dict, its keys must be strings (JSON requirement)
66 |             except json.JSONDecodeError:
67 |                 v = json.loads('"' + dict_pairs[k] + '"')  # parsing a bare string like hello fails; it must be quoted as "hello"
68 |             self.__dict__[k] = v
69 |         return self
70 | 
71 |     def print(self):
72 |         """this func prints all hyperparameters"""
73 |         print('\n\n')
74 |         print('--------------------------------------------------')
75 |         print('All Hyper Parameters:')
76 |         print('--------------------------------------------------')
77 |         hps = self.__dict__
78 |         for hp in hps:
79 |             print('    {}={}'.format(hp, hps[hp]))
80 |         print('--------------------------------------------------')
81 |         print('\n\n')
82 | 
83 |     def to_string(self):
84 |         hp = '\n'
85 |         hp += '--------------------------------------------------\n'
86 |         hp += 'All Hyper Parameters:\n'
87 |         hp += '--------------------------------------------------\n'
88 |         hps = self.__dict__
89 |         for k in hps:
90 |             hp += '    {}={}\n'.format(k, hps[k])
91 |         hp += '--------------------------------------------------\n'
92 |         hp += '\n'
93 |         return hp
94 | 
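A quick illustration of parse (values go through json.loads, so numbers and lists round-trip; the names are arbitrary):

    hp = HParams(epochs=20, learning_rate=0.001, dims=[256, 128])
    hp.parse('epochs=50;dims=[512,256]')
    assert hp.epochs == 50 and hp.dims == [512, 256]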
-------------------------------------------------------------------------------- /utils/utils/plot.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib
3 | matplotlib.use('Agg')
4 | import matplotlib.pyplot as plt
5 | 
6 | 
7 | def plot_alignment(alignment, path, info=None, title=None, text=None):
8 |     """
9 |     # Arguments
10 |         text: the text str whose chars are used to draw the yticks
11 |     """
12 |     if text is None:
13 |         figsize = None
14 |         yticks = None
15 |         ytick_labels = None
16 |     else:
17 |         yticks = np.arange(len(text))
18 |         ytick_labels = list(text)
19 |         ytick_labels[-1] = len(text)  # show the total text length at the last tick
20 |         figsize = (0.02 * alignment.shape[1], 0.10 * len(text))
21 | 
22 |     fig, ax = plt.subplots(figsize=figsize)
23 |     im = ax.imshow(alignment,
24 |                    aspect='auto',
25 |                    origin='lower',
26 |                    interpolation='none')
27 |     fig.colorbar(im, ax=ax)
28 |     xlabel = 'Decoder timestep'
29 |     if info is not None:
30 |         xlabel += '\n\n' + info
31 |     plt.xlabel(xlabel)
32 |     plt.ylabel('Encoder timestep')
33 |     plt.yticks(yticks, ytick_labels)
34 |     plt.title(title)
35 |     plt.tight_layout()
36 |     plt.savefig(path, format='png')
37 |     plt.close('all')
38 | 
39 | 
40 | def plot_mel(mel, path, info=None, title=None, gt_mel=None):
41 |     nrows = 1 if gt_mel is None else 2
42 |     fig, ax = plt.subplots(nrows, squeeze=False)
43 | 
44 |     def plot(mel, ax):
45 |         im = ax.imshow(mel,
46 |                        aspect='auto',
47 |                        origin='lower',
48 |                        interpolation='none')
49 |         ax.set_ylabel('freq')
50 |         fig.colorbar(im, ax=ax)
51 | 
52 |     plot(mel.T, ax[0][0])  # mel shape [time_step, num_mels]
53 |     ax[0][0].set_title(title)
54 | 
55 |     if gt_mel is not None:
56 |         plot(gt_mel.T, ax[1][0])
57 | 
58 |     plt.xlabel(info or 'time step')
59 |     plt.tight_layout()
60 |     plt.savefig(path, format='png')
61 |     plt.close('all')
62 | 
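For instance (random data and file name are illustrative; the text length must match the alignment's first axis):

    text = 'an example sentence'
    align = np.random.rand(len(text), 200)  # [encoder_steps, decoder_steps]
    plot_alignment(align, 'align.png', info='step 1000', text=text)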
-------------------------------------------------------------------------------- /utils/utils/tool_wrappers.py: --------------------------------------------------------------------------------
1 | from tensorflow import keras
2 | 
3 | 
4 | def get_loss(loss):
5 |     if type(loss) == str or type(loss) == dict:
6 |         loss_name, kwargs = loss, {}
7 |         if type(loss) == dict:
8 |             loss_name = loss.pop('loss_name')
9 |             kwargs = loss  # the remaining entries are passed through as keyword arguments
10 |         kwargs.setdefault('name', loss_name)
11 | 
12 |         if loss_name == 'scce':
13 |             kwargs.setdefault('from_logits', True)
14 |             return keras.losses.SparseCategoricalCrossentropy(**kwargs)
15 |         elif loss_name == 'cce':
16 |             kwargs.setdefault('from_logits', True)
17 |             return keras.losses.CategoricalCrossentropy(**kwargs)
18 |         elif loss_name == 'bce':
19 |             kwargs.setdefault('from_logits', True)
20 |             return keras.losses.BinaryCrossentropy(**kwargs)
21 |         elif loss_name == 'mae':
22 |             return keras.losses.MeanAbsoluteError(**kwargs)
23 |         elif loss_name == 'mse':
24 |             return keras.losses.MeanSquaredError(**kwargs)
25 |         elif loss_name == 'focal':
26 |             raise NotImplementedError('focal loss is not implemented yet')
27 |         else:
28 |             raise ValueError('{} is unsupported loss now'.format(loss))
29 |     if callable(loss):
30 |         return loss
31 |     raise TypeError('type of loss must be str, dict or callable, but {} is found'.format(type(loss)))
32 | 
33 | 
34 | def get_metric(metric):
35 |     if type(metric) == str or type(metric) == dict:
36 |         metric_name, kwargs = metric, {}
37 |         if type(metric) == dict:
38 |             metric_name = metric.pop('metric_name')
39 |             kwargs = metric  # the remaining entries are passed through as keyword arguments
40 |         metric_name = metric_name.lower()
41 |         kwargs.setdefault('name', metric_name)
42 | 
43 |         if metric_name == 'sca':
44 |             return keras.metrics.SparseCategoricalAccuracy(**kwargs)
45 |         elif metric_name == 'ca':
46 |             return keras.metrics.CategoricalAccuracy(**kwargs)
47 |         elif metric_name == 'ba':
48 |             return keras.metrics.BinaryAccuracy(**kwargs)
49 |         elif metric_name == 'recall':
50 |             return keras.metrics.Recall(**kwargs)
51 |         elif metric_name == 'precision':
52 |             return keras.metrics.Precision(**kwargs)
53 |         elif metric_name == 'mae':
54 |             return keras.metrics.MeanAbsoluteError(**kwargs)
55 |         elif metric_name == 'mse':
56 |             return keras.metrics.MeanSquaredError(**kwargs)
57 |         else:
58 |             raise ValueError('{} is unsupported metric now'.format(metric))
59 |     if callable(metric):
60 |         return metric
61 |     raise TypeError('type of metric must be str, dict or callable, but {} is found'.format(type(metric)))
62 | 
63 | 
64 | def get_optimizer(optimizer, lr_schedule=0.001, **kwargs):
65 |     if type(optimizer) == str:
66 |         optimizer = optimizer.lower()
67 |         if optimizer == 'adam':
68 |             return keras.optimizers.Adam(learning_rate=lr_schedule, **kwargs)
69 |         elif optimizer == 'sgd':
70 |             return keras.optimizers.SGD(learning_rate=lr_schedule, **kwargs)
71 |         elif optimizer == 'rmsprop':
72 |             return keras.optimizers.RMSprop(learning_rate=lr_schedule, **kwargs)
73 |         else:
74 |             raise ValueError('{} is unsupported optimizer now'.format(optimizer))
75 |     if isinstance(optimizer, keras.optimizers.Optimizer):
76 |         return optimizer
77 |     raise TypeError('type of optimizer must be str or a keras Optimizer, but {} is found'.format(type(optimizer)))
78 | 
79 | 
80 | def get_regularizer(reg):
81 |     if reg is None:
82 |         return None
83 |     if type(reg) == str:
84 |         return keras.regularizers.get(reg.lower())
85 |     if type(reg) == dict:
86 |         if 'l1' in reg and 'l2' in reg:
87 |             return keras.regularizers.l1_l2(l1=reg['l1'], l2=reg['l2'])
88 |         elif 'l1' in reg:
89 |             return keras.regularizers.l1(reg['l1'])
90 |         elif 'l2' in reg:
91 |             return keras.regularizers.l2(reg['l2'])
92 |         else:
93 |             raise ValueError('the dict keys for regularizer must be "l1", "l2", or both of them')
94 |     if callable(reg):
95 |         return reg
96 |     raise TypeError(f'type of regularizer must be None, str, dict or callable, but {type(reg)} is found')
97 | 
--------------------------------------------------------------------------------
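A hedged end-to-end sketch of the wrappers above (tf.keras assumed; the dict forms pass their remaining keys through as constructor kwargs):

    loss = get_loss({'loss_name': 'cce', 'label_smoothing': 0.1})  # CategoricalCrossentropy(from_logits=True, ...)
    metric = get_metric('ca')                                      # CategoricalAccuracy(name='ca')
    opt = get_optimizer('adam', lr_schedule=0.001)
    reg = get_regularizer({'l1': 1e-5, 'l2': 1e-4})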