├── util
│   ├── README.md
│   ├── __pycache__
│   │   ├── image.cpython-36.pyc
│   │   ├── layers.cpython-36.pyc
│   │   └── wrapper.cpython-36.pyc
│   ├── LICENSE
│   ├── io.py
│   ├── image.py
│   ├── mnist.py
│   ├── wrapper.py
│   └── layers.py
├── etc
│   ├── Sad.npf
│   ├── Angry.npf
│   ├── Happy.npf
│   ├── Neutral.npf
│   ├── emotion_vc_xmax.npf
│   └── emotion_vc_xmin.npf
├── model
│   ├── __pycache__
│   │   └── vawgan.cpython-36.pyc
│   ├── vae.py
│   ├── vawgan.py
│   └── vaegan.py
├── trainer
│   ├── __pycache__
│   │   ├── gan.cpython-36.pyc
│   │   ├── vae.cpython-36.pyc
│   │   └── vawgan.cpython-36.pyc
│   ├── vawgan.py
│   ├── vae.py
│   └── gan.py
├── audio.json
├── README.md
├── architecture-vawgan-vcc2016.json
├── validate.py
├── main-vawgan.py
├── models.py
├── convert-vawgan-cwt.py
└── analyzer.py

/util/README.md:
--------------------------------------------------------------------------------
1 | # jutil
2 | Some helper functions I used in my code
--------------------------------------------------------------------------------
/etc/Sad.npf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/etc/Sad.npf
--------------------------------------------------------------------------------
/etc/Angry.npf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/etc/Angry.npf
--------------------------------------------------------------------------------
/etc/Happy.npf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/etc/Happy.npf
--------------------------------------------------------------------------------
/etc/Neutral.npf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/etc/Neutral.npf
--------------------------------------------------------------------------------
/etc/emotion_vc_xmax.npf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/etc/emotion_vc_xmax.npf
--------------------------------------------------------------------------------
/etc/emotion_vc_xmin.npf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/etc/emotion_vc_xmin.npf
--------------------------------------------------------------------------------
/util/__pycache__/image.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/util/__pycache__/image.cpython-36.pyc
--------------------------------------------------------------------------------
/model/__pycache__/vawgan.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/model/__pycache__/vawgan.cpython-36.pyc
--------------------------------------------------------------------------------
/trainer/__pycache__/gan.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/trainer/__pycache__/gan.cpython-36.pyc
--------------------------------------------------------------------------------
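A note on the binary assets above: as consumed by `convert_f0()` in `convert-vawgan-cwt.py` further down, each per-emotion `*.npf` (Sad/Angry/Happy/Neutral) is read as two `float32` values, a log-F0 mean and standard deviation for that emotion, while `emotion_vc_xmax.npf` / `emotion_vc_xmin.npf` hold the spectral bounds fed to `Tanhize`. Below is a minimal NumPy sketch of the log-Gaussian F0 conversion these statistics support; the file paths follow the repo layout, but the snippet itself is an illustrative restatement, not code from the repo:

```python
import numpy as np

# Each {emotion}.npf holds [mu, std] of log-F0 (float32), as read in convert_f0().
mu_s, std_s = np.fromfile('./etc/Neutral.npf', np.float32)
mu_t, std_t = np.fromfile('./etc/Angry.npf', np.float32)

def convert_f0_np(f0):
    '''Log-Gaussian normalized F0 transform; frames with f0 <= 1 are treated as unvoiced.'''
    voiced = f0 > 1.
    lf0 = np.where(voiced, np.log(np.where(voiced, f0, 1.)), f0)
    lf0 = np.where(voiced, (lf0 - mu_s) / std_s * std_t + mu_t, lf0)
    return np.where(voiced, np.exp(lf0), f0)
```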
/trainer/__pycache__/vae.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/trainer/__pycache__/vae.cpython-36.pyc -------------------------------------------------------------------------------- /util/__pycache__/layers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/util/__pycache__/layers.cpython-36.pyc -------------------------------------------------------------------------------- /util/__pycache__/wrapper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/util/__pycache__/wrapper.cpython-36.pyc -------------------------------------------------------------------------------- /audio.json: -------------------------------------------------------------------------------- 1 | { 2 | "fs": 16000, 3 | "fft_size": 1024, 4 | "f0_ceil": 500.0, 5 | "f0_floor": 71.0, 6 | "frame_period": 5.0 7 | } 8 | -------------------------------------------------------------------------------- /trainer/__pycache__/vawgan.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KunZhou9646/controllable_evc_code/HEAD/trainer/__pycache__/vawgan.cpython-36.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Seen and Unseen emotional style transfer for voice conversion with a new emotional speech dataset 2 | 3 | This is an implementation of the paper "Seen and unseen emotional style transfer for voice conversion with a new emotional speech dataset" (Submitted to ICASSP 2021) 4 | (https://arxiv.org/abs/2010.14794) 5 | 6 | 7 | **Bibtex:** 8 | ``` 9 | @article{zhou2020seen, 10 | title={Seen and Unseen emotional style transfer for voice conversion with a new emotional speech dataset}, 11 | author={Zhou, Kun and Sisman, Berrak and Liu, Rui and Li, Haizhou}, 12 | journal={arXiv preprint arXiv:2010.14794}, 13 | year={2020} 14 | } 15 | 16 | ``` 17 | 18 | # Under construction 19 | -------------------------------------------------------------------------------- /util/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Chin-Cheng Hsu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /util/io.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | def int64_feature(value): 5 | if isinstance(value, list): 6 | value = [tf.train.Feature( 7 | int64_list=tf.train.Int64List(value=[v])) for v in value] 8 | return tf.train.FeatureList(feature=value) 9 | else: 10 | value = tf.train.Int64List(value=[value]) 11 | return tf.train.Feature(int64_list=value) 12 | 13 | 14 | def bytes_feature(value): 15 | if isinstance(value, list): 16 | value = [tf.train.Feature( 17 | bytes_list=tf.train.BytesList(value=[v])) for v in value] 18 | return tf.train.FeatureList(feature=value) 19 | else: 20 | value = tf.train.BytesList(value=[value]) 21 | return tf.train.Feature(bytes_list=value) 22 | 23 | 24 | def float_feature(value): 25 | if isinstance(value, list): 26 | value = [tf.train.Feature( 27 | float_list=tf.train.FloatList(value=[v])) for v in value] 28 | return tf.train.FeatureList(feature=value) 29 | else: 30 | value = tf.train.FloatList(value=[value]) 31 | return tf.train.Feature(float_list=value) 32 | 33 | 34 | def read_float64_as_float32(filename): 35 | x = np.fromfile(filename, np.float64) 36 | return x.astype(np.float32) 37 | -------------------------------------------------------------------------------- /architecture-vawgan-vcc2016.json: -------------------------------------------------------------------------------- 1 | { 2 | "mode": "VAWGAN", 3 | "hwc": [513, 1, 1], 4 | "z_dim": 128, 5 | "y_dim": 256, 6 | "y_emb_dim": 128, 7 | "f_dim": 1, 8 | "f_emb_dim": 1, 9 | "discriminator": { 10 | "merge_dim": 1024, 11 | "kernel": [[7, 1], [7, 1], [115, 1]], 12 | "stride": [[3, 1], [3, 1], [3, 1]], 13 | "output": [16, 32, 64], 14 | "l2-reg": 1e-6, 15 | "feature_layer": 1 16 | }, 17 | "encoder": { 18 | "kernel": [[7, 1], [7, 1], [7, 1], [7, 1], [7, 1]], 19 | "stride": [[3, 1], [3, 1], [3, 1], [3, 1], [3, 1]], 20 | "output": [16, 32, 64, 128, 256], 21 | "l2-reg": 1e-6 22 | }, 23 | "generator": { 24 | "hwc": [19, 1, 81], 25 | "merge_dim": 171, 26 | "kernel": [[9, 1], [7, 1], [7, 1], [1025, 1]], 27 | "stride": [[3, 1], [3, 1], [3, 1], [1, 1]], 28 | "output": [32, 16, 8, 1], 29 | "l2-reg": 1e-6 30 | }, 31 | "training": { 32 | "src_dir": "./data/bin/training_set/Neutral/*.bin", 33 | "trg_dir": "./data/bin/training_set/Happy/*.bin", 34 | "batch_size": 256, 35 | "lr": 1e-5, 36 | "lr_schedule": [ [999, 1e-4]], 37 | "beta1": 0.5, 38 | "beta2": 0.999, 39 | "nIterD": 5, 40 | "lambda": 10, 41 | "max_iter": 200000, 42 | "epoch_vae": 15, 43 | "epoch_vawgan": 45, 44 | "gamma": 50, 45 | "n_unroll": 5, 46 | "n_unroll_intense": 100, 47 | "clamping": 0.01 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /util/image.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def nchw_to_nhwc(x): 5 | return tf.transpose(x, [0, 2, 3, 1]) 6 | 7 | 8 | def nhwc_to_nchw(x): 9 | return tf.transpose(x, [0, 3, 1, 2]) 10 | 11 | 12 | def make_png_thumbnail(x, n): 13 | ''' 14 | Input: 15 | `x`: Tensor, value range=[-1, 1), shape=[n*n, h, w, c] 16 | `n`: sqrt of the 
number of images 17 | 18 | Return: 19 | `tf.string` (bytes) of the PNG. 20 | (write these binary directly into a file) 21 | ''' 22 | with tf.name_scope('MakeThumbnail'): 23 | _, h, w, c = x.get_shape().as_list() 24 | x = tf.reshape(x, [n, n, h, w, c]) 25 | x = tf.transpose(x, [0, 2, 1, 3, 4]) 26 | x = tf.reshape(x, [n * h, n * w, c]) 27 | x = x / 2. + .5 28 | x = tf.image.convert_image_dtype(x, tf.uint8, saturate=True) 29 | x = tf.image.encode_png(x) 30 | return x 31 | 32 | 33 | def line(x, xa, xb, ya, yb): 34 | ''' a line determined by two points ''' 35 | return ya + (x - xa) * (yb - ya) / (xb - xa) 36 | 37 | 38 | def clip_to_boundary(line1, line2, minval, maxval): 39 | x = tf.minimum(line1, line2) 40 | x = tf.minimum(x, maxval) 41 | x = tf.maximum(x, minval) 42 | return x 43 | 44 | 45 | def gray2jet(x): 46 | ''' NHWC (channel last) format ''' 47 | with tf.name_scope('Gray2Jet'): 48 | r = clip_to_boundary( 49 | line(x, .3515, .66, 0., 1.), 50 | line(x, .8867, 1., 1., .5), 51 | minval=0., 52 | maxval=1., 53 | ) 54 | g = clip_to_boundary( 55 | line(x, .125, .375, 0., 1.), 56 | line(x, .64, .91, 1., 0.), 57 | minval=0., 58 | maxval=1., 59 | ) 60 | b = clip_to_boundary( 61 | line(x, .0, .1132, 0.5, 1.0), 62 | line(x, .34, .648, 1., 0.), 63 | minval=0., 64 | maxval=1., 65 | ) 66 | return tf.concat([r, g, b], axis=-1) 67 | 68 | 69 | def make_png_jet_thumbnail(x, n): 70 | ''' 71 | Input: 72 | `x`: Tensor, value range=[-1, 1), shape=[n*n, h, w, c] 73 | `n`: sqrt of the number of images 74 | 75 | Return: 76 | `tf.string` (bytes) of the PNG. 77 | (write these binary directly into a file) 78 | ''' 79 | with tf.name_scope('MakeThumbnail'): 80 | _, h, w, c = x.get_shape().as_list() 81 | x = tf.reshape(x, [n, n, h, w, c]) 82 | x = tf.transpose(x, [0, 2, 1, 3, 4]) 83 | x = tf.reshape(x, [n * h, n * w, c]) 84 | x = x / 2. + .5 85 | x = gray2jet(x) 86 | x = tf.image.convert_image_dtype(x, tf.uint8, saturate=True) 87 | x = tf.image.encode_png(x) 88 | return x -------------------------------------------------------------------------------- /util/mnist.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import keras 3 | 4 | 5 | def mnist_batcher_in_tanh_vector( 6 | batch_size, 7 | capacity=256, 8 | min_after_dequeue=128, 9 | ): 10 | (x, y), (_, _) = keras.datasets.mnist.load_data() 11 | x = tf.constant(x) 12 | x = tf.cast(x, tf.float32) 13 | x = keras.layers.Flatten()(x) / 127.5 - 1. 14 | y = tf.cast(y, tf.int64) 15 | 16 | return tf.train.shuffle_batch( 17 | [x, y], 18 | batch_size=batch_size, 19 | capacity=capacity, 20 | min_after_dequeue=min_after_dequeue, 21 | enqueue_many=True 22 | ) 23 | 24 | 25 | def mnist16_batcher_in_tanh( 26 | batch_size, 27 | capacity=256, 28 | min_after_dequeue=128, 29 | ): 30 | ''' 31 | NCHW (channel first) 32 | ''' 33 | with tf.name_scope('MNIST_nchw'): 34 | (x, y), (_, _) = keras.datasets.mnist.load_data() 35 | x = x / 127.5 - 1. 36 | x = tf.constant(x, dtype=tf.float32) 37 | x = tf.expand_dims(x, -1) # n, h, w, c 38 | x = tf.image.resize_nearest_neighbor(x, [16, 16]) 39 | x = tf.transpose(x, [0, 3, 1, 2]) 40 | 41 | x = tf.Variable(x, trainable=False, name='image', dtype=tf.float32) 42 | y = tf.Variable(y, trainable=False, name='label', dtype=tf.int64) 43 | 44 | # TODO: using `slice_input_producer` slows down training!!! WHY? 45 | # 9.7 steps per sec -> 5.7 steps per sec 46 | # (but it consumes more GPU memory and GPU-Util, why?!) 
47 | 48 | x, y = tf.train.slice_input_producer([x, y]) #, shuffle=False) 49 | 50 | return tf.train.batch( 51 | # return tf.train.shuffle_batch( 52 | [x, y], 53 | batch_size=batch_size, 54 | num_threads=8, 55 | # capacity=capacity, 56 | # min_after_dequeue=min_after_dequeue, 57 | # enqueue_many=True 58 | ) 59 | 60 | def mnist16_batcher_in_z_norm( 61 | batch_size, 62 | capacity=256, 63 | min_after_dequeue=128, 64 | ): 65 | ''' 66 | NCHW (channel first) 67 | ''' 68 | (x, y), (_, _) = keras.datasets.mnist.load_data() 69 | x_mu = x.mean(0) 70 | x_std = x.std(0) 71 | x = (x - x_mu) / (x_std + 1e-6) 72 | y = tf.constant(y, dtype=tf.int64) 73 | # x = tf.constant(x, dtype=tf.float32) / 127.5 - 1. 74 | x = tf.constant(x, dtype=tf.float32) 75 | x = tf.clip_by_value(x, -5, 5) 76 | x = tf.expand_dims(x, -1) # n, h, w, c 77 | x = tf.image.resize_bilinear(x, [16, 16]) 78 | x = tf.transpose(x, [0, 3, 1, 2]) 79 | 80 | return tf.train.shuffle_batch( 81 | [x, y], 82 | batch_size=batch_size, 83 | capacity=capacity, 84 | min_after_dequeue=min_after_dequeue, 85 | enqueue_many=True 86 | ) 87 | -------------------------------------------------------------------------------- /validate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | 8 | from models import MLPcVAE 9 | from vcc2016io import VCC2016TFRManager 10 | 11 | 12 | args = tf.app.flags.FLAGS 13 | tf.app.flags.DEFINE_string('logdir', 'logdir', 'log dir') 14 | tf.app.flags.DEFINE_integer('target_id', 9, 'target id (SF1 = 1, TM3 = 9)') 15 | tf.app.flags.DEFINE_string( 16 | 'file_pattern', None, 'filename filter') 17 | 18 | # speakers = ['SF1', 'SF2', 'SF3', 'SM1', 'SM2', 'TF1', 'TF2', 'TM1', 'TM2', 'TM3'] 19 | 20 | def main(): 21 | if args.logdir is None: 22 | raise ValueError('Please specify the logdir file') 23 | 24 | ckpt = get_checkpoint(args.logdir) 25 | 26 | if ckpt is None: 27 | raise ValueError('No checkpoints in {}'.format(args.logdir)) 28 | 29 | with open(os.path.join(args.logdir, 'architecture.json')) as f: 30 | arch = json.load(f) 31 | 32 | reader = VCC2016TFRManager() 33 | features = reader.read_whole(args.file_pattern, num_epochs=1) 34 | x = features['frame'] 35 | y = features['label'] 36 | filename = features['filename'] 37 | y_conv = y * 0 + args.target_id 38 | 39 | net = MLPcVAE(arch=arch, is_training=False) 40 | z = net.encode(x) 41 | xh = net.decode(z, y) 42 | x_conv = net.decode(z, y_conv) 43 | 44 | pre_train_saver = tf.train.Saver() 45 | def load_pretrain(sess): 46 | pre_train_saver.restore(sess, ckpt) 47 | sv = tf.train.Supervisor(init_fn=load_pretrain) 48 | gpu_options = tf.GPUOptions(allow_growth=True) 49 | sess_config = tf.ConfigProto( 50 | allow_soft_placement=True, 51 | gpu_options=gpu_options) 52 | with sv.managed_session(config=sess_config) as sess: 53 | for _ in range(reader.n_files): 54 | if sv.should_stop(): 55 | break 56 | fetch_dict = {'x': x, 'xh': xh, 'x_conv': x_conv, 'f': filename} 57 | results = sess.run(fetch_dict) 58 | plot_spectra(results) 59 | 60 | 61 | def get_checkpoint(logdir): 62 | ckpt = tf.train.get_checkpoint_state(logdir) 63 | if ckpt: 64 | return ckpt.model_checkpoint_path 65 | else: 66 | print('No checkpoint found') 67 | return None 68 | 69 | 70 | def plot_spectra(results): 71 | plt.figure(figsize=(10, 4)) 72 | plt.imshow( 73 | np.concatenate( 74 | [np.flipud(results['x'].T), 75 | np.flipud(results['xh'].T), 76 | np.flipud(results['x_conv'].T)], 77 | 
0),
78 |         aspect='auto',
79 |         cmap='jet',
80 |     )
81 |     plt.colorbar()
82 |     plt.title('Upper: Real input; Mid: Reconstruction; Lower: Conversion to target.')
83 |     plt.savefig(
84 |         os.path.join(
85 |             args.logdir,
86 |             '{}.png'.format(
87 |                 os.path.split(str(results['f'], 'utf-8'))[-1]
88 |             )
89 |         )
90 |     )
91 |
92 |
93 | if __name__ == '__main__':
94 |     main()
95 |
--------------------------------------------------------------------------------
/main-vawgan.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | from importlib import import_module
5 |
6 | import numpy as np
7 | import tensorflow as tf
8 |
9 | from analyzer import Tanhize, read
10 | from util.wrapper import validate_log_dirs
11 |
12 | args = tf.app.flags.FLAGS
13 | tf.app.flags.DEFINE_string(
14 |     'corpus_name', 'emotion_vc', 'Corpus name')
15 | tf.app.flags.DEFINE_string(
16 |     'logdir_root', None, 'root of log dir')
17 | tf.app.flags.DEFINE_string(
18 |     'logdir', None, 'log dir')
19 | tf.app.flags.DEFINE_string(
20 |     'restore_from', None, 'restore from dir (not from *.ckpt)')
21 | tf.app.flags.DEFINE_string('gpu_cfg', None, 'GPU configuration')
22 | tf.app.flags.DEFINE_integer('summary_freq', 1000, 'Update summary')
23 | tf.app.flags.DEFINE_string(
24 |     'ckpt', None, 'specify the ckpt in restore_from (if there are multiple ckpts)')  # TODO
25 | tf.app.flags.DEFINE_string(
26 |     'architecture', 'architecture-vawgan-vcc2016.json', 'network architecture')
27 |
28 | tf.app.flags.DEFINE_string('model_module', 'model.vawgan', 'Model module')
29 | tf.app.flags.DEFINE_string('model', 'VAWGAN', 'Model: ConvVAE, VAWGAN')
30 |
31 | tf.app.flags.DEFINE_string('trainer_module', 'trainer.vawgan', 'Trainer module')
32 | tf.app.flags.DEFINE_string('trainer', 'VAWGANTrainer', 'Trainer: VAETrainer, VAWGANTrainer')
33 |
34 |
35 | def main(unused_args=None):
36 |     ''' NOTE: The input is rescaled to [-1, 1] '''
37 |     module = import_module(args.model_module, package=None)
38 |     MODEL = getattr(module, args.model)
39 |
40 |     module = import_module(args.trainer_module, package=None)
41 |     TRAINER = getattr(module, args.trainer)
42 |
43 |
44 |     dirs = validate_log_dirs(args)
45 |     tf.gfile.MakeDirs(dirs['logdir'])
46 |
47 |     with open(args.architecture) as f:
48 |         arch = json.load(f)
49 |
50 |     with open(os.path.join(dirs['logdir'], args.architecture), 'w') as f:
51 |         json.dump(arch, f, indent=4)
52 |
53 |     normalizer = Tanhize(
54 |         xmax=np.fromfile('./etc/{}_xmax.npf'.format(args.corpus_name)),
55 |         xmin=np.fromfile('./etc/{}_xmin.npf'.format(args.corpus_name)),
56 |     )
57 |
58 |     x_s, y_s, f0_s = read(
59 |         file_pattern=arch['training']['src_dir'],
60 |         batch_size=arch['training']['batch_size'],
61 |         capacity=2048,
62 |         min_after_dequeue=1024,
63 |         normalizer=normalizer,
64 |         data_format='NHWC',
65 |     )
66 |
67 |     x_t, y_t, f0_t = read(
68 |         file_pattern=arch['training']['trg_dir'],
69 |         batch_size=arch['training']['batch_size'],
70 |         capacity=2048,
71 |         min_after_dequeue=1024,
72 |         normalizer=normalizer,
73 |         data_format='NHWC',
74 |     )
75 |
76 |     machine = MODEL(arch, is_training=True)
77 |     # y_s_new = tf.stack([y_s,f0_s],axis=1)
78 |     # y_t_new = tf.stack([y_t,f0_t],axis=1)
79 |
80 |     loss = machine.loss(x_s, y_s, f0_s, x_t, y_t, f0_t)
81 |     trainer = TRAINER(loss, arch, args, dirs)
82 |     trainer.train(nIter=arch['training']['max_iter'], machine=machine)
83 |
84 |
85 | if __name__ == '__main__':
86 |     main()
87 |
--------------------------------------------------------------------------------
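The training entry point above scales every spectral frame into [-1, 1] with `Tanhize` (built from the corpus-wide extrema in `./etc/{corpus}_xmax.npf` / `_xmin.npf`) before it reaches the encoder. For reference, here is a plain-NumPy restatement of the forward/backward mapping; the TF version lives in `analyzer.py` further down, and this snippet is an illustrative sketch, not part of the repo:

```python
import numpy as np

def tanhize_forward(x, xmin, xmax):
    '''Map features to [-1, 1], mirroring Tanhize.forward_process in analyzer.py.'''
    x = (x - xmin) / (xmax - xmin)
    return np.clip(x, 0., 1.) * 2. - 1.

def tanhize_backward(x, xmin, xmax):
    '''Invert the scaling, mirroring Tanhize.backward_process.'''
    return (x * .5 + .5) * (xmax - xmin) + xmin
```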
/models.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | # from tensorflow.contrib import slim 3 | import tensorflow as tf 4 | from util.layers import GaussianLogDensity, GaussianKLD, \ 5 | GaussianSampleLayer, lrelu 6 | 7 | # TODO: 8 | # 1. Multi-stream? 9 | # 2. Separate MLP & CNN? 10 | # (or we can also just use CNN by viewing MLP input as D-channels) 11 | # 3. Conditional input as an option during __init__? 12 | 13 | 14 | class MLPcVAE(object): 15 | ''' 16 | Conditional Variational Auto-encoder implemented in multi-layer perceptron 17 | APIs: 18 | z = encode(x) 19 | xh = decode(z, y) 20 | Notation: 21 | shape: 22 | `b`: batch_size, 23 | `c`: indicates the dimension 24 | ''' 25 | 26 | def __init__(self, arch, is_training=False): 27 | self.arch = arch 28 | self.is_training = is_training 29 | self._decode = tf.make_template('Decoder', self._generator) 30 | self._encode = tf.make_template('Encoder', self._encoder) 31 | 32 | def _l2_regularized_embedding(self, n_class, h_dim, scope_name, var_name='y_emb'): 33 | with tf.variable_scope(scope_name): 34 | embeddings = tf.get_variable( 35 | name=var_name, 36 | shape=[n_class, h_dim], 37 | regularizer=tf.contrib.keras.regularizers.l2(1e-3), 38 | ) 39 | return embeddings 40 | 41 | def _encoder(self, x, is_training): 42 | ''' Expects 2D inputs (b, c) 43 | Input: 44 | `x`: shape=[b, c] 45 | Return: 46 | `z_mu`: mean vector of `z` 47 | `z_lv`: log var of `z` 48 | ''' 49 | for o in self.arch['encoder']['output']: 50 | x = tf.layers.dense(x, units=o, activation=lrelu) 51 | # x = tf.layers.batch_normalization(x, training=is_training) 52 | z_mu = tf.layers.dense(x, units=self.arch['z_dim'], name='z_mu') 53 | z_lv = tf.layers.dense(x, units=self.arch['z_dim'], name='z_lv') 54 | return z_mu, z_lv 55 | 56 | def _generator(self, z, y, is_training): 57 | ''' 58 | Input: 59 | z: shape=[b, c] 60 | y: speaker label; shape=[b,], dtype=int64 61 | Return: 62 | xh: reconstructed version of `x` (the input to the VAE) 63 | ''' 64 | self.speaker_repr = self._l2_regularized_embedding( 65 | n_class=self.arch['y_dim'], 66 | h_dim=self.arch['yemb_dim'], 67 | scope_name='y_embedding', 68 | var_name='y_emb' 69 | ) 70 | 71 | c = tf.nn.embedding_lookup(self.speaker_repr, y) 72 | x = tf.concat([z, c], -1) 73 | for o in self.arch['decoder']['output']: 74 | x = tf.layers.dense(x, units=o, activation=lrelu) 75 | # x = tf.layers.batch_normalization(x, training=is_training) 76 | return tf.layers.dense(x, units=self.arch['x_dim'], name='xh') 77 | 78 | def loss(self, x, y): 79 | ''' 80 | Args: 81 | x: shape=[s, b, c] 82 | y: shape=[s, b] 83 | Returns: 84 | a `dict` of losses 85 | ''' 86 | z_mu, z_lv = self._encode(x, is_training=self.is_training) 87 | z = GaussianSampleLayer(z_mu, z_lv) 88 | xh = self._decode(z, y, is_training=self.is_training) 89 | 90 | with tf.name_scope('loss'): 91 | with tf.name_scope('E_log_p_x_zy'): 92 | L_x = -1.0 * tf.reduce_mean( 93 | GaussianLogDensity(x, xh, tf.zeros_like(x)), 94 | ) 95 | with tf.name_scope('D_KL_z'): 96 | L_z = tf.reduce_mean( 97 | GaussianKLD( 98 | z_mu, z_lv, 99 | tf.zeros_like(z_mu), tf.zeros_like(z_lv) 100 | ) 101 | ) 102 | loss = { 103 | 'L_x': L_x, 104 | 'L_z': L_z, 105 | } 106 | 107 | tf.summary.scalar('L_x', L_x) 108 | tf.summary.scalar('L_z', L_z) 109 | return loss 110 | 111 | def encode(self, x): 112 | z_mu, z_lv = self._encode(x, is_training=False) 113 | return z_mu 114 | 115 | def decode(self, z, y, tanh=False): 116 | xh = self._decode(z, y, is_training=False) 117 | return 
xh
118 |
--------------------------------------------------------------------------------
/util/wrapper.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | from datetime import datetime
5 |
6 | import tensorflow as tf
7 |
8 |
9 | def get_checkpoint(logdir):
10 |     ''' Get the first checkpoint '''
11 |     ckpt = tf.train.get_checkpoint_state(logdir)
12 |     if ckpt:
13 |         return ckpt.model_checkpoint_path
14 |     else:
15 |         print('No checkpoint found')
16 |         return None
17 |
18 | def save(saver, sess, logdir, step):
19 |     ''' Save a model to logdir/model.ckpt-[step] '''
20 |     model_name = 'model.ckpt'
21 |     checkpoint_path = os.path.join(logdir, model_name)
22 |     print('Storing checkpoint to {} ...'.format(logdir), end="")
23 |     sys.stdout.flush()
24 |
25 |     if not os.path.exists(logdir):
26 |         os.makedirs(logdir)
27 |
28 |     saver.save(sess, checkpoint_path, global_step=step)
29 |     print(' Done.')
30 |
31 |
32 | def load(saver, sess, logdir, ckpt=None):
33 |     '''
34 |     Try to load a model from a dir (search for the newest checkpoint)
35 |     '''
36 |     print('Trying to restore checkpoints from {} ...'.format(logdir),
37 |           end="")
38 |     if ckpt:
39 |         ckpt = os.path.join(logdir, ckpt)
40 |         global_step = int(
41 |             ckpt
42 |             .split('/')[-1]
43 |             .split('-')[-1])
44 |         print(' Global step: {}'.format(global_step))
45 |         print(' Restoring...', end="")
46 |         saver.restore(sess, ckpt)
47 |         return global_step
48 |     else:
49 |         ckpt = tf.train.get_checkpoint_state(logdir)
50 |         if ckpt:
51 |             print(' Checkpoint found: {}'.format(ckpt.model_checkpoint_path))
52 |             global_step = int(
53 |                 ckpt.model_checkpoint_path
54 |                 .split('/')[-1]
55 |                 .split('-')[-1])
56 |             print(' Global step: {}'.format(global_step))
57 |             print(' Restoring...', end="")
58 |             saver.restore(sess, ckpt.model_checkpoint_path)
59 |             return global_step
60 |         else:
61 |             print('No checkpoint found')
62 |             return None
63 |
64 | #
65 |
66 | def configure_gpu_settings(gpu_cfg=None):
67 |     session_conf = None
68 |     if gpu_cfg:
69 |         with open(gpu_cfg) as f:
70 |             cfg = json.load(f)
71 |         gpu_options = tf.GPUOptions(
72 |             per_process_gpu_memory_fraction=cfg['per_process_gpu_memory_fraction'])
73 |         session_conf = tf.ConfigProto(
74 |             allow_soft_placement=cfg['allow_soft_placement'],
75 |             log_device_placement=cfg['log_device_placement'],
76 |             inter_op_parallelism_threads=cfg['inter_op_parallelism_threads'],
77 |             intra_op_parallelism_threads=cfg['intra_op_parallelism_threads'],
78 |             gpu_options=gpu_options)
79 |         # Timeline
80 |         # jit_level = 0
81 |         # session_conf.graph_options.optimizer_options.global_jit_level = jit_level
82 |         # sess = tf.Session(
83 |         #     config=session_conf)
84 |     # else:
85 |     #     sess = tf.Session()
86 |     return session_conf
87 |
88 | def restore_global_step(saver, sess, from_dir, ckpt):
89 |     try:
90 |         step = load(saver, sess, from_dir, ckpt)
91 |         if step is None:
92 |             step = 0
93 |     except:
94 |         print("Something's wrong while restoring checkpoints!")
95 |         raise
96 |     return step
97 |
98 |
99 | def validate_log_dirs(args):
100 |     ''' Create a default log dir (if necessary) '''
101 |     def get_default_logdir(logdir_root):
102 |         STARTED_DATESTRING = datetime.now().strftime('%0m%0d-%0H%0M-%0S-%Y')
103 |         logdir = os.path.join(logdir_root, 'train', STARTED_DATESTRING)
104 |         print('Using default logdir: {}'.format(logdir))
105 |         return logdir
106 |
107 |     if args.logdir and args.restore_from:
108 |         raise ValueError(
109 |             'You can only specify one of the following: ' +
110 |             '--logdir and --restore_from')
111 |
112 |     if args.logdir and args.logdir_root:
113 |         raise ValueError(
114 |             'You can only specify either --logdir or --logdir_root')
115 |
116 |     if args.logdir_root is None:
117 |         logdir_root = 'logdir'
118 |
119 |     if args.logdir is None:
120 |         logdir = get_default_logdir(logdir_root)
121 |
122 |     # Note: `logdir` and `restore_from` are exclusive
123 |     if args.restore_from is None:
124 |         restore_from = logdir
125 |     else:
126 |         restore_from = args.restore_from
127 |
128 |     return {
129 |         'logdir': logdir,
130 |         'logdir_root': logdir_root,
131 |         'restore_from': restore_from,
132 |     }
133 |
--------------------------------------------------------------------------------
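The next file, `model/vae.py`, assembles its objective as `loss['G'] = -logPx + D_KL` from `GaussianLogDensity` and `GaussianKLD`, which are defined in `util/layers.py` further down. For reference, a plain-NumPy restatement of those two diagonal-Gaussian formulas; an illustrative sketch, not repo code:

```python
import numpy as np

def gaussian_log_density(x, mu, log_var):
    '''log N(x; mu, exp(log_var)), summed over the last axis (cf. util/layers.py).'''
    var = np.exp(log_var)
    log_prob = -0.5 * (np.log(2. * np.pi) + log_var + (x - mu) ** 2 / var)
    return log_prob.sum(-1)

def gaussian_kld(mu1, lv1, mu2, lv2):
    '''KL(N1 || N2) for diagonal Gaussians parameterized by mean and log-variance.'''
    v1, v2 = np.exp(lv1), np.exp(lv2)
    dimwise = .5 * ((lv2 - lv1) + (v1 + (mu1 - mu2) ** 2) / v2 - 1.)
    return dimwise.sum(-1)
```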
/model/vae.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.contrib import slim
3 | from util.image import nchw_to_nhwc
4 | from util.layers import (GaussianKLD, GaussianLogDensity, GaussianSampleLayer,
5 |                          Layernorm, conv2d_nchw_layernorm, lrelu)
6 | # from model.wgan import GradientPenaltyWGAN
7 |
8 | class ConvVAE(object):
9 |     def __init__(self, arch, is_training=False):
10 |         '''
11 |         Variational auto-encoder implemented in 2D convolutional neural nets
12 |         Input:
13 |             `arch`: network architecture (`dict`)
14 |             `is_training`: (unused now) it was kept for historical reasons (for `BatchNorm`)
15 |         '''
16 |         self.arch = arch
17 |         self._sanity_check()
18 |         self.is_training = is_training
19 |
20 |         with tf.name_scope('SpeakerRepr'):
21 |             self.y_emb = self._l2_regularized_embedding(
22 |                 self.arch['y_dim'],
23 |                 self.arch['z_dim'],
24 |                 'y_embedding')
25 |
26 |         self._generate = tf.make_template(
27 |             'Generator',
28 |             self._generator)
29 |
30 |         self._encode = tf.make_template(
31 |             'Encoder',
32 |             self._encoder)
33 |
34 |         self.generate = self.decode  # for VAE-GAN extension
35 |
36 |
37 |     def _sanity_check(self):
38 |         for net in ['encoder', 'generator']:
39 |             assert len(self.arch[net]['output']) == len(self.arch[net]['kernel']) == len(self.arch[net]['stride'])
40 |
41 |
42 |     def _unit_embedding(self, n_class, h_dim, scope_name, var_name='y_emb'):
43 |         with tf.variable_scope(scope_name):
44 |             embeddings = tf.get_variable(
45 |                 name=var_name,
46 |                 shape=[n_class, h_dim])
47 |             embeddings = tf.nn.l2_normalize(embeddings, dim=-1, name=var_name+'normalized')
48 |         return embeddings
49 |
50 |
51 |     def _merge(self, var_list, fan_out, l2_reg=1e-6):
52 |         x = 0.
53 | with slim.arg_scope( 54 | [slim.fully_connected], 55 | num_outputs=fan_out, 56 | weights_regularizer=slim.l2_regularizer(l2_reg), 57 | normalizer_fn=None, 58 | activation_fn=None): 59 | for var in var_list: 60 | x = x + slim.fully_connected(var) 61 | return slim.bias_add(x) 62 | 63 | 64 | def _l2_regularized_embedding(self, n_class, h_dim, scope_name, var_name='y_emb'): 65 | with tf.variable_scope(scope_name): 66 | embeddings = tf.get_variable( 67 | name=var_name, 68 | shape=[n_class, h_dim], 69 | regularizer=slim.l2_regularizer(1e-6)) 70 | return embeddings 71 | 72 | def _encoder(self, x, is_training=None): 73 | net = self.arch['encoder'] 74 | for i, (o, k, s) in enumerate(zip(net['output'], net['kernel'], net['stride'])): 75 | x = conv2d_nchw_layernorm( 76 | x, o, k, s, lrelu, 77 | name='Conv2d-{}'.format(i) 78 | ) 79 | x = slim.flatten(x) 80 | z_mu = tf.layers.dense(x, self.arch['z_dim']) 81 | z_lv = tf.layers.dense(x, self.arch['z_dim']) 82 | return z_mu, z_lv 83 | 84 | def _generator(self, z, y, is_training=None): 85 | net = self.arch['generator'] 86 | h, w, c = net['hwc'] 87 | 88 | if y is not None: 89 | y = tf.nn.embedding_lookup(self.y_emb, y) 90 | x = self._merge([z, y], h * w * c) 91 | else: 92 | x = z 93 | 94 | x = tf.reshape(x, [-1, c, h, w]) # channel first 95 | for i, (o, k, s) in enumerate(zip(net['output'], net['kernel'], net['stride'])): 96 | x = tf.layers.conv2d_transpose(x, o, k, s, 97 | padding='same', 98 | data_format='channels_first', 99 | ) 100 | if i < len(net['output']) -1: 101 | x = Layernorm(x, [1, 2, 3], 'ConvT-LN{}'.format(i)) 102 | x = lrelu(x) 103 | else: 104 | x = tf.nn.tanh(x) # not 100% necessary 105 | return x 106 | 107 | 108 | def loss(self, x, y): 109 | with tf.name_scope('loss'): 110 | z_mu, z_lv = self._encode(x) 111 | z = GaussianSampleLayer(z_mu, z_lv) 112 | xh = self._generate(z, y) 113 | 114 | D_KL = tf.reduce_mean( 115 | GaussianKLD( 116 | slim.flatten(z_mu), 117 | slim.flatten(z_lv), 118 | slim.flatten(tf.zeros_like(z_mu)), 119 | slim.flatten(tf.zeros_like(z_lv)), 120 | ) 121 | ) 122 | logPx = tf.reduce_mean( 123 | GaussianLogDensity( 124 | slim.flatten(x), 125 | slim.flatten(xh), 126 | tf.zeros_like(slim.flatten(xh))), # TODO exp -tf.log(1/9) + 127 | ) 128 | 129 | loss = dict() 130 | loss['G'] = - logPx + D_KL 131 | loss['D_KL'] = D_KL 132 | loss['logP'] = logPx 133 | 134 | tf.summary.scalar('KL-div', D_KL) 135 | tf.summary.scalar('logPx', logPx) 136 | 137 | tf.summary.histogram('xh', xh) 138 | tf.summary.histogram('x', x) 139 | return loss 140 | 141 | def encode(self, x): 142 | z_mu, _ = self._encode(x) 143 | return z_mu 144 | 145 | def decode(self, z, y): 146 | xh = self._generate(z, y) 147 | return nchw_to_nhwc(xh) 148 | -------------------------------------------------------------------------------- /trainer/vawgan.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import tensorflow as tf 5 | 6 | from trainer.vae import VAETrainer 7 | from util.wrapper import save 8 | 9 | 10 | def lr_schedule(step, schedule): 11 | for s, lr in schedule: 12 | if step < s: 13 | return lr 14 | return 1e-7 15 | 16 | class VAWGANTrainer(VAETrainer): 17 | def _optimize(self): 18 | hyperp = self.arch['training'] 19 | global_step = tf.Variable(0, name='global_step') 20 | lr = tf.placeholder(dtype=tf.float32) # can't add lr to summary 21 | 22 | optimizer_d = tf.train.RMSPropOptimizer(lr) #hyperp['lr']) 23 | optimizer_g = tf.train.RMSPropOptimizer(lr) #hyperp['lr']) 24 | 25 | trainables = 
tf.trainable_variables() 26 | g_vars = [v for v in trainables if 'Generator' in v.name] 27 | d_vars = [v for v in trainables if 'Discriminator' in v.name] 28 | e_vars = [v for v in trainables if 'Encoder' in v.name] 29 | 30 | # ======== Standard ======== 31 | # r: a {0, R+} weighting 32 | r = tf.placeholder(shape=[], dtype=tf.float32) 33 | k = tf.constant(hyperp['clamping'], shape=[]) 34 | for term in ['reconst_t', 'reconst_s', 'conv_s2t', 'real_s_t']: 35 | self.loss[term] = self.loss[term] / k 36 | 37 | obj_Dx = - self.loss['conv_s2t'] * k 38 | obj_Gx = r * self.loss['conv_s2t'] + self.loss['Dis'] 39 | obj_Ez = self.loss['KL(z)'] + self.loss['Dis'] 40 | 41 | opt_d = optimizer_d.minimize(obj_Dx, var_list=d_vars) 42 | opt_ds = [opt_d] 43 | 44 | logging.info('The following variables are clamped:') 45 | with tf.control_dependencies(opt_ds): 46 | with tf.name_scope('Clamping'): 47 | for v in d_vars: 48 | v_clamped = tf.clip_by_value(v, -k, k) 49 | clamping = tf.assign(v, v_clamped) 50 | opt_ds.append(clamping) 51 | logging.info(v.name) 52 | 53 | opt_g = optimizer_g.minimize(obj_Gx, var_list=g_vars, global_step=global_step) 54 | opt_e = optimizer_g.minimize(obj_Ez, var_list=e_vars) 55 | 56 | return dict( 57 | d=opt_ds, 58 | g=opt_g, 59 | gz=opt_e, 60 | lr=lr, 61 | gamma=r, 62 | global_step=global_step) 63 | 64 | 65 | def train(self, nIter, machine=None, summary_op=None): 66 | vae_saver = tf.train.Saver() 67 | sv = tf.train.Supervisor( 68 | logdir=self.dirs['logdir'], 69 | save_model_secs=300, 70 | global_step=self.opt['global_step']) 71 | 72 | sess_config = tf.ConfigProto( 73 | allow_soft_placement=True, 74 | gpu_options=tf.GPUOptions(allow_growth=True)) 75 | 76 | n_iter_per_epoch = 2 * int(1e5) // self.arch['training']['batch_size'] # TODO: should use #frames 77 | 78 | def print_log(info, results): 79 | msg = 'Epoch [{:3d}/{:3d}] '.format(info['ep'], info['nEpoch']) 80 | msg += '[{:4d}/{:4d}] '.format(info['it'], info['nIter']) 81 | msg += 'W: {:.2f} '.format(results['conv_s2t']) 82 | msg += 'DIS={:5.2f}, KLD={:5.2f}'.format(results['Dis'], results['KL(z)']) 83 | print('\r{}'.format(msg), end='', flush=True) 84 | logging.info(msg) 85 | 86 | 87 | hyperp = self.arch['training'] 88 | info = {'nEpoch': hyperp['epoch_vae'], 'nIter': n_iter_per_epoch} 89 | fetches = { 90 | 'conv_s2t': self.loss['conv_s2t'], 91 | 'Dis': self.loss['Dis'], 92 | 'KL(z)': self.loss['KL(z)'], 93 | 'opt_g': self.opt['g'], 94 | 'opt_e': self.opt['gz'], 95 | 'step': self.opt['global_step'], 96 | } 97 | with sv.managed_session(config=sess_config) as sess: 98 | try: 99 | update_G_E = [self.opt['g'], self.opt['gz']] 100 | 101 | for ep in range(hyperp['epoch_vae']): 102 | lr = lr_schedule(ep, hyperp['lr_schedule']) 103 | feed_dict = {self.opt['gamma']: 0., self.opt['lr']: lr} 104 | for it in range(n_iter_per_epoch): 105 | if it % 100 == 0: # update print only every 100 iters 106 | results = sess.run(fetches, feed_dict=feed_dict) 107 | info.update({'ep': ep + 1, 'it': it + 1}) 108 | print_log(info, results) 109 | else: 110 | sess.run(update_G_E, feed_dict=feed_dict) 111 | 112 | save(vae_saver, sess, os.path.join(self.dirs['logdir'], 'VAE'), fetches['step']) 113 | info.update({'nEpoch': hyperp['epoch_vawgan'] + hyperp['epoch_vae']}) 114 | 115 | for ep in range(hyperp['epoch_vawgan']): 116 | ep = ep + hyperp['epoch_vae'] 117 | 118 | lr = lr_schedule(ep, hyperp['lr_schedule']) 119 | feed_dict = {self.opt['gamma']: hyperp['gamma'], self.opt['lr']: lr} 120 | 121 | info.update({'ep': ep}) 122 | for it in range(n_iter_per_epoch): 123 | 
if (ep == hyperp['epoch_vae'] and it < 25) or (it % 100 == 0):
124 |                             nIterD = hyperp['n_unroll_intense']
125 |                         else:
126 |                             nIterD = hyperp['n_unroll']
127 |
128 |                         for _ in range(nIterD):
129 |                             sess.run(self.opt['d'], feed_dict=feed_dict)
130 |
131 |                         if it % 100 != 0:
132 |                             sess.run(update_G_E, feed_dict=feed_dict)
133 |                         else:
134 |                             results = sess.run(fetches, feed_dict=feed_dict)
135 |                             info.update({'ep': ep + 1, 'it': it + 1})
136 |                             print_log(info, results)
137 |             except KeyboardInterrupt:
138 |                 print()
139 |             finally:
140 |                 save(sv.saver, sess, self.dirs['logdir'], fetches['step'])
141 |                 print()
142 |
143 |
--------------------------------------------------------------------------------
/convert-vawgan-cwt.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 |
5 | import tensorflow as tf
6 | import numpy as np
7 | import soundfile as sf
8 | from os.path import join
9 | from util.wrapper import load
10 | from analyzer_copy import read_whole_features, pw2wav, dic2npy
11 | from analyzer_copy import Tanhize
12 | from datetime import datetime
13 | from importlib import import_module
14 |
15 | args = tf.app.flags.FLAGS
16 | tf.app.flags.DEFINE_string('corpus_name', 'emotion_vc', 'Corpus name')
17 | tf.app.flags.DEFINE_string('checkpoint', './logdir/train/0921-2112-44-2020/model.ckpt-46860', 'checkpoint to restore (dir + ckpt name)')
18 | tf.app.flags.DEFINE_string('src', 'Neutral', 'source emotion (e.g., Neutral)')
19 | tf.app.flags.DEFINE_string('trg', 'Angry', 'target emotion (e.g., Angry)')
20 | tf.app.flags.DEFINE_string('output_dir', './logdir', 'root of output dir')
21 | tf.app.flags.DEFINE_string('module', 'model.vawgan', 'Module')
22 | tf.app.flags.DEFINE_string('model', 'VAWGAN', 'Model')
23 | #tf.app.flags.DEFINE_string('file_pattern', './test_condition_results/{}/*.bin', 'file pattern')
24 | tf.app.flags.DEFINE_string('file_pattern', './data/bin/evaluation_set/{}/*.bin', 'file pattern')
25 | tf.app.flags.DEFINE_string(
26 |     'speaker_list', '../vaw_original/etc/speakers.tsv', 'Speaker list (one speaker per line)'
27 | )
28 |
29 | def make_output_wav_name(output_dir, filename):
30 |     basename = str(filename, 'utf8')
31 |     basename = os.path.split(basename)[-1]
32 |     basename = os.path.splitext(basename)[0]
33 |     return os.path.join(
34 |         output_dir,
35 |         '{}-{}-{}.wav'.format(args.src, args.trg, basename)
36 |     )
37 |
38 | def make_output_bin_name(output_dir, filename):
39 |     basename = str(filename, 'utf8')
40 |     basename = os.path.split(basename)[-1]
41 |     basename = os.path.splitext(basename)[0]
42 |     return basename
43 |
44 | def get_default_output(logdir_root):
45 |     STARTED_DATESTRING = datetime.now().strftime('%0m%0d-%0H%0M-%0S-%Y')
46 |     logdir = os.path.join(logdir_root, 'output', STARTED_DATESTRING)
47 |     print('Using default logdir: {}'.format(logdir))
48 |     return logdir
49 |
50 | def convert_f0(f0, src, trg):
51 |     mu_s, std_s = np.fromfile(os.path.join('./etc', '{}.npf'.format(src)), np.float32)
52 |     mu_t, std_t = np.fromfile(os.path.join('./etc', '{}.npf'.format(trg)), np.float32)
53 |     lf0 = tf.where(f0 > 1., tf.log(f0), f0)
54 |     lf0 = tf.where(lf0 > 1., (lf0 - mu_s)/std_s * std_t + mu_t, lf0)
55 |     lf0 = tf.where(lf0 > 1., tf.exp(lf0), lf0)
56 |     return lf0
57 |
58 |
59 | def nh_to_nchw(x):
60 |     with tf.name_scope('NH_to_NCHW'):
61 |         x = tf.expand_dims(x, 1)   # [b, h] => [b, c=1, h]
62 |         return tf.expand_dims(x, -1)  # => [b, c=1, h, w=1]
63 |
64 | def nh_to_nhwc(x):
65 |     with tf.name_scope('NH_to_NHWC'):
66 |         return
tf.expand_dims(tf.expand_dims(x, -1), -1) 67 | 68 | 69 | def main(unused_args=None): 70 | # args(sys.argv) 71 | 72 | if args.model is None: 73 | raise ValueError( 74 | '\n You MUST specify `model`.' +\ 75 | '\n Use `python convert.py --help` to see applicable options.' 76 | ) 77 | 78 | module = import_module(args.module, package=None) 79 | MODEL = getattr(module, args.model) 80 | 81 | FS = 16000 82 | 83 | with open(args.speaker_list) as fp: 84 | SPEAKERS = [l.strip() for l in fp.readlines()] 85 | 86 | 87 | logdir, ckpt = os.path.split(args.checkpoint) 88 | if 'VAE' in logdir: 89 | _path_to_arch, _ = os.path.split(logdir) 90 | else: 91 | _path_to_arch = logdir 92 | arch = tf.gfile.Glob(os.path.join(_path_to_arch, 'architecture*.json')) 93 | if len(arch) != 1: 94 | print('WARNING: found more than 1 architecture files!') 95 | arch = arch[0] 96 | with open(arch) as fp: 97 | arch = json.load(fp) 98 | 99 | normalizer = Tanhize( 100 | xmax=np.fromfile('./etc/{}_xmax.npf'.format(args.corpus_name)), 101 | xmin=np.fromfile('./etc/{}_xmin.npf'.format(args.corpus_name)), 102 | ) 103 | 104 | features = read_whole_features(args.file_pattern.format(args.src)) 105 | 106 | x = normalizer.forward_process(features['sp']) 107 | x = nh_to_nhwc(x) 108 | 109 | 110 | #y_s = features['speaker'] 111 | #y_t_id = tf.placeholder(dtype=tf.int64, shape=[1,]) 112 | #y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0],], dtype=tf.int64) 113 | 114 | 115 | y_t = features['speaker'] 116 | 117 | 118 | 119 | #f0_t = features['f0'] 120 | f0_s = features['f0'] 121 | f0_t = convert_f0(f0_s, args.src, args.trg) 122 | #f0_s_convert = tf.cast(f0_s,dtype=tf.int64) 123 | f0_t_convert = tf.cast(f0_t,dtype=tf.int64) 124 | 125 | 126 | machine = MODEL(arch, is_training=False) 127 | z = machine.encode(x) 128 | x_t = machine.decode(z, y_t, f0_t_convert) # NOTE: the API yields NHWC format 129 | x_t = tf.squeeze(x_t) 130 | x_t = normalizer.backward_process(x_t) 131 | 132 | # For sanity check (validation) 133 | # x_s = machine.decode(z, y_s, f0_s_convert) 134 | # x_s = tf.squeeze(x_s) 135 | # x_s = normalizer.backward_process(x_s) 136 | 137 | #f0_s = features['f0'] 138 | #f0_t = convert_f0(f0_s, args.src, args.trg) 139 | 140 | output_dir = get_default_output(args.output_dir) 141 | 142 | saver = tf.train.Saver() 143 | sv = tf.train.Supervisor(logdir=output_dir) 144 | with sv.managed_session() as sess: 145 | load(saver, sess, logdir, ckpt=ckpt) 146 | print() 147 | while True: 148 | try: 149 | feat, f0, sp = sess.run( 150 | [features, f0_t, x_t], 151 | #feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])} 152 | feed_dict = {y_t: np.load('./emo_feat/0018/angry.npy').reshape(1,-1)} 153 | ) 154 | feat.update({'sp': sp, 'f0': f0}) 155 | 156 | feats = dic2npy(feat) 157 | oFilename = make_output_bin_name(output_dir, feat['filename']) 158 | 159 | with open(join('./bin_results', '{}.bin'.format(oFilename)), 'wb') as fp: 160 | fp.write(feats.tostring()) 161 | 162 | 163 | y = pw2wav(feat) 164 | oFilename = make_output_wav_name(output_dir, feat['filename']) 165 | print('\rProcessing {}'.format(oFilename), end='') 166 | sf.write(oFilename, y, FS) 167 | except KeyboardInterrupt: 168 | break 169 | finally: 170 | pass 171 | print() 172 | 173 | if __name__ == '__main__': 174 | tf.app.run() 175 | -------------------------------------------------------------------------------- /util/layers.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | from math import pi 3 | import tensorflow as tf 4 | # from 
tensorflow.contrib import slim 5 | from tensorflow.python.ops.init_ops import VarianceScaling 6 | 7 | EPSILON = tf.constant(1e-6, dtype=tf.float32) 8 | PI = tf.constant(pi, dtype=tf.float32) 9 | 10 | def Layernorm(x, axis, name): 11 | ''' 12 | Layer normalization (Ba, 2016) 13 | J: Z-normalization using all nodes of the layer on a per-sample basis. 14 | 15 | Input: 16 | `x`: channel_first/NCHW format! (or fully-connected) 17 | `axis`: list 18 | `name`: must be assigned 19 | 20 | Example: 21 | ```python 22 | axis = [1, 2, 3] 23 | x = tf.random_normal([64, 3, 10, 10]) 24 | name = 'D_layernorm' 25 | ``` 26 | Return: 27 | (x - u)/s * scale + offset 28 | 29 | Source: 30 | https://github.com/igul222/improved_wgan_training/blob/master/tflib/ops/layernorm.py 31 | ''' 32 | mean, var = tf.nn.moments(x, axis, keep_dims=True) 33 | n_neurons = x.get_shape().as_list()[axis[0]] 34 | offset = tf.get_variable( 35 | name+'.offset', 36 | shape=[n_neurons] + [1 for _ in range(len(axis) -1)], 37 | initializer=tf.zeros_initializer 38 | ) 39 | scale = tf.get_variable( 40 | name+'.scale', 41 | shape=[n_neurons] + [1 for _ in range(len(axis) -1)], 42 | initializer=tf.ones_initializer 43 | ) 44 | return tf.nn.batch_normalization(x, mean, var, offset, scale, 1e-5) 45 | 46 | 47 | def conv2d_nchw_layernorm(x, o, k, s, activation, name): 48 | ''' 49 | Input: 50 | `x`: input in NCHW format 51 | `o`: num of output nodes 52 | `k`: kernel size 53 | `s`: stride 54 | ''' 55 | with tf.variable_scope(name): 56 | x = tf.layers.conv2d( 57 | inputs=x, 58 | filters=o, 59 | kernel_size=k, 60 | strides=s, 61 | padding='same', 62 | data_format='channels_first', 63 | name=name, 64 | ) 65 | x = Layernorm(x, [1, 2, 3], 'layernorm') 66 | return activation(x) 67 | 68 | 69 | def selu(x): 70 | with tf.name_scope('selu'): 71 | alpha = 1.6732632423543772848170429916717 72 | scale = 1.0507009873554804934193349852946 73 | return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x)) 74 | 75 | def selu_normal(seed=None): 76 | return VarianceScaling( 77 | scale=1., mode='fan_in', distribution='normal', seed=seed) 78 | 79 | def mu_law_encode_nonlinear(audio, quantization_channels=256): 80 | ''' 81 | Compress the waveform amplitudes using mu-law non-linearity. 82 | NOTE: This mu-law functions as a non-linear function as opposed to 83 | quantization. 84 | ''' 85 | with tf.name_scope('encode'): 86 | mu = tf.to_float(quantization_channels - 1) 87 | # Perform mu-law companding transformation (ITU-T, 1988). 88 | # Minimum operation is here to deal with rare large amplitudes caused 89 | # by resampling. 90 | safe_audio_abs = tf.minimum(tf.abs(audio), 1.0) 91 | magnitude = tf.log1p(mu * safe_audio_abs) / tf.log1p(mu) 92 | signal = tf.multiply(tf.sign(audio), magnitude, name='mulaw') 93 | # Quantize signal to the specified number of levels. 94 | # return tf.to_int32((signal + 1) / 2 * mu + 0.5) 95 | return signal 96 | 97 | 98 | def mu_law_decode_nonlinear(output, quantization_channels=256): 99 | ''' 100 | Uncompress the waveform amplitudes using mu-law non-linearity. 101 | NOTE: This mu-law functions as a non-linear function. 102 | ''' 103 | with tf.name_scope('decode'): 104 | mu = quantization_channels - 1 105 | # Map values back to [-1, 1]. 106 | # signal = 2 * (tf.to_float(output) / mu) - 1 107 | signal = output 108 | # Perform inverse of mu-law transformation. 
109 | magnitude = (1 / mu) * ((1 + mu)**abs(signal) - 1) 110 | return tf.sign(signal) * magnitude 111 | 112 | 113 | def GumbelSampleLayer(y_mu): 114 | ''' Create Gumbel(0, 1) variable from Uniform[0, 1] ''' 115 | u = tf.random_uniform( 116 | minval=0.0, 117 | maxval=1.0, 118 | shape=tf.shape(y_mu)) 119 | g = - tf.log(- tf.log(u)) 120 | return y_mu + g 121 | 122 | 123 | def GumbelSoftmaxLogDensity(y, p, tau): 124 | # EPS = tf.constant(1e-10) 125 | k = tf.shape(y)[-1] 126 | k = tf.cast(k, tf.float32) 127 | # y = y + EPS 128 | # y = tf.divide(y, tf.reduce_sum(y, -1, keep_dims=True)) 129 | y = normalize_to_unit_sum(y) 130 | sum_p_over_y = tf.reduce_sum(tf.divide(p, tf.pow(y, tau)), -1) 131 | logp = tf.lgamma(k) 132 | logp = logp + (k - 1) * tf.log(tau) 133 | logp = logp - k * tf.log(sum_p_over_y) 134 | logp = logp + sum_p_over_y 135 | return logp 136 | 137 | 138 | def normalize_to_unit_sum(x, EPS=1e-10): 139 | ''' Along the last dim ''' 140 | EPS = tf.constant(EPS, dtype=tf.float32) 141 | x = x + EPS 142 | x_sum = tf.reduce_sum(x, -1, keep_dims=True) 143 | x = tf.divide(x, x_sum) 144 | return x 145 | 146 | 147 | def lrelu(x, leak=0.02, name="lrelu"): 148 | ''' Leaky ReLU ''' 149 | return tf.maximum(x, leak*x, name=name) 150 | 151 | 152 | def GaussianSampleLayer(z_mu, z_lv, name='GaussianSampleLayer'): 153 | with tf.name_scope(name): 154 | eps = tf.random_normal(tf.shape(z_mu)) 155 | std = tf.sqrt(tf.exp(z_lv)) 156 | return tf.add(z_mu, tf.multiply(eps, std)) 157 | 158 | 159 | def GaussianLogDensity(x, mu, log_var, name='GaussianLogDensity'): 160 | with tf.name_scope(name): 161 | c = tf.log(2. * PI) 162 | var = tf.exp(log_var) 163 | x_mu2 = tf.square(x - mu) # [Issue] not sure the dim works or not? 164 | x_mu2_over_var = tf.div(x_mu2, var + EPSILON) 165 | log_prob = -0.5 * (c + log_var + x_mu2_over_var) 166 | log_prob = tf.reduce_sum(log_prob, -1) # keep_dims=True, 167 | return log_prob 168 | 169 | 170 | def GaussianKLD(mu1, lv1, mu2, lv2): 171 | ''' Kullback-Leibler divergence of two Gaussians 172 | *Assuming that each dimension is independent 173 | mu: mean 174 | lv: log variance 175 | Equation: http://stats.stackexchange.com/questions/7440/kl-divergence-between-two-univariate-gaussians 176 | ''' 177 | with tf.name_scope('GaussianKLD'): 178 | v1 = tf.exp(lv1) 179 | v2 = tf.exp(lv2) 180 | mu_diff_sq = tf.square(mu1 - mu2) 181 | dimwise_kld = .5 * ( 182 | (lv2 - lv1) + tf.div(v1 + mu_diff_sq, v2 + EPSILON) - 1.) 183 | return tf.reduce_sum(dimwise_kld, -1) 184 | 185 | # Verification by CMU's implementation 186 | # http://www.cs.cmu.edu/~chanwook/MySoftware/rm1_Spk-by-Spk_MLLR/rm1_PNCC_MLLR_1/rm1/python/sphinx/divergence.py 187 | # def gau_kl(pm, pv, qm, qv): 188 | # """ 189 | # Kullback-Liebler divergence from Gaussian pm,pv to Gaussian qm,qv. 190 | # Also computes KL divergence from a single Gaussian pm,pv to a set 191 | # of Gaussians qm,qv. 192 | # Diagonal covariances are assumed. Divergence is expressed in nats. 
193 | # """ 194 | # if (len(qm.shape) == 2): 195 | # axis = 1 196 | # else: 197 | # axis = 0 198 | # # Determinants of diagonal covariances pv, qv 199 | # dpv = pv.prod() 200 | # dqv = qv.prod(axis) 201 | # # Inverse of diagonal covariance qv 202 | # iqv = 1./qv 203 | # # Difference between means pm, qm 204 | # diff = qm - pm 205 | # return (0.5 * 206 | # (np.log(dqv / dpv) # log |\Sigma_q| / |\Sigma_p| 207 | # + (iqv * pv).sum(axis) # + tr(\Sigma_q^{-1} * \Sigma_p) 208 | # + (diff * iqv * diff).sum(axis) # + (\mu_q-\mu_p)^T\Sigma_q^{-1}(\mu_q-\mu_p) 209 | # - len(pm))) # - N 210 | -------------------------------------------------------------------------------- /trainer/vae.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | # from util.image import make_png_jet_thumbnail, make_png_thumbnail 7 | from trainer.gan import GANTrainer 8 | 9 | class VAETrainer(GANTrainer): 10 | def _optimize(self): 11 | ''' 12 | NOTE: The author said that there was no need for 100 d_iter per 100 iters. 13 | https://github.com/igul222/improved_wgan_training/issues/3 14 | ''' 15 | global_step = tf.Variable(0, name='global_step') 16 | lr = self.arch['training']['lr'] 17 | b1 = self.arch['training']['beta1'] 18 | b2 = self.arch['training']['beta2'] 19 | optimizer = tf.train.AdamOptimizer(lr, b1, b2) 20 | 21 | g_vars = tf.trainable_variables() 22 | 23 | with tf.name_scope('Update'): 24 | opt_g = optimizer.minimize(self.loss['G'], var_list=g_vars, global_step=global_step) 25 | return { 26 | 'g': opt_g, 27 | 'global_step': global_step 28 | } 29 | 30 | 31 | def _refresh_status(self, sess): 32 | fetches = { 33 | "D_KL": self.loss['D_KL'], 34 | "logP": self.loss['logP'], 35 | "step": self.opt['global_step'], 36 | } 37 | result = sess.run( 38 | fetches=fetches, 39 | # options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), 40 | # run_metadata=run_metadata, 41 | ) 42 | 43 | # trace = timeline.Timeline(step_stats=run_metadata.step_stats) 44 | # with open(os.path.join(dirs['logdir'], 'timeline.ctf.json'), 'w') as fp: 45 | # fp.write(trace.generate_chrome_trace_format()) 46 | 47 | # Message 48 | msg = 'Iter {:05d}: '.format(result['step']) 49 | msg += 'log P(x|z, y) = {:.3e} '.format(result['logP']) 50 | msg += 'D_KL(z) = {:.3e} '.format(result['D_KL']) 51 | print('\r{}'.format(msg), end='', flush=True) 52 | logging.info(msg) 53 | 54 | 55 | # def _validate(self, machine, n=10): 56 | # N = n * n 57 | 58 | # # same row same z 59 | # z = tf.random_normal(shape=[n, self.arch['z_dim']]) 60 | # z = tf.tile(z, [1, n]) 61 | # z = tf.reshape(z, [N, -1]) 62 | # z = tf.Variable(z, trainable=False, dtype=tf.float32) 63 | 64 | # # same column same y 65 | # y = tf.range(0, 10, 1, dtype=tf.int64) 66 | # y = tf.reshape(y, [-1,]) 67 | # y = tf.tile(y, [n,]) 68 | 69 | # Xh = machine.generate(z, y) # 100, 64, 64, 3 70 | # Xh = make_png_thumbnail(Xh, n) 71 | # return Xh 72 | 73 | def train(self, nIter, machine=None, summary_op=None): 74 | # Xh = self._validate(machine=machine, n=10) 75 | 76 | run_metadata = tf.RunMetadata() 77 | 78 | sv = tf.train.Supervisor( 79 | logdir=self.dirs['logdir'], 80 | # summary_writer=summary_writer, 81 | # summary_op=None, 82 | # is_chief=True, 83 | save_model_secs=300, 84 | global_step=self.opt['global_step']) 85 | 86 | 87 | # sess_config = configure_gpu_settings(args.gpu_cfg) 88 | sess_config = tf.ConfigProto( 89 | allow_soft_placement=True, 90 | 
gpu_options=tf.GPUOptions(allow_growth=True)) 91 | 92 | with sv.managed_session(config=sess_config) as sess: 93 | sv.loop(60, self._refresh_status, (sess,)) 94 | for step in range(self.arch['training']['max_iter']): 95 | if sv.should_stop(): 96 | break 97 | 98 | # main loop 99 | sess.run(self.opt['g']) 100 | 101 | # # output img 102 | # if step % 1000 == 0: 103 | # xh = sess.run(Xh) 104 | # with tf.gfile.GFile( 105 | # os.path.join( 106 | # self.dirs['logdir'], 107 | # 'img-anime-{:03d}k.png'.format(step // 1000), 108 | # ), 109 | # mode='wb', 110 | # ) as fp: 111 | # fp.write(xh) 112 | 113 | 114 | 115 | class VAWGANTrainer(GANTrainer): 116 | def _optimize(self): 117 | ''' 118 | NOTE: The author said that there was no need for 100 d_iter per 100 iters. 119 | https://github.com/igul222/improved_wgan_training/issues/3 120 | ''' 121 | global_step = tf.Variable(0, name='global_step') 122 | lr = self.arch['training']['lr'] 123 | b1 = self.arch['training']['beta1'] 124 | b2 = self.arch['training']['beta2'] 125 | 126 | optimizer = tf.train.AdamOptimizer(lr, b1, b2) 127 | 128 | trainables = tf.trainable_variables() 129 | g_vars = [v for v in trainables if 'Generator' in v.name or 'y_emb' in v.name] 130 | d_vars = [v for v in trainables if 'Discriminator' in v.name] 131 | e_vars = [v for v in trainables if 'Encoder' in v.name] 132 | 133 | # # Debug =============== 134 | # debug(['Generator', 'Discriminator'], [g_vars, d_vars]) 135 | # # ============================ 136 | 137 | with tf.name_scope('Update'): 138 | opt_d = optimizer.minimize(self.loss['l_D'], var_list=d_vars) 139 | opt_e = optimizer.minimize(self.loss['l_E'], var_list=e_vars) 140 | with tf.control_dependencies([opt_e]): 141 | opt_g = optimizer.minimize(self.loss['l_G'], var_list=g_vars, global_step=global_step) 142 | return { 143 | 'd': opt_d, 144 | 'g': opt_g, 145 | 'e': opt_e, 146 | 'global_step': global_step 147 | } 148 | 149 | def train(self, nIter, machine=None, summary_op=None): 150 | # Xh = self._validate(machine=machine, n=10) 151 | 152 | run_metadata = tf.RunMetadata() 153 | 154 | # summary_op = tf.summary.merge_all() 155 | 156 | sv = tf.train.Supervisor( 157 | logdir=self.dirs['logdir'], 158 | # summary_writer=summary_writer, 159 | # summary_op=None, 160 | # is_chief=True, 161 | save_model_secs=100, 162 | global_step=self.opt['global_step']) 163 | 164 | 165 | # sess_config = configure_gpu_settings(args.gpu_cfg) 166 | sess_config = tf.ConfigProto( 167 | allow_soft_placement=True, 168 | gpu_options=tf.GPUOptions(allow_growth=True)) 169 | 170 | with sv.managed_session(config=sess_config) as sess: 171 | sv.loop(60, self._refresh_status, (sess,)) 172 | for step in range(self.arch['training']['max_iter']): 173 | if sv.should_stop(): 174 | break 175 | 176 | # main loop 177 | for _ in range(self.arch['training']['nIterD']): 178 | sess.run(self.opt['d']) 179 | sess.run(self.opt['g']) 180 | 181 | # # output img 182 | # if step % 1000 == 0: 183 | # xh = sess.run(Xh) 184 | # with tf.gfile.GFile( 185 | # os.path.join( 186 | # self.dirs['logdir'], 187 | # 'img-anime-{:03d}k.png'.format(step // 1000), 188 | # ), 189 | # mode='wb', 190 | # ) as fp: 191 | # fp.write(xh) 192 | 193 | def _refresh_status(self, sess): 194 | fetches = { 195 | "D_KL": self.loss['D_KL'], 196 | "logP": self.loss['logP'], 197 | "W_dist": self.loss['W_dist'], 198 | "gp": self.loss['gp'], 199 | "step": self.opt['global_step'], 200 | } 201 | result = sess.run( 202 | fetches=fetches, 203 | # options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), 204 | # 
run_metadata=run_metadata, 205 | ) 206 | 207 | # trace = timeline.Timeline(step_stats=run_metadata.step_stats) 208 | # with open(os.path.join(dirs['logdir'], 'timeline.ctf.json'), 'w') as fp: 209 | # fp.write(trace.generate_chrome_trace_format()) 210 | 211 | # Message 212 | msg = 'Iter {:05d}: '.format(result['step']) 213 | msg += 'W_dist = {:.4e} '.format(result['W_dist']) 214 | msg += 'log P(x|z, y) = {:.4e} '.format(result['logP']) 215 | msg += 'D_KL(z) = {:.4e} '.format(result['D_KL']) 216 | msg += 'GP = {:.4e} '.format(result['gp']) 217 | print('\r{}'.format(msg), end='', flush=True) 218 | logging.info(msg) -------------------------------------------------------------------------------- /analyzer.py: -------------------------------------------------------------------------------- 1 | # Assuming directory structure: 2 | # ./dataset/vcc2016/wav/Training Set/SF1/100001.wav 3 | 4 | import os 5 | from os.path import join 6 | 7 | import librosa 8 | import numpy as np 9 | import pyworld as pw 10 | import tensorflow as tf 11 | 12 | 13 | args = tf.app.flags.FLAGS 14 | tf.app.flags.DEFINE_string('dir_to_wav', './data/wav', 'Dir to *.wav') 15 | tf.app.flags.DEFINE_string('dir_to_bin', './data/bin', 'Dir to output *.bin') 16 | tf.app.flags.DEFINE_string('dir_to_features','../emotion_features','Dir to emotion features') 17 | tf.app.flags.DEFINE_integer('fs', 16000, 'Global sampling frequency') 18 | tf.app.flags.DEFINE_float('f0_ceil', 500, 'Global f0 ceiling') 19 | 20 | FFT_SIZE = 1024 21 | SP_DIM = FFT_SIZE // 2 + 1 22 | FEAT_DIM = SP_DIM + SP_DIM + 1 + 1 + 256 + 1 # [sp, ap, f0, en, emo_feats, s] 23 | RECORD_BYTES = FEAT_DIM * 4 # all features saved in `float32` 24 | 25 | EPSILON = 1e-10 26 | 27 | 28 | 29 | def dic2npy(features, feat_dim=513, fs=16000): 30 | ''' NOTE: Use `order='C'` to ensure Cython compatibility ''' 31 | en = np.reshape(features['en'], [-1, 1]) 32 | sp = np.power(10., features['sp']) 33 | sp = en * sp 34 | 35 | f0 = features['f0'] 36 | f0 = np.reshape(f0,[-1,1]) 37 | 38 | 39 | # lf0_cwt = features['lf0_cwt'] 40 | # uv = np.reshape(features['uv'], [-1, 1]) 41 | # speaker = np.reshape(features['speaker'],[-1,1]) 42 | # filename = features['filename'] 43 | feats = np.concatenate([sp,f0],axis=1) 44 | feats = np.float32(feats) 45 | return feats 46 | 47 | def wav2pw(x, fs=16000, fft_size=FFT_SIZE): 48 | ''' Extract WORLD feature from waveform ''' 49 | _f0, t = pw.dio(x, fs, f0_ceil=args.f0_ceil) # raw pitch extractor 50 | f0 = pw.stonemask(x, _f0, t, fs) # pitch refinement 51 | sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size) 52 | ap = pw.d4c(x, f0, t, fs, fft_size=fft_size) # extract aperiodicity 53 | return { 54 | 'f0': f0, 55 | 'sp': sp, 56 | 'ap': ap, 57 | } 58 | 59 | 60 | def list_dir(path): 61 | ''' retrieve the 'short name' of the dirs ''' 62 | return sorted([f for f in os.listdir(path) if os.path.isdir(join(path, f))]) 63 | 64 | 65 | def list_full_filenames(path): 66 | ''' return a generator of full filenames ''' 67 | return ( 68 | join(path, f) 69 | for f in os.listdir(path) 70 | if not os.path.isdir(join(path, f))) 71 | 72 | def extract(filename, fft_size=FFT_SIZE, dtype=np.float32): 73 | ''' Basic (WORLD) feature extraction ''' 74 | x, _ = librosa.load(filename, sr=args.fs, mono=True, dtype=np.float64) 75 | features = wav2pw(x, args.fs, fft_size=fft_size) 76 | ap = features['ap'] 77 | f0 = features['f0'].reshape([-1, 1]) 78 | sp = features['sp'] 79 | en = np.sum(sp + EPSILON, axis=1, keepdims=True) 80 | sp = np.log10(sp / en) 81 | return np.concatenate([sp, ap, f0, 
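`analyzer.py` stores every frame as FEAT_DIM = 513 + 513 + 1 + 1 + 256 + 1 = 1285 float32 values (hence RECORD_BYTES = 5140). A sketch for inspecting one of the produced `.bin` files with plain NumPy (the file name is a placeholder):

```
import numpy as np

SP_DIM = 1024 // 2 + 1                    # 513
FEAT_DIM = SP_DIM * 2 + 1 + 1 + 256 + 1   # [sp, ap, f0, en, emo_feats, label]

frames = np.fromfile('100001.bin', np.float32).reshape([-1, FEAT_DIM])
sp    = frames[:, :SP_DIM]               # energy-normalized log10 spectral envelope
ap    = frames[:, SP_DIM:2 * SP_DIM]     # aperiodicity
f0    = frames[:, 2 * SP_DIM]            # pitch contour
en    = frames[:, 2 * SP_DIM + 1]        # per-frame energy
emo   = frames[:, 2 * SP_DIM + 2:-1]     # 256-dim emotion feature
label = frames[:, -1]                    # speaker/emotion index
```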
en], axis=1).astype(dtype) 82 | 83 | def extract_and_save_bin_to(dir_to_wav, dir_to_bin, dir_to_features, speakers): 84 | ''' 85 | NOTE: the directory structure must be [args.dir_to_wav]/[Set]/[speakers] 86 | ''' 87 | counter = 1 88 | N = len(tf.gfile.Glob(join(dir_to_wav, '*', '*', '*.wav'))) 89 | for d in list_dir(dir_to_wav): # ['Training Set', 'Testing Set'] 90 | path = join(dir_to_wav, d) 91 | for s in list_dir(path): # ['SF1', ..., 'TM3'] 92 | path = join(dir_to_wav, d, s) 93 | output_dir = join(dir_to_bin, d, s) 94 | tf.gfile.MakeDirs(output_dir) 95 | for f in list_full_filenames(path): # ['10001.wav', ...] 96 | print('\rFile {}/{}: {:50}'.format(counter, N, f), end='') 97 | features = extract(f) # [sp, ap, f0, en] 98 | labels = speakers.index(s) * np.ones( 99 | [features.shape[0], 1], 100 | np.float32, 101 | ) 102 | 103 | b = os.path.splitext(f)[0] # ['10001', ...] 104 | _, b = os.path.split(b) 105 | 106 | 107 | #emo_path = join(dir_to_features, s) 108 | #with open(join(emo_path,'{}.npy'.format(b)), 'r') as file_dir: 109 | # print(file_dir) 110 | emo = np.load(dir_to_features + '/' + s + '/' + b + '.npy') 111 | emo_feats = emo * np.ones( 112 | [features.shape[0], 1], 113 | np.float32, 114 | ) 115 | 116 | features = np.concatenate([features, emo_feats, labels], 1) 117 | with open(join(output_dir, '{}.bin'.format(b)), 'wb') as fp: 118 | fp.write(features.tostring()) 119 | counter += 1 120 | print() 121 | 122 | 123 | class Tanhize(object): 124 | ''' Normalizing `x` to [-1, 1] ''' 125 | def __init__(self, xmin, xmax): 126 | self.xmin = xmin 127 | self.xmax = xmax 128 | self.xscale = xmax - xmin 129 | 130 | def forward_process(self, x): 131 | x = (x - self.xmin) / self.xscale 132 | return tf.clip_by_value(x, 0., 1.) * 2. - 1. 133 | 134 | def backward_process(self, x): 135 | return (x * .5 + .5) * self.xscale + self.xmin 136 | 137 | def read( 138 | file_pattern, 139 | batch_size, 140 | record_bytes=RECORD_BYTES, 141 | capacity=2048, 142 | min_after_dequeue=1536, 143 | num_threads=8, 144 | data_format='NCHW', 145 | normalizer=None, 146 | ): 147 | ''' 148 | Read only `sp` and `speaker` 149 | Return: 150 | `feature`: [b, c] 151 | `speaker`: [b,] 152 | ''' 153 | with tf.device('cpu'): 154 | with tf.name_scope('InputSpectralFrame'): 155 | files = tf.gfile.Glob(file_pattern) 156 | filename_queue = tf.train.string_input_producer(files) 157 | 158 | 159 | reader = tf.FixedLengthRecordReader(record_bytes) 160 | _, value = reader.read(filename_queue) 161 | value = tf.decode_raw(value, tf.float32) 162 | 163 | value = tf.reshape(value, [FEAT_DIM,]) 164 | feature = value[:SP_DIM] # NCHW format 165 | 166 | if normalizer is not None: 167 | feature = normalizer.forward_process(feature) 168 | 169 | if data_format == 'NCHW': 170 | feature = tf.reshape(feature, [1, SP_DIM, 1]) 171 | elif data_format == 'NHWC': 172 | feature = tf.reshape(feature, [SP_DIM, 1, 1]) 173 | else: 174 | pass 175 | 176 | #speaker = tf.cast(value[-1], tf.int64) 177 | 178 | #zk 179 | f0 = tf.cast(value[-259], tf.int64) 180 | 181 | #speaker = tf.cast(value[-257:-1],tf.int64) 182 | 183 | 184 | #f0 = value[-259] 185 | speaker = value[-257:-1] 186 | 187 | 188 | return tf.train.shuffle_batch( 189 | [feature, speaker, f0], 190 | batch_size, 191 | capacity=capacity, 192 | min_after_dequeue=min_after_dequeue, 193 | num_threads=num_threads, 194 | # enqueue_many=True, 195 | ) 196 | 197 | 198 | def read_whole_features(file_pattern, num_epochs=1): 199 | ''' 200 | Return 201 | `feature`: `dict` whose keys are `sp`, `ap`, `f0`, `en`, `speaker` 202 
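`Tanhize` squashes features into [-1, 1] using corpus-wide min/max statistics; the `etc/emotion_vc_xmin.npf` / `emotion_vc_xmax.npf` files presumably hold those statistics as raw float32 (an assumption; the loading code is not part of this snapshot). A NumPy round-trip sketch:

```
import numpy as np

xmin = np.fromfile('./etc/emotion_vc_xmin.npf', np.float32)   # assumed raw-float32 layout
xmax = np.fromfile('./etc/emotion_vc_xmax.npf', np.float32)
xscale = xmax - xmin

def forward(x):    # mirrors Tanhize.forward_process
    return np.clip((x - xmin) / xscale, 0., 1.) * 2. - 1.

def backward(y):   # mirrors Tanhize.backward_process
    return (y * .5 + .5) * xscale + xmin

x = xmin + 0.3 * xscale   # any in-range value survives the round trip
assert np.allclose(backward(forward(x)), x, atol=1e-4)
```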
| ''' 203 | with tf.device('cpu'): 204 | with tf.name_scope('InputPipline'): 205 | files = tf.gfile.Glob(file_pattern) 206 | print('{} files found'.format(len(files))) 207 | filename_queue = tf.train.string_input_producer(files, num_epochs=num_epochs) 208 | reader = tf.WholeFileReader() 209 | key, value = reader.read(filename_queue) 210 | print("Processing {}".format(key), flush=True) 211 | value = tf.decode_raw(value, tf.float32) 212 | value = tf.reshape(value, [-1, FEAT_DIM]) 213 | return { 214 | 'sp': value[:, :SP_DIM], 215 | 'ap': value[:, SP_DIM : 2*SP_DIM], 216 | 'f0': value[:, SP_DIM * 2], 217 | 'en': value[:, SP_DIM * 2 + 1], 218 | 'speaker': value[:, SP_DIM * 2 + 1 + 1 : SP_DIM * 2 + 1 +1 +256], 219 | 'filename': key, 220 | } 221 | 222 | 223 | def pw2wav(features, feat_dim=513, fs=16000): 224 | ''' NOTE: Use `order='C'` to ensure Cython compatibility ''' 225 | en = np.reshape(features['en'], [-1, 1]) 226 | sp = np.power(10., features['sp']) 227 | sp = en * sp 228 | if isinstance(features, dict): 229 | return pw.synthesize( 230 | features['f0'].astype(np.float64).copy(order='C'), 231 | sp.astype(np.float64).copy(order='C'), 232 | features['ap'].astype(np.float64).copy(order='C'), 233 | fs, 234 | ) 235 | features = features.astype(np.float64) 236 | sp = features[:, :feat_dim] 237 | ap = features[:, feat_dim:feat_dim*2] 238 | f0 = features[:, feat_dim*2] 239 | en = features[:, feat_dim*2 + 1] 240 | en = np.reshape(en, [-1, 1]) 241 | sp = np.power(10., sp) 242 | sp = en * sp 243 | return pw.synthesize( 244 | f0.copy(order='C'), 245 | sp.copy(order='C'), 246 | ap.copy(order='C'), 247 | fs 248 | ) 249 | 250 | 251 | def make_speaker_tsv(path): 252 | speakers = [] 253 | for d in list_dir(path): 254 | speakers += list_dir(join(path, d)) 255 | speakers = sorted(set(speakers)) 256 | with open('./etc/speakers.tsv', 'w') as fp: 257 | for s in speakers: 258 | fp.write('{}\n'.format(s)) 259 | return speakers 260 | 261 | if __name__ == '__main__': 262 | speakers = make_speaker_tsv(args.dir_to_wav) 263 | extract_and_save_bin_to( 264 | args.dir_to_wav, 265 | args.dir_to_bin, 266 | args.dir_to_features, 267 | speakers, 268 | ) 269 | -------------------------------------------------------------------------------- /model/vawgan.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | from tensorflow.contrib import slim 3 | import tensorflow as tf 4 | from util.layers import GaussianLogDensity, GaussianKLD, \ 5 | GaussianSampleLayer, lrelu 6 | 7 | class VAWGAN(object): 8 | ''' 9 | VC-GAN 10 | = CVAE-CGAN 11 | = Convolutional Variational Auto-encoder 12 | with Conditional Generative Adversarial Net 13 | ''' 14 | def __init__(self, arch, is_training=False): 15 | self.arch = arch 16 | self._sanity_check() 17 | self.is_training = is_training 18 | 19 | with tf.name_scope('Generator'): 20 | self.y_emb = self._unit_embedding( 21 | self.arch['y_dim'], 22 | self.arch['z_dim'], 23 | 'y_embedding') 24 | self.f_emb = self._unit_embedding( 25 | self.arch['y_dim'], 26 | self.arch['f_dim'], 27 | 'f_embedding') 28 | 29 | self._generate = tf.make_template( 30 | 'Generator', 31 | self._generator) 32 | self._discriminate = tf.make_template( 33 | 'Discriminator', 34 | self._discriminator) 35 | self._encode = tf.make_template( 36 | 'Encoder', 37 | self._encoder) 38 | 39 | 40 | def _sanity_check(self): 41 | for net in ['encoder', 'generator', 'discriminator']: 42 | assert len(self.arch[net]['output']) > 2 43 | assert len(self.arch[net]['output']) == 
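`pw2wav` above inverts `extract`: it un-normalizes the log spectrum with the stored energy and hands the WORLD features back to `pyworld`. A hedged analysis/synthesis round-trip sketch using the same calls and constants (file names are placeholders; `soundfile` is an assumed writer, since `librosa.output` was removed in librosa 0.8):

```
import librosa
import numpy as np
import pyworld as pw
import soundfile as sf

fs = 16000
x, _ = librosa.load('input.wav', sr=fs, mono=True, dtype=np.float64)
_f0, t = pw.dio(x, fs, f0_ceil=500.)             # raw pitch, as in wav2pw
f0 = pw.stonemask(x, _f0, t, fs)                 # pitch refinement
sp = pw.cheaptrick(x, f0, t, fs, fft_size=1024)  # spectral envelope
ap = pw.d4c(x, f0, t, fs, fft_size=1024)         # aperiodicity
y = pw.synthesize(f0, sp, ap, fs)                # back to a waveform
sf.write('roundtrip.wav', y, fs)
```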
len(self.arch[net]['kernel']) 44 | assert len(self.arch[net]['output']) == len(self.arch[net]['stride']) 45 | 46 | 47 | def _unit_embedding(self, n_class, h_dim, scope_name, var_name='y_emb'): 48 | with tf.variable_scope(scope_name): 49 | embeddings = tf.get_variable( 50 | name=var_name, 51 | shape=[n_class, h_dim]) 52 | embeddings = tf.nn.l2_normalize(embeddings, dim=-1, name=var_name+'normalized') 53 | return embeddings 54 | 55 | 56 | def _merge(self, var_list, fan_out, l2_reg=1e-6): 57 | ''' 58 | Note: Don't apply BN on this because 'y' 59 | tends to be the same inside a batch. 60 | ''' 61 | x = 0. 62 | with slim.arg_scope( 63 | [slim.fully_connected], 64 | num_outputs=fan_out, 65 | weights_regularizer=slim.l2_regularizer(l2_reg), 66 | normalizer_fn=None, 67 | activation_fn=None): 68 | for var in var_list: 69 | x = x + slim.fully_connected(var) 70 | x = slim.bias_add(x) 71 | return x 72 | 73 | 74 | def _l2_regularized_embedding(self, n_class, h_dim, scope_name, var_name='y_emb'): 75 | with tf.variable_scope(scope_name): 76 | embeddings = tf.get_variable( 77 | name=var_name, 78 | shape=[n_class, h_dim], 79 | regularizer=slim.l2_regularizer(1e-6)) 80 | return embeddings 81 | 82 | 83 | def _encoder(self, x, is_training): 84 | n_layer = len(self.arch['encoder']['output']) 85 | subnet = self.arch['encoder'] 86 | 87 | with slim.arg_scope( 88 | [slim.batch_norm], 89 | scale=True, scope='BN', 90 | updates_collections=None, 91 | decay=0.9, epsilon=1e-5, # [TODO] Test these hyper-parameters 92 | is_training=is_training): 93 | with slim.arg_scope( 94 | [slim.conv2d], 95 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 96 | normalizer_fn=slim.batch_norm, 97 | activation_fn=lrelu): 98 | 99 | for i in range(n_layer): 100 | x = slim.conv2d( 101 | x, 102 | subnet['output'][i], 103 | subnet['kernel'][i], 104 | subnet['stride'][i]) 105 | tf.summary.image( 106 | 'down-sample{:d}'.format(i), 107 | tf.transpose(x[:, :, :, 0:3], [2, 1, 0, 3])) 108 | 109 | x = slim.flatten(x) 110 | 111 | with slim.arg_scope( 112 | [slim.fully_connected], 113 | num_outputs=self.arch['z_dim'], 114 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 115 | normalizer_fn=None, 116 | activation_fn=None): 117 | z_mu = slim.fully_connected(x) 118 | z_lv = slim.fully_connected(x) 119 | return z_mu, z_lv 120 | 121 | 122 | def _generator(self, z, y, f, is_training): 123 | ''' In this version, we only generate the target, so `y` is useless ''' 124 | subnet = self.arch['generator'] 125 | n_layer = len(subnet['output']) 126 | h, w, c = subnet['hwc'] 127 | 128 | #y = tf.nn.embedding_lookup(self.y_emb, y) 129 | f = tf.nn.embedding_lookup(self.f_emb, f) 130 | 131 | x = self._merge([z, y, f], subnet['merge_dim']) 132 | x = lrelu(x) 133 | with slim.arg_scope( 134 | [slim.batch_norm], 135 | scale=True, scope='BN', 136 | updates_collections=None, 137 | decay=0.9, epsilon=1e-5, 138 | is_training=is_training): 139 | 140 | x = slim.fully_connected( 141 | x, 142 | h * w * c, 143 | normalizer_fn=slim.batch_norm, 144 | activation_fn=lrelu) 145 | 146 | x = tf.reshape(x, [-1, h, w, c]) 147 | 148 | with slim.arg_scope( 149 | [slim.conv2d_transpose], 150 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 151 | normalizer_fn=slim.batch_norm, 152 | activation_fn=lrelu): 153 | 154 | for i in range(n_layer -1): 155 | x = slim.conv2d_transpose( 156 | x, 157 | subnet['output'][i], 158 | subnet['kernel'][i], 159 | subnet['stride'][i] 160 | # normalizer_fn=None 161 | ) 162 | 163 | # Don't apply BN for the last layer of G 164 | x = 
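`_merge` above conditions the generator by giving each input (`z`, `y`, `f`) its own bias-free linear projection to a shared width, summing the projections, and adding a single bias; batch norm is deliberately skipped because `y` is near-constant within a batch. In plain NumPy the operation is (random weights stand in for the slim layers):

```
import numpy as np

rng = np.random.default_rng(0)

def merge(var_list, fan_out):
    """sum_i (v_i @ W_i) + b -- one linear map per input, one shared bias."""
    x = sum(v @ rng.standard_normal((v.shape[-1], fan_out)) for v in var_list)
    return x + np.zeros(fan_out)   # bias (zero-initialized here)

z, y, f = rng.standard_normal((1, 64)), rng.standard_normal((1, 8)), rng.standard_normal((1, 8))
print(merge([z, y, f], fan_out=128).shape)   # (1, 128)
```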
slim.conv2d_transpose( 165 | x, 166 | subnet['output'][-1], 167 | subnet['kernel'][-1], 168 | subnet['stride'][-1], 169 | normalizer_fn=None, 170 | activation_fn=None) 171 | 172 | logit = x 173 | x = tf.nn.tanh(logit) 174 | return x, logit 175 | 176 | 177 | def _discriminator(self, x, is_training): 178 | ''' Note: In this version, `y` is useless ''' 179 | subnet = self.arch['discriminator'] 180 | n_layer = len(subnet['output']) 181 | 182 | intermediate = list() 183 | intermediate.append(x) 184 | 185 | # x = tf.concat(3, [x, y_vec]) # inject y into x 186 | with slim.arg_scope( 187 | [slim.batch_norm], 188 | scale=True, scope='BN', 189 | updates_collections=None, 190 | decay=0.9, epsilon=1e-5, 191 | is_training=is_training): 192 | with slim.arg_scope( 193 | [slim.conv2d], 194 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 195 | normalizer_fn=slim.batch_norm, 196 | activation_fn=lrelu): 197 | 198 | # Radford: [do] not applying batchnorm to the discriminator input layer 199 | x = slim.conv2d( 200 | x, 201 | subnet['output'][0], 202 | subnet['kernel'][0], 203 | subnet['stride'][0], 204 | normalizer_fn=None) 205 | intermediate.append(x) 206 | for i in range(1, n_layer): 207 | x = slim.conv2d( 208 | x, 209 | subnet['output'][i], 210 | subnet['kernel'][i], 211 | subnet['stride'][i]) 212 | intermediate.append(x) 213 | tf.summary.image( 214 | 'upsampling{:d}'.format(i), 215 | tf.transpose(x[:, :, :, 0:3], [2, 1, 0, 3])) 216 | 217 | # Don't apply BN for the last layer 218 | x = slim.flatten(x) 219 | h = slim.flatten(intermediate[subnet['feature_layer'] - 1]) 220 | 221 | x = slim.fully_connected( 222 | x, 223 | 1, 224 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 225 | activation_fn=None) 226 | 227 | return x, h # no explicit `sigmoid` 228 | 229 | 230 | def loss(self, x_s, y_s,f0_s, x_t, y_t,f0_t): 231 | def circuit_loop(x, y,f): 232 | 233 | z_mu, z_lv = self._encode(x, is_training=self.is_training) 234 | z = GaussianSampleLayer(z_mu, z_lv) 235 | 236 | x_logit, x_feature = self._discriminate( 237 | x, is_training=self.is_training) 238 | 239 | xh, xh_sig_logit = self._generate(z, y, f, is_training=self.is_training) 240 | 241 | zh_mu, zh_lv = self._encode(xh, is_training=self.is_training) 242 | 243 | xh_logit, xh_feature = self._discriminate( 244 | xh, is_training=self.is_training) 245 | 246 | return dict( 247 | z=z, 248 | z_mu=z_mu, 249 | z_lv=z_lv, 250 | xh=xh, 251 | xh_sig_logit=xh_sig_logit, 252 | x_logit=x_logit, 253 | x_feature=x_feature, 254 | zh_mu=zh_mu, 255 | zh_lv=zh_lv, 256 | xh_logit=xh_logit, 257 | xh_feature=xh_feature, 258 | ) 259 | 260 | s = circuit_loop(x_s, y_s, f0_s) 261 | t = circuit_loop(x_t, y_t, f0_t) 262 | s2t = circuit_loop(x_s, y_t,f0_t) 263 | 264 | with tf.name_scope('loss'): 265 | def mean_sigmoid_cross_entropy_with_logits(logit, truth): 266 | ''' 267 | truth: 0. or 1. 268 | ''' 269 | return tf.reduce_mean( 270 | tf.nn.sigmoid_cross_entropy_with_logits( 271 | logit, 272 | truth * tf.ones_like(logit))) 273 | 274 | loss = dict() 275 | 276 | # Parallel 277 | loss['reconst_t'] = \ 278 | tf.reduce_mean(t['x_logit']) \ 279 | - tf.reduce_mean(t['xh_logit']) 280 | 281 | # Parallel 282 | loss['reconst_s'] = \ 283 | tf.reduce_mean(s['x_logit']) \ 284 | - tf.reduce_mean(s['xh_logit']) 285 | 286 | # Non-parallel 287 | loss['conv_s2t'] = \ 288 | tf.reduce_mean(t['x_logit']) \ 289 | - tf.reduce_mean(s2t['xh_logit']) 290 | 291 | # Non-parallel: s v. 
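`circuit_loop` above draws the latent code with `GaussianSampleLayer(z_mu, z_lv)`. Assuming the usual reparameterization trick for that helper (it lives in `util/layers.py`, which is not shown here), the sampling step amounts to:

```
import numpy as np

def gaussian_sample(mu, log_var):
    """z = mu + exp(log_var / 2) * eps, eps ~ N(0, I);
    keeps z differentiable w.r.t. mu and log_var."""
    eps = np.random.default_rng().standard_normal(np.shape(mu))
    return mu + np.exp(0.5 * log_var) * eps

z = gaussian_sample(np.zeros(64), np.zeros(64))   # a standard-normal draw
```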
t 292 | loss['real_s_t'] = \ 293 | tf.reduce_mean(t['x_logit']) \ 294 | - tf.reduce_mean(s['x_logit']) 295 | 296 | # That's why I only take the last term into consideration 297 | loss['WGAN'] = loss['conv_s2t'] 298 | 299 | # VAE's Kullback-Leibler Divergence 300 | loss['KL(z)'] = \ 301 | tf.reduce_mean( 302 | GaussianKLD( 303 | s['z_mu'], s['z_lv'], 304 | tf.zeros_like(s['z_mu']), tf.zeros_like(s['z_lv']))) +\ 305 | tf.reduce_mean( 306 | GaussianKLD( 307 | t['z_mu'], t['z_lv'], 308 | tf.zeros_like(t['z_mu']), tf.zeros_like(t['z_lv']))) 309 | loss['KL(z)'] /= 2.0 310 | 311 | # VAE's Reconstruction Neg. Log-Likelihood (on the 'feature' space of Dx) 312 | loss['Dis'] = \ 313 | tf.reduce_mean( 314 | GaussianLogDensity( 315 | slim.flatten(x_t), 316 | slim.flatten(t['xh']), 317 | tf.zeros_like(slim.flatten(x_t)))) +\ 318 | tf.reduce_mean( 319 | GaussianLogDensity( 320 | slim.flatten(x_s), 321 | slim.flatten(s['xh']), 322 | tf.zeros_like(slim.flatten(x_s)))) 323 | loss['Dis'] /= - 2.0 324 | 325 | # For summaries 326 | with tf.name_scope('Summary'): 327 | tf.summary.scalar('DKL_z', loss['KL(z)']) 328 | tf.summary.scalar('MMSE', loss['Dis']) 329 | 330 | 331 | tf.summary.scalar('WGAN', loss['WGAN']) 332 | tf.summary.scalar('WGAN-s', loss['reconst_s']) 333 | tf.summary.scalar('WGAN-t', loss['reconst_t']) 334 | tf.summary.scalar('WGAN-s2t', loss['conv_s2t']) 335 | tf.summary.scalar('WGAN-t-s', loss['real_s_t']) 336 | 337 | tf.summary.histogram('y', tf.concat([y_t, y_s], 0)) 338 | tf.summary.histogram('z', tf.concat([s['z'], t['z']], 0)) 339 | 340 | tf.summary.histogram('z_s', s['z']) 341 | tf.summary.histogram('z_t', t['z']) 342 | 343 | tf.summary.histogram('z_mu', tf.concat([s['z_mu'], t['z_mu']], 0)) 344 | tf.summary.histogram('z_mu_s', s['z_mu']) 345 | tf.summary.histogram('z_mu_t', t['z_mu']) 346 | 347 | tf.summary.histogram('z_lv', tf.concat([s['z_lv'], t['z_lv']], 0)) 348 | tf.summary.histogram('z_lv_s', s['z_lv']) 349 | tf.summary.histogram('z_lv_t', t['z_lv']) 350 | 351 | tf.summary.histogram('logit_t_from_t', t['xh_logit']) 352 | tf.summary.histogram('logit_t_from_s', s2t['xh_logit']) 353 | tf.summary.histogram('logit_t', t['x_logit']) 354 | 355 | tf.summary.histogram( 356 | 'logit_t_True_FromT_FromS', 357 | tf.concat([t['x_logit'], t['xh_logit'], s2t['xh_logit']], 0)) 358 | tf.summary.histogram( 359 | 'logit_s_v_sh', 360 | tf.concat([s['x_logit'], s['xh_logit']], 0)) 361 | tf.summary.histogram( 362 | 'logit_t_v_th', 363 | tf.concat([t['x_logit'], t['xh_logit']], 0)) 364 | return loss 365 | 366 | 367 | def encode(self, x): 368 | z_mu, z_lv = self._encode(x, is_training=False) 369 | return z_mu 370 | 371 | def decode(self, z, y, f, tanh=False): 372 | xh, _ = self._generate(z, y, f, is_training=False) 373 | return xh 374 | 375 | def discriminate(self, x): 376 | ''' 377 | To estimate the EMD, we need D to assign a score per sample. 
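`loss['Dis']` above scores reconstructions with `GaussianLogDensity(x, xh, 0)`: the log-likelihood of `x` under a unit-variance Gaussian centred on the reconstruction (zero log-variance), averaged and negated. Assuming the standard closed form for that `util/layers.py` helper:

```
import numpy as np

def gaussian_log_density(x, mu, log_var):
    """Log N(x; mu, exp(log_var)), summed over the feature axis."""
    c = np.log(2. * np.pi)
    return np.sum(-0.5 * (c + log_var + (x - mu) ** 2 / np.exp(log_var)), axis=-1)

x = np.zeros((1, 4))
print(gaussian_log_density(x, x, np.zeros_like(x)))   # maximum: -0.5 * 4 * log(2*pi)
```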
378 | *The batches can be of different size 379 | ''' 380 | s, _ = self._discriminate(x, is_training=False) 381 | return s 382 | -------------------------------------------------------------------------------- /trainer/gan.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import logging, os 4 | from util.image import make_png_thumbnail, make_png_jet_thumbnail 5 | 6 | def debug(labels, var_list): 7 | for name, vs in zip(labels, var_list): 8 | print(name) 9 | for v in vs: 10 | print(v.name) 11 | print() 12 | 13 | class GANTrainer(object): 14 | def __init__(self, loss, arch, args, dirs): 15 | self.loss = loss 16 | self.arch = arch 17 | self.args = args 18 | self.dirs = dirs 19 | self.opt = self._optimize() 20 | logging.basicConfig( 21 | level=logging.INFO, 22 | filename=os.path.join(dirs['logdir'], 'training.log') 23 | ) 24 | 25 | def _optimize(self): 26 | ''' 27 | NOTE: The author said that there was no need for 100 d_iter per 100 iters. 28 | https://github.com/igul222/improved_wgan_training/issues/3 29 | ''' 30 | global_step = tf.Variable(0, name='global_step') 31 | lr = self.arch['training']['lr'] 32 | b1 = self.arch['training']['beta1'] 33 | b2 = self.arch['training']['beta2'] 34 | 35 | optimizer = tf.train.AdamOptimizer(lr, b1, b2) 36 | 37 | trainables = tf.trainable_variables() 38 | g_vars = [v for v in trainables if 'Generator' in v.name or 'y_emb' in v.name] 39 | d_vars = [v for v in trainables if 'Discriminator' in v.name] 40 | 41 | # # Debug =============== 42 | # debug(['Generator', 'Discriminator'], [g_vars, d_vars]) 43 | # # ============================ 44 | 45 | with tf.name_scope('Update'): 46 | opt_g = optimizer.minimize(self.loss['l_G'], var_list=g_vars, global_step=global_step) 47 | opt_d = optimizer.minimize(self.loss['l_D'], var_list=d_vars) 48 | return { 49 | 'd': opt_d, 50 | 'g': opt_g, 51 | 'global_step': global_step 52 | } 53 | 54 | 55 | def _validate(self, machine, n=10): 56 | N = n * n 57 | 58 | z = np.random.uniform(-np.pi, np.pi, size=[n, self.arch['z_dim']]) 59 | z = np.cos(z) 60 | z = np.concatenate([z] * n, axis=1) 61 | z = np.reshape(z, [N, -1]).astype(np.float32) # consecutive rows 62 | # y = np.random.randint(1, arch['y_dim'], size=[N, 2]) 63 | y = np.asarray( 64 | [[5, 0, 0 ], # 5 65 | [9, 0, 0 ], # 9 66 | [12, 0, 0 ], # 2 67 | [17, 0, 0 ], # 7 68 | [19, 0, 0 ], 69 | [161, 0, 0 ], 70 | [170, 0, 0 ], 71 | [170, 16, 0 ], 72 | [161, 9, 4 ], 73 | [19, 24, 50]], 74 | dtype=np.int64) 75 | y = np.concatenate([y] * n, axis=0) 76 | 77 | Z = tf.constant(z) 78 | Y = tf.constant(y) 79 | Xh = machine.generate(Z, Y) # 100, 64, 64, 3 80 | Xh = make_png_thumbnail(Xh, n) 81 | return Xh 82 | 83 | 84 | def _refresh_status(self, sess): 85 | fetches = { 86 | "l_D": self.loss['l_D'], 87 | "l_G": self.loss['l_G'], 88 | "step": self.opt['global_step'], 89 | } 90 | result = sess.run( 91 | fetches=fetches, 92 | # options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), 93 | # run_metadata=run_metadata, 94 | ) 95 | 96 | # trace = timeline.Timeline(step_stats=run_metadata.step_stats) 97 | # with open(os.path.join(dirs['logdir'], 'timeline.ctf.json'), 'w') as fp: 98 | # fp.write(trace.generate_chrome_trace_format()) 99 | 100 | # Message 101 | msg = 'Iter {:05d}: '.format(result['step']) 102 | msg += 'l_D={:.3e} '.format(result['l_D']) 103 | msg += 'l_G={:.3e} '.format(result['l_G']) 104 | print('\r{}'.format(msg), end='', flush=True) 105 | logging.info(msg) 106 | 107 | 108 | def train(self, 
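The name-based variable partitioning used throughout `trainer/gan.py` (`'Generator' in v.name`, `'Discriminator' in v.name`) is reliable because `tf.make_template` pins each sub-network to a fixed variable scope and reuses it on every call. A minimal sketch (TensorFlow 1.x assumed, as in the rest of the repo):

```
import tensorflow as tf   # TensorFlow 1.x assumed

def net(x):
    return tf.layers.dense(x, 4, name='fc')

gen = tf.make_template('Generator', net)
x = tf.placeholder(tf.float32, [None, 3])
_ = gen(x)   # creates Generator/fc/* variables
_ = gen(x)   # reuses them -- no duplicates

print([v.name for v in tf.trainable_variables() if 'Generator' in v.name])
# ['Generator/fc/kernel:0', 'Generator/fc/bias:0']
```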
nIter, machine=None, summary_op=None): 109 | Xh = self._validate(machine=machine, n=10) 110 | 111 | run_metadata = tf.RunMetadata() 112 | 113 | # summary_op = tf.summary.merge_all() 114 | 115 | sv = tf.train.Supervisor( 116 | logdir=self.dirs['logdir'], 117 | # summary_writer=summary_writer, 118 | # summary_op=None, 119 | # is_chief=True, 120 | # save_model_secs=600, 121 | global_step=self.opt['global_step']) 122 | 123 | 124 | # sess_config = configure_gpu_settings(args.gpu_cfg) 125 | sess_config = tf.ConfigProto( 126 | allow_soft_placement=True, 127 | gpu_options=tf.GPUOptions(allow_growth=True)) 128 | 129 | with sv.managed_session(config=sess_config) as sess: 130 | sv.loop(60, self._refresh_status, (sess,)) 131 | for step in range(self.arch['training']['max_iter']): 132 | if sv.should_stop(): 133 | break 134 | 135 | # main loop 136 | for _ in range(self.arch['training']['nIterD']): 137 | sess.run(self.opt['d']) 138 | sess.run(self.opt['g']) 139 | 140 | # output img 141 | if step % 1000 == 0: 142 | xh = sess.run(Xh) 143 | with tf.gfile.GFile( 144 | os.path.join( 145 | self.dirs['logdir'], 146 | 'img-anime-{:03d}k.png'.format(step // 1000), 147 | ), 148 | mode='wb', 149 | ) as fp: 150 | fp.write(xh) 151 | 152 | 153 | 154 | 155 | class FisherGANTrainer(GANTrainer): 156 | def _optimize(self): 157 | ''' 158 | NOTE: The author said that there was no need for 100 d_iter per 100 iters. 159 | https://github.com/igul222/improved_wgan_training/issues/3 160 | ''' 161 | global_step = tf.Variable(0, name='global_step') 162 | lr = self.arch['training']['lr'] 163 | b1 = self.arch['training']['beta1'] 164 | b2 = self.arch['training']['beta2'] 165 | rho = self.arch['training']['rho'] 166 | 167 | optimizer = tf.train.AdamOptimizer(lr, b1, b2) 168 | optimizer_l = tf.train.GradientDescentOptimizer(rho) 169 | 170 | trainables = tf.trainable_variables() 171 | g_vars = [v for v in trainables if 'Generator' in v.name or 'y_emb' in v.name] 172 | d_vars = [v for v in trainables if 'Discriminator' in v.name] 173 | l_vars = [v for v in trainables if 'lambda' in v.name] 174 | 175 | # # Debug =============== 176 | # debug(['G', 'D', 'lambda'], [g_vars, d_vars, l_vars]) 177 | # # ============================ 178 | 179 | with tf.name_scope('Update'): 180 | opt_g = optimizer.minimize(self.loss['l_G'], var_list=g_vars, global_step=global_step) 181 | opt_l = optimizer_l.minimize(- self.loss['l_D'], var_list=l_vars) 182 | with tf.control_dependencies([opt_l]): 183 | opt_d = optimizer.minimize(self.loss['l_D'], var_list=d_vars) 184 | return { 185 | 'd': opt_d, 186 | 'g': opt_g, 187 | 'l': opt_l, 188 | 'global_step': global_step 189 | } 190 | 191 | def _validate(self, machine, n=10): 192 | N = n * n 193 | 194 | # same row same z 195 | z = tf.random_normal(shape=[n, self.arch['z_dim']]) 196 | z = tf.tile(z, [1, n]) 197 | z = tf.reshape(z, [N, -1]) 198 | z = tf.Variable(z, trainable=False, dtype=tf.float32) 199 | 200 | # same column same y 201 | y = tf.range(0, 10, 1, dtype=tf.int64) 202 | y = tf.reshape(y, [-1, 1]) 203 | y = tf.tile(y, [n, 1]) 204 | 205 | Xh = machine.generate(z, y) # 100, 64, 64, 3 206 | # Xh = gray2jet(Xh) 207 | # Xh = make_png_thumbnail(Xh, n) 208 | Xh = make_png_jet_thumbnail(Xh, n) 209 | return Xh 210 | 211 | 212 | class FisherGANTrainerHw3(FisherGANTrainer): 213 | def _validate(self, machine, n=10): 214 | N = n * n 215 | z = np.random.normal(0., 1., size=[n, self.arch['z_dim']]) 216 | z = np.concatenate([z] * n, axis=1) 217 | z = np.reshape(z, [N, -1]).astype(np.float32) # consecutive rows 218 | y = 
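The `_validate` methods build an n-by-n preview grid in which every row shares one latent `z` and every column shares one label `y`. The tiling in isolation, as a NumPy sketch (`machine.generate` would consume the result):

```
import numpy as np

n, z_dim = 3, 4
z = np.random.normal(size=[n, z_dim])
z = np.reshape(np.concatenate([z] * n, axis=1), [n * n, z_dim])  # consecutive rows share a z

y = np.tile(np.arange(n), n)                                     # columns cycle through labels

print(z.shape, y.shape)   # (9, 4) (9,)
```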
np.asarray( 219 | [[5, 0, 0 ], 220 | [9, 0, 0 ], 221 | [12, 0, 0 ], 222 | [17, 0, 0 ], 223 | [19, 0, 0 ], 224 | [161, 0, 0 ], 225 | [170, 0, 0 ], 226 | [170, 16, 0 ], 227 | [161, 9, 4 ], 228 | [19, 24, 50]], 229 | dtype=np.int64) 230 | y = np.concatenate([y] * n, axis=0) 231 | Z = tf.constant(z) 232 | Y = tf.constant(y) 233 | Xh = machine.generate(Z, Y) # 100, 64, 64, 3 234 | Xh = make_png_thumbnail(Xh, n) 235 | return Xh 236 | 237 | 238 | 239 | class BiFisherGANTrainer(FisherGANTrainer): 240 | def _optimize(self): 241 | ''' 242 | NOTE: The author said that there was no need for 100 d_iter per 100 iters. 243 | https://github.com/igul222/improved_wgan_training/issues/3 244 | ''' 245 | global_step = tf.Variable(0, name='global_step') 246 | lr = self.arch['training']['lr'] 247 | b1 = self.arch['training']['beta1'] 248 | b2 = self.arch['training']['beta2'] 249 | rho = self.arch['training']['rho'] 250 | 251 | optimizer = tf.train.AdamOptimizer(lr, b1, b2) 252 | optimizer_l = tf.train.GradientDescentOptimizer(rho) 253 | 254 | trainables = tf.trainable_variables() 255 | g_vars = [v for v in trainables if 'Generator' in v.name or 'y_emb' in v.name] 256 | d_vars = [v for v in trainables if 'Discriminator' in v.name] 257 | l_vars = [v for v in trainables if 'lambda' in v.name] 258 | e_vars = [v for v in trainables if 'Encoder' in v.name] 259 | 260 | # # Debug =============== 261 | # for name, vs in zip( 262 | # ['Generator', 'Discriminator', 'Encoder', 'lambda'], 263 | # [g_vars, d_vars, e_vars, l_vars]): 264 | # print(name) 265 | # for v in vs: 266 | # print(v.name) 267 | # print() 268 | # # ============================ 269 | 270 | with tf.name_scope('Update'): 271 | opt_e = optimizer.minimize(self.loss['l_G'], var_list=e_vars) 272 | with tf.control_dependencies([opt_e]): 273 | opt_g = optimizer.minimize(self.loss['l_G'], var_list=g_vars, global_step=global_step) 274 | 275 | opt_l = optimizer_l.minimize(- self.loss['l_D'], var_list=l_vars) # opposite sign to D 276 | with tf.control_dependencies([opt_l]): 277 | opt_d = optimizer.minimize(self.loss['l_D'], var_list=d_vars) 278 | return { 279 | 'd': opt_d, 280 | 'g': opt_g, 281 | 'l': opt_l, 282 | 'global_step': global_step 283 | } 284 | 285 | class CycleFisherGANTrainer(FisherGANTrainer): 286 | def _optimize(self): 287 | ''' 288 | NOTE: The author said that there was no need for 100 d_iter per 100 iters. 
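`BiFisherGANTrainer._optimize` above chains its updates with `tf.control_dependencies`: one `sess.run` of the generator op first applies the encoder step, and one run of the critic op first applies the lambda ascent step. The idiom in isolation (TensorFlow 1.x assumed):

```
import tensorflow as tf   # TensorFlow 1.x assumed

v = tf.Variable(0.)
first = tf.assign_add(v, 1.)         # stands in for opt_e / opt_l
with tf.control_dependencies([first]):
    second = tf.assign_add(v, 10.)   # stands in for opt_g / opt_d

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(second)                 # runs `first`, then `second`
    print(sess.run(v))               # 11.0
```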
289 | https://github.com/igul222/improved_wgan_training/issues/3 290 | ''' 291 | global_step = tf.Variable(0, name='global_step') 292 | lr = self.arch['training']['lr'] 293 | b1 = self.arch['training']['beta1'] 294 | b2 = self.arch['training']['beta2'] 295 | rho = self.arch['training']['rho'] 296 | 297 | optimizer = tf.train.AdamOptimizer(lr, b1, b2) 298 | optimizer_l = tf.train.GradientDescentOptimizer(rho) 299 | 300 | trainables = tf.trainable_variables() 301 | g_vars = [v for v in trainables if 'Generator' in v.name or 'y_emb' in v.name] 302 | 303 | d_vars = [v for v in trainables if 'Discriminator' in v.name] 304 | l_vars = [v for v in trainables if 'lambda' in v.name] 305 | 306 | e_vars = [v for v in trainables if 'Encoder' in v.name] 307 | 308 | z_vars = [v for v in trainables if 'Dz' in v.name] 309 | j_vars = [v for v in trainables if 'lambdz' in v.name] 310 | # # Debug =============== 311 | # # ============================ 312 | 313 | with tf.name_scope('Update'): 314 | opt_j = optimizer_l.minimize(- self.loss['l_Dz'], var_list=j_vars) 315 | opt_z = optimizer.minimize(self.loss['l_Dz'], var_list=z_vars) 316 | 317 | opt_e = optimizer.minimize(self.loss['l_E'], var_list=e_vars) 318 | with tf.control_dependencies([opt_e, opt_j, opt_z]): 319 | opt_g = optimizer.minimize(self.loss['l_G'], var_list=g_vars, global_step=global_step) 320 | 321 | opt_l = optimizer_l.minimize(- self.loss['l_D'], var_list=l_vars) # opposite sign to D 322 | with tf.control_dependencies([opt_l]): 323 | opt_d = optimizer.minimize(self.loss['l_D'], var_list=d_vars) 324 | return { 325 | 'd': opt_d, 326 | 'g': opt_g, 327 | 'l': opt_l, 328 | 'global_step': global_step 329 | } 330 | 331 | 332 | # class BEGANTrainer(GANTrainer): 333 | # def _optimize(self): #loss, args, arch, net=None): 334 | # ''' 335 | # [TODO] 336 | # Although most of the trainer structures are the same, 337 | # I think we have to use different training scripts for VAE- and DC-GAN 338 | # (but do we have to have two different classes of VAE- and DC-?) 339 | # ''' 340 | # global_step = tf.Variable(0, name='global_step') 341 | # lr = self.arch['training']['lr'] 342 | # # optimizer_d = tf.train.AdamOptimizer(lr) 343 | # optimizer_g = tf.train.AdamOptimizer(lr) 344 | 345 | # trainables = tf.trainable_variables() 346 | # g_vars = [v for v in trainables if 'Generator' in v.name or 'y_emb' in v.name] 347 | # d_vars = [v for v in trainables if 'Critic' in v.name] 348 | 349 | # # Debug =============== 350 | # for name, vs in zip( 351 | # ['Generator', 'Critic'], 352 | # [g_vars, d_vars]): 353 | # print(name) 354 | # for v in vs: 355 | # print(v.name) 356 | # print() 357 | # # ============================ 358 | 359 | # with tf.name_scope('Update'): 360 | # opt_g = optimizer_g.minimize(self.loss['G'], var_list=g_vars) 361 | # opt_d = optimizer_g.minimize(self.loss['D'], global_step=global_step, var_list=d_vars) 362 | # with tf.control_dependencies([opt_d, opt_g]): 363 | # k_delta = self.arch['training']['lambda'] * self.loss['k_delta'] 364 | # opt_k = tf.assign( 365 | # net.k_t, tf.clip_by_value(net.k_t + k_delta, 0, 1)) 366 | 367 | # # logging.info('The following variables are clamped:') 368 | 369 | # return { 370 | # 'd': opt_d, 371 | # 'g': opt_g, 372 | # 'k': opt_k, 373 | # 'global_step': global_step 374 | # } 375 | 376 | class VAETrainer(GANTrainer): 377 | def _optimize(self): 378 | ''' 379 | NOTE: The author said that there was no need for 100 d_iter per 100 iters.
380 | https://github.com/igul222/improved_wgan_training/issues/3 381 | ''' 382 | global_step = tf.Variable(0, name='global_step') 383 | lr = self.arch['training']['lr'] 384 | b1 = self.arch['training']['beta1'] 385 | b2 = self.arch['training']['beta2'] 386 | 387 | optimizer = tf.train.AdamOptimizer(lr, b1, b2) 388 | 389 | trainables = tf.trainable_variables() 390 | g_vars = trainables 391 | # g_vars = [v for v in trainables if 'Generator' in v.name or 'y_emb' in v.name] 392 | 393 | with tf.name_scope('Update'): 394 | opt_g = optimizer.minimize(self.loss['G'], var_list=g_vars, global_step=global_step) 395 | return { 396 | 'g': opt_g, 397 | 'global_step': global_step 398 | } 399 | 400 | 401 | def _refresh_status(self, sess): 402 | fetches = { 403 | "D_KL": self.loss['D_KL'], 404 | "logP": self.loss['logP'], 405 | "step": self.opt['global_step'], 406 | } 407 | result = sess.run( 408 | fetches=fetches, 409 | # options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), 410 | # run_metadata=run_metadata, 411 | ) 412 | 413 | # trace = timeline.Timeline(step_stats=run_metadata.step_stats) 414 | # with open(os.path.join(dirs['logdir'], 'timeline.ctf.json'), 'w') as fp: 415 | # fp.write(trace.generate_chrome_trace_format()) 416 | 417 | # Message 418 | msg = 'Iter {:05d}: '.format(result['step']) 419 | msg += 'log P(x|z, y) = {:.3e} '.format(result['logP']) 420 | msg += 'D_KL(z) = {:.3e} '.format(result['D_KL']) 421 | print('\r{}'.format(msg), end='', flush=True) 422 | logging.info(msg) 423 | 424 | 425 | def _validate(self, machine, n=10): 426 | N = n * n 427 | 428 | # same row same z 429 | z = tf.random_normal(shape=[n, self.arch['z_dim']]) 430 | z = tf.tile(z, [1, n]) 431 | z = tf.reshape(z, [N, -1]) 432 | z = tf.Variable(z, trainable=False, dtype=tf.float32) 433 | 434 | # same column same y 435 | y = tf.range(0, 10, 1, dtype=tf.int64) 436 | y = tf.reshape(y, [-1,]) 437 | y = tf.tile(y, [n,]) 438 | 439 | Xh = machine.generate(z, y) # 100, 64, 64, 3 440 | Xh = make_png_thumbnail(Xh, n) 441 | return Xh 442 | 443 | def train(self, nIter, machine=None, summary_op=None): 444 | Xh = self._validate(machine=machine, n=10) 445 | 446 | run_metadata = tf.RunMetadata() 447 | 448 | sv = tf.train.Supervisor( 449 | logdir=self.dirs['logdir'], 450 | # summary_writer=summary_writer, 451 | # summary_op=None, 452 | # is_chief=True, 453 | # save_model_secs=600, 454 | global_step=self.opt['global_step']) 455 | 456 | 457 | # sess_config = configure_gpu_settings(args.gpu_cfg) 458 | sess_config = tf.ConfigProto( 459 | allow_soft_placement=True, 460 | gpu_options=tf.GPUOptions(allow_growth=True)) 461 | 462 | with sv.managed_session(config=sess_config) as sess: 463 | sv.loop(60, self._refresh_status, (sess,)) 464 | for step in range(self.arch['training']['max_iter']): 465 | if sv.should_stop(): 466 | break 467 | 468 | # main loop 469 | sess.run(self.opt['g']) 470 | 471 | # output img 472 | if step % 1000 == 0: 473 | xh = sess.run(Xh) 474 | with tf.gfile.GFile( 475 | os.path.join( 476 | self.dirs['logdir'], 477 | 'img-anime-{:03d}k.png'.format(step // 1000), 478 | ), 479 | mode='wb', 480 | ) as fp: 481 | fp.write(xh) 482 | -------------------------------------------------------------------------------- /model/vaegan.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | from tensorflow.contrib import slim 3 | import tensorflow as tf 4 | from util.layer import GaussianLogDensity, GaussianKLD, \ 5 | GaussianSampleLayer, lrelu 6 | 7 | class VAWGAN(object): 8 | ''' 9 | VC-GAN 10 
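Every trainer in `trainer/gan.py` reads its hyper-parameters from `arch['training']`, which `main-vawgan.py` presumably loads from `architecture-vawgan-vcc2016.json`. Collected in one place, the keys referenced in that file are (an illustrative Python dict; all values are placeholders, not the repo's settings):

```
arch = {
    'training': {
        'lr': 1e-4,          # Adam learning rate
        'beta1': 0.5,        # Adam beta1
        'beta2': 0.9,        # Adam beta2
        'rho': 1e-5,         # SGD step for the Fisher-GAN lambda
        'nIterD': 5,         # critic updates per generator update
        'max_iter': 200000,  # total training iterations
        'lambda': 1e-3,      # k_t step of the commented-out BEGANTrainer
    },
    'z_dim': 64,             # latent size, used by _validate
}
```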
| = CVAE-CGAN 11 | = Convolutional Variational Auto-encoder 12 | with Conditional Generative Adversarial Net 13 | ''' 14 | def __init__(self, arch, is_training=False): 15 | self.arch = arch 16 | self._sanity_check() 17 | self.is_training = is_training 18 | 19 | with tf.name_scope('SpeakerRepr'): 20 | self.y_emb = self._unit_embedding( 21 | self.arch['y_dim'], 22 | self.arch['z_dim'], 23 | 'y_embedding') 24 | 25 | self._generate = tf.make_template( 26 | 'Generator', 27 | self._generator) 28 | self._discriminate = tf.make_template( 29 | 'Discriminator', 30 | self._discriminator) 31 | self._encode = tf.make_template( 32 | 'Encoder', 33 | self._encoder) 34 | 35 | 36 | def _sanity_check(self): 37 | for net in ['encoder', 'generator', 'discriminator']: 38 | assert len(self.arch[net]['output']) > 2 39 | assert len(self.arch[net]['output']) == len(self.arch[net]['kernel']) 40 | assert len(self.arch[net]['output']) == len(self.arch[net]['stride']) 41 | 42 | 43 | def _unit_embedding(self, n_class, h_dim, scope_name, var_name='y_emb'): 44 | with tf.variable_scope(scope_name): 45 | embeddings = tf.get_variable( 46 | name=var_name, 47 | shape=[n_class, h_dim]) 48 | embeddings = tf.nn.l2_normalize(embeddings, dim=-1, name=var_name+'normalized') 49 | return embeddings 50 | 51 | 52 | def _merge(self, var_list, fan_out, l2_reg=1e-6): 53 | ''' 54 | Note: Don't apply BN on this because 'y' 55 | tends to be the same inside a batch. 56 | ''' 57 | x = 0. 58 | with slim.arg_scope( 59 | [slim.fully_connected], 60 | num_outputs=fan_out, 61 | weights_regularizer=slim.l2_regularizer(l2_reg), 62 | normalizer_fn=None, 63 | activation_fn=None): 64 | for var in var_list: 65 | x = x + slim.fully_connected(var) 66 | x = slim.bias_add(x) 67 | return x 68 | 69 | 70 | def _l2_regularized_embedding(self, n_class, h_dim, scope_name, var_name='y_emb'): 71 | with tf.variable_scope(scope_name): 72 | embeddings = tf.get_variable( 73 | name=var_name, 74 | shape=[n_class, h_dim], 75 | regularizer=slim.l2_regularizer(1e-6)) 76 | return embeddings 77 | 78 | 79 | def _encoder(self, x, is_training): 80 | n_layer = len(self.arch['encoder']['output']) 81 | subnet = self.arch['encoder'] 82 | 83 | with slim.arg_scope( 84 | [slim.batch_norm], 85 | scale=True, scope='BN', 86 | updates_collections=None, 87 | decay=0.9, epsilon=1e-5, # [TODO] Test these hyper-parameters 88 | is_training=is_training): 89 | with slim.arg_scope( 90 | [slim.conv2d], 91 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 92 | normalizer_fn=slim.batch_norm, 93 | activation_fn=lrelu): 94 | 95 | for i in range(n_layer): 96 | x = slim.conv2d( 97 | x, 98 | subnet['output'][i], 99 | subnet['kernel'][i], 100 | subnet['stride'][i]) 101 | tf.summary.image( 102 | 'down-sample{:d}'.format(i), 103 | tf.transpose(x[:, :, :, 0:3], [2, 1, 0, 3])) 104 | 105 | x = slim.flatten(x) 106 | 107 | with slim.arg_scope( 108 | [slim.fully_connected], 109 | num_outputs=self.arch['z_dim'], 110 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 111 | normalizer_fn=None, 112 | activation_fn=None): 113 | z_mu = slim.fully_connected(x) 114 | z_lv = slim.fully_connected(x) 115 | return z_mu, z_lv 116 | 117 | 118 | def _generator(self, z, y, is_training): 119 | ''' In this version, we only generate the target, so `y` is useless ''' 120 | subnet = self.arch['generator'] 121 | n_layer = len(subnet['output']) 122 | h, w, c = subnet['hwc'] 123 | 124 | y = tf.nn.embedding_lookup(self.y_emb, y) 125 | 126 | x = self._merge([z, y], subnet['merge_dim']) 127 | x = lrelu(x) 128 | with 
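`_unit_embedding` keeps every class vector on the unit sphere by l2-normalizing the embedding table before lookup, so label embeddings differ only in direction, not magnitude. The equivalent NumPy sketch:

```
import numpy as np

def unit_embedding(n_class, h_dim, rng=np.random.default_rng(0)):
    emb = rng.standard_normal((n_class, h_dim))
    return emb / np.linalg.norm(emb, axis=-1, keepdims=True)  # unit-norm rows

emb = unit_embedding(n_class=10, h_dim=64)
y_vec = emb[np.array([3, 7])]            # embedding lookup
print(np.linalg.norm(y_vec, axis=-1))    # [1. 1.]
```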
slim.arg_scope( 129 | [slim.batch_norm], 130 | scale=True, scope='BN', 131 | updates_collections=None, 132 | decay=0.9, epsilon=1e-5, 133 | is_training=is_training): 134 | 135 | x = slim.fully_connected( 136 | x, 137 | h * w * c, 138 | normalizer_fn=slim.batch_norm, 139 | activation_fn=lrelu) 140 | 141 | x = tf.reshape(x, [-1, h, w, c]) 142 | 143 | with slim.arg_scope( 144 | [slim.conv2d_transpose], 145 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 146 | normalizer_fn=slim.batch_norm, 147 | activation_fn=lrelu): 148 | 149 | for i in range(n_layer -1): 150 | x = slim.conv2d_transpose( 151 | x, 152 | subnet['output'][i], 153 | subnet['kernel'][i], 154 | subnet['stride'][i] 155 | # normalizer_fn=None 156 | ) 157 | 158 | # Don't apply BN for the last layer of G 159 | x = slim.conv2d_transpose( 160 | x, 161 | subnet['output'][-1], 162 | subnet['kernel'][-1], 163 | subnet['stride'][-1], 164 | normalizer_fn=None, 165 | activation_fn=None) 166 | 167 | # pdb.set_trace() 168 | logit = x 169 | x = tf.nn.tanh(logit) 170 | return x, logit 171 | 172 | 173 | def _discriminator(self, x, is_training): 174 | # def _discriminator(self, x, is_training): 175 | ''' Note: In this version, `y` is useless ''' 176 | subnet = self.arch['discriminator'] 177 | n_layer = len(subnet['output']) 178 | 179 | # y_dim = self.arch['y_dim'] 180 | # h, w, _ = self.arch['hwc'] 181 | # y_emb = self._l2_regularized_embedding(y_dim, h * w, 'y_embedding_disc_in') 182 | # y_vec = tf.nn.embedding_lookup(y_emb, y) 183 | # y_vec = tf.reshape(y_vec, [-1, h, w, 1]) 184 | 185 | intermediate = list() 186 | intermediate.append(x) 187 | 188 | # x = tf.concat(3, [x, y_vec]) # inject y into x 189 | with slim.arg_scope( 190 | [slim.batch_norm], 191 | scale=True, scope='BN', 192 | updates_collections=None, 193 | decay=0.9, epsilon=1e-5, 194 | is_training=is_training): 195 | with slim.arg_scope( 196 | [slim.conv2d], 197 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 198 | normalizer_fn=slim.batch_norm, 199 | activation_fn=lrelu): 200 | 201 | # Radford: [do] not applying batchnorm to the discriminator input layer 202 | x = slim.conv2d( 203 | x, 204 | subnet['output'][0], 205 | subnet['kernel'][0], 206 | subnet['stride'][0], 207 | normalizer_fn=None) 208 | intermediate.append(x) 209 | for i in range(1, n_layer): 210 | x = slim.conv2d( 211 | x, 212 | subnet['output'][i], 213 | subnet['kernel'][i], 214 | subnet['stride'][i]) 215 | intermediate.append(x) 216 | tf.summary.image( 217 | 'upsampling{:d}'.format(i), 218 | tf.transpose(x[:, :, :, 0:3], [2, 1, 0, 3])) 219 | 220 | # Don't apply BN for the last layer 221 | x = slim.flatten(x) 222 | h = slim.flatten(intermediate[subnet['feature_layer'] - 1]) 223 | 224 | # y_vec = tf.nn.embedding_lookup(self.y_emb, y) 225 | 226 | # x = self._merge([x], subnet['merge_dim']) 227 | # x = lrelu(x) 228 | 229 | # x = slim.fully_connected( 230 | # x, 231 | # subnet['merge_dim'], 232 | # weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 233 | # activation_fn=lrelu) 234 | 235 | x = slim.fully_connected( 236 | x, 237 | 1, 238 | weights_regularizer=slim.l2_regularizer(subnet['l2-reg']), 239 | activation_fn=None) 240 | 241 | return x, h # no explicit `sigmoid` 242 | 243 | 244 | def loss(self, x_s, y_s, x_t, y_t): 245 | ''' ''' 246 | # def thresholding_add(x, v, minval=-1., maxval=1.): 247 | # x_v = x + v 248 | # x_v = tf.maximum(x_v, minval) 249 | # x_v = tf.minimum(x_v, maxval) 250 | # return x_v 251 | 252 | def circuit_loop(x, y): 253 | # v = decay * tf.random_normal(shape=tf.shape(x)) 
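The discriminator above keeps every intermediate activation and returns the one selected by `subnet['feature_layer']` alongside the raw score, so the VAE reconstruction term can be measured in that learned feature space (the VAE/GAN trick). The selection logic reduced to plain Python:

```
def discriminate(x, layers, feature_layer):
    """Apply `layers` in turn, remember every activation, return (score, tapped feature)."""
    intermediate = [x]
    for layer in layers:
        x = layer(x)
        intermediate.append(x)
    return x, intermediate[feature_layer - 1]   # same indexing as the model code

score, feat = discriminate(1.0, [lambda v: v + 1, lambda v: v * 2], feature_layer=2)
print(score, feat)   # 4.0 2.0
```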
254 | 255 | z_mu, z_lv = self._encode(x, is_training=self.is_training) 256 | z = GaussianSampleLayer(z_mu, z_lv) 257 | 258 | x_logit, x_feature = self._discriminate( 259 | x, is_training=self.is_training) 260 | 261 | xh, xh_sig_logit = self._generate(z, y, is_training=self.is_training) 262 | 263 | zh_mu, zh_lv = self._encode(xh, is_training=self.is_training) 264 | 265 | xh_logit, xh_feature = self._discriminate( 266 | xh, is_training=self.is_training) 267 | 268 | return dict( 269 | z=z, 270 | z_mu=z_mu, 271 | z_lv=z_lv, 272 | # y_logit=y_logit, 273 | xh=xh, 274 | xh_sig_logit=xh_sig_logit, 275 | # xo=xo, 276 | x_logit=x_logit, 277 | x_feature=x_feature, 278 | zh_mu=zh_mu, 279 | zh_lv=zh_lv, 280 | # yh_logit=yh_logit, 281 | xh_logit=xh_logit, 282 | xh_feature=xh_feature, 283 | # xr=xr 284 | ) 285 | 286 | s = circuit_loop(x_s, y_s) 287 | t = circuit_loop(x_t, y_t) 288 | s2t = circuit_loop(x_s, y_t) 289 | 290 | with tf.name_scope('loss'): 291 | def mean_sigmoid_cross_entropy_with_logits(logit, truth): 292 | ''' 293 | truth: 0. or 1. 294 | ''' 295 | return tf.reduce_mean( 296 | tf.nn.sigmoid_cross_entropy_with_logits( 297 | logit, 298 | truth * tf.ones_like(logit))) 299 | 300 | loss = dict() 301 | 302 | # Parallel 303 | loss['reconst_t'] = \ 304 | tf.reduce_mean(t['x_logit']) \ 305 | - tf.reduce_mean(t['xh_logit']) 306 | 307 | # Parallel 308 | loss['reconst_s'] = \ 309 | tf.reduce_mean(s['x_logit']) \ 310 | - tf.reduce_mean(s['xh_logit']) 311 | 312 | # Non-parallel 313 | loss['conv_s2t'] = \ 314 | tf.reduce_mean(t['x_logit']) \ 315 | - tf.reduce_mean(s2t['xh_logit']) 316 | 317 | # Non-parallel: s v. t 318 | loss['real_s_t'] = \ 319 | tf.reduce_mean(t['x_logit']) \ 320 | - tf.reduce_mean(s['x_logit']) 321 | 322 | # That's why I only take the last term into consideration 323 | loss['WGAN'] = loss['conv_s2t'] 324 | 325 | # VAE's Kullback-Leibler Divergence 326 | loss['KL(z)'] = \ 327 | tf.reduce_mean( 328 | GaussianKLD( 329 | s['z_mu'], s['z_lv'], 330 | tf.zeros_like(s['z_mu']), tf.zeros_like(s['z_lv']))) +\ 331 | tf.reduce_mean( 332 | GaussianKLD( 333 | t['z_mu'], t['z_lv'], 334 | tf.zeros_like(t['z_mu']), tf.zeros_like(t['z_lv']))) 335 | loss['KL(z)'] /= 2.0 336 | 337 | # VAE's Reconstruction Neg. Log-Likelihood (on the 'feature' space of Dx) 338 | loss['Dis'] = \ 339 | tf.reduce_mean( 340 | GaussianLogDensity( 341 | slim.flatten(x_t), 342 | slim.flatten(t['xh']), 343 | tf.zeros_like(slim.flatten(x_t)))) +\ 344 | tf.reduce_mean( 345 | GaussianLogDensity( 346 | slim.flatten(x_s), 347 | slim.flatten(s['xh']), 348 | tf.zeros_like(slim.flatten(x_s)))) 349 | loss['Dis'] /= - 2.0 350 | 351 | # loss['Dis'] = tf.reduce_mean( 352 | # tf.reduce_sum( 353 | # tf.nn.sigmoid_cross_entropy_with_logits( 354 | # slim.flatten(t['xh_sig_logit']), 355 | # slim.flatten(x_t * .5 + .5)) +\ 356 | # tf.nn.sigmoid_cross_entropy_with_logits( 357 | # slim.flatten(s['xh_sig_logit']), 358 | # slim.flatten(x_s * .5 + .5)), 359 | # -1)) 360 | # # Note: Please remember to apply reduce_sum along the last dim 361 | # # (here, it's not 1 as is in D/G loss.) 362 | 363 | # loss['Dis'] = tf.reduce_mean( 364 | # (s2t['xh_logit'] + s['xh_logit'] + t['xh_logit']) / 3.) 365 | 366 | # # Use 'feature' item instead of raw input because the former is 'flattened' 367 | # loss['Dis'] = tf.reduce_mean( 368 | # tf.reduce_sum(tf.square(s['x_feature'] - s['xh_feature']), -1) +\ 369 | # tf.reduce_sum(tf.square(t['x_feature'] - t['xh_feature']), -1)) 370 | 371 | # [TODO] Maybe I should normalize (divide by 2 or 3) the above costs. 
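`loss['KL(z)']` above calls `GaussianKLD(mu, lv, 0, 0)`, the diagonal-Gaussian KL against a standard-normal prior. With zero prior mean and unit prior variance, the general formula collapses to the familiar VAE term, sketched here:

```
import numpy as np

def kld_to_standard_normal(mu, log_var):
    """KL( N(mu, exp(log_var)) || N(0, I) ) = 0.5 * sum(exp(lv) + mu^2 - 1 - lv)."""
    return 0.5 * np.sum(np.exp(log_var) + mu ** 2 - 1. - log_var, axis=-1)

print(kld_to_standard_normal(np.zeros(64), np.zeros(64)))   # 0.0 exactly at the prior
```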
372 | 373 | # For summaries 374 | with tf.name_scope('Summary'): 375 | # tf.summary.scalar('DKL_x', loss['KL(x)']) 376 | tf.summary.scalar('DKL_z', loss['KL(z)']) 377 | tf.summary.scalar('MMSE', loss['Dis']) 378 | 379 | # tf.summary.scalar('D_real', loss['D_real']) 380 | # tf.summary.scalar('G_fake', loss['G_fake']) 381 | 382 | tf.summary.scalar('WGAN', loss['WGAN']) 383 | tf.summary.scalar('WGAN-s', loss['reconst_s']) 384 | tf.summary.scalar('WGAN-t', loss['reconst_t']) 385 | tf.summary.scalar('WGAN-s2t', loss['conv_s2t']) 386 | tf.summary.scalar('WGAN-t-s', loss['real_s_t']) 387 | 388 | 389 | tf.summary.histogram('y', tf.concat(0, [y_t, y_s])) 390 | 391 | # tf.summary.histogram('yh_s', tf.argmax(s['yh_logit'], 1)) 392 | # tf.summary.histogram('yh_t', tf.argmax(t['yh_logit'], 1)) 393 | 394 | tf.summary.histogram('z', tf.concat(0, [s['z'], t['z']])) 395 | tf.summary.histogram('z_s', s['z']) 396 | tf.summary.histogram('z_t', t['z']) 397 | 398 | tf.summary.histogram('z_mu', tf.concat(0, [s['z_mu'], t['z_mu']])) 399 | tf.summary.histogram('z_mu_s', s['z_mu']) 400 | tf.summary.histogram('z_mu_t', t['z_mu']) 401 | 402 | tf.summary.histogram('z_lv', tf.concat(0, [s['z_lv'], t['z_lv']])) 403 | tf.summary.histogram('z_lv_s', s['z_lv']) 404 | tf.summary.histogram('z_lv_t', t['z_lv']) 405 | 406 | # tf.summary.histogram('D_t_t', tf.nn.sigmoid(x_t_from_t_logit)) 407 | # tf.summary.histogram('D_t_s', tf.nn.sigmoid(x_t_from_s_logit)) 408 | # tf.summary.histogram('D_t', tf.nn.sigmoid(x_t_logit)) 409 | # tf.summary.histogram('D_s', tf.nn.sigmoid(x_s_logit)) 410 | 411 | 412 | tf.summary.histogram('logit_t_from_t', t['xh_logit']) 413 | tf.summary.histogram('logit_t_from_s', s2t['xh_logit']) 414 | tf.summary.histogram('logit_t', t['x_logit']) 415 | # tf.summary.histogram('logit_s', s['x_logit']) 416 | 417 | # tf.summary.histogram( 418 | # 'logit_s_True_and_Fake)', 419 | # tf.concat(0, [x_s_from_s_logit, x_s_logit])) 420 | tf.summary.histogram( 421 | 'logit_t_True_FromT_FromS', 422 | tf.concat(0, [t['x_logit'], t['xh_logit'], s2t['xh_logit']])) 423 | tf.summary.histogram( 424 | 'logit_s_v_sh', 425 | tf.concat(0, [s['x_logit'], s['xh_logit']])) 426 | tf.summary.histogram( 427 | 'logit_t_v_th', 428 | tf.concat(0, [t['x_logit'], t['xh_logit']])) 429 | # # tf.image_summary("G", xh) 430 | 431 | # tf.summary.scalar('Ent_on_y', loss['y']) 432 | # tf.summary.scalar('Diff_zs_zt', loss['z']) 433 | return loss 434 | 435 | # def sample(self, z=128): 436 | # ''' Generate fake samples given `z` 437 | # if z is not given or is an `int`, 438 | # this fcn generates (z=128) samples 439 | # ''' 440 | # z = tf.random_uniform( 441 | # shape=[z, self.arch['z_dim']], 442 | # minval=-1.0, 443 | # maxval=1.0, 444 | # name='z_test') 445 | # return self._generate(z, is_training=False) 446 | 447 | def encode(self, x): 448 | z_mu, z_lv = self._encode(x, is_training=False) 449 | return dict(mu=z_mu, log_var=z_lv) 450 | 451 | def decode(self, z, y, tanh=False): 452 | # if tanh: 453 | # return self._generate(z, y, is_training=False) 454 | # else: 455 | # return self._generate(z, y, is_training=False) 456 | xh, _ = self._generate(z, y, is_training=False) 457 | # tf.summary.image('xh', tf.transpose(xh, [2, 1, 0, 3])) 458 | # return self._filter(xh, is_training=False) 459 | return xh 460 | 461 | def discriminate(self, x): 462 | ''' 463 | To estimate the EMD, we need D to assign a score per sample. 
464 | *The batches can be of different sizes 465 | ''' 466 | # s_true, _ = self._discriminate(x_true, is_training=False) 467 | # s_fake, _ = self._discriminate(x_fake, is_training=False) 468 | # return s_true, s_fake 469 | s, _ = self._discriminate(x, is_training=False) 470 | return s 471 | 472 | # def classify(self, x): 473 | # return self._classify(tf.nn.softmax(x), is_training=False) 474 | 475 | # def interpolate(self, x1, x2, n): 476 | # ''' Interpolation from the latent space ''' 477 | # x1 = tf.expand_dims(x1, 0) 478 | # x2 = tf.expand_dims(x2, 0) 479 | # z1, _ = self._encode(x1, is_training=False) 480 | # z2, _ = self._encode(x2, is_training=False) 481 | # a = tf.reshape(tf.linspace(0., 1., n), [n, 1]) 482 | 483 | # z1 = tf.matmul(1. - a, z1) 484 | # z2 = tf.matmul(a, z2) 485 | # z = tf.nn.tanh(tf.add(z1, z2)) # Gaussian-to-Uniform 486 | # xh = self._generate(z, is_training=False) 487 | # xh = tf.concat(0, [x1, xh, x2]) 488 | # return xh 489 | --------------------------------------------------------------------------------