├── README.md
├── VRAE.py
├── dataset.py
└── train_VRAE.py

/README.md:
--------------------------------------------------------------------------------
# chainer-Variational-Recurrent-Autoencoder

A Chainer implementation of the Variational Recurrent Autoencoder (VRAE): a recurrent encoder maps an input sequence to a latent code z, and a recurrent decoder reconstructs the sequence from z. `train_VRAE.py` contains training setups for a MIDI piano-roll dataset and for BVH motion-capture data.

## References

* Fabius & van Amersfoort, "Variational Recurrent Auto-Encoders": http://arxiv.org/pdf/1412.6581.pdf
* https://github.com/y0ast/Variational-Recurrent-Autoencoder

--------------------------------------------------------------------------------
/VRAE.py:
--------------------------------------------------------------------------------
import os
import time
import numpy as np


from chainer import cuda, Variable, function, FunctionSet, optimizers
from chainer import functions as F


class VRAE(FunctionSet):
    def __init__(self, **layers):
        super(VRAE, self).__init__(**layers)

    def softplus(self, x):
        return F.log(F.exp(x) + 1)

    def binary_cross_entropy(self, y, t):
        # Computed on raw arrays, so this value does not backpropagate;
        # it is only useful as a monitoring metric.
        ya = y.data
        ta = t.data
        d = -1 * np.sum(ta * np.log(ya) + (1 - ta) * np.log(1 - ya))
        return Variable(np.array(d).astype(np.float32))

    def forward_one_step(self, x_data, state, continuous=True, nonlinear_q='tanh',
                         nonlinear_p='tanh', output_f='sigmoid', gpu=-1):

        output = np.zeros(x_data.shape).astype(np.float32)

        nonlinear = {'sigmoid': F.sigmoid, 'tanh': F.tanh, 'softplus': self.softplus, 'relu': F.relu}
        nonlinear_f_q = nonlinear[nonlinear_q]
        nonlinear_f_p = nonlinear[nonlinear_p]

        output_a_f = nonlinear[output_f]

        # Recognition model: run the whole sequence through the encoder RNN to compute q(z|x).
        for i in range(x_data.shape[0]):
            x_in_t = Variable(x_data[i].reshape((1, x_data.shape[1])))
            hidden_q_t = nonlinear_f_q(self.recog_in_h(x_in_t) + self.recog_h_h(state['recog_h']))
            state['recog_h'] = hidden_q_t

        q_mean = self.recog_mean(state['recog_h'])
        q_log_sigma = 0.5 * self.recog_log_sigma(state['recog_h'])

        # Reparameterization trick: z = mu + sigma * eps, eps ~ N(0, I).
        eps = np.random.normal(0, 1, q_log_sigma.data.shape).astype(np.float32)
        if gpu >= 0:
            eps = cuda.to_gpu(eps)
        eps = Variable(eps)
        z = q_mean + F.exp(q_log_sigma) * eps

        # Generative model: decode p(x|z) one step at a time.
        h0 = nonlinear_f_p(self.z(z))
        out = self.output(h0)
        x_0 = output_a_f(out)
        state['gen_h'] = h0
        if gpu >= 0:
            output[0] = cuda.to_cpu(x_0.data)
        else:
            output[0] = x_0.data

        if continuous == True:
            rec_loss = F.mean_squared_error(x_0, Variable(x_data[0].reshape((1, x_data.shape[1]))))
        else:
            rec_loss = F.sigmoid_cross_entropy(out, Variable(x_data[0].reshape((1, x_data.shape[1])).astype(np.int32)))

        x_t = x_0

        for i in range(1, x_data.shape[0]):
            h_t_1 = nonlinear_f_p(self.gen_in_h(x_t) + self.gen_h_h(state['gen_h']))
            x_t_1 = self.output(h_t_1)
            state['gen_h'] = h_t_1

            if continuous == True:
                output_t = output_a_f(x_t_1)
                rec_loss += F.mean_squared_error(output_t, Variable(x_data[i].reshape((1, x_data.shape[1]))))
            else:
                out = x_t_1
                rec_loss += F.sigmoid_cross_entropy(out, Variable(x_data[i].reshape((1, x_data.shape[1])).astype(np.int32)))
                output_t = output_a_f(x_t_1)

            # Feed the activated output back in as the next decoder input.
            x_t = output_t

            if gpu >= 0:
                output[i] = cuda.to_cpu(output_t.data)
            else:
                output[i] = output_t.data

        # KL-style regularizer on q(z|x); note the small weight (-0.0005) in place of the usual -0.5.
        KLD = -0.0005 * F.sum(1 + q_log_sigma - q_mean**2 - F.exp(q_log_sigma))

        return output, rec_loss, KLD, state

    def generate_z_x(self, seq_length_per_z, sample_z, nonlinear_q='tanh', nonlinear_p='tanh',
                     output_f='sigmoid', gpu=-1):

        output = np.zeros((seq_length_per_z * sample_z.shape[0], self.recog_in_h.W.shape[1]))

        nonlinear = {'sigmoid': F.sigmoid, 'tanh': F.tanh, 'softplus': self.softplus, 'relu': F.relu}
        nonlinear_f_q = nonlinear[nonlinear_q]
        nonlinear_f_p = nonlinear[nonlinear_p]

        output_a_f = nonlinear[output_f]

        state = {}

        for epoch in xrange(sample_z.shape[0]):
            gen_out = np.zeros((seq_length_per_z, output.shape[1]))
            z = Variable(sample_z[epoch].reshape((1, sample_z.shape[1])))

            # Decode p(x|z) starting from the sampled code.
            h0 = nonlinear_f_p(self.z(z))
            x_gen_0 = output_a_f(self.output(h0))
            state['gen_h'] = h0
            if gpu >= 0:
                gen_out[0] = cuda.to_cpu(x_gen_0.data)
            else:
                gen_out[0] = x_gen_0.data

            x_t_1 = x_gen_0

            for i in range(1, seq_length_per_z):
                hidden_p_t = nonlinear_f_p(self.gen_in_h(x_t_1) + self.gen_h_h(state['gen_h']))
                output_t = output_a_f(self.output(hidden_p_t))
                if gpu >= 0:
                    gen_out[i] = cuda.to_cpu(output_t.data)
                else:
                    gen_out[i] = output_t.data
                state['gen_h'] = hidden_p_t
                x_t_1 = output_t

            output[epoch * seq_length_per_z:(epoch + 1) * seq_length_per_z, :] = gen_out

        return output


def make_initial_state(n_units, state_pattern, Train=True):
    return {name: Variable(np.zeros((1, n_units), dtype=np.float32), volatile=not Train)
            for name in state_pattern}
--------------------------------------------------------------------------------
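The core of `forward_one_step` above is the standard VAE reparameterization step plus a Gaussian KL penalty. Below is a minimal NumPy sketch of just that piece, with made-up values for a 2-dimensional latent space; note that the Chainer code halves the linear layer's output before using it as `q_log_sigma` and weights its KL term with -0.0005 rather than the textbook -0.5, so this is the generic form, not a line-for-line copy.

```python
import numpy as np

np.random.seed(0)

# Toy encoder outputs for a 2-dimensional latent space (n_z = 2 in the midi setup).
q_mean = np.array([[0.3, -1.2]], dtype=np.float32)
q_log_sigma = np.array([[-0.5, 0.1]], dtype=np.float32)   # log of the standard deviation

# Reparameterization trick: z = mu + exp(log_sigma) * eps, eps ~ N(0, I),
# mirroring `z = q_mean + F.exp(q_log_sigma) * eps` in forward_one_step.
eps = np.random.normal(0, 1, q_log_sigma.shape).astype(np.float32)
z = q_mean + np.exp(q_log_sigma) * eps

# Textbook Gaussian KL divergence KL(q(z|x) || N(0, I)):
#   -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
kld = -0.5 * np.sum(1 + 2 * q_log_sigma - q_mean ** 2 - np.exp(2 * q_log_sigma))

print z, kld
```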
/dataset.py:
--------------------------------------------------------------------------------
import gzip
import os

import numpy as np
import cPickle as pickle
import six
from six.moves.urllib import request
import scipy
from scipy import io
from sklearn import decomposition


'''

BVH

'''
def load_bvh_data(file_path):
    # Parse the MOTION section of a BVH file and return
    # (frames, frame_time, motion_data) with motion_data of shape (frames, dim).
    frames = 0
    frame_time = 0.0

    with open(file_path, "rb") as f:
        lines = f.readlines()

    # Skip the skeleton definition and find the MOTION section.
    n = 0
    while lines[n].find('MOTION') < 0:
        n += 1

    assert n < len(lines)

    # "Frames: <N>"
    n += 1
    frames = int(lines[n].split(" ")[-1].replace('\n', ''))

    # "Frame Time: <dt>"
    n += 1
    frame_time = float(lines[n].split(" ")[-1].replace('\n', ''))

    # Motion data: one line of space-separated channel values per frame.
    n += 1
    for i in range(frames):
        motion = lines[n + i].split(' ')

        if i == 0:
            dim = len(motion)
            motion_data = np.zeros(frames * dim, dtype=np.float32).reshape((frames, dim))

        for j in range(dim):
            motion_data[i, j] = float(motion[j].replace('\n', ''))

    return frames, frame_time, motion_data
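
# Usage sketch (illustrative): load a capture and min-max normalize each channel into
# [0, 1], as the bvh branch of train_VRAE.py does. 'dataset/bvh/sample.bvh' is an
# assumed example path, not a file shipped with this repository.
#
#   frames, frame_time, motion = load_bvh_data('dataset/bvh/sample.bvh')
#   lo, hi = motion.min(axis=0), motion.max(axis=0)
#   norm_motion = (motion - lo) / (hi - lo)   # a channel that never moves divides by zero here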


'''

MNIST

'''

def load_mnist(images, labels, num):
    dim = 784
    data = np.zeros(num * dim, dtype=np.uint8).reshape((num, dim))
    target = np.zeros(num, dtype=np.uint8).reshape((num, ))

    with gzip.open(images, 'rb') as f_images,\
            gzip.open(labels, 'rb') as f_labels:
        f_images.read(16)   # skip the IDX headers
        f_labels.read(8)
        for i in six.moves.range(num):
            target[i] = ord(f_labels.read(1))
            for j in six.moves.range(dim):
                data[i, j] = ord(f_images.read(1))

    return data, target


def download_mnist_data(data_dir):

    parent = 'http://yann.lecun.com/exdb/mnist'
    train_images = 'train-images-idx3-ubyte.gz'
    train_labels = 'train-labels-idx1-ubyte.gz'
    test_images = 't10k-images-idx3-ubyte.gz'
    test_labels = 't10k-labels-idx1-ubyte.gz'
    num_train = 60000
    num_test = 10000

    print('Downloading {:s}...'.format(train_images))
    request.urlretrieve('{:s}/{:s}'.format(parent, train_images), train_images)
    print('Done')
    print('Downloading {:s}...'.format(train_labels))
    request.urlretrieve('{:s}/{:s}'.format(parent, train_labels), train_labels)
    print('Done')
    print('Downloading {:s}...'.format(test_images))
    request.urlretrieve('{:s}/{:s}'.format(parent, test_images), test_images)
    print('Done')
    print('Downloading {:s}...'.format(test_labels))
    request.urlretrieve('{:s}/{:s}'.format(parent, test_labels), test_labels)
    print('Done')

    print('Converting training data...')
    data_train, target_train = load_mnist(train_images, train_labels, num_train)
    print('Done')
    print('Converting test data...')
    data_test, target_test = load_mnist(test_images, test_labels, num_test)
    mnist = {}
    mnist['data'] = np.append(data_train, data_test, axis=0)
    mnist['target'] = np.append(target_train, target_test, axis=0)

    print('Done')
    print('Save output...')
    if not os.path.exists('%s/mnist' % data_dir):
        os.makedirs('%s/mnist' % data_dir)
    with open('%s/mnist/mnist.pkl' % data_dir, 'wb') as output:
        six.moves.cPickle.dump(mnist, output, -1)
    print('Done')
    print('Convert completed')


def load_mnist_data(data_dir):
    if not os.path.exists('%s/mnist/mnist.pkl' % data_dir):
        download_mnist_data(data_dir)
    with open('%s/mnist/mnist.pkl' % data_dir, 'rb') as mnist_pickle:
        mnist = six.moves.cPickle.load(mnist_pickle)
    return mnist

'''

SVHN

'''

def download_svhn_data(data_dir):

    parent = 'http://ufldl.stanford.edu/housenumbers'
    train_images = 'train_32x32.mat'
    test_images = 'test_32x32.mat'

    data_path = data_dir + "/SVHN/"
    if not os.path.exists(data_path):
        os.mkdir(data_path)

    print('Downloading {:s}...'.format(train_images))
    request.urlretrieve('{:s}/{:s}'.format(parent, train_images), data_path + train_images)
    print('Done')
    print('Downloading {:s}...'.format(test_images))
    request.urlretrieve('{:s}/{:s}'.format(parent, test_images), data_path + test_images)
    print('Done')


def svhn_pickle_checker(data_dir):
    if os.path.exists(data_dir + '/SVHN/train_x.pkl') and os.path.exists(data_dir + '/SVHN/train_y.pkl') \
            and os.path.exists(data_dir + '/SVHN/test_x.pkl') and os.path.exists(data_dir + '/SVHN/test_y.pkl'):
        return 1
    else:
        return 0


def load_svhn(data_dir, toFloat=True, binarize_y=True, dtype=np.float32, pca=False, n_components=1000):

    # if svhn_pickle_checker(data_dir) == 1:
    #     print "load from pickle file."
    #     train_x = pickle.load(open(data_dir + '/SVHN/train_x.pkl'))
    #     train_y = pickle.load(open(data_dir + '/SVHN/train_y.pkl'))
    #     test_x = pickle.load(open(data_dir + '/SVHN/test_x.pkl'))
    #     test_y = pickle.load(open(data_dir + '/SVHN/test_y.pkl'))
    #
    #     return train_x, train_y, test_x, test_y

    if not os.path.exists(data_dir + '/SVHN/train_32x32.mat') or not os.path.exists(data_dir + '/SVHN/test_32x32.mat'):
        download_svhn_data(data_dir)

    train = scipy.io.loadmat(data_dir + '/SVHN/train_32x32.mat')
    train_x = train['X'].swapaxes(0, 1).T.reshape((train['X'].shape[3], -1))
    train_y = train['y'].reshape((-1)) - 1
    test = scipy.io.loadmat(data_dir + '/SVHN/test_32x32.mat')
    test_x = test['X'].swapaxes(0, 1).T.reshape((test['X'].shape[3], -1))
    test_y = test['y'].reshape((-1)) - 1
    if toFloat:
        train_x = train_x.astype(dtype) / 256.
        test_x = test_x.astype(dtype) / 256.
    if binarize_y:
        train_y = binarize_labels(train_y)
        test_y = binarize_labels(test_y)

    # if pca:
    #     x_stack = np.vstack([train_x, test_x])
    #     pca = decomposition.PCA(n_components=n_components)
    #     pca.whiten = True
    #     # pca.fit(x_stack)
    #     # x_pca = pca.transform(x_stack)
    #     x_pca = pca.fit_transform(x_stack)
    #     train_x = x_pca[:train_x.shape[0], :]
    #     test_x = x_pca[train_x.shape[0]:, :]
    #
    #     with open('%s/SVHN/pca.pkl' % data_dir, "wb") as f:
    #         pickle.dump(pca, f)
    #     with open('%s/SVHN/train_x.pkl' % data_dir, "wb") as f:
    #         pickle.dump(train_x, f)
    #     with open('%s/SVHN/train_y.pkl' % data_dir, "wb") as f:
    #         pickle.dump(train_y, f)
    #     with open('%s/SVHN/test_x.pkl' % data_dir, "wb") as f:
    #         pickle.dump(test_x, f)
    #     with open('%s/SVHN/test_y.pkl' % data_dir, "wb") as f:
    #         pickle.dump(test_y, f)

    return train_x, train_y, test_x, test_y


def binarize_labels(y, n_classes=10):
    new_y = np.zeros((y.shape[0], n_classes))
    for i in range(y.shape[0]):
        new_y[i, y[i]] = 1
    return new_y.astype(np.float32)


'''

Shakespeare

'''
def load_shakespeare(data_dir):
    vocab = {}
    words = open('%s/tinyshakespeare/input.txt' % data_dir, 'rb').read()
    words = list(words)
    dataset = np.ndarray((len(words), ), dtype=np.int32)
    for i, word in enumerate(words):
        if word not in vocab:
            vocab[word] = len(vocab)
        dataset[i] = vocab[word]

    return dataset, words, vocab


'''

music

'''

def load_midi_data(data_dir):
    import midi.utils as utils

    midi_data = utils.midiread(data_dir, dt=0.5)

    return midi_data.piano_roll
--------------------------------------------------------------------------------
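`train_VRAE.py` (next) slices the piano roll returned by `load_midi_data` into fixed-length subsequences with `np.vsplit`. The following self-contained sketch reproduces that batching with random stand-in data, so it runs without the `midi` package installed; the 88-column width is only illustrative.

```python
import numpy as np

# Stand-in for dataset.load_midi_data(...): 120 frames of an (illustrative) 88-key piano roll.
piano_roll = (np.random.rand(120, 88) > 0.9).astype(np.float32)

train_x = piano_roll[:120]
n_batch = 6
seq_length = train_x.shape[0] / n_batch   # 20 frames per subsequence (Python 2 integer division)

split_x = np.vsplit(train_x, n_batch)     # six arrays, each of shape (20, 88)
print len(split_x), split_x[0].shape
```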
/train_VRAE.py:
--------------------------------------------------------------------------------
#%%
import time
import math
import sys
import argparse
import cPickle as pickle
import copy
import os
import six

import numpy as np
from chainer import cuda, Variable, FunctionSet, optimizers
import chainer.functions as F
from VRAE import VRAE, make_initial_state

import dataset

parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, default="dataset")
parser.add_argument('--output_dir', type=str, default="model")
parser.add_argument('--dataset', type=str, default="midi")
parser.add_argument('--init_from', type=str, default="")
parser.add_argument('--clip_grads', type=int, default=5)
parser.add_argument('--gpu', type=int, default=-1)

args = parser.parse_args()

if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)


if args.dataset == 'midi':
    midi = dataset.load_midi_data('%s/midi/sample.mid' % args.data_path)
    train_x = midi[:120].astype(np.float32)

    n_x = train_x.shape[1]
    n_hidden = [500]
    n_z = 2
    n_y = n_x

    frames = train_x.shape[0]
    n_batch = 6
    seq_length = frames / n_batch

    split_x = np.vsplit(train_x, n_batch)

    n_epochs = 500
    continuous = False


if args.dataset == 'bvh':
    frames, frame_time, motion_data = dataset.load_bvh_data("%s/bvh/sample.bvh" % args.data_path)
    max_motion = np.max(motion_data, axis=0)
    min_motion = np.min(motion_data, axis=0)

    norm_motion_data = (motion_data - min_motion) / (max_motion - min_motion)
    train_x = norm_motion_data
    train_y = norm_motion_data

    n_x = train_x.shape[1]
    n_hidden = [250]
    n_z = 10
    n_y = n_x

    n_online = 10
    n_batch = train_x.shape[0] / n_online

    # Drop trailing frames so the data splits evenly into n_batch subsequences.
    if train_x.shape[0] % n_online != 0:
        reduced_sample = train_x.shape[0] % n_online
        train_x = train_x[:train_x.shape[0] - reduced_sample]

    seq_length = n_online
    split_x = np.vsplit(train_x, n_batch)

    n_epochs = 500
    continuous = True


n_hidden_recog = n_hidden
n_hidden_gen = n_hidden
n_layers_recog = len(n_hidden_recog)
n_layers_gen = len(n_hidden_gen)

layers = {}

# Recognition model.
rec_layer_sizes = [(train_x.shape[1], n_hidden_recog[0])]
rec_layer_sizes += zip(n_hidden_recog[:-1], n_hidden_recog[1:])
rec_layer_sizes += [(n_hidden_recog[-1], n_z)]

layers['recog_in_h'] = F.Linear(train_x.shape[1], n_hidden_recog[0], nobias=True)
layers['recog_h_h'] = F.Linear(n_hidden_recog[0], n_hidden_recog[0])

layers['recog_mean'] = F.Linear(n_hidden_recog[-1], n_z)
layers['recog_log_sigma'] = F.Linear(n_hidden_recog[-1], n_z)

# Generating model.
gen_layer_sizes = [(n_z, n_hidden_gen[0])]
gen_layer_sizes += zip(n_hidden_gen[:-1], n_hidden_gen[1:])
gen_layer_sizes += [(n_hidden_gen[-1], train_x.shape[1])]

layers['z'] = F.Linear(n_z, n_hidden_gen[0])
layers['gen_in_h'] = F.Linear(train_x.shape[1], n_hidden_gen[0], nobias=True)
layers['gen_h_h'] = F.Linear(n_hidden_gen[0], n_hidden_gen[0])

layers['output'] = F.Linear(n_hidden_gen[-1], train_x.shape[1])
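
# How these layers are wired inside VRAE.forward_one_step (with the tanh/sigmoid
# choices passed in below):
#   encoder step:  h_t     = tanh(recog_in_h(x_t) + recog_h_h(h_{t-1}))
#   latent code:   mu      = recog_mean(h_T)
#                  log_sig = 0.5 * recog_log_sigma(h_T)
#                  z       = mu + exp(log_sig) * eps,  eps ~ N(0, I)
#   decoder init:  h'_0    = tanh(layers['z'](z))
#   decoder step:  h'_t    = tanh(gen_in_h(x'_{t-1}) + gen_h_h(h'_{t-1}))
#   emission:      x'_t    = sigmoid(output(h'_t))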

if args.init_from == "":
    model = VRAE(**layers)
else:
    model = pickle.load(open(args.init_from))

# state pattern
state_pattern = ['recog_h', 'gen_h']

if args.gpu >= 0:
    cuda.init(args.gpu)
    model.to_gpu()


# use Adam
optimizer = optimizers.Adam()
optimizer.setup(model.collect_parameters())

total_losses = np.zeros(n_epochs, dtype=np.float32)

for epoch in xrange(1, n_epochs + 1):
    print 'epoch', epoch

    t1 = time.time()
    total_rec_loss = 0.0
    total_kl_loss = 0.0
    total_loss = 0.0
    outputs = np.zeros(train_x.shape, dtype=np.float32)
    # state = make_initial_state(n_hidden_recog[0], state_pattern)
    for i in xrange(n_batch):
        # Reset the recurrent states at the start of every subsequence.
        state = make_initial_state(n_hidden_recog[0], state_pattern)
        x_batch = split_x[i]

        if args.gpu >= 0:
            x_batch = cuda.to_gpu(x_batch)

        output, rec_loss, kl_loss, state = model.forward_one_step(
            x_batch, state, continuous,
            nonlinear_q='tanh', nonlinear_p='tanh', output_f='sigmoid', gpu=args.gpu)

        outputs[i * seq_length:(i + 1) * seq_length, :] = output

        loss = rec_loss + kl_loss
        total_loss += loss
        total_rec_loss += rec_loss
        total_kl_loss += kl_loss
        total_losses[epoch - 1] = total_loss.data

        optimizer.zero_grads()
        loss.backward()
        loss.unchain_backward()
        optimizer.clip_grads(args.clip_grads)
        optimizer.update()

    saved_output = outputs

    print "{}/{}, train_loss = {}, total_rec_loss = {}, time = {}".format(
        epoch, n_epochs, total_loss.data, total_rec_loss.data, time.time() - t1)

    if epoch % 100 == 0:
        model_path = "%s/VRAE_%s_%d.pkl" % (args.output_dir, args.dataset, epoch)
        with open(model_path, "wb") as f:
            pickle.dump(copy.deepcopy(model).to_cpu(), f)
--------------------------------------------------------------------------------
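`generate_z_x` is defined in `VRAE.py` but never called by the training script. Below is a hedged sketch of how a saved checkpoint might be used to decode sequences from random latent codes; the checkpoint name `model/VRAE_midi_500.pkl` and the 20-step length are illustrative values, and the snippet assumes the same Python 2 / Chainer 1.x environment as the rest of the repository.

```python
import numpy as np
import cPickle as pickle

from VRAE import VRAE  # the class must be importable for unpickling

n_z = 2                                                # must match the training configuration
model = pickle.load(open('model/VRAE_midi_500.pkl'))   # illustrative checkpoint name

# Decode four random latent codes into 20-step sequences each.
sample_z = np.random.normal(0, 1, (4, n_z)).astype(np.float32)
generated = model.generate_z_x(20, sample_z)
print generated.shape                                  # (80, n_x)
```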