├── README.md ├── dataset.py ├── dataset.pyc ├── freqdomain.py ├── lib ├── __init__.py ├── __init__.pyc ├── ops.py ├── ops.pyc ├── optimizers.py └── optimizers.pyc ├── my_three_tier.py ├── temp.py ├── train.py ├── vctk_dataset.py ├── vctk_dataset.pyc ├── wavenet.py ├── wavenet_controller.py └── wavenet_worker.py /README.md: -------------------------------------------------------------------------------- 1 | # WaveNet implementation in Theano 2 | Based on https://deepmind.com/blog/wavenet-generative-model-raw-audio/ and https://arxiv.org/pdf/1609.03499.pdf. 3 | 4 | Disclaimer: this is a re-implementation of the model described in the WaveNet paper by Google Deepmind. This repository is not associated with Google Deepmind. 5 | 6 | [Listen to a sample 🎶!](https://soundcloud.com/rithesh-kumar-772989650/sets/wavenet-samples) 7 | 8 | - wavenet.py -> Train the network 9 | - lib/ops.py -> Mini theano library 10 | - generate() function in wavenet.py ==> generates samples 11 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import scipy.io.wavfile 3 | import scikits.audiolab 4 | 5 | import random 6 | import time 7 | import numpy as np 8 | 9 | 10 | random_seed = 123 11 | 12 | def feed_epoch(data_path, n_files, BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO,RF=1024): 13 | global random_seed 14 | """ 15 | Generator that yields training inputs (subbatch, reset). `subbatch` contains 16 | quantized audio data; `reset` is a boolean indicating the start of a new 17 | sequence (i.e. you should reset h0 whenever `reset` is True). 18 | Feeds subsequences which overlap by a specified amount, so that the model 19 | can always have target for every input in a given subsequence. 20 | Loads sequentially-named FLAC files in a directory 21 | (p0.flac, p1.flac, p2.flac, ..., p[n_files-1].flac) 22 | Assumes all flac files have the same length. 23 | data_path: directory containing the flac files 24 | n_files: how many FLAC files are in the directory 25 | (see two_tier.py for a description of the constants) 26 | returns: (subbatch, reset) 27 | subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP) 28 | reset: True or False 29 | """ 30 | 31 | def round_to(x, y): 32 | """round x up to the nearest y""" 33 | return int(numpy.ceil(x / float(y))) * y 34 | 35 | def mewlaw_quantize(data): 36 | final_data = [] 37 | for i in xrange(data.shape[0]): 38 | final_data.append(float_to_uint8(ulaw(wav_to_float(data[i])))) 39 | return np.asarray(final_data,dtype=np.uint8) 40 | 41 | def ulaw(x, u=255): 42 | x = np.sign(x) * (np.log(1 + u * np.abs(x)) / np.log(1 + u)) 43 | return x 44 | 45 | def invulaw(y,u=255): 46 | y = np.sign(y)*(1./u)*(np.power(1+u,np.abs(y))-1) 47 | return y 48 | 49 | def float_to_uint8(x): 50 | x += 1. 51 | x /= 2. 52 | uint8_max_value = np.iinfo('uint8').max 53 | x *= uint8_max_value 54 | x = x.astype('uint8') 55 | return x 56 | 57 | 58 | def wav_to_float(x): 59 | try: 60 | max_value = np.iinfo(x.dtype).max 61 | min_value = np.iinfo(x.dtype).min 62 | except: 63 | max_value = np.finfo(x.dtype).max 64 | min_value = np.finfo(x.dtype).min 65 | x = x.astype('float64', casting='safe') 66 | x -= min_value 67 | x /= ((max_value - min_value) / 2.) 68 | x -= 1. 69 | return x 70 | 71 | def batch_quantize(data): 72 | """ 73 | floats in (-1, 1) to ints in [0, Q_LEVELS-1] 74 | scales normalized across axis 1 75 | """ 76 | eps = numpy.float64(1e-5) 77 | companded = np.sign(data)*(np.log(1+255*np.abs(data))/np.log(256)) 78 | data = companded 79 | 80 | data -= data.min(axis=1)[:, None] 81 | 82 | data *= ((Q_LEVELS - eps) / data.max(axis=1)[:, None]) 83 | data += eps/2 84 | # print "WARNING using zero-dc-offset normalization" 85 | # data -= data.mean(axis=1)[:, None] 86 | # data *= (((Q_LEVELS/2.) - eps) / numpy.abs(data).max(axis=1)[:, None]) 87 | # data += Q_LEVELS/2 88 | 89 | data = data.astype('uint8') 90 | 91 | return data 92 | 93 | start=100 94 | paths = [data_path+'/p{}.flac'.format(start+i) for i in xrange(n_files)] 95 | #rand_idx = np.random.randint(0,141867,n_files) 96 | #paths = [data_path+'/p{}.flac'.format(i) for i in rand_idx] 97 | 98 | random.seed(random_seed) 99 | random.shuffle(paths) 100 | random_seed += 1 101 | 102 | batches = [] 103 | for i in xrange(len(paths) / BATCH_SIZE): 104 | batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE]) 105 | 106 | random.shuffle(batches) 107 | 108 | for batch_paths in batches: 109 | # batch_seq_len = length of longest sequence in the batch, rounded up to 110 | # the nearest SEQ_LEN. 111 | batch_seq_len = len(scikits.audiolab.flacread(batch_paths[0])[0]) 112 | batch_seq_len = round_to(batch_seq_len, SEQ_LEN) 113 | 114 | batch = numpy.zeros( 115 | (BATCH_SIZE, batch_seq_len), 116 | dtype='float64' 117 | ) 118 | 119 | for i, path in enumerate(batch_paths): 120 | data, fs, enc = scikits.audiolab.flacread(path) 121 | batch[i, :len(data)] = data 122 | 123 | if Q_LEVELS != None: 124 | batch = batch_quantize(batch) 125 | 126 | batch = numpy.concatenate([ 127 | numpy.full((BATCH_SIZE, OVERLAP), Q_ZERO, dtype=np.uint8), 128 | batch 129 | ], axis=1) 130 | else: 131 | batch = numpy.concatenate([ 132 | numpy.full((BATCH_SIZE, OVERLAP), 0, dtype='float32'), 133 | batch 134 | ], axis=1) 135 | batch = batch.astype('float32') 136 | 137 | batch -= batch.mean() 138 | batch /= batch.std() 139 | 140 | for i in xrange(0,batch.shape[1]-RF-OVERLAP,OVERLAP): 141 | reset = numpy.int32(i==0) 142 | start = i 143 | end = i+RF+OVERLAP 144 | subbatch = batch[:, start : end] 145 | yield (subbatch, reset) 146 | 147 | def blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, STRIDE, RF=1025, N_FILES=None, DISTRIBUTED=False,WORKER_ID=None): 148 | global random_seed 149 | def process_wav(desired_sample_rate, filename, use_ulaw): 150 | channels = scipy.io.wavfile.read(filename) 151 | file_sample_rate, audio = channels 152 | audio = ensure_mono(audio) 153 | audio = wav_to_float(audio) 154 | if use_ulaw: 155 | audio = ulaw(audio) 156 | audio = ensure_sample_rate(desired_sample_rate, file_sample_rate, audio) 157 | audio = float_to_uint8(audio) 158 | return audio 159 | 160 | def process_flac(desired_sample_rate, filename, use_ulaw): 161 | channels = scikits.audiolab.flacread(filename) 162 | file_sample_rate = channels[1] 163 | audio = channels[0] 164 | audio = ensure_mono(audio) 165 | #audio = wav_to_float(audio) 166 | if use_ulaw: 167 | audio = ulaw(audio) 168 | audio = ensure_sample_rate(desired_sample_rate, file_sample_rate, audio) 169 | audio = float_to_uint8(audio) 170 | return audio 171 | 172 | def ulaw(x, u=255): 173 | x = np.sign(x) * (np.log(1 + u * np.abs(x)) / np.log(1 + u)) 174 | return x 175 | 176 | 177 | def float_to_uint8(x): 178 | x += 1. 179 | x /= 2. 180 | uint8_max_value = np.iinfo('uint8').max 181 | x *= uint8_max_value 182 | x = x.astype('uint8') 183 | return x 184 | 185 | 186 | def wav_to_float(x): 187 | try: 188 | max_value = np.iinfo(x.dtype).max 189 | min_value = np.iinfo(x.dtype).min 190 | except: 191 | max_value = np.finfo(x.dtype).max 192 | min_value = np.finfo(x.dtype).min 193 | x = x.astype('float64', casting='safe') 194 | x -= min_value 195 | x /= ((max_value - min_value) / 2.) 196 | x -= 1. 197 | return x 198 | 199 | 200 | def ulaw2lin(x, u=255.): 201 | max_value = np.iinfo('uint8').max 202 | min_value = np.iinfo('uint8').min 203 | x = x.astype('float64', casting='safe') 204 | x -= min_value 205 | x /= ((max_value - min_value) / 2.) 206 | x -= 1. 207 | x = np.sign(x) * (1 / u) * (((1 + u) ** np.abs(x)) - 1) 208 | x = float_to_uint8(x) 209 | return x 210 | 211 | def ensure_sample_rate(desired_sample_rate, file_sample_rate, mono_audio): 212 | if file_sample_rate != desired_sample_rate: 213 | mono_audio = scipy.signal.resample_poly(mono_audio, desired_sample_rate, file_sample_rate) 214 | return mono_audio 215 | 216 | 217 | def ensure_mono(raw_audio): 218 | """ 219 | Just use first channel. 220 | """ 221 | if raw_audio.ndim == 2: 222 | raw_audio = raw_audio[:, 0] 223 | return raw_audio 224 | 225 | start=100 226 | DATA_PATH = "/data/lisatmp3/kumarrit/blizzard/" 227 | if DISTRIBUTED: 228 | random.seed(WORKER_ID) 229 | start = random.choice(xrange(120000)) 230 | paths = ['p%d.flac'%(start+i) for i in xrange(N_FILES)] 231 | random_seed += 1 232 | batches = [] 233 | 234 | for i in xrange(len(paths) / BATCH_SIZE): 235 | batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE]) 236 | 237 | random.seed(random_seed) 238 | random.shuffle(batches) 239 | for batch_paths in batches: 240 | data = [] 241 | for fname in batch_paths: 242 | data.append(process_flac(16000,DATA_PATH+fname,True)) 243 | max_len = max([len(vec) for vec in data]) 244 | for i in xrange(len(data)): 245 | data[i] = np.hstack((data[i],np.full(max_len-len(data[i]),128,dtype=np.uint8))) 246 | data = np.asarray(data).astype(np.uint8) 247 | for i in xrange(0,data.shape[1]-RF-STRIDE,STRIDE): 248 | start = i 249 | end = i+RF+STRIDE 250 | subbatch = data[:, start : end] 251 | yield (subbatch,start) 252 | -------------------------------------------------------------------------------- /dataset.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/dataset.pyc -------------------------------------------------------------------------------- /freqdomain.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.setrecursionlimit(10000) 3 | import numpy as np 4 | import numpy 5 | numpy.random.seed(123) 6 | import random 7 | random.seed(123) 8 | import dataset 9 | import theano 10 | import theano.tensor as T 11 | theano.config.floatX='float32' 12 | import lib.ops 13 | import scipy.io.wavfile 14 | import time 15 | import lasagne 16 | import theano.tensor.fft 17 | 18 | # Hyperparams 19 | NB_EPOCH=100 20 | BATCH_SIZE = 8 21 | FRAME_SIZE = 0 # How many samples per frame 22 | Q_LEVELS = None # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 23 | #DATA_PATH = '/data/lisatmp3/kumarrit/blizzard' 24 | DATA_PATH='/home/rithesh/DeepLearning/Vocal Synthesis/data' 25 | N_FILES = 8 26 | BITRATE = 16000 27 | 28 | Q_ZERO = None # Discrete value correponding to zero amplitude 29 | N_BLOCKS=1 30 | RF=N_BLOCKS*32-N_BLOCKS+2 31 | SEQ_LEN=2*RF 32 | n_filters=256 33 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)) 34 | 35 | def network(input_sequences): 36 | batch_size = input_sequences.shape[0] 37 | length = input_sequences.shape[1] 38 | inp = input_sequences[:,None,None,:] 39 | dilations = np.asarray([[1,2,4,8,16]*N_BLOCKS]).tolist()[0] 40 | conv1 = lib.ops.conv1d("causal-conv",inp,2,1,n_filters,1,bias=False,batchnorm=False,pad=(0,1))[:,:,:,:length] 41 | prev_conv = conv1 42 | #prev_skip = [] 43 | prev_skip = T.zeros_like(conv1) 44 | i=0 45 | for value in dilations: 46 | i+=1 47 | x,y = lib.ops.WaveNetConv1d("Block-%d"%i,prev_conv,2,n_filters,n_filters,bias=False,batchnorm=False,dilation=value) 48 | prev_conv = x 49 | prev_skip += y 50 | out = T.nnet.relu(prev_skip) 51 | out2 = T.nnet.relu(lib.ops.conv1d("Output.1",out,1,1,n_filters,n_filters,bias=False,batchnorm=False)) 52 | output = lib.ops.conv1d("Output.2",out2,1,1,34,n_filters,bias=False,batchnorm=False) 53 | 54 | result = output[:,:,0,-1] 55 | result2 = T.nnet.relu(lib.ops.Dense('Op.1',34,512,result,weightnorm=False)) 56 | result3 = lib.ops.Dense('Op.2',512,34,result2,weightnorm=False) 57 | return output[:,:,0,-1].reshape((batch_size,17,2)) 58 | 59 | print "Model settings:" 60 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 61 | all_vars = sorted(all_vars, key=lambda x: x[0]) 62 | for var_name, var_value in all_vars: 63 | print "\t{}: {}".format(var_name, var_value) 64 | 65 | sequences = T.fmatrix('sequences') 66 | input_sequences = sequences[:,:RF] 67 | target_sequences = sequences[:,RF:] 68 | 69 | pred_freq = network(input_sequences) 70 | target_freq = theano.tensor.fft.rfft(target_sequences) 71 | cost = T.sqr(pred_freq-target_freq).mean() 72 | #lib.load_params('iter_latest_wavenet.p') 73 | # cost = T.nnet.categorical_crossentropy( 74 | # predicted_sequences, 75 | # target_sequences.flatten() 76 | # ).mean() 77 | 78 | # By default we report cross-entropy cost in bits. 79 | # Switch to nats by commenting out this line: 80 | #cost = cost * lib.floatX(1.44269504089) 81 | 82 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 83 | lib.print_params_info(cost, params) 84 | #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP) 85 | grads = T.grad(cost, wrt=params) 86 | lr = T.fscalar() 87 | updates = lasagne.updates.adam(grads, params, learning_rate=lr) 88 | 89 | print "Gradients Computed" 90 | 91 | train_fn = theano.function( 92 | [sequences,lr], 93 | [cost,pred_freq], 94 | updates=updates, 95 | on_unused_input='warn' 96 | ) 97 | 98 | 99 | print "Training!" 100 | DATA_PATH="/data/lisatmp3/kumarrit/blizzard" 101 | for epoch in xrange(NB_EPOCH): 102 | costs = [] 103 | times = [] 104 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO,RF)) 105 | data_feeder = list(dataset.preprocess(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN,RF)) 106 | results = [] 107 | print "Epoch : ",epoch 108 | total_iters = 0 109 | i=0 110 | for seqs, t, m, s in data_feeder: 111 | start_time = time.time() 112 | cost,pred = train_fn(seqs,0.001) 113 | results.append(pred) 114 | i += 1 115 | total_time = time.time() - start_time 116 | times.append(total_time) 117 | total_iters += 1 118 | print "Batch ",total_iters," (Epoch %d)"%(epoch) 119 | costs.append(cost) 120 | print "\tCost: ",np.mean(costs) 121 | print "\tTime: ",np.mean(times) 122 | del results 123 | 124 | 125 | def plot(i): 126 | import matplotlib.pyplot as plt 127 | f,axarr = plt.subplots(8) 128 | for j in xrange(8): 129 | axarr[j].plot(data[i][0][j][1025:]) 130 | axarr[j].plot(results[i][j],color='green') 131 | plt.show() 132 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | import ops 2 | import numpy 3 | import theano 4 | import theano.tensor as T 5 | import cPickle as pickle 6 | from collections import OrderedDict 7 | import os 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | 12 | _params = OrderedDict() 13 | 14 | import locale 15 | 16 | locale.setlocale(locale.LC_ALL, '') 17 | 18 | def print_params_info(cost, params): 19 | """Print information about the parameters in the given param set.""" 20 | 21 | params = sorted(params, key=lambda p: p.name) 22 | values = [p.get_value(borrow=True) for p in params] 23 | shapes = [p.shape for p in values] 24 | print "Params for cost:" 25 | for param, value, shape in zip(params, values, shapes): 26 | print "\t{0} ({1})".format( 27 | param.name, 28 | ",".join([str(x) for x in shape]) 29 | ) 30 | 31 | total_param_count = 0 32 | for shape in shapes: 33 | param_count = 1 34 | for dim in shape: 35 | param_count *= dim 36 | total_param_count += param_count 37 | print "Total parameter count: {0}".format( 38 | locale.format("%d", total_param_count, grouping=True) 39 | ) 40 | 41 | def param(name, *args, **kwargs): 42 | """ 43 | A wrapper for `theano.shared` which enables parameter sharing in models. 44 | 45 | Creates and returns theano shared variables similarly to `theano.shared`, 46 | except if you try to create a param with the same name as a 47 | previously-created one, `param(...)` will just return the old one instead of 48 | making a new one. 49 | 50 | This constructor also adds a `param` attribute to the shared variables it 51 | creates, so that you can easily search a graph for all params. 52 | """ 53 | if name not in _params: 54 | kwargs['name'] = name 55 | train = not 'train' in kwargs 56 | if not train: 57 | del kwargs['train'] 58 | param = theano.shared(*args, **kwargs) 59 | if train: 60 | param.param = train 61 | _params[name] = param 62 | return _params[name] 63 | 64 | def delete_params(name): 65 | to_delete = [p_name for p_name in _params if name in p_name] 66 | for p_name in to_delete: 67 | del _params[p_name] 68 | 69 | def search(node, critereon): 70 | """ 71 | Traverse the Theano graph starting at `node` and return a list of all nodes 72 | which match the `critereon` function. When optimizing a cost function, you 73 | can use this to get a list of all of the trainable params in the graph, like 74 | so: 75 | 76 | `lib.search(cost, lambda x: hasattr(x, "param"))` 77 | """ 78 | 79 | def _search(node, critereon, visited): 80 | if node in visited: 81 | return [] 82 | visited.add(node) 83 | 84 | results = [] 85 | if isinstance(node, T.Apply): 86 | for inp in node.inputs: 87 | results += _search(inp, critereon, visited) 88 | else: # Variable node 89 | if critereon(node): 90 | results.append(node) 91 | if node.owner is not None: 92 | results += _search(node.owner, critereon, visited) 93 | return results 94 | 95 | return _search(node, critereon, set()) 96 | 97 | def floatX(x): 98 | """ 99 | Convert `x` to the numpy type specified in `theano.config.floatX`. 100 | """ 101 | return numpy.float32(x) 102 | 103 | def save_params(path): 104 | param_vals = {} 105 | for name, param in _params.iteritems(): 106 | param_vals[name] = param.get_value() 107 | 108 | try: 109 | with open(path, 'wb') as f: 110 | pickle.dump(param_vals, f) 111 | except IOError: 112 | os.makedirs(os.path.split(path)[0]) 113 | f = open(path,"wb") 114 | pickle.dump(param_vals, f) 115 | 116 | def load_params(path): 117 | with open(path, 'rb') as f: 118 | param_vals = pickle.load(f) 119 | 120 | for name, val in param_vals.iteritems(): 121 | _params[name].set_value(val) 122 | 123 | def clear_all_params(): 124 | to_delete = [p_name for p_name in _params] 125 | for p_name in to_delete: 126 | del _params[p_name] 127 | 128 | __train_log_file_name = 'train_info.pkl' 129 | def save_training_info(values, path): 130 | """ 131 | Gets a set of values as dictionary and append them to a log file. 132 | stores in /train_log.pkl 133 | """ 134 | file_name = os.path.join(path, __train_log_file_name) 135 | try: 136 | with open(file_name, "rb") as f: 137 | log = pickle.load(f) 138 | except IOError: # first time 139 | if not os.path.exists(path): 140 | os.makedirs(path) 141 | log = {} 142 | for k in values.keys(): 143 | log[k] = [] 144 | for k, v in values.items(): 145 | log[k].append(v) 146 | with open(file_name, "wb") as f: 147 | pickle.dump(log, f) 148 | 149 | def plot_traing_info(x, ylist, path): 150 | """ 151 | Loads log file and plot x and y values as provided by input. 152 | Saves as /train_log.png 153 | """ 154 | file_name = os.path.join(path, __train_log_file_name) 155 | try: 156 | with open(file_name, "rb") as f: 157 | log = pickle.load(f) 158 | except IOError: # first time 159 | warnings.warn("There is no {} file here!!!".format(file_name)) 160 | return 161 | plt.figure() 162 | x_vals = log[x] 163 | for y in ylist: 164 | y_vals = log[y] 165 | if len(y_vals) != len(x_vals): 166 | warning.warn("One of y's: {} does not have the same length as x:{}".format(y, x)) 167 | plt.plot(x_vals, y_vals, label=y) 168 | # assert len(y_vals) == len(x_vals), "not the same len" 169 | plt.xlabel(x) 170 | plt.legend() 171 | #plt.show() 172 | plt.savefig(file_name[:-3]+'png', bbox_inches='tight') 173 | plt.close('all') 174 | -------------------------------------------------------------------------------- /lib/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/lib/__init__.pyc -------------------------------------------------------------------------------- /lib/ops.py: -------------------------------------------------------------------------------- 1 | import lib 2 | import numpy as np 3 | import numpy 4 | import theano 5 | import theano.tensor as T 6 | theano.config.floatX='float32' 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 8 | import time 9 | import lasagne 10 | import math 11 | 12 | srng = RandomStreams(seed=234) 13 | 14 | def BatchNorm(layer_name,input, insize, mode=0,run_mode=0, momentum=0.9, layer='default'): 15 | ''' 16 | # params : 17 | input_shape : 18 | when mode is 0, we assume 2D input. (mini_batch_size, # features) 19 | when mode is 1, we assume 4D input. (mini_batch_size, # of channel, # row, # column) 20 | mode : 21 | 0 : feature-wise mode (normal BN) 22 | 1 : window-wise mode (CNN mode BN) 23 | momentum : momentum for exponential average 24 | ''' 25 | input_shape = input.shape 26 | # random setting of gamma and beta, setting initial mean and std 27 | rng = np.random.RandomState(int(time.time())) 28 | 29 | gamma_val = np.asarray(rng.uniform(low=-1.0/math.sqrt(insize), high=1.0/math.sqrt(insize), size=(insize)),dtype=theano.config.floatX) 30 | if layer=='recurrent': 31 | gamma = lib.param(layer_name+'.gamma', np.full(shape=(insize),fill_value=0.1,dtype=theano.config.floatX), borrow=True) 32 | else: 33 | gamma = lib.param(layer_name+'.gamma', gamma_val, borrow=True) 34 | beta = lib.param(layer_name+'.beta',np.zeros((insize), dtype=theano.config.floatX), borrow=True) 35 | mean = lib.param(layer_name+'.mean',np.zeros((insize),dtype=theano.config.floatX), train=False, borrow=True) 36 | var = lib.param(layer_name+'.var',np.ones((insize), dtype=theano.config.floatX),train = False, borrow=True) 37 | 38 | epsilon = 1e-06 39 | 40 | if mode==0 : 41 | if run_mode==0 : 42 | now_mean = T.mean(input, axis=0) 43 | now_var = T.var(input, axis=0) 44 | now_normalize = (input - now_mean) / T.sqrt(now_var+epsilon) # should be broadcastable.. 45 | output = gamma * now_normalize + beta 46 | # mean, var update 47 | # run_mean = theano.clone(mean,share_inputs=False) 48 | # run_var = theano.clone(var, share_inputs=False) 49 | # run_mean.default_update = momentum * mean + (1.0-momentum) * now_mean 50 | # run_var.default_update = momentum * var + (1.0-momentum) * (input_shape[0]/(input_shape[0]-1)*now_var) 51 | mean = momentum*mean + (1.0-momentum) * now_mean 52 | var = momentum*var + (1.0-momentum)*(input_shape[0]/(input_shape[0]-1))*now_var 53 | else : 54 | output = gamma * (input - mean) / T.sqrt(var+epsilon) + beta 55 | 56 | else : 57 | # in CNN mode, gamma and beta exists for every single channel separately. 58 | # for each channel, calculate mean and std for (mini_batch_size * row * column) elements. 59 | # then, each channel has own scalar gamma/beta parameters. 60 | axes = (0,2,3) 61 | if run_mode==0 : 62 | now_mean = T.mean(input, axis=axes) 63 | now_var = T.var(input, axis=axes) 64 | # mean, var update 65 | # run_mean = theano.clone(mean,share_inputs=False) 66 | # run_var = theano.clone(var, share_inputs=False) 67 | # run_mean.default_update = momentum * mean + (1.0-momentum) * now_mean 68 | # run_var.default_update = momentum * var + (1.0-momentum) * (input_shape[0]/(input_shape[0]-1)*now_var) 69 | mean = momentum*mean + (1.0-momentum) * now_mean 70 | var = momentum*var + (1.0-momentum)*(input_shape[0]/(input_shape[0]-1))*now_var 71 | else : 72 | now_mean = mean 73 | now_var = var 74 | # change shape to fit input shape 75 | 76 | param_axes = iter(range(input.ndim - len(axes))) 77 | pattern = ['x' if input_axis in axes 78 | else next(param_axes) 79 | for input_axis in range(input.ndim)] 80 | now_mean = now_mean.dimshuffle(pattern) 81 | now_var = now_var.dimshuffle(pattern) 82 | now_gamma = gamma.dimshuffle(pattern) 83 | now_beta = beta.dimshuffle(pattern) 84 | output = now_gamma * (input - now_mean) / T.sqrt(now_var+epsilon) + now_beta 85 | 86 | return output.astype('float32') 87 | 88 | def get_fans(shape): 89 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 90 | fan_out = shape[1] if len(shape) == 2 else shape[0] 91 | return fan_in, fan_out 92 | 93 | def glorot_uniform(shape,init='glorot'): 94 | def uniform(shape, scale=0.05, name=None): 95 | return np.random.uniform(low=-scale, high=scale, size=shape) 96 | fan_in, fan_out = get_fans(shape) 97 | s = np.sqrt(6. / (fan_in + fan_out)) 98 | if init=='he': 99 | s = np.sqrt(6./fan_in) 100 | return uniform(shape,s) 101 | else: 102 | return uniform(shape, s) 103 | 104 | def init_weights(fan_in,fan_out,init='he'): 105 | 106 | def uniform(stdev, size): 107 | """uniform distribution with the given stdev and size""" 108 | return numpy.random.uniform( 109 | low=-stdev * numpy.sqrt(3), 110 | high=stdev * numpy.sqrt(3), 111 | size=size 112 | ).astype(theano.config.floatX) 113 | 114 | if init == 'lecun' or (init == None and fan_in != fan_out): 115 | weight_values = uniform(numpy.sqrt(1. / fan_in), (fan_in, fan_out)) 116 | 117 | elif init == 'he': 118 | weight_values = uniform(numpy.sqrt(2. / fan_in), (fan_in, fan_out)) 119 | 120 | elif init == 'orthogonal' or (init == None and fan_in == fan_out): 121 | # From lasagne 122 | def sample(shape): 123 | if len(shape) < 2: 124 | raise RuntimeError("Only shapes of length 2 or more are " 125 | "supported.") 126 | flat_shape = (shape[0], numpy.prod(shape[1:])) 127 | # TODO: why normal and not uniform? 128 | a = numpy.random.normal(0.0, 1.0, flat_shape) 129 | u, _, v = numpy.linalg.svd(a, full_matrices=False) 130 | # pick the one with the correct shape 131 | q = u if u.shape == flat_shape else v 132 | q = q.reshape(shape) 133 | return q.astype(theano.config.floatX) 134 | weight_values = sample((fan_in, fan_out)) 135 | return weight_values 136 | 137 | def Dense(name, input_dim, output_dim, inputs, bias=True, init=None, weightnorm=True,hidden_dim=None): 138 | 139 | weight_values = init_weights(input_dim,output_dim,init) 140 | 141 | weight = lib.param( 142 | name + '.W', 143 | weight_values 144 | ) 145 | 146 | batch_size = None 147 | if inputs.ndim==3: 148 | batch_size = inputs.shape[0] 149 | inputs = inputs.reshape((-1,input_dim)) 150 | 151 | if weightnorm: 152 | norm_values = numpy.linalg.norm(weight_values, axis=0) 153 | norms = lib.param( 154 | name + '.g', 155 | norm_values 156 | ) 157 | 158 | normed_weight = weight * (norms / weight.norm(2, axis=0)).dimshuffle('x', 0) 159 | result = T.dot(inputs, normed_weight) 160 | 161 | else: 162 | result = T.dot(inputs, weight) 163 | 164 | if bias: 165 | b = lib.param( 166 | name + '.b', 167 | numpy.zeros((output_dim,), dtype=theano.config.floatX) 168 | ) 169 | result += b 170 | 171 | result.name = name+".output" 172 | if batch_size!=None: 173 | return result.reshape((batch_size,hidden_dim,output_dim)) 174 | else: 175 | return result 176 | 177 | def Embedding(name, n_symbols, output_dim, indices): 178 | vectors = lib.param( 179 | name, 180 | numpy.random.randn( 181 | n_symbols, 182 | output_dim 183 | ).astype(theano.config.floatX) 184 | ) 185 | 186 | output_shape = tuple(list(indices.shape) + [output_dim]) 187 | 188 | return vectors[indices.flatten()].reshape(output_shape) 189 | 190 | def softmax_and_sample(logits): 191 | old_shape = logits.shape 192 | flattened_logits = logits.reshape((-1, logits.shape[logits.ndim-1])) 193 | samples = T.cast( 194 | srng.multinomial(pvals=T.nnet.softmax(flattened_logits)), 195 | theano.config.floatX 196 | ).reshape(old_shape) 197 | return T.argmax(samples, axis=samples.ndim-1) 198 | 199 | def GRUStep(name, input_dim, hidden_dim, x_t, h_tm1): 200 | processed_input = lib.ops.Dense( 201 | name+'.Input', 202 | input_dim, 203 | 3 * hidden_dim, 204 | x_t 205 | ) 206 | 207 | gates = T.nnet.sigmoid( 208 | lib.ops.Dense( 209 | name+'.Recurrent_Gates', 210 | hidden_dim, 211 | 2 * hidden_dim, 212 | h_tm1, 213 | bias=False 214 | ) + processed_input[:, :2*hidden_dim] 215 | ) 216 | 217 | update = gates[:, :hidden_dim] 218 | reset = gates[:, hidden_dim:] 219 | 220 | scaled_hidden = reset * h_tm1 221 | 222 | candidate = T.tanh( 223 | lib.ops.Dense( 224 | name+'.Recurrent_Candidate', 225 | hidden_dim, 226 | hidden_dim, 227 | scaled_hidden, 228 | bias=False, 229 | init='orthogonal' 230 | ) + processed_input[:, 2*hidden_dim:] 231 | ) 232 | 233 | one = lib.floatX(1.0) 234 | return (update * candidate) + ((one - update) * h_tm1) 235 | 236 | def __ConvLSTMStep( 237 | name, 238 | seq_len, 239 | input_dim, 240 | hidden_dim, 241 | current_input, 242 | last_hidden, 243 | last_cell, 244 | dilation_depth=10, 245 | inp_bias_init=0., 246 | forget_bias_init=3., 247 | out_bias_init=0., 248 | g_bias_init=0.): 249 | # X_t*(U^i, U^f, U^o, U^g) 250 | 251 | dilations = [2**i for i in xrange(dilation_depth)] 252 | prev_conv = current_input 253 | last_cell_stack = T.concatenate((last_cell,last_cell),axis=1) 254 | for i,value in enumerate(dilations): 255 | #prev_conv = lib.ops.conv1d(name+".WaveNetConv%d"%(i+1),prev_conv,2,1,hidden_dim,input_dim,True,False,pad=(dilation,0),filter_dilation=(dilation,1))[:,:,:current_input.shape[2],:] 256 | prev_conv,y = lib.ops.WaveNetConv1d("WaveNetBlock-%d"%(i+1),prev_conv,2,hidden_dim,input_dim,bias=True,batchnorm=False,dilation=value) 257 | 258 | prev_conv = T.concatenate((prev_conv,last_hidden),axis=1) 259 | prev_conv = lib.ops.conv1d(name+".ConvGates",prev_conv,1,1,4*hidden_dim,2*input_dim,True,False) 260 | 261 | W_cell = lib.param(name+'.CellWeights',lasagne.init.HeNormal().sample((3*hidden_dim,seq_len,1))) 262 | inp_forget = T.nnet.sigmoid(prev_conv[:,:2*hidden_dim] + W_cell[:2*hidden_dim]*last_cell_stack) 263 | i_t = inp_forget[:,:hidden_dim] 264 | f_t = inp_forget[:,hidden_dim:] 265 | 266 | C_t = f_t*last_cell + i_t*T.tanh(prev_conv[:,2*hidden_dim:3*hidden_dim]) 267 | 268 | o_t = T.nnet.sigmoid(prev_conv[:,3*hidden_dim:]+W_cell[2*hidden_dim:]*C_t) 269 | 270 | H_t = o_t*T.tanh(C_t) 271 | 272 | return H_t,C_t 273 | 274 | def ConvLSTM(name, seq_len, input_dim, hidden_dim, inputs, h0=None, c0=None): 275 | #inputs.shape = (batch_size,N_FRAMES,FRAME_SIZE) 276 | 277 | def step(x_t, h_tm1, c_tm1): 278 | return __ConvLSTMStep( 279 | name+'.Step', 280 | seq_len, 281 | input_dim, 282 | hidden_dim, 283 | x_t, 284 | h_tm1, 285 | c_tm1 286 | ) 287 | 288 | outputs, _ = theano.scan( 289 | step, 290 | sequences=[inputs], 291 | outputs_info=[h0,c0], 292 | ) 293 | 294 | return outputs 295 | 296 | def GRU(name, input_dim, hidden_dim, inputs, h0=None): 297 | #inputs.shape = (batch_size,N_FRAMES,FRAME_SIZE) 298 | inputs = inputs.transpose(1,0,2) 299 | 300 | def step(x_t, h_tm1): 301 | return GRUStep( 302 | name+'.Step', 303 | input_dim, 304 | hidden_dim, 305 | x_t, 306 | h_tm1 307 | ) 308 | 309 | outputs, _ = theano.scan( 310 | step, 311 | sequences=[inputs], 312 | outputs_info=[h0], 313 | ) 314 | 315 | out = outputs.dimshuffle(1,0,2) 316 | out.name = name+'.output' 317 | return out 318 | 319 | 320 | 321 | def recurrent_fn(x_t, h_tm1,name,input_dim,hidden_dim,W1,b1,W2,b2): 322 | A1 = T.nnet.sigmoid(BatchNorm(name+".Inp2Hid",T.dot(x_t,W1[:input_dim]),2*hidden_dim,layer='recurrent') + 323 | BatchNorm(name+".Hid2Hid",T.dot(h_tm1,W1[input_dim:]),2*hidden_dim,layer='recurrent') + b1) 324 | 325 | #A1 = T.nnet.sigmoid(T.dot(T.concatenate((x_t,h_tm1),axis=1),W1) + b1) 326 | 327 | z = A1[:,:hidden_dim] 328 | 329 | r = A1[:,hidden_dim:] 330 | 331 | scaled_hidden = r*h_tm1 332 | 333 | h = T.tanh(BatchNorm(name+".Candidate",T.dot(T.concatenate((scaled_hidden,x_t),axis=1),W2),hidden_dim,layer='recurrent')+b2) 334 | 335 | # h = T.tanh(T.dot(T.concatenate((scaled_hidden,x_t),axis=1),W2)+b2) 336 | 337 | one = lib.floatX(1.0) 338 | return ((z * h) + ((one - z) * h_tm1)).astype('float32') 339 | 340 | def myGRU(name, input_dim, hidden_dim, inputs, h0=None): 341 | #inputs.shape = (batch_size,N_FRAMES,FRAME_SIZE) 342 | inputs = inputs.transpose(1,0,2) 343 | 344 | weight_values = lasagne.init.GlorotUniform().sample((input_dim+hidden_dim,2*hidden_dim)) 345 | W1 = lib.param( 346 | name+'.Gates.W', 347 | weight_values 348 | ) 349 | 350 | b1 = lib.param( 351 | name+'.Gates.b', 352 | np.ones(2*hidden_dim).astype(theano.config.floatX) 353 | ) 354 | 355 | weight_values = lasagne.init.GlorotUniform().sample((input_dim+hidden_dim,hidden_dim)) 356 | W2 = lib.param( 357 | name+'.Candidate.W', 358 | weight_values 359 | ) 360 | 361 | b2 = lib.param( 362 | name+'.Candidate.b', 363 | np.zeros(hidden_dim).astype(theano.config.floatX) 364 | ) 365 | 366 | def step(x_t, h_tm1): 367 | return recurrent_fn( 368 | x_t, 369 | h_tm1, 370 | name, 371 | input_dim, 372 | hidden_dim, 373 | W1,b1,W2,b2 374 | ) 375 | 376 | outputs, _ = theano.scan( 377 | step, 378 | sequences=[inputs], 379 | outputs_info=[h0], 380 | ) 381 | 382 | out = outputs.dimshuffle(1,0,2) 383 | out.name = name+'.output' 384 | return out 385 | 386 | 387 | def recurrent_fn_hred(x_t, h_tm1,hidden_dim,W1,b1,W2,b2): 388 | global DIM 389 | #A1 = T.nnet.sigmoid(lib.ops.BatchNorm(T.dot(T.concatenate((x_t,h_tm1),axis=1),W1),name="FrameLevel.GRU"+str(name)+".Input.",length=2*512) + b1) 390 | A1 = T.nnet.sigmoid(T.dot(T.concatenate((x_t,h_tm1),axis=1),W1) + b1) 391 | 392 | z = A1[:,:hidden_dim] 393 | 394 | r = A1[:,hidden_dim:] 395 | 396 | scaled_hidden = r*h_tm1 397 | 398 | #h = T.tanh(lib.ops.BatchNorm(T.dot(T.concatenate((scaled_hidden,x_t),axis=1),W2),name="FrameLevel.GRU"+str(name)+".Output.",length=512)+b2) 399 | h = T.tanh(T.dot(T.concatenate((scaled_hidden,x_t),axis=1),W2) + b2) 400 | 401 | one = lib.floatX(1.0) 402 | return ((z * h) + ((one - z) * h_tm1)).astype('float32') 403 | 404 | def HRED_GRU(name, input_dim, hidden_dim, inputs, h0=None): 405 | #inputs.shape = (batch_size,N_FRAMES,FRAME_SIZE) 406 | global DIM 407 | inputs = inputs.transpose(1,0,2) 408 | 409 | weight_values = init_weights(input_dim+hidden_dim,2*hidden_dim) 410 | 411 | s_W1 = lib.param( 412 | 'Session.Gates.W', 413 | weight_values 414 | ) 415 | 416 | s_b1 = lib.param( 417 | 'Session.Gates.b', 418 | np.ones(2*hidden_dim).astype(theano.config.floatX) 419 | ) 420 | 421 | weight_values = init_weights(input_dim+hidden_dim,hidden_dim) 422 | s_W2 = lib.param( 423 | 'Session.Candidate.W', 424 | weight_values 425 | ) 426 | 427 | s_b2 = lib.param( 428 | 'Session.Candidate.b', 429 | np.zeros(hidden_dim).astype(theano.config.floatX) 430 | ) 431 | 432 | weight_values = init_weights(input_dim+hidden_dim,2*hidden_dim) 433 | 434 | W1 = lib.param( 435 | name+'.Gates.W', 436 | weight_values 437 | ) 438 | 439 | b1 = lib.param( 440 | name+'.Gates.b', 441 | np.ones(2*hidden_dim).astype(theano.config.floatX) 442 | ) 443 | 444 | weight_values = init_weights(input_dim+hidden_dim,hidden_dim) 445 | W2 = lib.param( 446 | name+'.Candidate.W', 447 | weight_values 448 | ) 449 | 450 | b2 = lib.param( 451 | name+'.Candidate.b', 452 | np.zeros(hidden_dim).astype(theano.config.floatX) 453 | ) 454 | 455 | outputs, _ = theano.scan( 456 | recurrent_fn_hred, 457 | sequences=[inputs], 458 | outputs_info=[T.alloc(0,inputs.shape[1],hidden_dim).astype(dtype=theano.config.floatX)], 459 | non_sequences=[hidden_dim,W1,b1,W2,b2] 460 | ) 461 | 462 | #out = recurrent_fn(outputs[-1],h0,hidden_dim,s_W1,s_b1,s_W2,s_b2,"0") 463 | out = recurrent_fn(outputs[-1],h0,hidden_dim,s_W1,s_b1,s_W2,s_b2) 464 | 465 | #DIM=hidden_dim 466 | #out = outputs.dimshuffle(1,0,2) 467 | #out.name = name+'.output' 468 | return out 469 | 470 | 471 | def conv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False,pad='valid',filter_dilation=(1,1),run_mode=0): 472 | W = lib.param( 473 | name+'.W', 474 | lasagne.init.HeNormal().sample((n_filters,depth,kernel,1)).astype('float32') 475 | ) 476 | 477 | out = T.nnet.conv2d(input,W,subsample=(stride,1),border_mode=pad,filter_dilation=filter_dilation) 478 | 479 | if bias: 480 | b = lib.param( 481 | name + '.b', 482 | np.zeros(n_filters).astype('float32') 483 | ) 484 | 485 | out += b[None,:,None,None] 486 | 487 | if batchnorm: 488 | out = BatchNorm(name,out,n_filters,mode=1,run_mode=run_mode) 489 | 490 | return out 491 | 492 | def ResNetConv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False): 493 | if stride==1 and n_filters==depth: 494 | project = input 495 | else: 496 | project = lib.ops.conv1d(name+".Projection.conv",input,1,stride,n_filters,depth,bias=bias,batchnorm=batchnorm) 497 | pad = (kernel-1)/2 498 | conv1 = T.nnet.relu(lib.ops.conv1d(name+".conv1",input,kernel,stride,n_filters,depth,bias=bias,batchnorm=batchnorm,pad=(0,pad))) 499 | conv2 = lib.ops.conv1d(name+".conv2",conv1,kernel,1,n_filters,n_filters,bias=bias,batchnorm=batchnorm,pad=(0,pad)) 500 | 501 | out = T.nnet.relu(conv2+project) 502 | return out 503 | 504 | def WaveNetConv1d(name,input,kernel,n_filters,depth,bias=False,batchnorm=False,dilation=1): 505 | conv1 = lib.ops.conv1d(name+".filter&gate",input,kernel,1,2*n_filters,depth,True,batchnorm,pad=(dilation,0),filter_dilation=(dilation,1))[:,:,:input.shape[2],:] 506 | z = T.nnet.sigmoid(conv1[:,:n_filters,:,:])*T.tanh(conv1[:,n_filters:,:,:]) 507 | out = lib.ops.conv1d(name+".projection¶m_skip",z,1,1,2*depth,n_filters,bias=bias,batchnorm=batchnorm) 508 | return out[:,:depth,:,:]+input,out[:,depth:,:,:] 509 | 510 | def DenseNetConv1d(name,input,kernel,n_filters,depth,bias=False,batchnorm=False,dilation=1): 511 | conv1 = lib.ops.conv1d(name+".filter&gate",input,kernel,1,2*n_filters,depth,True,batchnorm,pad=(dilation,0),filter_dilation=(dilation,1))[:,:,:input.shape[2],:] 512 | z = T.nnet.sigmoid(conv1[:,:n_filters,:,:])*T.tanh(conv1[:,n_filters:,:,:]) 513 | return z 514 | 515 | def ResNetDeconv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False,act=True): 516 | if stride==1 and n_filters==depth: 517 | project = input 518 | else: 519 | project = lib.ops.deconv1d(name+".Projection.conv",input,1,stride,n_filters,depth,bias=bias,batchnorm=batchnorm,output=stride*input.shape[-1]) 520 | pad = (kernel-1)/2 521 | 522 | conv2 = T.nnet.relu(lib.ops.deconv1d(name+".conv2",input,kernel,1,n_filters,depth,bias=bias,batchnorm=batchnorm,output=input.shape[-1],pad=(0,pad))) 523 | conv1 = T.nnet.relu(lib.ops.deconv1d(name+".conv1",conv2,kernel,stride,n_filters,n_filters,bias=bias,batchnorm=batchnorm,output=stride*conv2.shape[-1],pad=(0,pad))) 524 | 525 | if act: 526 | out = T.nnet.relu(conv1+project) 527 | else: 528 | out = conv1+project 529 | return out 530 | 531 | def deconv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False,pad='valid'): 532 | 533 | o = output = stride*(input.shape[2]-1) + kernel 534 | if type(pad)==tuple: 535 | o -= 2*pad[0] 536 | 537 | W = lib.param( 538 | name+'.W', 539 | lasagne.init.GlorotUniform().sample((depth,n_filters,kernel,1)).astype('float32') 540 | ) 541 | 542 | out = T.nnet.abstract_conv.conv2d_grad_wrt_inputs(output_grad=input,filters=W,input_shape=(None,n_filters,o,1),border_mode=pad,subsample=(stride,1)) 543 | 544 | if bias: 545 | b = lib.param( 546 | name + '.b', 547 | np.zeros(n_filters).astype('float32') 548 | ) 549 | 550 | out += b[None,:,None,None] 551 | 552 | if batchnorm: 553 | out = BatchNorm(name,out,n_filters,mode=1) 554 | 555 | return out 556 | 557 | def pool1d(input,subsample,pad,pool_indices=None): 558 | import theano.tensor.signal.pool 559 | out = T.signal.pool.pool_2d(input,(subsample,1),ignore_border=True,padding=(pad,0)) 560 | if pool_indices: 561 | indices = T.grad(None,wrt=input,known_grads={out:T.ones_like(out)}) 562 | return out,indices 563 | return out 564 | 565 | def unpool1d(input,upsample,desired_length,pool_indices=None): 566 | out = T.extra_ops.repeat(input,upsample,axis=2) 567 | if pool_indices: 568 | temp = pool_indices*out[:,:,upsample-1:upsample-1] 569 | pad = T.alloc(0,temp.shape[0],temp.shape[1],upsample-1,temp.shape[3]) 570 | return T.concatenate((pad,temp),axis=2)[:,:,:desired_length] 571 | return out[:,:,:desired_length] 572 | -------------------------------------------------------------------------------- /lib/ops.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/lib/ops.pyc -------------------------------------------------------------------------------- /lib/optimizers.py: -------------------------------------------------------------------------------- 1 | import lib 2 | import theano 3 | import numpy as np 4 | import theano.tensor as T 5 | 6 | def RMSprop(cost, params, learnrate, rho=0.90, epsilon=1e-6): 7 | gparams = [] 8 | iter = 1 9 | for param in params: 10 | gparam = T.grad(cost, param) 11 | gparams.append(gparam) 12 | print param['name'] + " completed" 13 | updates=[] 14 | for param, gparam in zip(params, gparams): 15 | acc = theano.shared(param.get_value() * 0.) 16 | acc_new = rho * acc + (1 - rho) * gparam ** 2 17 | gradient_scaling = T.sqrt(acc_new + epsilon) 18 | gparam = gparam / gradient_scaling 19 | updates.append((acc, acc_new)) 20 | updates.append((param, param - gparam * learnrate)) 21 | return updates 22 | 23 | def Adam(cost, params, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8,gradClip=True,value=1.): 24 | gparams = [] 25 | iter = 1 26 | for param in params: 27 | gparam = T.grad(cost,param) 28 | if gradClip: 29 | gparam = T.clip(gparam,lib.floatX(-value), lib.floatX(value)) 30 | gparams.append(gparam) 31 | print str(iter) + " completed" 32 | iter += 1 33 | updates = [] 34 | for p, g in zip(params, gparams): 35 | m = theano.shared(p.get_value() * 0.) 36 | v = theano.shared(p.get_value() * 0.) 37 | 38 | m_new = beta1 * m + (1 - beta1) * g 39 | v_new = beta2 * v + (1 - beta2) * (g ** 2) 40 | 41 | gradient_scaling = T.sqrt(v_new + epsilon) 42 | updates.append((m, m_new)) 43 | updates.append((v, v_new)) 44 | updates.append((p, p - lr * m / gradient_scaling)) 45 | return updates 46 | -------------------------------------------------------------------------------- /lib/optimizers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/lib/optimizers.pyc -------------------------------------------------------------------------------- /my_three_tier.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.append(os.getcwd()) 3 | 4 | import numpy as np 5 | import numpy 6 | numpy.random.seed(123) 7 | import random 8 | random.seed(123) 9 | 10 | import dataset 11 | 12 | import theano 13 | import theano.tensor as T 14 | theano.config.floatX='float32' 15 | from theano.tensor.nnet import neighbours 16 | import theano.ifelse 17 | import lib 18 | import lib.optimizers 19 | import lasagne 20 | import scipy.io.wavfile 21 | 22 | import time 23 | import functools 24 | import itertools 25 | 26 | # Hyperparams 27 | NB_EPOCH=10 28 | BATCH_SIZE = 128 29 | N_FRAMES = 256 # How many 'frames' to include in each truncated BPTT pass 30 | FRAME_SIZE = 4 # How many samples per frame 31 | DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples. 32 | N_GRUS = 2 # How many GRUs to stack in the frame-level model 33 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 34 | GRAD_CLIP = 1 # Elementwise grad clip threshold 35 | 36 | # Dataset 37 | DATA_PATH = '/data/lisatmp3/kumarrit/blizzard' 38 | N_FILES = 1000 39 | # DATA_PATH = '/PersimmonData/kiwi_parts' 40 | # N_FILES = 516 41 | BITRATE = 16000 42 | 43 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 44 | SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence 45 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 46 | 47 | print "Model settings:" 48 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 49 | all_vars = sorted(all_vars, key=lambda x: x[0]) 50 | for var_name, var_value in all_vars: 51 | print "\t{}: {}".format(var_name, var_value) 52 | 53 | def frame_level_rnn(input_sequences, h0, reset): 54 | """ 55 | input_sequences.shape: (batch size, N_FRAMES * FRAME_SIZE) 56 | h0.shape: (batch size, N_GRUS, DIM) 57 | reset.shape: () 58 | output.shape: (batch size, N_FRAMES * FRAME_SIZE, DIM) 59 | """ 60 | batch_size = input_sequences.shape[0] 61 | n_frames = input_sequences.shape[1]/FRAME_SIZE 62 | 63 | learned_h0 = lib.param( 64 | 'FrameLevel.h0', 65 | numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) 66 | ) 67 | 68 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) 69 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 70 | 71 | frames = input_sequences.reshape(( 72 | input_sequences.shape[0], 73 | input_sequences.shape[1] / (FRAME_SIZE * FRAME_SIZE), 74 | FRAME_SIZE*FRAME_SIZE 75 | )) 76 | 77 | # frames = emb.reshape(( 78 | # input_sequences.shape[0], 79 | # input_sequences.shape[1] / (FRAME_SIZE*FRAME_SIZE), 80 | # FRAME_SIZE*Q_LEVELS 81 | # )) 82 | 83 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 84 | # (a reasonable range to pass as inputs to the RNN) 85 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 86 | # frames *= lib.floatX(2) 87 | 88 | gru1 = lib.ops.myGRU('FrameLevel.GRU1', FRAME_SIZE*FRAME_SIZE, DIM, frames, h0=h0[:, 0]) 89 | 90 | gru1_output = lib.ops.Dense( 91 | 'FrameLevel1.Output', 92 | DIM, 93 | FRAME_SIZE * DIM, 94 | gru1, 95 | init='he', 96 | hidden_dim=64 97 | ).reshape((batch_size,256,DIM)) 98 | 99 | 100 | gru2 = lib.ops.myGRU('FrameLevel.GRU2', DIM, DIM, gru1_output, h0=h0[:, 1]) 101 | #gru3 = lib.ops.myGRU('FrameLevel.GRU3', DIM, DIM, gru2, h0=h0[:, 2]) 102 | 103 | #gru1,gru2,gru3 = lib.ops.myGRU('FrameLevel.GRU', FRAME_SIZE, DIM, frames, h0=h0) 104 | 105 | # gru3.shape = (batch_size,N_FRAMES,DIM) 106 | 107 | gru2_output = lib.ops.Dense( 108 | 'FrameLevel2.Output', 109 | DIM, 110 | FRAME_SIZE * DIM, 111 | gru2, 112 | init='he', 113 | hidden_dim=256 114 | ).reshape((batch_size,1024,DIM)) 115 | 116 | 117 | last_hidden = T.stack([gru1[:, -1], gru2[:, -1]], axis=1) 118 | 119 | return (gru2_output, last_hidden) 120 | 121 | def sample_level_predictor(frame_level_outputs, prev_samples): 122 | """ 123 | frame_level_outputs.shape: (batch size*SEQ_LEN, DIM) 124 | prev_samples.shape: (batch size*SEQ_LEN, FRAME_SIZE) 125 | output.shape: (batch size*SEQ_LEN, Q_LEVELS) 126 | """ 127 | 128 | prev_samples = lib.ops.Embedding( 129 | 'SampleLevel.Embedding', 130 | Q_LEVELS, 131 | Q_LEVELS, 132 | prev_samples 133 | ).reshape((-1, FRAME_SIZE * Q_LEVELS)) 134 | 135 | # prev_samples.shape = (batch_size*SEQ_LEN,FRAME_SIZE,Q_LEVELS) 136 | 137 | out = lib.ops.Dense( 138 | 'SampleLevel.L1_PrevSamples', 139 | FRAME_SIZE * Q_LEVELS, 140 | DIM, 141 | prev_samples, 142 | bias=False, 143 | init='he', 144 | ) ##(128,256,512) 145 | out += frame_level_outputs 146 | out = T.nnet.relu(out) 147 | 148 | out = lib.ops.Dense('SampleLevel.L2', DIM, DIM, out, init='he') 149 | out = T.nnet.relu(out) 150 | 151 | out = lib.ops.Dense('SampleLevel.L3', DIM, DIM, out, init='he') 152 | out = T.nnet.relu(out) 153 | 154 | # We apply the softmax later 155 | return lib.ops.Dense('SampleLevel.Output', DIM, Q_LEVELS, out) 156 | 157 | sequences = T.imatrix('sequences') 158 | h0 = T.tensor3('h0') 159 | reset = T.iscalar('reset') 160 | 161 | input_sequences = sequences[:, :-FRAME_SIZE] 162 | target_sequences = sequences[:, FRAME_SIZE:] 163 | 164 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset) 165 | 166 | # frame_level_outputs.shape = (batch_size,SEQ_LEN,DIM) 167 | 168 | prev_samples = sequences[:, :-1] 169 | prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1)) 170 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') 171 | prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE)) 172 | 173 | sample_level_outputs = sample_level_predictor( 174 | frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)), 175 | prev_samples 176 | ) 177 | 178 | cost = T.nnet.categorical_crossentropy( 179 | T.nnet.softmax(sample_level_outputs), 180 | target_sequences.flatten() 181 | ).mean() 182 | 183 | # By default we report cross-entropy cost in bits. 184 | # Switch to nats by commenting out this line: 185 | cost = cost * lib.floatX(1.44269504089) 186 | 187 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 188 | 189 | lib.print_params_info(cost, params) 190 | 191 | #pdates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP) 192 | grads = T.grad(cost, wrt=params, disconnected_inputs='warn') 193 | 194 | #grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 195 | 196 | print "Gradients Computed" 197 | 198 | updates = lasagne.updates.adam(grads, params) 199 | 200 | train_fn = theano.function( 201 | [sequences, h0, reset], 202 | [cost, new_h0], 203 | updates=updates, 204 | on_unused_input='warn' 205 | ) 206 | 207 | frame_level_generate_fn = theano.function( 208 | [sequences, h0, reset], 209 | frame_level_rnn(sequences, h0, reset), 210 | on_unused_input='warn' 211 | ) 212 | 213 | frame_level_outputs = T.matrix('frame_level_outputs') 214 | prev_samples = T.imatrix('prev_samples') 215 | sample_level_generate_fn = theano.function( 216 | [frame_level_outputs, prev_samples], 217 | lib.ops.softmax_and_sample( 218 | sample_level_predictor( 219 | frame_level_outputs, 220 | prev_samples 221 | ) 222 | ), 223 | on_unused_input='warn' 224 | ) 225 | 226 | def generate_and_save_samples(tag): 227 | 228 | def write_audio_file(name, data): 229 | 230 | data = data.astype('float32') 231 | data -= data.min() 232 | data /= data.max() 233 | data -= 0.5 234 | data *= 0.95 235 | 236 | import scipy.io.wavfile 237 | scipy.io.wavfile.write(name+'.wav',BITRATE,data) 238 | 239 | # Generate 5 sample files, each 5 seconds long 240 | N_SEQS = 5 241 | LENGTH = 8*BITRATE 242 | 243 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 244 | samples[:, :FRAME_SIZE] = Q_ZERO 245 | 246 | h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32') 247 | frame_level_outputs = None 248 | 249 | for t in xrange(FRAME_SIZE, LENGTH): 250 | 251 | if t % FRAME_SIZE == 0: 252 | frame_level_outputs, h0 = frame_level_generate_fn( 253 | samples[:, t-FRAME_SIZE:t], 254 | h0, 255 | numpy.int32(t == FRAME_SIZE) 256 | ) 257 | 258 | samples[:, t] = sample_level_generate_fn( 259 | frame_level_outputs[:, t % FRAME_SIZE], 260 | samples[:, t-FRAME_SIZE:t] 261 | ) 262 | 263 | for i in xrange(N_SEQS): 264 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 265 | 266 | print "Training!" 267 | total_iters = 0 268 | 269 | for epoch in xrange(NB_EPOCH): 270 | h0 = np.zeros((BATCH_SIZE, N_GRUS, DIM)).astype(theano.config.floatX) 271 | costs = [] 272 | times = [] 273 | data = dataset.get_data(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN+FRAME_SIZE, 0, Q_LEVELS, Q_ZERO) 274 | 275 | for seqs, reset in data: 276 | start_time = time.time() 277 | cost, h0 = train_fn(seqs, h0, reset) 278 | total_time = time.time() - start_time 279 | times.append(total_time) 280 | total_iters += 1 281 | print "Batch ",total_iters 282 | costs.append(cost) 283 | print "\tCost: ",np.mean(costs) 284 | print "\tTime: ",np.mean(times) 285 | if total_iters%10000==0: 286 | generate_and_save_samples('iterno_%d'%total_iters) 287 | break 288 | # print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format( 289 | # epoch, 290 | # total_iters, 291 | # numpy.mean(costs), 292 | # total_time, 293 | # total_time / total_iters 294 | # ) 295 | # tag = "iters{}_time{}".format(total_iters, total_time) 296 | # generate_and_save_samples(tag) 297 | # lib.save_params('params_{}.pkl'.format(tag)) 298 | 299 | # costs = [] 300 | # last_print_time += PRINT_TIME 301 | # last_print_iters += PRINT_ITERS 302 | -------------------------------------------------------------------------------- /temp.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import numpy as np 3 | import theano.tensor as T 4 | arr = np.arange(1,33).astype('float32').reshape((1,1,1,32)) 5 | filt = np.asarray([1,2,3]).astype('float32').reshape((1,1,1,3)) 6 | filt2 = np.asarray([1]).astype('float32').reshape((1,1,1,1)) 7 | 8 | arr = T.nnet.abstract_conv.conv2d_grad_wrt_inputs(output_grad=arr,filters=filt,input_shape=(None,1,1,arr.shape[-1]*2+1),subsample=(1,2)).eval() 9 | 10 | filt = np.asarray([1,1,1]).astype('float32').reshape((1,1,1,3)) 11 | arr = T.nnet.abstract_conv.conv2d_grad_wrt_inputs(output_grad=arr,filters=filt,input_shape=(None,1,1,arr.shape[-1]+1)).eval() 12 | 13 | #arr = T.nnet.conv2d(arr,filt,subsample=(1,2),border_mode=(0,1)).eval() 14 | #arr = T.concatenate((arr[:,:,:,0][:,:,:,None],arr),axis=-1).eval() 15 | 16 | 17 | import numpy as np 18 | import theano 19 | import theano.tensor as T 20 | import theano.tensor.signal.pool 21 | 22 | def unpool1d(input,upsample,desired_length,pool_indices=None): 23 | out = T.extra_ops.repeat(input,upsample,axis=2)[:,:,:desired_length] 24 | if pool_indices: 25 | mask = T.lt(pool_indices,0) 26 | return mask*out 27 | return out 28 | 29 | dilations = np.asarray([[1,2,4,8,16,32]*1]).tolist()[0] 30 | length=6000 31 | N_BLOCKS=2 32 | arr = -1*np.arange(1,length+1).astype('float32').reshape((1,1,length,1)) 33 | W = np.asarray([0,1]).astype('float32').reshape((1,1,2,1)) 34 | indices=[] 35 | for j in xrange(N_BLOCKS): 36 | for value in dilations: 37 | # arr = T.nnet.conv2d(arr,W,filter_dilation=(1,value)) 38 | arr = T.nnet.conv2d(arr,W,filter_dilation=(value,1),border_mode=(value,0))[:,:,:length] 39 | arr,idx = lib.ops.pool1d(arr,4,3,True) 40 | indices+=[idx] 41 | length=arr.shape[2] 42 | for j in xrange(N_BLOCKS): 43 | leng = indices[-(j+1)].shape[2] 44 | arr = unpool1d(arr,4,leng,indices[-(j+1)]) 45 | 46 | ((36*4+32)*4+32)*4+32 47 | 48 | l=49 49 | arr = T.as_tensor_variable(np.random.randint(0,256,(1,1,l,1)).astype('float32')) 50 | #arr = T.as_tensor_variable(np.arange(1,l+1).astype('float32').reshape((1,1,l,1))) 51 | (res,idx) = lib.ops.pool1d(arr,4,3,True) 52 | #res = lib.ops.pool1d(arr,4,3,False) 53 | res2 = T.as_tensor_variable(np.random.randint(0,256,res.shape.eval())) 54 | out = lib.ops.unpool1d(res,4,l,idx).eval() 55 | #out = lib.ops.unpool1d(res,4,None).eval() 56 | print res.shape.eval() 57 | print arr.eval().flatten() 58 | print res.eval().flatten() 59 | print idx.eval().flatten() 60 | #print res2.eval().flatten() 61 | print out.flatten() 62 | 63 | import lib 64 | import lib.ops 65 | import numpy as np 66 | import theano 67 | import theano.tensor as T 68 | input_sequences = T.imatrix() 69 | Q_LEVELS=256 70 | n_filters=64 71 | length = input_sequences.shape[1] 72 | start = (input_sequences.astype('float32')/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5))[:,None,None,:] 73 | conv1 = lib.ops.conv1d("causal-conv",start,2,1,n_filters,1,bias=False,batchnorm=False,pad=(0,1))[:,:,:,:length] 74 | f = theano.function([input_sequences],[conv1]) 75 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | import numpy 4 | numpy.random.seed(123) 5 | import random 6 | random.seed(123) 7 | import dataset 8 | import theano 9 | import theano.tensor as T 10 | theano.config.floatX='float32' 11 | import lib.ops 12 | import scipy.io.wavfile 13 | import time 14 | import lasagne 15 | 16 | 17 | # Hyperparams 18 | NB_EPOCH=10 19 | BATCH_SIZE = 32 20 | N_FRAMES = 256 # How many 'frames' to include in each truncated BPTT pass 21 | FRAME_SIZE = 768 # How many samples per frame 22 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 23 | GRAD_CLIP = 1 # Elementwise grad clip threshold 24 | DIM = 512 25 | # Dataset 26 | DATA_PATH = '/home/rithesh/DeepLearning/Vocal Synthesis/data' 27 | #DATA_PATH = '/data/lisatmp3/kumarrit/blizzard' 28 | N_FILES = 50 29 | BITRATE = 16000 30 | 31 | TEST_SET_SIZE = 128 # How many audio files to use for the test set 32 | SEQ_LEN = 8192 # Total length (# of samples) of each truncated BPTT sequence 33 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 34 | 35 | def network(input_sequences,h0,reset): 36 | 37 | batch_size = input_sequences.shape[0] 38 | 39 | learned_h0 = lib.param( 40 | 'Session.h0', 41 | numpy.zeros(DIM, dtype=theano.config.floatX) 42 | ) 43 | 44 | learned_h0 = T.alloc(learned_h0, h0.shape[0], DIM) 45 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 46 | 47 | emb = lib.ops.Embedding( 48 | 'Embedding', 49 | Q_LEVELS, 50 | Q_LEVELS, 51 | input_sequences, 52 | ).transpose(0,2,1)[:,:,None,:] #(32, 256, 1, 8960) 53 | 54 | # conv1 = T.nnet.relu(lib.ops.conv1d("conv1",emb,3,1,128,256,bias=True,batchnorm=True)) #(32, 512, 1, 255) - 289 - 31 55 | # conv2 = T.nnet.relu(lib.ops.conv1d("conv2",conv1,3,2,256,128,bias=True,batchnorm=True)) #(32, 512, 1, 127) - 143 - 15 56 | # conv3 = T.nnet.relu(lib.ops.conv1d("conv3",conv2,3,2,512,256,bias=True,batchnorm=True)) #(32, 512, 1, 63) - 71 - 7 57 | # conv4 = T.nnet.relu(lib.ops.conv1d("conv4",conv3,3,2,1024,512,bias=True,batchnorm=True)) #(32, 512, 1, 31) - 35 - 3 58 | # conv5 = T.nnet.relu(lib.ops.conv1d("conv5",conv4,3,2,2048,1024,bias=True,batchnorm=True)) #(32, 512, 1, 15) - 17 - 1 59 | start = lib.ops.ResNetConv1d("ResNet-Enc-0",emb,3,1,256,256,bias=True,batchnorm=True) # 8960 RF - 2 60 | rconv1 = lib.ops.ResNetConv1d("ResNet-Enc-1",start,3,2,128,256,bias=True,batchnorm=True) # 4480 RF - 5 61 | rconv2 = lib.ops.ResNetConv1d("ResNet-Enc-2",rconv1,3,2,128,128,bias=True,batchnorm=True) # 2240 RF - 11 62 | rconv3 = lib.ops.ResNetConv1d("ResNet-Enc-3",rconv2,3,2,128,128,bias=True,batchnorm=True) # 1120 RF - 23 63 | rconv4 = lib.ops.ResNetConv1d("ResNet-Enc-4",rconv3,3,2,256,128,bias=True,batchnorm=True) # 560 RF - 47 64 | rconv5 = lib.ops.ResNetConv1d("ResNet-Enc-5",rconv4,3,2,256,256,bias=True,batchnorm=True) # 280 RF - 95 65 | rconv6 = lib.ops.ResNetConv1d("ResNet-Enc-6",rconv5,3,2,256,256,bias=True,batchnorm=True) # 140 RF - 191 66 | rconv7 = lib.ops.ResNetConv1d("ResNet-Enc-7",rconv6,3,2,512,256,bias=True,batchnorm=True) # 70 RF - 383 67 | rconv8 = lib.ops.ResNetConv1d("ResNet-Enc-8",rconv7,3,2,512,512,bias=True,batchnorm=True) # 35 RF - 767 68 | 69 | #gru1 = lib.ops.myGRU('Encoder.GRU1',DIM,DIM,rconv7.transpose(2,0,3,1)[0][:,:15,:],h0=h0) # (32, 15, 512) 70 | gru1 = lib.ops.myGRU('Encoder.GRU1',DIM,DIM,rconv8.transpose(2,0,3,1)[0][:,:32,:],h0=h0) # (32, 15, 512) 71 | gru = gru1.transpose(0,2,1)[:,:,None,:] #(32, 512, 1, 15) 72 | #project = lib.ops.conv1d("Project.GRU",gru,1,1,4096,512,bias=True,batchnorm=True) 73 | 74 | rdeconv8 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-8",gru,3,2,512,512,bias=True,batchnorm=True)+rconv7[:,:,:,3:67]) # 64 75 | rdeconv7 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-7",rdeconv8,3,2,256,512,bias=True,batchnorm=True)+rconv6[:,:,:,9:137]) #128 76 | rdeconv6 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-6",rdeconv7,3,2,256,256,bias=True,batchnorm=True)+rconv5[:,:,:,21:277]) #256 77 | rdeconv5 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-5",rdeconv6,3,2,256,256,bias=True,batchnorm=True)+rconv4[:,:,:,45:557]) #512 78 | rdeconv4 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-4",rdeconv5,3,2,128,256,bias=True,batchnorm=True)+rconv3[:,:,:,93:1117]) #1024 79 | rdeconv3 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-3",rdeconv4,3,2,128,128,bias=True,batchnorm=True)+rconv2[:,:,:,189:2237]) #2048 80 | rdeconv2 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-2",rdeconv3,3,2,128,128,bias=True,batchnorm=True)+rconv1[:,:,:,381:4477]) #4096 81 | rdeconv1 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-1",rdeconv2,3,2,256,128,bias=True,batchnorm=True)+start[:,:,:,765:8957]) #8192 82 | rdeconv0 = lib.ops.ResNetDeconv1d("ResNet-Dec-0",rdeconv1,3,1,256,256,bias=True,batchnorm=True,act=False) #8192 83 | 84 | # deconv5 = T.nnet.relu(lib.ops.deconv1d("deconv5",gru,3,2,1024,2048,bias=True,batchnorm=True)+conv4[:,:,:,2:33]) # (32, 512, 1, 31) 85 | # deconv4 = T.nnet.relu(lib.ops.deconv1d("deconv4",deconv5,3,2,512,1024,bias=True,batchnorm=True)+conv3[:,:,:,6:69]) # (32, 512, 1, 63) 86 | # deconv3 = T.nnet.relu(lib.ops.deconv1d("deconv3",deconv4,3,2,256,512,bias=True,batchnorm=True)+conv2[:,:,:,14:141]) # (32, 512, 1, 127) 87 | # deconv2 = T.nnet.relu(lib.ops.deconv1d("deconv2",deconv3,3,2,128,256,bias=True,batchnorm=True)+conv1[:,:,:,30:285]) # (32, 512, 1, 255) 88 | # deconv1 = lib.ops.deconv1d("deconv1",deconv2,3,1,256,128,bias=True,batchnorm=True) # (32, 256, 1, 257) 89 | 90 | # output = rdeconv1[:,:,0,:].transpose(0,2,1) 91 | output = rdeconv0[:,:,0,:].transpose(0,2,1) 92 | return (gru[:,:,0,-1],output) 93 | 94 | 95 | print "Model settings:" 96 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 97 | all_vars = sorted(all_vars, key=lambda x: x[0]) 98 | for var_name, var_value in all_vars: 99 | print "\t{}: {}".format(var_name, var_value) 100 | 101 | sequences = T.imatrix('sequences') 102 | h0 = T.fmatrix('h0') 103 | reset = T.iscalar('reset') 104 | 105 | input_sequences = sequences[:,] 106 | target_sequences = sequences[:,768:] 107 | 108 | new_h0, predicted_sequences = network(input_sequences,h0,reset) 109 | cost = T.nnet.categorical_crossentropy( 110 | T.nnet.softmax(predicted_sequences.reshape((-1,Q_LEVELS))), 111 | target_sequences.flatten() 112 | ).mean() 113 | 114 | # By default we report cross-entropy cost in bits. 115 | # Switch to nats by commenting out this line: 116 | cost = cost * lib.floatX(1.44269504089) 117 | 118 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 119 | lib.print_params_info(cost, params) 120 | #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP) 121 | grads = T.grad(cost, wrt=params) 122 | updates = lasagne.updates.adam(grads, params, learning_rate=0.01) 123 | 124 | print "Gradients Computed" 125 | 126 | train_fn = theano.function( 127 | [sequences, h0, reset], 128 | [cost, new_h0,predicted_sequences], 129 | updates=updates, 130 | on_unused_input='warn' 131 | ) 132 | 133 | input_seq = T.imatrix() 134 | test_h0 = T.fmatrix() 135 | test_reset = T.iscalar() 136 | 137 | test_new_h0,test_predict = network(input_seq,test_h0,test_reset) 138 | test_fn = theano.function( 139 | [input_seq, test_h0, test_reset], 140 | [test_new_h0,T.nnet.softmax(test_predict.reshape((-1,Q_LEVELS)))] 141 | ) 142 | 143 | def generate_and_save_samples(tag,seed_h0): 144 | 145 | def write_audio_file(name, data): 146 | 147 | data = data.astype('float32') 148 | data -= data.min() 149 | data /= data.max() 150 | data -= 0.5 151 | data *= 0.95 152 | 153 | import scipy.io.wavfile 154 | scipy.io.wavfile.write(name+'.wav',BITRATE,data) 155 | 156 | # Generate 5 sample files, each 5 seconds long 157 | N_SEQS = 32 158 | LENGTH = 8*BITRATE 159 | LENGTH += LENGTH%31 160 | 161 | samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32') 162 | samples[:, :SEQ_LEN] = Q_ZERO 163 | 164 | #if seed_h0: 165 | # h0 = seed_h0 166 | #else: 167 | # h0 = numpy.zeros((N_SEQS, DIM), dtype='float32') 168 | h0 = seed_h0 169 | frame_level_outputs = None 170 | 171 | for t in xrange(31, LENGTH-31,31): 172 | h0,probs = test_fn(samples[:,t-31:t],h0,0) 173 | probs = probs.reshape((N_SEQS,31,Q_LEVELS)) 174 | samples[:,t:t+31] = np.argmax(probs,axis=2) 175 | print t 176 | 177 | for i in xrange(N_SEQS): 178 | write_audio_file("sample_{}_{}".format(tag, i), samples[i]) 179 | 180 | 181 | #grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 182 | 183 | print "Training!" 184 | total_iters = 0 185 | 186 | for epoch in xrange(NB_EPOCH): 187 | h0 = np.zeros((BATCH_SIZE, DIM)).astype(theano.config.floatX) 188 | costs = [] 189 | times = [] 190 | data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)) 191 | # seqs = data_feeder[20][0] 192 | # reset = data_feeder[20][1] 193 | 194 | for seqs, reset in data_feeder: 195 | # while True: 196 | start_time = time.time() 197 | cost, h0, _ = train_fn(seqs, h0, reset) 198 | total_time = time.time() - start_time 199 | times.append(total_time) 200 | total_iters += 1 201 | print "Batch ",total_iters 202 | costs.append(cost) 203 | print "\tCost: ",np.mean(costs) 204 | print "\tTime: ",np.mean(times) 205 | # if total_iters%500==0: 206 | # generate_and_save_samples('iterno_%d'%total_iters) 207 | -------------------------------------------------------------------------------- /vctk_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import scipy.io.wavfile 3 | import scikits.audiolab 4 | import scipy.signal 5 | import random 6 | import time 7 | import numpy as np 8 | import glob 9 | 10 | 11 | random_seed = 123 12 | 13 | def feed_epoch(speaker_id,BATCH_SIZE, SEQ_LEN, STRIDE, RF=1025, N_FILES=None): 14 | global random_seed 15 | def process_wav(desired_sample_rate, filename, use_ulaw): 16 | channels = scipy.io.wavfile.read(filename) 17 | file_sample_rate, audio = channels 18 | audio = ensure_mono(audio) 19 | audio = wav_to_float(audio) 20 | if use_ulaw: 21 | audio = ulaw(audio) 22 | audio = ensure_sample_rate(desired_sample_rate, file_sample_rate, audio) 23 | audio = float_to_uint8(audio) 24 | return audio 25 | 26 | 27 | def ulaw(x, u=255): 28 | x = np.sign(x) * (np.log(1 + u * np.abs(x)) / np.log(1 + u)) 29 | return x 30 | 31 | 32 | def float_to_uint8(x): 33 | x += 1. 34 | x /= 2. 35 | uint8_max_value = np.iinfo('uint8').max 36 | x *= uint8_max_value 37 | x = x.astype('uint8') 38 | return x 39 | 40 | 41 | def wav_to_float(x): 42 | try: 43 | max_value = np.iinfo(x.dtype).max 44 | min_value = np.iinfo(x.dtype).min 45 | except: 46 | max_value = np.finfo(x.dtype).max 47 | min_value = np.finfo(x.dtype).min 48 | x = x.astype('float32', casting='safe') 49 | x -= min_value 50 | x /= ((max_value - min_value) / 2.) 51 | x -= 1. 52 | return x 53 | 54 | 55 | def ulaw2lin(x, u=255.): 56 | max_value = np.iinfo('uint8').max 57 | min_value = np.iinfo('uint8').min 58 | x = x.astype('float64', casting='safe') 59 | x -= min_value 60 | x /= ((max_value - min_value) / 2.) 61 | x -= 1. 62 | x = np.sign(x) * (1 / u) * (((1 + u) ** np.abs(x)) - 1) 63 | x = float_to_uint8(x) 64 | return x 65 | 66 | def ensure_sample_rate(desired_sample_rate, file_sample_rate, mono_audio): 67 | if file_sample_rate != desired_sample_rate: 68 | mono_audio = scipy.signal.resample_poly(mono_audio, desired_sample_rate, file_sample_rate) 69 | return mono_audio 70 | 71 | 72 | def ensure_mono(raw_audio): 73 | """ 74 | Just use first channel. 75 | """ 76 | if raw_audio.ndim == 2: 77 | raw_audio = raw_audio[:, 0] 78 | return raw_audio 79 | 80 | DATA_PATH = "/tmp/kumarrit/vctk/VCTK-Corpus/wav48/p" + str(speaker_id) + "/*" 81 | paths = glob.glob(DATA_PATH) 82 | if N_FILES: 83 | paths = paths[:N_FILES] 84 | random_seed += 1 85 | batches = [] 86 | for i in xrange(len(paths) / BATCH_SIZE): 87 | batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE]) 88 | random.shuffle(batches) 89 | for batch_paths in batches: 90 | data = [] 91 | for fname in batch_paths: 92 | data.append(process_wav(16000,fname,True)) 93 | max_len = max([len(vec) for vec in data]) 94 | for i in xrange(len(data)): 95 | data[i] = np.hstack((data[i],np.full(max_len-len(data[i]),128,dtype=np.uint8))) 96 | data = np.asarray(data).astype(np.uint8) 97 | for i in xrange(0,data.shape[1]-RF-STRIDE,STRIDE): 98 | start = i 99 | end = i+RF+STRIDE 100 | subbatch = data[:, start : end] 101 | yield (subbatch,reset) 102 | -------------------------------------------------------------------------------- /vctk_dataset.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/vctk_dataset.pyc -------------------------------------------------------------------------------- /wavenet.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.setrecursionlimit(10000) 3 | import numpy as np 4 | import numpy 5 | numpy.random.seed(123) 6 | import random 7 | random.seed(123) 8 | import dataset 9 | import theano 10 | import theano.tensor as T 11 | theano.config.floatX='float32' 12 | import lib.ops 13 | import scipy.io.wavfile 14 | import time 15 | import lasagne 16 | import vctk_dataset 17 | import tqdm 18 | from tqdm import tqdm 19 | import new_dataset 20 | 21 | # Hyperparams 22 | NB_EPOCH=200 23 | BATCH_SIZE = 8 24 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 25 | DATA_PATH = '/data/lisatmp3/kumarrit/blizzard' 26 | N_FILES = 8192 27 | BITRATE = 16000 28 | GRAD_CLIP=1 29 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 30 | Q_TYPE='linear' 31 | 32 | N_BLOCKS=5 33 | DILATION_DEPTH=10 34 | RF=N_BLOCKS*(2**(DILATION_DEPTH))-N_BLOCKS+2 35 | n_filters=64 36 | 37 | #FRAME_SIZE = RF # How many samples per frame 38 | #SEQ_LEN=2*RF 39 | OVERLAP=RF 40 | SEQ_LEN=1600 41 | 42 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)) 43 | 44 | def network(input_sequences): 45 | batch_size = input_sequences.shape[0] 46 | length = input_sequences.shape[1] 47 | dilations = np.asarray([[2**i for i in xrange(DILATION_DEPTH)]*N_BLOCKS]).tolist()[0] 48 | #skip_weights = lib.param("scaling_weights", numpy.ones(len(dilations)).astype('float32')) 49 | 50 | #start = T.extra_ops.to_one_hot(input_sequences.flatten(),nb_class=256).reshape((batch_size,length,256)).transpose(0,2,1)[:,:,None,:] 51 | start = (input_sequences.astype('float32')/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5))[:,None,:,None] 52 | conv1 = lib.ops.conv1d("causal-conv",start,2,1,n_filters,1,bias=True,batchnorm=False,pad=(1,0))[:,:,:length,:] 53 | prev_conv = conv1 54 | #prev_skip = [] 55 | prev_skip = T.zeros((batch_size,n_filters,length,1)) 56 | for i,value in enumerate(dilations): 57 | prev_conv,y = lib.ops.WaveNetConv1d("Block-%d"%(i+1),prev_conv,2,n_filters,n_filters,bias=False,batchnorm=False,dilation=value) 58 | #prev_skip += y*skip_weights[i] 59 | prev_skip += y 60 | #prev_skip += [y] 61 | 62 | #out = T.nnet.relu(T.sum(prev_skip,axis=0)) 63 | out = T.nnet.relu(prev_skip) 64 | #out = prev_skip 65 | out = T.nnet.relu(lib.ops.conv1d("Output.1",out,1,1,n_filters,n_filters,bias=True,batchnorm=False)) 66 | out = T.nnet.relu(lib.ops.conv1d("Output.2",out,1,1,n_filters,n_filters,bias=True,batchnorm=False)) 67 | out = T.nnet.relu(lib.ops.conv1d("Output.3",out,1,1,n_filters,n_filters,bias=True,batchnorm=False)) 68 | 69 | out = lib.ops.conv1d("Output.4",out,1,1,256,n_filters,bias=True,batchnorm=False) 70 | 71 | return out[:,:,RF-1:,0].transpose(0,2,1).reshape((-1,Q_LEVELS)) 72 | 73 | print "Model settings:" 74 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 75 | all_vars = sorted(all_vars, key=lambda x: x[0]) 76 | for var_name, var_value in all_vars: 77 | print "\t{}: {}".format(var_name, var_value) 78 | 79 | sequences = T.imatrix('sequences') 80 | input_sequences = sequences[:,:-1] 81 | target_sequences = sequences[:,RF:] 82 | 83 | predicted_sequences = T.nnet.softmax(network(input_sequences)) 84 | 85 | #lib.load_params('iter_latest_wavenet.p') 86 | cost = T.nnet.categorical_crossentropy( 87 | predicted_sequences, 88 | target_sequences.flatten() 89 | ).mean() 90 | 91 | # By default we report cross-entropy cost in bits. 92 | # Switch to nats by commenting out this line: 93 | cost = cost * lib.floatX(1.44269504089) 94 | 95 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 96 | lib.print_params_info(cost, params) 97 | #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP) 98 | grads = T.grad(cost, wrt=params) 99 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 100 | 101 | lr = T.fscalar() 102 | updates = lasagne.updates.adam(grads, params, learning_rate=lr) 103 | 104 | print "Gradients Computed" 105 | 106 | train_fn = theano.function( 107 | [sequences,lr], 108 | [cost], 109 | updates=updates, 110 | on_unused_input='warn' 111 | ) 112 | 113 | print "Compiled Train Function" 114 | 115 | test_fn = theano.function( 116 | [sequences], 117 | [cost], 118 | on_unused_input='warn' 119 | ) 120 | 121 | print "Compiled Test Function" 122 | 123 | generate_fn = theano.function( 124 | [sequences], 125 | [lib.ops.softmax_and_sample(network(sequences))], 126 | on_unused_input='warn' 127 | ) 128 | 129 | print "Compiled Generate Function" 130 | 131 | def generate(generate_fn): 132 | tag = 'test_iter' 133 | N_SEQS = 8 134 | LENGTH = 3*BITRATE 135 | samples = numpy.full((N_SEQS, LENGTH), fill_value = Q_ZERO, dtype=np.uint8) 136 | 137 | def write_audio_file(name, data): 138 | data = data.astype('float32') 139 | data -= data.min() 140 | data /= data.max() 141 | data -= 0.5 142 | data *= 0.95 143 | import scipy.io.wavfile 144 | scipy.io.wavfile.write(name+'.wav',BITRATE,data) 145 | 146 | #data = data_feeder.next() 147 | #data_feeder = list(dataset.blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES, True, 34965)) 148 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)) 149 | #data_feeder = list(vctk_dataset.feed_epoch(225, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES)) 150 | print "File loaded" 151 | #data = data[0][:] 152 | #samples[:, :RF] = data[:N_SEQS,:RF] 153 | 154 | for t in xrange(RF, LENGTH): 155 | samples[:,t] = generate_fn(samples[:,t-RF:t])[0] 156 | #samples[:,t] = probs.flatten() 157 | print t, samples[:,t] 158 | 159 | for i in xrange(N_SEQS): 160 | write_audio_file("sample_{}_{}".format(tag, i), samples[i][RF:]) 161 | 162 | print "Training!" 163 | for epoch in xrange(1,NB_EPOCH): 164 | costs = [] 165 | times = [] 166 | #data_feeder = dataset.blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES) 167 | #data_feeder = vctk_dataset.feed_epoch(225, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES) 168 | data_feeder = new_dataset.blizz_train_feed_epoch(BATCH_SIZE,SEQ_LEN,OVERLAP,Q_LEVELS,Q_ZERO,Q_TYPE) 169 | print "Epoch : ",epoch 170 | total_iters = 0 171 | for seqs,reset,mask in tqdm(data_feeder): 172 | total_iters += 1 173 | start_time = time.time() 174 | cost,pred = train_fn(seqs,0.001) 175 | total_time = time.time() - start_time 176 | costs.append(cost) 177 | times.append(total_time) 178 | if total_iters%1000==0: 179 | print "\tCost : ", np.mean(costs) 180 | print "\tTime : ", np.mean(times) 181 | 182 | print "\tCost : ", np.mean(costs) 183 | print "\tTime : ", np.mean(times) 184 | #if epoch%50==0: 185 | # generate() 186 | -------------------------------------------------------------------------------- /wavenet_controller.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, print_function 2 | import os 3 | import sys 4 | import time 5 | 6 | import numpy 7 | 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) 9 | from platoon.channel import Controller 10 | 11 | 12 | class WaveNetController(Controller): 13 | def __init__(self, max_mb,saveFreq,default_args): 14 | """ 15 | Initialize the WaveNetController 16 | Parameters 17 | ---------- 18 | max_mb : int 19 | Max number of minibatches to train on. 20 | patience: : int 21 | Training stops when this many minibatches have been trained on 22 | without any reported improvement. 23 | valid_freq : int 24 | Number of minibatches to train on between every monitoring step. 25 | default_args : dict 26 | Arguments of default class Controller 27 | """ 28 | 29 | super(WaveNetController, self).__init__(**default_args) 30 | self.max_mb = int(max_mb) 31 | 32 | self.uidx = 0 33 | self.eidx = 0 34 | self.saveFreq = saveFreq 35 | 36 | self._save_params = False 37 | self.start_time = None 38 | self._should_stop = False 39 | 40 | def handle_control(self, req, worker_id, req_info): 41 | """ 42 | Handles a control_request received from a worker 43 | Parameters 44 | ---------- 45 | req : str or dict 46 | Control request received from a worker. 47 | The control request can be one of the following 48 | 1) "next" : request by a worker to be informed of its next action 49 | to perform. The answers from the server can be 'train' (the 50 | worker should keep training on its training data), 'valid' (the 51 | worker should perform monitoring on its validation set and test 52 | set) or 'stop' (the worker should stop training). 53 | 2) dict of format {"done":N} : used by a worker to inform the 54 | server that is has performed N more training iterations and 55 | synced its parameters. The server will respond 'stop' if the 56 | maximum number of training minibatches has been reached. 57 | 3) dict of format {"valid_err":x, "test_err":x2} : used by a worker 58 | to inform the server that it has performed a monitoring step 59 | and obtained the included errors on the monitoring datasets. 60 | The server will respond "best" if this is the best reported 61 | validation error so far, otherwise it will respond 'stop' if 62 | the patience has been exceeded. 63 | """ 64 | control_response = "" 65 | 66 | if req == 'next': 67 | if not self._should_stop: 68 | if self.start_time is None: 69 | self.start_time = time.time() 70 | if self._save_params: 71 | control_response = 'save' 72 | else: 73 | control_response = 'train' 74 | else: 75 | control_response = 'stop' 76 | elif req == 'done': 77 | self.uidx += req_info['train_len'] 78 | if self.uidx%self.saveFreq==0: 79 | self._save_params=True 80 | 81 | elif req == 'saved': 82 | self._save_params=False 83 | 84 | if self.uidx > self.max_mb: 85 | if not self._should_stop: 86 | print("Training time {:.4f}s".format(time.time() - self.start_time)) 87 | print("Number of samples:", self.uidx) 88 | ##NEVER STOPPING! 89 | self._should_stop = False 90 | 91 | return control_response 92 | 93 | 94 | def wavenet_control(saveFreq=1110, saveto=None): 95 | parser = Controller.default_parser() 96 | parser.add_argument('--max-mb', default=((5000 * 1998) / 10), type=int, 97 | required=False, help='Maximum mini-batches to train upon in total.') 98 | 99 | args = parser.parse_args() 100 | 101 | l = WaveNetController(max_mb=10000,saveFreq=1000, 102 | default_args=Controller.default_arguments(args)) 103 | 104 | print("Controller is ready") 105 | return l.serve() 106 | 107 | if __name__ == '__main__': 108 | rcode = wavenet_control() 109 | if rcode != 0: 110 | sys.exit(rcode) 111 | -------------------------------------------------------------------------------- /wavenet_worker.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.setrecursionlimit(10000) 3 | import numpy as np 4 | import numpy 5 | numpy.random.seed(123) 6 | import random 7 | random.seed(123) 8 | import dataset 9 | import theano 10 | import theano.tensor as T 11 | theano.config.floatX='float32' 12 | import lib.ops 13 | import scipy.io.wavfile 14 | import time 15 | import lasagne 16 | from six import iteritems 17 | from platoon.channel import Worker 18 | from platoon.param_sync import EASGD 19 | import argparse 20 | import pickle 21 | import new_dataset 22 | from model import network 23 | 24 | worker = None 25 | # Hyperparams 26 | NB_EPOCH=200 27 | BATCH_SIZE = 8 28 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization 29 | DATA_PATH = '/data/lisatmp3/kumarrit/blizzard' 30 | N_FILES = 8192 31 | BITRATE = 16000 32 | GRAD_CLIP=1 33 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 34 | Q_TYPE='linear' 35 | 36 | N_BLOCKS=5 37 | DILATION_DEPTH=10 38 | RF=N_BLOCKS*(2**(DILATION_DEPTH))-N_BLOCKS+2 39 | n_filters=64 40 | 41 | #FRAME_SIZE = RF # How many samples per frame 42 | #SEQ_LEN=2*RF 43 | OVERLAP=RF 44 | SEQ_LEN=1600 45 | 46 | N_GPUS=4 47 | alpha = 1./N_GPUS 48 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO)) 49 | 50 | def floatX(arr): 51 | return np.asarray(arr, dtype=theano.config.floatX) 52 | 53 | def adam(lr, tparams, grads, sequences, cost, epsilon=1e-8,beta1=0.9,beta2=0.999): 54 | 55 | zipped_grads = [lib.param('%s_grad' % k,p.get_value() * floatX(0.)) 56 | for k, p in iteritems(tparams)] 57 | running_grads = [lib.param('%s_rgrad' % k,p.get_value() * floatX(0.)) 58 | for k, p in iteritems(tparams)] 59 | running_grads2 = [lib.param('%s_rgrad2' % k,p.get_value() * floatX(0.)) 60 | for k, p in iteritems(tparams)] 61 | 62 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 63 | rgup = [(rg, beta1 * rg + (1-beta1) * g) for rg, g in zip(running_grads, grads)] 64 | rg2up = [(rg2, beta2 * rg2 + (1-beta2) * (g ** 2)) for rg2, g in zip(running_grads2, grads)] 65 | 66 | t_prev = lib.param('t_prev',floatX(0.)) 67 | one = T.constant(1) 68 | t = t_prev+1 69 | a_t = lr*T.sqrt(1-beta2**t)/(1-beta1**t) 70 | 71 | f_grad_shared = theano.function([sequences], cost, 72 | updates=zgup + rgup + rg2up, 73 | name='adam_f_grad_shared') 74 | 75 | updir = [lib.param('%s_updir' % k,p.get_value() * floatX(0.)) 76 | for k, p in iteritems(tparams)] 77 | 78 | updir_new = [(ud, a_t * rg / T.sqrt(rg2 + epsilon)) 79 | for ud, rg, rg2 in zip(updir, running_grads, running_grads2)] 80 | param_up = [(p, p - udn[1]) 81 | for p, udn in zip(tparams.values(), updir_new)] 82 | f_update = theano.function([lr], [], updates=updir_new + param_up + [(t_prev,t)] , 83 | on_unused_input='ignore', 84 | name='adam_f_update') 85 | 86 | return f_grad_shared, f_update 87 | 88 | print "Model settings:" 89 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')] 90 | all_vars = sorted(all_vars, key=lambda x: x[0]) 91 | for var_name, var_value in all_vars: 92 | print "\t{}: {}".format(var_name, var_value) 93 | 94 | def build_model(worker,train_len=100,param_sync_api=True): 95 | sequences = T.imatrix('sequences') 96 | input_sequences = sequences[:,:-1] 97 | target_sequences = sequences[:,RF:] 98 | 99 | # def network_my(input_sequences): 100 | # batch_size = input_sequences.shape[0] 101 | # length = input_sequences.shape[1] 102 | # dilations = np.asarray([[2**i for i in xrange(DILATION_DEPTH)]*N_BLOCKS]).tolist()[0] 103 | # #skip_weights = lib.param("scaling_weights", numpy.ones(len(dilations)).astype('float32')) 104 | # 105 | # #start = T.extra_ops.to_one_hot(input_sequences.flatten(),nb_class=256).reshape((batch_size,length,256)).transpose(0,2,1)[:,:,None,:] 106 | # start = (input_sequences.astype('float32')/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5))[:,None,:,None] 107 | # conv1 = lib.ops.conv1d("causal-conv",start,2,1,n_filters,1,bias=True,batchnorm=False,pad=(1,0))[:,:,:length,:] 108 | # prev_conv = conv1 109 | # #prev_skip = [] 110 | # prev_skip = T.zeros((batch_size,n_filters,length,1)) 111 | # for i,value in enumerate(dilations): 112 | # prev_conv,y = lib.ops.WaveNetConv1d("Block-%d"%(i+1),prev_conv,2,n_filters,n_filters,bias=False,batchnorm=False,dilation=value) 113 | # #prev_skip += y*skip_weights[i] 114 | # prev_skip += y 115 | # #prev_skip += [y] 116 | # 117 | # #out = T.nnet.relu(T.sum(prev_skip,axis=0)) 118 | # out = T.nnet.relu(prev_skip) 119 | # #out = prev_skip 120 | # out = T.nnet.relu(lib.ops.conv1d("Output.1",out,1,1,n_filters,n_filters,bias=True,batchnorm=False)) 121 | # out = T.nnet.relu(lib.ops.conv1d("Output.2",out,1,1,n_filters,n_filters,bias=True,batchnorm=False)) 122 | # out = T.nnet.relu(lib.ops.conv1d("Output.3",out,1,1,n_filters,n_filters,bias=True,batchnorm=False)) 123 | # 124 | # out = lib.ops.conv1d("Output.4",out,1,1,256,n_filters,bias=True,batchnorm=False) 125 | # 126 | # return out[:,:,RF-1:,0].transpose(0,2,1).reshape((-1,Q_LEVELS)) 127 | 128 | predicted_sequences = T.nnet.softmax(network(input_sequences)) 129 | #lib.load_params('iter_latest_wavenet.p') 130 | cost = T.nnet.categorical_crossentropy( 131 | predicted_sequences, 132 | target_sequences.flatten() 133 | ).mean() 134 | 135 | cost = cost * lib.floatX(1.44269504089) 136 | 137 | 138 | params = lib.search(cost, lambda x: hasattr(x, 'param')) 139 | tparams = {p.name:p for p in params} 140 | 141 | copy_params = lambda tparams: {x:theano.shared(y.get_value(),name=x) for x,y in tparams.iteritems()} 142 | 143 | lib.print_params_info(cost, params) 144 | #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP) 145 | 146 | list_tparams = list(tparams.values()) 147 | 148 | if param_sync_api: 149 | worker.init_shared_params(list_tparams, param_sync_rule=EASGD(alpha)) 150 | else: 151 | from platoon.training import global_dynamics as gd 152 | cparams = copy_params(tparams) 153 | list_cparams = list(cparams.values()) 154 | easgd = gd.EASGD(worker) 155 | easgd.make_rule(list_tparams, list_cparams, alpha) 156 | 157 | 158 | grads = T.grad(cost, wrt=list_tparams) 159 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 160 | 161 | lr = T.fscalar() 162 | 163 | f_grad_shared,f_update = adam(lr,tparams,grads,sequences,cost) 164 | 165 | def save_params(_params,path): 166 | param_vals = {} 167 | for name, param in _params.iteritems(): 168 | param_vals[name] = param.get_value() 169 | 170 | with open(path, 'wb') as f: 171 | pickle.dump(param_vals, f) 172 | 173 | if param_sync_api: 174 | worker.copy_to_local() 175 | 176 | costs = [] 177 | #data_feeder = dataset.blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES, True,worker.global_rank()) 178 | new_dataset.random_seed = worker._worker_id 179 | data_feeder = new_dataset.blizz_train_feed_epoch(BATCH_SIZE,SEQ_LEN,OVERLAP,Q_LEVELS,Q_ZERO,Q_TYPE) 180 | 181 | iter_count=0 182 | while True: 183 | step = worker.send_req('next') 184 | 185 | if step == 'train': 186 | for i in range(train_len): 187 | try: 188 | seqs,reset,mask = next(data_feeder) 189 | except StopIteration: 190 | #data_feeder = dataset.blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES, True,worker.global_rank()) 191 | new_dataset.random_seed = worker._worker_id 192 | data_feeder = new_dataset.blizz_train_feed_epoch(BATCH_SIZE,SEQ_LEN,OVERLAP,Q_LEVELS,Q_ZERO,Q_TYPE) 193 | seqs,reset,mask = next(data_feeder) 194 | print('Train cost:', np.mean(costs)) 195 | costs = [] 196 | 197 | costs.append(f_grad_shared(seqs)) 198 | f_update(0.001) 199 | iter_count += 1 200 | 201 | step = worker.send_req('done', {'train_len': train_len}) 202 | if iter_count%5000==0: 203 | print('Train cost:',np.mean(costs)) 204 | 205 | if param_sync_api: 206 | #print("Syncing with global params") 207 | worker.sync_params(synchronous=True) 208 | else: 209 | easgd() 210 | 211 | if step=='save': 212 | if param_sync_api: 213 | save_params(tparams,"worker_%d.p"%worker._worker_id) 214 | step = worker.send_req('saved') 215 | print('Saving now') 216 | else: 217 | save_params(cparams,"worker%d.p"%worker._worker_id)) 218 | step = worker.send_req('saved') 219 | 220 | if step == 'stop': 221 | break 222 | 223 | # Release all shared resources. 224 | worker.close() 225 | 226 | if __name__ == '__main__': 227 | # See function train for all possible parameter and there definition. 228 | global worker 229 | parser = Worker.default_parser() 230 | parser.add_argument('--valid_sync', dest='valid_sync', action='store_true', default=False) 231 | parser.add_argument('--param-sync-api', action='store_true', default=True) 232 | #SEED = 123 233 | #lib.random_seed = SEED+worker._worker_id 234 | args = parser.parse_args() 235 | worker = Worker(**Worker.default_arguments(args)) 236 | 237 | build_model(worker,train_len=10,param_sync_api=args.param_sync_api) 238 | --------------------------------------------------------------------------------