├── README.md
├── dataset.py
├── dataset.pyc
├── freqdomain.py
├── lib
    ├── __init__.py
    ├── __init__.pyc
    ├── ops.py
    ├── ops.pyc
    ├── optimizers.py
    └── optimizers.pyc
├── my_three_tier.py
├── temp.py
├── train.py
├── vctk_dataset.py
├── vctk_dataset.pyc
├── wavenet.py
├── wavenet_controller.py
└── wavenet_worker.py


/README.md:
--------------------------------------------------------------------------------
 1 | # WaveNet implementation in Theano
 2 | Based on https://deepmind.com/blog/wavenet-generative-model-raw-audio/ and https://arxiv.org/pdf/1609.03499.pdf.
 3 | 
 4 | Disclaimer: this is a re-implementation of the model described in the WaveNet paper by Google DeepMind. This repository is not associated with Google DeepMind.
 5 | 
 6 | [Listen to a sample 🎶!](https://soundcloud.com/rithesh-kumar-772989650/sets/wavenet-samples)
 7 | 
 8 | - wavenet.py -> Train the network
 9 | - lib/ops.py -> Mini theano library
10 | - generate() function in wavenet.py ==> generates samples
11 | 


--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import scipy.io.wavfile
  3 | import scikits.audiolab
  4 | 
  5 | import random
  6 | import time
  7 | import numpy as np
  8 | 
  9 | 
 10 | random_seed = 123
 11 | 
 12 | def feed_epoch(data_path, n_files, BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO,RF=1024):
 13 |     global random_seed
 14 |     """
 15 |     Generator that yields training inputs (subbatch, reset). `subbatch` contains
 16 |     quantized audio data; `reset` is a boolean indicating the start of a new
 17 |     sequence (i.e. you should reset h0 whenever `reset` is True).
 18 |     Feeds subsequences which overlap by a specified amount, so that the model
 19 |     can always have target for every input in a given subsequence.
 20 |     Loads sequentially-named FLAC files in a directory
 21 |     (p0.flac, p1.flac, p2.flac, ..., p[n_files-1].flac)
 22 |     Assumes all flac files have the same length.
 23 |     data_path: directory containing the flac files
 24 |     n_files: how many FLAC files are in the directory
 25 |     (see two_tier.py for a description of the constants)
 26 |     returns: (subbatch, reset)
 27 |     subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP)
 28 |     reset: True or False
 29 |     """
 30 | 
 31 |     def round_to(x, y):
 32 |         """round x up to the nearest y"""
 33 |         return int(numpy.ceil(x / float(y))) * y
 34 | 
 35 |     def mewlaw_quantize(data):
 36 |     	final_data = []
 37 |     	for i in xrange(data.shape[0]):
 38 |     	    final_data.append(float_to_uint8(ulaw(wav_to_float(data[i]))))
 39 |             return np.asarray(final_data,dtype=np.uint8)
 40 | 
 41 |     def ulaw(x, u=255):
 42 |         x = np.sign(x) * (np.log(1 + u * np.abs(x)) / np.log(1 + u))
 43 |         return x
 44 | 
 45 |     def invulaw(y,u=255):
 46 |         y = np.sign(y)*(1./u)*(np.power(1+u,np.abs(y))-1)
 47 |         return y
 48 | 
 49 |     def float_to_uint8(x):
 50 |         x += 1.
 51 |         x /= 2.
 52 |         uint8_max_value = np.iinfo('uint8').max
 53 |         x *= uint8_max_value
 54 |         x = x.astype('uint8')
 55 |         return x
 56 | 
 57 | 
 58 |     def wav_to_float(x):
 59 |         try:
 60 |             max_value = np.iinfo(x.dtype).max
 61 |             min_value = np.iinfo(x.dtype).min
 62 |         except:
 63 |             max_value = np.finfo(x.dtype).max
 64 |             min_value = np.finfo(x.dtype).min
 65 |         x = x.astype('float64', casting='safe')
 66 |         x -= min_value
 67 |         x /= ((max_value - min_value) / 2.)
 68 |         x -= 1.
 69 |         return x
 70 | 
 71 |     def batch_quantize(data):
 72 |         """
 73 |         floats in (-1, 1) to ints in [0, Q_LEVELS-1]
 74 |         scales normalized across axis 1
 75 |         """
 76 |         eps = numpy.float64(1e-5)
 77 |         companded = np.sign(data)*(np.log(1+255*np.abs(data))/np.log(256))
 78 |         data = companded
 79 | 
 80 |         data -= data.min(axis=1)[:, None]
 81 | 
 82 |         data *= ((Q_LEVELS - eps) / data.max(axis=1)[:, None])
 83 |         data += eps/2
 84 |         # print "WARNING using zero-dc-offset normalization"
 85 |         # data -= data.mean(axis=1)[:, None]
 86 |         # data *= (((Q_LEVELS/2.) - eps) / numpy.abs(data).max(axis=1)[:, None])
 87 |         # data += Q_LEVELS/2
 88 | 
 89 |         data = data.astype('uint8')
 90 | 
 91 |         return data
 92 | 
 93 |     start=100
 94 |     paths = [data_path+'/p{}.flac'.format(start+i) for i in xrange(n_files)]
 95 |     #rand_idx = np.random.randint(0,141867,n_files)
 96 |     #paths = [data_path+'/p{}.flac'.format(i) for i in rand_idx]
 97 | 
 98 |     random.seed(random_seed)
 99 |     random.shuffle(paths)
100 |     random_seed += 1
101 | 
102 |     batches = []
103 |     for i in xrange(len(paths) / BATCH_SIZE):
104 |         batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
105 | 
106 |     random.shuffle(batches)
107 | 
108 |     for batch_paths in batches:
109 |         # batch_seq_len = length of longest sequence in the batch, rounded up to
110 |         # the nearest SEQ_LEN.
111 |         batch_seq_len = len(scikits.audiolab.flacread(batch_paths[0])[0])
112 |         batch_seq_len = round_to(batch_seq_len, SEQ_LEN)
113 | 
114 |         batch = numpy.zeros(
115 |             (BATCH_SIZE, batch_seq_len),
116 |             dtype='float64'
117 |         )
118 | 
119 |         for i, path in enumerate(batch_paths):
120 |             data, fs, enc = scikits.audiolab.flacread(path)
121 |             batch[i, :len(data)] = data
122 | 
123 |         if Q_LEVELS != None:
124 |             batch = batch_quantize(batch)
125 | 
126 |             batch = numpy.concatenate([
127 |                 numpy.full((BATCH_SIZE, OVERLAP), Q_ZERO, dtype=np.uint8),
128 |                 batch
129 |             ], axis=1)
130 |         else:
131 |             batch = numpy.concatenate([
132 |                 numpy.full((BATCH_SIZE, OVERLAP), 0, dtype='float32'),
133 |                 batch
134 |             ], axis=1)
135 |             batch = batch.astype('float32')
136 | 
137 |             batch -= batch.mean()
138 |             batch /= batch.std()
139 | 
140 |         for i in xrange(0,batch.shape[1]-RF-OVERLAP,OVERLAP):
141 |             reset = numpy.int32(i==0)
142 |             start = i
143 |             end = i+RF+OVERLAP
144 |             subbatch = batch[:, start : end]
145 |             yield (subbatch, reset)
146 | 
147 | def blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, STRIDE, RF=1025, N_FILES=None, DISTRIBUTED=False,WORKER_ID=None):
148 |     global random_seed
149 |     def process_wav(desired_sample_rate, filename, use_ulaw):
150 |         channels = scipy.io.wavfile.read(filename)
151 |         file_sample_rate, audio = channels
152 |         audio = ensure_mono(audio)
153 |         audio = wav_to_float(audio)
154 |         if use_ulaw:
155 |             audio = ulaw(audio)
156 |         audio = ensure_sample_rate(desired_sample_rate, file_sample_rate, audio)
157 |         audio = float_to_uint8(audio)
158 |         return audio
159 | 
160 |     def process_flac(desired_sample_rate, filename, use_ulaw):
161 |         channels = scikits.audiolab.flacread(filename)
162 |         file_sample_rate = channels[1]
163 |         audio = channels[0]
164 |         audio = ensure_mono(audio)
165 |         #audio = wav_to_float(audio)
166 |         if use_ulaw:
167 |             audio = ulaw(audio)
168 |         audio = ensure_sample_rate(desired_sample_rate, file_sample_rate, audio)
169 |         audio = float_to_uint8(audio)
170 |         return audio
171 | 
172 |     def ulaw(x, u=255):
173 |         x = np.sign(x) * (np.log(1 + u * np.abs(x)) / np.log(1 + u))
174 |         return x
175 | 
176 | 
177 |     def float_to_uint8(x):
178 |         x += 1.
179 |         x /= 2.
180 |         uint8_max_value = np.iinfo('uint8').max
181 |         x *= uint8_max_value
182 |         x = x.astype('uint8')
183 |         return x
184 | 
185 | 
186 |     def wav_to_float(x):
187 |         try:
188 |             max_value = np.iinfo(x.dtype).max
189 |             min_value = np.iinfo(x.dtype).min
190 |         except:
191 |             max_value = np.finfo(x.dtype).max
192 |             min_value = np.finfo(x.dtype).min
193 |         x = x.astype('float64', casting='safe')
194 |         x -= min_value
195 |         x /= ((max_value - min_value) / 2.)
196 |         x -= 1.
197 |         return x
198 | 
199 | 
200 |     def ulaw2lin(x, u=255.):
201 |         max_value = np.iinfo('uint8').max
202 |         min_value = np.iinfo('uint8').min
203 |         x = x.astype('float64', casting='safe')
204 |         x -= min_value
205 |         x /= ((max_value - min_value) / 2.)
206 |         x -= 1.
207 |         x = np.sign(x) * (1 / u) * (((1 + u) ** np.abs(x)) - 1)
208 |         x = float_to_uint8(x)
209 |         return x
210 | 
211 |     def ensure_sample_rate(desired_sample_rate, file_sample_rate, mono_audio):
212 |         if file_sample_rate != desired_sample_rate:
213 |             mono_audio = scipy.signal.resample_poly(mono_audio, desired_sample_rate, file_sample_rate)
214 |         return mono_audio
215 | 
216 | 
217 |     def ensure_mono(raw_audio):
218 |         """
219 |         Just use first channel.
220 |         """
221 |         if raw_audio.ndim == 2:
222 |             raw_audio = raw_audio[:, 0]
223 |         return raw_audio
224 | 
225 |     start=100
226 |     DATA_PATH = "/data/lisatmp3/kumarrit/blizzard/"
227 |     if DISTRIBUTED:
228 |         random.seed(WORKER_ID)
229 |         start = random.choice(xrange(120000))
230 |     paths = ['p%d.flac'%(start+i) for i in xrange(N_FILES)]
231 |     random_seed += 1
232 |     batches = []
233 | 
234 |     for i in xrange(len(paths) / BATCH_SIZE):
235 |         batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
236 | 
237 |     random.seed(random_seed)
238 |     random.shuffle(batches)
239 |     for batch_paths in batches:
240 |         data = []
241 |         for fname in batch_paths:
242 |             data.append(process_flac(16000,DATA_PATH+fname,True))
243 |         max_len = max([len(vec) for vec in data])
244 |         for i in xrange(len(data)):
245 |             data[i] = np.hstack((data[i],np.full(max_len-len(data[i]),128,dtype=np.uint8)))
246 |         data = np.asarray(data).astype(np.uint8)
247 |         for i in xrange(0,data.shape[1]-RF-STRIDE,STRIDE):
248 |             start = i
249 |             end = i+RF+STRIDE
250 |             subbatch = data[:, start : end]
251 |             yield (subbatch,start)
252 | 


--------------------------------------------------------------------------------
/dataset.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/dataset.pyc


--------------------------------------------------------------------------------
/freqdomain.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | sys.setrecursionlimit(10000)
  3 | import numpy as np
  4 | import numpy
  5 | numpy.random.seed(123)
  6 | import random
  7 | random.seed(123)
  8 | import dataset
  9 | import theano
 10 | import theano.tensor as T
 11 | theano.config.floatX='float32'
 12 | import lib.ops
 13 | import scipy.io.wavfile
 14 | import time
 15 | import lasagne
 16 | import theano.tensor.fft
 17 | 
 18 | # Hyperparams
 19 | NB_EPOCH=100  # number of passes of the training loop at the bottom of this file
 20 | BATCH_SIZE = 8
 21 | FRAME_SIZE = 0 # How many samples per frame
 22 | Q_LEVELS = None # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
 23 | #DATA_PATH = '/data/lisatmp3/kumarrit/blizzard'
 24 | DATA_PATH='/home/rithesh/DeepLearning/Vocal Synthesis/data'  # reassigned just before the training loop
 25 | N_FILES = 8
 26 | BITRATE = 16000
 27 | 
 28 | Q_ZERO = None # Discrete value corresponding to zero amplitude
 29 | N_BLOCKS=1  # how many times network() repeats its [1,2,4,8,16] dilation list
 30 | RF=N_BLOCKS*32-N_BLOCKS+2  # number of leading samples of each row used as network input
 31 | SEQ_LEN=2*RF
 32 | n_filters=256  # lower-case, so it is not listed by the "Model settings" printout
 33 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO))
 34 | 
 35 | def network(input_sequences):  # builds the dilated-conv graph and returns a (batch_size, 17, 2) tensor
 36 |     batch_size = input_sequences.shape[0]
 37 |     length = input_sequences.shape[1]
 38 |     inp = input_sequences[:,None,None,:]  # insert two singleton axes after the batch axis
 39 |     dilations = np.asarray([[1,2,4,8,16]*N_BLOCKS]).tolist()[0]  # [1,2,4,8,16] repeated N_BLOCKS times, as a plain list
 40 |     conv1 = lib.ops.conv1d("causal-conv",inp,2,1,n_filters,1,bias=False,batchnorm=False,pad=(0,1))[:,:,:,:length]  # cropped back to `length` on the last axis
 41 |     prev_conv = conv1
 42 |     #prev_skip = []
 43 |     prev_skip = T.zeros_like(conv1)  # running sum of the second output of every block
 44 |     i=0
 45 |     for value in dilations:
 46 |         i+=1  # 1-based block index, used only in the layer name
 47 |         x,y = lib.ops.WaveNetConv1d("Block-%d"%i,prev_conv,2,n_filters,n_filters,bias=False,batchnorm=False,dilation=value)
 48 |         prev_conv = x  # first output feeds the next block
 49 |         prev_skip += y  # second output is accumulated
 50 |     out = T.nnet.relu(prev_skip)
 51 |     out2 = T.nnet.relu(lib.ops.conv1d("Output.1",out,1,1,n_filters,n_filters,bias=False,batchnorm=False))
 52 |     output = lib.ops.conv1d("Output.2",out2,1,1,34,n_filters,bias=False,batchnorm=False)
 53 |     # NOTE(review): result/result2/result3 are built (registering the Op.1/Op.2 params) but never returned - confirm whether result3 was meant to be the output.
 54 |     result = output[:,:,0,-1]
 55 |     result2 = T.nnet.relu(lib.ops.Dense('Op.1',34,512,result,weightnorm=False))
 56 |     result3 = lib.ops.Dense('Op.2',512,34,result2,weightnorm=False)
 57 |     return output[:,:,0,-1].reshape((batch_size,17,2))  # last position on axis 3; 34 values per example viewed as 17 x 2
 58 | 
 59 | print "Model settings:"
 60 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]  # every all-upper-case module-level name except the theano alias T
 61 | all_vars = sorted(all_vars, key=lambda x: x[0])
 62 | for var_name, var_value in all_vars:
 63 |     print "\t{}: {}".format(var_name, var_value)
 64 | # Symbolic graph: each row of `sequences` is split into its first RF samples (input) and the rest (target).
 65 | sequences   = T.fmatrix('sequences')
 66 | input_sequences = sequences[:,:RF]
 67 | target_sequences = sequences[:,RF:]
 68 | # Cost: mean squared difference between the network output and the rfft of the target slice.
 69 | pred_freq = network(input_sequences)
 70 | target_freq = theano.tensor.fft.rfft(target_sequences)
 71 | cost = T.sqr(pred_freq-target_freq).mean()
 72 | #lib.load_params('iter_latest_wavenet.p')
 73 | # cost = T.nnet.categorical_crossentropy(
 74 | #     predicted_sequences,
 75 | #     target_sequences.flatten()
 76 | # ).mean()
 77 | 
 78 | # By default we report cross-entropy cost in bits.
 79 | # Switch to nats by commenting out this line:
 80 | #cost = cost * lib.floatX(1.44269504089)
 81 | 
 82 | params = lib.search(cost, lambda x: hasattr(x, 'param'))  # graph nodes that lib.param tagged as trainable
 83 | lib.print_params_info(cost, params)
 84 | #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP)
 85 | grads = T.grad(cost, wrt=params)
 86 | lr = T.fscalar()  # learning rate is a function input, supplied on every train_fn call
 87 | updates = lasagne.updates.adam(grads, params, learning_rate=lr)
 88 | 
 89 | print "Gradients Computed"
 90 | 
 91 | train_fn = theano.function(
 92 |     [sequences,lr],
 93 |     [cost,pred_freq],
 94 |     updates=updates,
 95 |     on_unused_input='warn'
 96 | )
 97 | 
 98 | # Training loop: the whole epoch is materialized as a list before the first batch is trained on.
 99 | print "Training!"
100 | DATA_PATH="/data/lisatmp3/kumarrit/blizzard"
101 | for epoch in xrange(NB_EPOCH):
102 |     costs = []
103 |     times = []
104 |     #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO,RF))
105 |     data_feeder = list(dataset.preprocess(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN,RF))  # NOTE(review): dataset.py as shown defines no `preprocess` - confirm where it comes from
106 |     results = []
107 |     print "Epoch : ",epoch
108 |     total_iters = 0
109 |     i=0
110 |     for seqs, t, m, s  in data_feeder:  # only `seqs` is used below
111 |         start_time = time.time()
112 |         cost,pred = train_fn(seqs,0.001)  # rebinds the module-level symbolic `cost` to a numeric value
113 |         results.append(pred)
114 |         i += 1
115 |         total_time = time.time() - start_time
116 |         times.append(total_time)
117 |         total_iters += 1
118 |         print "Batch ",total_iters," (Epoch %d)"%(epoch)
119 |         costs.append(cost)
120 |         print "\tCost: ",np.mean(costs)  # running mean over the epoch so far
121 |         print "\tTime: ",np.mean(times)
122 |     del results  # predictions are dropped at the end of every epoch
123 | 
124 | 
125 | def plot(i):
126 |     import matplotlib.pyplot as plt
127 |     f,axarr = plt.subplots(8)
128 |     for j in xrange(8):
129 |         axarr[j].plot(data[i][0][j][1025:])
130 |         axarr[j].plot(results[i][j],color='green')
131 |     plt.show()
132 | 


--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
  1 | import ops
  2 | import numpy
  3 | import theano
  4 | import theano.tensor as T
  5 | import cPickle as pickle
  6 | from collections import OrderedDict
  7 | import os
  8 | import matplotlib
  9 | matplotlib.use('Agg')
 10 | import matplotlib.pyplot as plt
 11 | 
 12 | _params = OrderedDict()
 13 | 
 14 | import locale
 15 | 
 16 | locale.setlocale(locale.LC_ALL, '')
 17 | 
 18 | def print_params_info(cost, params):
 19 |     """Print information about the parameters in the given param set."""
 20 | 
 21 |     params = sorted(params, key=lambda p: p.name)
 22 |     values = [p.get_value(borrow=True) for p in params]
 23 |     shapes = [p.shape for p in values]
 24 |     print "Params for cost:"
 25 |     for param, value, shape in zip(params, values, shapes):
 26 |         print "\t{0} ({1})".format(
 27 |             param.name,
 28 |             ",".join([str(x) for x in shape])
 29 |         )
 30 | 
 31 |     total_param_count = 0
 32 |     for shape in shapes:
 33 |         param_count = 1
 34 |         for dim in shape:
 35 |             param_count *= dim
 36 |         total_param_count += param_count
 37 |     print "Total parameter count: {0}".format(
 38 |         locale.format("%d", total_param_count, grouping=True)
 39 |     )
 40 | 
 41 | def param(name, *args, **kwargs):
 42 |     """
 43 |     A wrapper for `theano.shared` which enables parameter sharing in models.
 44 | 
 45 |     Creates and returns theano shared variables similarly to `theano.shared`,
 46 |     except if you try to create a param with the same name as a
 47 |     previously-created one, `param(...)` will just return the old one instead of
 48 |     making a new one.
 49 | 
 50 |     This constructor also adds a `param` attribute to the shared variables it
 51 |     creates, so that you can easily search a graph for all params.
 52 |     """
 53 |     if name not in _params:
 54 |         kwargs['name'] = name
 55 |         train = not 'train' in kwargs
 56 |         if not train:
 57 |             del kwargs['train']
 58 |         param = theano.shared(*args, **kwargs)
 59 |         if train:
 60 |             param.param = train
 61 |         _params[name] = param
 62 |     return _params[name]
 63 | 
 64 | def delete_params(name):
 65 |     to_delete = [p_name for p_name in _params if name in p_name]
 66 |     for p_name in to_delete:
 67 |         del _params[p_name]
 68 | 
 69 | def search(node, critereon):
 70 |     """
 71 |     Traverse the Theano graph starting at `node` and return a list of all nodes
 72 |     which match the `critereon` function. When optimizing a cost function, you
 73 |     can use this to get a list of all of the trainable params in the graph, like
 74 |     so:
 75 | 
 76 |     `lib.search(cost, lambda x: hasattr(x, "param"))`
 77 |     """
 78 | 
 79 |     def _search(node, critereon, visited):
 80 |         if node in visited:
 81 |             return []
 82 |         visited.add(node)
 83 | 
 84 |         results = []
 85 |         if isinstance(node, T.Apply):
 86 |             for inp in node.inputs:
 87 |                 results += _search(inp, critereon, visited)
 88 |         else: # Variable node
 89 |             if critereon(node):
 90 |                 results.append(node)
 91 |             if node.owner is not None:
 92 |                 results += _search(node.owner, critereon, visited)
 93 |         return results
 94 | 
 95 |     return _search(node, critereon, set())
 96 | 
 97 | def floatX(x):
 98 |     """
 99 |     Convert `x` to numpy.float32 (theano.config.floatX itself is not consulted).
100 |     """
101 |     return numpy.float32(x)
102 | 
103 | def save_params(path):
104 |     param_vals = {}
105 |     for name, param in _params.iteritems():
106 |         param_vals[name] = param.get_value()
107 | 
108 |     try:
109 |         with open(path, 'wb') as f:
110 |             pickle.dump(param_vals, f)
111 |     except IOError:
112 |         os.makedirs(os.path.split(path)[0])
113 |         f = open(path,"wb")
114 |         pickle.dump(param_vals, f)
115 | 
116 | def load_params(path):
117 |     with open(path, 'rb') as f:
118 |         param_vals = pickle.load(f)
119 | 
120 |     for name, val in param_vals.iteritems():
121 |         _params[name].set_value(val)
122 | 
123 | def clear_all_params():
124 |     to_delete = [p_name for p_name in _params]
125 |     for p_name in to_delete:
126 |         del _params[p_name]
127 | 
128 | __train_log_file_name = 'train_info.pkl'
129 | def save_training_info(values, path):
130 |     """
131 |     Gets a set of values as dictionary and append them to a log file.
132 |     stores in <path>/train_log.pkl
133 |     """
134 |     file_name = os.path.join(path, __train_log_file_name)
135 |     try:
136 |         with open(file_name, "rb") as f:
137 |             log = pickle.load(f)
138 |     except IOError:  # first time
139 |         if not os.path.exists(path):
140 |             os.makedirs(path)
141 |         log = {}
142 |         for k in values.keys():
143 |             log[k] = []
144 |     for k, v in values.items():
145 |         log[k].append(v)
146 |     with open(file_name, "wb") as f:
147 |         pickle.dump(log, f)
148 | 
149 | def plot_traing_info(x, ylist, path):
150 |     """
151 |     Loads log file and plot x and y values as provided by input.
152 |     Saves as <path>/train_log.png
153 |     """
154 |     file_name = os.path.join(path, __train_log_file_name)
155 |     try:
156 |         with open(file_name, "rb") as f:
157 |             log = pickle.load(f)
158 |     except IOError:  # first time
159 |         warnings.warn("There is no {} file here!!!".format(file_name))
160 |         return
161 |     plt.figure()
162 |     x_vals = log[x]
163 |     for y in ylist:
164 |         y_vals = log[y]
165 |         if len(y_vals) != len(x_vals):
166 |             warning.warn("One of y's: {} does not have the same length as x:{}".format(y, x))
167 |         plt.plot(x_vals, y_vals, label=y)
168 |         # assert len(y_vals) == len(x_vals), "not the same len"
169 |     plt.xlabel(x)
170 |     plt.legend()
171 |     #plt.show()
172 |     plt.savefig(file_name[:-3]+'png', bbox_inches='tight')
173 |     plt.close('all')
174 | 


--------------------------------------------------------------------------------
/lib/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/lib/__init__.pyc


--------------------------------------------------------------------------------
/lib/ops.py:
--------------------------------------------------------------------------------
  1 | import lib
  2 | import numpy as np
  3 | import numpy
  4 | import theano
  5 | import theano.tensor as T
  6 | theano.config.floatX='float32'
  7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
  8 | import time
  9 | import lasagne
 10 | import math
 11 | 
 12 | srng = RandomStreams(seed=234)
 13 | 
 14 | def BatchNorm(layer_name,input, insize, mode=0,run_mode=0, momentum=0.9, layer='default'):
 15 |     '''
 16 |     Batch normalization of `input` with learned gamma/beta vectors of size `insize`.
 17 |     input :
 18 |         when mode is 0, we assume 2D input. (mini_batch_size, # features)
 19 |         when mode is 1, we assume 4D input. (mini_batch_size, # of channel, # row, # column)
 20 |     mode :
 21 |         0 : feature-wise mode (normal BN)
 22 |         1 : window-wise mode (CNN mode BN)
 23 |     momentum : momentum for exponential average
 24 |     '''
 25 |     input_shape = input.shape
 26 |     # random setting of gamma and beta, setting initial mean and std
 27 |     rng = np.random.RandomState(int(time.time()))  # seeded from the wall clock, so the gamma init differs between runs
 28 | 
 29 |     gamma_val = np.asarray(rng.uniform(low=-1.0/math.sqrt(insize), high=1.0/math.sqrt(insize), size=(insize)),dtype=theano.config.floatX)
 30 |     if layer=='recurrent':
 31 |         gamma = lib.param(layer_name+'.gamma', np.full(shape=(insize),fill_value=0.1,dtype=theano.config.floatX), borrow=True)  # constant 0.1 instead of the random init
 32 |     else:
 33 |         gamma = lib.param(layer_name+'.gamma', gamma_val, borrow=True)
 34 |     beta = lib.param(layer_name+'.beta',np.zeros((insize), dtype=theano.config.floatX), borrow=True)
 35 |     mean = lib.param(layer_name+'.mean',np.zeros((insize),dtype=theano.config.floatX),  train=False, borrow=True)  # train=False: lib.param does not tag it as trainable
 36 |     var = lib.param(layer_name+'.var',np.ones((insize), dtype=theano.config.floatX),train = False, borrow=True)
 37 | 
 38 |     epsilon = 1e-06  # added to the variance before the square root
 39 | 
 40 |     if mode==0 :
 41 |         if run_mode==0 :  # normalize with the statistics of the current batch
 42 |             now_mean = T.mean(input, axis=0)
 43 |             now_var = T.var(input, axis=0)
 44 |             now_normalize = (input - now_mean) / T.sqrt(now_var+epsilon) # should be broadcastable..
 45 |             output = gamma * now_normalize + beta
 46 |             # mean, var update
 47 |             # run_mean = theano.clone(mean,share_inputs=False)
 48 |             # run_var = theano.clone(var, share_inputs=False)
 49 |             # run_mean.default_update = momentum * mean + (1.0-momentum) * now_mean
 50 |             # run_var.default_update = momentum * var + (1.0-momentum) * (input_shape[0]/(input_shape[0]-1)*now_var)
 51 |             mean = momentum*mean + (1.0-momentum) * now_mean  # NOTE(review): rebinds the local name only; nothing here writes back to the shared `.mean` param
 52 |             var = momentum*var + (1.0-momentum)*(input_shape[0]/(input_shape[0]-1))*now_var  # NOTE(review): same for `.var`; the result is not used or returned
 53 |         else :  # normalize with the stored mean/var params
 54 |             output = gamma * (input - mean) / T.sqrt(var+epsilon) + beta
 55 | 
 56 |     else :
 57 |         # in CNN mode, gamma and beta exists for every single channel separately.
 58 |         # for each channel, calculate mean and std for (mini_batch_size * row * column) elements.
 59 |         # then, each channel has own scalar gamma/beta parameters.
 60 |         axes = (0,2,3)
 61 |         if run_mode==0 :
 62 |             now_mean = T.mean(input, axis=axes)
 63 |             now_var = T.var(input, axis=axes)
 64 |             # mean, var update
 65 |             # run_mean = theano.clone(mean,share_inputs=False)
 66 |             # run_var = theano.clone(var, share_inputs=False)
 67 |             # run_mean.default_update = momentum * mean + (1.0-momentum) * now_mean
 68 |             # run_var.default_update = momentum * var + (1.0-momentum) * (input_shape[0]/(input_shape[0]-1)*now_var)
 69 |             mean = momentum*mean + (1.0-momentum) * now_mean  # NOTE(review): local rebinding only, as in mode 0
 70 |             var = momentum*var + (1.0-momentum)*(input_shape[0]/(input_shape[0]-1))*now_var
 71 |         else :
 72 |             now_mean = mean
 73 |             now_var = var
 74 |         # change shape to fit input shape
 75 |         # For a 4D input and axes (0,2,3) the pattern below is ['x', 0, 'x', 'x'].
 76 |         param_axes = iter(range(input.ndim - len(axes)))
 77 |         pattern = ['x' if input_axis in axes
 78 |                else next(param_axes)
 79 |                for input_axis in range(input.ndim)]
 80 |         now_mean = now_mean.dimshuffle(pattern)
 81 |         now_var = now_var.dimshuffle(pattern)
 82 |         now_gamma = gamma.dimshuffle(pattern)
 83 |         now_beta = beta.dimshuffle(pattern)
 84 |         output = now_gamma * (input - now_mean) / T.sqrt(now_var+epsilon) + now_beta
 85 | 
 86 |     return output.astype('float32')
 87 | 
 88 | def get_fans(shape):
 89 |     fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:])
 90 |     fan_out = shape[1] if len(shape) == 2 else shape[0]
 91 |     return fan_in, fan_out
 92 | 
 93 | def glorot_uniform(shape,init='glorot'):
 94 |     def uniform(shape, scale=0.05, name=None):
 95 |         return np.random.uniform(low=-scale, high=scale, size=shape)
 96 |     fan_in, fan_out = get_fans(shape)
 97 |     s = np.sqrt(6. / (fan_in + fan_out))
 98 |     if init=='he':
 99 |         s = np.sqrt(6./fan_in)
100 |         return uniform(shape,s)
101 |     else:
102 |         return uniform(shape, s)
103 | 
104 | def init_weights(fan_in,fan_out,init='he'):
105 | 
106 |     def uniform(stdev, size):
107 |         """uniform distribution with the given stdev and size"""
108 |         return numpy.random.uniform(
109 |             low=-stdev * numpy.sqrt(3),
110 |             high=stdev * numpy.sqrt(3),
111 |             size=size
112 |         ).astype(theano.config.floatX)
113 | 
114 |     if init == 'lecun' or (init == None and fan_in != fan_out):
115 |         weight_values = uniform(numpy.sqrt(1. / fan_in), (fan_in, fan_out))
116 | 
117 |     elif init == 'he':
118 |         weight_values = uniform(numpy.sqrt(2. / fan_in), (fan_in, fan_out))
119 | 
120 |     elif init == 'orthogonal' or (init == None and fan_in == fan_out):
121 |         # From lasagne
122 |         def sample(shape):
123 |             if len(shape) < 2:
124 |                 raise RuntimeError("Only shapes of length 2 or more are "
125 |                                    "supported.")
126 |             flat_shape = (shape[0], numpy.prod(shape[1:]))
127 |             # TODO: why normal and not uniform?
128 |             a = numpy.random.normal(0.0, 1.0, flat_shape)
129 |             u, _, v = numpy.linalg.svd(a, full_matrices=False)
130 |             # pick the one with the correct shape
131 |             q = u if u.shape == flat_shape else v
132 |             q = q.reshape(shape)
133 |             return q.astype(theano.config.floatX)
134 |         weight_values = sample((fan_in, fan_out))
135 |     return weight_values
136 | 
137 | def Dense(name, input_dim, output_dim, inputs, bias=True, init=None, weightnorm=True,hidden_dim=None):
138 | 
139 |     weight_values = init_weights(input_dim,output_dim,init)
140 | 
141 |     weight = lib.param(
142 |         name + '.W',
143 |         weight_values
144 |     )
145 | 
146 |     batch_size = None
147 |     if inputs.ndim==3:
148 |         batch_size = inputs.shape[0]
149 |         inputs = inputs.reshape((-1,input_dim))
150 | 
151 |     if weightnorm:
152 |         norm_values = numpy.linalg.norm(weight_values, axis=0)
153 |         norms = lib.param(
154 |             name + '.g',
155 |             norm_values
156 |         )
157 | 
158 |         normed_weight = weight * (norms / weight.norm(2, axis=0)).dimshuffle('x', 0)
159 |         result = T.dot(inputs, normed_weight)
160 | 
161 |     else:
162 |         result = T.dot(inputs, weight)
163 | 
164 |     if bias:
165 |         b = lib.param(
166 |             name + '.b',
167 |             numpy.zeros((output_dim,), dtype=theano.config.floatX)
168 |         )
169 |         result += b
170 | 
171 |     result.name = name+".output"
172 |     if batch_size!=None:
173 |         return result.reshape((batch_size,hidden_dim,output_dim))
174 |     else:
175 |         return result
176 | 
177 | def Embedding(name, n_symbols, output_dim, indices):
178 |     vectors = lib.param(
179 |         name,
180 |         numpy.random.randn(
181 |             n_symbols,
182 |             output_dim
183 |         ).astype(theano.config.floatX)
184 |     )
185 | 
186 |     output_shape = tuple(list(indices.shape) + [output_dim])
187 | 
188 |     return vectors[indices.flatten()].reshape(output_shape)
189 | 
190 | def softmax_and_sample(logits):
191 |     old_shape = logits.shape
192 |     flattened_logits = logits.reshape((-1, logits.shape[logits.ndim-1]))
193 |     samples = T.cast(
194 |         srng.multinomial(pvals=T.nnet.softmax(flattened_logits)),
195 |         theano.config.floatX
196 |     ).reshape(old_shape)
197 |     return T.argmax(samples, axis=samples.ndim-1)
198 | 
def GRUStep(name, input_dim, hidden_dim, x_t, h_tm1):
    """Compute one GRU update and return the new hidden state.

    name:       prefix for the three Dense layers created here
                ('.Input', '.Recurrent_Gates', '.Recurrent_Candidate').
    input_dim:  size of the last axis of `x_t`.
    hidden_dim: size of the last axis of `h_tm1`.
    x_t, h_tm1: 2-D tensors (they are sliced as `[:, a:b]` below).
    """
    # One projection of the input serves all three parts; its columns are
    # laid out as [update | reset | candidate].
    input_proj = lib.ops.Dense(name+'.Input', input_dim, 3 * hidden_dim, x_t)

    gate_recurrent = lib.ops.Dense(
        name+'.Recurrent_Gates',
        hidden_dim,
        2 * hidden_dim,
        h_tm1,
        bias=False
    )
    gates = T.nnet.sigmoid(gate_recurrent + input_proj[:, :2*hidden_dim])
    update = gates[:, :hidden_dim]
    reset = gates[:, hidden_dim:]

    # The candidate sees the previous state only through the reset gate.
    candidate_recurrent = lib.ops.Dense(
        name+'.Recurrent_Candidate',
        hidden_dim,
        hidden_dim,
        reset * h_tm1,
        bias=False,
        init='orthogonal'
    )
    candidate = T.tanh(candidate_recurrent + input_proj[:, 2*hidden_dim:])

    keep = lib.floatX(1.0) - update
    return (update * candidate) + (keep * h_tm1)
235 | 
def __ConvLSTMStep(
        name,
        seq_len,
        input_dim,
        hidden_dim,
        current_input,
        last_hidden,
        last_cell,
        dilation_depth=10,
        inp_bias_init=0.,
        forget_bias_init=3.,
        out_bias_init=0.,
        g_bias_init=0.):
    """One step of an LSTM whose input path is a stack of WaveNetConv1d blocks.

    `current_input` is passed through `dilation_depth` WaveNetConv1d blocks
    with dilations 1, 2, 4, ...; the result is concatenated with
    `last_hidden` on axis 1 and mapped by a kernel-1 conv to 4*hidden_dim
    channels holding the input, forget, candidate and output pre-activations
    (in that order). A learned per-channel weight `W_cell` multiplies the
    cell state inside the three sigmoid gates.

    Returns (H_t, C_t).

    NOTE(review): the WaveNet block names do not include `name`, and the
    four `*_bias_init` arguments are never read in this function.
    """
    # X_t*(U^i, U^f, U^o, U^g)
    features = current_input
    for level in xrange(dilation_depth):
        # The second (skip) output of each block is not used here.
        features, _skip = lib.ops.WaveNetConv1d(
            "WaveNetBlock-%d" % (level+1),
            features,
            2,
            hidden_dim,
            input_dim,
            bias=True,
            batchnorm=False,
            dilation=2**level
        )

    gate_input = T.concatenate((features, last_hidden), axis=1)
    gate_pre = lib.ops.conv1d(
        name+".ConvGates", gate_input, 1, 1, 4*hidden_dim, 2*input_dim,
        bias=True, batchnorm=False
    )

    W_cell = lib.param(
        name+'.CellWeights',
        lasagne.init.HeNormal().sample((3*hidden_dim, seq_len, 1))
    )

    # The input and forget gates are computed together, so the previous cell
    # state is duplicated along the channel axis to line up with both.
    doubled_cell = T.concatenate((last_cell, last_cell), axis=1)
    in_and_forget = T.nnet.sigmoid(
        gate_pre[:, :2*hidden_dim] + W_cell[:2*hidden_dim]*doubled_cell
    )
    input_gate = in_and_forget[:, :hidden_dim]
    forget_gate = in_and_forget[:, hidden_dim:]

    new_cell = forget_gate*last_cell + input_gate*T.tanh(gate_pre[:, 2*hidden_dim:3*hidden_dim])

    output_gate = T.nnet.sigmoid(gate_pre[:, 3*hidden_dim:] + W_cell[2*hidden_dim:]*new_cell)

    new_hidden = output_gate*T.tanh(new_cell)

    return new_hidden, new_cell
273 | 
def ConvLSTM(name, seq_len, input_dim, hidden_dim, inputs, h0=None, c0=None):
    """Run `__ConvLSTMStep` over the leading axis of `inputs` with theano.scan.

    `h0` and `c0` are the initial hidden and cell states handed to scan as
    `outputs_info`. Returns scan's outputs (the hidden and cell sequences).

    NOTE(review): unlike GRU/myGRU in this file, `inputs` is not transposed
    before scanning, so the scanned axis is whatever axis 0 of `inputs` is.
    """
    def advance(frame, prev_hidden, prev_cell):
        return __ConvLSTMStep(
            name+'.Step', seq_len, input_dim, hidden_dim,
            frame, prev_hidden, prev_cell
        )

    scan_result = theano.scan(
        advance,
        sequences=[inputs],
        outputs_info=[h0, c0],
    )

    # scan returns (outputs, updates); only the outputs are handed back.
    return scan_result[0]
295 | 
def GRU(name, input_dim, hidden_dim, inputs, h0=None):
    """Apply `GRUStep` along axis 1 of a 3-D `inputs` tensor.

    `inputs` is swapped to put axis 1 first for scanning and the hidden
    sequence is swapped back before returning, so the result has the same
    axis order as `inputs`. `h0` is the initial hidden state. The returned
    variable is named `name + '.output'`.
    """
    time_major = inputs.transpose(1, 0, 2)

    def advance(frame, prev_hidden):
        return GRUStep(name+'.Step', input_dim, hidden_dim, frame, prev_hidden)

    hidden_seq, _updates = theano.scan(
        advance,
        sequences=[time_major],
        outputs_info=[h0],
    )

    batch_major = hidden_seq.dimshuffle(1, 0, 2)
    batch_major.name = name+'.output'
    return batch_major
318 | 
319 | 
320 | 
def recurrent_fn(x_t, h_tm1,name,input_dim,hidden_dim,W1,b1,W2,b2):
    """GRU step with BatchNorm applied to each matrix product.

    W1 stacks the input rows (first `input_dim`) on top of the hidden rows;
    its columns are [update | reset]. W2 multiplies the concatenation
    (reset * h_tm1, x_t), i.e. hidden part first. The result is cast to
    float32.
    """
    # Input and recurrent contributions to the gates are normalised separately.
    from_input = BatchNorm(name+".Inp2Hid", T.dot(x_t, W1[:input_dim]), 2*hidden_dim, layer='recurrent')
    from_hidden = BatchNorm(name+".Hid2Hid", T.dot(h_tm1, W1[input_dim:]), 2*hidden_dim, layer='recurrent')
    gates = T.nnet.sigmoid(from_input + from_hidden + b1)

    update = gates[:, :hidden_dim]
    reset = gates[:, hidden_dim:]

    candidate_in = T.concatenate((reset*h_tm1, x_t), axis=1)
    candidate = T.tanh(
        BatchNorm(name+".Candidate", T.dot(candidate_in, W2), hidden_dim, layer='recurrent') + b2
    )

    keep = lib.floatX(1.0) - update
    new_hidden = (update * candidate) + (keep * h_tm1)
    return new_hidden.astype('float32')
339 | 
def myGRU(name, input_dim, hidden_dim, inputs, h0=None):
    """GRU layer that owns its weights and steps with `recurrent_fn`.

    Creates four parameters under `name`: '.Gates.W' / '.Candidate.W'
    (Glorot-uniform, input_dim+hidden_dim rows) and '.Gates.b' (ones) /
    '.Candidate.b' (zeros). `inputs` is scanned along its axis 1 and the
    hidden sequence is returned in the same axis order as `inputs`, named
    `name + '.output'`.
    """
    #inputs.shape = (batch_size,N_FRAMES,FRAME_SIZE)
    time_major = inputs.transpose(1, 0, 2)
    stacked_dim = input_dim + hidden_dim

    W1 = lib.param(
        name+'.Gates.W',
        lasagne.init.GlorotUniform().sample((stacked_dim, 2*hidden_dim))
    )
    b1 = lib.param(
        name+'.Gates.b',
        np.ones(2*hidden_dim).astype(theano.config.floatX)
    )
    W2 = lib.param(
        name+'.Candidate.W',
        lasagne.init.GlorotUniform().sample((stacked_dim, hidden_dim))
    )
    b2 = lib.param(
        name+'.Candidate.b',
        np.zeros(hidden_dim).astype(theano.config.floatX)
    )

    def advance(frame, prev_hidden):
        return recurrent_fn(
            frame, prev_hidden, name, input_dim, hidden_dim, W1, b1, W2, b2
        )

    hidden_seq, _updates = theano.scan(
        advance,
        sequences=[time_major],
        outputs_info=[h0],
    )

    batch_major = hidden_seq.dimshuffle(1, 0, 2)
    batch_major.name = name+'.output'
    return batch_major
385 | 
386 | 
def recurrent_fn_hred(x_t, h_tm1,hidden_dim,W1,b1,W2,b2):
    """Plain GRU step (no BatchNorm) used by HRED_GRU.

    W1 multiplies the concatenation (x_t, h_tm1) and yields the
    [update | reset] gates; W2 multiplies (reset * h_tm1, x_t) and yields the
    candidate. The new hidden state is cast to float32.
    """
    gate_in = T.concatenate((x_t, h_tm1), axis=1)
    gates = T.nnet.sigmoid(T.dot(gate_in, W1) + b1)

    update = gates[:, :hidden_dim]
    reset = gates[:, hidden_dim:]

    # Note the order: gated hidden state first, then the input.
    candidate_in = T.concatenate((reset*h_tm1, x_t), axis=1)
    candidate = T.tanh(T.dot(candidate_in, W2) + b2)

    keep = lib.floatX(1.0) - update
    new_hidden = (update * candidate) + (keep * h_tm1)
    return new_hidden.astype('float32')
403 | 
def HRED_GRU(name, input_dim, hidden_dim, inputs, h0=None):
    """Encode `inputs` with a GRU, then advance a session-level GRU one step.

    An inner GRU (parameters under `name`) is scanned along axis 1 of
    `inputs`, starting from an all-zero hidden state. Its final hidden state
    is fed as the input of a single step of a second GRU (parameters under
    the fixed 'Session.*' names) whose previous state is `h0`.

    Returns the new session-level hidden state.

    NOTE(review): the session weights are sized with `input_dim` rows for the
    input part but receive the inner GRU's `hidden_dim`-sized state, so this
    presumably requires input_dim == hidden_dim -- confirm with callers.
    """
    #inputs.shape = (batch_size,N_FRAMES,FRAME_SIZE)
    inputs = inputs.transpose(1,0,2)

    # Session-level parameters. The names are not prefixed with `name`.
    weight_values = init_weights(input_dim+hidden_dim,2*hidden_dim)

    s_W1 = lib.param(
        'Session.Gates.W',
        weight_values
    )

    s_b1 = lib.param(
        'Session.Gates.b',
        np.ones(2*hidden_dim).astype(theano.config.floatX)
        )

    weight_values = init_weights(input_dim+hidden_dim,hidden_dim)
    s_W2 = lib.param(
        'Session.Candidate.W',
        weight_values
    )

    s_b2 = lib.param(
        'Session.Candidate.b',
        np.zeros(hidden_dim).astype(theano.config.floatX)
        )

    # Parameters of the inner GRU that reads `inputs`.
    weight_values = init_weights(input_dim+hidden_dim,2*hidden_dim)

    W1 = lib.param(
        name+'.Gates.W',
        weight_values
    )

    b1 = lib.param(
        name+'.Gates.b',
        np.ones(2*hidden_dim).astype(theano.config.floatX)
        )

    weight_values = init_weights(input_dim+hidden_dim,hidden_dim)
    W2 = lib.param(
        name+'.Candidate.W',
        weight_values
    )

    b2 = lib.param(
        name+'.Candidate.b',
        np.zeros(hidden_dim).astype(theano.config.floatX)
        )

    outputs, _ = theano.scan(
        recurrent_fn_hred,
        sequences=[inputs],
        outputs_info=[T.alloc(0,inputs.shape[1],hidden_dim).astype(dtype=theano.config.floatX)],
        non_sequences=[hidden_dim,W1,b1,W2,b2]
    )

    # The session step must use recurrent_fn_hred: its signature
    # (x_t, h_tm1, hidden_dim, W1, b1, W2, b2) matches these seven arguments,
    # whereas recurrent_fn takes nine and would raise TypeError here.
    out = recurrent_fn_hred(outputs[-1],h0,hidden_dim,s_W1,s_b1,s_W2,s_b2)

    return out
469 | 
470 | 
def conv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False,pad='valid',filter_dilation=(1,1),run_mode=0):
    """1-D convolution implemented with T.nnet.conv2d on a 4-D tensor.

    The filter bank has shape (n_filters, depth, kernel, 1), so the kernel and
    the stride act along axis 2 of `input`. `pad` and `filter_dilation` are
    forwarded to conv2d as `border_mode` and `filter_dilation`.

    bias:      add a learned per-filter offset (name + '.b').
    batchnorm: pass the result through BatchNorm(mode=1, run_mode=run_mode).
    """
    filter_shape = (n_filters, depth, kernel, 1)
    W = lib.param(
        name+'.W',
        lasagne.init.HeNormal().sample(filter_shape).astype('float32')
    )

    result = T.nnet.conv2d(
        input,
        W,
        subsample=(stride, 1),
        border_mode=pad,
        filter_dilation=filter_dilation
    )

    if bias:
        b = lib.param(name + '.b', np.zeros(n_filters).astype('float32'))
        # Broadcast the offset over every axis except the channel axis.
        result = result + b[None, :, None, None]

    if not batchnorm:
        return result
    return BatchNorm(name, result, n_filters, mode=1, run_mode=run_mode)
491 | 
def ResNetConv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False):
    """Residual block: two convolutions plus a shortcut, followed by ReLU.

    The shortcut is `input` itself when the block keeps both resolution and
    channel count (stride == 1 and n_filters == depth); otherwise it is a
    kernel-1 convolution with the same stride. The first convolution carries
    the stride and is followed by ReLU; the second has stride 1.

    NOTE(review): the padding is passed as (0, pad), i.e. on the second
    spatial axis, while conv1d's kernel lies on the first -- confirm this
    matches the tensor layout used by callers.
    """
    if stride==1 and n_filters==depth:
        project = input
    else:
        project = lib.ops.conv1d(name+".Projection.conv",input,1,stride,n_filters,depth,bias=bias,batchnorm=batchnorm)
    # Floor division keeps the pad an integer even under true division
    # (Python 3 or `from __future__ import division`).
    pad = (kernel-1)//2
    conv1 = T.nnet.relu(lib.ops.conv1d(name+".conv1",input,kernel,stride,n_filters,depth,bias=bias,batchnorm=batchnorm,pad=(0,pad)))
    conv2 = lib.ops.conv1d(name+".conv2",conv1,kernel,1,n_filters,n_filters,bias=bias,batchnorm=batchnorm,pad=(0,pad))

    return T.nnet.relu(conv2+project)
503 | 
def WaveNetConv1d(name,input,kernel,n_filters,depth,bias=False,batchnorm=False,dilation=1):
    """Gated dilated convolution block with residual and skip outputs.

    A dilated convolution produces 2*n_filters channels; the first half goes
    through a sigmoid, the second through tanh, and the two are multiplied.
    A kernel-1 convolution then maps the product to 2*depth channels.

    Returns (residual, skip): the first `depth` channels added to `input`,
    and the remaining `depth` channels.

    `bias` and `batchnorm` apply to the kernel-1 convolution; the dilated
    convolution always has a bias and also receives `batchnorm`.
    """
    padded = lib.ops.conv1d(
        name+".filter&gate", input, kernel, 1, 2*n_filters, depth,
        True, batchnorm,
        pad=(dilation, 0), filter_dilation=(dilation, 1)
    )
    # Padding is added on both sides of axis 2; only the first
    # input.shape[2] positions are kept.
    gated_pre = padded[:, :, :input.shape[2], :]

    sigmoid_part = T.nnet.sigmoid(gated_pre[:, :n_filters, :, :])
    tanh_part = T.tanh(gated_pre[:, n_filters:, :, :])
    gated = sigmoid_part*tanh_part

    mixed = lib.ops.conv1d(
        name+".projection&param_skip", gated, 1, 1, 2*depth, n_filters,
        bias=bias, batchnorm=batchnorm
    )
    residual = mixed[:, :depth, :, :]+input
    skip = mixed[:, depth:, :, :]
    return residual, skip
509 | 
def DenseNetConv1d(name,input,kernel,n_filters,depth,bias=False,batchnorm=False,dilation=1):
    """Gated dilated convolution returning only the gated activation.

    Same first stage as WaveNetConv1d: a dilated convolution to 2*n_filters
    channels, trimmed to input.shape[2] positions on axis 2, then
    sigmoid(first half) * tanh(second half). There is no projection or
    residual connection. The `bias` argument is not read; the convolution
    always has a bias.
    """
    padded = lib.ops.conv1d(
        name+".filter&gate", input, kernel, 1, 2*n_filters, depth,
        True, batchnorm,
        pad=(dilation, 0), filter_dilation=(dilation, 1)
    )
    gated_pre = padded[:, :, :input.shape[2], :]

    sigmoid_part = T.nnet.sigmoid(gated_pre[:, :n_filters, :, :])
    tanh_part = T.tanh(gated_pre[:, n_filters:, :, :])
    return sigmoid_part*tanh_part
514 | 
def ResNetDeconv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False,act=True):
    """Residual block built from two transposed convolutions plus a shortcut.

    The shortcut is `input` itself when stride == 1 and n_filters == depth,
    otherwise a kernel-1 deconv1d with the same stride. The main path applies
    a stride-1 deconvolution (".conv2") and then the strided one (".conv1"),
    each followed by ReLU. When `act` is true the sum of main path and
    shortcut goes through a final ReLU.

    NOTE(review): every deconv1d call below passes an `output=` keyword, but
    deconv1d as defined in this file has no such parameter, so these calls
    would raise TypeError -- confirm which deconv1d signature is intended.
    """
    if stride==1 and n_filters==depth:
        project = input
    else:
        project = lib.ops.deconv1d(name+".Projection.conv",input,1,stride,n_filters,depth,bias=bias,batchnorm=batchnorm,output=stride*input.shape[-1])
    # Integer result only under Python 2 floor division of ints.
    pad = (kernel-1)/2

    conv2 = T.nnet.relu(lib.ops.deconv1d(name+".conv2",input,kernel,1,n_filters,depth,bias=bias,batchnorm=batchnorm,output=input.shape[-1],pad=(0,pad)))
    conv1 = T.nnet.relu(lib.ops.deconv1d(name+".conv1",conv2,kernel,stride,n_filters,n_filters,bias=bias,batchnorm=batchnorm,output=stride*conv2.shape[-1],pad=(0,pad)))

    if act:
        out = T.nnet.relu(conv1+project)
    else:
        out = conv1+project
    return out
530 | 
def deconv1d(name,input,kernel,stride,n_filters,depth,bias=False,batchnorm=False,pad='valid'):
    """Transposed 1-D convolution via conv2d_grad_wrt_inputs.

    The output length along axis 2 is stride*(input.shape[2]-1) + kernel,
    reduced by 2*pad[0] when `pad` is a tuple. The filter bank has shape
    (depth, n_filters, kernel, 1) and is Glorot-uniform initialised.

    bias:      add a learned per-filter offset (name + '.b').
    batchnorm: pass the result through BatchNorm(mode=1).
    """
    out_len = stride*(input.shape[2]-1) + kernel
    if type(pad)==tuple:
        out_len = out_len - 2*pad[0]

    filter_shape = (depth, n_filters, kernel, 1)
    W = lib.param(
        name+'.W',
        lasagne.init.GlorotUniform().sample(filter_shape).astype('float32')
    )

    result = T.nnet.abstract_conv.conv2d_grad_wrt_inputs(
        output_grad=input,
        filters=W,
        input_shape=(None, n_filters, out_len, 1),
        border_mode=pad,
        subsample=(stride, 1)
    )

    if bias:
        b = lib.param(name + '.b', np.zeros(n_filters).astype('float32'))
        result = result + b[None, :, None, None]

    if not batchnorm:
        return result
    return BatchNorm(name, result, n_filters, mode=1)
556 | 
def pool1d(input,subsample,pad,pool_indices=None):
    """Pool axis 2 of a 4-D tensor with window (subsample, 1).

    Uses pool_2d with ignore_border=True and padding (pad, 0). When
    `pool_indices` is truthy, also returns the gradient of the pooled output
    with respect to `input` for an all-ones upstream gradient, i.e. a tensor
    shaped like `input` marking the positions that contributed.

    Returns `out`, or `(out, indices)` when `pool_indices` is truthy.
    """
    import theano.tensor.signal.pool
    pooled = T.signal.pool.pool_2d(input, (subsample, 1), ignore_border=True, padding=(pad, 0))
    if not pool_indices:
        return pooled
    contribution = T.grad(None, wrt=input, known_grads={pooled: T.ones_like(pooled)})
    return pooled, contribution
564 | 
def unpool1d(input,upsample,desired_length,pool_indices=None):
    """Upsample axis 2 of `input` by repetition, optionally masked.

    Without `pool_indices`, each position along axis 2 is repeated `upsample`
    times and the result is cut to `desired_length`. With `pool_indices`, the
    repeated tensor is multiplied by `pool_indices`, `upsample-1` zeros are
    prepended along axis 2, and the result is cut to `desired_length`.
    """
    out = T.extra_ops.repeat(input,upsample,axis=2)
    if pool_indices:
        # NOTE(review): the slice upsample-1:upsample-1 has equal start and
        # stop, so it is always empty; `upsample-1:` may have been intended
        # -- confirm before relying on this branch.
        temp = pool_indices*out[:,:,upsample-1:upsample-1]
        pad = T.alloc(0,temp.shape[0],temp.shape[1],upsample-1,temp.shape[3])
        return T.concatenate((pad,temp),axis=2)[:,:,:desired_length]
    return out[:,:,:desired_length]
572 | 


--------------------------------------------------------------------------------
/lib/ops.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/lib/ops.pyc


--------------------------------------------------------------------------------
/lib/optimizers.py:
--------------------------------------------------------------------------------
 1 | import lib
 2 | import theano
 3 | import numpy as np
 4 | import theano.tensor as T
 5 | 
 6 | def RMSprop(cost, params, learnrate, rho=0.90, epsilon=1e-6):
 7 |     gparams = []
 8 |     iter = 1
 9 |     for param in params:
10 |         gparam = T.grad(cost, param)
11 |         gparams.append(gparam)  
12 |         print param['name'] + " completed"
13 |     updates=[]
14 |     for param, gparam in zip(params, gparams):
15 |         acc = theano.shared(param.get_value() * 0.)
16 |         acc_new = rho * acc + (1 - rho) * gparam ** 2
17 |         gradient_scaling = T.sqrt(acc_new + epsilon)
18 |         gparam = gparam / gradient_scaling
19 |         updates.append((acc, acc_new))
20 |         updates.append((param, param - gparam * learnrate))
21 |     return updates
22 | 
def Adam(cost, params, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8,gradClip=True,value=1.):
    """Build Adam-style update pairs for `params` minimising `cost`.

    cost:     scalar Theano expression to differentiate.
    params:   shared variables to update.
    lr:       step size.
    beta1:    decay of the first-moment (mean) estimate.
    beta2:    decay of the second-moment (squared gradient) estimate.
    epsilon:  constant added inside the square root for stability.
    gradClip: when true, clip every gradient element to [-value, value].
    value:    clipping threshold.

    Returns a list of (shared_variable, new_value) pairs for the moment
    estimates and the parameters. No bias correction is applied to the
    moment estimates. Progress is printed after each gradient is built.
    """
    gparams = []
    for index, param in enumerate(params, 1):
        gparam = T.grad(cost, param)
        if gradClip:
            gparam = T.clip(gparam, lib.floatX(-value), lib.floatX(value))
        gparams.append(gparam)
        print(str(index) + " completed")

    updates = []
    for p, g in zip(params, gparams):
        # Moment estimates with the parameter's shape and dtype, starting at zero.
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)

        m_new = beta1 * m + (1 - beta1) * g
        v_new = beta2 * v + (1 - beta2) * (g ** 2)

        gradient_scaling = T.sqrt(v_new + epsilon)
        updates.append((m, m_new))
        updates.append((v, v_new))
        # Step with the freshly updated first moment; using the previous
        # value would make the first step zero and lag every later one.
        updates.append((p, p - lr * m_new / gradient_scaling))
    return updates
46 | 


--------------------------------------------------------------------------------
/lib/optimizers.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/lib/optimizers.pyc


--------------------------------------------------------------------------------
/my_three_tier.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | sys.path.append(os.getcwd())
  3 | 
  4 | import numpy as np
  5 | import numpy
  6 | numpy.random.seed(123)
  7 | import random
  8 | random.seed(123)
  9 | 
 10 | import dataset
 11 | 
 12 | import theano
 13 | import theano.tensor as T
 14 | theano.config.floatX='float32'
 15 | from theano.tensor.nnet import neighbours
 16 | import theano.ifelse
 17 | import lib
 18 | import lib.optimizers
 19 | import lasagne
 20 | import scipy.io.wavfile
 21 | 
 22 | import time
 23 | import functools
 24 | import itertools
 25 | 
# Hyperparams
NB_EPOCH=10
BATCH_SIZE = 128
N_FRAMES = 256 # How many 'frames' to include in each truncated BPTT pass
FRAME_SIZE = 4 # How many samples per frame
DIM = 512 # Model dimensionality. 512 is sufficient for model development; 1024 if you want good samples.
N_GRUS = 2 # How many GRUs to stack in the frame-level model
Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
GRAD_CLIP = 1 # Elementwise grad clip threshold

# Dataset
DATA_PATH = '/data/lisatmp3/kumarrit/blizzard'
N_FILES = 1000
# DATA_PATH = '/PersimmonData/kiwi_parts'
# N_FILES = 516
BITRATE = 16000 # Passed as the sample rate when generated audio is written to .wav

TEST_SET_SIZE = 128 # How many audio files to use for the test set
SEQ_LEN = N_FRAMES * FRAME_SIZE # Total length (# of samples) of each truncated BPTT sequence
Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude

# Print every upper-case module-level name (the settings above), sorted by
# name. 'T' is excluded because it is the theano.tensor alias, not a setting.
print "Model settings:"
all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
all_vars = sorted(all_vars, key=lambda x: x[0])
for var_name, var_value in all_vars:
    print "\t{}: {}".format(var_name, var_value)
 52 | 
 53 | def frame_level_rnn(input_sequences, h0, reset):
 54 |     """
 55 |     input_sequences.shape: (batch size, N_FRAMES * FRAME_SIZE)
 56 |     h0.shape:              (batch size, N_GRUS, DIM)
 57 |     reset.shape:           ()
 58 |     output.shape:          (batch size, N_FRAMES * FRAME_SIZE, DIM)
 59 |     """
 60 |     batch_size = input_sequences.shape[0]
 61 |     n_frames = input_sequences.shape[1]/FRAME_SIZE
 62 | 
 63 |     learned_h0 = lib.param(
 64 |         'FrameLevel.h0',
 65 |         numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX)
 66 |     )
 67 | 
 68 |     learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM)
 69 |     h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
 70 | 
 71 |     frames = input_sequences.reshape((
 72 |         input_sequences.shape[0],
 73 |         input_sequences.shape[1] / (FRAME_SIZE * FRAME_SIZE),
 74 |         FRAME_SIZE*FRAME_SIZE
 75 |     ))
 76 | 
 77 |     # frames = emb.reshape((
 78 |     #     input_sequences.shape[0],
 79 |     #     input_sequences.shape[1] / (FRAME_SIZE*FRAME_SIZE),
 80 |     #     FRAME_SIZE*Q_LEVELS
 81 |     # ))
 82 | 
 83 |     # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
 84 |     # (a reasonable range to pass as inputs to the RNN)
 85 |     # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
 86 |     # frames *= lib.floatX(2)
 87 | 
 88 |     gru1 = lib.ops.myGRU('FrameLevel.GRU1', FRAME_SIZE*FRAME_SIZE, DIM, frames, h0=h0[:, 0])
 89 | 
 90 |     gru1_output = lib.ops.Dense(
 91 |         'FrameLevel1.Output',
 92 |         DIM,
 93 |         FRAME_SIZE * DIM,
 94 |         gru1,
 95 |         init='he',
 96 |         hidden_dim=64
 97 |     ).reshape((batch_size,256,DIM))
 98 | 
 99 | 
100 |     gru2 = lib.ops.myGRU('FrameLevel.GRU2', DIM, DIM, gru1_output, h0=h0[:, 1])
101 |     #gru3 = lib.ops.myGRU('FrameLevel.GRU3', DIM, DIM, gru2, h0=h0[:, 2])
102 | 
103 |     #gru1,gru2,gru3 = lib.ops.myGRU('FrameLevel.GRU', FRAME_SIZE, DIM, frames, h0=h0)
104 | 
105 |     # gru3.shape = (batch_size,N_FRAMES,DIM)
106 | 
107 |     gru2_output = lib.ops.Dense(
108 |         'FrameLevel2.Output',
109 |         DIM,
110 |         FRAME_SIZE * DIM,
111 |         gru2,
112 |         init='he',
113 |         hidden_dim=256
114 |     ).reshape((batch_size,1024,DIM))
115 | 
116 | 
117 |     last_hidden = T.stack([gru1[:, -1], gru2[:, -1]], axis=1)
118 | 
119 |     return (gru2_output, last_hidden)
120 | 
def sample_level_predictor(frame_level_outputs, prev_samples):
    """
    MLP that predicts the next sample from the frame-level conditioning
    vector and the FRAME_SIZE previous samples.

    frame_level_outputs.shape: (batch size*SEQ_LEN, DIM)
    prev_samples.shape:        (batch size*SEQ_LEN, FRAME_SIZE)
    output.shape:              (batch size*SEQ_LEN, Q_LEVELS)

    The output is unnormalised; callers apply the softmax.
    """
    # Embed each previous sample and lay the FRAME_SIZE embeddings side by side.
    embedded = lib.ops.Embedding(
        'SampleLevel.Embedding',
        Q_LEVELS,
        Q_LEVELS,
        prev_samples
    )
    embedded = embedded.reshape((-1, FRAME_SIZE * Q_LEVELS))

    hidden = lib.ops.Dense(
        'SampleLevel.L1_PrevSamples',
        FRAME_SIZE * Q_LEVELS,
        DIM,
        embedded,
        bias=False,
        init='he',
    )
    # Conditioning is injected additively before the first non-linearity.
    hidden = T.nnet.relu(hidden + frame_level_outputs)

    for layer_name in ('SampleLevel.L2', 'SampleLevel.L3'):
        hidden = T.nnet.relu(lib.ops.Dense(layer_name, DIM, DIM, hidden, init='he'))

    # We apply the softmax later
    return lib.ops.Dense('SampleLevel.Output', DIM, Q_LEVELS, hidden)
156 | 
# Symbolic inputs of the training function.
sequences   = T.imatrix('sequences')
h0          = T.tensor3('h0')
reset       = T.iscalar('reset')

# Inputs and targets are the same sequences offset by FRAME_SIZE samples.
input_sequences = sequences[:, :-FRAME_SIZE]
target_sequences = sequences[:, FRAME_SIZE:]

frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, h0, reset)

# frame_level_outputs.shape = (batch_size,SEQ_LEN,DIM)

# Build, for every target position, the window of FRAME_SIZE samples that
# precede it: sliding windows of width FRAME_SIZE with step 1.
prev_samples = sequences[:, :-1]
prev_samples = prev_samples.reshape((1, BATCH_SIZE, 1, -1))
prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid')
prev_samples = prev_samples.reshape((BATCH_SIZE * SEQ_LEN, FRAME_SIZE))

sample_level_outputs = sample_level_predictor(
    frame_level_outputs.reshape((BATCH_SIZE * SEQ_LEN, DIM)),
    prev_samples
)

cost = T.nnet.categorical_crossentropy(
    T.nnet.softmax(sample_level_outputs),
    target_sequences.flatten()
).mean()

# By default we report cross-entropy cost in bits.
# Switch to nats by commenting out this line:
cost = cost * lib.floatX(1.44269504089)

# Collect every variable reachable from the cost that carries a `param` attribute.
params = lib.search(cost, lambda x: hasattr(x, 'param'))

lib.print_params_info(cost, params)

# Disabled alternative: the project's own Adam with gradient clipping.
#updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP)
grads = T.grad(cost, wrt=params, disconnected_inputs='warn')

# Disabled: elementwise gradient clipping (GRAD_CLIP is currently unused here).
#grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]

print "Gradients Computed"

updates = lasagne.updates.adam(grads, params)

# One training step: returns the cost and the hidden state to carry over.
train_fn = theano.function(
    [sequences, h0, reset],
    [cost, new_h0],
    updates=updates,
    on_unused_input='warn'
)

# Generation-time frame-level function. It rebuilds the frame-level graph on
# the full `sequences` input rather than on `input_sequences`.
frame_level_generate_fn = theano.function(
    [sequences, h0, reset],
    frame_level_rnn(sequences, h0, reset),
    on_unused_input='warn'
)

# Generation-time sample-level function. The two names below are rebound to
# fresh symbolic inputs, shadowing the training-graph variables above.
frame_level_outputs = T.matrix('frame_level_outputs')
prev_samples        = T.imatrix('prev_samples')
sample_level_generate_fn = theano.function(
    [frame_level_outputs, prev_samples],
    lib.ops.softmax_and_sample(
        sample_level_predictor(
            frame_level_outputs,
            prev_samples
        )
    ),
    on_unused_input='warn'
)
225 | 
def generate_and_save_samples(tag):
    """Generate N_SEQS audio sequences sample by sample and write them as .wav.

    Files are named "sample_<tag>_<i>.wav" and written to the current
    directory. Uses the module-level compiled functions
    `frame_level_generate_fn` and `sample_level_generate_fn`.
    """

    def write_audio_file(name, data):
        # Rescale to the range [-0.475, 0.475] before writing.
        # NOTE(review): divides by data.max() after subtracting the minimum,
        # which is zero for a constant signal.

        data = data.astype('float32')
        data -= data.min()
        data /= data.max()
        data -= 0.5
        data *= 0.95

        import scipy.io.wavfile
        scipy.io.wavfile.write(name+'.wav',BITRATE,data)

    # Generate 5 sample files, each 8 seconds long (LENGTH samples at BITRATE Hz)
    N_SEQS = 5
    LENGTH = 8*BITRATE

    # Seed the first FRAME_SIZE samples with the zero-amplitude level.
    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :FRAME_SIZE] = Q_ZERO

    h0 = numpy.zeros((N_SEQS, N_GRUS, DIM), dtype='float32')
    frame_level_outputs = None

    for t in xrange(FRAME_SIZE, LENGTH):

        # Refresh the frame-level conditioning once per frame; `reset` is
        # true only on the very first call.
        if t % FRAME_SIZE == 0:
            frame_level_outputs, h0 = frame_level_generate_fn(
                samples[:, t-FRAME_SIZE:t],
                h0,
                numpy.int32(t == FRAME_SIZE)
            )

        # Draw sample t from the conditioning vector for its position within
        # the frame and the FRAME_SIZE samples preceding it.
        samples[:, t] = sample_level_generate_fn(
            frame_level_outputs[:, t % FRAME_SIZE],
            samples[:, t-FRAME_SIZE:t]
        )

    for i in xrange(N_SEQS):
        write_audio_file("sample_{}_{}".format(tag, i), samples[i])
265 | 
print "Training!"
total_iters = 0

for epoch in xrange(NB_EPOCH):
    # Each epoch starts from an all-zero hidden state; afterwards the state
    # returned by train_fn is fed back into the next batch.
    h0 = np.zeros((BATCH_SIZE, N_GRUS, DIM)).astype(theano.config.floatX)
    costs = []
    times = []
    data = dataset.get_data(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN+FRAME_SIZE, 0, Q_LEVELS, Q_ZERO)

    for seqs, reset in data:
        start_time = time.time()
        cost, h0 = train_fn(seqs, h0, reset)
        total_time = time.time() - start_time
        times.append(total_time)
        total_iters += 1
        print "Batch ",total_iters
        costs.append(cost)
        # Running means over the current epoch.
        print "\tCost: ",np.mean(costs)
        print "\tTime: ",np.mean(times)
        if total_iters%10000==0:
            generate_and_save_samples('iterno_%d'%total_iters)
    # NOTE(review): this break sits at epoch level, so training stops after
    # the first epoch regardless of NB_EPOCH -- confirm this is intended.
    break
            # print "epoch:{}\ttotal iters:{}\ttrain cost:{}\ttotal time:{}\ttime per iter:{}".format(
            #     epoch,
            #     total_iters,
            #     numpy.mean(costs),
            #     total_time,
            #     total_time / total_iters
            # )
            # tag = "iters{}_time{}".format(total_iters, total_time)
            # generate_and_save_samples(tag)
            # lib.save_params('params_{}.pkl'.format(tag))

            # costs = []
            # last_print_time += PRINT_TIME
            # last_print_iters += PRINT_ITERS
302 | 


--------------------------------------------------------------------------------
/temp.py:
--------------------------------------------------------------------------------
import theano
import numpy as np
import theano.tensor as T
# Scratch experiment: apply conv2d_grad_wrt_inputs (transposed convolution)
# to a small ramp, with the signal laid out along the last axis.
arr = np.arange(1,33).astype('float32').reshape((1,1,1,32))
filt = np.asarray([1,2,3]).astype('float32').reshape((1,1,1,3))
filt2 = np.asarray([1]).astype('float32').reshape((1,1,1,1))

# Stride-2 transposed convolution with a width-3 filter.
arr = T.nnet.abstract_conv.conv2d_grad_wrt_inputs(output_grad=arr,filters=filt,input_shape=(None,1,1,arr.shape[-1]*2+1),subsample=(1,2)).eval()

# Stride-1 transposed convolution with an all-ones width-3 filter.
filt = np.asarray([1,1,1]).astype('float32').reshape((1,1,1,3))
arr = T.nnet.abstract_conv.conv2d_grad_wrt_inputs(output_grad=arr,filters=filt,input_shape=(None,1,1,arr.shape[-1]+1)).eval()

#arr = T.nnet.conv2d(arr,filt,subsample=(1,2),border_mode=(0,1)).eval()
#arr = T.concatenate((arr[:,:,:,0][:,:,:,None],arr),axis=-1).eval()


import numpy as np
import theano
import theano.tensor as T
import theano.tensor.signal.pool
21 | 
def unpool1d(input,upsample,desired_length,pool_indices=None):
    """Repeat `input` along axis 2 and cut it to `desired_length`.

    When `pool_indices` is truthy the result is multiplied by the mask
    `pool_indices < 0`.
    """
    stretched = T.extra_ops.repeat(input, upsample, axis=2)
    stretched = stretched[:, :, :desired_length]
    if not pool_indices:
        return stretched
    return T.lt(pool_indices, 0)*stretched
28 | 
# Scratch experiment: stack dilated convolutions and pooling, then unpool.
# NOTE(review): `lib.ops` is used below before `import lib` / `import lib.ops`
# appear further down, and `xrange` / print statements make this Python 2
# only; it looks like pasted interactive snippets rather than a runnable script.
dilations = np.asarray([[1,2,4,8,16,32]*1]).tolist()[0]
length=6000
N_BLOCKS=2
arr = -1*np.arange(1,length+1).astype('float32').reshape((1,1,length,1))
W = np.asarray([0,1]).astype('float32').reshape((1,1,2,1))
indices=[]
for j in xrange(N_BLOCKS):
    for value in dilations:
    #    arr = T.nnet.conv2d(arr,W,filter_dilation=(1,value))
        arr = T.nnet.conv2d(arr,W,filter_dilation=(value,1),border_mode=(value,0))[:,:,:length]
    arr,idx = lib.ops.pool1d(arr,4,3,True)
    indices+=[idx]
    length=arr.shape[2]
# Undo the pooling stages in reverse order using the saved index tensors.
for j in xrange(N_BLOCKS):
    leng = indices[-(j+1)].shape[2]
    arr = unpool1d(arr,4,leng,indices[-(j+1)])

# Bare arithmetic expression; its value is discarded when run as a script.
((36*4+32)*4+32)*4+32

# Pool and unpool a random length-49 signal and print each stage.
l=49
arr = T.as_tensor_variable(np.random.randint(0,256,(1,1,l,1)).astype('float32'))
#arr = T.as_tensor_variable(np.arange(1,l+1).astype('float32').reshape((1,1,l,1)))
(res,idx) = lib.ops.pool1d(arr,4,3,True)
#res = lib.ops.pool1d(arr,4,3,False)
res2 = T.as_tensor_variable(np.random.randint(0,256,res.shape.eval()))
out = lib.ops.unpool1d(res,4,l,idx).eval()
#out = lib.ops.unpool1d(res,4,None).eval()
print res.shape.eval()
print arr.eval().flatten()
print res.eval().flatten()
print idx.eval().flatten()
#print res2.eval().flatten()
print out.flatten()

import lib
import lib.ops
import numpy as np
import theano
import theano.tensor as T
# Compile a single kernel-2 convolution over rescaled integer input.
input_sequences = T.imatrix()
Q_LEVELS=256
n_filters=64
length = input_sequences.shape[1]
# Map integer levels in [0, Q_LEVELS-1] to floats in [-0.5, 0.5].
start =  (input_sequences.astype('float32')/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5))[:,None,None,:]
conv1 = lib.ops.conv1d("causal-conv",start,2,1,n_filters,1,bias=False,batchnorm=False,pad=(0,1))[:,:,:,:length]
f = theano.function([input_sequences],[conv1])


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | import numpy as np
  3 | import numpy
  4 | numpy.random.seed(123)
  5 | import random
  6 | random.seed(123)
  7 | import dataset
  8 | import theano
  9 | import theano.tensor as T
 10 | theano.config.floatX='float32'
 11 | import lib.ops
 12 | import scipy.io.wavfile
 13 | import time
 14 | import lasagne
 15 | 
 16 | 
# Hyperparams
NB_EPOCH=10
BATCH_SIZE = 32
N_FRAMES = 256 # How many 'frames' to include in each truncated BPTT pass
FRAME_SIZE = 768 # How many samples per frame
Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
GRAD_CLIP = 1 # Elementwise grad clip threshold
DIM = 512 # Size of the learned 'Session.h0' state vector
# Dataset
DATA_PATH = '/home/rithesh/DeepLearning/Vocal Synthesis/data'
#DATA_PATH = '/data/lisatmp3/kumarrit/blizzard'
N_FILES = 50
BITRATE = 16000

TEST_SET_SIZE = 128 # How many audio files to use for the test set
SEQ_LEN = 8192 # Total length (# of samples) of each truncated BPTT sequence
Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
 34 | 
 35 | def network(input_sequences,h0,reset):
 36 | 
 37 |     batch_size = input_sequences.shape[0]
 38 | 
 39 |     learned_h0 = lib.param(
 40 |         'Session.h0',
 41 |         numpy.zeros(DIM, dtype=theano.config.floatX)
 42 |     )
 43 | 
 44 |     learned_h0 = T.alloc(learned_h0, h0.shape[0], DIM)
 45 |     h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
 46 | 
 47 |     emb = lib.ops.Embedding(
 48 |         'Embedding',
 49 |         Q_LEVELS,
 50 |         Q_LEVELS,
 51 |         input_sequences,
 52 |     ).transpose(0,2,1)[:,:,None,:] #(32, 256, 1, 8960)
 53 | 
 54 |     # conv1 = T.nnet.relu(lib.ops.conv1d("conv1",emb,3,1,128,256,bias=True,batchnorm=True)) #(32, 512, 1, 255)   - 289 - 31
 55 |     # conv2 = T.nnet.relu(lib.ops.conv1d("conv2",conv1,3,2,256,128,bias=True,batchnorm=True)) #(32, 512, 1, 127) - 143 - 15
 56 |     # conv3 = T.nnet.relu(lib.ops.conv1d("conv3",conv2,3,2,512,256,bias=True,batchnorm=True)) #(32, 512, 1, 63)  - 71  -  7
 57 |     # conv4 = T.nnet.relu(lib.ops.conv1d("conv4",conv3,3,2,1024,512,bias=True,batchnorm=True)) #(32, 512, 1, 31)  - 35  -  3
 58 |     # conv5 = T.nnet.relu(lib.ops.conv1d("conv5",conv4,3,2,2048,1024,bias=True,batchnorm=True)) #(32, 512, 1, 15) - 17  -  1
 59 |     start = lib.ops.ResNetConv1d("ResNet-Enc-0",emb,3,1,256,256,bias=True,batchnorm=True) # 8960 RF - 2
 60 |     rconv1 = lib.ops.ResNetConv1d("ResNet-Enc-1",start,3,2,128,256,bias=True,batchnorm=True) # 4480 RF - 5
 61 |     rconv2 = lib.ops.ResNetConv1d("ResNet-Enc-2",rconv1,3,2,128,128,bias=True,batchnorm=True) # 2240 RF - 11
 62 |     rconv3 = lib.ops.ResNetConv1d("ResNet-Enc-3",rconv2,3,2,128,128,bias=True,batchnorm=True) # 1120 RF - 23
 63 |     rconv4 = lib.ops.ResNetConv1d("ResNet-Enc-4",rconv3,3,2,256,128,bias=True,batchnorm=True) # 560 RF - 47
 64 |     rconv5 = lib.ops.ResNetConv1d("ResNet-Enc-5",rconv4,3,2,256,256,bias=True,batchnorm=True) # 280 RF - 95
 65 |     rconv6 = lib.ops.ResNetConv1d("ResNet-Enc-6",rconv5,3,2,256,256,bias=True,batchnorm=True) # 140 RF - 191
 66 |     rconv7 = lib.ops.ResNetConv1d("ResNet-Enc-7",rconv6,3,2,512,256,bias=True,batchnorm=True) #  70 RF - 383
 67 |     rconv8 = lib.ops.ResNetConv1d("ResNet-Enc-8",rconv7,3,2,512,512,bias=True,batchnorm=True) #  35 RF - 767
 68 | 
 69 |     #gru1 = lib.ops.myGRU('Encoder.GRU1',DIM,DIM,rconv7.transpose(2,0,3,1)[0][:,:15,:],h0=h0) # (32, 15, 512)
 70 |     gru1 = lib.ops.myGRU('Encoder.GRU1',DIM,DIM,rconv8.transpose(2,0,3,1)[0][:,:32,:],h0=h0) # (32, 15, 512)
 71 |     gru = gru1.transpose(0,2,1)[:,:,None,:] #(32, 512, 1, 15)
 72 |     #project = lib.ops.conv1d("Project.GRU",gru,1,1,4096,512,bias=True,batchnorm=True)
 73 | 
 74 |     rdeconv8 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-8",gru,3,2,512,512,bias=True,batchnorm=True)+rconv7[:,:,:,3:67]) # 64
 75 |     rdeconv7 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-7",rdeconv8,3,2,256,512,bias=True,batchnorm=True)+rconv6[:,:,:,9:137]) #128
 76 |     rdeconv6 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-6",rdeconv7,3,2,256,256,bias=True,batchnorm=True)+rconv5[:,:,:,21:277]) #256
 77 |     rdeconv5 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-5",rdeconv6,3,2,256,256,bias=True,batchnorm=True)+rconv4[:,:,:,45:557]) #512
 78 |     rdeconv4 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-4",rdeconv5,3,2,128,256,bias=True,batchnorm=True)+rconv3[:,:,:,93:1117]) #1024
 79 |     rdeconv3 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-3",rdeconv4,3,2,128,128,bias=True,batchnorm=True)+rconv2[:,:,:,189:2237]) #2048
 80 |     rdeconv2 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-2",rdeconv3,3,2,128,128,bias=True,batchnorm=True)+rconv1[:,:,:,381:4477]) #4096
 81 |     rdeconv1 = T.nnet.relu(lib.ops.ResNetDeconv1d("ResNet-Dec-1",rdeconv2,3,2,256,128,bias=True,batchnorm=True)+start[:,:,:,765:8957]) #8192
 82 |     rdeconv0 = lib.ops.ResNetDeconv1d("ResNet-Dec-0",rdeconv1,3,1,256,256,bias=True,batchnorm=True,act=False) #8192
 83 | 
 84 |     # deconv5 = T.nnet.relu(lib.ops.deconv1d("deconv5",gru,3,2,1024,2048,bias=True,batchnorm=True)+conv4[:,:,:,2:33]) # (32, 512, 1, 31)
 85 |     # deconv4 = T.nnet.relu(lib.ops.deconv1d("deconv4",deconv5,3,2,512,1024,bias=True,batchnorm=True)+conv3[:,:,:,6:69]) # (32, 512, 1, 63)
 86 |     # deconv3 = T.nnet.relu(lib.ops.deconv1d("deconv3",deconv4,3,2,256,512,bias=True,batchnorm=True)+conv2[:,:,:,14:141]) # (32, 512, 1, 127)
 87 |     # deconv2 = T.nnet.relu(lib.ops.deconv1d("deconv2",deconv3,3,2,128,256,bias=True,batchnorm=True)+conv1[:,:,:,30:285]) # (32, 512, 1, 255)
 88 |     # deconv1 = lib.ops.deconv1d("deconv1",deconv2,3,1,256,128,bias=True,batchnorm=True) # (32, 256, 1, 257)
 89 | 
 90 |     # output = rdeconv1[:,:,0,:].transpose(0,2,1)
 91 |     output = rdeconv0[:,:,0,:].transpose(0,2,1)
 92 |     return (gru[:,:,0,-1],output)
 93 | 
 94 | 
 95 | print "Model settings:"
 96 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
 97 | all_vars = sorted(all_vars, key=lambda x: x[0])
 98 | for var_name, var_value in all_vars:
 99 |     print "\t{}: {}".format(var_name, var_value)
100 | 
101 | sequences   = T.imatrix('sequences')
102 | h0          = T.fmatrix('h0')
103 | reset       = T.iscalar('reset')
104 | 
105 | input_sequences = sequences[:,]
106 | target_sequences = sequences[:,768:]
107 | 
108 | new_h0, predicted_sequences = network(input_sequences,h0,reset)
109 | cost = T.nnet.categorical_crossentropy(
110 |     T.nnet.softmax(predicted_sequences.reshape((-1,Q_LEVELS))),
111 |     target_sequences.flatten()
112 | ).mean()
113 | 
114 | # By default we report cross-entropy cost in bits.
115 | # Switch to nats by commenting out this line:
116 | cost = cost * lib.floatX(1.44269504089)
117 | 
118 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
119 | lib.print_params_info(cost, params)
120 | #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP)
121 | grads = T.grad(cost, wrt=params)
122 | updates = lasagne.updates.adam(grads, params, learning_rate=0.01)
123 | 
124 | print "Gradients Computed"
125 | 
126 | train_fn = theano.function(
127 |     [sequences, h0, reset],
128 |     [cost, new_h0,predicted_sequences],
129 |     updates=updates,
130 |     on_unused_input='warn'
131 | )
132 | 
133 | input_seq = T.imatrix()
134 | test_h0 = T.fmatrix()
135 | test_reset = T.iscalar()
136 | 
137 | test_new_h0,test_predict = network(input_seq,test_h0,test_reset)
138 | test_fn = theano.function(
139 |     [input_seq, test_h0, test_reset],
140 |     [test_new_h0,T.nnet.softmax(test_predict.reshape((-1,Q_LEVELS)))]
141 | )
142 | 
def generate_and_save_samples(tag,seed_h0):
    """Generate audio frame by frame with `test_fn` and write WAV files.

    Parameters
    ----------
    tag : value formatted into the output file names
        (``sample_<tag>_<i>.wav``).
    seed_h0 : initial hidden state passed to `test_fn`; ``None`` selects an
        all-zero state of shape ``(N_SEQS, DIM)``.
    """
    FRAME = 31  # samples consumed and produced per `test_fn` call

    def write_audio_file(name, data):
        """Min/max-normalise `data` to [-0.475, 0.475] and write `name`.wav."""
        data = data.astype('float32')
        data -= data.min()
        peak = data.max()
        # A constant signal has peak == 0 here; dividing would produce NaNs,
        # so it is left as all zeros (silence).
        if peak > 0:
            data /= peak
            data -= 0.5
            data *= 0.95
        scipy.io.wavfile.write(name+'.wav',BITRATE,data)

    # Generate N_SEQS sample files, each roughly 8 seconds long.
    N_SEQS = 32
    LENGTH = 8*BITRATE
    # Round up to a whole number of frames so the loop below fills every
    # sample; unfilled zeros would distort the min/max normalisation.
    LENGTH += -LENGTH % FRAME

    samples = numpy.zeros((N_SEQS, LENGTH), dtype='int32')
    samples[:, :SEQ_LEN] = Q_ZERO

    if seed_h0 is None:
        h0 = numpy.zeros((N_SEQS, DIM), dtype='float32')
    else:
        h0 = seed_h0

    for t in range(FRAME, LENGTH, FRAME):
        # Predict the next frame from the previous one, carrying the state.
        h0,probs = test_fn(samples[:,t-FRAME:t],h0,0)
        probs = probs.reshape((N_SEQS,FRAME,Q_LEVELS))
        samples[:,t:t+FRAME] = np.argmax(probs,axis=2)
        print(t)

    for i in range(N_SEQS):
        write_audio_file("sample_{}_{}".format(tag, i), samples[i])
179 | 
180 | 
181 | #grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
182 | 
183 | print "Training!"
184 | total_iters = 0
185 | 
186 | for epoch in xrange(NB_EPOCH):
187 |     h0 = np.zeros((BATCH_SIZE, DIM)).astype(theano.config.floatX)
188 |     costs = []
189 |     times = []
190 |     data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO))
191 | #    seqs = data_feeder[20][0]
192 | #    reset = data_feeder[20][1]
193 | 
194 |     for seqs, reset in data_feeder:
195 |     # while True:
196 |         start_time = time.time()
197 |         cost, h0, _ = train_fn(seqs, h0, reset)
198 |         total_time = time.time() - start_time
199 |         times.append(total_time)
200 |         total_iters += 1
201 |         print "Batch ",total_iters
202 |         costs.append(cost)
203 |         print "\tCost: ",np.mean(costs)
204 |         print "\tTime: ",np.mean(times)
205 | #        if total_iters%500==0:
206 | #            generate_and_save_samples('iterno_%d'%total_iters)
207 | 


--------------------------------------------------------------------------------
/vctk_dataset.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import scipy.io.wavfile
  3 | import scikits.audiolab
  4 | import scipy.signal
  5 | import random
  6 | import time
  7 | import numpy as np
  8 | import glob
  9 | 
 10 | 
 11 | random_seed = 123
 12 | 
 13 | def feed_epoch(speaker_id,BATCH_SIZE, SEQ_LEN, STRIDE, RF=1025, N_FILES=None):
 14 |     global random_seed
 15 |     def process_wav(desired_sample_rate, filename, use_ulaw):
 16 |         channels = scipy.io.wavfile.read(filename)
 17 |         file_sample_rate, audio = channels
 18 |         audio = ensure_mono(audio)
 19 |         audio = wav_to_float(audio)
 20 |         if use_ulaw:
 21 |             audio = ulaw(audio)
 22 |         audio = ensure_sample_rate(desired_sample_rate, file_sample_rate, audio)
 23 |         audio = float_to_uint8(audio)
 24 |         return audio
 25 | 
 26 | 
 27 |     def ulaw(x, u=255):
 28 |         x = np.sign(x) * (np.log(1 + u * np.abs(x)) / np.log(1 + u))
 29 |         return x
 30 | 
 31 | 
 32 |     def float_to_uint8(x):
 33 |         x += 1.
 34 |         x /= 2.
 35 |         uint8_max_value = np.iinfo('uint8').max
 36 |         x *= uint8_max_value
 37 |         x = x.astype('uint8')
 38 |         return x
 39 | 
 40 | 
 41 |     def wav_to_float(x):
 42 |         try:
 43 |             max_value = np.iinfo(x.dtype).max
 44 |             min_value = np.iinfo(x.dtype).min
 45 |         except:
 46 |             max_value = np.finfo(x.dtype).max
 47 |             min_value = np.finfo(x.dtype).min
 48 |         x = x.astype('float32', casting='safe')
 49 |         x -= min_value
 50 |         x /= ((max_value - min_value) / 2.)
 51 |         x -= 1.
 52 |         return x
 53 | 
 54 | 
 55 |     def ulaw2lin(x, u=255.):
 56 |         max_value = np.iinfo('uint8').max
 57 |         min_value = np.iinfo('uint8').min
 58 |         x = x.astype('float64', casting='safe')
 59 |         x -= min_value
 60 |         x /= ((max_value - min_value) / 2.)
 61 |         x -= 1.
 62 |         x = np.sign(x) * (1 / u) * (((1 + u) ** np.abs(x)) - 1)
 63 |         x = float_to_uint8(x)
 64 |         return x
 65 | 
 66 |     def ensure_sample_rate(desired_sample_rate, file_sample_rate, mono_audio):
 67 |         if file_sample_rate != desired_sample_rate:
 68 |             mono_audio = scipy.signal.resample_poly(mono_audio, desired_sample_rate, file_sample_rate)
 69 |         return mono_audio
 70 | 
 71 | 
 72 |     def ensure_mono(raw_audio):
 73 |         """
 74 |         Just use first channel.
 75 |         """
 76 |         if raw_audio.ndim == 2:
 77 |             raw_audio = raw_audio[:, 0]
 78 |         return raw_audio
 79 | 
 80 |     DATA_PATH = "/tmp/kumarrit/vctk/VCTK-Corpus/wav48/p" + str(speaker_id) + "/*"
 81 |     paths = glob.glob(DATA_PATH)
 82 |     if N_FILES:
 83 |         paths = paths[:N_FILES]
 84 |     random_seed += 1
 85 |     batches = []
 86 |     for i in xrange(len(paths) / BATCH_SIZE):
 87 |         batches.append(paths[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
 88 |     random.shuffle(batches)
 89 |     for batch_paths in batches:
 90 |         data = []
 91 |         for fname in batch_paths:
 92 |             data.append(process_wav(16000,fname,True))
 93 |         max_len = max([len(vec) for vec in data])
 94 |         for i in xrange(len(data)):
 95 |             data[i] = np.hstack((data[i],np.full(max_len-len(data[i]),128,dtype=np.uint8)))
 96 |         data = np.asarray(data).astype(np.uint8)
 97 |         for i in xrange(0,data.shape[1]-RF-STRIDE,STRIDE):
 98 |             start = i
 99 |             end = i+RF+STRIDE
100 |             subbatch = data[:, start : end]
101 |             yield (subbatch,reset)
102 | 


--------------------------------------------------------------------------------
/vctk_dataset.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ritheshkumar95/WaveNet/569cc7569c501356d8633168acd96d80caab0c8f/vctk_dataset.pyc


--------------------------------------------------------------------------------
/wavenet.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | sys.setrecursionlimit(10000)
  3 | import numpy as np
  4 | import numpy
  5 | numpy.random.seed(123)
  6 | import random
  7 | random.seed(123)
  8 | import dataset
  9 | import theano
 10 | import theano.tensor as T
 11 | theano.config.floatX='float32'
 12 | import lib.ops
 13 | import scipy.io.wavfile
 14 | import time
 15 | import lasagne
 16 | import vctk_dataset
 17 | import tqdm
 18 | from tqdm import tqdm
 19 | import new_dataset
 20 | 
 21 | # Hyperparams
 22 | NB_EPOCH=200
 23 | BATCH_SIZE = 8
 24 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
 25 | DATA_PATH = '/data/lisatmp3/kumarrit/blizzard'
 26 | N_FILES = 8192
 27 | BITRATE = 16000
 28 | GRAD_CLIP=1
 29 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude
 30 | Q_TYPE='linear'
 31 | 
 32 | N_BLOCKS=5
 33 | DILATION_DEPTH=10
 34 | RF=N_BLOCKS*(2**(DILATION_DEPTH))-N_BLOCKS+2
 35 | n_filters=64
 36 | 
 37 | #FRAME_SIZE = RF # How many samples per frame
 38 | #SEQ_LEN=2*RF
 39 | OVERLAP=RF
 40 | SEQ_LEN=1600
 41 | 
 42 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO))
 43 | 
 44 | def network(input_sequences):
 45 |     batch_size = input_sequences.shape[0]
 46 |     length = input_sequences.shape[1]
 47 |     dilations = np.asarray([[2**i for i in xrange(DILATION_DEPTH)]*N_BLOCKS]).tolist()[0]
 48 |     #skip_weights = lib.param("scaling_weights", numpy.ones(len(dilations)).astype('float32'))
 49 | 
 50 |     #start = T.extra_ops.to_one_hot(input_sequences.flatten(),nb_class=256).reshape((batch_size,length,256)).transpose(0,2,1)[:,:,None,:]
 51 |     start =  (input_sequences.astype('float32')/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5))[:,None,:,None]
 52 |     conv1 = lib.ops.conv1d("causal-conv",start,2,1,n_filters,1,bias=True,batchnorm=False,pad=(1,0))[:,:,:length,:]
 53 |     prev_conv = conv1
 54 |     #prev_skip = []
 55 |     prev_skip = T.zeros((batch_size,n_filters,length,1))
 56 |     for i,value in enumerate(dilations):
 57 |         prev_conv,y = lib.ops.WaveNetConv1d("Block-%d"%(i+1),prev_conv,2,n_filters,n_filters,bias=False,batchnorm=False,dilation=value)
 58 |         #prev_skip += y*skip_weights[i]
 59 |         prev_skip += y
 60 |         #prev_skip += [y]
 61 | 
 62 |     #out = T.nnet.relu(T.sum(prev_skip,axis=0))
 63 |     out = T.nnet.relu(prev_skip)
 64 |     #out = prev_skip
 65 |     out = T.nnet.relu(lib.ops.conv1d("Output.1",out,1,1,n_filters,n_filters,bias=True,batchnorm=False))
 66 |     out = T.nnet.relu(lib.ops.conv1d("Output.2",out,1,1,n_filters,n_filters,bias=True,batchnorm=False))
 67 |     out = T.nnet.relu(lib.ops.conv1d("Output.3",out,1,1,n_filters,n_filters,bias=True,batchnorm=False))
 68 | 
 69 |     out = lib.ops.conv1d("Output.4",out,1,1,256,n_filters,bias=True,batchnorm=False)
 70 | 
 71 |     return out[:,:,RF-1:,0].transpose(0,2,1).reshape((-1,Q_LEVELS))
 72 | 
 73 | print "Model settings:"
 74 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
 75 | all_vars = sorted(all_vars, key=lambda x: x[0])
 76 | for var_name, var_value in all_vars:
 77 |     print "\t{}: {}".format(var_name, var_value)
 78 | 
 79 | sequences   = T.imatrix('sequences')
 80 | input_sequences = sequences[:,:-1]
 81 | target_sequences = sequences[:,RF:]
 82 | 
 83 | predicted_sequences = T.nnet.softmax(network(input_sequences))
 84 | 
 85 | #lib.load_params('iter_latest_wavenet.p')
 86 | cost = T.nnet.categorical_crossentropy(
 87 |     predicted_sequences,
 88 |     target_sequences.flatten()
 89 | ).mean()
 90 | 
 91 | # By default we report cross-entropy cost in bits.
 92 | # Switch to nats by commenting out this line:
 93 | cost = cost * lib.floatX(1.44269504089)
 94 | 
 95 | params = lib.search(cost, lambda x: hasattr(x, 'param'))
 96 | lib.print_params_info(cost, params)
 97 | #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP)
 98 | grads = T.grad(cost, wrt=params)
 99 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
100 | 
101 | lr = T.fscalar()
102 | updates = lasagne.updates.adam(grads, params, learning_rate=lr)
103 | 
104 | print "Gradients Computed"
105 | 
106 | train_fn = theano.function(
107 |     [sequences,lr],
108 |     [cost],
109 |     updates=updates,
110 |     on_unused_input='warn'
111 | )
112 | 
113 | print "Compiled Train Function"
114 | 
115 | test_fn = theano.function(
116 |     [sequences],
117 |     [cost],
118 |     on_unused_input='warn'
119 | )
120 | 
121 | print "Compiled Test Function"
122 | 
123 | generate_fn = theano.function(
124 |     [sequences],
125 |     [lib.ops.softmax_and_sample(network(sequences))],
126 |     on_unused_input='warn'
127 | )
128 | 
129 | print "Compiled Generate Function"
130 | 
def generate(generate_fn):
    """Sample audio one step at a time with `generate_fn` and write WAVs.

    `generate_fn` is called with the previous `RF` samples of every
    sequence, and element ``[0]`` of its result is stored as the next
    sample. `N_SEQS` files named ``sample_test_iter_<i>.wav`` are written,
    each without the initial `RF` seed samples.
    """
    tag = 'test_iter'
    N_SEQS = 8
    LENGTH = 3*BITRATE
    # Every sequence starts filled with the zero-amplitude level.
    samples = numpy.full((N_SEQS, LENGTH), fill_value = Q_ZERO, dtype=np.uint8)

    def write_audio_file(name, data):
        """Min/max-normalise `data` to [-0.475, 0.475] and write `name`.wav."""
        data = data.astype('float32')
        data -= data.min()
        peak = data.max()
        # A constant signal has peak == 0 here; dividing would produce NaNs,
        # so it is left as all zeros (silence).
        if peak > 0:
            data /= peak
            data -= 0.5
            data *= 0.95
        scipy.io.wavfile.write(name+'.wav',BITRATE,data)

    print("File loaded")

    for t in range(RF, LENGTH):
        samples[:,t] = generate_fn(samples[:,t-RF:t])[0]
        print("%s %s" % (t, samples[:,t]))

    for i in range(N_SEQS):
        write_audio_file("sample_{}_{}".format(tag, i), samples[i][RF:])
161 | 
162 | print "Training!"
163 | for epoch in xrange(1,NB_EPOCH):
164 |     costs = []
165 |     times = []
166 |     #data_feeder = dataset.blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES)
167 |     #data_feeder = vctk_dataset.feed_epoch(225, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES)
168 |     data_feeder = new_dataset.blizz_train_feed_epoch(BATCH_SIZE,SEQ_LEN,OVERLAP,Q_LEVELS,Q_ZERO,Q_TYPE)
169 |     print "Epoch : ",epoch
170 |     total_iters = 0
171 |     for seqs,reset,mask in tqdm(data_feeder):
172 |         total_iters += 1
173 |         start_time = time.time()
174 |         cost,pred = train_fn(seqs,0.001)
175 |         total_time = time.time() - start_time
176 |         costs.append(cost)
177 |         times.append(total_time)
178 |         if total_iters%1000==0:
179 |             print "\tCost : ", np.mean(costs)
180 |             print "\tTime : ", np.mean(times)
181 | 
182 |     print "\tCost : ", np.mean(costs)
183 |     print "\tTime : ", np.mean(times)
184 |     #if epoch%50==0:
185 |     #    generate()
186 | 


--------------------------------------------------------------------------------
/wavenet_controller.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import, print_function
  2 | import os
  3 | import sys
  4 | import time
  5 | 
  6 | import numpy
  7 | 
  8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
  9 | from platoon.channel import Controller
 10 | 
 11 | 
 12 | class WaveNetController(Controller):
 13 |     def __init__(self, max_mb,saveFreq,default_args):
 14 |         """
 15 |         Initialize the WaveNetController
 16 |         Parameters
 17 |         ----------
 18 |         max_mb : int
 19 |             Max number of minibatches to train on.
 20 |         patience: : int
 21 |             Training stops when this many minibatches have been trained on
 22 |             without any reported improvement.
 23 |         valid_freq : int
 24 |             Number of minibatches to train on between every monitoring step.
 25 |         default_args : dict
 26 |             Arguments of default class Controller
 27 |         """
 28 | 
 29 |         super(WaveNetController, self).__init__(**default_args)
 30 |         self.max_mb = int(max_mb)
 31 | 
 32 |         self.uidx = 0
 33 |         self.eidx = 0
 34 |         self.saveFreq = saveFreq
 35 | 
 36 |         self._save_params = False
 37 |         self.start_time = None
 38 |         self._should_stop = False
 39 | 
 40 |     def handle_control(self, req, worker_id, req_info):
 41 |         """
 42 |         Handles a control_request received from a worker
 43 |         Parameters
 44 |         ----------
 45 |         req : str or dict
 46 |             Control request received from a worker.
 47 |             The control request can be one of the following
 48 |             1) "next" : request by a worker to be informed of its next action
 49 |                to perform. The answers from the server can be 'train' (the
 50 |                worker should keep training on its training data), 'valid' (the
 51 |                worker should perform monitoring on its validation set and test
 52 |                set) or 'stop' (the worker should stop training).
 53 |             2) dict of format {"done":N} : used by a worker to inform the
 54 |                 server that is has performed N more training iterations and
 55 |                 synced its parameters. The server will respond 'stop' if the
 56 |                 maximum number of training minibatches has been reached.
 57 |             3) dict of format {"valid_err":x, "test_err":x2} : used by a worker
 58 |                 to inform the server that it has performed a monitoring step
 59 |                 and obtained the included errors on the monitoring datasets.
 60 |                 The server will respond "best" if this is the best reported
 61 |                 validation error so far, otherwise it will respond 'stop' if
 62 |                 the patience has been exceeded.
 63 |         """
 64 |         control_response = ""
 65 | 
 66 |         if req == 'next':
 67 |             if not self._should_stop:
 68 |                 if self.start_time is None:
 69 |                     self.start_time = time.time()
 70 |                 if self._save_params:
 71 |                     control_response = 'save'
 72 |                 else:
 73 |                     control_response = 'train'
 74 |             else:
 75 |                 control_response = 'stop'
 76 |         elif req == 'done':
 77 |             self.uidx += req_info['train_len']
 78 |             if self.uidx%self.saveFreq==0:
 79 |                 self._save_params=True
 80 | 
 81 |         elif req == 'saved':
 82 |             self._save_params=False
 83 | 
 84 |         if self.uidx > self.max_mb:
 85 |             if not self._should_stop:
 86 |                 print("Training time {:.4f}s".format(time.time() - self.start_time))
 87 |                 print("Number of samples:", self.uidx)
 88 |             ##NEVER STOPPING!
 89 |             self._should_stop = False
 90 | 
 91 |         return control_response
 92 | 
 93 | 
 94 | def wavenet_control(saveFreq=1110, saveto=None):
 95 |     parser = Controller.default_parser()
 96 |     parser.add_argument('--max-mb', default=((5000 * 1998) / 10), type=int,
 97 |                         required=False, help='Maximum mini-batches to train upon in total.')
 98 | 
 99 |     args = parser.parse_args()
100 | 
101 |     l = WaveNetController(max_mb=10000,saveFreq=1000,
102 |                        default_args=Controller.default_arguments(args))
103 | 
104 |     print("Controller is ready")
105 |     return l.serve()
106 | 
107 | if __name__ == '__main__':
108 |     rcode = wavenet_control()
109 |     if rcode != 0:
110 |         sys.exit(rcode)
111 | 


--------------------------------------------------------------------------------
/wavenet_worker.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | sys.setrecursionlimit(10000)
  3 | import numpy as np
  4 | import numpy
  5 | numpy.random.seed(123)
  6 | import random
  7 | random.seed(123)
  8 | import dataset
  9 | import theano
 10 | import theano.tensor as T
 11 | theano.config.floatX='float32'
 12 | import lib.ops
 13 | import scipy.io.wavfile
 14 | import time
 15 | import lasagne
 16 | from six import iteritems
 17 | from platoon.channel import Worker
 18 | from platoon.param_sync import EASGD
 19 | import argparse
 20 | import pickle
 21 | import new_dataset
 22 | from model import network
 23 | 
 24 | worker = None
 25 | # Hyperparams
 26 | NB_EPOCH=200
 27 | BATCH_SIZE = 8
 28 | Q_LEVELS = 256 # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
 29 | DATA_PATH = '/data/lisatmp3/kumarrit/blizzard'
 30 | N_FILES = 8192
 31 | BITRATE = 16000
 32 | GRAD_CLIP=1
 33 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude
 34 | Q_TYPE='linear'
 35 | 
 36 | N_BLOCKS=5
 37 | DILATION_DEPTH=10
 38 | RF=N_BLOCKS*(2**(DILATION_DEPTH))-N_BLOCKS+2
 39 | n_filters=64
 40 | 
 41 | #FRAME_SIZE = RF # How many samples per frame
 42 | #SEQ_LEN=2*RF
 43 | OVERLAP=RF
 44 | SEQ_LEN=1600
 45 | 
 46 | N_GPUS=4
 47 | alpha = 1./N_GPUS
 48 | #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO))
 49 | 
 50 | def floatX(arr):
 51 |     return np.asarray(arr, dtype=theano.config.floatX)
 52 | 
 53 | def adam(lr, tparams, grads, sequences, cost, epsilon=1e-8,beta1=0.9,beta2=0.999):
 54 | 
 55 |     zipped_grads = [lib.param('%s_grad' % k,p.get_value() * floatX(0.))
 56 |                     for k, p in iteritems(tparams)]
 57 |     running_grads = [lib.param('%s_rgrad' % k,p.get_value() * floatX(0.))
 58 |                      for k, p in iteritems(tparams)]
 59 |     running_grads2 = [lib.param('%s_rgrad2' % k,p.get_value() * floatX(0.))
 60 |                       for k, p in iteritems(tparams)]
 61 | 
 62 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
 63 |     rgup = [(rg, beta1 * rg + (1-beta1) * g) for rg, g in zip(running_grads, grads)]
 64 |     rg2up = [(rg2, beta2 * rg2 + (1-beta2) * (g ** 2)) for rg2, g in zip(running_grads2, grads)]
 65 | 
 66 |     t_prev = lib.param('t_prev',floatX(0.))
 67 |     one = T.constant(1)
 68 |     t = t_prev+1
 69 |     a_t = lr*T.sqrt(1-beta2**t)/(1-beta1**t)
 70 | 
 71 |     f_grad_shared = theano.function([sequences], cost,
 72 |                                     updates=zgup + rgup + rg2up,
 73 |                                     name='adam_f_grad_shared')
 74 | 
 75 |     updir = [lib.param('%s_updir' % k,p.get_value() * floatX(0.))
 76 |              for k, p in iteritems(tparams)]
 77 | 
 78 |     updir_new = [(ud, a_t * rg / T.sqrt(rg2 + epsilon))
 79 |                  for ud, rg, rg2 in zip(updir, running_grads, running_grads2)]
 80 |     param_up = [(p, p - udn[1])
 81 |                 for p, udn in zip(tparams.values(), updir_new)]
 82 |     f_update = theano.function([lr], [], updates=updir_new + param_up + [(t_prev,t)] ,
 83 |                                on_unused_input='ignore',
 84 |                                name='adam_f_update')
 85 | 
 86 |     return f_grad_shared, f_update
 87 | 
 88 | print "Model settings:"
 89 | all_vars = [(k,v) for (k,v) in locals().items() if (k.isupper() and k != 'T')]
 90 | all_vars = sorted(all_vars, key=lambda x: x[0])
 91 | for var_name, var_value in all_vars:
 92 |     print "\t{}: {}".format(var_name, var_value)
 93 | 
 94 | def build_model(worker,train_len=100,param_sync_api=True):
 95 |     sequences   = T.imatrix('sequences')
 96 |     input_sequences = sequences[:,:-1]
 97 |     target_sequences = sequences[:,RF:]
 98 | 
 99 |     # def network_my(input_sequences):
100 |     #     batch_size = input_sequences.shape[0]
101 |     #     length = input_sequences.shape[1]
102 |     #     dilations = np.asarray([[2**i for i in xrange(DILATION_DEPTH)]*N_BLOCKS]).tolist()[0]
103 |     #     #skip_weights = lib.param("scaling_weights", numpy.ones(len(dilations)).astype('float32'))
104 |     #
105 |     #     #start = T.extra_ops.to_one_hot(input_sequences.flatten(),nb_class=256).reshape((batch_size,length,256)).transpose(0,2,1)[:,:,None,:]
106 |     #     start =  (input_sequences.astype('float32')/lib.floatX(Q_LEVELS-1) - lib.floatX(0.5))[:,None,:,None]
107 |     #     conv1 = lib.ops.conv1d("causal-conv",start,2,1,n_filters,1,bias=True,batchnorm=False,pad=(1,0))[:,:,:length,:]
108 |     #     prev_conv = conv1
109 |     #     #prev_skip = []
110 |     #     prev_skip = T.zeros((batch_size,n_filters,length,1))
111 |     #     for i,value in enumerate(dilations):
112 |     #         prev_conv,y = lib.ops.WaveNetConv1d("Block-%d"%(i+1),prev_conv,2,n_filters,n_filters,bias=False,batchnorm=False,dilation=value)
113 |     #         #prev_skip += y*skip_weights[i]
114 |     #         prev_skip += y
115 |     #         #prev_skip += [y]
116 |     #
117 |     #     #out = T.nnet.relu(T.sum(prev_skip,axis=0))
118 |     #     out = T.nnet.relu(prev_skip)
119 |     #     #out = prev_skip
120 |     #     out = T.nnet.relu(lib.ops.conv1d("Output.1",out,1,1,n_filters,n_filters,bias=True,batchnorm=False))
121 |     #     out = T.nnet.relu(lib.ops.conv1d("Output.2",out,1,1,n_filters,n_filters,bias=True,batchnorm=False))
122 |     #     out = T.nnet.relu(lib.ops.conv1d("Output.3",out,1,1,n_filters,n_filters,bias=True,batchnorm=False))
123 |     #
124 |     #     out = lib.ops.conv1d("Output.4",out,1,1,256,n_filters,bias=True,batchnorm=False)
125 |     #
126 |     #     return out[:,:,RF-1:,0].transpose(0,2,1).reshape((-1,Q_LEVELS))
127 | 
128 |     predicted_sequences = T.nnet.softmax(network(input_sequences))
129 |     #lib.load_params('iter_latest_wavenet.p')
130 |     cost = T.nnet.categorical_crossentropy(
131 |         predicted_sequences,
132 |         target_sequences.flatten()
133 |     ).mean()
134 | 
135 |     cost = cost * lib.floatX(1.44269504089)
136 | 
137 | 
138 |     params = lib.search(cost, lambda x: hasattr(x, 'param'))
139 |     tparams = {p.name:p for p in params}
140 | 
141 |     copy_params = lambda tparams: {x:theano.shared(y.get_value(),name=x) for x,y in tparams.iteritems()}
142 | 
143 |     lib.print_params_info(cost, params)
144 |     #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP)
145 | 
146 |     list_tparams = list(tparams.values())
147 | 
148 |     if param_sync_api:
149 |         worker.init_shared_params(list_tparams, param_sync_rule=EASGD(alpha))
150 |     else:
151 |         from platoon.training import global_dynamics as gd
152 |         cparams = copy_params(tparams)
153 |         list_cparams = list(cparams.values())
154 |         easgd = gd.EASGD(worker)
155 |         easgd.make_rule(list_tparams, list_cparams, alpha)
156 | 
157 | 
158 |     grads = T.grad(cost, wrt=list_tparams)
159 |     grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads]
160 | 
161 |     lr = T.fscalar()
162 | 
163 |     f_grad_shared,f_update = adam(lr,tparams,grads,sequences,cost)
164 | 
165 |     def save_params(_params,path):
166 |         param_vals = {}
167 |         for name, param in _params.iteritems():
168 |             param_vals[name] = param.get_value()
169 | 
170 |         with open(path, 'wb') as f:
171 |             pickle.dump(param_vals, f)
172 | 
173 |     if param_sync_api:
174 |         worker.copy_to_local()
175 | 
176 |     costs = []
177 |     #data_feeder = dataset.blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES, True,worker.global_rank())
178 |     new_dataset.random_seed = worker._worker_id
179 |     data_feeder = new_dataset.blizz_train_feed_epoch(BATCH_SIZE,SEQ_LEN,OVERLAP,Q_LEVELS,Q_ZERO,Q_TYPE)
180 | 
181 |     iter_count=0
182 |     while True:
183 |         step = worker.send_req('next')
184 | 
185 |         if step == 'train':
186 |             for i in range(train_len):
187 |                 try:
188 |                     seqs,reset,mask = next(data_feeder)
189 |                 except StopIteration:
190 |                     #data_feeder = dataset.blizzard_feed_epoch(BATCH_SIZE, SEQ_LEN, FRAME_SIZE, RF, N_FILES, True,worker.global_rank())
191 |                     new_dataset.random_seed = worker._worker_id
192 |                     data_feeder = new_dataset.blizz_train_feed_epoch(BATCH_SIZE,SEQ_LEN,OVERLAP,Q_LEVELS,Q_ZERO,Q_TYPE)
193 |                     seqs,reset,mask = next(data_feeder)
194 |                     print('Train cost:', np.mean(costs))
195 |                     costs = []
196 | 
197 |                 costs.append(f_grad_shared(seqs))
198 |                 f_update(0.001)
199 |                 iter_count += 1
200 | 
201 |             step = worker.send_req('done', {'train_len': train_len})
202 |             if iter_count%5000==0:
203 |                 print('Train cost:',np.mean(costs))
204 | 
205 |             if param_sync_api:
206 |                 #print("Syncing with global params")
207 |                 worker.sync_params(synchronous=True)
208 |             else:
209 |                 easgd()
210 | 
211 |         if step=='save':
212 |             if param_sync_api:
213 |                 save_params(tparams,"worker_%d.p"%worker._worker_id)
214 |                 step = worker.send_req('saved')
215 |                 print('Saving now')
216 |             else:
217 |                 save_params(cparams,"worker%d.p"%worker._worker_id))
218 |                 step = worker.send_req('saved')
219 | 
220 |         if step == 'stop':
221 |             break
222 | 
223 |     # Release all shared resources.
224 |     worker.close()
225 | 
if __name__ == '__main__':
    # Script entry point: parse the command-line flags, construct the worker
    # from them, then hand control to build_model, which runs the training
    # loop until the controller answers 'stop'.
    #
    # No `global worker` statement is needed here: names assigned at module
    # scope are already module globals.
    parser = Worker.default_parser()
    parser.add_argument('--valid_sync', dest='valid_sync', action='store_true', default=False)
    # The parameter-sync API is enabled by default. A `store_true` flag whose
    # default is already True can never be switched off, so the complementary
    # `--no-param-sync-api` flag is what selects the other code path.
    parser.add_argument('--param-sync-api', dest='param_sync_api', action='store_true', default=True)
    parser.add_argument('--no-param-sync-api', dest='param_sync_api', action='store_false')
    args = parser.parse_args()

    # Per-worker seeding of the data feeder happens inside build_model
    # (it sets new_dataset.random_seed from worker._worker_id), so nothing
    # is seeded here.
    worker = Worker(**Worker.default_arguments(args))

    build_model(worker, train_len=10, param_sync_api=args.param_sync_api)
238 | 


--------------------------------------------------------------------------------