├── CHRNN_HF
    ├── datasets
    │   ├── TIMIT
    │   │   └── _2npy_hf.py
    │   ├── __init__.py
    │   └── dataset.py
    ├── lib
    │   ├── __init__.py
    │   └── ops.py
    ├── models
    │   └── four_tier
    │   │   ├── four_tier_generation.py
    │   │   └── four_tier_train_valid.py
    └── readme.md
├── HRNN_HF
    ├── datasets
    │   ├── TIMIT
    │   │   └── _2npy_hf.py
    │   ├── __init__.py
    │   └── dataset.py
    ├── lib
    │   ├── __init__.py
    │   └── ops.py
    ├── models
    │   └── three_tier
    │   │   ├── three_tier_generation.py
    │   │   └── three_tier_train_valid.py
    └── readme.md
└── README.md


/CHRNN_HF/datasets/TIMIT/_2npy_hf.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import librosa
  3 | import random
  4 | import os
  5 | import glob
  6 | import math
  7 | 
  8 | __RAND_SEED = 123
  9 | def ReadFloatRawMat(datafile,column):
 10 | 	data = np.fromfile(datafile,dtype=np.float32)
 11 | 	if len(data)%column!=0:
 12 | 		print 'ReadFloatRawMat %s, column wrong!'%datafile
 13 | 		exit()
 14 | 	if len(data)==0:
 15 | 		print 'empty file: %s'%datafile
 16 | 		exit()
 17 | 	data.shape = [len(data)/column,column]
 18 | 	return np.float32(data)
 19 | 
 20 | def __fixed_shuffle(inp_list):
 21 |     if isinstance(inp_list, list):
 22 |         random.seed(__RAND_SEED)
 23 |         random.shuffle(inp_list)
 24 |         return
 25 |     if isinstance(inp_list, np.ndarray):
 26 |         np.random.seed(__RAND_SEED)
 27 |         np.random.shuffle(inp_list)
 28 |         return
 29 | 
 30 |     raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+type(inp_list))
 31 | 
 32 | def clip_times(audio, times):
 33 | 
 34 |     audio = audio * times
 35 |     audio[audio > 1] = 1
 36 |     audio[audio < -1] = -1
 37 |     return audio
 38 | 
 39 | def wav2npy(data_path,con_data_path,save_path,name,fixed_shuffle=True,sample_rate=16000,frame_len=160,con_dim=100):
 40 | 	paths = sorted(glob.glob(data_path+"/*.wav"))
 41 | 	if name=='test':
 42 | 		fid=open(save_path+'/'+'test_list.scp','w')
 43 | 		for i in xrange(len(paths)):
 44 | 			fid.write(paths[i].split('/')[-1]+'\n')
 45 | 		fid.close()
 46 | 	con_paths=sorted(glob.glob(con_data_path+"/*.dat"))
 47 | 	if fixed_shuffle:
 48 | 		__fixed_shuffle(paths)
 49 | 		__fixed_shuffle(con_paths)
 50 | 	for i,path in enumerate(paths):
 51 | 		print i
 52 | 		print path
 53 | 		print con_paths[i]
 54 | 		audio16k, _ = librosa.load(path, sr=sample_rate, mono=True)
 55 | 		audio8k = librosa.core.resample(audio16k,sample_rate,sample_rate/2)
 56 | 		audio8k = librosa.core.resample(audio8k,sample_rate/2,sample_rate)
 57 | 		condition=ReadFloatRawMat(con_paths[i],1).reshape(1,-1)
 58 | 
 59 | 		if(len(audio8k)==len(audio16k)):
 60 | 			pass
 61 | 		elif(len(audio8k)>len(audio16k)):
 62 | 			audio8k=audio8k[0:len(audio16k)]
 63 | 		else:
 64 | 			audio16k=audio16k[0:len(audio8k)]
 65 | 
 66 | 		audio_up=audio16k-audio8k
 67 | 		audio_up = clip_times(audio_up, 3)
 68 | 
 69 | 		if len(audio8k)>condition.shape[1]/con_dim*frame_len:
 70 | 			diff=len(audio8k)-condition.shape[1]/con_dim*frame_len
 71 | 			audio8k=audio8k[:-diff]
 72 | 			audio_up=audio_up[:-diff]
 73 | 		elif len(audio8k)<condition.shape[1]/con_dim*frame_len:
 74 | 			diff=condition.shape[1]/con_dim*frame_len-len(audio8k)
 75 | 			audio8k=audio8k[:-(int(math.ceil(float(diff)/frame_len))*frame_len-diff)]
 76 | 			audio_up=audio_up[:-(int(math.ceil(float(diff)/frame_len))*frame_len-diff)]
 77 | 			condition=condition[:,:-int(math.ceil(float(diff)/frame_len))*con_dim]
 78 | 		else:
 79 | 			pass
 80 | 
 81 | 		if i==0:
 82 | 			max_len=len(audio_up)
 83 | 			max_con_len=condition.shape[1]
 84 | 			audio_mat_up=np.array(audio_up,dtype='float32').reshape(1,len(audio_up))
 85 | 			audio_mat8k=np.array(audio8k,dtype='float32').reshape(1,len(audio8k))
 86 | 			mask=np.ones(audio_mat_up.shape,dtype='float32')
 87 | 			con_mat=condition
 88 | 		else:
 89 | 			current_len=len(audio_up)
 90 | 			current_con_len=condition.shape[1]
 91 | 			if current_con_len>max_con_len:
 92 | 				con_mat=np.pad(con_mat,[[0,0],[0,current_con_len-max_con_len]],'constant')
 93 | 				con_mat=np.concatenate((con_mat,condition),axis=0)
 94 | 				max_con_len=current_con_len
 95 | 			else:
 96 | 				con_mat=np.concatenate((con_mat,np.pad(condition,[[0,0],[0,max_con_len-current_con_len]],'constant')),axis=0)
 97 | 			if current_len>max_len:
 98 | 				audio_mat_up=np.pad(audio_mat_up,[[0,0],[0,current_len-max_len]],'constant')
 99 | 				audio_mat_up=np.concatenate((audio_mat_up,np.array(audio_up,dtype='float32').reshape(1,current_len)),axis=0)
100 | 				audio_mat8k=np.pad(audio_mat8k,[[0,0],[0,current_len-max_len]],'constant')
101 | 				audio_mat8k=np.concatenate((audio_mat8k,np.array(audio8k,dtype='float32').reshape(1,current_len)),axis=0)
102 | 				mask=np.pad(mask,[[0,0],[0,current_len-max_len]],'constant')
103 | 				mask=np.concatenate((mask,np.ones((1,current_len),dtype='float32')),axis=0)
104 | 				max_len=current_len
105 | 			else:
106 | 				audio_mat_up=np.concatenate((audio_mat_up,np.pad(np.array(audio_up,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0)
107 | 				audio_mat8k=np.concatenate((audio_mat8k,np.pad(np.array(audio8k,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0)
108 | 				mask=np.concatenate((mask,np.pad(np.ones((1,current_len),dtype='float32'),[[0,0],[0,max_len-current_len]],'constant')),axis=0)
109 | 
110 | 	np.save(save_path+'/'+'TIMIT_'+name+'_up.npy', audio_mat_up)
111 | 	np.save(save_path+'/'+'TIMIT_'+name+'_8k.npy', audio_mat8k)
112 | 	np.save(save_path+'/'+'TIMIT_'+name+'_mask.npy', mask)
113 | 	np.save(save_path+'/'+'TIMIT_'+name+'_con.npy', con_mat)
114 | 
115 | 	print name+' data storage is complete!'
116 | 
117 | 
# Convert the three splits. Only train and valid are shuffled; the test
# split keeps its sorted order (the order written to test_list.scp).
for _split, _shuffled in (('train', True), ('valid', True), ('test', False)):
    wav2npy('datasets/TIMIT/waveform/'+_split,
            'datasets/TIMIT/bn_norm_condition/'+_split,
            'datasets/TIMIT',
            _split,
            fixed_shuffle=_shuffled,
            sample_rate=16000)


--------------------------------------------------------------------------------
/CHRNN_HF/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aiyang8067/Hierarchical-Recurrent-Neural-Networks-for-Speech-Bandwidth-Extension/94c3daf9554e20ea2538eb2b7aa044024fedb9ed/CHRNN_HF/datasets/__init__.py


--------------------------------------------------------------------------------
/CHRNN_HF/datasets/dataset.py:
--------------------------------------------------------------------------------
  1 | """
  2 | RNN Vocal Generation Model
  3 | 
  4 | TIMIT data feeders.
  5 | """
  6 | 
  7 | import numpy as np
  8 | import random
  9 | import time
 10 | import os
 11 | import glob
 12 | 
# Candidate root directories, as (label, path) pairs, searched in order by
# find_dataset(). The label is not used by the lookup itself.
__base = [
    ('Local', 'datasets/'),  
]

# Template of the dataset file names, relative to a root directory; the
# placeholder is filled with '<split>_<kind>' by the helpers below.
__TIMIT_file = 'TIMIT/TIMIT_{}.npy'

# One formatter per (split, kind) pair. Each takes the template above and
# returns the relative file name, e.g. 'TIMIT/TIMIT_train_mask.npy'.
__train_mask = lambda s: s.format('train_mask')
__train_up = lambda s: s.format('train_up')
__train8k = lambda s: s.format('train_8k')
__train_con = lambda s: s.format('train_con')
__valid_mask = lambda s: s.format('valid_mask')
__valid_up = lambda s: s.format('valid_up')
__valid8k = lambda s: s.format('valid_8k')
__valid_con = lambda s: s.format('valid_con')
__test_mask = lambda s: s.format('test_mask')
__test_up = lambda s: s.format('test_up')
__test8k = lambda s: s.format('test_8k')
__test_con = lambda s: s.format('test_con')
 31 | 
 32 | def find_dataset(filename):
 33 |     for (k, v) in __base:
 34 |         tmp_path = os.path.join(v, filename)
 35 |         if os.path.exists(tmp_path):
 36 |             return tmp_path
 37 |     raise Exception('{} NOT FOUND!'.format(filename))
 38 | 
 39 | ### Basic utils ###
 40 | def __round_to(x, y):
 41 |     """round x up to the nearest y"""
 42 |     return int(np.ceil(x / float(y))) * y
 43 | 
 44 | def __normalize(data):
 45 |     """To range [0., 1.]"""
 46 |     data -= data.min(axis=1)[:, None]
 47 |     data /= data.max(axis=1)[:, None]
 48 |     return data
 49 | 
 50 | def __linear_quantize(data, q_levels):
 51 |     """
 52 |     floats in (0, 1) to ints in [0, q_levels-1]
 53 |     scales normalized across axis 1
 54 |     """
 55 |     # Normalization is on mini-batch not whole file
 56 |     #eps = numpy.float64(1e-5)
 57 |     #data -= data.min(axis=1)[:, None]
 58 |     #data *= ((q_levels - eps) / data.max(axis=1)[:, None])
 59 |     #data += eps/2
 60 |     #data = data.astype('int32')
 61 | 
 62 |     eps = np.float64(1e-5)
 63 |     data *= (q_levels - eps)
 64 |     data += eps/2
 65 |     data = data.astype('int32')
 66 |     return data
 67 | 
 68 | def linear2mu(x, mu=255):
 69 |     """
 70 |     From Joao
 71 |     x should be normalized between -1 and 1
 72 |     Converts an array according to mu-law and discretizes it
 73 | 
 74 |     Note:
 75 |         mu2linear(linear2mu(x)) != x
 76 |         Because we are compressing to 8 bits here.
 77 |         They will sound pretty much the same, though.
 78 | 
 79 |     :usage:
 80 |         >>> bitrate, samples = scipy.io.wavfile.read('orig.wav')
 81 |         >>> norm = __normalize(samples)[None, :]  # It takes 2D as inp
 82 |         >>> mu_encoded = linear2mu(2.*norm-1.)  # From [0, 1] to [-1, 1]
 83 |         >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype
 84 |         0, 255, dtype('int16')
 85 |         >>> mu_decoded = mu2linear(mu_encoded)  # Back to linear
 86 |         >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype
 87 |         -1, 0.9574371, dtype('float32')
 88 |     """
 89 |     x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu)
 90 |     return ((x_mu + 1)/2 * mu).astype('int16')
 91 | 
 92 | def mu2linear(x, mu=255):
 93 |     """
 94 |     From Joao with modifications
 95 |     Converts an integer array from mu to linear
 96 | 
 97 |     For important notes and usage see: linear2mu
 98 |     """
 99 |     mu = float(mu)
100 |     x = x.astype('float32')
101 |     y = 2. * (x - (mu+1.)/2.) / (mu+1.)
102 |     return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.)
103 | 
def __mu_law_quantize(data):
    """Quantize `data` with linear2mu, using its default mu."""
    codes = linear2mu(data)
    return codes
106 | 
def __batch_quantize(data, q_levels, q_type):
    """
    Quantize a batch after converting it to float64.

    :parameters:
        data: array of samples; no normalization is applied here
        q_levels: number of bins, used by the 'linear' scheme only
        q_type: 'linear' or 'mu-law'

    :raises:
        NotImplementedError for any other `q_type` (including 'a-law').
    """
    as_float = data.astype('float64')
    if q_type == 'linear':
        quantized = __linear_quantize(as_float, q_levels)
    elif q_type == 'mu-law':
        # The mu-law path always uses linear2mu's default mu; `q_levels`
        # is ignored.
        quantized = __mu_law_quantize(as_float)
    else:
        raise NotImplementedError
    return quantized
121 | 
# Seed used for every shuffle so that runs are reproducible.
__RAND_SEED = 123
def __fixed_shuffle(inp_list):
    """
    Shuffle `inp_list` in place, re-seeding the generator with
    `__RAND_SEED` first so that inputs of the same length are always
    permuted the same way.

    :parameters:
        inp_list: a Python list (shuffled with `random`) or a
            numpy.ndarray (shuffled with `np.random`).

    :returns:
        None; the input is modified in place.

    :raises:
        ValueError if `inp_list` is neither a list nor a numpy.ndarray.
    """
    if isinstance(inp_list, list):
        random.seed(__RAND_SEED)
        random.shuffle(inp_list)
        return
    if isinstance(inp_list, np.ndarray):
        np.random.seed(__RAND_SEED)
        np.random.shuffle(inp_list)
        return

    # str(...) is required: concatenating a str with a type object raised
    # TypeError and hid the intended ValueError.
    raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+str(type(inp_list)))
134 | 
def __make_random_batches(inp_list, batch_size,shuffle=True):
    """
    Cut `inp_list` into consecutive slices of `batch_size` items; the last
    slice holds the remainder when the length is not a multiple of
    `batch_size`.

    :parameters:
        inp_list: any sliceable sequence (list or numpy array)
        batch_size: number of items per batch
        shuffle: if True, the order of the batches (not their content) is
            shuffled with the fixed-seed `__fixed_shuffle`, so sequences
            with the same number of batches are shuffled identically.

    :returns:
        A list of slices of `inp_list`; empty for an empty input.
    """
    # A stepped range covers full batches and the remainder alike, and
    # needs no division (which is integer only on Python 2).
    batches = [inp_list[start:start + batch_size]
               for start in range(0, len(inp_list), batch_size)]

    if shuffle:
        __fixed_shuffle(batches)
    return batches
149 | 
def __mask_sort(mask_matrix):
    """
    Return the row indices of `mask_matrix` ordered by decreasing number
    of entries equal to 1.

    Rows with the same count keep their original relative order.

    :parameters:
        mask_matrix: 2-D mask, one row per sequence

    :returns:
        A list of row indices.
    """
    lengths = [int(np.count_nonzero(np.asarray(row) == 1))
               for row in mask_matrix]
    # sorted() is stable, also with reverse=True, which matches the
    # ordering of the former zip(...).sort(...) (a Python 2 only idiom).
    return sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True)
159 | 
160 | ### TIMIT DATASET LOADER ###
def __TIMIT_feed_epoch(files,
                       mask_files,
                       con_files,
                       shuffle,
                       sort,
                       batch_size,
                       seq_len,
                       con_frame_size,
                       con_dim,
                       overlap,
                       q_levels,
                       q_zero,
                       q_type,
                       real_valued=False):
    """
    Generator that cuts the dataset matrices into mini-batches and yields
    consecutive sub-sequences of every mini-batch.

    :parameters:
        files: pair of 2-D matrices; files[0] is handled as the "8k" signal
            and files[1] as the "up" signal
        mask_files: 2-D mask matrix; columns whose mask is 0 for every row
            of a mini-batch are treated as trailing padding and removed
        con_files: 2-D condition matrix holding `con_dim` values for every
            `con_frame_size` audio samples
        shuffle: shuffle the order of the mini-batches (fixed seed, so all
            four matrices are shuffled identically)
        sort: order the rows by decreasing mask length before batching
        batch_size: number of rows per mini-batch
        seq_len: number of new samples per yielded sub-sequence
        con_frame_size: number of audio samples per condition frame
        con_dim: number of condition values per frame
        overlap: number of extra trailing samples in every sub-sequence
        q_levels, q_type: forwarded to `__batch_quantize`
        q_zero: value filling the `overlap` tail appended to the quantized
            batches
        real_valued: if True, the 8k/up batches are neither quantized nor
            extended by the `overlap` tail

    :yields:
        (subbatch_8k, subbatch_up, reset, end_flag, submask, subcon,
         batch_num, subbatch_8k_real)
        subbatch_8k / subbatch_up: slices of width SEQ_LEN + OVERLAP
            (quantized unless `real_valued`)
        reset: np.int32 flag, 1 on the first sub-sequence of a mini-batch
            (i.e. you should reset h0 whenever `reset` is set)
        end_flag: np.int32 flag, 1 on the last sub-sequence of a mini-batch
        submask: matching slice of the mask (0 on the overlap tail)
        subcon: condition values of the SEQ_LEN samples of the sub-sequence
        batch_num: number of rows in this mini-batch
        subbatch_8k_real: unquantized slice of the 8k signal
    """
    if sort:
        order = __mask_sort(mask_files)
        audio_8k, audio_up = files[0][order], files[1][order]
        masks, cons = mask_files[order], con_files[order]
    else:
        audio_8k, audio_up = files[0], files[1]
        masks, cons = mask_files, con_files

    batches_8k = __make_random_batches(audio_8k, batch_size, shuffle)
    batches_up = __make_random_batches(audio_up, batch_size, shuffle)
    mask_batches = __make_random_batches(masks, batch_size, shuffle)
    con_batches = __make_random_batches(cons, batch_size, shuffle)

    for index, bch_8k in enumerate(batches_8k):

        batch_num = len(bch_8k)
        con = con_batches[index]
        bch_up = batches_up[index]
        mask = mask_batches[index]

        # Number of columns that are padding for every row of the batch;
        # they are assumed to sit at the end of the rows.
        n_empty = int(np.count_nonzero(np.sum(mask, axis=0) == 0))
        if n_empty != 0:
            bch_up = bch_up[:, :bch_up.shape[1] - n_empty]
            bch_8k = bch_8k[:, :bch_8k.shape[1] - n_empty]
            mask = mask[:, :mask.shape[1] - n_empty]
            # Explicit end index: when fewer than `con_frame_size` columns
            # are removed nothing must be cut from `con`, whereas
            # `con[:, :-0]` is an empty slice and discarded all of it.
            con_trim = n_empty // con_frame_size * con_dim
            con = con[:, :max(con.shape[1] - con_trim, 0)]

        batch_seq_len = __round_to(len(bch_8k[0]), seq_len)

        batch_8k = np.zeros(
            (batch_num, batch_seq_len),
            dtype='float64'
        )
        batch_up = np.zeros(
            (batch_num, batch_seq_len),
            dtype='float64'
        )

        # Zero-pad everything up to a multiple of seq_len.
        mask = np.pad(mask, [[0, 0], [0, batch_seq_len - mask.shape[1]]], 'constant')
        con_width = batch_seq_len // con_frame_size * con_dim
        con = np.pad(con, [[0, 0], [0, con_width - con.shape[1]]], 'constant')
        for i, data in enumerate(bch_8k):
            batch_8k[i, :len(data)] = data
        for i, data in enumerate(bch_up):
            batch_up[i, :len(data)] = data

        # Unquantized copy of the 8k signal, with a zero overlap tail.
        batch_8k_real = np.concatenate([
                batch_8k,
                np.full((batch_num, overlap), 0, dtype='float32')
                ], axis=1)

        if not real_valued:
            batch_8k = __batch_quantize(batch_8k, q_levels, q_type)
            batch_up = __batch_quantize(batch_up, q_levels, q_type)

            batch_8k = np.concatenate([
                batch_8k,
                np.full((batch_num, overlap), q_zero, dtype='int32')
                ], axis=1)

            batch_up = np.concatenate([
                batch_up,
                np.full((batch_num, overlap), q_zero, dtype='int32')
                ], axis=1)

        mask = np.concatenate([
            mask,
            np.full((batch_num, overlap), 0, dtype='float32')
        ], axis=1)

        n_chunks = batch_seq_len // seq_len
        for i in range(n_chunks):
            reset = np.int32(i == 0)
            end_flag = np.int32(i == n_chunks - 1)
            start, stop = i*seq_len, (i+1)*seq_len + overlap
            subbatch_8k_real = batch_8k_real[:, start:stop]
            subbatch_8k = batch_8k[:, start:stop]
            subbatch_up = batch_up[:, start:stop]
            submask = mask[:, start:stop]
            subcon = con[:, i*seq_len//con_frame_size*con_dim:(i+1)*seq_len//con_frame_size*con_dim]
            yield (subbatch_8k, subbatch_up, reset, end_flag, submask, subcon, batch_num, subbatch_8k_real)
268 | 
def TIMIT_train_feed_epoch(*args):
    """
    Build the epoch generator for the training split.

    :parameters:
        The positional arguments are forwarded to `__TIMIT_feed_epoch`
        after `shuffle` and `sort`, in this order:
        batch_size: int
        seq_len:
        con_frame_size:
        con_dim:
        overlap:
        q_levels:
        q_zero:
        q_type: 'linear' or 'mu-law'
        real_valued: optional

    :returns:
        The generator created by `__TIMIT_feed_epoch`, with the rows sorted
        by mask length and the mini-batches shuffled.

    :raises:
        Exception (from `find_dataset`) if any train, valid or test file
        is missing.
    """
    # Just check if valid/test sets are also available. If not, raise.
    required = (
        __valid_up, __valid8k, __valid_mask, __valid_con,
        __test_up, __test8k, __test_mask, __test_con,
    )
    for file_name_of in required:
        find_dataset(file_name_of(__TIMIT_file))

    # Locate, then load, the train set.
    path_8k = find_dataset(__train8k(__TIMIT_file))
    path_up = find_dataset(__train_up(__TIMIT_file))
    path_mask = find_dataset(__train_mask(__TIMIT_file))
    path_con = find_dataset(__train_con(__TIMIT_file))

    signals = [np.load(path_8k), np.load(path_up)]
    masks = np.load(path_mask)
    conditions = np.load(path_con)
    return __TIMIT_feed_epoch(signals, masks, conditions, True, True, *args)
310 | 
def TIMIT_valid_feed_epoch(*args):
    """
    Build the epoch generator for the validation split: mini-batches are
    shuffled but the rows are not sorted by length.

    See:
        TIMIT_train_feed_epoch
    """
    path_8k = find_dataset(__valid8k(__TIMIT_file))
    path_up = find_dataset(__valid_up(__TIMIT_file))
    path_mask = find_dataset(__valid_mask(__TIMIT_file))
    path_con = find_dataset(__valid_con(__TIMIT_file))

    signals = [np.load(path_8k), np.load(path_up)]
    masks = np.load(path_mask)
    conditions = np.load(path_con)
    return __TIMIT_feed_epoch(signals, masks, conditions, True, False, *args)
329 | 
def TIMIT_test_feed_epoch(*args):
    """
    Build the epoch generator for the test split: the stored row order is
    kept (no sorting, no shuffling).

    See:
        TIMIT_train_feed_epoch
    """
    path_8k = find_dataset(__test8k(__TIMIT_file))
    path_up = find_dataset(__test_up(__TIMIT_file))
    path_mask = find_dataset(__test_mask(__TIMIT_file))
    path_con = find_dataset(__test_con(__TIMIT_file))

    signals = [np.load(path_8k), np.load(path_up)]
    masks = np.load(path_mask)
    conditions = np.load(path_con)
    return __TIMIT_feed_epoch(signals, masks, conditions, False, False, *args)
348 | 


--------------------------------------------------------------------------------
/CHRNN_HF/lib/__init__.py:
--------------------------------------------------------------------------------
  1 | import ops
  2 | #import lasagne
  3 | #from theano.compile.nanguardmode import NanGuardMode
  4 | 
  5 | import math
  6 | import time
  7 | import locale
  8 | 
  9 | import numpy
 10 | import theano
 11 | import theano.tensor as T
 12 | import theano.gof
 13 | 
 14 | import cPickle as pickle
 15 | #import pickle
 16 | import warnings
 17 | import sys, os, errno, glob
 18 | 
 19 | # import matplotlib
 20 | # matplotlib.use('Agg')
 21 | # import matplotlib.pyplot as plt
 22 | 
 23 | # TODO: Grouping is not working on cluster! :-?
 24 | # Set a locale first or you won't get grouping at all
 25 | locale.setlocale(locale.LC_ALL, '')
 26 | # 'en_US.UTF-8'
 27 | 
 28 | _params = {}
 29 | def param(name, *args, **kwargs):
 30 |     """
 31 |     A wrapper for `theano.shared` which enables parameter sharing in models.
 32 | 
 33 |     Creates and returns theano shared variables similarly to `theano.shared`,
 34 |     except if you try to create a param with the same name as a
 35 |     previously-created one, `param(...)` will just return the old one instead of
 36 |     making a new one.
 37 | 
 38 |     This constructor also adds a `param` attribute to the shared variables it
 39 |     creates, so that you can easily search a graph for all params.
 40 |     """
 41 | 
 42 |     if name not in _params:
 43 |         kwargs['name'] = name
 44 |         param = theano.shared(*args, **kwargs)
 45 |         param.param = True
 46 |         _params[name] = param
 47 |     return _params[name]
 48 | 
 49 | def delete_params(name):
 50 |     to_delete = [p_name for p_name in _params if name in p_name]
 51 |     for p_name in to_delete:
 52 |         del _params[p_name]
 53 | 
 54 | def search(node, critereon):
 55 |     """
 56 |     Traverse the Theano graph starting at `node` and return a list of all nodes
 57 |     which match the `critereon` function. When optimizing a cost function, you
 58 |     can use this to get a list of all of the trainable params in the graph, like
 59 |     so:
 60 | 
 61 |     `lib.search(cost, lambda x: hasattr(x, "param"))`
 62 |     or
 63 |     `lib.search(cost, lambda x: hasattr(x, "param") and x.param==True)`
 64 |     """
 65 | 
 66 |     def _search(node, critereon, visited):
 67 |         if node in visited:
 68 |             return []
 69 |         visited.add(node)
 70 | 
 71 |         results = []
 72 |         if isinstance(node, T.Apply):
 73 |             for inp in node.inputs:
 74 |                 results += _search(inp, critereon, visited)
 75 |         else: # Variable node
 76 |             if critereon(node):
 77 |                 results.append(node)
 78 |             if node.owner is not None:
 79 |                 results += _search(node.owner, critereon, visited)
 80 |         return results
 81 | 
 82 |     return _search(node, critereon, set())
 83 | 
 84 | def floatX(x):
 85 |     """
 86 |     Convert `x` to the numpy type specified in `theano.config.floatX`.
 87 |     """
 88 |     if theano.config.floatX == 'float16':
 89 |         return numpy.float16(x)
 90 |     elif theano.config.floatX == 'float32':
 91 |         return numpy.float32(x)
 92 |     else: # Theano's default float type is float64
 93 |         print "Warning: lib.floatX using float64"
 94 |         return numpy.float64(x)
 95 | 
 96 | def save_params(path):
 97 |     param_vals = {}
 98 |     for name, param in _params.iteritems():
 99 |         param_vals[name] = param.get_value()
100 | 
101 |     with open(path, 'wb') as f:
102 |         pickle.dump(param_vals, f)
103 | 
def load_params(path):
    """
    Restore parameter values from a file written by `save_params`.

    Every name stored in the file must already be registered; its shared
    variable receives the stored value through `set_value`.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as f:
        saved = pickle.load(f)

    for p_name in saved:
        _params[p_name].set_value(saved[p_name])
110 | 
def clear_all_params():
    """Empty the parameter registry in place (the dict object is kept)."""
    _params.clear()
115 | 
def ensure_dir(dirname):
    """
    Ensure that a named directory exists; if it does not, attempt to create
    it (including missing parent directories).

    :raises:
        OSError if the directory cannot be created, or if `dirname` already
        exists but is not a directory.
    """
    try:
        os.makedirs(dirname)
    except OSError as e:
        # EEXIST is also reported when `dirname` is an existing regular
        # file; only an existing directory means "nothing to do".
        if e.errno != errno.EEXIST or not os.path.isdir(dirname):
            raise
125 | 
__model_setting_file_name = 'model_settings.txt'
def print_model_settings(locals_var, path=None, sys_arg=False):
    """
    Prints all variables in upper case in locals_var,
    except for T which usually stands for theano.tensor.
    If locals() passed as input to this method, will print
    all the variables in upper case defined so far, that is
    model settings.

    With `path` as an address to a directory it will _append_ it
    as a file named `model_settings.txt` as well.

    With `sys_arg` set to True, log information about Python, Numpy,
    and Theano and passed arguments to the script will be added too.

    With both `path` and `sys_arg` passed, the theano.config is appended to
    `th_conf.txt` and sys.argv is pickled to `args.pkl`.
    args.pkl would be overwritten, specially in case of resuming a job.
    But again that wouldn't be much of a problem as all the passed args
    to the script except for '--resume' should be the same.

    :parameters:
        locals_var: dictionary of variable names to values
        path: optional output directory, created if missing
        sys_arg: also log version information and sys.argv

    :usage:
        >>> import theano.tensor as T
        >>> import lib
        >>> BATCH_SIZE, DIM = 128, 512
        >>> DATA_PATH = '/Path/to/dataset'
        >>> lib.print_model_settings(locals(), path='./')
    """
    log = ""
    if sys_arg:
        try:
            log += "Python:\n"
            log += "\tsys.version_info\t{}\n".format(str(sys.version_info))
            log += "Numpy:\n"
            log += "\t.__version__\t{}\n".format(numpy.__version__)
            log += "Theano:\n"
            log += "\t.__version__\t{}\n".format(theano.__version__)
            log += "\n\nAll passed args:\n"
            log += str(sys.argv)
            log += "\n"
        except Exception:
            # Version logging is best effort, but a bare `except:` would
            # also swallow KeyboardInterrupt and SystemExit.
            print("Something went wrong during sys_arg logging. Continue anyway!")

    log += "\nModel settings:"
    all_vars = sorted(
        ((k, v) for (k, v) in locals_var.items() if k.isupper() and k != 'T'),
        key=lambda x: x[0]
    )
    for var_name, var_value in all_vars:
        log += ("\n\t%-20s %s" % (var_name, var_value))
    print(log)
    if path is not None:
        ensure_dir(path)
        # Don't override, just append if by mistake there is something in the file.
        with open(os.path.join(path, __model_setting_file_name), 'a+') as f:
            f.write(log)
        if sys_arg:
            with open(os.path.join(path, 'th_conf.txt'), 'a+') as f:
                f.write(str(theano.config))
            with open(os.path.join(path, 'args.pkl'), 'wb') as f:
                pickle.dump(sys.argv, f)
                # To load:
                # >>> import cPickle as pickle
                # >>> args = pickle.load(open(os.path.join(path, 'args.pkl'), 'rb'))
187 | 
def get_params(cost, criterion=lambda x: hasattr(x, 'param') and x.param==True):
    """
    Return the nodes of the computation graph of `cost` that satisfy
    `criterion`, as found by `search`.

    Default criterion:
        lambda x: hasattr(x, 'param') and x.param==True
    This will return every parameter for cost from computation graph.

    To exclude a parameter, just set 'param' to False:
        >>> h0 = lib.param('h0',
        ...         numpy.zeros((3, 2*512), dtype=theano.config.floatX))
        >>> print h0.param  # Default: True
        >>> h0.param = False

    In this case one still can get list of all params (False or True) by:
        >>> lib.get_params(cost, lambda x: hasattr(x, 'param'))

    :returns:
        A list of params
    """
    return search(cost, criterion)
207 | 
def print_params_info(params, path=None):
    """
    Print information about the parameters in the given param set: the
    shape and name of each one (sorted by name) and the total number of
    elements.

    With `path` as an address to a directory it will _append_ it
    as a file named `model_settings.txt` as well.

    :parameters:
        params: iterable of shared variables; each must provide `name` and
            `get_value(borrow=True)`
        path: optional output directory, created if missing

    :usage:
        >>> params = lib.get_params(cost)
        >>> lib.print_params_info(params, path='./')
    """
    params = sorted(params, key=lambda p: p.name)
    total_param_count = 0
    log = "\nParams for cost:"
    for param in params:
        shape = param.get_value(borrow=True).shape
        log += ("\n\t%-20s %s" % (shape, param.name))
        # Start the product at 1 so that a scalar parameter (shape ())
        # counts as one element instead of raising on an empty reduce.
        count = 1
        for dim in shape:
            count *= dim
        total_param_count += count

    # locale.format_string replaces the deprecated locale.format.
    log += "\nTotal parameter count for this cost:\n\t{0}".format(
        locale.format_string("%d", total_param_count, grouping=True)
    )
    print(log)

    if path is not None:
        ensure_dir(path)
        # Don't override, just append if by mistake there is something in the file.
        with open(os.path.join(path, __model_setting_file_name), 'a+') as f:
            f.write(log)
239 | 
__train_log_file_name = 'train_log.pkl'
def save_training_info(values, path):
    """
    Append one record to the training log stored in <path>/train_log.pkl.

    The log is a pickled dictionary mapping each key to the list of the
    values recorded so far. It is created on the first call, when the file
    cannot be opened; on later calls every key of `values` must already be
    present in the log.

    :parameters:
        values: dictionary of the values to append, one per key
        path: directory holding the log file
    """
    file_name = os.path.join(path, __train_log_file_name)
    try:
        with open(file_name, "rb") as f:
            log = pickle.load(f)
    except IOError:  # first time
        log = dict((key, []) for key in values)

    for key in values:
        log[key].append(values[key])

    with open(file_name, "wb") as f:
        pickle.dump(log, f)
258 | 
resume_key = 'last resume index'
def resumable(path,
              iter_key='iter',
              epoch_key='epoch',
              add_resume_counter=True,
              other_keys=()):
    """
    Find the most recent checkpoint that matches the training log and return
    what is needed to continue training from it.

    :warning:
        This is a naive implementation of resuming a training session
        and does not save and reload the training loop. The serialization
        of training loop and everything is costly and error-prone.

    :todo:
        - Save and load a serializable training loop. (See warning above)
        - Heavily dependent on the "model" file and the names used there right
          now. It's really easy to miss anything.

    `path` should be pointing at the root directory where `train_log.pkl`
    (See __train_log_file_name) and `params/` reside.

    Always assuming all the values in the log dictionary (except `resume_key`),
    are lists with the same length.

    :param path: experiment root directory (see above).
    :param iter_key: log key holding the iteration counter of every record.
    :param epoch_key: log key holding the epoch counter of every record.
    :param add_resume_counter: when True, record the resume position under
        `resume_key` and write the (possibly truncated) log back to disk.
    :param other_keys: extra log keys whose last value should be returned.
    :returns: (iters_to_consume, res_path, last_epoch, last_iter,
        last_other_keys) where `res_path` is the params file to load.
    :raises AssertionError: if the log is empty or no params file matches
        any record of the log.
    :raises ValueError: if the last epoch is not 0 and the log has no record
        with epoch == 1, so iterations per epoch cannot be derived.
    """
    file_name = os.path.join(path, __train_log_file_name)
    # Raise error if does not exists.
    with open(file_name, "rb") as f:
        log = pickle.load(f)

    n_records = len(log[epoch_key])
    assert n_records > 0, 'Empty train_log???\n{}'.format(log)

    # The glob pattern is kept separate from the result so that a hit never
    # clobbers the template used for older records.
    pattern = os.path.join(path, 'params', 'params_e{}_i{}*.pkl')
    res_path = None
    acceptable_len = 0
    # Walk the log from the newest record to the oldest one.
    for idx in range(n_records - 1, -1, -1):
        ep, it = log[epoch_key][idx], log[iter_key][idx]
        header = "> Params file for epoch {} iter {}".format(ep, it)
        matches = glob.glob(pattern.format(ep, it))
        if not matches:
            print(header + " [NOT FOUND]. FALLING BACK TO...")
            continue
        if len(matches) > 1:
            # choose one, warning, rare
            print(header + " [multiple version found]:")
            for candidate in matches:
                print(candidate)
            print("Arbitrarily choosing first:\n\t{}".format(matches[0]))
        else:
            print(header + " found.")
        res_path = matches[0]
        # Number of log records that are backed by this checkpoint.
        acceptable_len = idx + 1
        # Stop at the first (newest) hit, also when several files matched;
        # otherwise the loop moves on to an older record and the log below
        # gets truncated one record too short.
        break

    # Finishing for loop with no success
    assert res_path is not None, 'No matching params file with train_log'

    if acceptable_len != n_records:
        # Backup of the old train_log
        with open(file_name+'.backup', 'wb') as f:
            pickle.dump(log, f)

        # Change the log file to match the last existing checkpoint.
        for k, v in list(log.items()):
            # Fix resume indices
            if k == resume_key:
                log[k] = [i for i in v if i < acceptable_len]
                continue
            # Rest is useless with no param file.
            log[k] = v[:acceptable_len]

    epochs = log[epoch_key]
    iters = log[iter_key]

    if add_resume_counter:
        resume_val = len(epochs)
        resume_points = log.setdefault(resume_key, [])
        if not resume_points or resume_points[-1] != resume_val:
            resume_points.append(resume_val)
        with open(file_name, "wb") as f:
            pickle.dump(log, f)

    last_epoch = epochs[-1]
    last_iter = iters[-1]

    if last_epoch == 0:
        iters_to_consume = last_iter
    else:
        # The first record whose epoch is 1 is taken as the monitoring step
        # at the end of epoch 0, which gives the number of iterations per
        # epoch.
        iters_per_epoch = None
        for e, i in zip(epochs, iters):
            if e == 1:
                iters_per_epoch = i - 1
                break
        if iters_per_epoch is None:
            raise ValueError(
                'Cannot derive iterations per epoch: no record with '
                '{} == 1 in {}'.format(epoch_key, file_name))
        iters_to_consume = last_iter % iters_per_epoch

    last_other_keys = [log[k][-1] for k in other_keys]
    return iters_to_consume, res_path, last_epoch, last_iter, last_other_keys
362 | 
def plot_traing_info(x, ylist, path):
    """
    Loads log file and plot x and y values as provided by input.
    Saves as <path>/train_log.png

    :param x: log key used for the x axis.
    :param ylist: iterable of log keys; each one is drawn as a curve over x.
    :param path: directory that contains train_log.pkl.

    If the log file cannot be opened a warning is issued and nothing is
    plotted.
    """
    file_name = os.path.join(path, __train_log_file_name)
    try:
        with open(file_name, "rb") as f:
            log = pickle.load(f)
    except IOError:  # no log has been written yet
        warnings.warn("There is no {} file here!!!".format(file_name))
        return
    plt.figure()
    x_vals = log[x]
    for y in ylist:
        y_vals = log[y]
        if len(y_vals) != len(x_vals):
            # Was `warning.warn`, which raised NameError instead of warning.
            warnings.warn("One of y's: {} does not have the same length as x:{}".format(y, x))
        plt.plot(x_vals, y_vals, label=y)
    plt.xlabel(x)
    plt.legend()
    #plt.show()
    # Same base name as the log file, with a png extension.
    plt.savefig(os.path.splitext(file_name)[0] + '.png', bbox_inches='tight')
    plt.close('all')
388 | 
def create_logging_folders(path):
    """
    Placeholder for moving the folder structure and naming logic out of the
    training file.

    Currently a no-op: `path` is ignored and nothing is created.

    :todo:
        - Implement!
    """
    return None
397 | 
def tv(var):
    """
    Shortcut for `var.tag.test_value`.

    :param var: object expected to carry a `tag.test_value` attribute
        (e.g. a theano variable with a test value set).
    :returns:
        var.tag.test_value. If the attribute is missing, a message is
        printed and an ipdb session is opened; once that session ends the
        function falls through and returns None.
    """
    # Based on EAFP (easier to ask for forgiveness than permission)
    try:
        return var.tag.test_value
    except AttributeError:
        # Single-argument parenthesised print: same output on Python 2,
        # and no SyntaxError on Python 3.
        print("NONE, test_value has not been set.")
        import ipdb; ipdb.set_trace()
412 | 
413 |     ## Rather than LBYL (look before you leap)
414 |     #if hasattr(var, 'tag'):
415 |     #    if hasattr(var.tag, 'test_value'):
416 |     #        return var.tag.test_value
417 |     #   else:
418 |     #       print "NONE, test_value has not set."
419 |     #       import ipdb; ipdb.set_trace()
420 |     #else:
421 |     #    print "NONE, tag has not set."
422 |     #    import ipdb; ipdb.set_trace()
423 | 
def tvs(var):
    """
    Shape of the test value attached to `var`.

    :returns:
        var.tag.test_value.shape (the test value is looked up through tv())
    """
    test_value = tv(var)
    return test_value.shape
430 | 
def _is_symbolic(v):
    r"""Return `True` if any of the arguments are symbolic.

    `v` is an iterable; an element counts as symbolic when it is an instance
    of `theano.gof.Variable`. An empty iterable gives `False`.

    See:
        https://github.com/Theano/Theano/wiki/Cookbook
    """
    # any() replaces the index loop over xrange (a Python-2-only name).
    return any(isinstance(item, theano.gof.Variable) for item in v)
444 | 
def unique_list(inp_list):
    """
    Collapse inp_list to its distinct values and hand them back as a list.
    The result comes from a set, so it does not follow the input order.
    :usage:
        >>> inp_list = ['a', 'b', 'c']
        >>> unique_inp_list = unique_list(inp_list*2)
    """
    distinct = set(inp_list)
    return list(distinct)
453 | 


--------------------------------------------------------------------------------
/CHRNN_HF/models/four_tier/four_tier_generation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | RNN Audio Generation Model
  3 | 
Four-tier model (condition, big-frame, frame and sample tiers), quantized input
  5 | For more info:
  6 | $ python three_tier.py -h
  7 | 
  8 | How-to-run example:
  9 | sampleRNN$ pwd
 10 | /u/mehris/sampleRNN
 11 | 
 12 | 
 13 | sampleRNN$ \
 14 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \
 15 | models/three_tier/three_tier.py --exp AXIS1 --seq_len 512 --big_frame_size 8 \
 16 | --frame_size 2 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 \
 17 | --n_rnn 1 --rnn_type GRU --learn_h0 True --q_levels 256 --q_type mu-law \
 18 | --batch_size 50 --which_set TIMIT
 19 | 
To resume, add ` --resume` to the END of EXACTLY the line above. You can run the
resume command as many times as needed, depending on the TRAIN_MODE.
(folder name, file name, flags, their order, and the values are all important)
 23 | """
 24 | from time import time
 25 | from datetime import datetime
 26 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
 27 | exp_start = time()
 28 | 
 29 | import os, sys, glob
 30 | sys.path.insert(1, os.getcwd())
 31 | import argparse
 32 | import itertools
 33 | 
 34 | import numpy
 35 | numpy.random.seed(123)
 36 | np = numpy
 37 | import random
 38 | random.seed(123)
 39 | 
 40 | import theano
 41 | import theano.tensor as T
 42 | import theano.ifelse
 43 | import lasagne
 44 | import scipy.io.wavfile
 45 | 
 46 | import lib
 47 | 
 48 | LEARNING_RATE = 0.001
 49 | 
 50 | ### Parsing passed args/hyperparameters ###
 51 | def get_args():
 52 |     def t_or_f(arg):
 53 |         ua = str(arg).upper()
 54 |         if 'TRUE'.startswith(ua):
 55 |             return True
 56 |         elif 'FALSE'.startswith(ua):
 57 |             return False
 58 |         else:
 59 |            raise ValueError('Arg is neither `True` nor `False`')
 60 | 
 61 |     def check_non_negative(value):
 62 |         ivalue = int(value)
 63 |         if ivalue < 0:
 64 |              raise argparse.ArgumentTypeError("%s is not non-negative!" % value)
 65 |         return ivalue
 66 | 
 67 |     def check_positive(value):
 68 |         ivalue = int(value)
 69 |         if ivalue < 1:
 70 |              raise argparse.ArgumentTypeError("%s is not positive!" % value)
 71 |         return ivalue
 72 | 
 73 |     def check_unit_interval(value):
 74 |         fvalue = float(value)
 75 |         if fvalue < 0 or fvalue > 1:
 76 |              raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value)
 77 |         return fvalue
 78 | 
 79 |     # No default value here. Indicate every single arguement.
 80 |     parser = argparse.ArgumentParser(
 81 |         description='three_tier.py\nNo default value! Indicate every argument.')
 82 | 
 83 |     # TODO: Fix the descriptions
 84 |     # Hyperparameter arguements:
 85 |     parser.add_argument('--exp', help='Experiment name',
 86 |             type=str, required=False, default='_')
 87 |     parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True)
 88 |     parser.add_argument('--con_dim', help='Condition dimension',\
 89 |             type=check_positive, required=True)
 90 |     parser.add_argument('--con_frame_size', help='How many samples per condition frame',\
 91 |             type=check_positive, required=True)
 92 |     parser.add_argument('--big_frame_size', help='How many samples per big frame',\
 93 |             type=check_positive, required=True)
 94 |     parser.add_argument('--frame_size', help='How many samples per frame',\
 95 |             type=check_positive, required=True)
 96 |     parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\
 97 |             type=t_or_f, required=True)
 98 |     parser.add_argument('--emb_size', help='Size of embedding layer (> 0)',
 99 |             type=check_positive, required=True)  # different than two_tier
100 |     parser.add_argument('--skip_conn', help='Add skip connections to RNN',
101 |             type=t_or_f, required=True)
102 |     parser.add_argument('--dim', help='Dimension of RNN and MLPs',\
103 |             type=check_positive, required=True)
104 |     parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN',
105 |             type=check_positive, choices=xrange(1,6), required=True)
106 |     parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\
107 |             required=True)
108 |     parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\
109 |             type=t_or_f, required=True)
110 |     parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\
111 |             type=check_positive, required=True)
112 |     parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\
113 |             choices=['linear', 'a-law', 'mu-law'], required=True)
114 |     parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK',
115 |             choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True)
116 |     parser.add_argument('--batch_size', help='size of mini-batch',
117 |             type=check_positive, choices=[50,64, 128, 256], required=True)
118 | 
119 |     parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\
120 |             required=False, default=True, action='store_true')
121 | 
122 |     args = parser.parse_args()
123 | 
124 |     # NEW
125 |     # Create tag for this experiment based on passed args
126 |     # tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F')
127 |     # tag += '-lr'+str(LEARNING_RATE)
128 |     tag='four_tier_model'
129 |     print "Created experiment tag for these args:"
130 |     print tag
131 | 
132 |     return args, tag
133 | 
134 | #tag:three_tier.py-expAXIS1-seq_len512-big_frame_size8-frame_size2-weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F-q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001
# Parse the command line once; the constants below are derived from `args`.
# NOTE: these module-level names are later handed to
# lib.print_model_settings through locals(), so adding or renaming a name
# here changes what gets reported.
args, tag = get_args()

SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (e.g. 512)
#print "------------------previous SEQ_LEN:", SEQ_LEN
# TODO: test incremental training
#SEQ_LEN = 512 + 256
#print "---------------------------new SEQ_LEN:", SEQ_LEN
CON_DIM=args.con_dim # Dimension of one condition frame
CON_FRAME_SIZE=args.con_frame_size # How many samples per condition frame
BIG_FRAME_SIZE = args.big_frame_size # How many samples per big frame
FRAME_SIZE = args.frame_size # How many samples per frame
WEIGHT_NORM = args.weight_norm # Weight normalization on the linear layers (e.g. True)
EMB_SIZE = args.emb_size # Size of the sample embedding (e.g. 256)
SKIP_CONN = args.skip_conn # Skip connections in the stacked RNNs (e.g. False)
DIM = args.dim # Model dimensionality (e.g. 1024)
BIG_DIM = DIM # Dimensionality of the big-frame tier; tied to DIM
CON_TIER_DIM=DIM # Dimensionality of the condition tier; tied to DIM
N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (e.g. 1)
N_BIG_RNN = N_RNN # How many RNNs to stack in the big-frame-level model; tied to N_RNN
N_CON_RNN=N_RNN # How many RNNs to stack in the condition-level model; tied to N_RNN
RNN_TYPE = args.rnn_type # 'GRU' or 'LSTM'
H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 # Width multiplier of the initial state: 2 for LSTM, 1 for GRU
LEARN_H0 = args.learn_h0 # Whether the initial RNN state is learned (e.g. True)
Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization
Q_TYPE = args.q_type # 'linear', 'a-law' or 'mu-law'
WHICH_SET = args.which_set # Dataset name (only 'TIMIT' has a feeder imported in this script)
BATCH_SIZE = args.batch_size # Mini-batch size (e.g. 50)
RESUME = args.resume # Always True here: --resume is store_true with default=True
# The tiers must nest: each slower tier covers a whole number of the next
# faster tier's frames.
assert SEQ_LEN % CON_FRAME_SIZE == 0,\
    'seq_len should be divisible by con_frame_size'
assert CON_FRAME_SIZE % BIG_FRAME_SIZE == 0,\
    'con_frame_size should be divisible by big_frame_size'
assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\
    'big_frame_size should be divisible by frame_size'

if Q_TYPE == 'mu-law' and Q_LEVELS != 256:
    raise ValueError('For mu-law Quantization levels should be exactly 256!')
172 | 
# Fixed hyperparams
GRAD_CLIP = 1 # Elementwise grad clip threshold
BITRATE = 16000 # presumably the audio sample rate in Hz -- TODO confirm where it is used

# Other constants
TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS
#TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME
#TRAIN_MODE = 'time-iters'
# To use PRINT_TIME for validation,
# and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp.
#TRAIN_MODE = 'iters-time'
# To use PRINT_ITERS for validation,
# and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp.
PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations.
STOP_ITERS = 300000 # Stop after this many iterations
PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds.
STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
N_SEQS = 5  # Number of samples to generate every time monitoring.
RESULTS_DIR = 'results_4t'
FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) # Root folder of this experiment
Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
OVERLAP = BIG_FRAME_SIZE # Number of trailing samples cut off when building the targets and the mask

# Key names shared with the training log.
epoch_str = 'epoch'
iter_str = 'iter'
lowest_valid_str = 'lowest valid cost'
corresp_test_str = 'correponding test cost'
train_nll_str, valid_nll_str, test_nll_str = \
    'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)'

### Create directories ###
#   FOLDER_PREFIX: root, contains:
#       log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt]
#   FOLDER_PREFIX/params: saves all checkpoint params as pkl
#   FOLDER_PREFIX/samples: keeps all checkpoint samples as wav
#   FOLDER_PREFIX/best: keeps the best parameters, samples, ...
if not os.path.exists(FOLDER_PREFIX):
    os.makedirs(FOLDER_PREFIX)
PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params')
if not os.path.exists(PARAMS_PATH):
    os.makedirs(PARAMS_PATH)
SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples')
if not os.path.exists(SAMPLES_PATH):
    os.makedirs(SAMPLES_PATH)
BEST_PATH = os.path.join(FOLDER_PREFIX, 'best')
if not os.path.exists(BEST_PATH):
    os.makedirs(BEST_PATH)

# Hands every module-level name defined so far to lib (via locals()), so the
# set of names above is part of what gets reported.
lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True)
222 | 
223 | ### Import the data_feeder ###
224 | # Handling WHICH_SET
225 | if WHICH_SET == 'TIMIT':
226 |     from datasets.dataset import TIMIT_test_feed_epoch  as test_feeder
227 | 
def load_data(data_feeder):
    """
    Call a dataset feeder with the hyperparameters of this script, so that
    callers do not have to repeat the feeder interface.

    `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`.
    Returns whatever the feeder returns.
    """
    # Positional order expected by the feeder interface.
    feeder_args = (
        BATCH_SIZE,
        SEQ_LEN,
        CON_FRAME_SIZE,
        CON_DIM,
        OVERLAP,
        Q_LEVELS,
        Q_ZERO,
        Q_TYPE,
    )
    return data_feeder(*feeder_args)
241 | 
242 | ### Creating computation graph ###
def con_frame_level_rnn(input_sequences, h0, reset):
    """
    Condition tier: run a stacked RNN over the condition frames and project
    every RNN step to the per-big-frame inputs of the big-frame tier.

    input_sequences.shape: (batch size, n con frames * CON_DIM)
    h0.shape:              (batch size, N_CON_RNN, H0_MULT*CON_TIER_DIM)
    reset.shape:           ()
    output[0].shape:       (batch size,
                            n con frames * (CON_FRAME_SIZE // BIG_FRAME_SIZE),
                            BIG_DIM)
    output[1]:             last hidden state returned by the stacked RNN
                           (presumably same shape as h0 -- TODO confirm
                           against lib.ops)
    """
    frames = input_sequences.reshape((
        input_sequences.shape[0],
        input_sequences.shape[1] // CON_DIM,
        CON_DIM
    ))
    # The condition features are fed to the RNN as they are (no rescaling).

    # Initial state of RNNs. Sized with the condition-tier constants, which
    # is what the stacked RNN below is built with (they are tied to the same
    # values as the big-frame-tier ones in this script).
    learned_h0 = lib.param(
        'ConFrameLevel.h0',
        numpy.zeros((N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype=theano.config.floatX)
    )
    # Handling LEARN_H0
    learned_h0.param = LEARN_H0
    # Repeat the learned state for every sequence of the batch.
    learned_h0 = T.alloc(learned_h0, h0.shape[0], N_CON_RNN, H0_MULT*CON_TIER_DIM)
    learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2)
    # reset != 0: start from the learned state; otherwise carry on from h0.
    h0 = theano.ifelse.ifelse(reset, learned_h0, h0)

    # Handling RNN_TYPE
    # Handling SKIP_CONN
    if RNN_TYPE == 'GRU':
        rnns_out, last_hidden = lib.ops.stackedGRU('ConFrameLevel.GRU',
                                                   N_CON_RNN,
                                                   CON_DIM,
                                                   CON_TIER_DIM,
                                                   frames,
                                                   h0=h0,
                                                   weightnorm=WEIGHT_NORM,
                                                   skip_conn=SKIP_CONN)
    elif RNN_TYPE == 'LSTM':
        rnns_out, last_hidden = lib.ops.stackedLSTM('ConFrameLevel.LSTM',
                                                    N_CON_RNN,
                                                    CON_DIM,
                                                    CON_TIER_DIM,
                                                    frames,
                                                    h0=h0,
                                                    weightnorm=WEIGHT_NORM,
                                                    skip_conn=SKIP_CONN)
    else:
        raise ValueError('Unknown RNN_TYPE: {}'.format(RNN_TYPE))

    # Number of big frames covered by one condition frame. Floor division
    # keeps this an int on Python 3 as well (`/` would give a float width).
    big_frames_per_con_frame = CON_FRAME_SIZE // BIG_FRAME_SIZE

    output = lib.ops.Linear(
        'ConFrameLevel.Output',
        CON_TIER_DIM,
        BIG_DIM * big_frames_per_con_frame,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    # Split every condition step into one BIG_DIM vector per big frame.
    output = output.reshape((output.shape[0],
                             output.shape[1] * big_frames_per_con_frame,
                             BIG_DIM))

    return (output, last_hidden)
307 | 
def big_frame_level_rnn(input_sequences, other_input, h0, reset):
    """
    Big-frame tier: run a stacked RNN over windows of quantized samples,
    conditioned on `other_input`, and project every RNN step to the
    per-frame inputs of the frame tier.

    input_sequences.shape: (batch size, n big frames * 2*BIG_FRAME_SIZE)
    other_input:           added to the (batch size, n big frames, BIG_DIM)
                           expansion of the windows
    h0.shape:              (batch size, N_BIG_RNN, H0_MULT*BIG_DIM)
    reset.shape:           ()
    output[0].shape:       (batch size,
                            n big frames * (BIG_FRAME_SIZE // FRAME_SIZE),
                            DIM)
    output[1]:             last hidden state returned by the stacked RNN
                           (presumably same shape as h0 -- TODO confirm
                           against lib.ops)
    """
    frames = input_sequences.reshape((
        input_sequences.shape[0],
        input_sequences.shape[1] // (2*BIG_FRAME_SIZE),
        2*BIG_FRAME_SIZE
    ))

    # Rescale frames from ints in [0, Q_LEVELS) to floats in [-1, 1)
    # (the extra scale factor is currently 1). Floor division keeps the
    # divisor identical on Python 2 and Python 3.
    frames = (frames.astype('float32') / lib.floatX(Q_LEVELS // 2)) - lib.floatX(1)
    frames *= lib.floatX(1)

    gru_input = lib.ops.Linear(
        'BigFrameLevel.InputExpand',
        2*BIG_FRAME_SIZE,
        BIG_DIM,
        frames,
        initialization='he',
        weightnorm=WEIGHT_NORM,
        ) + other_input

    # Initial state of RNNs
    learned_h0 = lib.param(
        'BigFrameLevel.h0',
        numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX)
    )
    # Handling LEARN_H0
    learned_h0.param = LEARN_H0
    # Repeat the learned state for every sequence of the batch.
    learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM)
    learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2)
    # reset != 0: start from the learned state; otherwise carry on from h0.
    h0 = theano.ifelse.ifelse(reset, learned_h0, h0)

    # Handling RNN_TYPE
    # Handling SKIP_CONN
    if RNN_TYPE == 'GRU':
        rnns_out, last_hidden = lib.ops.stackedGRU('BigFrameLevel.GRU',
                                                   N_BIG_RNN,
                                                   BIG_DIM,
                                                   BIG_DIM,
                                                   gru_input,
                                                   h0=h0,
                                                   weightnorm=WEIGHT_NORM,
                                                   skip_conn=SKIP_CONN)
    elif RNN_TYPE == 'LSTM':
        rnns_out, last_hidden = lib.ops.stackedLSTM('BigFrameLevel.LSTM',
                                                    N_BIG_RNN,
                                                    BIG_DIM,
                                                    BIG_DIM,
                                                    gru_input,
                                                    h0=h0,
                                                    weightnorm=WEIGHT_NORM,
                                                    skip_conn=SKIP_CONN)
    else:
        raise ValueError('Unknown RNN_TYPE: {}'.format(RNN_TYPE))

    # Number of frames covered by one big frame. Floor division keeps this
    # an int on Python 3 as well (`/` would give a float layer width).
    frames_per_big_frame = BIG_FRAME_SIZE // FRAME_SIZE

    output = lib.ops.Linear(
        'BigFrameLevel.Output',
        BIG_DIM,
        DIM * frames_per_big_frame,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    # Split every big-frame step into one DIM vector per frame.
    output = output.reshape((output.shape[0],
                             output.shape[1] * frames_per_big_frame,
                             DIM))

    return (output, last_hidden)
380 | 
def frame_level_rnn(input_sequences, other_input, h0, reset):
    """
    Frame tier: run a stacked RNN over windows of quantized samples,
    conditioned on `other_input`, and project every RNN step to one DIM
    vector per sample.

    input_sequences.shape: (batch size, n frames * 2*FRAME_SIZE)
    other_input.shape:     (batch size, n frames, DIM)
    h0.shape:              (batch size, N_RNN, H0_MULT*DIM)
    reset.shape:           ()
    output[0].shape:       (batch size, n frames * FRAME_SIZE, DIM)
    output[1]:             last hidden state returned by the stacked RNN
                           (presumably same shape as h0 -- TODO confirm
                           against lib.ops)
    """
    frames = input_sequences.reshape((
        input_sequences.shape[0],
        input_sequences.shape[1] // (2*FRAME_SIZE),
        2*FRAME_SIZE
    ))

    # Rescale frames from ints in [0, Q_LEVELS) to floats in [-1, 1)
    # (the extra scale factor is currently 1). Floor division keeps the
    # divisor identical on Python 2 and Python 3.
    frames = (frames.astype('float32') / lib.floatX(Q_LEVELS // 2)) - lib.floatX(1)
    frames *= lib.floatX(1)

    gru_input = lib.ops.Linear(
        'FrameLevel.InputExpand',
        2*FRAME_SIZE,
        DIM,
        frames,
        initialization='he',
        weightnorm=WEIGHT_NORM,
        ) + other_input

    # Initial state of RNNs
    learned_h0 = lib.param(
        'FrameLevel.h0',
        numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX)
    )
    # Handling LEARN_H0
    learned_h0.param = LEARN_H0
    # Repeat the learned state for every sequence of the batch.
    learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM)
    learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2)
    #learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
    # reset != 0: start from the learned state; otherwise carry on from h0.
    h0 = theano.ifelse.ifelse(reset, learned_h0, h0)

    # Handling RNN_TYPE
    # Handling SKIP_CONN
    if RNN_TYPE == 'GRU':
        rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU',
                                                   N_RNN,
                                                   DIM,
                                                   DIM,
                                                   gru_input,
                                                   h0=h0,
                                                   weightnorm=WEIGHT_NORM,
                                                   skip_conn=SKIP_CONN)
    elif RNN_TYPE == 'LSTM':
        rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM',
                                                    N_RNN,
                                                    DIM,
                                                    DIM,
                                                    gru_input,
                                                    h0=h0,
                                                    weightnorm=WEIGHT_NORM,
                                                    skip_conn=SKIP_CONN)
    else:
        raise ValueError('Unknown RNN_TYPE: {}'.format(RNN_TYPE))

    output = lib.ops.Linear(
        'FrameLevel.Output',
        DIM,
        FRAME_SIZE * DIM,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    # Split every frame step into one DIM vector per sample of the frame.
    output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM))

    return (output, last_hidden)
def sample_level_predictor(frame_level_outputs, prev_samples):
    """
    Sample tier: an MLP that combines the previous FRAME_SIZE samples with
    the frame-tier output and produces unnormalized scores over Q_LEVELS.

    frame_level_outputs.shape: (batch size, DIM)
    prev_samples.shape:        (batch size, FRAME_SIZE)
    output.shape:              (batch size, Q_LEVELS)
    """
    # Handling EMB_SIZE
    if EMB_SIZE > 0:
        # Map each of the Q_LEVELS discrete values to an EMB_SIZE vector.
        encoded = lib.ops.Embedding(
            'SampleLevel.Embedding',
            Q_LEVELS,
            EMB_SIZE,
            prev_samples)
        per_sample_width = EMB_SIZE
    elif EMB_SIZE == 0:  # no support for one-hot in three_tier and one_tier.
        encoded = lib.ops.T_one_hot(prev_samples, Q_LEVELS)
        per_sample_width = Q_LEVELS
    else:
        raise ValueError('EMB_SIZE cannot be negative.')

    # One row per prediction: the encodings of its FRAME_SIZE previous
    # samples laid side by side.
    context_width = FRAME_SIZE * per_sample_width
    encoded = encoded.reshape((-1, context_width))

    hidden = lib.ops.Linear(
        'SampleLevel.L1_PrevSamples',
        context_width,
        DIM,
        encoded,
        biases=False,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )

    # No non-linearity here, to stay similar to two_tier.
    hidden = hidden + frame_level_outputs

    # Two DIM -> DIM layers, each followed by a ReLU.
    for layer_name in ('SampleLevel.L2', 'SampleLevel.L3'):
        hidden = lib.ops.Linear(layer_name,
                                DIM,
                                DIM,
                                hidden,
                                initialization='he',
                                weightnorm=WEIGHT_NORM)
        hidden = T.nnet.relu(hidden)

    # Output layer; the softmax is applied later by the caller.
    return lib.ops.Linear('SampleLevel.Output',
                          DIM,
                          Q_LEVELS,
                          hidden,
                          weightnorm=WEIGHT_NORM)
515 | 
# ---- Symbolic inputs of the computation graph ----
sequences_8k   = T.imatrix('sequences_8k') # (batch size, n samples); source of the inputs of every tier
sequences_up   = T.imatrix('sequences_up') # (batch size, n samples); source of the targets
condition   = T.matrix('con') # condition features, reshaped into CON_DIM-wide frames by the condition tier
con_h0      = T.tensor3('con_h0') # initial state of the condition tier
h0          = T.tensor3('h0')     #(batch size, N_RNN, H0_MULT*DIM)
big_h0      = T.tensor3('big_h0') #(batch size, N_BIG_RNN, H0_MULT*BIG_DIM)
reset       = T.iscalar('reset') # non-zero: the tiers start from their learned h0 instead of the h0 passed in
mask        = T.matrix('mask') # (batch size, n samples); multiplied into the accuracy and the cost
batch_size       =T.iscalar('batch_size')
lr=T.scalar('lr')

con_input_sequences = condition

# Big-frame tier input: windows of 2*OVERLAP samples taken every OVERLAP
# samples, then flattened back to one row per sequence.
big_input_sequences = sequences_8k
big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1))
big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1,  2*OVERLAP), neib_step=(1, OVERLAP), mode='valid')
big_input_sequences=big_input_sequences.reshape((batch_size,-1))

# Frame tier input: drop the last OVERLAP-FRAME_SIZE samples, then take
# windows of 2*FRAME_SIZE samples every FRAME_SIZE samples.
# NOTE(review): if OVERLAP == FRAME_SIZE the slice below is [0:-0], i.e.
# empty -- confirm that configuration is never used.
input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)]  #(tier2)
input_sequences=input_sequences.reshape((1, batch_size, 1, -1))
input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1,  2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid')
input_sequences=input_sequences.reshape((batch_size,-1))
target_sequences = sequences_up[:,0:-OVERLAP] # ground truth: all but the last OVERLAP samples

target_mask = mask[:,0:-OVERLAP] # mask cut the same way as the targets

# Chain the tiers from slowest to fastest; each tier's output is the
# `other_input` of the next one.
con_frame_level_outputs, new_con_h0 = con_frame_level_rnn(con_input_sequences,con_h0,reset)

big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, con_frame_level_outputs,big_h0, reset) # condition tier -> big-frame tier

frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset) # big-frame tier -> frame tier

# Sample tier input: every run of FRAME_SIZE consecutive samples (stride 1),
# one row per predicted sample.
prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)]
prev_samples = prev_samples.reshape((1, batch_size, 1, -1))
prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1,  FRAME_SIZE), neib_step=(1, 1), mode='valid') # 2-dim, e.g. ([[x7,x8],[x8,x9],[x9,x10],...])
prev_samples = prev_samples.reshape((batch_size * SEQ_LEN,  FRAME_SIZE))


sample_level_outputs = sample_level_predictor(
    frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)),
    prev_samples
)          # sample_level_outputs dim: (batch_size * SEQ_LEN, Q_LEVELS), e.g. rows predicting x9, x10, ...

# Per-sequence count of masked positions where the value returned by
# lib.ops.softmax_and_no_sample equals the target (presumably the most
# likely level -- TODO confirm against lib.ops).
accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences)
accuracy=accuracy*target_mask
accuracy=T.sum(accuracy,axis=1)
mask_sum=T.sum(target_mask,axis=1)

cost = T.nnet.categorical_crossentropy(
    T.nnet.softmax(sample_level_outputs),  # every row is a distribution over the Q_LEVELS values
    target_sequences.flatten()    # the ground-truth level of every row
)
cost = cost.reshape(target_sequences.shape)
cost = cost * target_mask # same shape as target_sequences; masked-out positions contribute 0
570 | # Don't use these lines; could end up with NaN
571 | # Specially at the end of audio files where mask is
572 | # all zero for some of the shorter files in mini-batch.
573 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1)
574 | #cost = cost.mean(axis=0)
575 | cost_sum=T.sum(cost,axis=1)
576 | # Use this one instead.
577 | cost = cost.sum()
578 | cost = cost / target_mask.sum() #cost average by samples
579 | 
580 | # By default we report cross-entropy cost in bits.
581 | # Switch to nats by commenting out this line:
582 | # log_2(e) = 1.44269504089
583 | #cost = cost * lib.floatX(numpy.log2(numpy.e))
584 | 
585 | ###########
586 | 
587 | test_fn=theano.function(
588 |     [sequences_8k,sequences_up, condition,con_h0,big_h0,h0, reset, mask,batch_size],
589 |     [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_con_h0,new_big_h0,new_h0],
590 |     on_unused_input='warn'
591 | )
592 | 
593 | def generate_and_save_samples(tag):
594 |     def write_audio_file(name, data):
595 |         data = data.astype('float32')
596 |         #data -= data.min()
597 |         #data /= data.max()
598 |         #data -= 0.5
599 |         #data *= 0.95
600 |         scipy.io.wavfile.write(
601 |                     os.path.join(SAMPLES_PATH, name),
602 |                     BITRATE,
603 |                     data)
604 | 
605 |     total_time=time()
606 |     costs_g = []
607 |     accuracys_g=[]
608 |     samples_low_list=[]
609 |     samples_list=[]
610 |     masks_g_index=[]
611 |     samples_number=0
612 |     count=0
613 |     data_feeder = load_data(test_feeder)
614 |     for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,con_g,batch_g,seqs_g_8k_real in data_feeder:
615 |         if reset_g==1:
616 |             con_h0_g=numpy.zeros((batch_g, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32')
617 |             big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
618 |             h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32')
619 |             cost_batch=np.zeros((batch_g,),dtype='float32')
620 |             accuracy_batch=np.zeros((batch_g,),dtype='float32')
621 |             mask_batch=np.zeros((batch_g,),dtype='float32')
622 |         cost_g, accuracy_g,mask_sum_g,sample, con_h0_g,big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, con_g,con_h0_g,big_h0_g,h0_g, reset_g, mask_g,batch_g)
623 |         cost_batch=cost_batch+cost_g
624 |         accuracy_batch=accuracy_batch+accuracy_g
625 |         mask_batch=mask_batch+mask_sum_g
626 |         if end_flag_g==1:
627 |             costs_g.extend(list(cost_batch/mask_batch))
628 |             accuracys_g.extend(list(accuracy_batch/mask_batch))
629 | 
630 |         if reset_g==1:
631 |             samples_low=seqs_g_8k_real[:,0:-OVERLAP]
632 |             samples=sample
633 |             masks_g=mask_g[:,0:-OVERLAP]
634 |         else:
635 |             samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1)
636 |             samples=np.concatenate([samples,sample],axis=1)
637 |             masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1)
638 | 
639 |         if end_flag_g==1:
640 |             samples_low_list.append(samples_low)
641 |             samples_list.append(samples)
642 |             masks_g_index.append(masks_g)
643 |     fid=open('datasets/TIMIT/test_list.scp','r')
644 |     test_id_list=fid.readlines()
645 |     for i in xrange(len(samples_list)):
646 |         samples_number+=samples_list[i].shape[0]*samples_list[i].shape[1]
647 |         for j in xrange(samples_list[i].shape[0]):
648 |             samples_lowi=samples_low_list[i][j]
649 |             samplei=samples_list[i][j]
650 |             maski=masks_g_index[i][j]
651 |             samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])]
652 |             samplei=samplei[0:len(np.where(maski==1)[0])]
653 |             if Q_TYPE == 'mu-law':
654 |                 from datasets.dataset import mu2linear
655 |                 samplei = mu2linear(samplei)
656 |             write_audio_file(test_id_list[count].split()[0], samplei/3+samples_lowi)
657 |             count+=1
658 | 
659 | 
660 |     total_time = time() - total_time
661 |     log = "192 samples generated in {} minutes.\nThe time of generating 1 second speech is {} seconds."
662 |     log = log.format(total_time/60,total_time/samples_number*16000)
663 |     print log,
664 | 
665 |     return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time,list(np.array(accuracys_g)*100)
666 | 
### Handling the resume option:
if RESUME:
    # Check if checkpoint from previous run is not corrupted.
    # Then overwrite some of the variables above.
    # NOTE(review): presumably lib.resumable reads the training log saved
    # under FOLDER_PREFIX -- confirm in lib.
    iters_to_consume, res_path, epoch, total_iters,\
        [lowest_valid_cost, corresponding_test_cost, test_cost] = \
        lib.resumable(path=FOLDER_PREFIX,
                      iter_key=iter_str,
                      epoch_key=epoch_str,
                      add_resume_counter=True,
                      other_keys=[lowest_valid_str,
                                  corresp_test_str,
                                  test_nll_str])
    # At this point we saved the pkl file.
    last_print_iters = total_iters
    print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters)
    # Nothing runs between the two time() calls below, so no minibatches are
    # consumed here and the reported duration is close to zero.
    consume_time = time()
    consume_time = time() - consume_time
    print "Train data ready in {:.2f}secs after consuming {} minibatches.".\
            format(consume_time, iters_to_consume)

    # Restore the parameters from the checkpoint path returned above.
    lib.load_params(res_path)
    print "Parameters from last available checkpoint loaded."

# Script entry point: run the test set once and print the summary.
# test_accuracy_list (per-sequence accuracies) is not used afterwards here.
tag='gen'
test_cost, test_accuracy,test_time,test_accuracy_list=generate_and_save_samples(tag)
print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time)


--------------------------------------------------------------------------------
/CHRNN_HF/models/four_tier/four_tier_train_valid.py:
--------------------------------------------------------------------------------
  1 | from time import time
  2 | from datetime import datetime
  3 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
  4 | exp_start = time()
  5 | 
  6 | import os, sys, glob
  7 | sys.path.insert(1, os.getcwd())
  8 | import argparse
  9 | import itertools
 10 | 
 11 | import numpy
 12 | numpy.random.seed(123)
 13 | np = numpy
 14 | import random
 15 | random.seed(123)
 16 | 
 17 | import theano
 18 | import theano.tensor as T
 19 | import theano.ifelse
 20 | import lasagne
 21 | import scipy.io.wavfile
 22 | 
 23 | import lib
 24 | 
 25 | LEARNING_RATE = 0.001
 26 | 
 27 | ### Parsing passed args/hyperparameters ###
 28 | def get_args():
 29 |     def t_or_f(arg):
 30 |         ua = str(arg).upper()
 31 |         if 'TRUE'.startswith(ua):
 32 |             return True
 33 |         elif 'FALSE'.startswith(ua):
 34 |             return False
 35 |         else:
 36 |            raise ValueError('Arg is neither `True` nor `False`')
 37 | 
 38 |     def check_non_negative(value):
 39 |         ivalue = int(value)
 40 |         if ivalue < 0:
 41 |              raise argparse.ArgumentTypeError("%s is not non-negative!" % value)
 42 |         return ivalue
 43 | 
 44 |     def check_positive(value):
 45 |         ivalue = int(value)
 46 |         if ivalue < 1:
 47 |              raise argparse.ArgumentTypeError("%s is not positive!" % value)
 48 |         return ivalue
 49 | 
 50 |     def check_unit_interval(value):
 51 |         fvalue = float(value)
 52 |         if fvalue < 0 or fvalue > 1:
 53 |              raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value)
 54 |         return fvalue
 55 | 
 56 |     # No default value here. Indicate every single arguement.
 57 |     parser = argparse.ArgumentParser(
 58 |         description='three_tier.py\nNo default value! Indicate every argument.')
 59 | 
 60 |     # TODO: Fix the descriptions
 61 |     # Hyperparameter arguements:
 62 |     parser.add_argument('--exp', help='Experiment name',
 63 |             type=str, required=False, default='_')
 64 |     parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True)
 65 |     parser.add_argument('--con_dim', help='Condition dimension',\
 66 |             type=check_positive, required=True)
 67 |     parser.add_argument('--con_frame_size', help='How many samples per condition frame',\
 68 |             type=check_positive, required=True)
 69 |     parser.add_argument('--big_frame_size', help='How many samples per big frame',\
 70 |             type=check_positive, required=True)
 71 |     parser.add_argument('--frame_size', help='How many samples per frame',\
 72 |             type=check_positive, required=True)
 73 |     parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\
 74 |             type=t_or_f, required=True)
 75 |     parser.add_argument('--emb_size', help='Size of embedding layer (> 0)',
 76 |             type=check_positive, required=True)  # different than two_tier
 77 |     parser.add_argument('--skip_conn', help='Add skip connections to RNN',
 78 |             type=t_or_f, required=True)
 79 |     parser.add_argument('--dim', help='Dimension of RNN and MLPs',\
 80 |             type=check_positive, required=True)
 81 |     parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN',
 82 |             type=check_positive, choices=xrange(1,6), required=True)
 83 |     parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\
 84 |             required=True)
 85 |     parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\
 86 |             type=t_or_f, required=True)
 87 |     parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\
 88 |             type=check_positive, required=True)
 89 |     parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\
 90 |             choices=['linear', 'a-law', 'mu-law'], required=True)
 91 |     parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK',
 92 |             choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True)
 93 |     parser.add_argument('--batch_size', help='size of mini-batch',
 94 |             type=check_positive, choices=[50,64, 128, 256], required=True)
 95 | 
 96 |     parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\
 97 |             required=False, default=False, action='store_true')
 98 | 
 99 |     args = parser.parse_args()
100 | 
101 |     # NEW
102 |     # Create tag for this experiment based on passed args
103 |     tag='four_tier_model'
104 |     print "Created experiment tag for these args:"
105 |     print tag
106 | 
107 |     return args, tag
108 | 
# NOTE(review): get_args() returns the fixed tag 'four_tier_model'; the example
# below looks like a leftover from a tag that was built out of the arguments.
#tag:three_tier.py-expAXIS1-seq_len512-big_frame_size8-frame_size2-weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F-q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001
args, tag = get_args()

# Values in parentheses in the trailing comments are example settings.
SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (512)
#print "------------------previous SEQ_LEN:", SEQ_LEN
# TODO: test incremental training
#SEQ_LEN = 512 + 256
#print "---------------------------new SEQ_LEN:", SEQ_LEN
CON_DIM=args.con_dim # Condition dimension
CON_FRAME_SIZE=args.con_frame_size # How many samples per condition frame
BIG_FRAME_SIZE = args.big_frame_size # how many samples per big frame
FRAME_SIZE = args.frame_size # How many samples per frame
WEIGHT_NORM = args.weight_norm #True
EMB_SIZE = args.emb_size #(256)
SKIP_CONN = args.skip_conn #(False)
DIM = args.dim # Model dimensionality. (1024)
# The big-frame tier and the condition tier reuse the frame tier's width and
# depth (DIM and N_RNN); they are not separate command-line options.
BIG_DIM = DIM # Dimensionality for the slowest level. (1024)
CON_TIER_DIM=DIM
N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (1)
N_BIG_RNN = N_RNN # how many RNNs to stack in the big-frame-level model (1)
N_CON_RNN=N_RNN
RNN_TYPE = args.rnn_type #GRU
# Multiplier applied to the hidden width in every h0 shape.
# NOTE(review): presumably 2 for LSTM because hidden and cell state are stored
# together -- confirm in lib.ops.stackedLSTM.
H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 #(1)
LEARN_H0 = args.learn_h0 #(True)
Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization #(256)
Q_TYPE = args.q_type # log- or linear-scale #(linear)
WHICH_SET = args.which_set #(MUSIC)
BATCH_SIZE = args.batch_size #(128)
RESUME = args.resume #(False)
# Each tier's frame size must be a whole multiple of the next faster tier's;
# the tier functions rely on these ratios being integers.
assert SEQ_LEN % CON_FRAME_SIZE == 0,\
    'seq_len should be divisible by con_frame_size'
assert CON_FRAME_SIZE % BIG_FRAME_SIZE == 0,\
    'con_frame_size should be divisible by big_frame_size'
assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\
    'big_frame_size should be divisible by frame_size'

if Q_TYPE == 'mu-law' and Q_LEVELS != 256:
    raise ValueError('For mu-law Quantization levels should be exactly 256!')

# Fixed hyperparams
GRAD_CLIP = 1 # Elementwise grad clip threshold
BITRATE = 16000

# Other constants
TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS
#TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME
#TRAIN_MODE = 'time-iters'
# To use PRINT_TIME for validation,
# and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp.
#TRAIN_MODE = 'iters-time'
# To use PRINT_ITERS for validation,
# and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp.
PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations.
STOP_ITERS = 300000 # Stop after this many iterations
PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds.
STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
N_SEQS = 5  # Number of samples to generate every time monitoring.
RESULTS_DIR = 'results_4t'
FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag)
Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
# Extra samples carried by every chunk; equals the big frame size.
OVERLAP = BIG_FRAME_SIZE

# Key names passed to lib.resumable when a run is resumed.
epoch_str = 'epoch'
iter_str = 'iter'
lowest_valid_str = 'lowest valid cost'
corresp_test_str = 'correponding test cost'
train_nll_str, valid_nll_str, test_nll_str = \
    'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)'
177 | 
### Create directories ###
#   FOLDER_PREFIX: root, contains:
#       log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt]
#   FOLDER_PREFIX/params: saves all checkpoint params as pkl
#   FOLDER_PREFIX/samples: keeps all checkpoint samples as wav
#   FOLDER_PREFIX/best: keeps the best parameters, samples, ...
# Each directory is created only when it does not exist yet, so a re-run with
# the same tag reuses the existing tree.
if not os.path.exists(FOLDER_PREFIX):
    os.makedirs(FOLDER_PREFIX)
PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params')
if not os.path.exists(PARAMS_PATH):
    os.makedirs(PARAMS_PATH)
SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples')
if not os.path.exists(SAMPLES_PATH):
    os.makedirs(SAMPLES_PATH)
BEST_PATH = os.path.join(FOLDER_PREFIX, 'best')
if not os.path.exists(BEST_PATH):
    os.makedirs(BEST_PATH)

# locals() at module level is the module namespace, so every name defined
# above is handed to the helper.
# NOTE(review): presumably this records the settings under FOLDER_PREFIX --
# confirm in lib.
lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True)
197 | 
### Import the data_feeder ###
# Handling WHICH_SET
if WHICH_SET == 'TIMIT':
    from datasets.dataset import TIMIT_train_feed_epoch as train_feeder
    from datasets.dataset import TIMIT_valid_feed_epoch as valid_feeder
    from datasets.dataset import TIMIT_test_feed_epoch  as test_feeder
else:
    # --which_set accepts other names, but only TIMIT binds the feeders here.
    # Fail now with a clear message rather than with a NameError on the first
    # use of a feeder.
    raise ValueError(
        'No data feeder available for which_set={!r}; only TIMIT is supported.'.format(WHICH_SET))
204 | 
def load_data(data_feeder):
    """
    Build the batch iterator for one dataset split.

    `data_feeder` is one of `train_feeder`, `valid_feeder` or `test_feeder`.
    It is called with this run's batching and quantization settings, and
    whatever it returns is handed back unchanged.
    """
    feeder_args = (
        BATCH_SIZE,
        SEQ_LEN,
        CON_FRAME_SIZE,
        CON_DIM,
        OVERLAP,
        Q_LEVELS,
        Q_ZERO,
        Q_TYPE,
    )
    return data_feeder(*feeder_args)
218 | 
219 | ### Creating computation graph ###
def con_frame_level_rnn(input_sequences, h0, reset):
    """
    Condition tier: run the stacked RNN over the conditioning frames and
    project every step into BIG_DIM-wide vectors, one per big frame.

    input_sequences.shape: (batch size, n con frames * CON_DIM)
    h0.shape:              (batch size, N_CON_RNN, H0_MULT*CON_TIER_DIM)
    reset.shape:           ()
    output[0].shape:       (batch size,
                            n con frames * CON_FRAME_SIZE // BIG_FRAME_SIZE,
                            BIG_DIM)
    output[1]:             last hidden state of the stacked RNN; callers feed
                           it back as `h0` on the next call
    """
    frames = input_sequences.reshape((
        input_sequences.shape[0],
        input_sequences.shape[1] // CON_DIM,
        CON_DIM
    ))
    # The conditioning features go into the RNN as given; no rescaling is
    # applied in this tier.

    # Initial state of RNNs. Sized with this tier's own constants
    # (N_CON_RNN / CON_TIER_DIM), which are also what the RNN below uses.
    learned_h0 = lib.param(
        'ConFrameLevel.h0',
        numpy.zeros((N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype=theano.config.floatX)
    )
    # Handling LEARN_H0
    # NOTE(review): the `param` attribute is presumably read by lib to decide
    # whether this variable is trained -- confirm in lib.
    learned_h0.param = LEARN_H0
    # Repeat the learned state for every item of the batch.
    learned_h0 = T.alloc(learned_h0, h0.shape[0], N_CON_RNN, H0_MULT*CON_TIER_DIM)
    learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2)
    # reset != 0 -> start from the learned state, otherwise continue from h0.
    h0 = theano.ifelse.ifelse(reset, learned_h0, h0)

    # Handling RNN_TYPE
    # Handling SKIP_CONN
    if RNN_TYPE == 'GRU':
        rnns_out, last_hidden = lib.ops.stackedGRU('ConFrameLevel.GRU',
                                                   N_CON_RNN,
                                                   CON_DIM,
                                                   CON_TIER_DIM,
                                                   frames,
                                                   h0=h0,
                                                   weightnorm=WEIGHT_NORM,
                                                   skip_conn=SKIP_CONN)
    elif RNN_TYPE == 'LSTM':
        rnns_out, last_hidden = lib.ops.stackedLSTM('ConFrameLevel.LSTM',
                                                    N_CON_RNN,
                                                    CON_DIM,
                                                    CON_TIER_DIM,
                                                    frames,
                                                    h0=h0,
                                                    weightnorm=WEIGHT_NORM,
                                                    skip_conn=SKIP_CONN)
    else:
        raise ValueError('RNN_TYPE should be GRU or LSTM')

    # Explicit floor division keeps the layer width and the reshape size
    # integral regardless of how `/` treats integers. The module-level assert
    # guarantees the ratio is exact.
    big_frames_per_con_frame = CON_FRAME_SIZE // BIG_FRAME_SIZE

    output = lib.ops.Linear(       #batch*timestep*dim
        'ConFrameLevel.Output',
        CON_TIER_DIM,
        BIG_DIM * big_frames_per_con_frame,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    # Split every condition step into one BIG_DIM vector per big frame.
    output = output.reshape((output.shape[0], output.shape[1] * big_frames_per_con_frame, BIG_DIM))

    return (output, last_hidden)
284 | 
def big_frame_level_rnn(input_sequences, other_input,h0, reset):
    """
    Big-frame tier: combine windows of past samples with the condition tier's
    output and emit one DIM-wide vector per frame of the tier below.

    input_sequences.shape: (batch size, n big frames * 2*BIG_FRAME_SIZE)
    other_input.shape:     (batch size, n big frames, BIG_DIM)
    h0.shape:              (batch size, N_BIG_RNN, H0_MULT*BIG_DIM)
    reset.shape:           ()
    output[0].shape:       (batch size,
                            n big frames * BIG_FRAME_SIZE // FRAME_SIZE,
                            DIM)
    output[1]:             last hidden state of the stacked RNN; callers feed
                           it back as `h0` on the next call
    """
    frames = input_sequences.reshape((
        input_sequences.shape[0],
        input_sequences.shape[1] // (2*BIG_FRAME_SIZE),
        2*BIG_FRAME_SIZE
    ))

    # Rescale frames from ints in [0, Q_LEVELS) to floats in about [-1, 1).
    # The second line multiplies by 1, so it does not widen the range.
    frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
    frames *= lib.floatX(1)

    # Project every window to BIG_DIM and add the conditioning from above.
    gru_input = lib.ops.Linear(
        'BigFrameLevel.InputExpand',
        2*BIG_FRAME_SIZE,
        BIG_DIM,
        frames,
        initialization='he',
        weightnorm=WEIGHT_NORM,
        ) + other_input

    # Initial state of RNNs
    learned_h0 = lib.param(
        'BigFrameLevel.h0',
        numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX)
    )
    # Handling LEARN_H0
    # NOTE(review): the `param` attribute is presumably read by lib to decide
    # whether this variable is trained -- confirm in lib.
    learned_h0.param = LEARN_H0
    # Repeat the learned state for every item of the batch.
    learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM)
    learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2)
    # reset != 0 -> start from the learned state, otherwise continue from h0.
    h0 = theano.ifelse.ifelse(reset, learned_h0, h0)

    # Handling RNN_TYPE
    # Handling SKIP_CONN
    if RNN_TYPE == 'GRU':
        rnns_out, last_hidden = lib.ops.stackedGRU('BigFrameLevel.GRU',
                                                   N_BIG_RNN,
                                                   BIG_DIM,
                                                   BIG_DIM,
                                                   gru_input,
                                                   h0=h0,
                                                   weightnorm=WEIGHT_NORM,
                                                   skip_conn=SKIP_CONN)
    elif RNN_TYPE == 'LSTM':
        rnns_out, last_hidden = lib.ops.stackedLSTM('BigFrameLevel.LSTM',
                                                    N_BIG_RNN,
                                                    BIG_DIM,
                                                    BIG_DIM,
                                                    gru_input,
                                                    h0=h0,
                                                    weightnorm=WEIGHT_NORM,
                                                    skip_conn=SKIP_CONN)
    else:
        raise ValueError('RNN_TYPE should be GRU or LSTM')

    # Explicit floor division keeps the layer width and the reshape size
    # integral regardless of how `/` treats integers. The module-level assert
    # guarantees the ratio is exact.
    frames_per_big_frame = BIG_FRAME_SIZE // FRAME_SIZE

    output = lib.ops.Linear(       #batch*timestep*dim
        'BigFrameLevel.Output',
        BIG_DIM,
        DIM * frames_per_big_frame,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    # Split every big-frame step into one DIM vector per frame.
    output = output.reshape((output.shape[0], output.shape[1] * frames_per_big_frame, DIM))

    return (output, last_hidden)
357 | 
def frame_level_rnn(input_sequences, other_input, h0, reset):
    """
    Frame tier: combine windows of past samples with the conditioning from
    the tier above and emit one DIM-wide vector per sample position.

    input_sequences.shape: (batch size, n frames * 2*FRAME_SIZE)
    other_input.shape:     (batch size, n frames, DIM)
    h0.shape:              (batch size, N_RNN, H0_MULT*DIM)
    reset.shape:           ()
    output[0].shape:       (batch size, n frames * FRAME_SIZE, DIM)
    output[1]:             last hidden state returned by the stacked RNN
    """
    window = 2*FRAME_SIZE
    n_batch = input_sequences.shape[0]
    frames = input_sequences.reshape((
        n_batch,
        input_sequences.shape[1] // window,
        window
    ))

    # Map the integer levels to floats: divide by Q_LEVELS/2, subtract 1,
    # then multiply by a factor of 1.
    frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
    frames *= lib.floatX(1)

    # Expand every window to DIM and add the conditioning from above.
    expanded = lib.ops.Linear(
        'FrameLevel.InputExpand',
        window,
        DIM,
        frames,
        initialization='he',
        weightnorm=WEIGHT_NORM,
        )
    gru_input = expanded + other_input

    # Learned initial state, repeated for every item of the batch.
    h0_dims = (N_RNN, H0_MULT*DIM)
    learned_h0 = lib.param(
        'FrameLevel.h0',
        numpy.zeros(h0_dims, dtype=theano.config.floatX)
    )
    learned_h0.param = LEARN_H0
    learned_h0 = T.unbroadcast(
        T.alloc(learned_h0, h0.shape[0], *h0_dims),
        0, 1, 2)
    # reset != 0 -> start from the learned state, otherwise continue from h0.
    h0 = theano.ifelse.ifelse(reset, learned_h0, h0)

    # Pick the recurrent stack for RNN_TYPE; both take the same arguments.
    if RNN_TYPE == 'GRU':
        stacked_rnn, rnn_name = lib.ops.stackedGRU, 'FrameLevel.GRU'
    elif RNN_TYPE == 'LSTM':
        stacked_rnn, rnn_name = lib.ops.stackedLSTM, 'FrameLevel.LSTM'
    rnns_out, last_hidden = stacked_rnn(rnn_name,
                                        N_RNN,
                                        DIM,
                                        DIM,
                                        gru_input,
                                        h0=h0,
                                        weightnorm=WEIGHT_NORM,
                                        skip_conn=SKIP_CONN)

    # One DIM vector per sample: project to FRAME_SIZE*DIM, then unfold.
    projected = lib.ops.Linear(
        'FrameLevel.Output',
        DIM,
        FRAME_SIZE * DIM,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    output = projected.reshape((projected.shape[0], projected.shape[1] * FRAME_SIZE, DIM))

    return (output, last_hidden)
430 | 
def sample_level_predictor(frame_level_outputs, prev_samples):
    """
    Sample tier: an MLP that turns the frame-level vector and the previous
    FRAME_SIZE samples of one position into unnormalised scores over the
    quantization levels (the softmax is applied by the caller).

    frame_level_outputs.shape: (batch size, DIM)
    prev_samples.shape:        (batch size, FRAME_SIZE)
    output.shape:              (batch size, Q_LEVELS)
    """
    # Handling EMB_SIZE: 0 selects a one-hot encoding of the previous samples,
    # a positive value an embedding of that width.
    if EMB_SIZE == 0:  # no support for one-hot in three_tier and one_tier.
        encoded = lib.ops.T_one_hot(prev_samples, Q_LEVELS)
        per_sample_width = Q_LEVELS
    elif not EMB_SIZE > 0:
        raise ValueError('EMB_SIZE cannot be negative.')
    else:
        encoded = lib.ops.Embedding(
            'SampleLevel.Embedding',
            Q_LEVELS,
            EMB_SIZE,
            prev_samples)
        per_sample_width = EMB_SIZE

    # Concatenate the encodings of the FRAME_SIZE previous samples per row.
    context_width = FRAME_SIZE * per_sample_width
    encoded = encoded.reshape((-1, context_width))

    out = lib.ops.Linear(
        'SampleLevel.L1_PrevSamples',
        context_width,
        DIM,
        encoded,
        biases=False,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )

    # No nonlinearity after the first layer; the frame-level vector is
    # simply added.
    out += frame_level_outputs

    # Two hidden layers (L2, L3) of width DIM, each followed by a ReLU.
    for layer_name in ('SampleLevel.L2', 'SampleLevel.L3'):
        out = lib.ops.Linear(layer_name,
                             DIM,
                             DIM,
                             out,
                             initialization='he',
                             weightnorm=WEIGHT_NORM)
        out = T.nnet.relu(out)

    # Output layer; the softmax is applied later.
    return lib.ops.Linear('SampleLevel.Output',
                          DIM,
                          Q_LEVELS,
                          out,
                          weightnorm=WEIGHT_NORM)
493 | 
494 | sequences_8k   = T.imatrix('sequences_8k') #batch size*samplenum; int32 input samples fed to every tier below
495 | sequences_up   = T.imatrix('sequences_up') #int32 matrix that supplies the prediction targets (see target_sequences)
496 | condition   = T.matrix('con') #float matrix passed unchanged to con_frame_level_rnn
497 | con_h0      = T.tensor3('con_h0') #initial hidden state handed to con_frame_level_rnn
498 | h0          = T.tensor3('h0')     #(batch size, N_RNN, DIM); the loops below allocate H0_MULT*DIM on the last axis
499 | big_h0      = T.tensor3('big_h0') #(batch size, N_BIG_RNN, BIG_DIM); the loops below allocate H0_MULT*DIM on the last axis
500 | reset       = T.iscalar('reset') #int32 flag forwarded to all three RNN tiers; when it is 1 the loops below zero the hidden states first
501 | mask        = T.matrix('mask') #batch size*samplenum; multiplied into cost and accuracy
502 | batch_size       =T.iscalar('batch_size') #runtime batch size used by the reshapes below
503 | lr=T.scalar('lr') #learning rate handed to lasagne.updates.adam
504 | 
505 | con_input_sequences = condition
506 | 
507 | big_input_sequences = sequences_8k #(tier3) starts from the whole input, trailing overlap included
508 | big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1)) #4-D layout (1, batch, 1, samples) for images2neibs
509 | big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1,  2*OVERLAP), neib_step=(1, OVERLAP), mode='valid') #windows of 2*OVERLAP samples, hop of OVERLAP
510 | big_input_sequences=big_input_sequences.reshape((batch_size,-1)) #windows laid out again as one row per batch element
511 | 
512 | input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)]  #(tier2) NOTE(review): this slice is empty if OVERLAP == FRAME_SIZE - confirm OVERLAP > FRAME_SIZE
513 | input_sequences=input_sequences.reshape((1, batch_size, 1, -1)) #4-D layout (1, batch, 1, samples) for images2neibs
514 | input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1,  2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid') #windows of 2*FRAME_SIZE samples, hop of FRAME_SIZE
515 | input_sequences=input_sequences.reshape((batch_size,-1))
516 | target_sequences = sequences_up[:,0:-OVERLAP] #ground truth: sequences_up without its last OVERLAP samples
517 | 
518 | target_mask = mask[:,0:-OVERLAP] #mask trimmed the same way as target_sequences
519 | 
520 | con_frame_level_outputs, new_con_h0 = con_frame_level_rnn(con_input_sequences,con_h0,reset)
521 | 
522 | big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, con_frame_level_outputs,big_h0, reset)#tier3->tier2
523 | 
524 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1
525 | 
526 | prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)] #one sample shorter than the slice used for input_sequences
527 | prev_samples = prev_samples.reshape((1, batch_size, 1, -1))
528 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1,  FRAME_SIZE), neib_step=(1, 1), mode='valid') #2-dim:([[x7,x8],[x8,x9],[x9,x10],...])
529 | prev_samples = prev_samples.reshape((batch_size * SEQ_LEN,  FRAME_SIZE)) #one row of FRAME_SIZE previous samples per predicted sample
530 | 
531 | sample_level_outputs = sample_level_predictor(
532 |     frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)),
533 |     prev_samples
534 | )          #sample_level_outputs dim:(BATCH_SIZE * SEQ_LEN, Q_LEVELS) -> [[x9pre],[x10pre],...]
535 | 
536 | accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences) #1 where the output of softmax_and_no_sample (presumably the most likely class - verify in lib/ops.py) equals the target
537 | accuracy=accuracy*target_mask #masked-out positions count as 0
538 | accuracy=T.sum(accuracy,axis=1) #number of matches per sequence
539 | mask_sum=T.sum(target_mask,axis=1) #sum of the mask per sequence, used by the callers as the denominator
540 | 
541 | cost = T.nnet.categorical_crossentropy(
542 |     T.nnet.softmax(sample_level_outputs),  #Every row is a probability distribution over the Q_LEVELS classes
543 |     target_sequences.flatten()    #A flat vector holding the ground-truth class of every row
544 | )
545 | cost = cost.reshape(target_sequences.shape) #back to the shape of target_sequences
546 | cost = cost * target_mask #dim: batch*num
547 | # Don't use the two lines below; they could end up with NaN,
548 | # especially at the end of audio files where the mask is
549 | # all zero for some of the shorter files in the mini-batch.
550 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1)
551 | #cost = cost.mean(axis=0)
552 | cost_sum=T.sum(cost,axis=1) #masked cost summed per sequence; returned by valid_fn and test_fn
553 | # Use this one instead.
554 | cost = cost.sum()
555 | cost = cost / target_mask.sum() #cost averaged over all unmasked samples of the mini-batch
556 | 
557 | # The conversion below is disabled, so the cross-entropy is reported as
558 | # computed by categorical_crossentropy (natural log, i.e. nats).
559 | # Re-enable the next line to report bits: log_2(e) = 1.44269504089
560 | #cost = cost * lib.floatX(numpy.log2(numpy.e))
561 | 
562 | ###########
563 | all_params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) #keeps variables flagged with .param == True; if LEARN_H0=True, the learned h0 is included in the parameters to train
564 | 
565 | lib.print_params_info(all_params, path=FOLDER_PREFIX)
566 | 
567 | grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn') #warn (instead of raising) for parameters not connected to cost
568 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] #elementwise clip to [-GRAD_CLIP, GRAD_CLIP]
569 | 
570 | updates = lasagne.updates.adam(grads, all_params,learning_rate=lr) #lr is a symbolic input, so it can change between calls
571 | 
572 | # Training function: one Adam step; returns the mini-batch cost and the new hidden states
573 | train_fn = theano.function(
574 |     [sequences_8k,sequences_up, condition, con_h0,big_h0, h0, reset, mask,batch_size,lr],
575 |     [cost, new_con_h0,new_big_h0, new_h0],
576 |     updates=updates,
577 |     on_unused_input='warn'
578 | )
579 | 
580 | # Validation function, hence no updates; returns per-sequence sums instead of the averaged cost
581 | valid_fn = theano.function(
582 |     [sequences_8k,sequences_up, condition,con_h0,big_h0,h0, reset, mask,batch_size],
583 |     [cost_sum, accuracy,mask_sum,new_con_h0,new_big_h0,new_h0],
584 |     on_unused_input='warn'
585 | )
586 | # Test function: same outputs as valid_fn plus the predicted samples (4th output)
587 | test_fn=theano.function(
588 |     [sequences_8k,sequences_up, condition,con_h0,big_h0,h0, reset, mask,batch_size],
589 |     [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_con_h0,new_big_h0,new_h0],
590 |     on_unused_input='warn'
591 | )
592 | 
593 | def generate_and_save_samples(tag): # runs test_fn over test_feeder, writes N_SEQS wav files named with `tag`; returns (mean cost, accuracy in %, elapsed time)
594 |     def write_audio_file(name, data): # writes `data` as float32 to SAMPLES_PATH/<name>.wav with rate BITRATE
595 |         data = data.astype('float32')
596 |         #data -= data.min()
597 |         #data /= data.max()
598 |         #data -= 0.5
599 |         #data *= 0.95
600 |         scipy.io.wavfile.write(
601 |                     os.path.join(SAMPLES_PATH, name+'.wav'),
602 |                     BITRATE,
603 |                     data)
604 |     # Evaluate the whole test feeder; audio is collected from the first batch only.
605 |     total_time=time()
606 |     costs_g = []
607 |     accuracys_g=[]
608 |     count=0 # number of batches started so far
609 |     data_feeder = load_data(test_feeder)
610 |     for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,con_g,batch_g,seqs_g_8k_real in data_feeder:
611 |         if reset_g==1: # first chunk of a new batch: zero the hidden states and the per-sequence accumulators
612 |             con_h0_g=numpy.zeros((batch_g, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32')
613 |             big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
614 |             h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32')
615 |             cost_batch=np.zeros((batch_g,),dtype='float32')
616 |             accuracy_batch=np.zeros((batch_g,),dtype='float32')
617 |             mask_batch=np.zeros((batch_g,),dtype='float32')
618 |             count+=1
619 |         cost_g, accuracy_g,mask_sum_g,sample, con_h0_g,big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, con_g,con_h0_g,big_h0_g,h0_g, reset_g, mask_g,batch_g) # hidden states are carried over to the next chunk
620 |         cost_batch=cost_batch+cost_g
621 |         accuracy_batch=accuracy_batch+accuracy_g
622 |         mask_batch=mask_batch+mask_sum_g
623 |         if end_flag_g==1: # last chunk of the batch: store the per-sequence averages
624 |             costs_g.extend(list(cost_batch/mask_batch)) # NOTE(review): a sequence whose mask sums to 0 would give 0/0 here
625 |             accuracys_g.extend(list(accuracy_batch/mask_batch))
626 |         # Keep the chunks of the first batch (input, prediction, mask), concatenated along time.
627 |         if count==1:
628 |             if reset_g==1:
629 |                 samples_low=seqs_g_8k_real[:,0:-OVERLAP]
630 |                 samples=sample
631 |                 masks_g=mask_g[:,0:-OVERLAP]
632 |             else:
633 |                 samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1)
634 |                 samples=np.concatenate([samples,sample],axis=1)
635 |                 masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1)
636 | 
637 |     # Write one wav per sequence of the first batch.
638 |     for i in xrange(N_SEQS): # NOTE(review): assumes the first batch holds at least N_SEQS sequences
639 |         samples_lowi=samples_low[i]
640 |         samplei=samples[i]
641 |         maski=masks_g[i]
642 |         samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])] # keep as many leading samples as there are mask==1 entries
643 |         samplei=samplei[0:len(np.where(maski==1)[0])]
644 |         if Q_TYPE == 'mu-law':
645 |             from datasets.dataset import mu2linear
646 |             samplei = mu2linear(samplei)
647 |         write_audio_file("sample_{}_{}".format(tag, i), samplei/3+samples_lowi) # prediction scaled by 1/3 plus the real-valued input; presumably undoes the x3 gain applied in data preparation - verify
648 | 
649 |     total_time = time() - total_time
650 |     log = "{} samples generated in {} seconds."
651 |     log = log.format(N_SEQS, total_time)
652 |     print log,
653 |     # Means are taken over the per-sequence averages; accuracy is converted to percent.
654 |     return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time
655 | 
656 | 
657 | def monitor(data_feeder):
658 |     """
659 |     Cost, accuracy and time of valid_fn on a given dataset section.
660 |     Pass only one of `valid_feeder` or `test_feeder`.
661 |     Don't pass `train_feed`.
662 | 
663 |     :returns:
664 |         Mean per-sequence cost and mean per-sequence accuracy (in percent)
665 |         over the input dataset (data_feeder), and the total time spent
666 |     """
667 |     _total_time = time()
668 |     _costs = []
669 |     _accuracys=[]
670 |     _data_feeder = load_data(data_feeder)
671 |     for _seqs_8k,_seqs_up, _reset, _end_flag,_mask,_con,_batch,_seqs_8k_real in _data_feeder:
672 |         if _reset==1: # first chunk of a new batch: zero the hidden states and the per-sequence accumulators
673 |             _con_h0=numpy.zeros((_batch, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32')
674 |             _big_h0=numpy.zeros((_batch, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
675 |             _h0 = numpy.zeros((_batch, N_RNN, H0_MULT*DIM), dtype='float32')
676 |             _cost_batch=np.zeros((_batch,),dtype='float32')
677 |             _accuracy_batch=np.zeros((_batch,),dtype='float32')
678 |             _mask_batch=np.zeros((_batch,),dtype='float32')
679 |         _cost, _accuracy,_mask_sum,_con_h0,_big_h0,_h0 = valid_fn(_seqs_8k,_seqs_up, _con,_con_h0,_big_h0,_h0, _reset, _mask,_batch) # hidden states are carried over to the next chunk
680 |         _cost_batch=_cost_batch+_cost
681 |         _accuracy_batch=_accuracy_batch+_accuracy
682 |         _mask_batch=_mask_batch+_mask_sum
683 |         if _end_flag==1: # last chunk of the batch: store the per-sequence averages
684 |             _costs.extend(list(_cost_batch/_mask_batch)) # NOTE(review): a sequence whose mask sums to 0 would give 0/0 here
685 |             _accuracys.extend(list(_accuracy_batch/_mask_batch))
686 | 
687 | 
688 |     return numpy.mean(_costs), numpy.mean(_accuracys)*100,time() - _total_time
689 | 
690 | print "Wall clock time spent before training started: {:.2f}h"\
691 |         .format((time()-exp_start)/3600.)
692 | print "Training!"
693 | total_iters = 0 # minibatches processed so far
694 | total_time = 0. # time spent inside train_fn only
695 | last_print_time = 0. # value of total_time at the last scheduled monitoring step
696 | last_print_iters = 0 # value of total_iters at the last scheduled monitoring step
697 | costs = [] # training costs collected since the last scheduled monitoring step
698 | lowest_valid_cost = numpy.finfo(numpy.float32).max # largest float32, so the first validation always improves on it
699 | corresponding_test_cost = numpy.finfo(numpy.float32).max
700 | new_lowest_cost = False
701 | end_of_batch = False # set to True when the train feeder was exhausted (one epoch finished)
702 | epoch = 0
703 | learning_rate=LEARNING_RATE
704 | 
705 | # Initial load train dataset
706 | tr_feeder = load_data(train_feeder)
707 | 
708 | ### Handling the resume option:
709 | if RESUME:
710 |     # Check if checkpoint from previous run is not corrupted.
711 |     # Then overwrite some of the variables above.
712 |     iters_to_consume, res_path, epoch, total_iters,\
713 |         [lowest_valid_cost, corresponding_test_cost, test_cost] = \
714 |         lib.resumable(path=FOLDER_PREFIX,
715 |                       iter_key=iter_str,
716 |                       epoch_key=epoch_str,
717 |                       add_resume_counter=True,
718 |                       other_keys=[lowest_valid_str,
719 |                                   corresp_test_str,
720 |                                   test_nll_str])
721 |     # At this point we saved the pkl file.
722 |     last_print_iters = total_iters
723 |     print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters)
724 |     # Consume this many iterations to get back to the last point in the training data.
725 |     consume_time = time()
726 |     for i in xrange(iters_to_consume):
727 |         tr_feeder.next()
728 |     consume_time = time() - consume_time
729 |     print "Train data ready in {:.2f}secs after consuming {} minibatches.".\
730 |             format(consume_time, iters_to_consume)
731 |     # Restore the parameters stored at res_path.
732 |     lib.load_params(res_path)
733 |     print "Parameters from last available checkpoint loaded."
734 | 
735 | while True:
736 |     # THIS IS ONE ITERATION
737 |     if total_iters % 500 == 0:
738 |         print total_iters,
739 | 
740 |     total_iters += 1
741 | 
742 |     try:
743 |         # Take as many mini-batches as possible from train set
744 |         mini_batch = tr_feeder.next()
745 |     except StopIteration:
746 |         # Mini-batches are finished. Load it again.
747 |         # Basically, one epoch.
748 |         tr_feeder = load_data(train_feeder)
749 | 
750 |         # and start taking new mini-batches again.
751 |         mini_batch = tr_feeder.next()
752 |         epoch += 1
753 |         end_of_batch = True
754 |         print "[Another epoch]",
755 |     # NOTE: reset, mask, h0, ... rebind the symbolic variables of the same name; harmless because the Theano functions are already compiled.
756 |     seqs_8k, seqs_up,reset, end_flag,mask,con,batch_num,seqs_8k_real = mini_batch
757 |     if reset==1: # first chunk of a new batch: start from zero hidden states
758 |         con_h0=numpy.zeros((batch_num, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32')
759 |         big_h0=numpy.zeros((batch_num, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
760 |         h0 = numpy.zeros((batch_num, N_RNN, H0_MULT*DIM), dtype='float32')
761 | 
762 |     start_time = time()
763 |     cost,con_h0,big_h0,h0 = train_fn(seqs_8k, seqs_up, con,con_h0, big_h0, h0, reset, mask,batch_num,learning_rate) # one update step; hidden states are carried over to the next chunk
764 |     total_time += time() - start_time
765 |     #print "This cost:", cost, "This h0.mean()", h0.mean()
766 | 
767 |     costs.append(cost)
768 | 
769 |     # Monitoring step: triggered by the iteration/time schedule of TRAIN_MODE or at the end of an epoch
770 |     if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
771 |         (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME) or \
772 |         (TRAIN_MODE=='time-iters' and total_time-last_print_time >= PRINT_TIME) or \
773 |         (TRAIN_MODE=='iters-time' and total_iters-last_print_iters >= PRINT_ITERS) or \
774 |         end_of_batch:
775 |         # 0. Validation
776 |         print "\nValidation!",
777 |         valid_cost, valid_accuracy,valid_time = monitor(valid_feeder)
778 |         print "Done!"
779 | 
780 |         # 1. Test
781 |         test_time = 0.
782 |         # Record whether the validation cost improved; the test cost itself is computed in the sampling step below.
783 |         if valid_cost < lowest_valid_cost:
784 |             lowest_valid_cost = valid_cost
785 |             print "\n>>> Best validation cost of {} reached."\
786 |                     .format(valid_cost),
787 |             #test_cost, test_time = monitor(test_feeder)
788 |             #print "Done!"
789 |             # Report last one which is the lowest on validation set:
790 |             #print ">>> test cost:{}\ttotal time:{}".format(test_cost, test_time)
791 |             #corresponding_test_cost = test_cost
792 |             new_lowest_cost = True
793 | 
794 |         tag = "e{}_i{}_t{:.2f}_tr{:.4f}_v{:.4f}"
795 |         tag = tag.format(epoch,
796 |                          total_iters,
797 |                          total_time/3600,
798 |                          numpy.mean(cost), # cost of the latest minibatch only; print_info below uses the mean over `costs`
799 |                          valid_cost)
800 |         tag += ("_best" if new_lowest_cost else "")
801 | 
802 |         print "Sampling!",
803 |         # Generate samples; this also evaluates the test set
804 |         test_cost, test_accuracy,test_time=generate_and_save_samples(tag)
805 |         print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time)
806 |         if new_lowest_cost:
807 |             corresponding_test_cost = test_cost
808 |         print "Done!"
809 | 
810 |         # 2. Stdout the training progress
811 |         print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n"
812 |         print_info += ">>> Lowest valid cost:{}\t Corresponding test cost:{}\n"
813 |         print_info += "\ttrain cost:{:.4f}\ttotal time:{:.2f}h\tper iter:{:.3f}s\n"
814 |         print_info += "\tvalid cost:{:.4f}\tvalid accuracy:{:.4f}%\ttotal time:{:.2f}h\n"
815 |         print_info += "\ttest  cost:{:.4f}\ttest accuracy:{:.4f}%\ttotal time:{:.2f}h"
816 |         print_info = print_info.format(epoch,
817 |                                        total_iters,
818 |                                        (time()-exp_start)/3600,
819 |                                        lowest_valid_cost,
820 |                                        corresponding_test_cost,
821 |                                        numpy.mean(costs),
822 |                                        total_time/3600,
823 |                                        total_time/total_iters,
824 |                                        valid_cost,
825 |                                        valid_accuracy,
826 |                                        valid_time/3600,
827 |                                        test_cost,
828 |                                        test_accuracy,
829 |                                        test_time/3600)
830 |         print print_info
831 | 
832 | 
833 |         # 3. Save params of model (IO bound, time consuming)
834 |         # If saving params is not successful, there shouldn't be any trace of
835 |         # successful monitoring step in train_log as well.
836 |         print "Saving params!",
837 |         lib.save_params(
838 |                 os.path.join(PARAMS_PATH, 'params_{}.pkl'.format(tag))
839 |         )
840 |         print "Done!"
841 | 
842 |         # 4. Save and graph training progress (fast)
843 |         training_info = {epoch_str : epoch,
844 |                          iter_str : total_iters,
845 |                          train_nll_str : numpy.mean(costs),
846 |                          valid_nll_str : valid_cost,
847 |                          test_nll_str : test_cost,
848 |                          lowest_valid_str : lowest_valid_cost,
849 |                          corresp_test_str : corresponding_test_cost,
850 |                          'train time' : total_time,
851 |                          'valid time' : valid_time,
852 |                          'test time' : test_time,
853 |                          'wall clock time' : time()-exp_start}
854 |         lib.save_training_info(training_info, FOLDER_PREFIX)
855 |         print "Train info saved!",
856 |         # Plotting is disabled (calls commented out); the message below is printed regardless.
857 |         # y_axis_strs = [train_nll_str, valid_nll_str, test_nll_str]
858 |         # lib.plot_traing_info(iter_str, y_axis_strs, FOLDER_PREFIX)
859 |         print "And plotted!"
860 | 
861 |         if total_iters-last_print_iters == PRINT_ITERS:
862 |                 # If we are here because of end_of_batch, we shouldn't touch costs and last_print_iters.
863 |                 # NOTE(review): in the time-based modes this equality may never hold, leaving last_print_time unchanged - confirm.
864 |             costs = []
865 |             last_print_time += PRINT_TIME
866 |             last_print_iters += PRINT_ITERS
867 |         # Hard-coded schedule: the learning rate drops to 0.0001 at the monitoring step that ends the epoch numbered 6.
868 |         if epoch==6 and end_of_batch==True:
869 |             learning_rate=0.0001
870 |             print "\n Now learning rate is 0.0001."
871 | 
872 |         end_of_batch = False
873 |         new_lowest_cost = False
874 | 
875 |         print "Validation Done!\nBack to Training..."
876 |     # Stop once the iteration and/or time budget selected by TRAIN_MODE is used up.
877 |     if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
878 |        (TRAIN_MODE=='time' and total_time >= STOP_TIME) or \
879 |        ((TRAIN_MODE=='time-iters' or TRAIN_MODE=='iters-time') and \
880 |             (total_iters == STOP_ITERS or total_time >= STOP_TIME)):
881 | 
882 |         print "Done! Total iters:", total_iters, "Total time: ", total_time
883 |         print "Experiment ended at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
884 |         print "Wall clock time spent: {:.2f}h"\
885 |                     .format((time()-exp_start)/3600)
886 | 
887 |         sys.exit()


--------------------------------------------------------------------------------
/CHRNN_HF/readme.md:
--------------------------------------------------------------------------------
 1 | The CHRNN system in the paper:
 2 | * Zhen-Hua Ling, Yang Ai, Yu Gu, and Li-Rong Dai, "Waveform Modeling and Generation Using Hierarchical Recurrent Neural Networks for Speech Bandwidth Extension," IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 26, no. 5, pp. 883-894, 2018.
 3 | Usage:
 4 | First enter the root directory of the folder: `cd CHRNN_HF`.
 5 | 
 6 | Data preparation:
 7 | Put the train, validation and test waveforms (16kHz sample rate) and bottleneck features into the corresponding folders in the directories 'datasets/TIMIT/waveforms' and 'datasets/TIMIT/bn_norm_condition',
 8 | then run `python datasets/TIMIT/_2npy_hf.py` to generate the packaged data.
 9 | 
10 | Training and validation:
11 | Run:
12 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/four_tier/four_tier_train_valid.py --exp BEST_4TIER --seq_len 480 --con_dim 100 --con_frame_size 160 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 64`
13 | 
14 | Test:
15 | Run:
16 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/four_tier/four_tier_generation.py --exp BEST_4TIER --seq_len 480 --con_dim 100 --con_frame_size 160 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 1`


--------------------------------------------------------------------------------
/HRNN_HF/datasets/TIMIT/_2npy_hf.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import librosa
 3 | import random
 4 | import os
 5 | import glob
 6 | 
 7 | __RAND_SEED = 123
 8 | def __fixed_shuffle(inp_list):
 9 |     if isinstance(inp_list, list):
10 |         random.seed(__RAND_SEED)
11 |         random.shuffle(inp_list)
12 |         return
13 |     if isinstance(inp_list, np.ndarray):
14 |         np.random.seed(__RAND_SEED)
15 |         np.random.shuffle(inp_list)
16 |         return
17 | 
18 |     raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+type(inp_list))
19 | 
20 | def clip_times(audio, times):
21 | 
22 |     audio = audio * times
23 |     audio[audio > 1] = 1
24 |     audio[audio < -1] = -1
25 |     return audio
26 | 
27 | 
28 | def wav2npy(data_path,save_path,name,fixed_shuffle=True,sample_rate=16000):
29 | 	paths = sorted(glob.glob(data_path+"/*.wav"))
30 | 	if name=='test':
31 | 		fid=open(save_path+'/'+'test_list.scp','w')
32 | 		for i in xrange(len(paths)):
33 | 			fid.write(paths[i].split('/')[-1]+'\n')
34 | 		fid.close()
35 | 	if fixed_shuffle:
36 | 		__fixed_shuffle(paths)
37 | 	for i,path in enumerate(paths):
38 | 		audio16k, _ = librosa.load(path, sr=sample_rate, mono=True)
39 | 		audio8k = librosa.core.resample(audio16k,sample_rate,sample_rate/2)
40 | 		audio8k = librosa.core.resample(audio8k,sample_rate/2,sample_rate)
41 | 
42 | 		if(len(audio8k)==len(audio16k)):
43 | 			pass
44 | 		elif(len(audio8k)>len(audio16k)):
45 | 			audio8k=audio8k[0:len(audio16k)]
46 | 		else:
47 | 			audio16k=audio16k[0:len(audio8k)]
48 | 
49 | 		audio_up=audio16k-audio8k
50 | 		audio_up = clip_times(audio_up, 3)
51 | 
52 | 		if i==0:
53 | 			max_len=len(audio_up)
54 | 			audio_mat_up=np.array(audio_up,dtype='float32').reshape(1,len(audio_up))
55 | 			audio_mat8k=np.array(audio8k,dtype='float32').reshape(1,len(audio8k))
56 | 			mask=np.ones(audio_mat_up.shape,dtype='float32')
57 | 		else:
58 | 			current_len=len(audio_up)
59 | 			if current_len>max_len:
60 | 				audio_mat_up=np.pad(audio_mat_up,[[0,0],[0,current_len-max_len]],'constant')
61 | 				audio_mat_up=np.concatenate((audio_mat_up,np.array(audio_up,dtype='float32').reshape(1,current_len)),axis=0)
62 | 				audio_mat8k=np.pad(audio_mat8k,[[0,0],[0,current_len-max_len]],'constant')
63 | 				audio_mat8k=np.concatenate((audio_mat8k,np.array(audio8k,dtype='float32').reshape(1,current_len)),axis=0)
64 | 				mask=np.pad(mask,[[0,0],[0,current_len-max_len]],'constant')
65 | 				mask=np.concatenate((mask,np.ones((1,current_len),dtype='float32')),axis=0)
66 | 				max_len=current_len
67 | 			else:
68 | 				audio_mat_up=np.concatenate((audio_mat_up,np.pad(np.array(audio_up,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0)
69 | 				audio_mat8k=np.concatenate((audio_mat8k,np.pad(np.array(audio8k,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0)
70 | 				mask=np.concatenate((mask,np.pad(np.ones((1,current_len),dtype='float32'),[[0,0],[0,max_len-current_len]],'constant')),axis=0)
71 | 
72 | 	np.save(save_path+'/'+'TIMIT_'+name+'_up.npy', audio_mat_up)
73 | 	np.save(save_path+'/'+'TIMIT_'+name+'_8k.npy', audio_mat8k)
74 | 	np.save(save_path+'/'+'TIMIT_'+name+'_mask.npy', mask)
75 | 
76 | 	print name+' data storage is complete!'
77 | 
78 | 
79 | wav2npy('datasets/TIMIT/train','datasets/TIMIT','train',fixed_shuffle=True,sample_rate=16000)  # train/valid rows are shuffled with the fixed seed
80 | wav2npy('datasets/TIMIT/valid','datasets/TIMIT','valid',fixed_shuffle=True,sample_rate=16000)
81 | wav2npy('datasets/TIMIT/test','datasets/TIMIT','test',fixed_shuffle=False,sample_rate=16000)  # test keeps the sorted order, matching test_list.scp


--------------------------------------------------------------------------------
/HRNN_HF/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aiyang8067/Hierarchical-Recurrent-Neural-Networks-for-Speech-Bandwidth-Extension/94c3daf9554e20ea2538eb2b7aa044024fedb9ed/HRNN_HF/datasets/__init__.py


--------------------------------------------------------------------------------
/HRNN_HF/datasets/dataset.py:
--------------------------------------------------------------------------------
  1 | """
  2 | RNN Vocal Generation Model
  3 | 
  4 | TIMIT data feeders.
  5 | """
  6 | 
  7 | import numpy as np
  8 | import random
  9 | import time
 10 | import os
 11 | import glob
 12 | 
 13 | __base = [  # (label, directory) pairs searched in order by find_dataset
 14 |     ('Local', 'datasets/'),  # relative path
 15 | ]
 16 | 
 17 | __TIMIT_file = 'TIMIT/TIMIT_{}.npy'  # path template relative to a base directory
 18 | # Each helper below fills the {} placeholder of a template string with "<split>_<stream>".
 19 | __train_mask = lambda s: s.format('train_mask')
 20 | __train_up = lambda s: s.format('train_up')
 21 | __train8k = lambda s: s.format('train_8k')
 22 | __valid_mask = lambda s: s.format('valid_mask')
 23 | __valid_up = lambda s: s.format('valid_up')
 24 | __valid8k = lambda s: s.format('valid_8k')
 25 | __test_mask = lambda s: s.format('test_mask')
 26 | __test_up = lambda s: s.format('test_up')
 27 | __test8k = lambda s: s.format('test_8k')
 28 | 
 29 | def find_dataset(filename):
 30 |     for (k, v) in __base:
 31 |         tmp_path = os.path.join(v, filename)
 32 |         if os.path.exists(tmp_path):
 33 |             return tmp_path
 34 |     raise Exception('{} NOT FOUND!'.format(filename))
 35 | 
 36 | ### Basic utils ###
 37 | def __round_to(x, y):
 38 |     """round x up to the nearest y"""
 39 |     return int(np.ceil(x / float(y))) * y
 40 | 
 41 | def __normalize(data):
 42 |     """To range [0., 1.]"""
 43 |     data -= data.min(axis=1)[:, None]
 44 |     data /= data.max(axis=1)[:, None]
 45 |     return data
 46 | 
 47 | def __linear_quantize(data, q_levels):
 48 |     """
 49 |     floats in (0, 1) to ints in [0, q_levels-1]
 50 |     scales normalized across axis 1
 51 |     """
 52 |     # Normalization is on mini-batch not whole file
 53 |     #eps = numpy.float64(1e-5)
 54 |     #data -= data.min(axis=1)[:, None]
 55 |     #data *= ((q_levels - eps) / data.max(axis=1)[:, None])
 56 |     #data += eps/2
 57 |     #data = data.astype('int32')
 58 | 
 59 |     eps = np.float64(1e-5)
 60 |     data *= (q_levels - eps)
 61 |     data += eps/2
 62 |     data = data.astype('int32')
 63 |     return data
 64 | 
 65 | def linear2mu(x, mu=255):
 66 |     """
 67 |     From Joao
 68 |     x should be normalized between -1 and 1
 69 |     Converts an array according to mu-law and discretizes it
 70 | 
 71 |     Note:
 72 |         mu2linear(linear2mu(x)) != x
 73 |         Because we are compressing to 8 bits here.
 74 |         They will sound pretty much the same, though.
 75 | 
 76 |     :usage:
 77 |         >>> bitrate, samples = scipy.io.wavfile.read('orig.wav')
 78 |         >>> norm = __normalize(samples)[None, :]  # It takes 2D as inp
 79 |         >>> mu_encoded = linear2mu(2.*norm-1.)  # From [0, 1] to [-1, 1]
 80 |         >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype
 81 |         0, 255, dtype('int16')
 82 |         >>> mu_decoded = mu2linear(mu_encoded)  # Back to linear
 83 |         >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype
 84 |         -1, 0.9574371, dtype('float32')
 85 |     """
 86 |     x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu)
 87 |     return ((x_mu + 1)/2 * mu).astype('int16')
 88 | 
 89 | def mu2linear(x, mu=255):
 90 |     """
 91 |     From Joao with modifications
 92 |     Converts an integer array from mu to linear
 93 | 
 94 |     For important notes and usage see: linear2mu
 95 |     """
 96 |     mu = float(mu)
 97 |     x = x.astype('float32')
 98 |     y = 2. * (x - (mu+1.)/2.) / (mu+1.)
 99 |     return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.)
100 | 
101 | def __mu_law_quantize(data):
102 |     return linear2mu(data)
103 | 
104 | def __batch_quantize(data, q_levels, q_type):
105 |     """
106 |     One of 'linear', 'a-law', 'mu-law' for q_type.
107 |     """
108 |     data = data.astype('float64')
109 |     #data = __normalize(data)
110 |     if q_type == 'linear':
111 |         return __linear_quantize(data, q_levels)
112 |     if q_type == 'mu-law':
113 |         # from [0, 1] to [-1, 1]
114 |         #data = 2.*data-1.
115 |         # Automatically quantized to 256 bins.
116 |         return __mu_law_quantize(data)
117 |     raise NotImplementedError
118 | 
119 | __RAND_SEED = 123
120 | def __fixed_shuffle(inp_list):
121 |     if isinstance(inp_list, list):
122 |         random.seed(__RAND_SEED)
123 |         random.shuffle(inp_list)
124 |         return
125 |     if isinstance(inp_list, np.ndarray):
126 |         np.random.seed(__RAND_SEED)
127 |         np.random.shuffle(inp_list)
128 |         return
129 | 
130 |     raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+type(inp_list))
131 | 
132 | def __make_random_batches(inp_list, batch_size,shuffle=True):
133 |     batches = []
134 |     for i in xrange(len(inp_list) / batch_size+1):
135 |         if i==len(inp_list) / batch_size:
136 |             if len(inp_list)%batch_size==0:
137 |                 break
138 |             else:
139 |                 batches.append(inp_list[i*batch_size:])
140 |         else:
141 |             batches.append(inp_list[i*batch_size:(i+1)*batch_size])
142 | 
143 |     if shuffle:
144 |         __fixed_shuffle(batches)
145 |     return batches
146 | 
147 | def __mask_sort(mask_matrix):
148 |     ind=[]
149 |     for i in xrange(len(mask_matrix)):
150 |         ind.append(len(np.where(mask_matrix[i]==1)[0]))
151 |     b=zip(ind,range(len(ind)))
152 |     b.sort(key=lambda x:x[0],reverse=True)
153 |     index=[x[1] for x in b]
154 | 
155 |     return index
156 | 
157 | ### TIMIT DATASET LOADER ###
158 | def __TIMIT_feed_epoch(files,
159 |                        mask_files, 
160 |                        shuffle,
161 |                        is_train,
162 |                        batch_size,
163 |                        seq_len,
164 |                        overlap,
165 |                        q_levels,
166 |                        q_zero,
167 |                        q_type,
168 |                        real_valued=False):
169 |     """
170 |     Helper function to load blizzard dataset.
171 |     Generator that yields training inputs (subbatch, reset). `subbatch` contains
172 |     quantized audio data; `reset` is a boolean indicating the start of a new
173 |     sequence (i.e. you should reset h0 whenever `reset` is True).
174 | 
175 |     Feeds subsequences which overlap by a specified amount, so that the model
176 |     can always have target for every input in a given subsequence.
177 | 
178 |     Assumes all flac files have the same length.
179 | 
180 |     returns: (subbatch, reset)
181 |     subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP)
182 |     reset: True or False
183 |     """
184 |     if is_train:
185 |         sort_index=__mask_sort(mask_files)
186 |         batches_8k = __make_random_batches(files[0][sort_index], batch_size,shuffle)
187 |         batches_up = __make_random_batches(files[1][sort_index], batch_size,shuffle)
188 |         mask_batches=__make_random_batches(mask_files[sort_index],batch_size,shuffle)
189 |     else:
190 |         batches_8k = __make_random_batches(files[0], batch_size,shuffle)
191 |         batches_up = __make_random_batches(files[1], batch_size,shuffle)
192 |         mask_batches=__make_random_batches(mask_files,batch_size,shuffle)
193 | 
194 |     for index,bch_8k in enumerate(batches_8k):
195 | 
196 |         batch_num=len(bch_8k)
197 |         bch_up=batches_up[index]
198 |         mask=mask_batches[index]
199 |         mask_sum=np.sum(mask,axis=0)
200 |         mask_all0_index=np.where(mask_sum==0)[0]
201 |         if len(mask_all0_index!=0):
202 |             bch_up=bch_up[:,:-len(mask_all0_index)]
203 |             bch_8k=bch_8k[:,:-len(mask_all0_index)]
204 |             mask=mask[:,:-len(mask_all0_index)]
205 | 
206 |         batch_seq_len = len(bch_8k[0]) 
207 |         batch_seq_len = __round_to(batch_seq_len, seq_len)
208 | 
209 |         batch_8k = np.zeros(
210 |             (batch_num, batch_seq_len),
211 |             dtype='float64'
212 |         )
213 |         batch_up = np.zeros(
214 |             (batch_num, batch_seq_len),
215 |             dtype='float64'
216 |         )
217 | 
218 |         mask=np.pad(mask,[[0,0],[0,batch_seq_len-mask.shape[1]]],'constant')
219 |         for i, data in enumerate(bch_8k):
220 |             batch_8k[i, :len(data)] = data
221 |         for i, data in enumerate(bch_up):
222 |             batch_up[i, :len(data)] = data
223 | 
224 |         batch_8k_real=np.concatenate([
225 |                 batch_8k,
226 |                 np.full((batch_num, overlap), 0, dtype='float32')
227 |                 ], axis=1)
228 |         if not real_valued:
229 |             batch_8k = __batch_quantize(batch_8k, q_levels, q_type)
230 |             batch_up = __batch_quantize(batch_up, q_levels, q_type)
231 | 
232 |             batch_8k = np.concatenate([
233 |                 batch_8k,
234 |                 np.full((batch_num, overlap), q_zero, dtype='int32')
235 |                 ], axis=1)
236 | 
237 |             batch_up = np.concatenate([
238 |                 batch_up,
239 |                 np.full((batch_num, overlap), q_zero, dtype='int32')
240 |                 ], axis=1)
241 | 
242 |         mask = np.concatenate([
243 |             mask,
244 |             np.full((batch_num, overlap), 0, dtype='float32')
245 |         ], axis=1)
246 | 
247 | 
248 |         for i in xrange(batch_seq_len // seq_len):
249 |             reset = np.int32(i==0)
250 |             end_flag=np.int32(i==batch_seq_len // seq_len-1)
251 |             subbatch_8k_real=batch_8k_real[:, i*seq_len : (i+1)*seq_len+overlap]
252 |             subbatch_8k = batch_8k[:, i*seq_len : (i+1)*seq_len+overlap]
253 |             subbatch_up = batch_up[:, i*seq_len : (i+1)*seq_len+overlap]
254 |             submask = mask[:, i*seq_len : (i+1)*seq_len+overlap]
255 |             yield (subbatch_8k, subbatch_up,reset, end_flag,submask,batch_num,subbatch_8k_real)
256 | 
def TIMIT_train_feed_epoch(*args):
    """
    Build the epoch generator for the TIMIT training split.

    Before anything is loaded, the valid and test files (up-sampled, 8k and
    mask) are looked up with `find_dataset` so that a missing split is
    noticed up front (per the original note, `find_dataset` is expected to
    raise in that case).

    :parameters:
        *args: forwarded unchanged to `__TIMIT_feed_epoch` after
            (files, mask_files, shuffle, is_train). In order:
            batch_size, seq_len, overlap, q_levels, q_zero, q_type
            [, real_valued].
            q_type: one of 'linear', 'a-law', or 'mu-law'.

    :returns:
        The generator built by `__TIMIT_feed_epoch` with shuffle=True and
        is_train=True. Each yielded item is
        (subbatch_8k, subbatch_up, reset, end_flag, submask, batch_num,
        subbatch_8k_real).
    """
    # Availability check only; the returned paths are not used here.
    for locate in (__valid_up, __valid8k, __valid_mask,
                   __test_up, __test8k, __test_mask):
        find_dataset(locate(__TIMIT_file))

    # Resolve the three training files, then load them.
    path_8k = find_dataset(__train8k(__TIMIT_file))
    path_up = find_dataset(__train_up(__TIMIT_file))
    path_mask = find_dataset(__train_mask(__TIMIT_file))

    # files[0] is the 8k data, files[1] the "up" data.
    files = [np.load(path_8k), np.load(path_up)]
    mask_files = np.load(path_mask)

    return __TIMIT_feed_epoch(files, mask_files, True, True, *args)
294 | 
def TIMIT_valid_feed_epoch(*args):
    """
    Build the epoch generator for the TIMIT validation split.

    `*args` are forwarded unchanged to `__TIMIT_feed_epoch` (see
    TIMIT_train_feed_epoch for their order). The generator is created with
    shuffle=True and is_train=False.
    """
    locators = (__valid8k, __valid_up, __valid_mask)
    path_8k, path_up, path_mask = [
        find_dataset(locate(__TIMIT_file)) for locate in locators
    ]

    # files[0] is the 8k data, files[1] the "up" data.
    files = [np.load(path_8k), np.load(path_up)]
    mask_files = np.load(path_mask)

    return __TIMIT_feed_epoch(files, mask_files, True, False, *args)
311 | 
def TIMIT_test_feed_epoch(*args):
    """
    Build the epoch generator for the TIMIT test split.

    `*args` are forwarded unchanged to `__TIMIT_feed_epoch` (see
    TIMIT_train_feed_epoch for their order). The generator is created with
    shuffle=False and is_train=False.
    """
    locators = (__test8k, __test_up, __test_mask)
    path_8k, path_up, path_mask = [
        find_dataset(locate(__TIMIT_file)) for locate in locators
    ]

    # files[0] is the 8k data, files[1] the "up" data.
    files = [np.load(path_8k), np.load(path_up)]
    mask_files = np.load(path_mask)

    return __TIMIT_feed_epoch(files, mask_files, False, False, *args)
328 | 


--------------------------------------------------------------------------------
/HRNN_HF/lib/__init__.py:
--------------------------------------------------------------------------------
  1 | import ops
  2 | #import lasagne
  3 | #from theano.compile.nanguardmode import NanGuardMode
  4 | 
  5 | import math
  6 | import time
  7 | import locale
  8 | 
  9 | import numpy
 10 | import theano
 11 | import theano.tensor as T
 12 | import theano.gof
 13 | 
 14 | import cPickle as pickle
 15 | #import pickle
 16 | import warnings
 17 | import sys, os, errno, glob
 18 | 
 19 | # import matplotlib
 20 | # matplotlib.use('Agg')
 21 | # import matplotlib.pyplot as plt
 22 | 
 23 | # TODO: Grouping is not working on cluster! :-?
 24 | # Set a locale first or you won't get grouping at all
 25 | locale.setlocale(locale.LC_ALL, '')
 26 | # 'en_US.UTF-8'
 27 | 
 28 | _params = {}
 29 | def param(name, *args, **kwargs):
 30 |     """
 31 |     A wrapper for `theano.shared` which enables parameter sharing in models.
 32 | 
 33 |     Creates and returns theano shared variables similarly to `theano.shared`,
 34 |     except if you try to create a param with the same name as a
 35 |     previously-created one, `param(...)` will just return the old one instead of
 36 |     making a new one.
 37 | 
 38 |     This constructor also adds a `param` attribute to the shared variables it
 39 |     creates, so that you can easily search a graph for all params.
 40 |     """
 41 | 
 42 |     if name not in _params:
 43 |         kwargs['name'] = name
 44 |         param = theano.shared(*args, **kwargs)
 45 |         param.param = True
 46 |         _params[name] = param
 47 |     return _params[name]
 48 | 
 49 | def delete_params(name):
 50 |     to_delete = [p_name for p_name in _params if name in p_name]
 51 |     for p_name in to_delete:
 52 |         del _params[p_name]
 53 | 
 54 | def search(node, critereon):
 55 |     """
 56 |     Traverse the Theano graph starting at `node` and return a list of all nodes
 57 |     which match the `critereon` function. When optimizing a cost function, you
 58 |     can use this to get a list of all of the trainable params in the graph, like
 59 |     so:
 60 | 
 61 |     `lib.search(cost, lambda x: hasattr(x, "param"))`
 62 |     or
 63 |     `lib.search(cost, lambda x: hasattr(x, "param") and x.param==True)`
 64 |     """
 65 | 
 66 |     def _search(node, critereon, visited):
 67 |         if node in visited:
 68 |             return []
 69 |         visited.add(node)
 70 | 
 71 |         results = []
 72 |         if isinstance(node, T.Apply):
 73 |             for inp in node.inputs:
 74 |                 results += _search(inp, critereon, visited)
 75 |         else: # Variable node
 76 |             if critereon(node):
 77 |                 results.append(node)
 78 |             if node.owner is not None:
 79 |                 results += _search(node.owner, critereon, visited)
 80 |         return results
 81 | 
 82 |     return _search(node, critereon, set())
 83 | 
 84 | def floatX(x):
 85 |     """
 86 |     Convert `x` to the numpy type specified in `theano.config.floatX`.
 87 |     """
 88 |     if theano.config.floatX == 'float16':
 89 |         return numpy.float16(x)
 90 |     elif theano.config.floatX == 'float32':
 91 |         return numpy.float32(x)
 92 |     else: # Theano's default float type is float64
 93 |         print "Warning: lib.floatX using float64"
 94 |         return numpy.float64(x)
 95 | 
 96 | def save_params(path):
 97 |     param_vals = {}
 98 |     for name, param in _params.iteritems():
 99 |         param_vals[name] = param.get_value()
100 | 
101 |     with open(path, 'wb') as f:
102 |         pickle.dump(param_vals, f)
103 | 
def load_params(path):
    """
    Read a {name: value} pickle from `path` and assign each value to the
    already-registered param of the same name.

    A name that is not in the registry raises KeyError (the params must have
    been created before loading).
    """
    with open(path, 'rb') as f:
        stored = pickle.load(f)

    for name in stored:
        _params[name].set_value(stored[name])
110 | 
def clear_all_params():
    """
    Empty the param registry in place.
    """
    _params.clear()
115 | 
def ensure_dir(dirname):
    """
    Ensure that a named directory exists; if it does not, attempt to create it.

    An already existing directory is not an error. Any other failure of
    `os.makedirs` is re-raised, including the case where `dirname` exists but
    is not a directory.
    """
    try:
        os.makedirs(dirname)
    except OSError as e:
        # EEXIST is only harmless when what exists really is a directory.
        if e.errno != errno.EEXIST or not os.path.isdir(dirname):
            raise
125 | 
__model_setting_file_name = 'model_settings.txt'
def print_model_settings(locals_var, path=None, sys_arg=False):
    """
    Print every upper-case variable found in `locals_var`, except for `T`,
    which usually stands for theano.tensor.
    If locals() is passed in, this prints all the upper-case variables
    defined so far, i.e. the model settings.

    With `path` set to a directory, the same text is _appended_ to
    `model_settings.txt` inside it (the directory is created if needed).

    With `sys_arg` set to True, version information for Python, Numpy and
    Theano plus the arguments passed to the script are logged too. A failure
    while collecting this information is reported and otherwise ignored.

    With both `path` and `sys_arg` given, theano.config is appended to
    `th_conf.txt` and sys.argv is pickled to `args.pkl`. args.pkl is
    overwritten on every call, e.g. when resuming a job; that is not much of
    a problem since all args except '--resume' should be the same.

    :usage:
        >>> import theano.tensor as T
        >>> import lib
        >>> BATCH_SIZE, DIM = 128, 512
        >>> DATA_PATH = '/Path/to/dataset'
        >>> lib.print_model_settings(locals(), path='./')
    """
    log = ""
    if sys_arg:
        # Best effort: `log` keeps whatever was appended before a failure.
        try:
            log += "Python:\n"
            log += "\tsys.version_info\t{}\n".format(str(sys.version_info))
            log += "Numpy:\n"
            log += "\t.__version__\t{}\n".format(numpy.__version__)
            log += "Theano:\n"
            log += "\t.__version__\t{}\n".format(theano.__version__)
            log += "\n\nAll passed args:\n"
            log += str(sys.argv)
            log += "\n"
        except Exception:
            # Not a bare `except:` so Ctrl-C and SystemExit still propagate.
            print("Something went wrong during sys_arg logging. Continue anyway!")

    log += "\nModel settings:"
    all_vars = [(k, v) for (k, v) in locals_var.items() if (k.isupper() and k != 'T')]
    all_vars = sorted(all_vars, key=lambda x: x[0])
    for var_name, var_value in all_vars:
        log += ("\n\t%-20s %s" % (var_name, var_value))
    print(log)
    if path is not None:
        ensure_dir(path)
        # Don't override, just append if by mistake there is something in the file.
        with open(os.path.join(path, __model_setting_file_name), 'a+') as f:
            f.write(log)
        if sys_arg:
            with open(os.path.join(path, 'th_conf.txt'), 'a+') as f:
                f.write(str(theano.config))
            with open(os.path.join(path, 'args.pkl'), 'wb') as f:
                pickle.dump(sys.argv, f)
                # To load:
                # >>> import cPickle as pickle
                # >>> args = pickle.load(open(os.path.join(path, 'args.pkl'), 'rb'))
187 | 
def get_params(cost, criterion=lambda x: hasattr(x, 'param') and x.param==True):
    """
    Collect the params that `cost` depends on by searching its graph.

    Default criterion:
        lambda x: hasattr(x, 'param') and x.param==True
    i.e. every variable tagged as a param and not switched off.

    To exclude a parameter, just set 'param' to False:
        >>> h0 = lib.param('h0',\
                numpy.zeros((3, 2*512), dtype=theano.config.floatX))
        >>> print h0.param  # Default: True
        >>> h0.param = False

    The full list (False or True) is still available with:
        >>> lib.get_params(cost, lambda x: hasattr(x, 'param'))

    :returns:
        A list of params
    """
    matched = search(cost, criterion)
    return matched
207 | 
def print_params_info(params, path=None):
    """
    Print the shape and name of each param in `params` (sorted by name),
    followed by the total number of scalar elements.

    With `path` set to a directory, the same text is _appended_ to
    `model_settings.txt` inside it (the directory is created if needed).

    :usage:
        >>> params = lib.get_params(cost)
        >>> lib.print_params_info(params, path='./')
    """
    params = sorted(params, key=lambda p: p.name)
    total_param_count = 0
    log = "\nParams for cost:"
    for param in params:
        shape = param.get_value(borrow=True).shape
        log += ("\n\t%-20s %s" % (shape, param.name))
        # numpy.prod of an empty shape is 1, so a scalar param (shape ())
        # counts as one element instead of crashing an initial-less reduce.
        total_param_count += int(numpy.prod(shape))

    # locale.format is deprecated; format_string gives the same grouping.
    log += "\nTotal parameter count for this cost:\n\t{0}".format(
        locale.format_string("%d", total_param_count, grouping=True)
    )
    print(log)

    if path is not None:
        ensure_dir(path)
        # Don't override, just append if by mistake there is something in the file.
        with open(os.path.join(path, __model_setting_file_name), 'a+') as f:
            f.write(log)
239 | 
__train_log_file_name = 'train_log.pkl'
def save_training_info(values, path):
    """
    Append one record to the training log stored in <path>/train_log.pkl.

    `values` is a dictionary; each of its values is appended to the list kept
    under the same key in the log. If the log file cannot be opened (first
    call), a fresh log with an empty list per key of `values` is started.
    """
    log_path = os.path.join(path, __train_log_file_name)
    try:
        with open(log_path, "rb") as f:
            history = pickle.load(f)
    except IOError:  # first time
        history = dict((key, []) for key in values.keys())

    for key in values:
        history[key].append(values[key])

    with open(log_path, "wb") as f:
        pickle.dump(history, f)
258 | 
resume_key = 'last resume index'
def resumable(path,
              iter_key='iter',
              epoch_key='epoch',
              add_resume_counter=True,
              other_keys=()):
    """
    Find the newest checkpoint that matches the training log under `path`
    and return what is needed to continue from it.

    :warning:
        This is a naive implementation of resuming a training session
        and does not save and reload the training loop. The serialization
        of training loop and everything is costly and error-prone.

    :todo:
        - Save and load a serializable training loop. (See warning above)
        - Heavily dependent on the "model" file and the names used there right
          now. It's really easy to miss anything.

    `path` should point at the root directory where `train_log.pkl`
    (see __train_log_file_name) and `params/` reside.

    The log entries are scanned from newest to oldest until one has a file
    matching `params/params_e<epoch>_i<iter>*.pkl`. If newer entries had to be
    skipped, the old log is copied to `train_log.pkl.backup` and the in-memory
    log is cut back to the matching entry. With `add_resume_counter` the
    resulting log (plus the resume index) is written back to disk.

    All values in the log dictionary (except `resume_key`) are assumed to be
    lists of the same length.

    :returns:
        (iters_to_consume, res_path, last_epoch, last_iter, last_other_keys)
        where `last_other_keys` holds the last logged value of each key in
        `other_keys`.

    :raises:
        AssertionError if the log is empty or no params file matches;
        ValueError if the last epoch is not 0 and no entry with epoch 1
        exists to derive the number of iterations per epoch.
    """
    file_name = os.path.join(path, __train_log_file_name)
    # Deliberately unguarded: a missing log file must raise.
    # NOTE: pickle.load can execute code from the file; only resume from
    # logs you trust.
    with open(file_name, "rb") as f:
        log = pickle.load(f)

    n_entries = len(log[epoch_key])
    param_found = False
    pattern = os.path.join(path, 'params', 'params_e{}_i{}*.pkl')
    res_path = pattern
    for reverse_idx in range(-1, -n_entries-1, -1):
        ep, it = log[epoch_key][reverse_idx], log[iter_key][reverse_idx]
        header = "> Params file for epoch {} iter {}".format(ep, it)
        last_path = glob.glob(pattern.format(ep, it))
        if not last_path:
            print(header + " [NOT FOUND]. FALLING BACK TO...")
            continue
        if len(last_path) > 1:
            # choose one, warning, rare
            print(header + " [multiple version found]:")
            for l_path in last_path:
                print(l_path)
            print("Arbitrarily choosing first:\n\t{}".format(last_path[0]))
        else:
            print(header + " found.")
        res_path = last_path[0]
        param_found = True
        # Stop at the first (newest) hit in every case. Without this break
        # the ambiguous case kept scanning and cut the log one entry short.
        break

    assert 'reverse_idx' in locals(), 'Empty train_log???\n{}'.format(log)
    # Finishing for loop with no success
    assert param_found, 'No matching params file with train_log'

    acceptable_len = reverse_idx + n_entries + 1
    if acceptable_len != n_entries:
        # Backup of the old train_log
        with open(file_name+'.backup', 'wb') as f:
            pickle.dump(log, f)

        # Change the log file to match the last existing checkpoint.
        for k, v in log.items():
            # Fix resume indices
            if k == resume_key:
                log[k] = [i for i in log[k] if i < acceptable_len]
                continue
            # Rest is useless with no param file.
            log[k] = v[:acceptable_len]

    epochs = log[epoch_key]
    iters = log[iter_key]

    if add_resume_counter:
        resume_val = len(epochs)
        if resume_key not in log:
            log[resume_key] = [resume_val]
        elif log[resume_key] == [] or log[resume_key][-1] != resume_val:
            log[resume_key].append(resume_val)
        with open(file_name, "wb") as f:
            pickle.dump(log, f)

    last_epoch = epochs[-1]
    last_iter = iters[-1]

    if last_epoch == 0:
        iters_to_consume = last_iter
    else:
        # The first entry logged with epoch 1 marks the end of epoch 0; at
        # the end of each epoch there should be a monitoring step, so that
        # entry gives the number of iterations per epoch.
        iters_per_epoch = None
        for e, i in zip(epochs, iters):
            if e == 1:
                iters_per_epoch = i - 1
                break
        if iters_per_epoch is None:
            raise ValueError(
                'Cannot derive iterations per epoch: no log entry with '
                '{} == 1'.format(epoch_key))
        iters_to_consume = last_iter % iters_per_epoch

    last_other_keys = [log[k][-1] for k in other_keys]
    return iters_to_consume, res_path, last_epoch, last_iter, last_other_keys
362 | 
def plot_traing_info(x, ylist, path):
    """
    Load the training log and plot each series named in `ylist` against the
    series named `x`.
    Saves as <path>/train_log.png

    If the log file cannot be opened, a warning is issued and nothing is
    plotted. A y-series whose length differs from x only triggers a warning;
    it is still passed to the plotting call.
    """
    file_name = os.path.join(path, __train_log_file_name)
    try:
        with open(file_name, "rb") as f:
            log = pickle.load(f)
    except IOError:  # first time
        warnings.warn("There is no {} file here!!!".format(file_name))
        return
    # NOTE(review): `plt` is not bound by any active import in this module
    # (the matplotlib imports near the top are commented out), so this call
    # presumably raises NameError unless they are restored - confirm.
    plt.figure()
    x_vals = log[x]
    for y in ylist:
        y_vals = log[y]
        if len(y_vals) != len(x_vals):
            # The module is `warnings`; the former `warning.warn` was a NameError.
            warnings.warn("One of y's: {} does not have the same length as x:{}".format(y, x))
        plt.plot(x_vals, y_vals, label=y)
        # assert len(y_vals) == len(x_vals), "not the same len"
    plt.xlabel(x)
    plt.legend()
    #plt.show()
    # Swap the 'pkl' extension of the log file name for 'png'.
    plt.savefig(file_name[:-3]+'png', bbox_inches='tight')
    plt.close('all')
388 | 
def create_logging_folders(path):
    """
    Placeholder: folder structure and naming are meant to be handled here
    instead of in the training file. Currently does nothing.

    :todo:
        - Implement!
    """
    return None
397 | 
def tv(var):
    """
    Shortcut for `var.tag.test_value`.

    If the attribute lookup fails, a message is printed and an ipdb debugging
    session is opened; the function then returns None.

    :todo:
        - add tv() function for theano variables so that instead of calling
        x.tag.test_value, you can get the same thing just by calling the method
        in a faster way...
        - also for x.tag.test_value.shape
    """
    # EAFP: just try the lookup and deal with the failure afterwards.
    try:
        test_value = var.tag.test_value
    except AttributeError:
        print("NONE, test_value has not been set.")
        import ipdb; ipdb.set_trace()
    else:
        return test_value
412 | 
413 |     ## Rather than LBYL (look before you leap)
414 |     #if hasattr(var, 'tag'):
415 |     #    if hasattr(var.tag, 'test_value'):
416 |     #        return var.tag.test_value
417 |     #   else:
418 |     #       print "NONE, test_value has not set."
419 |     #       import ipdb; ipdb.set_trace()
420 |     #else:
421 |     #    print "NONE, tag has not set."
422 |     #    import ipdb; ipdb.set_trace()
423 | 
def tvs(var):
    """
    Shape of the test value of `var`.

    :returns:
        var.tag.test_value.shape
    """
    test_value = tv(var)
    return test_value.shape
430 | 
def _is_symbolic(v):
    r"""Return `True` if any element of `v` is a `theano.gof.Variable`.
    See:
        https://github.com/Theano/Theano/wiki/Cookbook
    """
    # `v` is materialised first, exactly once, so iterators are accepted too.
    return any(isinstance(item, theano.gof.Variable) for item in list(v))
444 | 
def unique_list(inp_list):
    """
    Return the distinct values of `inp_list` as a new list. The values go
    through a set, so their order in the result is not defined.
    :usage:
        >>> inp_list = ['a', 'b', 'c']
        >>> unique_inp_list = unique_list(inp_list*2)
    """
    distinct = set(inp_list)
    return list(distinct)
453 | 


--------------------------------------------------------------------------------
/HRNN_HF/models/three_tier/three_tier_generation.py:
--------------------------------------------------------------------------------
# Generation script for the three-tier model: module setup.
# The start time is printed and recorded before the remaining imports run.
from time import time
from datetime import datetime
print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
exp_start = time()

import os, sys, glob
# Put the current working directory on the import path so that `lib`
# (and, later, `datasets`) can be imported from it.
sys.path.insert(1, os.getcwd())
import argparse
import itertools

# Fixed seeds for both numpy's and the stdlib's random generators.
import numpy
numpy.random.seed(123)
np = numpy
import random
random.seed(123)

import theano
import theano.tensor as T
import theano.ifelse
import lasagne
import scipy.io.wavfile

import lib

LEARNING_RATE = 0.001
 26 | 
 27 | ### Parsing passed args/hyperparameters ###
 28 | def get_args():
 29 |     def t_or_f(arg):
 30 |         ua = str(arg).upper()
 31 |         if 'TRUE'.startswith(ua):
 32 |             return True
 33 |         elif 'FALSE'.startswith(ua):
 34 |             return False
 35 |         else:
 36 |            raise ValueError('Arg is neither `True` nor `False`')
 37 | 
 38 |     def check_non_negative(value):
 39 |         ivalue = int(value)
 40 |         if ivalue < 0:
 41 |              raise argparse.ArgumentTypeError("%s is not non-negative!" % value)
 42 |         return ivalue
 43 | 
 44 |     def check_positive(value):
 45 |         ivalue = int(value)
 46 |         if ivalue < 1:
 47 |              raise argparse.ArgumentTypeError("%s is not positive!" % value)
 48 |         return ivalue
 49 | 
 50 |     def check_unit_interval(value):
 51 |         fvalue = float(value)
 52 |         if fvalue < 0 or fvalue > 1:
 53 |              raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value)
 54 |         return fvalue
 55 | 
 56 |     # No default value here. Indicate every single arguement.
 57 |     parser = argparse.ArgumentParser(
 58 |         description='three_tier.py\nNo default value! Indicate every argument.')
 59 | 
 60 |     # TODO: Fix the descriptions
 61 |     # Hyperparameter arguements:
 62 |     parser.add_argument('--exp', help='Experiment name',
 63 |             type=str, required=False, default='_')
 64 |     parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True)
 65 |     parser.add_argument('--big_frame_size', help='How many samples per big frame',\
 66 |             type=check_positive, required=True)
 67 |     parser.add_argument('--frame_size', help='How many samples per frame',\
 68 |             type=check_positive, required=True)
 69 |     parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\
 70 |             type=t_or_f, required=True)
 71 |     parser.add_argument('--emb_size', help='Size of embedding layer (> 0)',
 72 |             type=check_positive, required=True)  # different than two_tier
 73 |     parser.add_argument('--skip_conn', help='Add skip connections to RNN',
 74 |             type=t_or_f, required=True)
 75 |     parser.add_argument('--dim', help='Dimension of RNN and MLPs',\
 76 |             type=check_positive, required=True)
 77 |     parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN',
 78 |             type=check_positive, choices=xrange(1,6), required=True)
 79 |     parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\
 80 |             required=True)
 81 |     parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\
 82 |             type=t_or_f, required=True)
 83 |     parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\
 84 |             type=check_positive, required=True)
 85 |     parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\
 86 |             choices=['linear', 'a-law', 'mu-law'], required=True)
 87 |     parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK',
 88 |             choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True)
 89 |     parser.add_argument('--batch_size', help='size of mini-batch',
 90 |             type=check_positive, choices=[1,50,64, 128, 256], required=True)
 91 | 
 92 |     parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\
 93 |             required=False, default=True, action='store_true')
 94 | 
 95 |     args = parser.parse_args()
 96 | 
 97 |     # NEW
 98 |     # Create tag for this experiment based on passed args
 99 |     # tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F')
100 |     # tag += '-lr'+str(LEARNING_RATE)
101 |     tag='three_tier_model'
102 |     print "Created experiment tag for these args:"
103 |     print tag
104 | 
105 |     return args, tag
106 | 
# Example of a tag formerly derived from the passed args (see get_args):
#tag:three_tier.py-expAXIS1-seq_len512-big_frame_size8-frame_size2-weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F-q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001
args, tag = get_args()

# Copy the parsed arguments into upper-case module constants. Upper-case
# names are what lib.print_model_settings picks up from locals() below.
# Values in parentheses are example settings, not defaults.
SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (512)
#print "------------------previous SEQ_LEN:", SEQ_LEN
# TODO: test incremental training
#SEQ_LEN = 512 + 256
#print "---------------------------new SEQ_LEN:", SEQ_LEN
BIG_FRAME_SIZE = args.big_frame_size # how many samples per big frame (8)
FRAME_SIZE = args.frame_size # How many samples per frame (2)
WEIGHT_NORM = args.weight_norm #True
EMB_SIZE = args.emb_size #(256)
SKIP_CONN = args.skip_conn #(False)
DIM = args.dim # Model dimensionality. (1024)
BIG_DIM = DIM # Dimensionality for the slowest level. (1024)
N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (1)
N_BIG_RNN = N_RNN # how many RNNs to stack in the big-frame-level model (1)
RNN_TYPE = args.rnn_type #GRU
H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 #(1)
LEARN_H0 = args.learn_h0 #(True)
Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization #(256)
Q_TYPE = args.q_type # log- or linear-scale #(linear)
WHICH_SET = args.which_set #(MUSIC)
BATCH_SIZE = args.batch_size #(128)
RESUME = args.resume # always True: --resume is store_true with default=True
assert SEQ_LEN % BIG_FRAME_SIZE == 0,\
    'seq_len should be divisible by big_frame_size'
assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\
    'big_frame_size should be divisible by frame_size'
# Both operands are ints, so under Python 2 this is integer division.
N_FRAMES = SEQ_LEN / FRAME_SIZE # Number of frames in each truncated BPTT pass

if Q_TYPE == 'mu-law' and Q_LEVELS != 256:
    raise ValueError('For mu-law Quantization levels should be exactly 256!')

# Fixed hyperparams
GRAD_CLIP = 1 # Elementwise grad clip threshold
BITRATE = 16000

# Other constants
TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS
#TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME
#TRAIN_MODE = 'time-iters'
# To use PRINT_TIME for validation,
# and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp.
#TRAIN_MODE = 'iters-time'
# To use PRINT_ITERS for validation,
# and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp.
PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations.
STOP_ITERS = 300000 # Stop after this many iterations
PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds.
STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.)
N_SEQS = 5  # Number of samples to generate every time monitoring.
RESULTS_DIR = 'results_3t'
FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag)
Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
OVERLAP = BIG_FRAME_SIZE

# String constants; lower-case, so they are not printed as model settings.
epoch_str = 'epoch'
iter_str = 'iter'
lowest_valid_str = 'lowest valid cost'
corresp_test_str = 'correponding test cost'
train_nll_str, valid_nll_str, test_nll_str = \
    'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)'

### Create directories ###
#   FOLDER_PREFIX: root, contains:
#       log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt]
#   FOLDER_PREFIX/params: saves all checkpoint params as pkl
#   FOLDER_PREFIX/samples: keeps all checkpoint samples as wav
#   FOLDER_PREFIX/best: keeps the best parameters, samples, ...
if not os.path.exists(FOLDER_PREFIX):
    os.makedirs(FOLDER_PREFIX)
PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params')
if not os.path.exists(PARAMS_PATH):
    os.makedirs(PARAMS_PATH)
SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples')
if not os.path.exists(SAMPLES_PATH):
    os.makedirs(SAMPLES_PATH)
BEST_PATH = os.path.join(FOLDER_PREFIX, 'best')
if not os.path.exists(BEST_PATH):
    os.makedirs(BEST_PATH)

# Print the upper-case settings defined so far and append them to
# FOLDER_PREFIX/model_settings.txt, together with version/argv information.
lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True)

### Import the data_feeder ###
# Handling WHICH_SET
# Only 'TIMIT' binds `test_feeder`; for any other value the name stays
# undefined in this part of the script.
if WHICH_SET == 'TIMIT':
    from datasets.dataset import TIMIT_test_feed_epoch  as test_feeder
195 | 
def load_data(data_feeder):
    """
    Build a dataset iterator from one of the feeder callables.

    `data_feeder` is a feeder imported from `datasets.dataset` (in this
    script: `test_feeder`). It is called positionally with the module-level
    settings (BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO, Q_TYPE) and
    whatever it returns is passed back unchanged.
    """
    feeder_args = (BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO, Q_TYPE)
    return data_feeder(*feeder_args)
207 | 
208 | ### Creating computation graph ###
def big_frame_level_rnn(input_sequences, h0, reset):
    """
    Slowest tier: run a stacked RNN over big frames and project its outputs
    to one conditioning vector per frame of the next tier.

    input_sequences: 2-D integer tensor; it is reshaped here to
                     (batch size, n big frames, 2*BIG_FRAME_SIZE)
    h0:              hidden state carried over from the previous call,
                     (batch size, N_BIG_RNN, H0_MULT*BIG_DIM)
    reset:           scalar; when non-zero the learned initial state replaces h0

    Returns a pair (output, last_hidden):
      output:      (batch size, n big frames * BIG_FRAME_SIZE / FRAME_SIZE, DIM)
      last_hidden: second value returned by the stacked RNN op
                   (presumably shaped like h0 -- TODO confirm in lib.ops)
    """
    window = 2*BIG_FRAME_SIZE
    n_rows = input_sequences.shape[0]
    n_windows = input_sequences.shape[1] // window
    frames = input_sequences.reshape((n_rows, n_windows, window))

    # Map quantized ints in [0, Q_LEVELS) to floats in roughly [-1, 1);
    # the extra multiplication by 1 leaves that range untouched.
    frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
    frames *= lib.floatX(1)

    # Learned initial state, tiled over the batch dimension taken from h0.
    state_width = H0_MULT*BIG_DIM
    h0_param = lib.param(
        'BigFrameLevel.h0',
        numpy.zeros((N_BIG_RNN, state_width), dtype=theano.config.floatX)
    )
    # LEARN_H0 is recorded on the parameter itself as its `param` attribute.
    h0_param.param = LEARN_H0
    initial_state = T.unbroadcast(
        T.alloc(h0_param, h0.shape[0], N_BIG_RNN, state_width),
        0, 1, 2
    )
    # reset != 0 -> start from the learned state, otherwise continue from h0.
    h0 = theano.ifelse.ifelse(reset, initial_state, h0)

    # RNN_TYPE picks the recurrent op; SKIP_CONN and WEIGHT_NORM are forwarded.
    if RNN_TYPE == 'GRU':
        stacked_rnn, rnn_name = lib.ops.stackedGRU, 'BigFrameLevel.GRU'
    elif RNN_TYPE == 'LSTM':
        stacked_rnn, rnn_name = lib.ops.stackedLSTM, 'BigFrameLevel.LSTM'
    rnns_out, last_hidden = stacked_rnn(rnn_name,
                                        N_BIG_RNN,
                                        window,
                                        BIG_DIM,
                                        frames,
                                        h0=h0,
                                        weightnorm=WEIGHT_NORM,
                                        skip_conn=SKIP_CONN)

    # One DIM-sized vector per frame contained in a big frame.
    projected = lib.ops.Linear(
        'BigFrameLevel.Output',
        BIG_DIM,
        DIM * BIG_FRAME_SIZE / FRAME_SIZE,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    output = projected.reshape(
        (projected.shape[0], projected.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM)
    )

    return (output, last_hidden)
272 | 
def frame_level_rnn(input_sequences, other_input, h0, reset):
    """
    Middle tier: run a stacked RNN over frames, conditioned on the output of
    the big-frame tier, and emit one conditioning vector per sample.

    input_sequences: 2-D integer tensor; it is reshaped here to
                     (batch size, n frames, 2*FRAME_SIZE)
    other_input:     conditioning added to the expanded frames,
                     (batch size, n frames, DIM)
    h0:              hidden state carried over from the previous call,
                     (batch size, N_RNN, H0_MULT*DIM)
    reset:           scalar; when non-zero the learned initial state replaces h0

    Returns a pair (output, last_hidden):
      output:      (batch size, n frames * FRAME_SIZE, DIM)
      last_hidden: second value returned by the stacked RNN op
                   (presumably shaped like h0 -- TODO confirm in lib.ops)
    """
    window = 2*FRAME_SIZE
    n_rows = input_sequences.shape[0]
    n_windows = input_sequences.shape[1] // window
    frames = input_sequences.reshape((n_rows, n_windows, window))

    # Map quantized ints in [0, Q_LEVELS) to floats in roughly [-1, 1);
    # the extra multiplication by 1 leaves that range untouched.
    frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
    frames *= lib.floatX(1)

    # Expand each frame to DIM features, then add the upper-tier conditioning.
    expanded = lib.ops.Linear(
        'FrameLevel.InputExpand',
        window,
        DIM,
        frames,
        initialization='he',
        weightnorm=WEIGHT_NORM,
    )
    gru_input = expanded + other_input

    # Learned initial state, tiled over the batch dimension taken from h0.
    state_width = H0_MULT*DIM
    h0_param = lib.param(
        'FrameLevel.h0',
        numpy.zeros((N_RNN, state_width), dtype=theano.config.floatX)
    )
    # LEARN_H0 is recorded on the parameter itself as its `param` attribute.
    h0_param.param = LEARN_H0
    initial_state = T.unbroadcast(
        T.alloc(h0_param, h0.shape[0], N_RNN, state_width),
        0, 1, 2
    )
    # reset != 0 -> start from the learned state, otherwise continue from h0.
    h0 = theano.ifelse.ifelse(reset, initial_state, h0)

    # RNN_TYPE picks the recurrent op; SKIP_CONN and WEIGHT_NORM are forwarded.
    if RNN_TYPE == 'GRU':
        stacked_rnn, rnn_name = lib.ops.stackedGRU, 'FrameLevel.GRU'
    elif RNN_TYPE == 'LSTM':
        stacked_rnn, rnn_name = lib.ops.stackedLSTM, 'FrameLevel.LSTM'
    rnns_out, last_hidden = stacked_rnn(rnn_name,
                                        N_RNN,
                                        DIM,
                                        DIM,
                                        gru_input,
                                        h0=h0,
                                        weightnorm=WEIGHT_NORM,
                                        skip_conn=SKIP_CONN)

    # FRAME_SIZE conditioning vectors (one per sample) for every frame.
    projected = lib.ops.Linear(
        'FrameLevel.Output',
        DIM,
        FRAME_SIZE * DIM,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    output = projected.reshape(
        (projected.shape[0], projected.shape[1] * FRAME_SIZE, DIM)
    )

    return (output, last_hidden)
def sample_level_predictor(frame_level_outputs, prev_samples):
    """
    Lowest tier: an MLP that scores the Q_LEVELS possible values of one
    sample from the previous FRAME_SIZE samples and a conditioning vector.

    frame_level_outputs: (batch size, DIM)
    prev_samples:        (batch size, FRAME_SIZE), quantized sample indices
    returns:             (batch size, Q_LEVELS) unnormalized scores; the
                         softmax is applied by the caller
    """
    # Handling EMB_SIZE: 0 selects a one-hot encoding, a positive value an
    # embedding of that size, anything else is rejected.
    if not EMB_SIZE >= 0:
        raise ValueError('EMB_SIZE cannot be negative.')

    if EMB_SIZE == 0:
        encoded = lib.ops.T_one_hot(prev_samples, Q_LEVELS)
        per_sample_width = Q_LEVELS
    else:
        # Each of the Q_LEVELS discrete values maps to an EMB_SIZE vector.
        encoded = lib.ops.Embedding(
            'SampleLevel.Embedding',
            Q_LEVELS,
            EMB_SIZE,
            prev_samples)
        per_sample_width = EMB_SIZE

    # Concatenate the encodings of the FRAME_SIZE previous samples per row.
    flat_width = FRAME_SIZE * per_sample_width
    encoded = encoded.reshape((-1, flat_width))

    out = lib.ops.Linear(
        'SampleLevel.L1_PrevSamples',
        flat_width,
        DIM,
        encoded,
        biases=False,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )

    # Inject the frame-level conditioning; no non-linearity is applied here.
    out += frame_level_outputs

    # Two hidden DIM -> DIM layers, each followed by a ReLU.
    for layer_name in ('SampleLevel.L2', 'SampleLevel.L3'):
        out = lib.ops.Linear(layer_name,
                             DIM,
                             DIM,
                             out,
                             initialization='he',
                             weightnorm=WEIGHT_NORM)
        out = T.nnet.relu(out)

    # Output layer: scores only, the softmax comes later.
    return lib.ops.Linear('SampleLevel.Output',
                          DIM,
                          Q_LEVELS,
                          out,
                          weightnorm=WEIGHT_NORM)
407 | 
# Symbolic inputs of the test graph.
sequences_8k   = T.imatrix('sequences_8k') # model input, (batch size, n samples)
sequences_up   = T.imatrix('sequences_up') # source of the prediction targets (see target_sequences)
h0          = T.tensor3('h0')     #(batch size, N_RNN, DIM)
big_h0      = T.tensor3('big_h0') #(batch size, N_BIG_RNN, BIG_DIM)
reset       = T.iscalar('reset')  # non-zero: use the learned initial states instead of h0/big_h0
mask        = T.matrix('mask') # (batch size, n samples); multiplied into cost and accuracy
batch_size       =T.iscalar('batch_size') # runtime batch size, used in the reshapes below
lr=T.scalar('lr') # not an input of test_fn below

# Tier 3 input: overlapping windows of 2*OVERLAP samples taken every OVERLAP
# samples ('valid' mode), flattened back to one row per batch item.
big_input_sequences = sequences_8k
big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1))
big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1,  2*OVERLAP), neib_step=(1, OVERLAP), mode='valid')
big_input_sequences=big_input_sequences.reshape((batch_size,-1))

# Tier 2 input: drop the last OVERLAP-FRAME_SIZE samples, then take windows of
# 2*FRAME_SIZE samples every FRAME_SIZE samples.
input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)]  #(tier2)
input_sequences=input_sequences.reshape((1, batch_size, 1, -1))
input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1,  2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid')
input_sequences=input_sequences.reshape((batch_size,-1))
# Ground truth: sequences_up without its last OVERLAP columns.
target_sequences = sequences_up[:,0:-OVERLAP]

# Mask aligned with target_sequences.
target_mask = mask[:,0:-OVERLAP]

big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, big_h0, reset)#tier3->tier2

frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1

# Tier 1 input: windows of FRAME_SIZE samples with step 1, i.e. one row of
# previous samples per predicted sample: [[x7,x8],[x8,x9],[x9,x10],...]
prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)]
prev_samples = prev_samples.reshape((1, batch_size, 1, -1))
prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1,  FRAME_SIZE), neib_step=(1, 1), mode='valid')
prev_samples = prev_samples.reshape((batch_size * SEQ_LEN,  FRAME_SIZE))


sample_level_outputs = sample_level_predictor(
    frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)),
    prev_samples
)          # (batch_size * SEQ_LEN, Q_LEVELS) scores -> [[x9pre],[x10pre],...]

# Per-utterance count of positions where the prediction equals the target,
# and per-utterance count of unmasked positions.
accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences)
accuracy=accuracy*target_mask
accuracy=T.sum(accuracy,axis=1)
mask_sum=T.sum(target_mask,axis=1)

cost = T.nnet.categorical_crossentropy(
    T.nnet.softmax(sample_level_outputs),  # every row is a distribution over Q_LEVELS values
    target_sequences.flatten()    # flat vector holding the ground-truth index of every row
)
cost = cost.reshape(target_sequences.shape)
cost = cost * target_mask # (batch size, n targets)
# Don't use these lines; could end up with NaN,
# especially at the end of audio files where the mask is
# all zero for some of the shorter files in the mini-batch.
#cost = cost.sum(axis=1) / target_mask.sum(axis=1)
#cost = cost.mean(axis=0)
cost_sum=T.sum(cost,axis=1) # per-utterance summed cost; this is what test_fn returns
# Use this one instead.
cost = cost.sum()
cost = cost / target_mask.sum() # cost averaged over unmasked samples

# The conversion to bits below is commented out, so the cost stays in the unit
# produced by categorical_crossentropy (presumably nats -- TODO confirm).
# log_2(e) = 1.44269504089
#cost = cost * lib.floatX(numpy.log2(numpy.e))

###########

# Compiled test function. Outputs: per-utterance cost sums, per-utterance
# correct counts, per-utterance mask sums, the softmax_and_no_sample result
# per position, and the updated hidden states of both RNN tiers.
test_fn=theano.function(
    [sequences_8k,sequences_up, big_h0,h0, reset, mask,batch_size],
    [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_big_h0,new_h0],
    on_unused_input='warn'
)
478 | 
479 | def generate_and_save_samples(tag):
480 |     def write_audio_file(name, data):
481 |         data = data.astype('float32')
482 |         #data -= data.min()
483 |         #data /= data.max()
484 |         #data -= 0.5
485 |         #data *= 0.95
486 |         scipy.io.wavfile.write(
487 |                     os.path.join(SAMPLES_PATH, name),
488 |                     BITRATE,
489 |                     data)
490 | 
491 |     total_time=time()
492 |     costs_g = []
493 |     accuracys_g=[]
494 |     samples_low_list=[]
495 |     samples_list=[]
496 |     masks_g_index=[]
497 |     samples_number=0
498 |     count=0
499 |     data_feeder = load_data(test_feeder)
500 |     for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,batch_g,seqs_g_8k_real in data_feeder:
501 |         if reset_g==1:
502 |             big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
503 |             h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32')
504 |             cost_batch=np.zeros((batch_g,),dtype='float32')
505 |             accuracy_batch=np.zeros((batch_g,),dtype='float32')
506 |             mask_batch=np.zeros((batch_g,),dtype='float32')
507 |         cost_g, accuracy_g,mask_sum_g,sample, big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, big_h0_g,h0_g, reset_g, mask_g,batch_g)
508 |         cost_batch=cost_batch+cost_g
509 |         accuracy_batch=accuracy_batch+accuracy_g
510 |         mask_batch=mask_batch+mask_sum_g
511 |         if end_flag_g==1:
512 |             costs_g.extend(list(cost_batch/mask_batch))
513 |             accuracys_g.extend(list(accuracy_batch/mask_batch))
514 | 
515 |         if reset_g==1:
516 |             samples_low=seqs_g_8k_real[:,0:-OVERLAP]
517 |             samples=sample
518 |             masks_g=mask_g[:,0:-OVERLAP]
519 |         else:
520 |             samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1)
521 |             samples=np.concatenate([samples,sample],axis=1)
522 |             masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1)
523 | 
524 |         if end_flag_g==1:
525 |             samples_low_list.append(samples_low)
526 |             samples_list.append(samples)
527 |             masks_g_index.append(masks_g)
528 |     fid=open('datasets/TIMIT/test_list.scp','r')
529 |     test_id_list=fid.readlines()
530 |     for i in xrange(len(samples_list)):
531 |         samples_number+=samples_list[i].shape[0]*samples_list[i].shape[1]
532 |         for j in xrange(samples_list[i].shape[0]):
533 |             samples_lowi=samples_low_list[i][j]
534 |             samplei=samples_list[i][j]
535 |             maski=masks_g_index[i][j]
536 |             samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])]
537 |             samplei=samplei[0:len(np.where(maski==1)[0])]
538 |             if Q_TYPE == 'mu-law':
539 |                 from datasets.dataset import mu2linear
540 |                 samplei = mu2linear(samplei)
541 |             write_audio_file(test_id_list[count].split()[0], samplei/3+samples_lowi)
542 |             count+=1
543 | 
544 | 
545 |     total_time = time() - total_time
546 |     log = "192 samples generated in {} minutes.\nThe time of generating 1 second speech is {} seconds."
547 |     log = log.format(total_time/60,total_time/samples_number*16000)
548 |     print samples_number
549 |     print log,
550 | 
551 |     return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time,list(np.array(accuracys_g)*100)
552 | 
### Handling the resume option:
# When RESUME is set, the training state stored under FOLDER_PREFIX is read
# back through lib.resumable and the parameters of the checkpoint at
# `res_path` are loaded before generation starts.
if RESUME:
    # Check if checkpoint from previous run is not corrupted.
    # Then overwrite some of the variables above.
    iters_to_consume, res_path, epoch, total_iters,\
        [lowest_valid_cost, corresponding_test_cost, test_cost] = \
        lib.resumable(path=FOLDER_PREFIX,
                      iter_key=iter_str,
                      epoch_key=epoch_str,
                      add_resume_counter=True,
                      other_keys=[lowest_valid_str,
                                  corresp_test_str,
                                  test_nll_str])
    # At this point we saved the pkl file.
    last_print_iters = total_iters
    print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters)
    # NOTE(review): no minibatches are consumed in this script; the two timer
    # lines are back to back, so consume_time is only the timer overhead.
    consume_time = time()
    consume_time = time() - consume_time
    print "Train data ready in {:.2f}secs after consuming {} minibatches.".\
            format(consume_time, iters_to_consume)

    lib.load_params(res_path)
    print "Parameters from last available checkpoint loaded."

# Script entry: run the model over the test set, write the wav files and
# print the summary. `tag` is rebound here and passed on; test_cost is
# overwritten with the freshly computed value.
tag='gen'
test_cost, test_accuracy,test_time,test_accuracy_list=generate_and_save_samples(tag)
print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time)


--------------------------------------------------------------------------------
/HRNN_HF/models/three_tier/three_tier_train_valid.py:
--------------------------------------------------------------------------------
from time import time
from datetime import datetime
# Announce the start time and keep the wall-clock start in exp_start.
print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
exp_start = time()

import os, sys, glob
# Put the current working directory on the import path (position 1) so that
# modules located there, such as `lib` and `datasets`, can be imported.
sys.path.insert(1, os.getcwd())
import argparse
import itertools

import numpy
# Fixed seeds for numpy and the stdlib RNG.
numpy.random.seed(123)
np = numpy  # both names, `numpy` and `np`, are in use
import random
random.seed(123)

import theano
import theano.tensor as T
import theano.ifelse
import lasagne
import scipy.io.wavfile

import lib

LEARNING_RATE = 0.001
 26 | 
 27 | ### Parsing passed args/hyperparameters ###
 28 | def get_args():
 29 |     def t_or_f(arg):
 30 |         ua = str(arg).upper()
 31 |         if 'TRUE'.startswith(ua):
 32 |             return True
 33 |         elif 'FALSE'.startswith(ua):
 34 |             return False
 35 |         else:
 36 |            raise ValueError('Arg is neither `True` nor `False`')
 37 | 
 38 |     def check_non_negative(value):
 39 |         ivalue = int(value)
 40 |         if ivalue < 0:
 41 |              raise argparse.ArgumentTypeError("%s is not non-negative!" % value)
 42 |         return ivalue
 43 | 
 44 |     def check_positive(value):
 45 |         ivalue = int(value)
 46 |         if ivalue < 1:
 47 |              raise argparse.ArgumentTypeError("%s is not positive!" % value)
 48 |         return ivalue
 49 | 
 50 |     def check_unit_interval(value):
 51 |         fvalue = float(value)
 52 |         if fvalue < 0 or fvalue > 1:
 53 |              raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value)
 54 |         return fvalue
 55 | 
 56 |     # No default value here. Indicate every single arguement.
 57 |     parser = argparse.ArgumentParser(
 58 |         description='three_tier.py\nNo default value! Indicate every argument.')
 59 | 
 60 |     # TODO: Fix the descriptions
 61 |     # Hyperparameter arguements:
 62 |     parser.add_argument('--exp', help='Experiment name',
 63 |             type=str, required=False, default='_')
 64 |     parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True)
 65 |     parser.add_argument('--big_frame_size', help='How many samples per big frame',\
 66 |             type=check_positive, required=True)
 67 |     parser.add_argument('--frame_size', help='How many samples per frame',\
 68 |             type=check_positive, required=True)
 69 |     parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\
 70 |             type=t_or_f, required=True)
 71 |     parser.add_argument('--emb_size', help='Size of embedding layer (> 0)',
 72 |             type=check_positive, required=True)  # different than two_tier
 73 |     parser.add_argument('--skip_conn', help='Add skip connections to RNN',
 74 |             type=t_or_f, required=True)
 75 |     parser.add_argument('--dim', help='Dimension of RNN and MLPs',\
 76 |             type=check_positive, required=True)
 77 |     parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN',
 78 |             type=check_positive, choices=xrange(1,6), required=True)
 79 |     parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\
 80 |             required=True)
 81 |     parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\
 82 |             type=t_or_f, required=True)
 83 |     parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\
 84 |             type=check_positive, required=True)
 85 |     parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\
 86 |             choices=['linear', 'a-law', 'mu-law'], required=True)
 87 |     parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK',
 88 |             choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True)
 89 |     parser.add_argument('--batch_size', help='size of mini-batch',
 90 |             type=check_positive, choices=[50,64, 128, 256], required=True)
 91 | 
 92 |     parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\
 93 |             required=False, default=False, action='store_true')
 94 | 
 95 |     args = parser.parse_args()
 96 | 
 97 |     # NEW
 98 |     # Create tag for this experiment based on passed args
 99 |     tag='three_tier_model'
100 |     print "Created experiment tag for these args:"
101 |     print tag
102 | 
103 |     return args, tag
104 | 
# Parse the command line once; everything below is derived from it.
args, tag = get_args()

SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (512)
#print "------------------previous SEQ_LEN:", SEQ_LEN
# TODO: test incremental training
#SEQ_LEN = 512 + 256
#print "---------------------------new SEQ_LEN:", SEQ_LEN
BIG_FRAME_SIZE = args.big_frame_size # How many samples per big frame (8)
FRAME_SIZE = args.frame_size # How many samples per frame (2)
WEIGHT_NORM = args.weight_norm # Weight normalization on the linear layers (True)
EMB_SIZE = args.emb_size # Size of the embedding layer (256)
SKIP_CONN = args.skip_conn # Skip connections in the stacked RNN (False)
DIM = args.dim # Model dimensionality. (1024)
BIG_DIM = DIM # Dimensionality for the slowest level. (1024)
N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (1)
N_BIG_RNN = N_RNN # How many RNNs to stack in the big-frame-level model (1)
RNN_TYPE = args.rnn_type # 'GRU' or 'LSTM'
H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 # Width multiplier of the initial state: 2 for LSTM, 1 otherwise
LEARN_H0 = args.learn_h0 # Whether to learn the initial state of the RNNs (True)
Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization (256)
Q_TYPE = args.q_type # 'linear', 'a-law' or 'mu-law'
WHICH_SET = args.which_set # Dataset name; only 'TIMIT' has feeders imported below
BATCH_SIZE = args.batch_size # (128)
RESUME = args.resume # (False)
assert SEQ_LEN % BIG_FRAME_SIZE == 0,\
    'seq_len should be divisible by big_frame_size'
assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\
    'big_frame_size should be divisible by frame_size'
N_FRAMES = SEQ_LEN / FRAME_SIZE # Number of frames in each truncated BPTT pass

if Q_TYPE == 'mu-law' and Q_LEVELS != 256:
    raise ValueError('For mu-law Quantization levels should be exactly 256!')

# Fixed hyperparams
GRAD_CLIP = 1 # Elementwise grad clip threshold
BITRATE = 16000

# Other constants
# TRAIN_MODE selects which of the PRINT_*/STOP_* pairs below is meant to be used.
TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS
#TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME
#TRAIN_MODE = 'time-iters'
# To use PRINT_TIME for validation,
# and (STOP_ITERS, STOP_TIME), whichever happens first, for stopping the experiment.
#TRAIN_MODE = 'iters-time'
# To use PRINT_ITERS for validation,
# and (STOP_ITERS, STOP_TIME), whichever happens first, for stopping the experiment.
PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations.
STOP_ITERS = 300000 # Stop after this many iterations
PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds.
STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time required to generate samples etc.)
N_SEQS = 5  # Number of samples to generate every time monitoring.
RESULTS_DIR = 'results_3t'
FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) # Root output folder of this experiment
Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value corresponding to zero amplitude
OVERLAP = BIG_FRAME_SIZE # Passed to the data feeders as the overlap argument (one big frame)

# Keys/labels used for logging and resuming.
epoch_str = 'epoch'
iter_str = 'iter'
lowest_valid_str = 'lowest valid cost'
corresp_test_str = 'correponding test cost'
train_nll_str, valid_nll_str, test_nll_str = \
    'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)'

### Create directories ###
#   FOLDER_PREFIX: root, contains:
#       log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt]
#   FOLDER_PREFIX/params: saves all checkpoint params as pkl
#   FOLDER_PREFIX/samples: keeps all checkpoint samples as wav
#   FOLDER_PREFIX/best: keeps the best parameters, samples, ...
if not os.path.exists(FOLDER_PREFIX):
    os.makedirs(FOLDER_PREFIX)
PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params')
if not os.path.exists(PARAMS_PATH):
    os.makedirs(PARAMS_PATH)
SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples')
if not os.path.exists(SAMPLES_PATH):
    os.makedirs(SAMPLES_PATH)
BEST_PATH = os.path.join(FOLDER_PREFIX, 'best')
if not os.path.exists(BEST_PATH):
    os.makedirs(BEST_PATH)

# At module scope locals() is the module namespace, so every name defined
# above is handed to the helper; adding or renaming module-level names before
# this call changes what it receives.
lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True)

### Import the data_feeder ###
# Handling WHICH_SET
# Only 'TIMIT' is handled here; for any other accepted --which_set value the
# three feeder names are never bound.
if WHICH_SET == 'TIMIT':
    from datasets.dataset import TIMIT_train_feed_epoch as train_feeder
    from datasets.dataset import TIMIT_valid_feed_epoch as valid_feeder
    from datasets.dataset import TIMIT_test_feed_epoch  as test_feeder
194 | 
def load_data(data_feeder):
    """
    Build a dataset iterator from one of the feeder callables.

    `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`.
    It is called positionally with the module-level settings
    (BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO, Q_TYPE) and whatever it
    returns is passed back unchanged.
    """
    feeder_args = (BATCH_SIZE, SEQ_LEN, OVERLAP, Q_LEVELS, Q_ZERO, Q_TYPE)
    return data_feeder(*feeder_args)
206 | 
207 | ### Creating computation graph ###
def big_frame_level_rnn(input_sequences, h0, reset):
    """
    Slowest tier: run a stacked RNN over big frames and project its outputs
    to one conditioning vector per frame of the next tier.

    input_sequences: 2-D integer tensor; it is reshaped here to
                     (batch size, n big frames, 2*BIG_FRAME_SIZE)
    h0:              hidden state carried over from the previous call,
                     (batch size, N_BIG_RNN, H0_MULT*BIG_DIM)
    reset:           scalar; when non-zero the learned initial state replaces h0

    Returns a pair (output, last_hidden):
      output:      (batch size, n big frames * BIG_FRAME_SIZE / FRAME_SIZE, DIM)
      last_hidden: second value returned by the stacked RNN op
                   (presumably shaped like h0 -- TODO confirm in lib.ops)
    """
    window = 2*BIG_FRAME_SIZE
    n_rows = input_sequences.shape[0]
    n_windows = input_sequences.shape[1] // window
    frames = input_sequences.reshape((n_rows, n_windows, window))

    # Map quantized ints in [0, Q_LEVELS) to floats in roughly [-1, 1);
    # the extra multiplication by 1 leaves that range untouched.
    frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
    frames *= lib.floatX(1)

    # Learned initial state, tiled over the batch dimension taken from h0.
    state_width = H0_MULT*BIG_DIM
    h0_param = lib.param(
        'BigFrameLevel.h0',
        numpy.zeros((N_BIG_RNN, state_width), dtype=theano.config.floatX)
    )
    # LEARN_H0 is recorded on the parameter itself as its `param` attribute.
    h0_param.param = LEARN_H0
    initial_state = T.unbroadcast(
        T.alloc(h0_param, h0.shape[0], N_BIG_RNN, state_width),
        0, 1, 2
    )
    # reset != 0 -> start from the learned state, otherwise continue from h0.
    h0 = theano.ifelse.ifelse(reset, initial_state, h0)

    # RNN_TYPE picks the recurrent op; SKIP_CONN and WEIGHT_NORM are forwarded.
    if RNN_TYPE == 'GRU':
        stacked_rnn, rnn_name = lib.ops.stackedGRU, 'BigFrameLevel.GRU'
    elif RNN_TYPE == 'LSTM':
        stacked_rnn, rnn_name = lib.ops.stackedLSTM, 'BigFrameLevel.LSTM'
    rnns_out, last_hidden = stacked_rnn(rnn_name,
                                        N_BIG_RNN,
                                        window,
                                        BIG_DIM,
                                        frames,
                                        h0=h0,
                                        weightnorm=WEIGHT_NORM,
                                        skip_conn=SKIP_CONN)

    # One DIM-sized vector per frame contained in a big frame.
    projected = lib.ops.Linear(
        'BigFrameLevel.Output',
        BIG_DIM,
        DIM * BIG_FRAME_SIZE / FRAME_SIZE,
        rnns_out,
        initialization='he',
        weightnorm=WEIGHT_NORM
    )
    output = projected.reshape(
        (projected.shape[0], projected.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM)
    )

    return (output, last_hidden)
271 | 
272 | def frame_level_rnn(input_sequences, other_input, h0, reset):
273 |     """
274 |     input_sequences.shape: (batch size, n frames * FRAME_SIZE) #FRAME_SIZE=2
275 |     other_input.shape:     (batch size, n frames, DIM)
276 |     h0.shape:              (batch size, N_RNN, DIM)
277 |     reset.shape:           ()
278 |     output.shape:          (batch size, n frames * FRAME_SIZE, DIM)
279 |     """
280 |     frames = input_sequences.reshape((
281 |         input_sequences.shape[0],
282 |         input_sequences.shape[1] // (2*FRAME_SIZE),
283 |         2*FRAME_SIZE
284 |     ))
285 | 
286 |     # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2]
287 |     # (a reasonable range to pass as inputs to the RNN)
288 |     frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1)
289 |     frames *= lib.floatX(1)
290 | 
291 |     gru_input = lib.ops.Linear(
292 |         'FrameLevel.InputExpand',
293 |         2*FRAME_SIZE,
294 |         DIM,
295 |         frames,
296 |         initialization='he',
297 |         weightnorm=WEIGHT_NORM,
298 |         ) + other_input
299 | 
300 |     # Initial state of RNNs
301 |     learned_h0 = lib.param(
302 |         'FrameLevel.h0',
303 |         numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX)
304 |     )
305 |     # Handling LEARN_H0
306 |     learned_h0.param = LEARN_H0
307 |     learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM)
308 |     learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2)
309 |     #learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim)
310 |     h0 = theano.ifelse.ifelse(reset, learned_h0, h0)
311 | 
312 |     # Handling RNN_TYPE
313 |     # Handling SKIP_CONN
314 |     if RNN_TYPE == 'GRU':
315 |         rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU',
316 |                                                    N_RNN,
317 |                                                    DIM,
318 |                                                    DIM,
319 |                                                    gru_input,
320 |                                                    h0=h0,
321 |                                                    weightnorm=WEIGHT_NORM,
322 |                                                    skip_conn=SKIP_CONN)
323 |     elif RNN_TYPE == 'LSTM':
324 |         rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM',
325 |                                                     N_RNN,
326 |                                                     DIM,
327 |                                                     DIM,
328 |                                                     gru_input,
329 |                                                     h0=h0,
330 |                                                     weightnorm=WEIGHT_NORM,
331 |                                                     skip_conn=SKIP_CONN)
332 | 
333 |     output = lib.ops.Linear(
334 |         'FrameLevel.Output',
335 |         DIM,
336 |         FRAME_SIZE * DIM,
337 |         rnns_out,
338 |         initialization='he',
339 |         weightnorm=WEIGHT_NORM
340 |     )
341 |     output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM))
342 | 
343 |     return (output, last_hidden)
344 | 
345 | def sample_level_predictor(frame_level_outputs, prev_samples):
346 |     """
347 |     frame_level_outputs.shape: (batch size, DIM)
348 |     prev_samples.shape:        (batch size, FRAME_SIZE)
349 |     output.shape:              (batch size, Q_LEVELS)
350 |     """
351 |     # Handling EMB_SIZE
352 |     if EMB_SIZE == 0:  # no support for one-hot in three_tier and one_tier.
353 |         prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS)
354 |         # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS)
355 |         last_out_shape = Q_LEVELS
356 |     elif EMB_SIZE > 0:  #The embedding steps maps each of the q discrete values to a real-valued vector embedding.
357 |         prev_samples = lib.ops.Embedding(  #after embedding, the dim is batch size*FRANME_SIZE*EMB_SIZE
358 |             'SampleLevel.Embedding',
359 |             Q_LEVELS,
360 |             EMB_SIZE,
361 |             prev_samples)
362 |         # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32
363 |         last_out_shape = EMB_SIZE
364 |     else:
365 |         raise ValueError('EMB_SIZE cannot be negative.')
366 | 
367 |     prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) #dim:batch size*(FRAME_SIZE*EMB_SIZE)
368 | 
369 |     out = lib.ops.Linear(
370 |         'SampleLevel.L1_PrevSamples',
371 |         FRAME_SIZE * last_out_shape,
372 |         DIM,
373 |         prev_samples,
374 |         biases=False,
375 |         initialization='he',
376 |         weightnorm=WEIGHT_NORM
377 |     )
378 | 
379 |     out += frame_level_outputs
380 |     # out = T.nnet.relu(out)  # commented out to be similar to two_tier
381 | 
382 |     out = lib.ops.Linear('SampleLevel.L2',
383 |                          DIM,
384 |                          DIM,
385 |                          out,
386 |                          initialization='he',
387 |                          weightnorm=WEIGHT_NORM)
388 |     out = T.nnet.relu(out)
389 | 
390 |     # L3
391 |     out = lib.ops.Linear('SampleLevel.L3',
392 |                          DIM,
393 |                          DIM,
394 |                          out,
395 |                          initialization='he',
396 |                          weightnorm=WEIGHT_NORM)
397 |     out = T.nnet.relu(out)
398 | 
399 |     # Output
400 |     # We apply the softmax later
401 |     out = lib.ops.Linear('SampleLevel.Output',
402 |                          DIM,
403 |                          Q_LEVELS,
404 |                          out,
405 |                          weightnorm=WEIGHT_NORM)
406 |     return out
407 | 
408 | sequences_8k   = T.imatrix('sequences_8k') #(batch size, samplenum) int samples; input to all three tiers
409 | sequences_up   = T.imatrix('sequences_up') #(batch size, samplenum) int samples; the targets are sliced from it
410 | h0          = T.tensor3('h0')     #(batch size, N_RNN, H0_MULT*DIM), state of frame_level_rnn
411 | big_h0      = T.tensor3('big_h0') #(batch size, N_BIG_RNN, H0_MULT*BIG_DIM), state of big_frame_level_rnn
412 | reset       = T.iscalar('reset') #non-zero: both RNN tiers start from their learned h0
413 | mask        = T.matrix('mask') #(batch size, samplenum); weights each sample in cost and accuracy
414 | batch_size       =T.iscalar('batch_size') #symbolic batch size used by the reshapes below
415 | lr=T.scalar('lr') #learning rate handed to adam
416 | # Tier 3 input: windows of 2*OVERLAP samples taken every OVERLAP samples, flattened per row.
417 | big_input_sequences = sequences_8k #(tier3) uses every sample of sequences_8k
418 | big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1))
419 | big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1,  2*OVERLAP), neib_step=(1, OVERLAP), mode='valid')
420 | big_input_sequences=big_input_sequences.reshape((batch_size,-1))
421 | # Tier 2 input: windows of 2*FRAME_SIZE samples every FRAME_SIZE samples, after dropping the last OVERLAP-FRAME_SIZE samples.
422 | input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)]  #(tier2)
423 | input_sequences=input_sequences.reshape((1, batch_size, 1, -1))
424 | input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1,  2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid')
425 | input_sequences=input_sequences.reshape((batch_size,-1))
426 | target_sequences = sequences_up[:,0:-OVERLAP] #ground truth: sequences_up without its last OVERLAP samples
427 | 
428 | target_mask = mask[:,0:-OVERLAP] #mask sliced the same way as target_sequences
429 | 
430 | big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, big_h0, reset)#tier3->tier2
431 | 
432 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1
433 | # Tier 1 input: one window of FRAME_SIZE consecutive samples (step 1) per target position.
434 | prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)]
435 | prev_samples = prev_samples.reshape((1, batch_size, 1, -1))
436 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1,  FRAME_SIZE), neib_step=(1, 1), mode='valid') #2-dim:([[x7,x8],[x8,x9],[x9,x10],...])
437 | prev_samples = prev_samples.reshape((batch_size * SEQ_LEN,  FRAME_SIZE))
438 | 
439 | sample_level_outputs = sample_level_predictor(
440 |     frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)),
441 |     prev_samples
442 | )          #sample_level_outputs dim:(BATCH_SIZE * SEQ_LEN, Q_LEVELS) -> [[x9pre],[x10pre],...]
443 | 
444 | accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences) #1 where the prediction equals the target
445 | accuracy=accuracy*target_mask
446 | accuracy=T.sum(accuracy,axis=1) #per row: number of correct samples under the mask
447 | mask_sum=T.sum(target_mask,axis=1) #per row: sum of the mask (the normaliser used by the callers)
448 | # Per-sample cross-entropy between the softmax of the scores and the targets.
449 | cost = T.nnet.categorical_crossentropy(
450 |     T.nnet.softmax(sample_level_outputs),  #Every row is a distribution over the Q_LEVELS classes
451 |     target_sequences.flatten()    #A vector holding the ground-truth class of every row
452 | )
453 | cost = cost.reshape(target_sequences.shape)
454 | cost = cost * target_mask #dim: batch*num
455 | # Don't use these lines; could end up with NaN
456 | # Specially at the end of audio files where mask is
457 | # all zero for some of the shorter files in mini-batch.
458 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1)
459 | #cost = cost.mean(axis=0)
460 | cost_sum=T.sum(cost,axis=1) #per row: summed masked cost (returned by valid_fn and test_fn)
461 | # Use this one instead.
462 | cost = cost.sum()
463 | cost = cost / target_mask.sum() #cost average by samples
464 | 
465 | # The cost is reported in nats: the conversion to bits below is disabled.
466 | # Switch to bits by uncommenting this line:
467 | # log_2(e) = 1.44269504089
468 | #cost = cost * lib.floatX(numpy.log2(numpy.e))
469 | 
470 | ###########
471 | all_params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) #if LEARN_H0=True, the learned h0 is included in the parameters to train
472 | 
473 | lib.print_params_info(all_params, path=FOLDER_PREFIX)
474 | 
475 | grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn')
476 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] #element-wise clipping to [-GRAD_CLIP, GRAD_CLIP]
477 | 
478 | updates = lasagne.updates.adam(grads, all_params,learning_rate=lr)
479 | 
480 | # Training function: applies the adam updates; returns the mean cost and both new RNN states
481 | train_fn = theano.function(
482 |     [sequences_8k,sequences_up, big_h0, h0, reset, mask,batch_size,lr],
483 |     [cost, new_big_h0, new_h0],
484 |     updates=updates,
485 |     on_unused_input='warn'
486 | )
487 | 
488 | # Validation function, hence no updates; returns per-row sums instead of the mean cost
489 | valid_fn = theano.function(
490 |     [sequences_8k,sequences_up, big_h0,h0, reset, mask,batch_size],
491 |     [cost_sum, accuracy,mask_sum,new_big_h0,new_h0],
492 |     on_unused_input='warn'
493 | )
494 | # Test function: same outputs as valid_fn plus the softmax_and_no_sample predictions (4th output)
495 | test_fn=theano.function(
496 |     [sequences_8k,sequences_up, big_h0,h0, reset, mask,batch_size],
497 |     [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_big_h0,new_h0],
498 |     on_unused_input='warn'
499 | )
500 | 
501 | def generate_and_save_samples(tag):
502 |     def write_audio_file(name, data):
503 |         data = data.astype('float32')
504 |         #data -= data.min()
505 |         #data /= data.max()
506 |         #data -= 0.5
507 |         #data *= 0.95
508 |         scipy.io.wavfile.write(
509 |                     os.path.join(SAMPLES_PATH, name+'.wav'),
510 |                     BITRATE,
511 |                     data)
512 | 
513 |     total_time=time()
514 |     costs_g = []
515 |     accuracys_g=[]
516 |     count=0
517 |     data_feeder = load_data(test_feeder)
518 |     for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,batch_g,seqs_g_8k_real in data_feeder:
519 |         if reset_g==1:
520 |             big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
521 |             h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32')
522 |             cost_batch=np.zeros((batch_g,),dtype='float32')
523 |             accuracy_batch=np.zeros((batch_g,),dtype='float32')
524 |             mask_batch=np.zeros((batch_g,),dtype='float32')
525 |             count+=1
526 |         cost_g, accuracy_g,mask_sum_g,sample, big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, big_h0_g,h0_g, reset_g, mask_g,batch_g)
527 |         cost_batch=cost_batch+cost_g
528 |         accuracy_batch=accuracy_batch+accuracy_g
529 |         mask_batch=mask_batch+mask_sum_g
530 |         if end_flag_g==1:
531 |             costs_g.extend(list(cost_batch/mask_batch))
532 |             accuracys_g.extend(list(accuracy_batch/mask_batch))
533 | 
534 |         if count==1:
535 |             if reset_g==1:
536 |                 samples_low=seqs_g_8k_real[:,0:-OVERLAP]
537 |                 samples=sample
538 |                 masks_g=mask_g[:,0:-OVERLAP]
539 |             else:
540 |                 samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1)
541 |                 samples=np.concatenate([samples,sample],axis=1)
542 |                 masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1)
543 | 
544 | 
545 |     for i in xrange(N_SEQS):
546 |         samples_lowi=samples_low[i]
547 |         samplei=samples[i]
548 |         maski=masks_g[i]
549 |         samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])]
550 |         samplei=samplei[0:len(np.where(maski==1)[0])]
551 |         if Q_TYPE == 'mu-law':
552 |             from datasets.dataset import mu2linear
553 |             samplei = mu2linear(samplei)
554 |         write_audio_file("sample_{}_{}".format(tag, i), samplei/3+samples_lowi)
555 | 
556 |     total_time = time() - total_time
557 |     log = "{} samples generated in {} seconds."
558 |     log = log.format(N_SEQS, total_time)
559 |     print log,
560 | 
561 |     return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time
562 | 
563 | 
564 | def monitor(data_feeder):
565 |     """
566 |     Cost and time of test_fn on a given dataset section.
567 |     Pass only one of `valid_feeder` or `test_feeder`.
568 |     Don't pass `train_feed`.
569 | 
570 |     :returns:
571 |         Mean cost over the input dataset (data_feeder)
572 |         Total time spent
573 |     """
574 |     _total_time = time()
575 |     _costs = []
576 |     _accuracys=[]
577 |     _data_feeder = load_data(data_feeder)
578 |     for _seqs_8k,_seqs_up, _reset, _end_flag,_mask,_batch,_seqs_8k_real in _data_feeder:
579 |         if _reset==1:
580 |             _big_h0=numpy.zeros((_batch, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
581 |             _h0 = numpy.zeros((_batch, N_RNN, H0_MULT*DIM), dtype='float32')
582 |             _cost_batch=np.zeros((_batch,),dtype='float32')
583 |             _accuracy_batch=np.zeros((_batch,),dtype='float32')
584 |             _mask_batch=np.zeros((_batch,),dtype='float32')
585 |         _cost, _accuracy,_mask_sum,_big_h0,_h0 = valid_fn(_seqs_8k,_seqs_up, _big_h0,_h0, _reset, _mask,_batch)
586 |         _cost_batch=_cost_batch+_cost
587 |         _accuracy_batch=_accuracy_batch+_accuracy
588 |         _mask_batch=_mask_batch+_mask_sum
589 |         if _end_flag==1:
590 |             _costs.extend(list(_cost_batch/_mask_batch))
591 |             _accuracys.extend(list(_accuracy_batch/_mask_batch))
592 | 
593 | 
594 |     return numpy.mean(_costs), numpy.mean(_accuracys)*100,time() - _total_time
595 | 
596 | print "Wall clock time spent before training started: {:.2f}h"\
597 |         .format((time()-exp_start)/3600.)
598 | print "Training!"
599 | total_iters = 0  # training iterations run so far
600 | total_time = 0.  # time spent inside train_fn only
601 | last_print_time = 0.  # advanced by PRINT_TIME after a monitoring step
602 | last_print_iters = 0  # advanced by PRINT_ITERS after a monitoring step
603 | costs = []  # training costs collected since the list was last cleared
604 | lowest_valid_cost = numpy.finfo(numpy.float32).max  # float32 max: the first validation cost always improves on it
605 | corresponding_test_cost = numpy.finfo(numpy.float32).max
606 | new_lowest_cost = False
607 | end_of_batch = False  # set when the training feeder ran out during the iteration (a new epoch started)
608 | epoch = 0
609 | learning_rate=LEARNING_RATE
610 | 
611 | # Initial load train dataset
612 | tr_feeder = load_data(train_feeder)
613 | 
614 | ### Handling the resume option:
615 | if RESUME:
616 |     # Check if checkpoint from previous run is not corrupted.
617 |     # Then overwrite some of the variables above.
618 |     iters_to_consume, res_path, epoch, total_iters,\
619 |         [lowest_valid_cost, corresponding_test_cost, test_cost] = \
620 |         lib.resumable(path=FOLDER_PREFIX,
621 |                       iter_key=iter_str,
622 |                       epoch_key=epoch_str,
623 |                       add_resume_counter=True,
624 |                       other_keys=[lowest_valid_str,
625 |                                   corresp_test_str,
626 |                                   test_nll_str])
627 |     # At this point we saved the pkl file.
628 |     last_print_iters = total_iters
629 |     print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters)
630 |     # Consumes this much iters to get to the last point in training data.
631 |     consume_time = time()
632 |     for i in xrange(iters_to_consume):
633 |         tr_feeder.next()
634 |     consume_time = time() - consume_time
635 |     print "Train data ready in {:.2f}secs after consuming {} minibatches.".\
636 |             format(consume_time, iters_to_consume)
637 |     # Restore the model parameters stored at res_path.
638 |     lib.load_params(res_path)
639 |     print "Parameters from last available checkpoint loaded."
640 | 
641 | while True:
642 |     # THIS IS ONE ITERATION
643 |     if total_iters % 500 == 0:
644 |         print total_iters,
645 | 
646 |     total_iters += 1
647 | 
648 |     try:
649 |         # Take as many mini-batches as possible from train set
650 |         mini_batch = tr_feeder.next()
651 |     except StopIteration:
652 |         # Mini-batches are finished. Load it again.
653 |         # Basically, one epoch.
654 |         tr_feeder = load_data(train_feeder)
655 | 
656 |         # and start taking new mini-batches again.
657 |         mini_batch = tr_feeder.next()
658 |         epoch += 1
659 |         end_of_batch = True
660 |         print "[Another epoch]",
661 | 
662 |     seqs_8k, seqs_up,reset, end_flag,mask,batch_num,seqs_8k_real = mini_batch
663 |     if reset==1:
664 |         big_h0=numpy.zeros((batch_num, N_BIG_RNN, H0_MULT*DIM), dtype='float32')
665 |         h0 = numpy.zeros((batch_num, N_RNN, H0_MULT*DIM), dtype='float32')
666 | 
667 |     start_time = time()
668 |     cost,big_h0,h0 = train_fn(seqs_8k, seqs_up, big_h0,h0, reset, mask,batch_num,learning_rate)
669 |     total_time += time() - start_time
670 |     #print "This cost:", cost, "This h0.mean()", h0.mean()
671 | 
672 |     costs.append(cost)
673 | 
674 |     # Monitoring step
675 |     if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \
676 |         (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME) or \
677 |         (TRAIN_MODE=='time-iters' and total_time-last_print_time >= PRINT_TIME) or \
678 |         (TRAIN_MODE=='iters-time' and total_iters-last_print_iters >= PRINT_ITERS) or \
679 |         end_of_batch:
680 |         # 0. Validation
681 |         print "\nValidation!",
682 |         valid_cost, valid_accuracy,valid_time = monitor(valid_feeder)
683 |         print "Done!"
684 | 
685 |         # 1. Test
686 |         test_time = 0.
687 |         # Only when the validation cost is improved get the cost for test set.
688 |         if valid_cost < lowest_valid_cost:
689 |             lowest_valid_cost = valid_cost
690 |             print "\n>>> Best validation cost of {} reached."\
691 |                     .format(valid_cost),
692 |             #test_cost, test_time = monitor(test_feeder)
693 |             #print "Done!"
694 |             # Report last one which is the lowest on validation set:
695 |             #print ">>> test cost:{}\ttotal time:{}".format(test_cost, test_time)
696 |             #corresponding_test_cost = test_cost
697 |             new_lowest_cost = True
698 | 
699 |         tag = "e{}_i{}_t{:.2f}_tr{:.4f}_v{:.4f}"
700 |         tag = tag.format(epoch,
701 |                          total_iters,
702 |                          total_time/3600,
703 |                          numpy.mean(cost),
704 |                          valid_cost)
705 |         tag += ("_best" if new_lowest_cost else "")
706 | 
707 |         print "Sampling!",
708 |         # Generate samples
709 |         test_cost, test_accuracy,test_time=generate_and_save_samples(tag)
710 |         print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time)
711 |         if new_lowest_cost:
712 |             corresponding_test_cost = test_cost
713 |         print "Done!"
714 | 
715 |         # 2. Stdout the training progress
716 |         print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n"
717 |         print_info += ">>> Lowest valid cost:{}\t Corresponding test cost:{}\n"
718 |         print_info += "\ttrain cost:{:.4f}\ttotal time:{:.2f}h\tper iter:{:.3f}s\n"
719 |         print_info += "\tvalid cost:{:.4f}\tvalid accuracy:{:.4f}%\ttotal time:{:.2f}h\n"
720 |         print_info += "\ttest  cost:{:.4f}\ttest accuracy:{:.4f}%\ttotal time:{:.2f}h"
721 |         print_info = print_info.format(epoch,
722 |                                        total_iters,
723 |                                        (time()-exp_start)/3600,
724 |                                        lowest_valid_cost,
725 |                                        corresponding_test_cost,
726 |                                        numpy.mean(costs),
727 |                                        total_time/3600,
728 |                                        total_time/total_iters,
729 |                                        valid_cost,
730 |                                        valid_accuracy,
731 |                                        valid_time/3600,
732 |                                        test_cost,
733 |                                        test_accuracy,
734 |                                        test_time/3600)
735 |         print print_info
736 | 
737 | 
738 |         # 3. Save params of model (IO bound, time consuming)
739 |         # If saving params is not successful, there shouldn't be any trace of
740 |         # successful monitoring step in train_log as well.
741 |         print "Saving params!",
742 |         lib.save_params(
743 |                 os.path.join(PARAMS_PATH, 'params_{}.pkl'.format(tag))
744 |         )
745 |         print "Done!"
746 | 
747 |         # 4. Save and graph training progress (fast)
748 |         training_info = {epoch_str : epoch,
749 |                          iter_str : total_iters,
750 |                          train_nll_str : numpy.mean(costs),
751 |                          valid_nll_str : valid_cost,
752 |                          test_nll_str : test_cost,
753 |                          lowest_valid_str : lowest_valid_cost,
754 |                          corresp_test_str : corresponding_test_cost,
755 |                          'train time' : total_time,
756 |                          'valid time' : valid_time,
757 |                          'test time' : test_time,
758 |                          'wall clock time' : time()-exp_start}
759 |         lib.save_training_info(training_info, FOLDER_PREFIX)
760 |         print "Train info saved!",
761 | 
762 |         # y_axis_strs = [train_nll_str, valid_nll_str, test_nll_str]
763 |         # lib.plot_traing_info(iter_str, y_axis_strs, FOLDER_PREFIX)
764 |         print "And plotted!"
765 | 
766 |         if total_iters-last_print_iters == PRINT_ITERS:
767 |                 # If we are here b/c of onom_end_of_batch, we shouldn't mess
768 |                 # with costs and last_print_iters
769 |             costs = []
770 |             last_print_time += PRINT_TIME
771 |             last_print_iters += PRINT_ITERS
772 | 
773 |         if epoch==6 and end_of_batch==True:
774 |             learning_rate=0.0001
775 |             print "\n Now learning rate is 0.0001."
776 | 
777 |         end_of_batch = False
778 |         new_lowest_cost = False
779 | 
780 |         print "Validation Done!\nBack to Training..."
781 | 
782 |     if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \
783 |        (TRAIN_MODE=='time' and total_time >= STOP_TIME) or \
784 |        ((TRAIN_MODE=='time-iters' or TRAIN_MODE=='iters-time') and \
785 |             (total_iters == STOP_ITERS or total_time >= STOP_TIME)):
786 | 
787 |         print "Done! Total iters:", total_iters, "Total time: ", total_time
788 |         print "Experiment ended at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M')
789 |         print "Wall clock time spent: {:.2f}h"\
790 |                     .format((time()-exp_start)/3600)
791 | 
792 |         sys.exit()


--------------------------------------------------------------------------------
/HRNN_HF/readme.md:
--------------------------------------------------------------------------------
 1 | The HRNN system described in the paper:
 2 | * Zhen-Hua Ling, Yang Ai, Yu Gu, and Li-Rong Dai, "Waveform Modeling and Generation Using Hierarchical Recurrent Neural Networks for Speech Bandwidth Extension," IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 26, no. 5, pp. 883-894, 2018.
 3 | Usage:
 4 | First enter the root directory of the folder: `cd HRNN_HF`.
 5 | 
 6 | Data preparation:
 7 | Put the training, validation and test waveforms (16 kHz sample rate) into the corresponding folders under 'datasets/TIMIT',
 8 | then run `python datasets/TIMIT/_2npy_hf.py` to generate the packaged data.
 9 | 
10 | Training and validation:
11 | Run:
12 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/three_tier/three_tier_train_valid.py --exp BEST_3TIER --seq_len 480 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 64`
13 | 
14 | Test:
15 | Run:
16 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/three_tier/three_tier_generation.py --exp BEST_3TIER --seq_len 480 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 64`


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hierarchical-Recurrent-Neural-Networks-for-Speech-Bandwidth-Extension
2 | Code for the paper:
3 | * Zhen-Hua Ling, Yang Ai, Yu Gu, and Li-Rong Dai, "Waveform Modeling and Generation Using Hierarchical Recurrent Neural Networks for Speech Bandwidth Extension," IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 26, no. 5, pp. 883-894, 2018.
4 | 
5 | ./HRNN_HF is the code of the HRNN system in the paper.
6 | 
7 | ./CHRNN_HF is the code of the CHRNN system in the paper.
8 | 


--------------------------------------------------------------------------------