├── CHRNN_HF ├── datasets │ ├── TIMIT │ │ └── _2npy_hf.py │ ├── __init__.py │ └── dataset.py ├── lib │ ├── __init__.py │ └── ops.py ├── models │ └── four_tier │ │ ├── four_tier_generation.py │ │ └── four_tier_train_valid.py └── readme.md ├── HRNN_HF ├── datasets │ ├── TIMIT │ │ └── _2npy_hf.py │ ├── __init__.py │ └── dataset.py ├── lib │ ├── __init__.py │ └── ops.py ├── models │ └── three_tier │ │ ├── three_tier_generation.py │ │ └── three_tier_train_valid.py └── readme.md └── README.md /CHRNN_HF/datasets/TIMIT/_2npy_hf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import random 4 | import os 5 | import glob 6 | import math 7 | 8 | __RAND_SEED = 123 9 | def ReadFloatRawMat(datafile,column): 10 | data = np.fromfile(datafile,dtype=np.float32) 11 | if len(data)%column!=0: 12 | print 'ReadFloatRawMat %s, column wrong!'%datafile 13 | exit() 14 | if len(data)==0: 15 | print 'empty file: %s'%datafile 16 | exit() 17 | data.shape = [len(data)/column,column] 18 | return np.float32(data) 19 | 20 | def __fixed_shuffle(inp_list): 21 | if isinstance(inp_list, list): 22 | random.seed(__RAND_SEED) 23 | random.shuffle(inp_list) 24 | return 25 | if isinstance(inp_list, np.ndarray): 26 | np.random.seed(__RAND_SEED) 27 | np.random.shuffle(inp_list) 28 | return 29 | 30 | raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+type(inp_list)) 31 | 32 | def clip_times(audio, times): 33 | 34 | audio = audio * times 35 | audio[audio > 1] = 1 36 | audio[audio < -1] = -1 37 | return audio 38 | 39 | def wav2npy(data_path,con_data_path,save_path,name,fixed_shuffle=True,sample_rate=16000,frame_len=160,con_dim=100): 40 | paths = sorted(glob.glob(data_path+"/*.wav")) 41 | if name=='test': 42 | fid=open(save_path+'/'+'test_list.scp','w') 43 | for i in xrange(len(paths)): 44 | fid.write(paths[i].split('/')[-1]+'\n') 45 | fid.close() 46 | con_paths=sorted(glob.glob(con_data_path+"/*.dat")) 47 | if fixed_shuffle: 48 | __fixed_shuffle(paths) 49 | __fixed_shuffle(con_paths) 50 | for i,path in enumerate(paths): 51 | print i 52 | print path 53 | print con_paths[i] 54 | audio16k, _ = librosa.load(path, sr=sample_rate, mono=True) 55 | audio8k = librosa.core.resample(audio16k,sample_rate,sample_rate/2) 56 | audio8k = librosa.core.resample(audio8k,sample_rate/2,sample_rate) 57 | condition=ReadFloatRawMat(con_paths[i],1).reshape(1,-1) 58 | 59 | if(len(audio8k)==len(audio16k)): 60 | pass 61 | elif(len(audio8k)>len(audio16k)): 62 | audio8k=audio8k[0:len(audio16k)] 63 | else: 64 | audio16k=audio16k[0:len(audio8k)] 65 | 66 | audio_up=audio16k-audio8k 67 | audio_up = clip_times(audio_up, 3) 68 | 69 | if len(audio8k)>condition.shape[1]/con_dim*frame_len: 70 | diff=len(audio8k)-condition.shape[1]/con_dim*frame_len 71 | audio8k=audio8k[:-diff] 72 | audio_up=audio_up[:-diff] 73 | elif len(audio8k)max_con_len: 92 | con_mat=np.pad(con_mat,[[0,0],[0,current_con_len-max_con_len]],'constant') 93 | con_mat=np.concatenate((con_mat,condition),axis=0) 94 | max_con_len=current_con_len 95 | else: 96 | con_mat=np.concatenate((con_mat,np.pad(condition,[[0,0],[0,max_con_len-current_con_len]],'constant')),axis=0) 97 | if current_len>max_len: 98 | audio_mat_up=np.pad(audio_mat_up,[[0,0],[0,current_len-max_len]],'constant') 99 | audio_mat_up=np.concatenate((audio_mat_up,np.array(audio_up,dtype='float32').reshape(1,current_len)),axis=0) 100 | audio_mat8k=np.pad(audio_mat8k,[[0,0],[0,current_len-max_len]],'constant') 101 | audio_mat8k=np.concatenate((audio_mat8k,np.array(audio8k,dtype='float32').reshape(1,current_len)),axis=0) 102 | mask=np.pad(mask,[[0,0],[0,current_len-max_len]],'constant') 103 | mask=np.concatenate((mask,np.ones((1,current_len),dtype='float32')),axis=0) 104 | max_len=current_len 105 | else: 106 | audio_mat_up=np.concatenate((audio_mat_up,np.pad(np.array(audio_up,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0) 107 | audio_mat8k=np.concatenate((audio_mat8k,np.pad(np.array(audio8k,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0) 108 | mask=np.concatenate((mask,np.pad(np.ones((1,current_len),dtype='float32'),[[0,0],[0,max_len-current_len]],'constant')),axis=0) 109 | 110 | np.save(save_path+'/'+'TIMIT_'+name+'_up.npy', audio_mat_up) 111 | np.save(save_path+'/'+'TIMIT_'+name+'_8k.npy', audio_mat8k) 112 | np.save(save_path+'/'+'TIMIT_'+name+'_mask.npy', mask) 113 | np.save(save_path+'/'+'TIMIT_'+name+'_con.npy', con_mat) 114 | 115 | print name+' data storage is complete!' 116 | 117 | 118 | wav2npy('datasets/TIMIT/waveform/train','datasets/TIMIT/bn_norm_condition/train','datasets/TIMIT','train',fixed_shuffle=True,sample_rate=16000) 119 | wav2npy('datasets/TIMIT/waveform/valid','datasets/TIMIT/bn_norm_condition/valid','datasets/TIMIT','valid',fixed_shuffle=True,sample_rate=16000) 120 | wav2npy('datasets/TIMIT/waveform/test','datasets/TIMIT/bn_norm_condition/test','datasets/TIMIT','test',fixed_shuffle=False,sample_rate=16000) -------------------------------------------------------------------------------- /CHRNN_HF/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiyang8067/Hierarchical-Recurrent-Neural-Networks-for-Speech-Bandwidth-Extension/94c3daf9554e20ea2538eb2b7aa044024fedb9ed/CHRNN_HF/datasets/__init__.py -------------------------------------------------------------------------------- /CHRNN_HF/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Vocal Generation Model 3 | 4 | TIMIT data feeders. 5 | """ 6 | 7 | import numpy as np 8 | import random 9 | import time 10 | import os 11 | import glob 12 | 13 | __base = [ 14 | ('Local', 'datasets/'), 15 | ] 16 | 17 | __TIMIT_file = 'TIMIT/TIMIT_{}.npy' 18 | 19 | __train_mask = lambda s: s.format('train_mask') 20 | __train_up = lambda s: s.format('train_up') 21 | __train8k = lambda s: s.format('train_8k') 22 | __train_con = lambda s: s.format('train_con') 23 | __valid_mask = lambda s: s.format('valid_mask') 24 | __valid_up = lambda s: s.format('valid_up') 25 | __valid8k = lambda s: s.format('valid_8k') 26 | __valid_con = lambda s: s.format('valid_con') 27 | __test_mask = lambda s: s.format('test_mask') 28 | __test_up = lambda s: s.format('test_up') 29 | __test8k = lambda s: s.format('test_8k') 30 | __test_con = lambda s: s.format('test_con') 31 | 32 | def find_dataset(filename): 33 | for (k, v) in __base: 34 | tmp_path = os.path.join(v, filename) 35 | if os.path.exists(tmp_path): 36 | return tmp_path 37 | raise Exception('{} NOT FOUND!'.format(filename)) 38 | 39 | ### Basic utils ### 40 | def __round_to(x, y): 41 | """round x up to the nearest y""" 42 | return int(np.ceil(x / float(y))) * y 43 | 44 | def __normalize(data): 45 | """To range [0., 1.]""" 46 | data -= data.min(axis=1)[:, None] 47 | data /= data.max(axis=1)[:, None] 48 | return data 49 | 50 | def __linear_quantize(data, q_levels): 51 | """ 52 | floats in (0, 1) to ints in [0, q_levels-1] 53 | scales normalized across axis 1 54 | """ 55 | # Normalization is on mini-batch not whole file 56 | #eps = numpy.float64(1e-5) 57 | #data -= data.min(axis=1)[:, None] 58 | #data *= ((q_levels - eps) / data.max(axis=1)[:, None]) 59 | #data += eps/2 60 | #data = data.astype('int32') 61 | 62 | eps = np.float64(1e-5) 63 | data *= (q_levels - eps) 64 | data += eps/2 65 | data = data.astype('int32') 66 | return data 67 | 68 | def linear2mu(x, mu=255): 69 | """ 70 | From Joao 71 | x should be normalized between -1 and 1 72 | Converts an array according to mu-law and discretizes it 73 | 74 | Note: 75 | mu2linear(linear2mu(x)) != x 76 | Because we are compressing to 8 bits here. 77 | They will sound pretty much the same, though. 78 | 79 | :usage: 80 | >>> bitrate, samples = scipy.io.wavfile.read('orig.wav') 81 | >>> norm = __normalize(samples)[None, :] # It takes 2D as inp 82 | >>> mu_encoded = linear2mu(2.*norm-1.) # From [0, 1] to [-1, 1] 83 | >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype 84 | 0, 255, dtype('int16') 85 | >>> mu_decoded = mu2linear(mu_encoded) # Back to linear 86 | >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype 87 | -1, 0.9574371, dtype('float32') 88 | """ 89 | x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu) 90 | return ((x_mu + 1)/2 * mu).astype('int16') 91 | 92 | def mu2linear(x, mu=255): 93 | """ 94 | From Joao with modifications 95 | Converts an integer array from mu to linear 96 | 97 | For important notes and usage see: linear2mu 98 | """ 99 | mu = float(mu) 100 | x = x.astype('float32') 101 | y = 2. * (x - (mu+1.)/2.) / (mu+1.) 102 | return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.) 103 | 104 | def __mu_law_quantize(data): 105 | return linear2mu(data) 106 | 107 | def __batch_quantize(data, q_levels, q_type): 108 | """ 109 | One of 'linear', 'a-law', 'mu-law' for q_type. 110 | """ 111 | data = data.astype('float64') 112 | #data = __normalize(data) 113 | if q_type == 'linear': 114 | return __linear_quantize(data, q_levels) 115 | if q_type == 'mu-law': 116 | # from [0, 1] to [-1, 1] 117 | #data = 2.*data-1. 118 | # Automatically quantized to 256 bins. 119 | return __mu_law_quantize(data) 120 | raise NotImplementedError 121 | 122 | __RAND_SEED = 123 123 | def __fixed_shuffle(inp_list): 124 | if isinstance(inp_list, list): 125 | random.seed(__RAND_SEED) 126 | random.shuffle(inp_list) 127 | return 128 | if isinstance(inp_list, np.ndarray): 129 | np.random.seed(__RAND_SEED) 130 | np.random.shuffle(inp_list) 131 | return 132 | 133 | raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+type(inp_list)) 134 | 135 | def __make_random_batches(inp_list, batch_size,shuffle=True): 136 | batches = [] 137 | for i in xrange(len(inp_list) / batch_size+1): 138 | if i==len(inp_list) / batch_size: 139 | if len(inp_list)%batch_size==0: 140 | break 141 | else: 142 | batches.append(inp_list[i*batch_size:]) 143 | else: 144 | batches.append(inp_list[i*batch_size:(i+1)*batch_size]) 145 | 146 | if shuffle: 147 | __fixed_shuffle(batches) 148 | return batches 149 | 150 | def __mask_sort(mask_matrix): 151 | ind=[] 152 | for i in xrange(len(mask_matrix)): 153 | ind.append(len(np.where(mask_matrix[i]==1)[0])) 154 | b=zip(ind,range(len(ind))) 155 | b.sort(key=lambda x:x[0],reverse=True) 156 | index=[x[1] for x in b] 157 | 158 | return index 159 | 160 | ### TIMIT DATASET LOADER ### 161 | def __TIMIT_feed_epoch(files, 162 | mask_files, 163 | con_files, 164 | shuffle, 165 | sort, 166 | batch_size, 167 | seq_len, 168 | con_frame_size, 169 | con_dim, 170 | overlap, 171 | q_levels, 172 | q_zero, 173 | q_type, 174 | real_valued=False): 175 | """ 176 | Helper function to load blizzard dataset. 177 | Generator that yields training inputs (subbatch, reset). `subbatch` contains 178 | quantized audio data; `reset` is a boolean indicating the start of a new 179 | sequence (i.e. you should reset h0 whenever `reset` is True). 180 | 181 | Feeds subsequences which overlap by a specified amount, so that the model 182 | can always have target for every input in a given subsequence. 183 | 184 | Assumes all flac files have the same length. 185 | 186 | returns: (subbatch, reset) 187 | subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP) 188 | reset: True or False 189 | """ 190 | if sort: 191 | sort_index=__mask_sort(mask_files) 192 | batches_8k = __make_random_batches(files[0][sort_index], batch_size,shuffle) 193 | batches_up = __make_random_batches(files[1][sort_index], batch_size,shuffle) 194 | mask_batches=__make_random_batches(mask_files[sort_index],batch_size,shuffle) 195 | con_batches=__make_random_batches(con_files[sort_index],batch_size,shuffle) 196 | else: 197 | batches_8k = __make_random_batches(files[0], batch_size,shuffle) 198 | batches_up = __make_random_batches(files[1], batch_size,shuffle) 199 | mask_batches=__make_random_batches(mask_files,batch_size,shuffle) 200 | con_batches=__make_random_batches(con_files,batch_size,shuffle) 201 | 202 | for index,bch_8k in enumerate(batches_8k): 203 | 204 | batch_num=len(bch_8k) 205 | con=con_batches[index] 206 | bch_up=batches_up[index] 207 | mask=mask_batches[index] 208 | mask_sum=np.sum(mask,axis=0) 209 | mask_all0_index=np.where(mask_sum==0)[0] 210 | if len(mask_all0_index!=0): 211 | bch_up=bch_up[:,:-len(mask_all0_index)] 212 | bch_8k=bch_8k[:,:-len(mask_all0_index)] 213 | mask=mask[:,:-len(mask_all0_index)] 214 | con=con[:,:-(len(mask_all0_index)/con_frame_size*con_dim)] 215 | 216 | batch_seq_len = len(bch_8k[0]) 217 | batch_seq_len = __round_to(batch_seq_len, seq_len) 218 | 219 | batch_8k = np.zeros( 220 | (batch_num, batch_seq_len), 221 | dtype='float64' 222 | ) 223 | batch_up = np.zeros( 224 | (batch_num, batch_seq_len), 225 | dtype='float64' 226 | ) 227 | 228 | mask=np.pad(mask,[[0,0],[0,batch_seq_len-mask.shape[1]]],'constant') 229 | con=np.pad(con,[[0,0],[0,batch_seq_len/con_frame_size*con_dim-con.shape[1]]],'constant') 230 | for i, data in enumerate(bch_8k): 231 | batch_8k[i, :len(data)] = data 232 | for i, data in enumerate(bch_up): 233 | batch_up[i, :len(data)] = data 234 | 235 | batch_8k_real=np.concatenate([ 236 | batch_8k, 237 | np.full((batch_num, overlap), 0, dtype='float32') 238 | ], axis=1) 239 | 240 | if not real_valued: 241 | batch_8k = __batch_quantize(batch_8k, q_levels, q_type) 242 | batch_up = __batch_quantize(batch_up, q_levels, q_type) 243 | 244 | batch_8k = np.concatenate([ 245 | batch_8k, 246 | np.full((batch_num, overlap), q_zero, dtype='int32') 247 | ], axis=1) 248 | 249 | batch_up = np.concatenate([ 250 | batch_up, 251 | np.full((batch_num, overlap), q_zero, dtype='int32') 252 | ], axis=1) 253 | 254 | mask = np.concatenate([ 255 | mask, 256 | np.full((batch_num, overlap), 0, dtype='float32') 257 | ], axis=1) 258 | 259 | for i in xrange(batch_seq_len // seq_len): 260 | reset = np.int32(i==0) 261 | end_flag=np.int32(i==batch_seq_len // seq_len-1) 262 | subbatch_8k_real=batch_8k_real[:, i*seq_len : (i+1)*seq_len+overlap] 263 | subbatch_8k = batch_8k[:, i*seq_len : (i+1)*seq_len+overlap] 264 | subbatch_up = batch_up[:, i*seq_len : (i+1)*seq_len+overlap] 265 | submask = mask[:, i*seq_len : (i+1)*seq_len+overlap] 266 | subcon=con[:,i*seq_len/con_frame_size*con_dim:(i+1)*seq_len/con_frame_size*con_dim] 267 | yield (subbatch_8k, subbatch_up,reset, end_flag,submask,subcon,batch_num,subbatch_8k_real) 268 | 269 | def TIMIT_train_feed_epoch(*args): 270 | """ 271 | :parameters: 272 | batch_size: int 273 | seq_len: 274 | overlap: 275 | q_levels: 276 | q_zero: 277 | q_type: One the following 'linear', 'a-law', or 'mu-law' 278 | 279 | THE NEW SEG IS: 280 | 20.48hrs 36*256 281 | 3*256 282 | 3*256 283 | 284 | :returns: 285 | A generator yielding (subbatch, reset, submask) 286 | """ 287 | # Just check if valid/test sets are also available. If not, raise. 288 | find_dataset(__valid_up(__TIMIT_file)) 289 | find_dataset(__valid8k(__TIMIT_file)) 290 | find_dataset(__valid_mask(__TIMIT_file)) 291 | find_dataset(__valid_con(__TIMIT_file)) 292 | find_dataset(__test_up(__TIMIT_file)) 293 | find_dataset(__test8k(__TIMIT_file)) 294 | find_dataset(__test_mask(__TIMIT_file)) 295 | find_dataset(__test_con(__TIMIT_file)) 296 | # Load train set 297 | data_path_8k = find_dataset(__train8k(__TIMIT_file)) 298 | data_path_up = find_dataset(__train_up(__TIMIT_file)) 299 | data_mask_path=find_dataset(__train_mask(__TIMIT_file)) 300 | data_con_path=find_dataset(__train_con(__TIMIT_file)) 301 | files=[] 302 | files.append(np.load(data_path_8k)) 303 | files.append(np.load(data_path_up)) 304 | mask_files=np.load(data_mask_path) 305 | con_files=np.load(data_con_path) 306 | shuffle=True 307 | sort=True 308 | generator = __TIMIT_feed_epoch(files, mask_files,con_files,shuffle,sort,*args) 309 | return generator 310 | 311 | def TIMIT_valid_feed_epoch(*args): 312 | """ 313 | See: 314 | TIMIT_train_feed_epoch 315 | """ 316 | data_path_8k = find_dataset(__valid8k(__TIMIT_file)) 317 | data_path_up = find_dataset(__valid_up(__TIMIT_file)) 318 | data_mask_path=find_dataset(__valid_mask(__TIMIT_file)) 319 | data_con_path=find_dataset(__valid_con(__TIMIT_file)) 320 | files=[] 321 | files.append(np.load(data_path_8k)) 322 | files.append(np.load(data_path_up)) 323 | mask_files=np.load(data_mask_path) 324 | con_files=np.load(data_con_path) 325 | shuffle=True 326 | sort=False 327 | generator = __TIMIT_feed_epoch(files, mask_files,con_files,shuffle,sort,*args) 328 | return generator 329 | 330 | def TIMIT_test_feed_epoch(*args): 331 | """ 332 | See: 333 | TIMIT_train_feed_epoch 334 | """ 335 | data_path_8k = find_dataset(__test8k(__TIMIT_file)) 336 | data_path_up = find_dataset(__test_up(__TIMIT_file)) 337 | data_mask_path=find_dataset(__test_mask(__TIMIT_file)) 338 | data_con_path=find_dataset(__test_con(__TIMIT_file)) 339 | files=[] 340 | files.append(np.load(data_path_8k)) 341 | files.append(np.load(data_path_up)) 342 | mask_files=np.load(data_mask_path) 343 | con_files=np.load(data_con_path) 344 | shuffle=False 345 | sort=False 346 | generator = __TIMIT_feed_epoch(files, mask_files,con_files,shuffle,sort,*args) 347 | return generator 348 | -------------------------------------------------------------------------------- /CHRNN_HF/lib/__init__.py: -------------------------------------------------------------------------------- 1 | import ops 2 | #import lasagne 3 | #from theano.compile.nanguardmode import NanGuardMode 4 | 5 | import math 6 | import time 7 | import locale 8 | 9 | import numpy 10 | import theano 11 | import theano.tensor as T 12 | import theano.gof 13 | 14 | import cPickle as pickle 15 | #import pickle 16 | import warnings 17 | import sys, os, errno, glob 18 | 19 | # import matplotlib 20 | # matplotlib.use('Agg') 21 | # import matplotlib.pyplot as plt 22 | 23 | # TODO: Grouping is not working on cluster! :-? 24 | # Set a locale first or you won't get grouping at all 25 | locale.setlocale(locale.LC_ALL, '') 26 | # 'en_US.UTF-8' 27 | 28 | _params = {} 29 | def param(name, *args, **kwargs): 30 | """ 31 | A wrapper for `theano.shared` which enables parameter sharing in models. 32 | 33 | Creates and returns theano shared variables similarly to `theano.shared`, 34 | except if you try to create a param with the same name as a 35 | previously-created one, `param(...)` will just return the old one instead of 36 | making a new one. 37 | 38 | This constructor also adds a `param` attribute to the shared variables it 39 | creates, so that you can easily search a graph for all params. 40 | """ 41 | 42 | if name not in _params: 43 | kwargs['name'] = name 44 | param = theano.shared(*args, **kwargs) 45 | param.param = True 46 | _params[name] = param 47 | return _params[name] 48 | 49 | def delete_params(name): 50 | to_delete = [p_name for p_name in _params if name in p_name] 51 | for p_name in to_delete: 52 | del _params[p_name] 53 | 54 | def search(node, critereon): 55 | """ 56 | Traverse the Theano graph starting at `node` and return a list of all nodes 57 | which match the `critereon` function. When optimizing a cost function, you 58 | can use this to get a list of all of the trainable params in the graph, like 59 | so: 60 | 61 | `lib.search(cost, lambda x: hasattr(x, "param"))` 62 | or 63 | `lib.search(cost, lambda x: hasattr(x, "param") and x.param==True)` 64 | """ 65 | 66 | def _search(node, critereon, visited): 67 | if node in visited: 68 | return [] 69 | visited.add(node) 70 | 71 | results = [] 72 | if isinstance(node, T.Apply): 73 | for inp in node.inputs: 74 | results += _search(inp, critereon, visited) 75 | else: # Variable node 76 | if critereon(node): 77 | results.append(node) 78 | if node.owner is not None: 79 | results += _search(node.owner, critereon, visited) 80 | return results 81 | 82 | return _search(node, critereon, set()) 83 | 84 | def floatX(x): 85 | """ 86 | Convert `x` to the numpy type specified in `theano.config.floatX`. 87 | """ 88 | if theano.config.floatX == 'float16': 89 | return numpy.float16(x) 90 | elif theano.config.floatX == 'float32': 91 | return numpy.float32(x) 92 | else: # Theano's default float type is float64 93 | print "Warning: lib.floatX using float64" 94 | return numpy.float64(x) 95 | 96 | def save_params(path): 97 | param_vals = {} 98 | for name, param in _params.iteritems(): 99 | param_vals[name] = param.get_value() 100 | 101 | with open(path, 'wb') as f: 102 | pickle.dump(param_vals, f) 103 | 104 | def load_params(path): 105 | with open(path, 'rb') as f: 106 | param_vals = pickle.load(f) 107 | 108 | for name, val in param_vals.iteritems(): 109 | _params[name].set_value(val) 110 | 111 | def clear_all_params(): 112 | to_delete = [p_name for p_name in _params] 113 | for p_name in to_delete: 114 | del _params[p_name] 115 | 116 | def ensure_dir(dirname): 117 | """ 118 | Ensure that a named directory exists; if it does not, attempt to create it. 119 | """ 120 | try: 121 | os.makedirs(dirname) 122 | except OSError, e: 123 | if e.errno != errno.EEXIST: 124 | raise 125 | 126 | __model_setting_file_name = 'model_settings.txt' 127 | def print_model_settings(locals_var, path=None, sys_arg=False): 128 | """ 129 | Prints all variables in upper case in locals_var, 130 | except for T which usually stands for theano.tensor. 131 | If locals() passed as input to this method, will print 132 | all the variables in upper case defined so far, that is 133 | model settings. 134 | 135 | With `path` as an address to a directory it will _append_ it 136 | as a file named `model_settings.txt` as well. 137 | 138 | With `sys_arg` set to True, log information about Python, Numpy, 139 | and Theano and passed arguments to the script will be added too. 140 | args.pkl would be overwritten, specially in case of resuming a job. 141 | But again that wouldn't be much of a problem as all the passed args 142 | to the script except for '--resume' should be the same. 143 | 144 | With both `path` and `sys_arg` passed, dumps the theano.config. 145 | 146 | :usage: 147 | >>> import theano.tensor as T 148 | >>> import lib 149 | >>> BATCH_SIZE, DIM = 128, 512 150 | >>> DATA_PATH = '/Path/to/dataset' 151 | >>> lib.print_model_settings(locals(), path='./') 152 | """ 153 | log = "" 154 | if sys_arg: 155 | try: 156 | log += "Python:\n" 157 | log += "\tsys.version_info\t{}\n".format(str(sys.version_info)) 158 | log += "Numpy:\n" 159 | log += "\t.__version__\t{}\n".format(numpy.__version__) 160 | log += "Theano:\n" 161 | log += "\t.__version__\t{}\n".format(theano.__version__) 162 | log += "\n\nAll passed args:\n" 163 | log += str(sys.argv) 164 | log += "\n" 165 | except: 166 | print "Something went wrong during sys_arg logging. Continue anyway!" 167 | 168 | log += "\nModel settings:" 169 | all_vars = [(k,v) for (k,v) in locals_var.items() if (k.isupper() and k != 'T')] 170 | all_vars = sorted(all_vars, key=lambda x: x[0]) 171 | for var_name, var_value in all_vars: 172 | log += ("\n\t%-20s %s" % (var_name, var_value)) 173 | print log 174 | if path is not None: 175 | ensure_dir(path) 176 | # Don't override, just append if by mistake there is something in the file. 177 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 178 | f.write(log) 179 | if sys_arg: 180 | with open(os.path.join(path, 'th_conf.txt'), 'a+') as f: 181 | f.write(str(theano.config)) 182 | with open(os.path.join(path, 'args.pkl'), 'wb') as f: 183 | pickle.dump(sys.argv, f) 184 | # To load: 185 | # >>> import cPickle as pickle 186 | # >>> args = pickle.load(open(os.path.join(path, 'args.pkl'), 'rb')) 187 | 188 | def get_params(cost, criterion=lambda x: hasattr(x, 'param') and x.param==True): 189 | """ 190 | Default criterion: 191 | lambda x: hasattr(x, 'param') and x.param==True 192 | This will return every parameter for cost from computation graph. 193 | 194 | To exclude a parameter, just set 'param' to False: 195 | >>> h0 = lib.param('h0',\ 196 | numpy.zeros((3, 2*512), dtype=theano.config.floatX)) 197 | >>> print h0.param # Default: True 198 | >>> h0.param = False 199 | 200 | In this case one still can get list of all params (False or True) by: 201 | >>> lib.get_params(cost, lambda x: hasattr(x, 'param') 202 | 203 | :returns: 204 | A list of params 205 | """ 206 | return search(cost, criterion) 207 | 208 | def print_params_info(params, path=None): 209 | """ 210 | Print information about the parameters in the given param set. 211 | 212 | With `path` as an address to a directory it will _append_ it 213 | as a file named `model_settings.txt` as well. 214 | 215 | :usage: 216 | >>> params = lib.get_params(cost) 217 | >>> lib.print_params_info(params, path='./') 218 | """ 219 | params = sorted(params, key=lambda p: p.name) 220 | values = [p.get_value(borrow=True) for p in params] 221 | shapes = [p.shape for p in values] 222 | total_param_count = 0 223 | multiply_all = lambda a, b: a*b 224 | log = "\nParams for cost:" 225 | for param, value, shape in zip(params, values, shapes): 226 | log += ("\n\t%-20s %s" % (shape, param.name)) 227 | total_param_count += reduce(multiply_all, shape) 228 | 229 | log += "\nTotal parameter count for this cost:\n\t{0}".format( 230 | locale.format("%d", total_param_count, grouping=True) 231 | ) 232 | print log 233 | 234 | if path is not None: 235 | ensure_dir(path) 236 | # Don't override, just append if by mistake there is something in the file. 237 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 238 | f.write(log) 239 | 240 | __train_log_file_name = 'train_log.pkl' 241 | def save_training_info(values, path): 242 | """ 243 | Gets a set of values as dictionary and append them to a log file. 244 | stores in /train_log.pkl 245 | """ 246 | file_name = os.path.join(path, __train_log_file_name) 247 | try: 248 | with open(file_name, "rb") as f: 249 | log = pickle.load(f) 250 | except IOError: # first time 251 | log = {} 252 | for k in values.keys(): 253 | log[k] = [] 254 | for k, v in values.items(): 255 | log[k].append(v) 256 | with open(file_name, "wb") as f: 257 | pickle.dump(log, f) 258 | 259 | resume_key = 'last resume index' 260 | def resumable(path, 261 | iter_key='iter', 262 | epoch_key='epoch', 263 | add_resume_counter=True, 264 | other_keys=[]): 265 | """ 266 | :warning: 267 | This is a naive implementation of resuming a training session 268 | and does not save and reload the training loop. The serialization 269 | of training loop and everything is costly and error-prone. 270 | 271 | :todo: 272 | - Save and load a serializable training loop. (See warning above) 273 | - Heavily dependent on the "model" file and the names used there right 274 | now. It's really easy to miss anything. 275 | 276 | `path` should be pointing at the root directory where `train_log.pkl` 277 | (See __train_log_file_name) and `params/` reside. 278 | 279 | Always assuming all the values in the log dictionary (except `resume_key`), 280 | are lists with the same length. 281 | """ 282 | file_name = os.path.join(path, __train_log_file_name) 283 | # Raise error if does not exists. 284 | with open(file_name, "rb") as f: 285 | log = pickle.load(f) 286 | 287 | param_found = False 288 | res_path = os.path.join(path, 'params', 'params_e{}_i{}*.pkl') 289 | for reverse_idx in range(-1, -len(log[epoch_key])-1, -1): 290 | ep, it = log[epoch_key][reverse_idx], log[iter_key][reverse_idx] 291 | print "> Params file for epoch {} iter {}".format(ep, it), 292 | last_path = glob.glob(res_path.format(ep, it)) 293 | if len(last_path) == 1: 294 | res_path = last_path[0] 295 | param_found = True 296 | print "found." 297 | break 298 | elif len(last_path) == 0: 299 | print "[NOT FOUND]. FALLING BACK TO..." 300 | else: # > 1 301 | # choose one, warning, rare 302 | print "[multiple version found]:" 303 | for l_path in last_path: 304 | print l_path 305 | res_path = last_path[0] 306 | param_found = True 307 | print "Arbitrarily choosing first:\n\t{}".format(res_path) 308 | 309 | assert 'reverse_idx' in locals(), 'Empty train_log???\n{}'.format(log) 310 | # Finishing for loop with no success 311 | assert param_found, 'No matching params file with train_log' 312 | 313 | acceptable_len = reverse_idx+len(log[epoch_key])+1 314 | if acceptable_len != len(log[epoch_key]): 315 | # Backup of the old train_log 316 | with open(file_name+'.backup', 'wb') as f: 317 | pickle.dump(log, f) 318 | 319 | # Change the log file to match the last existing checkpoint. 320 | for k, v in log.items(): 321 | # Fix resume indices 322 | if k == resume_key: 323 | log[k] = [i for i in log[k] if i < acceptable_len] 324 | continue 325 | # Rest is useless with no param file. 326 | log[k] = v[:acceptable_len] 327 | 328 | epochs = log[epoch_key] 329 | iters = log[iter_key] 330 | 331 | if add_resume_counter: 332 | resume_val = len(epochs) 333 | if not resume_key in log.keys(): 334 | log[resume_key] = [resume_val] 335 | else: 336 | if log[resume_key] == [] or log[resume_key][-1] != resume_val: 337 | log[resume_key].append(resume_val) 338 | with open(file_name, "wb") as f: 339 | pickle.dump(log, f) 340 | 341 | last_epoch = epochs[-1] 342 | last_iter = iters[-1] 343 | 344 | # The if-else statement is more readable than `next`: 345 | #iters_to_consume = next((last_iter%(i-1) for (e, i) in\ 346 | # zip(epochs, iters) if e == 1), last_iter) 347 | if last_epoch == 0: 348 | iters_to_consume = last_iter 349 | else: 350 | for e, i in zip(epochs, iters): 351 | # first time. Epoch turns from 0 to 1. 352 | # At the end of each `epoch` there should be 353 | # a monitoring step so it will gives number 354 | # number of iterations per epoch 355 | if e == 1: 356 | iters_per_epoch = i - 1 357 | break 358 | iters_to_consume = last_iter % iters_per_epoch 359 | 360 | last_other_keys = [log[k][-1] for k in other_keys] 361 | return iters_to_consume, res_path, last_epoch, last_iter, last_other_keys 362 | 363 | def plot_traing_info(x, ylist, path): 364 | """ 365 | Loads log file and plot x and y values as provided by input. 366 | Saves as /train_log.png 367 | """ 368 | file_name = os.path.join(path, __train_log_file_name) 369 | try: 370 | with open(file_name, "rb") as f: 371 | log = pickle.load(f) 372 | except IOError: # first time 373 | warnings.warn("There is no {} file here!!!".format(file_name)) 374 | return 375 | plt.figure() 376 | x_vals = log[x] 377 | for y in ylist: 378 | y_vals = log[y] 379 | if len(y_vals) != len(x_vals): 380 | warning.warn("One of y's: {} does not have the same length as x:{}".format(y, x)) 381 | plt.plot(x_vals, y_vals, label=y) 382 | # assert len(y_vals) == len(x_vals), "not the same len" 383 | plt.xlabel(x) 384 | plt.legend() 385 | #plt.show() 386 | plt.savefig(file_name[:-3]+'png', bbox_inches='tight') 387 | plt.close('all') 388 | 389 | def create_logging_folders(path): 390 | """ 391 | Handle structure of folders and naming here instead of training file. 392 | 393 | :todo: 394 | - Implement! 395 | """ 396 | pass 397 | 398 | def tv(var): 399 | """ 400 | :todo: 401 | - add tv() function for theano variables so that instead of calling 402 | x.tag.test_value, you can get the same thing just by calling the method 403 | in a faster way... 404 | - also for x.tag.test_value.shape 405 | """ 406 | # Based on EAFP (easier to ask for forgiveness than permission) 407 | try: 408 | return var.tag.test_value 409 | except AttributeError: 410 | print "NONE, test_value has not been set." 411 | import ipdb; ipdb.set_trace() 412 | 413 | ## Rather than LBYL (look before you leap) 414 | #if hasattr(var, 'tag'): 415 | # if hasattr(var.tag, 'test_value'): 416 | # return var.tag.test_value 417 | # else: 418 | # print "NONE, test_value has not set." 419 | # import ipdb; ipdb.set_trace() 420 | #else: 421 | # print "NONE, tag has not set." 422 | # import ipdb; ipdb.set_trace() 423 | 424 | def tvs(var): 425 | """ 426 | :returns: 427 | var.tag.test_value.shape 428 | """ 429 | return tv(var).shape 430 | 431 | def _is_symbolic(v): 432 | r"""Return `True` if any of the arguments are symbolic. 433 | See: 434 | https://github.com/Theano/Theano/wiki/Cookbook 435 | """ 436 | symbolic = False 437 | v = list(v) 438 | for _container, _iter in [(v, xrange(len(v)))]: 439 | for _k in _iter: 440 | _v = _container[_k] 441 | if isinstance(_v, theano.gof.Variable): 442 | symbolic = True 443 | return symbolic 444 | 445 | def unique_list(inp_list): 446 | """ 447 | returns a list with unique values of inp_list. 448 | :usage: 449 | >>> inp_list = ['a', 'b', 'c'] 450 | >>> unique_inp_list = unique_list(inp_list*2) 451 | """ 452 | return list(set(inp_list)) 453 | -------------------------------------------------------------------------------- /CHRNN_HF/models/four_tier/four_tier_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Audio Generation Model 3 | 4 | Three-tier model, Quantized input 5 | For more info: 6 | $ python three_tier.py -h 7 | 8 | How-to-run example: 9 | sampleRNN$ pwd 10 | /u/mehris/sampleRNN 11 | 12 | 13 | sampleRNN$ \ 14 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python -u \ 15 | models/three_tier/three_tier.py --exp AXIS1 --seq_len 512 --big_frame_size 8 \ 16 | --frame_size 2 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 \ 17 | --n_rnn 1 --rnn_type GRU --learn_h0 True --q_levels 256 --q_type mu-law \ 18 | --batch_size 50 --which_set TIMIT 19 | 20 | To resume add ` --resume` to the END of the EXACTLY above line. You can run the 21 | resume code as many time as possible, depending on the TRAIN_MODE. 22 | (folder name, file name, flags, their order, and the values are important) 23 | """ 24 | from time import time 25 | from datetime import datetime 26 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 27 | exp_start = time() 28 | 29 | import os, sys, glob 30 | sys.path.insert(1, os.getcwd()) 31 | import argparse 32 | import itertools 33 | 34 | import numpy 35 | numpy.random.seed(123) 36 | np = numpy 37 | import random 38 | random.seed(123) 39 | 40 | import theano 41 | import theano.tensor as T 42 | import theano.ifelse 43 | import lasagne 44 | import scipy.io.wavfile 45 | 46 | import lib 47 | 48 | LEARNING_RATE = 0.001 49 | 50 | ### Parsing passed args/hyperparameters ### 51 | def get_args(): 52 | def t_or_f(arg): 53 | ua = str(arg).upper() 54 | if 'TRUE'.startswith(ua): 55 | return True 56 | elif 'FALSE'.startswith(ua): 57 | return False 58 | else: 59 | raise ValueError('Arg is neither `True` nor `False`') 60 | 61 | def check_non_negative(value): 62 | ivalue = int(value) 63 | if ivalue < 0: 64 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 65 | return ivalue 66 | 67 | def check_positive(value): 68 | ivalue = int(value) 69 | if ivalue < 1: 70 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 71 | return ivalue 72 | 73 | def check_unit_interval(value): 74 | fvalue = float(value) 75 | if fvalue < 0 or fvalue > 1: 76 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 77 | return fvalue 78 | 79 | # No default value here. Indicate every single arguement. 80 | parser = argparse.ArgumentParser( 81 | description='three_tier.py\nNo default value! Indicate every argument.') 82 | 83 | # TODO: Fix the descriptions 84 | # Hyperparameter arguements: 85 | parser.add_argument('--exp', help='Experiment name', 86 | type=str, required=False, default='_') 87 | parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True) 88 | parser.add_argument('--con_dim', help='Condition dimension',\ 89 | type=check_positive, required=True) 90 | parser.add_argument('--con_frame_size', help='How many samples per condition frame',\ 91 | type=check_positive, required=True) 92 | parser.add_argument('--big_frame_size', help='How many samples per big frame',\ 93 | type=check_positive, required=True) 94 | parser.add_argument('--frame_size', help='How many samples per frame',\ 95 | type=check_positive, required=True) 96 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\ 97 | type=t_or_f, required=True) 98 | parser.add_argument('--emb_size', help='Size of embedding layer (> 0)', 99 | type=check_positive, required=True) # different than two_tier 100 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', 101 | type=t_or_f, required=True) 102 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 103 | type=check_positive, required=True) 104 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 105 | type=check_positive, choices=xrange(1,6), required=True) 106 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 107 | required=True) 108 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 109 | type=t_or_f, required=True) 110 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 111 | type=check_positive, required=True) 112 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 113 | choices=['linear', 'a-law', 'mu-law'], required=True) 114 | parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK', 115 | choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True) 116 | parser.add_argument('--batch_size', help='size of mini-batch', 117 | type=check_positive, choices=[50,64, 128, 256], required=True) 118 | 119 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 120 | required=False, default=True, action='store_true') 121 | 122 | args = parser.parse_args() 123 | 124 | # NEW 125 | # Create tag for this experiment based on passed args 126 | # tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 127 | # tag += '-lr'+str(LEARNING_RATE) 128 | tag='four_tier_model' 129 | print "Created experiment tag for these args:" 130 | print tag 131 | 132 | return args, tag 133 | 134 | #tag:three_tier.py-expAXIS1-seq_len512-big_frame_size8-frame_size2-weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F-q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001 135 | args, tag = get_args() 136 | 137 | SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (512) 138 | #print "------------------previous SEQ_LEN:", SEQ_LEN 139 | # TODO: test incremental training 140 | #SEQ_LEN = 512 + 256 141 | #print "---------------------------new SEQ_LEN:", SEQ_LEN 142 | CON_DIM=args.con_dim 143 | CON_FRAME_SIZE=args.con_frame_size 144 | BIG_FRAME_SIZE = args.big_frame_size # how many samples per big frame 145 | FRAME_SIZE = args.frame_size # How many samples per frame 146 | WEIGHT_NORM = args.weight_norm #True 147 | EMB_SIZE = args.emb_size #(256) 148 | SKIP_CONN = args.skip_conn #(False) 149 | DIM = args.dim # Model dimensionality. (1024) 150 | BIG_DIM = DIM # Dimensionality for the slowest level. (1024) 151 | CON_TIER_DIM=DIM 152 | N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (1) 153 | N_BIG_RNN = N_RNN # how many RNNs to stack in the big-frame-level model (1) 154 | N_CON_RNN=N_RNN 155 | RNN_TYPE = args.rnn_type #GRU 156 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 #(1) 157 | LEARN_H0 = args.learn_h0 #(True) 158 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization #(256) 159 | Q_TYPE = args.q_type # log- or linear-scale #(linear) 160 | WHICH_SET = args.which_set #(MUSIC) 161 | BATCH_SIZE = args.batch_size #(128) 162 | RESUME = args.resume #(False) 163 | assert SEQ_LEN % CON_FRAME_SIZE == 0,\ 164 | 'seq_len should be divisible by con_frame_size' 165 | assert CON_FRAME_SIZE % BIG_FRAME_SIZE == 0,\ 166 | 'con_frame_size should be divisible by big_frame_size' 167 | assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\ 168 | 'big_frame_size should be divisible by frame_size' 169 | 170 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 171 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 172 | 173 | # Fixed hyperparams 174 | GRAD_CLIP = 1 # Elementwise grad clip threshold 175 | BITRATE = 16000 176 | 177 | # Other constants 178 | TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 179 | #TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 180 | #TRAIN_MODE = 'time-iters' 181 | # To use PRINT_TIME for validation, 182 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 183 | #TRAIN_MODE = 'iters-time' 184 | # To use PRINT_ITERS for validation, 185 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 186 | PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations. 187 | STOP_ITERS = 300000 # Stop after this many iterations 188 | PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds. 189 | STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 190 | N_SEQS = 5 # Number of samples to generate every time monitoring. 191 | RESULTS_DIR = 'results_4t' 192 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 193 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 194 | OVERLAP = BIG_FRAME_SIZE 195 | 196 | epoch_str = 'epoch' 197 | iter_str = 'iter' 198 | lowest_valid_str = 'lowest valid cost' 199 | corresp_test_str = 'correponding test cost' 200 | train_nll_str, valid_nll_str, test_nll_str = \ 201 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 202 | 203 | ### Create directories ### 204 | # FOLDER_PREFIX: root, contains: 205 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 206 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 207 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 208 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 209 | if not os.path.exists(FOLDER_PREFIX): 210 | os.makedirs(FOLDER_PREFIX) 211 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 212 | if not os.path.exists(PARAMS_PATH): 213 | os.makedirs(PARAMS_PATH) 214 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 215 | if not os.path.exists(SAMPLES_PATH): 216 | os.makedirs(SAMPLES_PATH) 217 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 218 | if not os.path.exists(BEST_PATH): 219 | os.makedirs(BEST_PATH) 220 | 221 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 222 | 223 | ### Import the data_feeder ### 224 | # Handling WHICH_SET 225 | if WHICH_SET == 'TIMIT': 226 | from datasets.dataset import TIMIT_test_feed_epoch as test_feeder 227 | 228 | def load_data(data_feeder): 229 | """ 230 | Helper function to deal with interface of different datasets. 231 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 232 | """ 233 | return data_feeder(BATCH_SIZE, 234 | SEQ_LEN, 235 | CON_FRAME_SIZE, 236 | CON_DIM, 237 | OVERLAP, 238 | Q_LEVELS, 239 | Q_ZERO, 240 | Q_TYPE) 241 | 242 | ### Creating computation graph ### 243 | def con_frame_level_rnn(input_sequences, h0, reset): 244 | """ 245 | input_sequences.shape: (batch size, n con frames * CON_DIM) 246 | h0.shape: (batch size, N_BIG_RNN, BIG_DIM) #N_BIG_RNN=1,BIG_DIM=1024 247 | reset.shape: () 248 | output[0].shape: (batch size, n frames, DIM) 249 | output[1].shape: same as h0.shape 250 | output[2].shape: (batch size, seq len, Q_LEVELS) 251 | """ 252 | 253 | frames = input_sequences.reshape(( 254 | input_sequences.shape[0], 255 | input_sequences.shape[1] // CON_DIM, 256 | CON_DIM 257 | )) 258 | 259 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 260 | # (a reasonable range to pass as inputs to the RNN) 261 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 262 | # frames *= lib.floatX(2) 263 | 264 | # Initial state of RNNs 265 | learned_h0 = lib.param( 266 | 'ConFrameLevel.h0', 267 | numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX) 268 | ) 269 | # Handling LEARN_H0 270 | learned_h0.param = LEARN_H0 #True 271 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM) #broadcast according to batch size,H0_MULT=1 272 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 273 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) #if reset=1,h0=learned_h0; if reset=0,h0=h0 274 | 275 | # Handling RNN_TYPE 276 | # Handling SKIP_CONN 277 | if RNN_TYPE == 'GRU': 278 | rnns_out, last_hidden = lib.ops.stackedGRU('ConFrameLevel.GRU', 279 | N_CON_RNN, 280 | CON_DIM, 281 | CON_TIER_DIM, 282 | frames, 283 | h0=h0, 284 | weightnorm=WEIGHT_NORM, 285 | skip_conn=SKIP_CONN) 286 | elif RNN_TYPE == 'LSTM': 287 | rnns_out, last_hidden = lib.ops.stackedLSTM('ConFrameLevel.LSTM', 288 | N_CON_RNN, 289 | CON_DIM, 290 | CON_TIER_DIM, 291 | frames, 292 | h0=h0, 293 | weightnorm=WEIGHT_NORM, 294 | skip_conn=SKIP_CONN) 295 | 296 | output = lib.ops.Linear( #batch*timestep*dim 297 | 'ConFrameLevel.Output', 298 | CON_TIER_DIM, 299 | BIG_DIM * CON_FRAME_SIZE / BIG_FRAME_SIZE, #1024*8/2 300 | rnns_out, 301 | initialization='he', 302 | weightnorm=WEIGHT_NORM 303 | ) 304 | output = output.reshape((output.shape[0], output.shape[1] * CON_FRAME_SIZE / BIG_FRAME_SIZE, BIG_DIM)) 305 | 306 | return (output, last_hidden) #last_hidden:#batch*1*dim 307 | 308 | def big_frame_level_rnn(input_sequences, other_input,h0, reset): 309 | """ 310 | input_sequences.shape: (batch size, n big frames * BIG_FRAME_SIZE) #BIG_FRAME_SIZE=8 311 | h0.shape: (batch size, N_BIG_RNN, BIG_DIM) #N_BIG_RNN=1,BIG_DIM=1024 312 | reset.shape: () 313 | output[0].shape: (batch size, n frames, DIM) 314 | output[1].shape: same as h0.shape 315 | output[2].shape: (batch size, seq len, Q_LEVELS) 316 | """ 317 | frames = input_sequences.reshape(( 318 | input_sequences.shape[0], 319 | input_sequences.shape[1] // (2*BIG_FRAME_SIZE), 320 | 2*BIG_FRAME_SIZE 321 | )) 322 | 323 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 324 | # (a reasonable range to pass as inputs to the RNN) 325 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 326 | frames *= lib.floatX(1) 327 | 328 | gru_input = lib.ops.Linear( 329 | 'BigFrameLevel.InputExpand', 330 | 2*BIG_FRAME_SIZE, 331 | BIG_DIM, 332 | frames, 333 | initialization='he', 334 | weightnorm=WEIGHT_NORM, 335 | ) + other_input 336 | 337 | # Initial state of RNNs 338 | learned_h0 = lib.param( 339 | 'BigFrameLevel.h0', 340 | numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX) 341 | ) 342 | # Handling LEARN_H0 343 | learned_h0.param = LEARN_H0 #True 344 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM) #broadcast according to batch size,H0_MULT=1 345 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 346 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) #if reset=1,h0=learned_h0; if reset=0,h0=h0 347 | 348 | # Handling RNN_TYPE 349 | # Handling SKIP_CONN 350 | if RNN_TYPE == 'GRU': 351 | rnns_out, last_hidden = lib.ops.stackedGRU('BigFrameLevel.GRU', 352 | N_BIG_RNN, 353 | BIG_DIM, 354 | BIG_DIM, 355 | gru_input, 356 | h0=h0, 357 | weightnorm=WEIGHT_NORM, 358 | skip_conn=SKIP_CONN) 359 | elif RNN_TYPE == 'LSTM': 360 | rnns_out, last_hidden = lib.ops.stackedLSTM('BigFrameLevel.LSTM', 361 | N_BIG_RNN, 362 | BIG_DIM, 363 | BIG_DIM, 364 | gru_input, 365 | h0=h0, 366 | weightnorm=WEIGHT_NORM, 367 | skip_conn=SKIP_CONN) 368 | 369 | output = lib.ops.Linear( #batch*timestep*dim 370 | 'BigFrameLevel.Output', 371 | BIG_DIM, 372 | DIM * BIG_FRAME_SIZE / FRAME_SIZE, #1024*8/2 373 | rnns_out, 374 | initialization='he', 375 | weightnorm=WEIGHT_NORM 376 | ) 377 | output = output.reshape((output.shape[0], output.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM)) 378 | 379 | return (output, last_hidden) #last_hidden:#batch*1*dim 380 | 381 | def frame_level_rnn(input_sequences, other_input, h0, reset): 382 | """ 383 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) #FRAME_SIZE=2 384 | other_input.shape: (batch size, n frames, DIM) 385 | h0.shape: (batch size, N_RNN, DIM) 386 | reset.shape: () 387 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 388 | """ 389 | frames = input_sequences.reshape(( 390 | input_sequences.shape[0], 391 | input_sequences.shape[1] // (2*FRAME_SIZE), 392 | 2*FRAME_SIZE 393 | )) 394 | 395 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 396 | # (a reasonable range to pass as inputs to the RNN) 397 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 398 | frames *= lib.floatX(1) 399 | 400 | gru_input = lib.ops.Linear( 401 | 'FrameLevel.InputExpand', 402 | 2*FRAME_SIZE, 403 | DIM, 404 | frames, 405 | initialization='he', 406 | weightnorm=WEIGHT_NORM, 407 | ) + other_input 408 | 409 | # Initial state of RNNs 410 | learned_h0 = lib.param( 411 | 'FrameLevel.h0', 412 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 413 | ) 414 | # Handling LEARN_H0 415 | learned_h0.param = LEARN_H0 416 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 417 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 418 | #learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 419 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 420 | 421 | # Handling RNN_TYPE 422 | # Handling SKIP_CONN 423 | if RNN_TYPE == 'GRU': 424 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 425 | N_RNN, 426 | DIM, 427 | DIM, 428 | gru_input, 429 | h0=h0, 430 | weightnorm=WEIGHT_NORM, 431 | skip_conn=SKIP_CONN) 432 | elif RNN_TYPE == 'LSTM': 433 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 434 | N_RNN, 435 | DIM, 436 | DIM, 437 | gru_input, 438 | h0=h0, 439 | weightnorm=WEIGHT_NORM, 440 | skip_conn=SKIP_CONN) 441 | 442 | output = lib.ops.Linear( 443 | 'FrameLevel.Output', 444 | DIM, 445 | FRAME_SIZE * DIM, 446 | rnns_out, 447 | initialization='he', 448 | weightnorm=WEIGHT_NORM 449 | ) 450 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 451 | 452 | return (output, last_hidden) 453 | def sample_level_predictor(frame_level_outputs, prev_samples): 454 | """ 455 | frame_level_outputs.shape: (batch size, DIM) 456 | prev_samples.shape: (batch size, FRAME_SIZE) 457 | output.shape: (batch size, Q_LEVELS) 458 | """ 459 | # Handling EMB_SIZE 460 | if EMB_SIZE == 0: # no support for one-hot in three_tier and one_tier. 461 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 462 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 463 | last_out_shape = Q_LEVELS 464 | elif EMB_SIZE > 0: #The embedding steps maps each of the q discrete values to a real-valued vector embedding. 465 | prev_samples = lib.ops.Embedding( #after embedding, the dim is batch size*FRANME_SIZE*EMB_SIZE 466 | 'SampleLevel.Embedding', 467 | Q_LEVELS, 468 | EMB_SIZE, 469 | prev_samples) 470 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 471 | last_out_shape = EMB_SIZE 472 | else: 473 | raise ValueError('EMB_SIZE cannot be negative.') 474 | 475 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) #dim:batch size*(FRAME_SIZE*EMB_SIZE) 476 | 477 | out = lib.ops.Linear( 478 | 'SampleLevel.L1_PrevSamples', 479 | FRAME_SIZE * last_out_shape, 480 | DIM, 481 | prev_samples, 482 | biases=False, 483 | initialization='he', 484 | weightnorm=WEIGHT_NORM 485 | ) 486 | 487 | out += frame_level_outputs 488 | # out = T.nnet.relu(out) # commented out to be similar to two_tier 489 | 490 | out = lib.ops.Linear('SampleLevel.L2', 491 | DIM, 492 | DIM, 493 | out, 494 | initialization='he', 495 | weightnorm=WEIGHT_NORM) 496 | out = T.nnet.relu(out) 497 | 498 | # L3 499 | out = lib.ops.Linear('SampleLevel.L3', 500 | DIM, 501 | DIM, 502 | out, 503 | initialization='he', 504 | weightnorm=WEIGHT_NORM) 505 | out = T.nnet.relu(out) 506 | 507 | # Output 508 | # We apply the softmax later 509 | out = lib.ops.Linear('SampleLevel.Output', 510 | DIM, 511 | Q_LEVELS, 512 | out, 513 | weightnorm=WEIGHT_NORM) 514 | return out 515 | 516 | sequences_8k = T.imatrix('sequences_8k') #batch size*samplenum 517 | sequences_up = T.imatrix('sequences_up') 518 | condition = T.matrix('con') 519 | con_h0 = T.tensor3('con_h0') 520 | h0 = T.tensor3('h0') #(batch size, N_RNN, DIM) 521 | big_h0 = T.tensor3('big_h0') #(batch size, N_BIG_RNN, BIG_DIM) 522 | reset = T.iscalar('reset') 523 | mask = T.matrix('mask') #batch size*samplenum 524 | batch_size =T.iscalar('batch_size') 525 | lr=T.scalar('lr') 526 | 527 | con_input_sequences = condition 528 | 529 | big_input_sequences = sequences_8k #The last BIG_FRAME_SIZE frames do not need (tier3) 530 | big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1)) 531 | big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1, 2*OVERLAP), neib_step=(1, OVERLAP), mode='valid') 532 | big_input_sequences=big_input_sequences.reshape((batch_size,-1)) 533 | 534 | input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)] #(tier2) 535 | input_sequences=input_sequences.reshape((1, batch_size, 1, -1)) 536 | input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1, 2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid') 537 | input_sequences=input_sequences.reshape((batch_size,-1)) 538 | target_sequences = sequences_up[:,0:-OVERLAP] #groundtrues 539 | 540 | target_mask = mask[:,0:-OVERLAP] 541 | 542 | con_frame_level_outputs, new_con_h0 = con_frame_level_rnn(con_input_sequences,con_h0,reset) 543 | 544 | big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, con_frame_level_outputs,big_h0, reset)#tier3->tier2 545 | 546 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1 547 | 548 | prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)] 549 | prev_samples = prev_samples.reshape((1, batch_size, 1, -1)) 550 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') #2-dim:([[x7,x8],[x8,x9],[x9,x10],...]) 551 | prev_samples = prev_samples.reshape((batch_size * SEQ_LEN, FRAME_SIZE)) 552 | 553 | 554 | sample_level_outputs = sample_level_predictor( 555 | frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)), 556 | prev_samples 557 | ) #sample_level_outputs dim:(BATCH_SIZE * SEQ_LEN, Q_LEVELS) -> [[x9pre],[x10pre],...] 558 | 559 | accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences) 560 | accuracy=accuracy*target_mask 561 | accuracy=T.sum(accuracy,axis=1) 562 | mask_sum=T.sum(target_mask,axis=1) 563 | 564 | cost = T.nnet.categorical_crossentropy( 565 | T.nnet.softmax(sample_level_outputs), #Every row represents a distribution(256 propability) 566 | target_sequences.flatten() #A list, represent the groundtruth of every row 567 | ) 568 | cost = cost.reshape(target_sequences.shape) 569 | cost = cost * target_mask #dim: batch*num 570 | # Don't use these lines; could end up with NaN 571 | # Specially at the end of audio files where mask is 572 | # all zero for some of the shorter files in mini-batch. 573 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 574 | #cost = cost.mean(axis=0) 575 | cost_sum=T.sum(cost,axis=1) 576 | # Use this one instead. 577 | cost = cost.sum() 578 | cost = cost / target_mask.sum() #cost average by samples 579 | 580 | # By default we report cross-entropy cost in bits. 581 | # Switch to nats by commenting out this line: 582 | # log_2(e) = 1.44269504089 583 | #cost = cost * lib.floatX(numpy.log2(numpy.e)) 584 | 585 | ########### 586 | 587 | test_fn=theano.function( 588 | [sequences_8k,sequences_up, condition,con_h0,big_h0,h0, reset, mask,batch_size], 589 | [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_con_h0,new_big_h0,new_h0], 590 | on_unused_input='warn' 591 | ) 592 | 593 | def generate_and_save_samples(tag): 594 | def write_audio_file(name, data): 595 | data = data.astype('float32') 596 | #data -= data.min() 597 | #data /= data.max() 598 | #data -= 0.5 599 | #data *= 0.95 600 | scipy.io.wavfile.write( 601 | os.path.join(SAMPLES_PATH, name), 602 | BITRATE, 603 | data) 604 | 605 | total_time=time() 606 | costs_g = [] 607 | accuracys_g=[] 608 | samples_low_list=[] 609 | samples_list=[] 610 | masks_g_index=[] 611 | samples_number=0 612 | count=0 613 | data_feeder = load_data(test_feeder) 614 | for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,con_g,batch_g,seqs_g_8k_real in data_feeder: 615 | if reset_g==1: 616 | con_h0_g=numpy.zeros((batch_g, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32') 617 | big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 618 | h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32') 619 | cost_batch=np.zeros((batch_g,),dtype='float32') 620 | accuracy_batch=np.zeros((batch_g,),dtype='float32') 621 | mask_batch=np.zeros((batch_g,),dtype='float32') 622 | cost_g, accuracy_g,mask_sum_g,sample, con_h0_g,big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, con_g,con_h0_g,big_h0_g,h0_g, reset_g, mask_g,batch_g) 623 | cost_batch=cost_batch+cost_g 624 | accuracy_batch=accuracy_batch+accuracy_g 625 | mask_batch=mask_batch+mask_sum_g 626 | if end_flag_g==1: 627 | costs_g.extend(list(cost_batch/mask_batch)) 628 | accuracys_g.extend(list(accuracy_batch/mask_batch)) 629 | 630 | if reset_g==1: 631 | samples_low=seqs_g_8k_real[:,0:-OVERLAP] 632 | samples=sample 633 | masks_g=mask_g[:,0:-OVERLAP] 634 | else: 635 | samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1) 636 | samples=np.concatenate([samples,sample],axis=1) 637 | masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1) 638 | 639 | if end_flag_g==1: 640 | samples_low_list.append(samples_low) 641 | samples_list.append(samples) 642 | masks_g_index.append(masks_g) 643 | fid=open('datasets/TIMIT/test_list.scp','r') 644 | test_id_list=fid.readlines() 645 | for i in xrange(len(samples_list)): 646 | samples_number+=samples_list[i].shape[0]*samples_list[i].shape[1] 647 | for j in xrange(samples_list[i].shape[0]): 648 | samples_lowi=samples_low_list[i][j] 649 | samplei=samples_list[i][j] 650 | maski=masks_g_index[i][j] 651 | samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])] 652 | samplei=samplei[0:len(np.where(maski==1)[0])] 653 | if Q_TYPE == 'mu-law': 654 | from datasets.dataset import mu2linear 655 | samplei = mu2linear(samplei) 656 | write_audio_file(test_id_list[count].split()[0], samplei/3+samples_lowi) 657 | count+=1 658 | 659 | 660 | total_time = time() - total_time 661 | log = "192 samples generated in {} minutes.\nThe time of generating 1 second speech is {} seconds." 662 | log = log.format(total_time/60,total_time/samples_number*16000) 663 | print log, 664 | 665 | return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time,list(np.array(accuracys_g)*100) 666 | 667 | ### Handling the resume option: 668 | if RESUME: 669 | # Check if checkpoint from previous run is not corrupted. 670 | # Then overwrite some of the variables above. 671 | iters_to_consume, res_path, epoch, total_iters,\ 672 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 673 | lib.resumable(path=FOLDER_PREFIX, 674 | iter_key=iter_str, 675 | epoch_key=epoch_str, 676 | add_resume_counter=True, 677 | other_keys=[lowest_valid_str, 678 | corresp_test_str, 679 | test_nll_str]) 680 | # At this point we saved the pkl file. 681 | last_print_iters = total_iters 682 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 683 | # Consumes this much iters to get to the last point in training data. 684 | consume_time = time() 685 | consume_time = time() - consume_time 686 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 687 | format(consume_time, iters_to_consume) 688 | 689 | lib.load_params(res_path) 690 | print "Parameters from last available checkpoint loaded." 691 | 692 | tag='gen' 693 | test_cost, test_accuracy,test_time,test_accuracy_list=generate_and_save_samples(tag) 694 | print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time) -------------------------------------------------------------------------------- /CHRNN_HF/models/four_tier/four_tier_train_valid.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | from datetime import datetime 3 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 4 | exp_start = time() 5 | 6 | import os, sys, glob 7 | sys.path.insert(1, os.getcwd()) 8 | import argparse 9 | import itertools 10 | 11 | import numpy 12 | numpy.random.seed(123) 13 | np = numpy 14 | import random 15 | random.seed(123) 16 | 17 | import theano 18 | import theano.tensor as T 19 | import theano.ifelse 20 | import lasagne 21 | import scipy.io.wavfile 22 | 23 | import lib 24 | 25 | LEARNING_RATE = 0.001 26 | 27 | ### Parsing passed args/hyperparameters ### 28 | def get_args(): 29 | def t_or_f(arg): 30 | ua = str(arg).upper() 31 | if 'TRUE'.startswith(ua): 32 | return True 33 | elif 'FALSE'.startswith(ua): 34 | return False 35 | else: 36 | raise ValueError('Arg is neither `True` nor `False`') 37 | 38 | def check_non_negative(value): 39 | ivalue = int(value) 40 | if ivalue < 0: 41 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 42 | return ivalue 43 | 44 | def check_positive(value): 45 | ivalue = int(value) 46 | if ivalue < 1: 47 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 48 | return ivalue 49 | 50 | def check_unit_interval(value): 51 | fvalue = float(value) 52 | if fvalue < 0 or fvalue > 1: 53 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 54 | return fvalue 55 | 56 | # No default value here. Indicate every single arguement. 57 | parser = argparse.ArgumentParser( 58 | description='three_tier.py\nNo default value! Indicate every argument.') 59 | 60 | # TODO: Fix the descriptions 61 | # Hyperparameter arguements: 62 | parser.add_argument('--exp', help='Experiment name', 63 | type=str, required=False, default='_') 64 | parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True) 65 | parser.add_argument('--con_dim', help='Condition dimension',\ 66 | type=check_positive, required=True) 67 | parser.add_argument('--con_frame_size', help='How many samples per condition frame',\ 68 | type=check_positive, required=True) 69 | parser.add_argument('--big_frame_size', help='How many samples per big frame',\ 70 | type=check_positive, required=True) 71 | parser.add_argument('--frame_size', help='How many samples per frame',\ 72 | type=check_positive, required=True) 73 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\ 74 | type=t_or_f, required=True) 75 | parser.add_argument('--emb_size', help='Size of embedding layer (> 0)', 76 | type=check_positive, required=True) # different than two_tier 77 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', 78 | type=t_or_f, required=True) 79 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 80 | type=check_positive, required=True) 81 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 82 | type=check_positive, choices=xrange(1,6), required=True) 83 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 84 | required=True) 85 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 86 | type=t_or_f, required=True) 87 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 88 | type=check_positive, required=True) 89 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 90 | choices=['linear', 'a-law', 'mu-law'], required=True) 91 | parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK', 92 | choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True) 93 | parser.add_argument('--batch_size', help='size of mini-batch', 94 | type=check_positive, choices=[50,64, 128, 256], required=True) 95 | 96 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 97 | required=False, default=False, action='store_true') 98 | 99 | args = parser.parse_args() 100 | 101 | # NEW 102 | # Create tag for this experiment based on passed args 103 | tag='four_tier_model' 104 | print "Created experiment tag for these args:" 105 | print tag 106 | 107 | return args, tag 108 | 109 | #tag:three_tier.py-expAXIS1-seq_len512-big_frame_size8-frame_size2-weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F-q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001 110 | args, tag = get_args() 111 | 112 | SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (512) 113 | #print "------------------previous SEQ_LEN:", SEQ_LEN 114 | # TODO: test incremental training 115 | #SEQ_LEN = 512 + 256 116 | #print "---------------------------new SEQ_LEN:", SEQ_LEN 117 | CON_DIM=args.con_dim 118 | CON_FRAME_SIZE=args.con_frame_size 119 | BIG_FRAME_SIZE = args.big_frame_size # how many samples per big frame 120 | FRAME_SIZE = args.frame_size # How many samples per frame 121 | WEIGHT_NORM = args.weight_norm #True 122 | EMB_SIZE = args.emb_size #(256) 123 | SKIP_CONN = args.skip_conn #(False) 124 | DIM = args.dim # Model dimensionality. (1024) 125 | BIG_DIM = DIM # Dimensionality for the slowest level. (1024) 126 | CON_TIER_DIM=DIM 127 | N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (1) 128 | N_BIG_RNN = N_RNN # how many RNNs to stack in the big-frame-level model (1) 129 | N_CON_RNN=N_RNN 130 | RNN_TYPE = args.rnn_type #GRU 131 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 #(1) 132 | LEARN_H0 = args.learn_h0 #(True) 133 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization #(256) 134 | Q_TYPE = args.q_type # log- or linear-scale #(linear) 135 | WHICH_SET = args.which_set #(MUSIC) 136 | BATCH_SIZE = args.batch_size #(128) 137 | RESUME = args.resume #(False) 138 | assert SEQ_LEN % CON_FRAME_SIZE == 0,\ 139 | 'seq_len should be divisible by con_frame_size' 140 | assert CON_FRAME_SIZE % BIG_FRAME_SIZE == 0,\ 141 | 'con_frame_size should be divisible by big_frame_size' 142 | assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\ 143 | 'big_frame_size should be divisible by frame_size' 144 | 145 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 146 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 147 | 148 | # Fixed hyperparams 149 | GRAD_CLIP = 1 # Elementwise grad clip threshold 150 | BITRATE = 16000 151 | 152 | # Other constants 153 | TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 154 | #TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 155 | #TRAIN_MODE = 'time-iters' 156 | # To use PRINT_TIME for validation, 157 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 158 | #TRAIN_MODE = 'iters-time' 159 | # To use PRINT_ITERS for validation, 160 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 161 | PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations. 162 | STOP_ITERS = 300000 # Stop after this many iterations 163 | PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds. 164 | STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 165 | N_SEQS = 5 # Number of samples to generate every time monitoring. 166 | RESULTS_DIR = 'results_4t' 167 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 168 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 169 | OVERLAP = BIG_FRAME_SIZE 170 | 171 | epoch_str = 'epoch' 172 | iter_str = 'iter' 173 | lowest_valid_str = 'lowest valid cost' 174 | corresp_test_str = 'correponding test cost' 175 | train_nll_str, valid_nll_str, test_nll_str = \ 176 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 177 | 178 | ### Create directories ### 179 | # FOLDER_PREFIX: root, contains: 180 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 181 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 182 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 183 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 184 | if not os.path.exists(FOLDER_PREFIX): 185 | os.makedirs(FOLDER_PREFIX) 186 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 187 | if not os.path.exists(PARAMS_PATH): 188 | os.makedirs(PARAMS_PATH) 189 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 190 | if not os.path.exists(SAMPLES_PATH): 191 | os.makedirs(SAMPLES_PATH) 192 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 193 | if not os.path.exists(BEST_PATH): 194 | os.makedirs(BEST_PATH) 195 | 196 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 197 | 198 | ### Import the data_feeder ### 199 | # Handling WHICH_SET 200 | if WHICH_SET == 'TIMIT': 201 | from datasets.dataset import TIMIT_train_feed_epoch as train_feeder 202 | from datasets.dataset import TIMIT_valid_feed_epoch as valid_feeder 203 | from datasets.dataset import TIMIT_test_feed_epoch as test_feeder 204 | 205 | def load_data(data_feeder): 206 | """ 207 | Helper function to deal with interface of different datasets. 208 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 209 | """ 210 | return data_feeder(BATCH_SIZE, 211 | SEQ_LEN, 212 | CON_FRAME_SIZE, 213 | CON_DIM, 214 | OVERLAP, 215 | Q_LEVELS, 216 | Q_ZERO, 217 | Q_TYPE) 218 | 219 | ### Creating computation graph ### 220 | def con_frame_level_rnn(input_sequences, h0, reset): 221 | """ 222 | input_sequences.shape: (batch size, n con frames * CON_DIM) 223 | h0.shape: (batch size, N_BIG_RNN, BIG_DIM) #N_BIG_RNN=1,BIG_DIM=1024 224 | reset.shape: () 225 | output[0].shape: (batch size, n frames, DIM) 226 | output[1].shape: same as h0.shape 227 | output[2].shape: (batch size, seq len, Q_LEVELS) 228 | """ 229 | 230 | frames = input_sequences.reshape(( 231 | input_sequences.shape[0], 232 | input_sequences.shape[1] // CON_DIM, 233 | CON_DIM 234 | )) 235 | 236 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 237 | # (a reasonable range to pass as inputs to the RNN) 238 | # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 239 | # frames *= lib.floatX(2) 240 | 241 | # Initial state of RNNs 242 | learned_h0 = lib.param( 243 | 'ConFrameLevel.h0', 244 | numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX) 245 | ) 246 | # Handling LEARN_H0 247 | learned_h0.param = LEARN_H0 #True 248 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM) #broadcast according to batch size,H0_MULT=1 249 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 250 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) #if reset=1,h0=learned_h0; if reset=0,h0=h0 251 | 252 | # Handling RNN_TYPE 253 | # Handling SKIP_CONN 254 | if RNN_TYPE == 'GRU': 255 | rnns_out, last_hidden = lib.ops.stackedGRU('ConFrameLevel.GRU', 256 | N_CON_RNN, 257 | CON_DIM, 258 | CON_TIER_DIM, 259 | frames, 260 | h0=h0, 261 | weightnorm=WEIGHT_NORM, 262 | skip_conn=SKIP_CONN) 263 | elif RNN_TYPE == 'LSTM': 264 | rnns_out, last_hidden = lib.ops.stackedLSTM('ConFrameLevel.LSTM', 265 | N_CON_RNN, 266 | CON_DIM, 267 | CON_TIER_DIM, 268 | frames, 269 | h0=h0, 270 | weightnorm=WEIGHT_NORM, 271 | skip_conn=SKIP_CONN) 272 | 273 | output = lib.ops.Linear( #batch*timestep*dim 274 | 'ConFrameLevel.Output', 275 | CON_TIER_DIM, 276 | BIG_DIM * CON_FRAME_SIZE / BIG_FRAME_SIZE, #1024*8/2 277 | rnns_out, 278 | initialization='he', 279 | weightnorm=WEIGHT_NORM 280 | ) 281 | output = output.reshape((output.shape[0], output.shape[1] * CON_FRAME_SIZE / BIG_FRAME_SIZE, BIG_DIM)) 282 | 283 | return (output, last_hidden) #last_hidden:#batch*1*dim 284 | 285 | def big_frame_level_rnn(input_sequences, other_input,h0, reset): 286 | """ 287 | input_sequences.shape: (batch size, n big frames * BIG_FRAME_SIZE) #BIG_FRAME_SIZE=8 288 | h0.shape: (batch size, N_BIG_RNN, BIG_DIM) #N_BIG_RNN=1,BIG_DIM=1024 289 | reset.shape: () 290 | output[0].shape: (batch size, n frames, DIM) 291 | output[1].shape: same as h0.shape 292 | output[2].shape: (batch size, seq len, Q_LEVELS) 293 | """ 294 | frames = input_sequences.reshape(( 295 | input_sequences.shape[0], 296 | input_sequences.shape[1] // (2*BIG_FRAME_SIZE), 297 | 2*BIG_FRAME_SIZE 298 | )) 299 | 300 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 301 | # (a reasonable range to pass as inputs to the RNN) 302 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 303 | frames *= lib.floatX(1) 304 | 305 | gru_input = lib.ops.Linear( 306 | 'BigFrameLevel.InputExpand', 307 | 2*BIG_FRAME_SIZE, 308 | BIG_DIM, 309 | frames, 310 | initialization='he', 311 | weightnorm=WEIGHT_NORM, 312 | ) + other_input 313 | 314 | # Initial state of RNNs 315 | learned_h0 = lib.param( 316 | 'BigFrameLevel.h0', 317 | numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX) 318 | ) 319 | # Handling LEARN_H0 320 | learned_h0.param = LEARN_H0 #True 321 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM) #broadcast according to batch size,H0_MULT=1 322 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 323 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) #if reset=1,h0=learned_h0; if reset=0,h0=h0 324 | 325 | # Handling RNN_TYPE 326 | # Handling SKIP_CONN 327 | if RNN_TYPE == 'GRU': 328 | rnns_out, last_hidden = lib.ops.stackedGRU('BigFrameLevel.GRU', 329 | N_BIG_RNN, 330 | BIG_DIM, 331 | BIG_DIM, 332 | gru_input, 333 | h0=h0, 334 | weightnorm=WEIGHT_NORM, 335 | skip_conn=SKIP_CONN) 336 | elif RNN_TYPE == 'LSTM': 337 | rnns_out, last_hidden = lib.ops.stackedLSTM('BigFrameLevel.LSTM', 338 | N_BIG_RNN, 339 | BIG_DIM, 340 | BIG_DIM, 341 | gru_input, 342 | h0=h0, 343 | weightnorm=WEIGHT_NORM, 344 | skip_conn=SKIP_CONN) 345 | 346 | output = lib.ops.Linear( #batch*timestep*dim 347 | 'BigFrameLevel.Output', 348 | BIG_DIM, 349 | DIM * BIG_FRAME_SIZE / FRAME_SIZE, #1024*8/2 350 | rnns_out, 351 | initialization='he', 352 | weightnorm=WEIGHT_NORM 353 | ) 354 | output = output.reshape((output.shape[0], output.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM)) 355 | 356 | return (output, last_hidden) #last_hidden:#batch*1*dim 357 | 358 | def frame_level_rnn(input_sequences, other_input, h0, reset): 359 | """ 360 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) #FRAME_SIZE=2 361 | other_input.shape: (batch size, n frames, DIM) 362 | h0.shape: (batch size, N_RNN, DIM) 363 | reset.shape: () 364 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 365 | """ 366 | frames = input_sequences.reshape(( 367 | input_sequences.shape[0], 368 | input_sequences.shape[1] // (2*FRAME_SIZE), 369 | 2*FRAME_SIZE 370 | )) 371 | 372 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 373 | # (a reasonable range to pass as inputs to the RNN) 374 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 375 | frames *= lib.floatX(1) 376 | 377 | gru_input = lib.ops.Linear( 378 | 'FrameLevel.InputExpand', 379 | 2*FRAME_SIZE, 380 | DIM, 381 | frames, 382 | initialization='he', 383 | weightnorm=WEIGHT_NORM, 384 | ) + other_input 385 | 386 | # Initial state of RNNs 387 | learned_h0 = lib.param( 388 | 'FrameLevel.h0', 389 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 390 | ) 391 | # Handling LEARN_H0 392 | learned_h0.param = LEARN_H0 393 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 394 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 395 | #learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 396 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 397 | 398 | # Handling RNN_TYPE 399 | # Handling SKIP_CONN 400 | if RNN_TYPE == 'GRU': 401 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 402 | N_RNN, 403 | DIM, 404 | DIM, 405 | gru_input, 406 | h0=h0, 407 | weightnorm=WEIGHT_NORM, 408 | skip_conn=SKIP_CONN) 409 | elif RNN_TYPE == 'LSTM': 410 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 411 | N_RNN, 412 | DIM, 413 | DIM, 414 | gru_input, 415 | h0=h0, 416 | weightnorm=WEIGHT_NORM, 417 | skip_conn=SKIP_CONN) 418 | 419 | output = lib.ops.Linear( 420 | 'FrameLevel.Output', 421 | DIM, 422 | FRAME_SIZE * DIM, 423 | rnns_out, 424 | initialization='he', 425 | weightnorm=WEIGHT_NORM 426 | ) 427 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 428 | 429 | return (output, last_hidden) 430 | 431 | def sample_level_predictor(frame_level_outputs, prev_samples): 432 | """ 433 | frame_level_outputs.shape: (batch size, DIM) 434 | prev_samples.shape: (batch size, FRAME_SIZE) 435 | output.shape: (batch size, Q_LEVELS) 436 | """ 437 | # Handling EMB_SIZE 438 | if EMB_SIZE == 0: # no support for one-hot in three_tier and one_tier. 439 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 440 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 441 | last_out_shape = Q_LEVELS 442 | elif EMB_SIZE > 0: #The embedding steps maps each of the q discrete values to a real-valued vector embedding. 443 | prev_samples = lib.ops.Embedding( #after embedding, the dim is batch size*FRANME_SIZE*EMB_SIZE 444 | 'SampleLevel.Embedding', 445 | Q_LEVELS, 446 | EMB_SIZE, 447 | prev_samples) 448 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 449 | last_out_shape = EMB_SIZE 450 | else: 451 | raise ValueError('EMB_SIZE cannot be negative.') 452 | 453 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) #dim:batch size*(FRAME_SIZE*EMB_SIZE) 454 | 455 | out = lib.ops.Linear( 456 | 'SampleLevel.L1_PrevSamples', 457 | FRAME_SIZE * last_out_shape, 458 | DIM, 459 | prev_samples, 460 | biases=False, 461 | initialization='he', 462 | weightnorm=WEIGHT_NORM 463 | ) 464 | 465 | out += frame_level_outputs 466 | # out = T.nnet.relu(out) # commented out to be similar to two_tier 467 | 468 | out = lib.ops.Linear('SampleLevel.L2', 469 | DIM, 470 | DIM, 471 | out, 472 | initialization='he', 473 | weightnorm=WEIGHT_NORM) 474 | out = T.nnet.relu(out) 475 | 476 | # L3 477 | out = lib.ops.Linear('SampleLevel.L3', 478 | DIM, 479 | DIM, 480 | out, 481 | initialization='he', 482 | weightnorm=WEIGHT_NORM) 483 | out = T.nnet.relu(out) 484 | 485 | # Output 486 | # We apply the softmax later 487 | out = lib.ops.Linear('SampleLevel.Output', 488 | DIM, 489 | Q_LEVELS, 490 | out, 491 | weightnorm=WEIGHT_NORM) 492 | return out 493 | 494 | sequences_8k = T.imatrix('sequences_8k') #batch size*samplenum 495 | sequences_up = T.imatrix('sequences_up') 496 | condition = T.matrix('con') 497 | con_h0 = T.tensor3('con_h0') 498 | h0 = T.tensor3('h0') #(batch size, N_RNN, DIM) 499 | big_h0 = T.tensor3('big_h0') #(batch size, N_BIG_RNN, BIG_DIM) 500 | reset = T.iscalar('reset') 501 | mask = T.matrix('mask') #batch size*samplenum 502 | batch_size =T.iscalar('batch_size') 503 | lr=T.scalar('lr') 504 | 505 | con_input_sequences = condition 506 | 507 | big_input_sequences = sequences_8k #The last BIG_FRAME_SIZE frames do not need (tier3) 508 | big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1)) 509 | big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1, 2*OVERLAP), neib_step=(1, OVERLAP), mode='valid') 510 | big_input_sequences=big_input_sequences.reshape((batch_size,-1)) 511 | 512 | input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)] #(tier2) 513 | input_sequences=input_sequences.reshape((1, batch_size, 1, -1)) 514 | input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1, 2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid') 515 | input_sequences=input_sequences.reshape((batch_size,-1)) 516 | target_sequences = sequences_up[:,0:-OVERLAP] #groundtrues 517 | 518 | target_mask = mask[:,0:-OVERLAP] 519 | 520 | con_frame_level_outputs, new_con_h0 = con_frame_level_rnn(con_input_sequences,con_h0,reset) 521 | 522 | big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, con_frame_level_outputs,big_h0, reset)#tier3->tier2 523 | 524 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1 525 | 526 | prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)] 527 | prev_samples = prev_samples.reshape((1, batch_size, 1, -1)) 528 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') #2-dim:([[x7,x8],[x8,x9],[x9,x10],...]) 529 | prev_samples = prev_samples.reshape((batch_size * SEQ_LEN, FRAME_SIZE)) 530 | 531 | sample_level_outputs = sample_level_predictor( 532 | frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)), 533 | prev_samples 534 | ) #sample_level_outputs dim:(BATCH_SIZE * SEQ_LEN, Q_LEVELS) -> [[x9pre],[x10pre],...] 535 | 536 | accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences) 537 | accuracy=accuracy*target_mask 538 | accuracy=T.sum(accuracy,axis=1) 539 | mask_sum=T.sum(target_mask,axis=1) 540 | 541 | cost = T.nnet.categorical_crossentropy( 542 | T.nnet.softmax(sample_level_outputs), #Every row represents a distribution(256 propability) 543 | target_sequences.flatten() #A list, represent the groundtruth of every row 544 | ) 545 | cost = cost.reshape(target_sequences.shape) 546 | cost = cost * target_mask #dim: batch*num 547 | # Don't use these lines; could end up with NaN 548 | # Specially at the end of audio files where mask is 549 | # all zero for some of the shorter files in mini-batch. 550 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 551 | #cost = cost.mean(axis=0) 552 | cost_sum=T.sum(cost,axis=1) 553 | # Use this one instead. 554 | cost = cost.sum() 555 | cost = cost / target_mask.sum() #cost average by samples 556 | 557 | # By default we report cross-entropy cost in bits. 558 | # Switch to nats by commenting out this line: 559 | # log_2(e) = 1.44269504089 560 | #cost = cost * lib.floatX(numpy.log2(numpy.e)) 561 | 562 | ########### 563 | all_params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) #if LEARN_H0=True,then learn_h0 is included in parmeters to train 564 | 565 | lib.print_params_info(all_params, path=FOLDER_PREFIX) 566 | 567 | grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn') 568 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 569 | 570 | updates = lasagne.updates.adam(grads, all_params,learning_rate=lr) 571 | 572 | # Training function(s) 573 | train_fn = theano.function( 574 | [sequences_8k,sequences_up, condition, con_h0,big_h0, h0, reset, mask,batch_size,lr], 575 | [cost, new_con_h0,new_big_h0, new_h0], 576 | updates=updates, 577 | on_unused_input='warn' 578 | ) 579 | 580 | # Validation and Test function, hence no updates 581 | valid_fn = theano.function( 582 | [sequences_8k,sequences_up, condition,con_h0,big_h0,h0, reset, mask,batch_size], 583 | [cost_sum, accuracy,mask_sum,new_con_h0,new_big_h0,new_h0], 584 | on_unused_input='warn' 585 | ) 586 | 587 | test_fn=theano.function( 588 | [sequences_8k,sequences_up, condition,con_h0,big_h0,h0, reset, mask,batch_size], 589 | [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_con_h0,new_big_h0,new_h0], 590 | on_unused_input='warn' 591 | ) 592 | 593 | def generate_and_save_samples(tag): 594 | def write_audio_file(name, data): 595 | data = data.astype('float32') 596 | #data -= data.min() 597 | #data /= data.max() 598 | #data -= 0.5 599 | #data *= 0.95 600 | scipy.io.wavfile.write( 601 | os.path.join(SAMPLES_PATH, name+'.wav'), 602 | BITRATE, 603 | data) 604 | 605 | total_time=time() 606 | costs_g = [] 607 | accuracys_g=[] 608 | count=0 609 | data_feeder = load_data(test_feeder) 610 | for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,con_g,batch_g,seqs_g_8k_real in data_feeder: 611 | if reset_g==1: 612 | con_h0_g=numpy.zeros((batch_g, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32') 613 | big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 614 | h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32') 615 | cost_batch=np.zeros((batch_g,),dtype='float32') 616 | accuracy_batch=np.zeros((batch_g,),dtype='float32') 617 | mask_batch=np.zeros((batch_g,),dtype='float32') 618 | count+=1 619 | cost_g, accuracy_g,mask_sum_g,sample, con_h0_g,big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, con_g,con_h0_g,big_h0_g,h0_g, reset_g, mask_g,batch_g) 620 | cost_batch=cost_batch+cost_g 621 | accuracy_batch=accuracy_batch+accuracy_g 622 | mask_batch=mask_batch+mask_sum_g 623 | if end_flag_g==1: 624 | costs_g.extend(list(cost_batch/mask_batch)) 625 | accuracys_g.extend(list(accuracy_batch/mask_batch)) 626 | 627 | if count==1: 628 | if reset_g==1: 629 | samples_low=seqs_g_8k_real[:,0:-OVERLAP] 630 | samples=sample 631 | masks_g=mask_g[:,0:-OVERLAP] 632 | else: 633 | samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1) 634 | samples=np.concatenate([samples,sample],axis=1) 635 | masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1) 636 | 637 | 638 | for i in xrange(N_SEQS): 639 | samples_lowi=samples_low[i] 640 | samplei=samples[i] 641 | maski=masks_g[i] 642 | samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])] 643 | samplei=samplei[0:len(np.where(maski==1)[0])] 644 | if Q_TYPE == 'mu-law': 645 | from datasets.dataset import mu2linear 646 | samplei = mu2linear(samplei) 647 | write_audio_file("sample_{}_{}".format(tag, i), samplei/3+samples_lowi) 648 | 649 | total_time = time() - total_time 650 | log = "{} samples generated in {} seconds." 651 | log = log.format(N_SEQS, total_time) 652 | print log, 653 | 654 | return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time 655 | 656 | 657 | def monitor(data_feeder): 658 | """ 659 | Cost and time of test_fn on a given dataset section. 660 | Pass only one of `valid_feeder` or `test_feeder`. 661 | Don't pass `train_feed`. 662 | 663 | :returns: 664 | Mean cost over the input dataset (data_feeder) 665 | Total time spent 666 | """ 667 | _total_time = time() 668 | _costs = [] 669 | _accuracys=[] 670 | _data_feeder = load_data(data_feeder) 671 | for _seqs_8k,_seqs_up, _reset, _end_flag,_mask,_con,_batch,_seqs_8k_real in _data_feeder: 672 | if _reset==1: 673 | _con_h0=numpy.zeros((_batch, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32') 674 | _big_h0=numpy.zeros((_batch, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 675 | _h0 = numpy.zeros((_batch, N_RNN, H0_MULT*DIM), dtype='float32') 676 | _cost_batch=np.zeros((_batch,),dtype='float32') 677 | _accuracy_batch=np.zeros((_batch,),dtype='float32') 678 | _mask_batch=np.zeros((_batch,),dtype='float32') 679 | _cost, _accuracy,_mask_sum,_con_h0,_big_h0,_h0 = valid_fn(_seqs_8k,_seqs_up, _con,_con_h0,_big_h0,_h0, _reset, _mask,_batch) 680 | _cost_batch=_cost_batch+_cost 681 | _accuracy_batch=_accuracy_batch+_accuracy 682 | _mask_batch=_mask_batch+_mask_sum 683 | if _end_flag==1: 684 | _costs.extend(list(_cost_batch/_mask_batch)) 685 | _accuracys.extend(list(_accuracy_batch/_mask_batch)) 686 | 687 | 688 | return numpy.mean(_costs), numpy.mean(_accuracys)*100,time() - _total_time 689 | 690 | print "Wall clock time spent before training started: {:.2f}h"\ 691 | .format((time()-exp_start)/3600.) 692 | print "Training!" 693 | total_iters = 0 694 | total_time = 0. 695 | last_print_time = 0. 696 | last_print_iters = 0 697 | costs = [] 698 | lowest_valid_cost = numpy.finfo(numpy.float32).max 699 | corresponding_test_cost = numpy.finfo(numpy.float32).max 700 | new_lowest_cost = False 701 | end_of_batch = False 702 | epoch = 0 703 | learning_rate=LEARNING_RATE 704 | 705 | # Initial load train dataset 706 | tr_feeder = load_data(train_feeder) 707 | 708 | ### Handling the resume option: 709 | if RESUME: 710 | # Check if checkpoint from previous run is not corrupted. 711 | # Then overwrite some of the variables above. 712 | iters_to_consume, res_path, epoch, total_iters,\ 713 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 714 | lib.resumable(path=FOLDER_PREFIX, 715 | iter_key=iter_str, 716 | epoch_key=epoch_str, 717 | add_resume_counter=True, 718 | other_keys=[lowest_valid_str, 719 | corresp_test_str, 720 | test_nll_str]) 721 | # At this point we saved the pkl file. 722 | last_print_iters = total_iters 723 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 724 | # Consumes this much iters to get to the last point in training data. 725 | consume_time = time() 726 | for i in xrange(iters_to_consume): 727 | tr_feeder.next() 728 | consume_time = time() - consume_time 729 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 730 | format(consume_time, iters_to_consume) 731 | 732 | lib.load_params(res_path) 733 | print "Parameters from last available checkpoint loaded." 734 | 735 | while True: 736 | # THIS IS ONE ITERATION 737 | if total_iters % 500 == 0: 738 | print total_iters, 739 | 740 | total_iters += 1 741 | 742 | try: 743 | # Take as many mini-batches as possible from train set 744 | mini_batch = tr_feeder.next() 745 | except StopIteration: 746 | # Mini-batches are finished. Load it again. 747 | # Basically, one epoch. 748 | tr_feeder = load_data(train_feeder) 749 | 750 | # and start taking new mini-batches again. 751 | mini_batch = tr_feeder.next() 752 | epoch += 1 753 | end_of_batch = True 754 | print "[Another epoch]", 755 | 756 | seqs_8k, seqs_up,reset, end_flag,mask,con,batch_num,seqs_8k_real = mini_batch 757 | if reset==1: 758 | con_h0=numpy.zeros((batch_num, N_CON_RNN, H0_MULT*CON_TIER_DIM), dtype='float32') 759 | big_h0=numpy.zeros((batch_num, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 760 | h0 = numpy.zeros((batch_num, N_RNN, H0_MULT*DIM), dtype='float32') 761 | 762 | start_time = time() 763 | cost,con_h0,big_h0,h0 = train_fn(seqs_8k, seqs_up, con,con_h0, big_h0, h0, reset, mask,batch_num,learning_rate) 764 | total_time += time() - start_time 765 | #print "This cost:", cost, "This h0.mean()", h0.mean() 766 | 767 | costs.append(cost) 768 | 769 | # Monitoring step 770 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 771 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME) or \ 772 | (TRAIN_MODE=='time-iters' and total_time-last_print_time >= PRINT_TIME) or \ 773 | (TRAIN_MODE=='iters-time' and total_iters-last_print_iters >= PRINT_ITERS) or \ 774 | end_of_batch: 775 | # 0. Validation 776 | print "\nValidation!", 777 | valid_cost, valid_accuracy,valid_time = monitor(valid_feeder) 778 | print "Done!" 779 | 780 | # 1. Test 781 | test_time = 0. 782 | # Only when the validation cost is improved get the cost for test set. 783 | if valid_cost < lowest_valid_cost: 784 | lowest_valid_cost = valid_cost 785 | print "\n>>> Best validation cost of {} reached."\ 786 | .format(valid_cost), 787 | #test_cost, test_time = monitor(test_feeder) 788 | #print "Done!" 789 | # Report last one which is the lowest on validation set: 790 | #print ">>> test cost:{}\ttotal time:{}".format(test_cost, test_time) 791 | #corresponding_test_cost = test_cost 792 | new_lowest_cost = True 793 | 794 | tag = "e{}_i{}_t{:.2f}_tr{:.4f}_v{:.4f}" 795 | tag = tag.format(epoch, 796 | total_iters, 797 | total_time/3600, 798 | numpy.mean(cost), 799 | valid_cost) 800 | tag += ("_best" if new_lowest_cost else "") 801 | 802 | print "Sampling!", 803 | # Generate samples 804 | test_cost, test_accuracy,test_time=generate_and_save_samples(tag) 805 | print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time) 806 | if new_lowest_cost: 807 | corresponding_test_cost = test_cost 808 | print "Done!" 809 | 810 | # 2. Stdout the training progress 811 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 812 | print_info += ">>> Lowest valid cost:{}\t Corresponding test cost:{}\n" 813 | print_info += "\ttrain cost:{:.4f}\ttotal time:{:.2f}h\tper iter:{:.3f}s\n" 814 | print_info += "\tvalid cost:{:.4f}\tvalid accuracy:{:.4f}%\ttotal time:{:.2f}h\n" 815 | print_info += "\ttest cost:{:.4f}\ttest accuracy:{:.4f}%\ttotal time:{:.2f}h" 816 | print_info = print_info.format(epoch, 817 | total_iters, 818 | (time()-exp_start)/3600, 819 | lowest_valid_cost, 820 | corresponding_test_cost, 821 | numpy.mean(costs), 822 | total_time/3600, 823 | total_time/total_iters, 824 | valid_cost, 825 | valid_accuracy, 826 | valid_time/3600, 827 | test_cost, 828 | test_accuracy, 829 | test_time/3600) 830 | print print_info 831 | 832 | 833 | # 3. Save params of model (IO bound, time consuming) 834 | # If saving params is not successful, there shouldn't be any trace of 835 | # successful monitoring step in train_log as well. 836 | print "Saving params!", 837 | lib.save_params( 838 | os.path.join(PARAMS_PATH, 'params_{}.pkl'.format(tag)) 839 | ) 840 | print "Done!" 841 | 842 | # 4. Save and graph training progress (fast) 843 | training_info = {epoch_str : epoch, 844 | iter_str : total_iters, 845 | train_nll_str : numpy.mean(costs), 846 | valid_nll_str : valid_cost, 847 | test_nll_str : test_cost, 848 | lowest_valid_str : lowest_valid_cost, 849 | corresp_test_str : corresponding_test_cost, 850 | 'train time' : total_time, 851 | 'valid time' : valid_time, 852 | 'test time' : test_time, 853 | 'wall clock time' : time()-exp_start} 854 | lib.save_training_info(training_info, FOLDER_PREFIX) 855 | print "Train info saved!", 856 | 857 | # y_axis_strs = [train_nll_str, valid_nll_str, test_nll_str] 858 | # lib.plot_traing_info(iter_str, y_axis_strs, FOLDER_PREFIX) 859 | print "And plotted!" 860 | 861 | if total_iters-last_print_iters == PRINT_ITERS: 862 | # If we are here b/c of onom_end_of_batch, we shouldn't mess 863 | # with costs and last_print_iters 864 | costs = [] 865 | last_print_time += PRINT_TIME 866 | last_print_iters += PRINT_ITERS 867 | 868 | if epoch==6 and end_of_batch==True: 869 | learning_rate=0.0001 870 | print "\n Now learning rate is 0.0001." 871 | 872 | end_of_batch = False 873 | new_lowest_cost = False 874 | 875 | print "Validation Done!\nBack to Training..." 876 | 877 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 878 | (TRAIN_MODE=='time' and total_time >= STOP_TIME) or \ 879 | ((TRAIN_MODE=='time-iters' or TRAIN_MODE=='iters-time') and \ 880 | (total_iters == STOP_ITERS or total_time >= STOP_TIME)): 881 | 882 | print "Done! Total iters:", total_iters, "Total time: ", total_time 883 | print "Experiment ended at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 884 | print "Wall clock time spent: {:.2f}h"\ 885 | .format((time()-exp_start)/3600) 886 | 887 | sys.exit() -------------------------------------------------------------------------------- /CHRNN_HF/readme.md: -------------------------------------------------------------------------------- 1 | The CHRNN system in the paper: 2 | * Zhen-Hua Ling , Yang Ai, Yu Gu, and Li-Rong Dai, "Waveform Modeling and Generation Using Hierarchical Recurrent Neural Networks for Speech Bandwidth Extension," IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 26, no. 5, pp. 883-894, 2018. 3 | Usage: 4 | First enter the root directory of the folder: `cd CHRNN_HF`. 5 | 6 | Data preparation: 7 | Put the train, validiation and test waveforms (16kHz sample rate) and bottleneck features into the corresponding folder in directory 'datasets/TIMIT/waveforms' and 'datasets/TIMIT/bn_norm_condition', 8 | then run `python datasets/TIMIT/_2npy_hf.py` to generate the packaged data. 9 | 10 | Traning and validiation: 11 | Run: 12 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/three_tier/four_tier_train_valid.py --exp BEST_4TIER --seq_len 480 --con_dim 100 --con_frame_size 160 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 64` 13 | 14 | Test: 15 | Run: 16 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/three_tier/four_tier_test.py --exp BEST_4TIER --seq_len 480 --con_dim 100 --con_frame_size 160 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 1` -------------------------------------------------------------------------------- /HRNN_HF/datasets/TIMIT/_2npy_hf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import random 4 | import os 5 | import glob 6 | 7 | __RAND_SEED = 123 8 | def __fixed_shuffle(inp_list): 9 | if isinstance(inp_list, list): 10 | random.seed(__RAND_SEED) 11 | random.shuffle(inp_list) 12 | return 13 | if isinstance(inp_list, np.ndarray): 14 | np.random.seed(__RAND_SEED) 15 | np.random.shuffle(inp_list) 16 | return 17 | 18 | raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+type(inp_list)) 19 | 20 | def clip_times(audio, times): 21 | 22 | audio = audio * times 23 | audio[audio > 1] = 1 24 | audio[audio < -1] = -1 25 | return audio 26 | 27 | 28 | def wav2npy(data_path,save_path,name,fixed_shuffle=True,sample_rate=16000): 29 | paths = sorted(glob.glob(data_path+"/*.wav")) 30 | if name=='test': 31 | fid=open(save_path+'/'+'test_list.scp','w') 32 | for i in xrange(len(paths)): 33 | fid.write(paths[i].split('/')[-1]+'\n') 34 | fid.close() 35 | if fixed_shuffle: 36 | __fixed_shuffle(paths) 37 | for i,path in enumerate(paths): 38 | audio16k, _ = librosa.load(path, sr=sample_rate, mono=True) 39 | audio8k = librosa.core.resample(audio16k,sample_rate,sample_rate/2) 40 | audio8k = librosa.core.resample(audio8k,sample_rate/2,sample_rate) 41 | 42 | if(len(audio8k)==len(audio16k)): 43 | pass 44 | elif(len(audio8k)>len(audio16k)): 45 | audio8k=audio8k[0:len(audio16k)] 46 | else: 47 | audio16k=audio16k[0:len(audio8k)] 48 | 49 | audio_up=audio16k-audio8k 50 | audio_up = clip_times(audio_up, 3) 51 | 52 | if i==0: 53 | max_len=len(audio_up) 54 | audio_mat_up=np.array(audio_up,dtype='float32').reshape(1,len(audio_up)) 55 | audio_mat8k=np.array(audio8k,dtype='float32').reshape(1,len(audio8k)) 56 | mask=np.ones(audio_mat_up.shape,dtype='float32') 57 | else: 58 | current_len=len(audio_up) 59 | if current_len>max_len: 60 | audio_mat_up=np.pad(audio_mat_up,[[0,0],[0,current_len-max_len]],'constant') 61 | audio_mat_up=np.concatenate((audio_mat_up,np.array(audio_up,dtype='float32').reshape(1,current_len)),axis=0) 62 | audio_mat8k=np.pad(audio_mat8k,[[0,0],[0,current_len-max_len]],'constant') 63 | audio_mat8k=np.concatenate((audio_mat8k,np.array(audio8k,dtype='float32').reshape(1,current_len)),axis=0) 64 | mask=np.pad(mask,[[0,0],[0,current_len-max_len]],'constant') 65 | mask=np.concatenate((mask,np.ones((1,current_len),dtype='float32')),axis=0) 66 | max_len=current_len 67 | else: 68 | audio_mat_up=np.concatenate((audio_mat_up,np.pad(np.array(audio_up,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0) 69 | audio_mat8k=np.concatenate((audio_mat8k,np.pad(np.array(audio8k,dtype='float32').reshape(1,current_len),[[0,0],[0,max_len-current_len]],'constant')),axis=0) 70 | mask=np.concatenate((mask,np.pad(np.ones((1,current_len),dtype='float32'),[[0,0],[0,max_len-current_len]],'constant')),axis=0) 71 | 72 | np.save(save_path+'/'+'TIMIT_'+name+'_up.npy', audio_mat_up) 73 | np.save(save_path+'/'+'TIMIT_'+name+'_8k.npy', audio_mat8k) 74 | np.save(save_path+'/'+'TIMIT_'+name+'_mask.npy', mask) 75 | 76 | print name+' data storage is complete!' 77 | 78 | 79 | wav2npy('datasets/TIMIT/train','datasets/TIMIT','train',fixed_shuffle=True,sample_rate=16000) 80 | wav2npy('datasets/TIMIT/valid','datasets/TIMIT','valid',fixed_shuffle=True,sample_rate=16000) 81 | wav2npy('datasets/TIMIT/test','datasets/TIMIT','test',fixed_shuffle=False,sample_rate=16000) -------------------------------------------------------------------------------- /HRNN_HF/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aiyang8067/Hierarchical-Recurrent-Neural-Networks-for-Speech-Bandwidth-Extension/94c3daf9554e20ea2538eb2b7aa044024fedb9ed/HRNN_HF/datasets/__init__.py -------------------------------------------------------------------------------- /HRNN_HF/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | RNN Vocal Generation Model 3 | 4 | TIMIT data feeders. 5 | """ 6 | 7 | import numpy as np 8 | import random 9 | import time 10 | import os 11 | import glob 12 | 13 | __base = [ 14 | ('Local', 'datasets/'), 15 | ] 16 | 17 | __TIMIT_file = 'TIMIT/TIMIT_{}.npy' 18 | 19 | __train_mask = lambda s: s.format('train_mask') 20 | __train_up = lambda s: s.format('train_up') 21 | __train8k = lambda s: s.format('train_8k') 22 | __valid_mask = lambda s: s.format('valid_mask') 23 | __valid_up = lambda s: s.format('valid_up') 24 | __valid8k = lambda s: s.format('valid_8k') 25 | __test_mask = lambda s: s.format('test_mask') 26 | __test_up = lambda s: s.format('test_up') 27 | __test8k = lambda s: s.format('test_8k') 28 | 29 | def find_dataset(filename): 30 | for (k, v) in __base: 31 | tmp_path = os.path.join(v, filename) 32 | if os.path.exists(tmp_path): 33 | return tmp_path 34 | raise Exception('{} NOT FOUND!'.format(filename)) 35 | 36 | ### Basic utils ### 37 | def __round_to(x, y): 38 | """round x up to the nearest y""" 39 | return int(np.ceil(x / float(y))) * y 40 | 41 | def __normalize(data): 42 | """To range [0., 1.]""" 43 | data -= data.min(axis=1)[:, None] 44 | data /= data.max(axis=1)[:, None] 45 | return data 46 | 47 | def __linear_quantize(data, q_levels): 48 | """ 49 | floats in (0, 1) to ints in [0, q_levels-1] 50 | scales normalized across axis 1 51 | """ 52 | # Normalization is on mini-batch not whole file 53 | #eps = numpy.float64(1e-5) 54 | #data -= data.min(axis=1)[:, None] 55 | #data *= ((q_levels - eps) / data.max(axis=1)[:, None]) 56 | #data += eps/2 57 | #data = data.astype('int32') 58 | 59 | eps = np.float64(1e-5) 60 | data *= (q_levels - eps) 61 | data += eps/2 62 | data = data.astype('int32') 63 | return data 64 | 65 | def linear2mu(x, mu=255): 66 | """ 67 | From Joao 68 | x should be normalized between -1 and 1 69 | Converts an array according to mu-law and discretizes it 70 | 71 | Note: 72 | mu2linear(linear2mu(x)) != x 73 | Because we are compressing to 8 bits here. 74 | They will sound pretty much the same, though. 75 | 76 | :usage: 77 | >>> bitrate, samples = scipy.io.wavfile.read('orig.wav') 78 | >>> norm = __normalize(samples)[None, :] # It takes 2D as inp 79 | >>> mu_encoded = linear2mu(2.*norm-1.) # From [0, 1] to [-1, 1] 80 | >>> print mu_encoded.min(), mu_encoded.max(), mu_encoded.dtype 81 | 0, 255, dtype('int16') 82 | >>> mu_decoded = mu2linear(mu_encoded) # Back to linear 83 | >>> print mu_decoded.min(), mu_decoded.max(), mu_decoded.dtype 84 | -1, 0.9574371, dtype('float32') 85 | """ 86 | x_mu = np.sign(x) * np.log(1 + mu*np.abs(x))/np.log(1 + mu) 87 | return ((x_mu + 1)/2 * mu).astype('int16') 88 | 89 | def mu2linear(x, mu=255): 90 | """ 91 | From Joao with modifications 92 | Converts an integer array from mu to linear 93 | 94 | For important notes and usage see: linear2mu 95 | """ 96 | mu = float(mu) 97 | x = x.astype('float32') 98 | y = 2. * (x - (mu+1.)/2.) / (mu+1.) 99 | return np.sign(y) * (1./mu) * ((1. + mu)**np.abs(y) - 1.) 100 | 101 | def __mu_law_quantize(data): 102 | return linear2mu(data) 103 | 104 | def __batch_quantize(data, q_levels, q_type): 105 | """ 106 | One of 'linear', 'a-law', 'mu-law' for q_type. 107 | """ 108 | data = data.astype('float64') 109 | #data = __normalize(data) 110 | if q_type == 'linear': 111 | return __linear_quantize(data, q_levels) 112 | if q_type == 'mu-law': 113 | # from [0, 1] to [-1, 1] 114 | #data = 2.*data-1. 115 | # Automatically quantized to 256 bins. 116 | return __mu_law_quantize(data) 117 | raise NotImplementedError 118 | 119 | __RAND_SEED = 123 120 | def __fixed_shuffle(inp_list): 121 | if isinstance(inp_list, list): 122 | random.seed(__RAND_SEED) 123 | random.shuffle(inp_list) 124 | return 125 | if isinstance(inp_list, np.ndarray): 126 | np.random.seed(__RAND_SEED) 127 | np.random.shuffle(inp_list) 128 | return 129 | 130 | raise ValueError("inp_list is neither a list nor a numpy.ndarray but a "+type(inp_list)) 131 | 132 | def __make_random_batches(inp_list, batch_size,shuffle=True): 133 | batches = [] 134 | for i in xrange(len(inp_list) / batch_size+1): 135 | if i==len(inp_list) / batch_size: 136 | if len(inp_list)%batch_size==0: 137 | break 138 | else: 139 | batches.append(inp_list[i*batch_size:]) 140 | else: 141 | batches.append(inp_list[i*batch_size:(i+1)*batch_size]) 142 | 143 | if shuffle: 144 | __fixed_shuffle(batches) 145 | return batches 146 | 147 | def __mask_sort(mask_matrix): 148 | ind=[] 149 | for i in xrange(len(mask_matrix)): 150 | ind.append(len(np.where(mask_matrix[i]==1)[0])) 151 | b=zip(ind,range(len(ind))) 152 | b.sort(key=lambda x:x[0],reverse=True) 153 | index=[x[1] for x in b] 154 | 155 | return index 156 | 157 | ### TIMIT DATASET LOADER ### 158 | def __TIMIT_feed_epoch(files, 159 | mask_files, 160 | shuffle, 161 | is_train, 162 | batch_size, 163 | seq_len, 164 | overlap, 165 | q_levels, 166 | q_zero, 167 | q_type, 168 | real_valued=False): 169 | """ 170 | Helper function to load blizzard dataset. 171 | Generator that yields training inputs (subbatch, reset). `subbatch` contains 172 | quantized audio data; `reset` is a boolean indicating the start of a new 173 | sequence (i.e. you should reset h0 whenever `reset` is True). 174 | 175 | Feeds subsequences which overlap by a specified amount, so that the model 176 | can always have target for every input in a given subsequence. 177 | 178 | Assumes all flac files have the same length. 179 | 180 | returns: (subbatch, reset) 181 | subbatch.shape: (BATCH_SIZE, SEQ_LEN + OVERLAP) 182 | reset: True or False 183 | """ 184 | if is_train: 185 | sort_index=__mask_sort(mask_files) 186 | batches_8k = __make_random_batches(files[0][sort_index], batch_size,shuffle) 187 | batches_up = __make_random_batches(files[1][sort_index], batch_size,shuffle) 188 | mask_batches=__make_random_batches(mask_files[sort_index],batch_size,shuffle) 189 | else: 190 | batches_8k = __make_random_batches(files[0], batch_size,shuffle) 191 | batches_up = __make_random_batches(files[1], batch_size,shuffle) 192 | mask_batches=__make_random_batches(mask_files,batch_size,shuffle) 193 | 194 | for index,bch_8k in enumerate(batches_8k): 195 | 196 | batch_num=len(bch_8k) 197 | bch_up=batches_up[index] 198 | mask=mask_batches[index] 199 | mask_sum=np.sum(mask,axis=0) 200 | mask_all0_index=np.where(mask_sum==0)[0] 201 | if len(mask_all0_index!=0): 202 | bch_up=bch_up[:,:-len(mask_all0_index)] 203 | bch_8k=bch_8k[:,:-len(mask_all0_index)] 204 | mask=mask[:,:-len(mask_all0_index)] 205 | 206 | batch_seq_len = len(bch_8k[0]) 207 | batch_seq_len = __round_to(batch_seq_len, seq_len) 208 | 209 | batch_8k = np.zeros( 210 | (batch_num, batch_seq_len), 211 | dtype='float64' 212 | ) 213 | batch_up = np.zeros( 214 | (batch_num, batch_seq_len), 215 | dtype='float64' 216 | ) 217 | 218 | mask=np.pad(mask,[[0,0],[0,batch_seq_len-mask.shape[1]]],'constant') 219 | for i, data in enumerate(bch_8k): 220 | batch_8k[i, :len(data)] = data 221 | for i, data in enumerate(bch_up): 222 | batch_up[i, :len(data)] = data 223 | 224 | batch_8k_real=np.concatenate([ 225 | batch_8k, 226 | np.full((batch_num, overlap), 0, dtype='float32') 227 | ], axis=1) 228 | if not real_valued: 229 | batch_8k = __batch_quantize(batch_8k, q_levels, q_type) 230 | batch_up = __batch_quantize(batch_up, q_levels, q_type) 231 | 232 | batch_8k = np.concatenate([ 233 | batch_8k, 234 | np.full((batch_num, overlap), q_zero, dtype='int32') 235 | ], axis=1) 236 | 237 | batch_up = np.concatenate([ 238 | batch_up, 239 | np.full((batch_num, overlap), q_zero, dtype='int32') 240 | ], axis=1) 241 | 242 | mask = np.concatenate([ 243 | mask, 244 | np.full((batch_num, overlap), 0, dtype='float32') 245 | ], axis=1) 246 | 247 | 248 | for i in xrange(batch_seq_len // seq_len): 249 | reset = np.int32(i==0) 250 | end_flag=np.int32(i==batch_seq_len // seq_len-1) 251 | subbatch_8k_real=batch_8k_real[:, i*seq_len : (i+1)*seq_len+overlap] 252 | subbatch_8k = batch_8k[:, i*seq_len : (i+1)*seq_len+overlap] 253 | subbatch_up = batch_up[:, i*seq_len : (i+1)*seq_len+overlap] 254 | submask = mask[:, i*seq_len : (i+1)*seq_len+overlap] 255 | yield (subbatch_8k, subbatch_up,reset, end_flag,submask,batch_num,subbatch_8k_real) 256 | 257 | def TIMIT_train_feed_epoch(*args): 258 | """ 259 | :parameters: 260 | batch_size: int 261 | seq_len: 262 | overlap: 263 | q_levels: 264 | q_zero: 265 | q_type: One the following 'linear', 'a-law', or 'mu-law' 266 | 267 | THE NEW SEG IS: 268 | 20.48hrs 36*256 269 | 3*256 270 | 3*256 271 | 272 | :returns: 273 | A generator yielding (subbatch, reset, submask) 274 | """ 275 | # Just check if valid/test sets are also available. If not, raise. 276 | find_dataset(__valid_up(__TIMIT_file)) 277 | find_dataset(__valid8k(__TIMIT_file)) 278 | find_dataset(__valid_mask(__TIMIT_file)) 279 | find_dataset(__test_up(__TIMIT_file)) 280 | find_dataset(__test8k(__TIMIT_file)) 281 | find_dataset(__test_mask(__TIMIT_file)) 282 | # Load train set 283 | data_path_8k = find_dataset(__train8k(__TIMIT_file)) 284 | data_path_up = find_dataset(__train_up(__TIMIT_file)) 285 | data_mask_path=find_dataset(__train_mask(__TIMIT_file)) 286 | files=[] 287 | files.append(np.load(data_path_8k)) 288 | files.append(np.load(data_path_up)) 289 | mask_files=np.load(data_mask_path) 290 | shuffle=True 291 | is_train=True 292 | generator = __TIMIT_feed_epoch(files, mask_files,shuffle,is_train,*args) 293 | return generator 294 | 295 | def TIMIT_valid_feed_epoch(*args): 296 | """ 297 | See: 298 | TIMIT_train_feed_epoch 299 | """ 300 | data_path_8k = find_dataset(__valid8k(__TIMIT_file)) 301 | data_path_up = find_dataset(__valid_up(__TIMIT_file)) 302 | data_mask_path=find_dataset(__valid_mask(__TIMIT_file)) 303 | files=[] 304 | files.append(np.load(data_path_8k)) 305 | files.append(np.load(data_path_up)) 306 | mask_files=np.load(data_mask_path) 307 | shuffle=True 308 | is_train=False 309 | generator = __TIMIT_feed_epoch(files, mask_files,shuffle,is_train,*args) 310 | return generator 311 | 312 | def TIMIT_test_feed_epoch(*args): 313 | """ 314 | See: 315 | TIMIT_train_feed_epoch 316 | """ 317 | data_path_8k = find_dataset(__test8k(__TIMIT_file)) 318 | data_path_up = find_dataset(__test_up(__TIMIT_file)) 319 | data_mask_path=find_dataset(__test_mask(__TIMIT_file)) 320 | files=[] 321 | files.append(np.load(data_path_8k)) 322 | files.append(np.load(data_path_up)) 323 | mask_files=np.load(data_mask_path) 324 | shuffle=False 325 | is_train=False 326 | generator = __TIMIT_feed_epoch(files, mask_files,shuffle,is_train,*args) 327 | return generator 328 | -------------------------------------------------------------------------------- /HRNN_HF/lib/__init__.py: -------------------------------------------------------------------------------- 1 | import ops 2 | #import lasagne 3 | #from theano.compile.nanguardmode import NanGuardMode 4 | 5 | import math 6 | import time 7 | import locale 8 | 9 | import numpy 10 | import theano 11 | import theano.tensor as T 12 | import theano.gof 13 | 14 | import cPickle as pickle 15 | #import pickle 16 | import warnings 17 | import sys, os, errno, glob 18 | 19 | # import matplotlib 20 | # matplotlib.use('Agg') 21 | # import matplotlib.pyplot as plt 22 | 23 | # TODO: Grouping is not working on cluster! :-? 24 | # Set a locale first or you won't get grouping at all 25 | locale.setlocale(locale.LC_ALL, '') 26 | # 'en_US.UTF-8' 27 | 28 | _params = {} 29 | def param(name, *args, **kwargs): 30 | """ 31 | A wrapper for `theano.shared` which enables parameter sharing in models. 32 | 33 | Creates and returns theano shared variables similarly to `theano.shared`, 34 | except if you try to create a param with the same name as a 35 | previously-created one, `param(...)` will just return the old one instead of 36 | making a new one. 37 | 38 | This constructor also adds a `param` attribute to the shared variables it 39 | creates, so that you can easily search a graph for all params. 40 | """ 41 | 42 | if name not in _params: 43 | kwargs['name'] = name 44 | param = theano.shared(*args, **kwargs) 45 | param.param = True 46 | _params[name] = param 47 | return _params[name] 48 | 49 | def delete_params(name): 50 | to_delete = [p_name for p_name in _params if name in p_name] 51 | for p_name in to_delete: 52 | del _params[p_name] 53 | 54 | def search(node, critereon): 55 | """ 56 | Traverse the Theano graph starting at `node` and return a list of all nodes 57 | which match the `critereon` function. When optimizing a cost function, you 58 | can use this to get a list of all of the trainable params in the graph, like 59 | so: 60 | 61 | `lib.search(cost, lambda x: hasattr(x, "param"))` 62 | or 63 | `lib.search(cost, lambda x: hasattr(x, "param") and x.param==True)` 64 | """ 65 | 66 | def _search(node, critereon, visited): 67 | if node in visited: 68 | return [] 69 | visited.add(node) 70 | 71 | results = [] 72 | if isinstance(node, T.Apply): 73 | for inp in node.inputs: 74 | results += _search(inp, critereon, visited) 75 | else: # Variable node 76 | if critereon(node): 77 | results.append(node) 78 | if node.owner is not None: 79 | results += _search(node.owner, critereon, visited) 80 | return results 81 | 82 | return _search(node, critereon, set()) 83 | 84 | def floatX(x): 85 | """ 86 | Convert `x` to the numpy type specified in `theano.config.floatX`. 87 | """ 88 | if theano.config.floatX == 'float16': 89 | return numpy.float16(x) 90 | elif theano.config.floatX == 'float32': 91 | return numpy.float32(x) 92 | else: # Theano's default float type is float64 93 | print "Warning: lib.floatX using float64" 94 | return numpy.float64(x) 95 | 96 | def save_params(path): 97 | param_vals = {} 98 | for name, param in _params.iteritems(): 99 | param_vals[name] = param.get_value() 100 | 101 | with open(path, 'wb') as f: 102 | pickle.dump(param_vals, f) 103 | 104 | def load_params(path): 105 | with open(path, 'rb') as f: 106 | param_vals = pickle.load(f) 107 | 108 | for name, val in param_vals.iteritems(): 109 | _params[name].set_value(val) 110 | 111 | def clear_all_params(): 112 | to_delete = [p_name for p_name in _params] 113 | for p_name in to_delete: 114 | del _params[p_name] 115 | 116 | def ensure_dir(dirname): 117 | """ 118 | Ensure that a named directory exists; if it does not, attempt to create it. 119 | """ 120 | try: 121 | os.makedirs(dirname) 122 | except OSError, e: 123 | if e.errno != errno.EEXIST: 124 | raise 125 | 126 | __model_setting_file_name = 'model_settings.txt' 127 | def print_model_settings(locals_var, path=None, sys_arg=False): 128 | """ 129 | Prints all variables in upper case in locals_var, 130 | except for T which usually stands for theano.tensor. 131 | If locals() passed as input to this method, will print 132 | all the variables in upper case defined so far, that is 133 | model settings. 134 | 135 | With `path` as an address to a directory it will _append_ it 136 | as a file named `model_settings.txt` as well. 137 | 138 | With `sys_arg` set to True, log information about Python, Numpy, 139 | and Theano and passed arguments to the script will be added too. 140 | args.pkl would be overwritten, specially in case of resuming a job. 141 | But again that wouldn't be much of a problem as all the passed args 142 | to the script except for '--resume' should be the same. 143 | 144 | With both `path` and `sys_arg` passed, dumps the theano.config. 145 | 146 | :usage: 147 | >>> import theano.tensor as T 148 | >>> import lib 149 | >>> BATCH_SIZE, DIM = 128, 512 150 | >>> DATA_PATH = '/Path/to/dataset' 151 | >>> lib.print_model_settings(locals(), path='./') 152 | """ 153 | log = "" 154 | if sys_arg: 155 | try: 156 | log += "Python:\n" 157 | log += "\tsys.version_info\t{}\n".format(str(sys.version_info)) 158 | log += "Numpy:\n" 159 | log += "\t.__version__\t{}\n".format(numpy.__version__) 160 | log += "Theano:\n" 161 | log += "\t.__version__\t{}\n".format(theano.__version__) 162 | log += "\n\nAll passed args:\n" 163 | log += str(sys.argv) 164 | log += "\n" 165 | except: 166 | print "Something went wrong during sys_arg logging. Continue anyway!" 167 | 168 | log += "\nModel settings:" 169 | all_vars = [(k,v) for (k,v) in locals_var.items() if (k.isupper() and k != 'T')] 170 | all_vars = sorted(all_vars, key=lambda x: x[0]) 171 | for var_name, var_value in all_vars: 172 | log += ("\n\t%-20s %s" % (var_name, var_value)) 173 | print log 174 | if path is not None: 175 | ensure_dir(path) 176 | # Don't override, just append if by mistake there is something in the file. 177 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 178 | f.write(log) 179 | if sys_arg: 180 | with open(os.path.join(path, 'th_conf.txt'), 'a+') as f: 181 | f.write(str(theano.config)) 182 | with open(os.path.join(path, 'args.pkl'), 'wb') as f: 183 | pickle.dump(sys.argv, f) 184 | # To load: 185 | # >>> import cPickle as pickle 186 | # >>> args = pickle.load(open(os.path.join(path, 'args.pkl'), 'rb')) 187 | 188 | def get_params(cost, criterion=lambda x: hasattr(x, 'param') and x.param==True): 189 | """ 190 | Default criterion: 191 | lambda x: hasattr(x, 'param') and x.param==True 192 | This will return every parameter for cost from computation graph. 193 | 194 | To exclude a parameter, just set 'param' to False: 195 | >>> h0 = lib.param('h0',\ 196 | numpy.zeros((3, 2*512), dtype=theano.config.floatX)) 197 | >>> print h0.param # Default: True 198 | >>> h0.param = False 199 | 200 | In this case one still can get list of all params (False or True) by: 201 | >>> lib.get_params(cost, lambda x: hasattr(x, 'param') 202 | 203 | :returns: 204 | A list of params 205 | """ 206 | return search(cost, criterion) 207 | 208 | def print_params_info(params, path=None): 209 | """ 210 | Print information about the parameters in the given param set. 211 | 212 | With `path` as an address to a directory it will _append_ it 213 | as a file named `model_settings.txt` as well. 214 | 215 | :usage: 216 | >>> params = lib.get_params(cost) 217 | >>> lib.print_params_info(params, path='./') 218 | """ 219 | params = sorted(params, key=lambda p: p.name) 220 | values = [p.get_value(borrow=True) for p in params] 221 | shapes = [p.shape for p in values] 222 | total_param_count = 0 223 | multiply_all = lambda a, b: a*b 224 | log = "\nParams for cost:" 225 | for param, value, shape in zip(params, values, shapes): 226 | log += ("\n\t%-20s %s" % (shape, param.name)) 227 | total_param_count += reduce(multiply_all, shape) 228 | 229 | log += "\nTotal parameter count for this cost:\n\t{0}".format( 230 | locale.format("%d", total_param_count, grouping=True) 231 | ) 232 | print log 233 | 234 | if path is not None: 235 | ensure_dir(path) 236 | # Don't override, just append if by mistake there is something in the file. 237 | with open(os.path.join(path, __model_setting_file_name), 'a+') as f: 238 | f.write(log) 239 | 240 | __train_log_file_name = 'train_log.pkl' 241 | def save_training_info(values, path): 242 | """ 243 | Gets a set of values as dictionary and append them to a log file. 244 | stores in /train_log.pkl 245 | """ 246 | file_name = os.path.join(path, __train_log_file_name) 247 | try: 248 | with open(file_name, "rb") as f: 249 | log = pickle.load(f) 250 | except IOError: # first time 251 | log = {} 252 | for k in values.keys(): 253 | log[k] = [] 254 | for k, v in values.items(): 255 | log[k].append(v) 256 | with open(file_name, "wb") as f: 257 | pickle.dump(log, f) 258 | 259 | resume_key = 'last resume index' 260 | def resumable(path, 261 | iter_key='iter', 262 | epoch_key='epoch', 263 | add_resume_counter=True, 264 | other_keys=[]): 265 | """ 266 | :warning: 267 | This is a naive implementation of resuming a training session 268 | and does not save and reload the training loop. The serialization 269 | of training loop and everything is costly and error-prone. 270 | 271 | :todo: 272 | - Save and load a serializable training loop. (See warning above) 273 | - Heavily dependent on the "model" file and the names used there right 274 | now. It's really easy to miss anything. 275 | 276 | `path` should be pointing at the root directory where `train_log.pkl` 277 | (See __train_log_file_name) and `params/` reside. 278 | 279 | Always assuming all the values in the log dictionary (except `resume_key`), 280 | are lists with the same length. 281 | """ 282 | file_name = os.path.join(path, __train_log_file_name) 283 | # Raise error if does not exists. 284 | with open(file_name, "rb") as f: 285 | log = pickle.load(f) 286 | 287 | param_found = False 288 | res_path = os.path.join(path, 'params', 'params_e{}_i{}*.pkl') 289 | for reverse_idx in range(-1, -len(log[epoch_key])-1, -1): 290 | ep, it = log[epoch_key][reverse_idx], log[iter_key][reverse_idx] 291 | print "> Params file for epoch {} iter {}".format(ep, it), 292 | last_path = glob.glob(res_path.format(ep, it)) 293 | if len(last_path) == 1: 294 | res_path = last_path[0] 295 | param_found = True 296 | print "found." 297 | break 298 | elif len(last_path) == 0: 299 | print "[NOT FOUND]. FALLING BACK TO..." 300 | else: # > 1 301 | # choose one, warning, rare 302 | print "[multiple version found]:" 303 | for l_path in last_path: 304 | print l_path 305 | res_path = last_path[0] 306 | param_found = True 307 | print "Arbitrarily choosing first:\n\t{}".format(res_path) 308 | 309 | assert 'reverse_idx' in locals(), 'Empty train_log???\n{}'.format(log) 310 | # Finishing for loop with no success 311 | assert param_found, 'No matching params file with train_log' 312 | 313 | acceptable_len = reverse_idx+len(log[epoch_key])+1 314 | if acceptable_len != len(log[epoch_key]): 315 | # Backup of the old train_log 316 | with open(file_name+'.backup', 'wb') as f: 317 | pickle.dump(log, f) 318 | 319 | # Change the log file to match the last existing checkpoint. 320 | for k, v in log.items(): 321 | # Fix resume indices 322 | if k == resume_key: 323 | log[k] = [i for i in log[k] if i < acceptable_len] 324 | continue 325 | # Rest is useless with no param file. 326 | log[k] = v[:acceptable_len] 327 | 328 | epochs = log[epoch_key] 329 | iters = log[iter_key] 330 | 331 | if add_resume_counter: 332 | resume_val = len(epochs) 333 | if not resume_key in log.keys(): 334 | log[resume_key] = [resume_val] 335 | else: 336 | if log[resume_key] == [] or log[resume_key][-1] != resume_val: 337 | log[resume_key].append(resume_val) 338 | with open(file_name, "wb") as f: 339 | pickle.dump(log, f) 340 | 341 | last_epoch = epochs[-1] 342 | last_iter = iters[-1] 343 | 344 | # The if-else statement is more readable than `next`: 345 | #iters_to_consume = next((last_iter%(i-1) for (e, i) in\ 346 | # zip(epochs, iters) if e == 1), last_iter) 347 | if last_epoch == 0: 348 | iters_to_consume = last_iter 349 | else: 350 | for e, i in zip(epochs, iters): 351 | # first time. Epoch turns from 0 to 1. 352 | # At the end of each `epoch` there should be 353 | # a monitoring step so it will gives number 354 | # number of iterations per epoch 355 | if e == 1: 356 | iters_per_epoch = i - 1 357 | break 358 | iters_to_consume = last_iter % iters_per_epoch 359 | 360 | last_other_keys = [log[k][-1] for k in other_keys] 361 | return iters_to_consume, res_path, last_epoch, last_iter, last_other_keys 362 | 363 | def plot_traing_info(x, ylist, path): 364 | """ 365 | Loads log file and plot x and y values as provided by input. 366 | Saves as /train_log.png 367 | """ 368 | file_name = os.path.join(path, __train_log_file_name) 369 | try: 370 | with open(file_name, "rb") as f: 371 | log = pickle.load(f) 372 | except IOError: # first time 373 | warnings.warn("There is no {} file here!!!".format(file_name)) 374 | return 375 | plt.figure() 376 | x_vals = log[x] 377 | for y in ylist: 378 | y_vals = log[y] 379 | if len(y_vals) != len(x_vals): 380 | warning.warn("One of y's: {} does not have the same length as x:{}".format(y, x)) 381 | plt.plot(x_vals, y_vals, label=y) 382 | # assert len(y_vals) == len(x_vals), "not the same len" 383 | plt.xlabel(x) 384 | plt.legend() 385 | #plt.show() 386 | plt.savefig(file_name[:-3]+'png', bbox_inches='tight') 387 | plt.close('all') 388 | 389 | def create_logging_folders(path): 390 | """ 391 | Handle structure of folders and naming here instead of training file. 392 | 393 | :todo: 394 | - Implement! 395 | """ 396 | pass 397 | 398 | def tv(var): 399 | """ 400 | :todo: 401 | - add tv() function for theano variables so that instead of calling 402 | x.tag.test_value, you can get the same thing just by calling the method 403 | in a faster way... 404 | - also for x.tag.test_value.shape 405 | """ 406 | # Based on EAFP (easier to ask for forgiveness than permission) 407 | try: 408 | return var.tag.test_value 409 | except AttributeError: 410 | print "NONE, test_value has not been set." 411 | import ipdb; ipdb.set_trace() 412 | 413 | ## Rather than LBYL (look before you leap) 414 | #if hasattr(var, 'tag'): 415 | # if hasattr(var.tag, 'test_value'): 416 | # return var.tag.test_value 417 | # else: 418 | # print "NONE, test_value has not set." 419 | # import ipdb; ipdb.set_trace() 420 | #else: 421 | # print "NONE, tag has not set." 422 | # import ipdb; ipdb.set_trace() 423 | 424 | def tvs(var): 425 | """ 426 | :returns: 427 | var.tag.test_value.shape 428 | """ 429 | return tv(var).shape 430 | 431 | def _is_symbolic(v): 432 | r"""Return `True` if any of the arguments are symbolic. 433 | See: 434 | https://github.com/Theano/Theano/wiki/Cookbook 435 | """ 436 | symbolic = False 437 | v = list(v) 438 | for _container, _iter in [(v, xrange(len(v)))]: 439 | for _k in _iter: 440 | _v = _container[_k] 441 | if isinstance(_v, theano.gof.Variable): 442 | symbolic = True 443 | return symbolic 444 | 445 | def unique_list(inp_list): 446 | """ 447 | returns a list with unique values of inp_list. 448 | :usage: 449 | >>> inp_list = ['a', 'b', 'c'] 450 | >>> unique_inp_list = unique_list(inp_list*2) 451 | """ 452 | return list(set(inp_list)) 453 | -------------------------------------------------------------------------------- /HRNN_HF/models/three_tier/three_tier_generation.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | from datetime import datetime 3 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 4 | exp_start = time() 5 | 6 | import os, sys, glob 7 | sys.path.insert(1, os.getcwd()) 8 | import argparse 9 | import itertools 10 | 11 | import numpy 12 | numpy.random.seed(123) 13 | np = numpy 14 | import random 15 | random.seed(123) 16 | 17 | import theano 18 | import theano.tensor as T 19 | import theano.ifelse 20 | import lasagne 21 | import scipy.io.wavfile 22 | 23 | import lib 24 | 25 | LEARNING_RATE = 0.001 26 | 27 | ### Parsing passed args/hyperparameters ### 28 | def get_args(): 29 | def t_or_f(arg): 30 | ua = str(arg).upper() 31 | if 'TRUE'.startswith(ua): 32 | return True 33 | elif 'FALSE'.startswith(ua): 34 | return False 35 | else: 36 | raise ValueError('Arg is neither `True` nor `False`') 37 | 38 | def check_non_negative(value): 39 | ivalue = int(value) 40 | if ivalue < 0: 41 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 42 | return ivalue 43 | 44 | def check_positive(value): 45 | ivalue = int(value) 46 | if ivalue < 1: 47 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 48 | return ivalue 49 | 50 | def check_unit_interval(value): 51 | fvalue = float(value) 52 | if fvalue < 0 or fvalue > 1: 53 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 54 | return fvalue 55 | 56 | # No default value here. Indicate every single arguement. 57 | parser = argparse.ArgumentParser( 58 | description='three_tier.py\nNo default value! Indicate every argument.') 59 | 60 | # TODO: Fix the descriptions 61 | # Hyperparameter arguements: 62 | parser.add_argument('--exp', help='Experiment name', 63 | type=str, required=False, default='_') 64 | parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True) 65 | parser.add_argument('--big_frame_size', help='How many samples per big frame',\ 66 | type=check_positive, required=True) 67 | parser.add_argument('--frame_size', help='How many samples per frame',\ 68 | type=check_positive, required=True) 69 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\ 70 | type=t_or_f, required=True) 71 | parser.add_argument('--emb_size', help='Size of embedding layer (> 0)', 72 | type=check_positive, required=True) # different than two_tier 73 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', 74 | type=t_or_f, required=True) 75 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 76 | type=check_positive, required=True) 77 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 78 | type=check_positive, choices=xrange(1,6), required=True) 79 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 80 | required=True) 81 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 82 | type=t_or_f, required=True) 83 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 84 | type=check_positive, required=True) 85 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 86 | choices=['linear', 'a-law', 'mu-law'], required=True) 87 | parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK', 88 | choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True) 89 | parser.add_argument('--batch_size', help='size of mini-batch', 90 | type=check_positive, choices=[1,50,64, 128, 256], required=True) 91 | 92 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 93 | required=False, default=True, action='store_true') 94 | 95 | args = parser.parse_args() 96 | 97 | # NEW 98 | # Create tag for this experiment based on passed args 99 | # tag = reduce(lambda a, b: a+b, sys.argv).replace('--resume', '').replace('/', '-').replace('--', '-').replace('True', 'T').replace('False', 'F') 100 | # tag += '-lr'+str(LEARNING_RATE) 101 | tag='three_tier_model' 102 | print "Created experiment tag for these args:" 103 | print tag 104 | 105 | return args, tag 106 | 107 | #tag:three_tier.py-expAXIS1-seq_len512-big_frame_size8-frame_size2-weight_normT-emb_size64-skip_connF-dim32-n_rnn2-rnn_typeLSTM-learn_h0F-q_levels16-q_typelinear-batch_size128-which_setMUSIC-lr0.001 108 | args, tag = get_args() 109 | 110 | SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (512) 111 | #print "------------------previous SEQ_LEN:", SEQ_LEN 112 | # TODO: test incremental training 113 | #SEQ_LEN = 512 + 256 114 | #print "---------------------------new SEQ_LEN:", SEQ_LEN 115 | BIG_FRAME_SIZE = args.big_frame_size # how many samples per big frame (8) 116 | FRAME_SIZE = args.frame_size # How many samples per frame (2) 117 | WEIGHT_NORM = args.weight_norm #True 118 | EMB_SIZE = args.emb_size #(256) 119 | SKIP_CONN = args.skip_conn #(False) 120 | DIM = args.dim # Model dimensionality. (1024) 121 | BIG_DIM = DIM # Dimensionality for the slowest level. (1024) 122 | N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (1) 123 | N_BIG_RNN = N_RNN # how many RNNs to stack in the big-frame-level model (1) 124 | RNN_TYPE = args.rnn_type #GRU 125 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 #(1) 126 | LEARN_H0 = args.learn_h0 #(True) 127 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization #(256) 128 | Q_TYPE = args.q_type # log- or linear-scale #(linear) 129 | WHICH_SET = args.which_set #(MUSIC) 130 | BATCH_SIZE = args.batch_size #(128) 131 | RESUME = args.resume #(False) 132 | assert SEQ_LEN % BIG_FRAME_SIZE == 0,\ 133 | 'seq_len should be divisible by big_frame_size' 134 | assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\ 135 | 'big_frame_size should be divisible by frame_size' 136 | N_FRAMES = SEQ_LEN / FRAME_SIZE # Number of frames in each truncated BPTT pass 137 | 138 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 139 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 140 | 141 | # Fixed hyperparams 142 | GRAD_CLIP = 1 # Elementwise grad clip threshold 143 | BITRATE = 16000 144 | 145 | # Other constants 146 | TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 147 | #TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 148 | #TRAIN_MODE = 'time-iters' 149 | # To use PRINT_TIME for validation, 150 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 151 | #TRAIN_MODE = 'iters-time' 152 | # To use PRINT_ITERS for validation, 153 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 154 | PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations. 155 | STOP_ITERS = 300000 # Stop after this many iterations 156 | PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds. 157 | STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 158 | N_SEQS = 5 # Number of samples to generate every time monitoring. 159 | RESULTS_DIR = 'results_3t' 160 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 161 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 162 | OVERLAP = BIG_FRAME_SIZE 163 | 164 | epoch_str = 'epoch' 165 | iter_str = 'iter' 166 | lowest_valid_str = 'lowest valid cost' 167 | corresp_test_str = 'correponding test cost' 168 | train_nll_str, valid_nll_str, test_nll_str = \ 169 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 170 | 171 | ### Create directories ### 172 | # FOLDER_PREFIX: root, contains: 173 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 174 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 175 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 176 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 177 | if not os.path.exists(FOLDER_PREFIX): 178 | os.makedirs(FOLDER_PREFIX) 179 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 180 | if not os.path.exists(PARAMS_PATH): 181 | os.makedirs(PARAMS_PATH) 182 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 183 | if not os.path.exists(SAMPLES_PATH): 184 | os.makedirs(SAMPLES_PATH) 185 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 186 | if not os.path.exists(BEST_PATH): 187 | os.makedirs(BEST_PATH) 188 | 189 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 190 | 191 | ### Import the data_feeder ### 192 | # Handling WHICH_SET 193 | if WHICH_SET == 'TIMIT': 194 | from datasets.dataset import TIMIT_test_feed_epoch as test_feeder 195 | 196 | def load_data(data_feeder): 197 | """ 198 | Helper function to deal with interface of different datasets. 199 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 200 | """ 201 | return data_feeder(BATCH_SIZE, 202 | SEQ_LEN, 203 | OVERLAP, 204 | Q_LEVELS, 205 | Q_ZERO, 206 | Q_TYPE) 207 | 208 | ### Creating computation graph ### 209 | def big_frame_level_rnn(input_sequences, h0, reset): 210 | """ 211 | input_sequences.shape: (batch size, n big frames * BIG_FRAME_SIZE) #BIG_FRAME_SIZE=8 212 | h0.shape: (batch size, N_BIG_RNN, BIG_DIM) #N_BIG_RNN=1,BIG_DIM=1024 213 | reset.shape: () 214 | output[0].shape: (batch size, n frames, DIM) 215 | output[1].shape: same as h0.shape 216 | output[2].shape: (batch size, seq len, Q_LEVELS) 217 | """ 218 | frames = input_sequences.reshape(( 219 | input_sequences.shape[0], 220 | input_sequences.shape[1] // (2*BIG_FRAME_SIZE), 221 | 2*BIG_FRAME_SIZE 222 | )) 223 | 224 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 225 | # (a reasonable range to pass as inputs to the RNN) 226 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 227 | frames *= lib.floatX(1) 228 | 229 | # Initial state of RNNs 230 | learned_h0 = lib.param( 231 | 'BigFrameLevel.h0', 232 | numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX) 233 | ) 234 | # Handling LEARN_H0 235 | learned_h0.param = LEARN_H0 #True 236 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM) #broadcast according to batch size,H0_MULT=1 237 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 238 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) #if reset=1,h0=learned_h0; if reset=0,h0=h0 239 | 240 | # Handling RNN_TYPE 241 | # Handling SKIP_CONN 242 | if RNN_TYPE == 'GRU': 243 | rnns_out, last_hidden = lib.ops.stackedGRU('BigFrameLevel.GRU', 244 | N_BIG_RNN, 245 | 2*BIG_FRAME_SIZE, 246 | BIG_DIM, 247 | frames, 248 | h0=h0, 249 | weightnorm=WEIGHT_NORM, 250 | skip_conn=SKIP_CONN) 251 | elif RNN_TYPE == 'LSTM': 252 | rnns_out, last_hidden = lib.ops.stackedLSTM('BigFrameLevel.LSTM', 253 | N_BIG_RNN, 254 | 2*BIG_FRAME_SIZE, 255 | BIG_DIM, 256 | frames, 257 | h0=h0, 258 | weightnorm=WEIGHT_NORM, 259 | skip_conn=SKIP_CONN) 260 | 261 | output = lib.ops.Linear( #batch*timestep*dim 262 | 'BigFrameLevel.Output', 263 | BIG_DIM, 264 | DIM * BIG_FRAME_SIZE / FRAME_SIZE, #1024*8/2 265 | rnns_out, 266 | initialization='he', 267 | weightnorm=WEIGHT_NORM 268 | ) 269 | output = output.reshape((output.shape[0], output.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM)) 270 | 271 | return (output, last_hidden) #last_hidden:#batch*1*dim 272 | 273 | def frame_level_rnn(input_sequences, other_input, h0, reset): 274 | """ 275 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) #FRAME_SIZE=2 276 | other_input.shape: (batch size, n frames, DIM) 277 | h0.shape: (batch size, N_RNN, DIM) 278 | reset.shape: () 279 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 280 | """ 281 | frames = input_sequences.reshape(( 282 | input_sequences.shape[0], 283 | input_sequences.shape[1] // (2*FRAME_SIZE), 284 | 2*FRAME_SIZE 285 | )) 286 | 287 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 288 | # (a reasonable range to pass as inputs to the RNN) 289 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 290 | frames *= lib.floatX(1) 291 | 292 | gru_input = lib.ops.Linear( 293 | 'FrameLevel.InputExpand', 294 | 2*FRAME_SIZE, 295 | DIM, 296 | frames, 297 | initialization='he', 298 | weightnorm=WEIGHT_NORM, 299 | ) + other_input 300 | 301 | # Initial state of RNNs 302 | learned_h0 = lib.param( 303 | 'FrameLevel.h0', 304 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 305 | ) 306 | # Handling LEARN_H0 307 | learned_h0.param = LEARN_H0 308 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 309 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 310 | #learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 311 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 312 | 313 | # Handling RNN_TYPE 314 | # Handling SKIP_CONN 315 | if RNN_TYPE == 'GRU': 316 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 317 | N_RNN, 318 | DIM, 319 | DIM, 320 | gru_input, 321 | h0=h0, 322 | weightnorm=WEIGHT_NORM, 323 | skip_conn=SKIP_CONN) 324 | elif RNN_TYPE == 'LSTM': 325 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 326 | N_RNN, 327 | DIM, 328 | DIM, 329 | gru_input, 330 | h0=h0, 331 | weightnorm=WEIGHT_NORM, 332 | skip_conn=SKIP_CONN) 333 | 334 | output = lib.ops.Linear( 335 | 'FrameLevel.Output', 336 | DIM, 337 | FRAME_SIZE * DIM, 338 | rnns_out, 339 | initialization='he', 340 | weightnorm=WEIGHT_NORM 341 | ) 342 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 343 | 344 | return (output, last_hidden) 345 | def sample_level_predictor(frame_level_outputs, prev_samples): 346 | """ 347 | frame_level_outputs.shape: (batch size, DIM) 348 | prev_samples.shape: (batch size, FRAME_SIZE) 349 | output.shape: (batch size, Q_LEVELS) 350 | """ 351 | # Handling EMB_SIZE 352 | if EMB_SIZE == 0: # no support for one-hot in three_tier and one_tier. 353 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 354 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 355 | last_out_shape = Q_LEVELS 356 | elif EMB_SIZE > 0: #The embedding steps maps each of the q discrete values to a real-valued vector embedding. 357 | prev_samples = lib.ops.Embedding( #after embedding, the dim is batch size*FRANME_SIZE*EMB_SIZE 358 | 'SampleLevel.Embedding', 359 | Q_LEVELS, 360 | EMB_SIZE, 361 | prev_samples) 362 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 363 | last_out_shape = EMB_SIZE 364 | else: 365 | raise ValueError('EMB_SIZE cannot be negative.') 366 | 367 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) #dim:batch size*(FRAME_SIZE*EMB_SIZE) 368 | 369 | out = lib.ops.Linear( 370 | 'SampleLevel.L1_PrevSamples', 371 | FRAME_SIZE * last_out_shape, 372 | DIM, 373 | prev_samples, 374 | biases=False, 375 | initialization='he', 376 | weightnorm=WEIGHT_NORM 377 | ) 378 | 379 | out += frame_level_outputs 380 | # out = T.nnet.relu(out) # commented out to be similar to two_tier 381 | 382 | out = lib.ops.Linear('SampleLevel.L2', 383 | DIM, 384 | DIM, 385 | out, 386 | initialization='he', 387 | weightnorm=WEIGHT_NORM) 388 | out = T.nnet.relu(out) 389 | 390 | # L3 391 | out = lib.ops.Linear('SampleLevel.L3', 392 | DIM, 393 | DIM, 394 | out, 395 | initialization='he', 396 | weightnorm=WEIGHT_NORM) 397 | out = T.nnet.relu(out) 398 | 399 | # Output 400 | # We apply the softmax later 401 | out = lib.ops.Linear('SampleLevel.Output', 402 | DIM, 403 | Q_LEVELS, 404 | out, 405 | weightnorm=WEIGHT_NORM) 406 | return out 407 | 408 | sequences_8k = T.imatrix('sequences_8k') #batch size*samplenum 409 | sequences_up = T.imatrix('sequences_up') 410 | h0 = T.tensor3('h0') #(batch size, N_RNN, DIM) 411 | big_h0 = T.tensor3('big_h0') #(batch size, N_BIG_RNN, BIG_DIM) 412 | reset = T.iscalar('reset') 413 | mask = T.matrix('mask') #batch size*samplenum 414 | batch_size =T.iscalar('batch_size') 415 | lr=T.scalar('lr') 416 | 417 | big_input_sequences = sequences_8k #The last BIG_FRAME_SIZE frames do not need (tier3) 418 | big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1)) 419 | big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1, 2*OVERLAP), neib_step=(1, OVERLAP), mode='valid') 420 | big_input_sequences=big_input_sequences.reshape((batch_size,-1)) 421 | 422 | input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)] #(tier2) 423 | input_sequences=input_sequences.reshape((1, batch_size, 1, -1)) 424 | input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1, 2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid') 425 | input_sequences=input_sequences.reshape((batch_size,-1)) 426 | target_sequences = sequences_up[:,0:-OVERLAP] #groundtrues 427 | 428 | target_mask = mask[:,0:-OVERLAP] 429 | 430 | big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, big_h0, reset)#tier3->tier2 431 | 432 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1 433 | 434 | prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)] 435 | prev_samples = prev_samples.reshape((1, batch_size, 1, -1)) 436 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') #2-dim:([[x7,x8],[x8,x9],[x9,x10],...]) 437 | prev_samples = prev_samples.reshape((batch_size * SEQ_LEN, FRAME_SIZE)) 438 | 439 | 440 | sample_level_outputs = sample_level_predictor( 441 | frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)), 442 | prev_samples 443 | ) #sample_level_outputs dim:(BATCH_SIZE * SEQ_LEN, Q_LEVELS) -> [[x9pre],[x10pre],...] 444 | 445 | accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences) 446 | accuracy=accuracy*target_mask 447 | accuracy=T.sum(accuracy,axis=1) 448 | mask_sum=T.sum(target_mask,axis=1) 449 | 450 | cost = T.nnet.categorical_crossentropy( 451 | T.nnet.softmax(sample_level_outputs), #Every row represents a distribution(256 propability) 452 | target_sequences.flatten() #A list, represent the groundtruth of every row 453 | ) 454 | cost = cost.reshape(target_sequences.shape) 455 | cost = cost * target_mask #dim: batch*num 456 | # Don't use these lines; could end up with NaN 457 | # Specially at the end of audio files where mask is 458 | # all zero for some of the shorter files in mini-batch. 459 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 460 | #cost = cost.mean(axis=0) 461 | cost_sum=T.sum(cost,axis=1) 462 | # Use this one instead. 463 | cost = cost.sum() 464 | cost = cost / target_mask.sum() #cost average by samples 465 | 466 | # By default we report cross-entropy cost in bits. 467 | # Switch to nats by commenting out this line: 468 | # log_2(e) = 1.44269504089 469 | #cost = cost * lib.floatX(numpy.log2(numpy.e)) 470 | 471 | ########### 472 | 473 | test_fn=theano.function( 474 | [sequences_8k,sequences_up, big_h0,h0, reset, mask,batch_size], 475 | [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_big_h0,new_h0], 476 | on_unused_input='warn' 477 | ) 478 | 479 | def generate_and_save_samples(tag): 480 | def write_audio_file(name, data): 481 | data = data.astype('float32') 482 | #data -= data.min() 483 | #data /= data.max() 484 | #data -= 0.5 485 | #data *= 0.95 486 | scipy.io.wavfile.write( 487 | os.path.join(SAMPLES_PATH, name), 488 | BITRATE, 489 | data) 490 | 491 | total_time=time() 492 | costs_g = [] 493 | accuracys_g=[] 494 | samples_low_list=[] 495 | samples_list=[] 496 | masks_g_index=[] 497 | samples_number=0 498 | count=0 499 | data_feeder = load_data(test_feeder) 500 | for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,batch_g,seqs_g_8k_real in data_feeder: 501 | if reset_g==1: 502 | big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 503 | h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32') 504 | cost_batch=np.zeros((batch_g,),dtype='float32') 505 | accuracy_batch=np.zeros((batch_g,),dtype='float32') 506 | mask_batch=np.zeros((batch_g,),dtype='float32') 507 | cost_g, accuracy_g,mask_sum_g,sample, big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, big_h0_g,h0_g, reset_g, mask_g,batch_g) 508 | cost_batch=cost_batch+cost_g 509 | accuracy_batch=accuracy_batch+accuracy_g 510 | mask_batch=mask_batch+mask_sum_g 511 | if end_flag_g==1: 512 | costs_g.extend(list(cost_batch/mask_batch)) 513 | accuracys_g.extend(list(accuracy_batch/mask_batch)) 514 | 515 | if reset_g==1: 516 | samples_low=seqs_g_8k_real[:,0:-OVERLAP] 517 | samples=sample 518 | masks_g=mask_g[:,0:-OVERLAP] 519 | else: 520 | samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1) 521 | samples=np.concatenate([samples,sample],axis=1) 522 | masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1) 523 | 524 | if end_flag_g==1: 525 | samples_low_list.append(samples_low) 526 | samples_list.append(samples) 527 | masks_g_index.append(masks_g) 528 | fid=open('datasets/TIMIT/test_list.scp','r') 529 | test_id_list=fid.readlines() 530 | for i in xrange(len(samples_list)): 531 | samples_number+=samples_list[i].shape[0]*samples_list[i].shape[1] 532 | for j in xrange(samples_list[i].shape[0]): 533 | samples_lowi=samples_low_list[i][j] 534 | samplei=samples_list[i][j] 535 | maski=masks_g_index[i][j] 536 | samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])] 537 | samplei=samplei[0:len(np.where(maski==1)[0])] 538 | if Q_TYPE == 'mu-law': 539 | from datasets.dataset import mu2linear 540 | samplei = mu2linear(samplei) 541 | write_audio_file(test_id_list[count].split()[0], samplei/3+samples_lowi) 542 | count+=1 543 | 544 | 545 | total_time = time() - total_time 546 | log = "192 samples generated in {} minutes.\nThe time of generating 1 second speech is {} seconds." 547 | log = log.format(total_time/60,total_time/samples_number*16000) 548 | print samples_number 549 | print log, 550 | 551 | return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time,list(np.array(accuracys_g)*100) 552 | 553 | ### Handling the resume option: 554 | if RESUME: 555 | # Check if checkpoint from previous run is not corrupted. 556 | # Then overwrite some of the variables above. 557 | iters_to_consume, res_path, epoch, total_iters,\ 558 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 559 | lib.resumable(path=FOLDER_PREFIX, 560 | iter_key=iter_str, 561 | epoch_key=epoch_str, 562 | add_resume_counter=True, 563 | other_keys=[lowest_valid_str, 564 | corresp_test_str, 565 | test_nll_str]) 566 | # At this point we saved the pkl file. 567 | last_print_iters = total_iters 568 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 569 | # Consumes this much iters to get to the last point in training data. 570 | consume_time = time() 571 | consume_time = time() - consume_time 572 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 573 | format(consume_time, iters_to_consume) 574 | 575 | lib.load_params(res_path) 576 | print "Parameters from last available checkpoint loaded." 577 | 578 | tag='gen' 579 | test_cost, test_accuracy,test_time,test_accuracy_list=generate_and_save_samples(tag) 580 | print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time) -------------------------------------------------------------------------------- /HRNN_HF/models/three_tier/three_tier_train_valid.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | from datetime import datetime 3 | print "Experiment started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 4 | exp_start = time() 5 | 6 | import os, sys, glob 7 | sys.path.insert(1, os.getcwd()) 8 | import argparse 9 | import itertools 10 | 11 | import numpy 12 | numpy.random.seed(123) 13 | np = numpy 14 | import random 15 | random.seed(123) 16 | 17 | import theano 18 | import theano.tensor as T 19 | import theano.ifelse 20 | import lasagne 21 | import scipy.io.wavfile 22 | 23 | import lib 24 | 25 | LEARNING_RATE = 0.001 26 | 27 | ### Parsing passed args/hyperparameters ### 28 | def get_args(): 29 | def t_or_f(arg): 30 | ua = str(arg).upper() 31 | if 'TRUE'.startswith(ua): 32 | return True 33 | elif 'FALSE'.startswith(ua): 34 | return False 35 | else: 36 | raise ValueError('Arg is neither `True` nor `False`') 37 | 38 | def check_non_negative(value): 39 | ivalue = int(value) 40 | if ivalue < 0: 41 | raise argparse.ArgumentTypeError("%s is not non-negative!" % value) 42 | return ivalue 43 | 44 | def check_positive(value): 45 | ivalue = int(value) 46 | if ivalue < 1: 47 | raise argparse.ArgumentTypeError("%s is not positive!" % value) 48 | return ivalue 49 | 50 | def check_unit_interval(value): 51 | fvalue = float(value) 52 | if fvalue < 0 or fvalue > 1: 53 | raise argparse.ArgumentTypeError("%s is not in [0, 1] interval!" % value) 54 | return fvalue 55 | 56 | # No default value here. Indicate every single arguement. 57 | parser = argparse.ArgumentParser( 58 | description='three_tier.py\nNo default value! Indicate every argument.') 59 | 60 | # TODO: Fix the descriptions 61 | # Hyperparameter arguements: 62 | parser.add_argument('--exp', help='Experiment name', 63 | type=str, required=False, default='_') 64 | parser.add_argument('--seq_len', help='How many samples to include in each Truncated BPTT pass', type=check_positive, required=True) 65 | parser.add_argument('--big_frame_size', help='How many samples per big frame',\ 66 | type=check_positive, required=True) 67 | parser.add_argument('--frame_size', help='How many samples per frame',\ 68 | type=check_positive, required=True) 69 | parser.add_argument('--weight_norm', help='Adding learnable weight normalization to all the linear layers (except for the embedding layer)',\ 70 | type=t_or_f, required=True) 71 | parser.add_argument('--emb_size', help='Size of embedding layer (> 0)', 72 | type=check_positive, required=True) # different than two_tier 73 | parser.add_argument('--skip_conn', help='Add skip connections to RNN', 74 | type=t_or_f, required=True) 75 | parser.add_argument('--dim', help='Dimension of RNN and MLPs',\ 76 | type=check_positive, required=True) 77 | parser.add_argument('--n_rnn', help='Number of layers in the stacked RNN', 78 | type=check_positive, choices=xrange(1,6), required=True) 79 | parser.add_argument('--rnn_type', help='GRU or LSTM', choices=['LSTM', 'GRU'],\ 80 | required=True) 81 | parser.add_argument('--learn_h0', help='Whether to learn the initial state of RNN',\ 82 | type=t_or_f, required=True) 83 | parser.add_argument('--q_levels', help='Number of bins for quantization of audio samples. Should be 256 for mu-law.',\ 84 | type=check_positive, required=True) 85 | parser.add_argument('--q_type', help='Quantization in linear-scale, a-law-companding, or mu-law compandig. With mu-/a-law quantization level shoud be set as 256',\ 86 | choices=['linear', 'a-law', 'mu-law'], required=True) 87 | parser.add_argument('--which_set', help='ONOM, BLIZZ, MUSIC, or HUCK', 88 | choices=['yp1000','ONOM', 'BLIZZ', 'MUSIC', 'HUCK','TIMIT'], required=True) 89 | parser.add_argument('--batch_size', help='size of mini-batch', 90 | type=check_positive, choices=[50,64, 128, 256], required=True) 91 | 92 | parser.add_argument('--resume', help='Resume the same model from the last checkpoint. Order of params are important. [for now]',\ 93 | required=False, default=False, action='store_true') 94 | 95 | args = parser.parse_args() 96 | 97 | # NEW 98 | # Create tag for this experiment based on passed args 99 | tag='three_tier_model' 100 | print "Created experiment tag for these args:" 101 | print tag 102 | 103 | return args, tag 104 | 105 | args, tag = get_args() 106 | 107 | SEQ_LEN = args.seq_len # How many samples to include in each truncated BPTT pass (512) 108 | #print "------------------previous SEQ_LEN:", SEQ_LEN 109 | # TODO: test incremental training 110 | #SEQ_LEN = 512 + 256 111 | #print "---------------------------new SEQ_LEN:", SEQ_LEN 112 | BIG_FRAME_SIZE = args.big_frame_size # how many samples per big frame (8) 113 | FRAME_SIZE = args.frame_size # How many samples per frame (2) 114 | WEIGHT_NORM = args.weight_norm #True 115 | EMB_SIZE = args.emb_size #(256) 116 | SKIP_CONN = args.skip_conn #(False) 117 | DIM = args.dim # Model dimensionality. (1024) 118 | BIG_DIM = DIM # Dimensionality for the slowest level. (1024) 119 | N_RNN = args.n_rnn # How many RNNs to stack in the frame-level model (1) 120 | N_BIG_RNN = N_RNN # how many RNNs to stack in the big-frame-level model (1) 121 | RNN_TYPE = args.rnn_type #GRU 122 | H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 #(1) 123 | LEARN_H0 = args.learn_h0 #(True) 124 | Q_LEVELS = args.q_levels # How many levels to use when discretizing samples. e.g. 256 = 8-bit scalar quantization #(256) 125 | Q_TYPE = args.q_type # log- or linear-scale #(linear) 126 | WHICH_SET = args.which_set #(MUSIC) 127 | BATCH_SIZE = args.batch_size #(128) 128 | RESUME = args.resume #(False) 129 | assert SEQ_LEN % BIG_FRAME_SIZE == 0,\ 130 | 'seq_len should be divisible by big_frame_size' 131 | assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\ 132 | 'big_frame_size should be divisible by frame_size' 133 | N_FRAMES = SEQ_LEN / FRAME_SIZE # Number of frames in each truncated BPTT pass 134 | 135 | if Q_TYPE == 'mu-law' and Q_LEVELS != 256: 136 | raise ValueError('For mu-law Quantization levels should be exactly 256!') 137 | 138 | # Fixed hyperparams 139 | GRAD_CLIP = 1 # Elementwise grad clip threshold 140 | BITRATE = 16000 141 | 142 | # Other constants 143 | TRAIN_MODE = 'iters' # To use PRINT_ITERS and STOP_ITERS 144 | #TRAIN_MODE = 'time' # To use PRINT_TIME and STOP_TIME 145 | #TRAIN_MODE = 'time-iters' 146 | # To use PRINT_TIME for validation, 147 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 148 | #TRAIN_MODE = 'iters-time' 149 | # To use PRINT_ITERS for validation, 150 | # and (STOP_ITERS, STOP_TIME), whichever happened first, for stopping exp. 151 | PRINT_ITERS = 5000 # Print cost, generate samples, save model checkpoint every N iterations. 152 | STOP_ITERS = 300000 # Stop after this many iterations 153 | PRINT_TIME = 2*60 # Print cost, generate samples, save model checkpoint every N seconds. 154 | STOP_TIME = 60*60*24*7 # Stop after this many seconds of actual training (not including time req'd to generate samples etc.) 155 | N_SEQS = 5 # Number of samples to generate every time monitoring. 156 | RESULTS_DIR = 'results_3t' 157 | FOLDER_PREFIX = os.path.join(RESULTS_DIR, tag) 158 | Q_ZERO = numpy.int32(Q_LEVELS//2) # Discrete value correponding to zero amplitude 159 | OVERLAP = BIG_FRAME_SIZE 160 | 161 | epoch_str = 'epoch' 162 | iter_str = 'iter' 163 | lowest_valid_str = 'lowest valid cost' 164 | corresp_test_str = 'correponding test cost' 165 | train_nll_str, valid_nll_str, test_nll_str = \ 166 | 'train NLL (bits)', 'valid NLL (bits)', 'test NLL (bits)' 167 | 168 | ### Create directories ### 169 | # FOLDER_PREFIX: root, contains: 170 | # log.txt, __note.txt, train_log.pkl, train_log.png [, model_settings.txt] 171 | # FOLDER_PREFIX/params: saves all checkpoint params as pkl 172 | # FOLDER_PREFIX/samples: keeps all checkpoint samples as wav 173 | # FOLDER_PREFIX/best: keeps the best parameters, samples, ... 174 | if not os.path.exists(FOLDER_PREFIX): 175 | os.makedirs(FOLDER_PREFIX) 176 | PARAMS_PATH = os.path.join(FOLDER_PREFIX, 'params') 177 | if not os.path.exists(PARAMS_PATH): 178 | os.makedirs(PARAMS_PATH) 179 | SAMPLES_PATH = os.path.join(FOLDER_PREFIX, 'samples') 180 | if not os.path.exists(SAMPLES_PATH): 181 | os.makedirs(SAMPLES_PATH) 182 | BEST_PATH = os.path.join(FOLDER_PREFIX, 'best') 183 | if not os.path.exists(BEST_PATH): 184 | os.makedirs(BEST_PATH) 185 | 186 | lib.print_model_settings(locals(), path=FOLDER_PREFIX, sys_arg=True) 187 | 188 | ### Import the data_feeder ### 189 | # Handling WHICH_SET 190 | if WHICH_SET == 'TIMIT': 191 | from datasets.dataset import TIMIT_train_feed_epoch as train_feeder 192 | from datasets.dataset import TIMIT_valid_feed_epoch as valid_feeder 193 | from datasets.dataset import TIMIT_test_feed_epoch as test_feeder 194 | 195 | def load_data(data_feeder): 196 | """ 197 | Helper function to deal with interface of different datasets. 198 | `data_feeder` should be `train_feeder`, `valid_feeder`, or `test_feeder`. 199 | """ 200 | return data_feeder(BATCH_SIZE, 201 | SEQ_LEN, 202 | OVERLAP, 203 | Q_LEVELS, 204 | Q_ZERO, 205 | Q_TYPE) 206 | 207 | ### Creating computation graph ### 208 | def big_frame_level_rnn(input_sequences, h0, reset): 209 | """ 210 | input_sequences.shape: (batch size, n big frames * BIG_FRAME_SIZE) #BIG_FRAME_SIZE=8 211 | h0.shape: (batch size, N_BIG_RNN, BIG_DIM) #N_BIG_RNN=1,BIG_DIM=1024 212 | reset.shape: () 213 | output[0].shape: (batch size, n frames, DIM) 214 | output[1].shape: same as h0.shape 215 | output[2].shape: (batch size, seq len, Q_LEVELS) 216 | """ 217 | frames = input_sequences.reshape(( 218 | input_sequences.shape[0], 219 | input_sequences.shape[1] // (2*BIG_FRAME_SIZE), 220 | 2*BIG_FRAME_SIZE 221 | )) 222 | 223 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 224 | # (a reasonable range to pass as inputs to the RNN) 225 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 226 | frames *= lib.floatX(1) 227 | 228 | # Initial state of RNNs 229 | learned_h0 = lib.param( 230 | 'BigFrameLevel.h0', 231 | numpy.zeros((N_BIG_RNN, H0_MULT*BIG_DIM), dtype=theano.config.floatX) 232 | ) 233 | # Handling LEARN_H0 234 | learned_h0.param = LEARN_H0 #True 235 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_BIG_RNN, H0_MULT*BIG_DIM) #broadcast according to batch size,H0_MULT=1 236 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 237 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) #if reset=1,h0=learned_h0; if reset=0,h0=h0 238 | 239 | # Handling RNN_TYPE 240 | # Handling SKIP_CONN 241 | if RNN_TYPE == 'GRU': 242 | rnns_out, last_hidden = lib.ops.stackedGRU('BigFrameLevel.GRU', 243 | N_BIG_RNN, 244 | 2*BIG_FRAME_SIZE, 245 | BIG_DIM, 246 | frames, 247 | h0=h0, 248 | weightnorm=WEIGHT_NORM, 249 | skip_conn=SKIP_CONN) 250 | elif RNN_TYPE == 'LSTM': 251 | rnns_out, last_hidden = lib.ops.stackedLSTM('BigFrameLevel.LSTM', 252 | N_BIG_RNN, 253 | 2*BIG_FRAME_SIZE, 254 | BIG_DIM, 255 | frames, 256 | h0=h0, 257 | weightnorm=WEIGHT_NORM, 258 | skip_conn=SKIP_CONN) 259 | 260 | output = lib.ops.Linear( #batch*timestep*dim 261 | 'BigFrameLevel.Output', 262 | BIG_DIM, 263 | DIM * BIG_FRAME_SIZE / FRAME_SIZE, #1024*8/2 264 | rnns_out, 265 | initialization='he', 266 | weightnorm=WEIGHT_NORM 267 | ) 268 | output = output.reshape((output.shape[0], output.shape[1] * BIG_FRAME_SIZE / FRAME_SIZE, DIM)) 269 | 270 | return (output, last_hidden) #last_hidden:#batch*1*dim 271 | 272 | def frame_level_rnn(input_sequences, other_input, h0, reset): 273 | """ 274 | input_sequences.shape: (batch size, n frames * FRAME_SIZE) #FRAME_SIZE=2 275 | other_input.shape: (batch size, n frames, DIM) 276 | h0.shape: (batch size, N_RNN, DIM) 277 | reset.shape: () 278 | output.shape: (batch size, n frames * FRAME_SIZE, DIM) 279 | """ 280 | frames = input_sequences.reshape(( 281 | input_sequences.shape[0], 282 | input_sequences.shape[1] // (2*FRAME_SIZE), 283 | 2*FRAME_SIZE 284 | )) 285 | 286 | # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] 287 | # (a reasonable range to pass as inputs to the RNN) 288 | frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) 289 | frames *= lib.floatX(1) 290 | 291 | gru_input = lib.ops.Linear( 292 | 'FrameLevel.InputExpand', 293 | 2*FRAME_SIZE, 294 | DIM, 295 | frames, 296 | initialization='he', 297 | weightnorm=WEIGHT_NORM, 298 | ) + other_input 299 | 300 | # Initial state of RNNs 301 | learned_h0 = lib.param( 302 | 'FrameLevel.h0', 303 | numpy.zeros((N_RNN, H0_MULT*DIM), dtype=theano.config.floatX) 304 | ) 305 | # Handling LEARN_H0 306 | learned_h0.param = LEARN_H0 307 | learned_h0 = T.alloc(learned_h0, h0.shape[0], N_RNN, H0_MULT*DIM) 308 | learned_h0 = T.unbroadcast(learned_h0, 0, 1, 2) 309 | #learned_h0 = T.patternbroadcast(learned_h0, [False] * learned_h0.ndim) 310 | h0 = theano.ifelse.ifelse(reset, learned_h0, h0) 311 | 312 | # Handling RNN_TYPE 313 | # Handling SKIP_CONN 314 | if RNN_TYPE == 'GRU': 315 | rnns_out, last_hidden = lib.ops.stackedGRU('FrameLevel.GRU', 316 | N_RNN, 317 | DIM, 318 | DIM, 319 | gru_input, 320 | h0=h0, 321 | weightnorm=WEIGHT_NORM, 322 | skip_conn=SKIP_CONN) 323 | elif RNN_TYPE == 'LSTM': 324 | rnns_out, last_hidden = lib.ops.stackedLSTM('FrameLevel.LSTM', 325 | N_RNN, 326 | DIM, 327 | DIM, 328 | gru_input, 329 | h0=h0, 330 | weightnorm=WEIGHT_NORM, 331 | skip_conn=SKIP_CONN) 332 | 333 | output = lib.ops.Linear( 334 | 'FrameLevel.Output', 335 | DIM, 336 | FRAME_SIZE * DIM, 337 | rnns_out, 338 | initialization='he', 339 | weightnorm=WEIGHT_NORM 340 | ) 341 | output = output.reshape((output.shape[0], output.shape[1] * FRAME_SIZE, DIM)) 342 | 343 | return (output, last_hidden) 344 | 345 | def sample_level_predictor(frame_level_outputs, prev_samples): 346 | """ 347 | frame_level_outputs.shape: (batch size, DIM) 348 | prev_samples.shape: (batch size, FRAME_SIZE) 349 | output.shape: (batch size, Q_LEVELS) 350 | """ 351 | # Handling EMB_SIZE 352 | if EMB_SIZE == 0: # no support for one-hot in three_tier and one_tier. 353 | prev_samples = lib.ops.T_one_hot(prev_samples, Q_LEVELS) 354 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, Q_LEVELS) 355 | last_out_shape = Q_LEVELS 356 | elif EMB_SIZE > 0: #The embedding steps maps each of the q discrete values to a real-valued vector embedding. 357 | prev_samples = lib.ops.Embedding( #after embedding, the dim is batch size*FRANME_SIZE*EMB_SIZE 358 | 'SampleLevel.Embedding', 359 | Q_LEVELS, 360 | EMB_SIZE, 361 | prev_samples) 362 | # (BATCH_SIZE*N_FRAMES*FRAME_SIZE, FRAME_SIZE, EMB_SIZE), f32 363 | last_out_shape = EMB_SIZE 364 | else: 365 | raise ValueError('EMB_SIZE cannot be negative.') 366 | 367 | prev_samples = prev_samples.reshape((-1, FRAME_SIZE * last_out_shape)) #dim:batch size*(FRAME_SIZE*EMB_SIZE) 368 | 369 | out = lib.ops.Linear( 370 | 'SampleLevel.L1_PrevSamples', 371 | FRAME_SIZE * last_out_shape, 372 | DIM, 373 | prev_samples, 374 | biases=False, 375 | initialization='he', 376 | weightnorm=WEIGHT_NORM 377 | ) 378 | 379 | out += frame_level_outputs 380 | # out = T.nnet.relu(out) # commented out to be similar to two_tier 381 | 382 | out = lib.ops.Linear('SampleLevel.L2', 383 | DIM, 384 | DIM, 385 | out, 386 | initialization='he', 387 | weightnorm=WEIGHT_NORM) 388 | out = T.nnet.relu(out) 389 | 390 | # L3 391 | out = lib.ops.Linear('SampleLevel.L3', 392 | DIM, 393 | DIM, 394 | out, 395 | initialization='he', 396 | weightnorm=WEIGHT_NORM) 397 | out = T.nnet.relu(out) 398 | 399 | # Output 400 | # We apply the softmax later 401 | out = lib.ops.Linear('SampleLevel.Output', 402 | DIM, 403 | Q_LEVELS, 404 | out, 405 | weightnorm=WEIGHT_NORM) 406 | return out 407 | 408 | sequences_8k = T.imatrix('sequences_8k') #batch size*samplenum 409 | sequences_up = T.imatrix('sequences_up') 410 | h0 = T.tensor3('h0') #(batch size, N_RNN, DIM) 411 | big_h0 = T.tensor3('big_h0') #(batch size, N_BIG_RNN, BIG_DIM) 412 | reset = T.iscalar('reset') 413 | mask = T.matrix('mask') #batch size*samplenum 414 | batch_size =T.iscalar('batch_size') 415 | lr=T.scalar('lr') 416 | 417 | big_input_sequences = sequences_8k #The last BIG_FRAME_SIZE frames do not need (tier3) 418 | big_input_sequences=big_input_sequences.reshape((1, batch_size, 1, -1)) 419 | big_input_sequences=T.nnet.neighbours.images2neibs(big_input_sequences, (1, 2*OVERLAP), neib_step=(1, OVERLAP), mode='valid') 420 | big_input_sequences=big_input_sequences.reshape((batch_size,-1)) 421 | 422 | input_sequences = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE)] #(tier2) 423 | input_sequences=input_sequences.reshape((1, batch_size, 1, -1)) 424 | input_sequences=T.nnet.neighbours.images2neibs(input_sequences, (1, 2*FRAME_SIZE), neib_step=(1, FRAME_SIZE), mode='valid') 425 | input_sequences=input_sequences.reshape((batch_size,-1)) 426 | target_sequences = sequences_up[:,0:-OVERLAP] #groundtrues 427 | 428 | target_mask = mask[:,0:-OVERLAP] 429 | 430 | big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, big_h0, reset)#tier3->tier2 431 | 432 | frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1 433 | 434 | prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)] 435 | prev_samples = prev_samples.reshape((1, batch_size, 1, -1)) 436 | prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') #2-dim:([[x7,x8],[x8,x9],[x9,x10],...]) 437 | prev_samples = prev_samples.reshape((batch_size * SEQ_LEN, FRAME_SIZE)) 438 | 439 | sample_level_outputs = sample_level_predictor( 440 | frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)), 441 | prev_samples 442 | ) #sample_level_outputs dim:(BATCH_SIZE * SEQ_LEN, Q_LEVELS) -> [[x9pre],[x10pre],...] 443 | 444 | accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences) 445 | accuracy=accuracy*target_mask 446 | accuracy=T.sum(accuracy,axis=1) 447 | mask_sum=T.sum(target_mask,axis=1) 448 | 449 | cost = T.nnet.categorical_crossentropy( 450 | T.nnet.softmax(sample_level_outputs), #Every row represents a distribution(256 propability) 451 | target_sequences.flatten() #A list, represent the groundtruth of every row 452 | ) 453 | cost = cost.reshape(target_sequences.shape) 454 | cost = cost * target_mask #dim: batch*num 455 | # Don't use these lines; could end up with NaN 456 | # Specially at the end of audio files where mask is 457 | # all zero for some of the shorter files in mini-batch. 458 | #cost = cost.sum(axis=1) / target_mask.sum(axis=1) 459 | #cost = cost.mean(axis=0) 460 | cost_sum=T.sum(cost,axis=1) 461 | # Use this one instead. 462 | cost = cost.sum() 463 | cost = cost / target_mask.sum() #cost average by samples 464 | 465 | # By default we report cross-entropy cost in bits. 466 | # Switch to nats by commenting out this line: 467 | # log_2(e) = 1.44269504089 468 | #cost = cost * lib.floatX(numpy.log2(numpy.e)) 469 | 470 | ########### 471 | all_params = lib.get_params(cost, lambda x: hasattr(x, 'param') and x.param==True) #if LEARN_H0=True,then learn_h0 is included in parmeters to train 472 | 473 | lib.print_params_info(all_params, path=FOLDER_PREFIX) 474 | 475 | grads = T.grad(cost, wrt=all_params, disconnected_inputs='warn') 476 | grads = [T.clip(g, lib.floatX(-GRAD_CLIP), lib.floatX(GRAD_CLIP)) for g in grads] 477 | 478 | updates = lasagne.updates.adam(grads, all_params,learning_rate=lr) 479 | 480 | # Training function(s) 481 | train_fn = theano.function( 482 | [sequences_8k,sequences_up, big_h0, h0, reset, mask,batch_size,lr], 483 | [cost, new_big_h0, new_h0], 484 | updates=updates, 485 | on_unused_input='warn' 486 | ) 487 | 488 | # Validation and Test function, hence no updates 489 | valid_fn = theano.function( 490 | [sequences_8k,sequences_up, big_h0,h0, reset, mask,batch_size], 491 | [cost_sum, accuracy,mask_sum,new_big_h0,new_h0], 492 | on_unused_input='warn' 493 | ) 494 | 495 | test_fn=theano.function( 496 | [sequences_8k,sequences_up, big_h0,h0, reset, mask,batch_size], 497 | [cost_sum,accuracy,mask_sum,lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),new_big_h0,new_h0], 498 | on_unused_input='warn' 499 | ) 500 | 501 | def generate_and_save_samples(tag): 502 | def write_audio_file(name, data): 503 | data = data.astype('float32') 504 | #data -= data.min() 505 | #data /= data.max() 506 | #data -= 0.5 507 | #data *= 0.95 508 | scipy.io.wavfile.write( 509 | os.path.join(SAMPLES_PATH, name+'.wav'), 510 | BITRATE, 511 | data) 512 | 513 | total_time=time() 514 | costs_g = [] 515 | accuracys_g=[] 516 | count=0 517 | data_feeder = load_data(test_feeder) 518 | for seqs_g_8k,seqs_g_up, reset_g, end_flag_g,mask_g,batch_g,seqs_g_8k_real in data_feeder: 519 | if reset_g==1: 520 | big_h0_g = numpy.zeros((batch_g, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 521 | h0_g = numpy.zeros((batch_g, N_RNN, H0_MULT*DIM), dtype='float32') 522 | cost_batch=np.zeros((batch_g,),dtype='float32') 523 | accuracy_batch=np.zeros((batch_g,),dtype='float32') 524 | mask_batch=np.zeros((batch_g,),dtype='float32') 525 | count+=1 526 | cost_g, accuracy_g,mask_sum_g,sample, big_h0_g,h0_g = test_fn(seqs_g_8k,seqs_g_up, big_h0_g,h0_g, reset_g, mask_g,batch_g) 527 | cost_batch=cost_batch+cost_g 528 | accuracy_batch=accuracy_batch+accuracy_g 529 | mask_batch=mask_batch+mask_sum_g 530 | if end_flag_g==1: 531 | costs_g.extend(list(cost_batch/mask_batch)) 532 | accuracys_g.extend(list(accuracy_batch/mask_batch)) 533 | 534 | if count==1: 535 | if reset_g==1: 536 | samples_low=seqs_g_8k_real[:,0:-OVERLAP] 537 | samples=sample 538 | masks_g=mask_g[:,0:-OVERLAP] 539 | else: 540 | samples_low=np.concatenate([samples_low,seqs_g_8k_real[:,0:-OVERLAP]],axis=1) 541 | samples=np.concatenate([samples,sample],axis=1) 542 | masks_g=np.concatenate([masks_g,mask_g[:,0:-OVERLAP]],axis=1) 543 | 544 | 545 | for i in xrange(N_SEQS): 546 | samples_lowi=samples_low[i] 547 | samplei=samples[i] 548 | maski=masks_g[i] 549 | samples_lowi=samples_lowi[0:len(np.where(maski==1)[0])] 550 | samplei=samplei[0:len(np.where(maski==1)[0])] 551 | if Q_TYPE == 'mu-law': 552 | from datasets.dataset import mu2linear 553 | samplei = mu2linear(samplei) 554 | write_audio_file("sample_{}_{}".format(tag, i), samplei/3+samples_lowi) 555 | 556 | total_time = time() - total_time 557 | log = "{} samples generated in {} seconds." 558 | log = log.format(N_SEQS, total_time) 559 | print log, 560 | 561 | return numpy.mean(costs_g),numpy.mean(accuracys_g)*100,total_time 562 | 563 | 564 | def monitor(data_feeder): 565 | """ 566 | Cost and time of test_fn on a given dataset section. 567 | Pass only one of `valid_feeder` or `test_feeder`. 568 | Don't pass `train_feed`. 569 | 570 | :returns: 571 | Mean cost over the input dataset (data_feeder) 572 | Total time spent 573 | """ 574 | _total_time = time() 575 | _costs = [] 576 | _accuracys=[] 577 | _data_feeder = load_data(data_feeder) 578 | for _seqs_8k,_seqs_up, _reset, _end_flag,_mask,_batch,_seqs_8k_real in _data_feeder: 579 | if _reset==1: 580 | _big_h0=numpy.zeros((_batch, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 581 | _h0 = numpy.zeros((_batch, N_RNN, H0_MULT*DIM), dtype='float32') 582 | _cost_batch=np.zeros((_batch,),dtype='float32') 583 | _accuracy_batch=np.zeros((_batch,),dtype='float32') 584 | _mask_batch=np.zeros((_batch,),dtype='float32') 585 | _cost, _accuracy,_mask_sum,_big_h0,_h0 = valid_fn(_seqs_8k,_seqs_up, _big_h0,_h0, _reset, _mask,_batch) 586 | _cost_batch=_cost_batch+_cost 587 | _accuracy_batch=_accuracy_batch+_accuracy 588 | _mask_batch=_mask_batch+_mask_sum 589 | if _end_flag==1: 590 | _costs.extend(list(_cost_batch/_mask_batch)) 591 | _accuracys.extend(list(_accuracy_batch/_mask_batch)) 592 | 593 | 594 | return numpy.mean(_costs), numpy.mean(_accuracys)*100,time() - _total_time 595 | 596 | print "Wall clock time spent before training started: {:.2f}h"\ 597 | .format((time()-exp_start)/3600.) 598 | print "Training!" 599 | total_iters = 0 600 | total_time = 0. 601 | last_print_time = 0. 602 | last_print_iters = 0 603 | costs = [] 604 | lowest_valid_cost = numpy.finfo(numpy.float32).max 605 | corresponding_test_cost = numpy.finfo(numpy.float32).max 606 | new_lowest_cost = False 607 | end_of_batch = False 608 | epoch = 0 609 | learning_rate=LEARNING_RATE 610 | 611 | # Initial load train dataset 612 | tr_feeder = load_data(train_feeder) 613 | 614 | ### Handling the resume option: 615 | if RESUME: 616 | # Check if checkpoint from previous run is not corrupted. 617 | # Then overwrite some of the variables above. 618 | iters_to_consume, res_path, epoch, total_iters,\ 619 | [lowest_valid_cost, corresponding_test_cost, test_cost] = \ 620 | lib.resumable(path=FOLDER_PREFIX, 621 | iter_key=iter_str, 622 | epoch_key=epoch_str, 623 | add_resume_counter=True, 624 | other_keys=[lowest_valid_str, 625 | corresp_test_str, 626 | test_nll_str]) 627 | # At this point we saved the pkl file. 628 | last_print_iters = total_iters 629 | print "### RESUMING JOB FROM EPOCH {}, ITER {}".format(epoch, total_iters) 630 | # Consumes this much iters to get to the last point in training data. 631 | consume_time = time() 632 | for i in xrange(iters_to_consume): 633 | tr_feeder.next() 634 | consume_time = time() - consume_time 635 | print "Train data ready in {:.2f}secs after consuming {} minibatches.".\ 636 | format(consume_time, iters_to_consume) 637 | 638 | lib.load_params(res_path) 639 | print "Parameters from last available checkpoint loaded." 640 | 641 | while True: 642 | # THIS IS ONE ITERATION 643 | if total_iters % 500 == 0: 644 | print total_iters, 645 | 646 | total_iters += 1 647 | 648 | try: 649 | # Take as many mini-batches as possible from train set 650 | mini_batch = tr_feeder.next() 651 | except StopIteration: 652 | # Mini-batches are finished. Load it again. 653 | # Basically, one epoch. 654 | tr_feeder = load_data(train_feeder) 655 | 656 | # and start taking new mini-batches again. 657 | mini_batch = tr_feeder.next() 658 | epoch += 1 659 | end_of_batch = True 660 | print "[Another epoch]", 661 | 662 | seqs_8k, seqs_up,reset, end_flag,mask,batch_num,seqs_8k_real = mini_batch 663 | if reset==1: 664 | big_h0=numpy.zeros((batch_num, N_BIG_RNN, H0_MULT*DIM), dtype='float32') 665 | h0 = numpy.zeros((batch_num, N_RNN, H0_MULT*DIM), dtype='float32') 666 | 667 | start_time = time() 668 | cost,big_h0,h0 = train_fn(seqs_8k, seqs_up, big_h0,h0, reset, mask,batch_num,learning_rate) 669 | total_time += time() - start_time 670 | #print "This cost:", cost, "This h0.mean()", h0.mean() 671 | 672 | costs.append(cost) 673 | 674 | # Monitoring step 675 | if (TRAIN_MODE=='iters' and total_iters-last_print_iters == PRINT_ITERS) or \ 676 | (TRAIN_MODE=='time' and total_time-last_print_time >= PRINT_TIME) or \ 677 | (TRAIN_MODE=='time-iters' and total_time-last_print_time >= PRINT_TIME) or \ 678 | (TRAIN_MODE=='iters-time' and total_iters-last_print_iters >= PRINT_ITERS) or \ 679 | end_of_batch: 680 | # 0. Validation 681 | print "\nValidation!", 682 | valid_cost, valid_accuracy,valid_time = monitor(valid_feeder) 683 | print "Done!" 684 | 685 | # 1. Test 686 | test_time = 0. 687 | # Only when the validation cost is improved get the cost for test set. 688 | if valid_cost < lowest_valid_cost: 689 | lowest_valid_cost = valid_cost 690 | print "\n>>> Best validation cost of {} reached."\ 691 | .format(valid_cost), 692 | #test_cost, test_time = monitor(test_feeder) 693 | #print "Done!" 694 | # Report last one which is the lowest on validation set: 695 | #print ">>> test cost:{}\ttotal time:{}".format(test_cost, test_time) 696 | #corresponding_test_cost = test_cost 697 | new_lowest_cost = True 698 | 699 | tag = "e{}_i{}_t{:.2f}_tr{:.4f}_v{:.4f}" 700 | tag = tag.format(epoch, 701 | total_iters, 702 | total_time/3600, 703 | numpy.mean(cost), 704 | valid_cost) 705 | tag += ("_best" if new_lowest_cost else "") 706 | 707 | print "Sampling!", 708 | # Generate samples 709 | test_cost, test_accuracy,test_time=generate_and_save_samples(tag) 710 | print "\n>>> test cost:{}\ttest accuracy:{}%\ttotal time:{}".format(test_cost, test_accuracy,test_time) 711 | if new_lowest_cost: 712 | corresponding_test_cost = test_cost 713 | print "Done!" 714 | 715 | # 2. Stdout the training progress 716 | print_info = "epoch:{}\ttotal iters:{}\twall clock time:{:.2f}h\n" 717 | print_info += ">>> Lowest valid cost:{}\t Corresponding test cost:{}\n" 718 | print_info += "\ttrain cost:{:.4f}\ttotal time:{:.2f}h\tper iter:{:.3f}s\n" 719 | print_info += "\tvalid cost:{:.4f}\tvalid accuracy:{:.4f}%\ttotal time:{:.2f}h\n" 720 | print_info += "\ttest cost:{:.4f}\ttest accuracy:{:.4f}%\ttotal time:{:.2f}h" 721 | print_info = print_info.format(epoch, 722 | total_iters, 723 | (time()-exp_start)/3600, 724 | lowest_valid_cost, 725 | corresponding_test_cost, 726 | numpy.mean(costs), 727 | total_time/3600, 728 | total_time/total_iters, 729 | valid_cost, 730 | valid_accuracy, 731 | valid_time/3600, 732 | test_cost, 733 | test_accuracy, 734 | test_time/3600) 735 | print print_info 736 | 737 | 738 | # 3. Save params of model (IO bound, time consuming) 739 | # If saving params is not successful, there shouldn't be any trace of 740 | # successful monitoring step in train_log as well. 741 | print "Saving params!", 742 | lib.save_params( 743 | os.path.join(PARAMS_PATH, 'params_{}.pkl'.format(tag)) 744 | ) 745 | print "Done!" 746 | 747 | # 4. Save and graph training progress (fast) 748 | training_info = {epoch_str : epoch, 749 | iter_str : total_iters, 750 | train_nll_str : numpy.mean(costs), 751 | valid_nll_str : valid_cost, 752 | test_nll_str : test_cost, 753 | lowest_valid_str : lowest_valid_cost, 754 | corresp_test_str : corresponding_test_cost, 755 | 'train time' : total_time, 756 | 'valid time' : valid_time, 757 | 'test time' : test_time, 758 | 'wall clock time' : time()-exp_start} 759 | lib.save_training_info(training_info, FOLDER_PREFIX) 760 | print "Train info saved!", 761 | 762 | # y_axis_strs = [train_nll_str, valid_nll_str, test_nll_str] 763 | # lib.plot_traing_info(iter_str, y_axis_strs, FOLDER_PREFIX) 764 | print "And plotted!" 765 | 766 | if total_iters-last_print_iters == PRINT_ITERS: 767 | # If we are here b/c of onom_end_of_batch, we shouldn't mess 768 | # with costs and last_print_iters 769 | costs = [] 770 | last_print_time += PRINT_TIME 771 | last_print_iters += PRINT_ITERS 772 | 773 | if epoch==6 and end_of_batch==True: 774 | learning_rate=0.0001 775 | print "\n Now learning rate is 0.0001." 776 | 777 | end_of_batch = False 778 | new_lowest_cost = False 779 | 780 | print "Validation Done!\nBack to Training..." 781 | 782 | if (TRAIN_MODE=='iters' and total_iters == STOP_ITERS) or \ 783 | (TRAIN_MODE=='time' and total_time >= STOP_TIME) or \ 784 | ((TRAIN_MODE=='time-iters' or TRAIN_MODE=='iters-time') and \ 785 | (total_iters == STOP_ITERS or total_time >= STOP_TIME)): 786 | 787 | print "Done! Total iters:", total_iters, "Total time: ", total_time 788 | print "Experiment ended at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') 789 | print "Wall clock time spent: {:.2f}h"\ 790 | .format((time()-exp_start)/3600) 791 | 792 | sys.exit() -------------------------------------------------------------------------------- /HRNN_HF/readme.md: -------------------------------------------------------------------------------- 1 | The HRNN system in the paper: 2 | * Zhen-Hua Ling , Yang Ai, Yu Gu, and Li-Rong Dai, "Waveform Modeling and Generation Using Hierarchical Recurrent Neural Networks for Speech Bandwidth Extension," IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 26, no. 5, pp. 883-894, 2018. 3 | Usage: 4 | First enter the root directory of the folder: `cd HRNN_HF`. 5 | 6 | Data preparation: 7 | Put the train, validiation and test waveforms (16kHz sample rate) into the corresponding folder in directory 'datasets/TIMIT', 8 | then run `python datasets/TIMIT/_2npy_hf.py` to generate the packaged data. 9 | 10 | Traning and validiation: 11 | Run: 12 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/three_tier/three_tier_train_valid.py --exp BEST_3TIER --seq_len 480 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 64` 13 | 14 | Test: 15 | Run: 16 | `THEANO_FLAGS='floatX=float32,device=gpu0,allow_gc=False,lib.cnmem=0.95' python -u models/three_tier/three_tier_test.py --exp BEST_3TIER --seq_len 480 --big_frame_size 16 --frame_size 4 --weight_norm True --emb_size 256 --skip_conn False --dim 1024 --n_rnn 1 --rnn_type LSTM --learn_h0 True --q_levels 256 --q_type mu-law --which_set TIMIT --batch_size 64` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical-Recurrent-Neural-Networks-for-Speech-Bandwidth-Extension 2 | Codes of the paper: 3 | * Zhen-Hua Ling , Yang Ai, Yu Gu, and Li-Rong Dai, "Waveform Modeling and Generation Using Hierarchical Recurrent Neural Networks for Speech Bandwidth Extension," IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 26, no. 5, pp. 883-894, 2018. 4 | 5 | ./HRNN_HF is the code of the HRNN system in the paper. 6 | 7 | ./CHRNN_HF is the code of the CHRNN system in the paper. 8 | --------------------------------------------------------------------------------