├── README.md ├── frames └── foo.txt ├── leaky_big_training.py ├── manne_gui.py ├── models ├── ae_noskip_chroma_sig_oo_trained_decoder.h5 ├── ae_noskip_chroma_sig_oo_trained_encoder.h5 ├── ae_skip_oo_trained_decoder.h5 ├── ae_skip_oo_trained_encoder.h5 └── foo.txt ├── requirements.txt ├── synth_manne_gui.py └── wav2frames.py /README.md: -------------------------------------------------------------------------------- 1 | # manne 2 | Remaking My ANNe effect 3 | 4 | Tested with Python 2.7.12 5 | 6 | I suggest using a virtualenvironment to ensure that all packages are correct 7 | 8 | ``` 9 | mkdir venv 10 | virtualenv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | From what I remember, this application requires ffmpeg and portaudio19-dev 16 | 17 | To start the program, run 18 | 19 | ``` 20 | python manne_gui.py 21 | ``` 22 | 23 | Type the relative path of the track you would like to filter into the "Track Name" box. 24 | 25 | Type the prefix of the trained model you would like to run (in this case just ```all_frames```) into the "Model Name" box. 26 | 27 | Clicking "START" will start to filter the track through the neural network and play out audio in real time. Change the value of the sliders to change the latent representation of the audio. 28 | 29 | Clicking "PAUSE" will pause the audio output and freeze the track where it is. I'm pretty sure clicking "START" again will resume the track. 30 | 31 | To render an entire track with fixed latent activations, click "RENDER". The song will be output as "rendered.wav" in your given directory. It should be a mono wav file, 16bit PCM, 44.1kHz. 32 | 33 | To begin a recording of you altering the latents as the track plays, click "RECORD" and begin moving the sliders. 34 | To end a recording, just click the "RECORD" button again so that it is unchecked. The recorded wav file will be output as "recorded.wav" in your given directory. It should be a mono wav file, 16bit PCM, 44.1kHz. 35 | 36 | Clicking "QUIT" will close the application. 37 | 38 | ^_^ 39 | 40 | "MANNe" pronunciation guide: https://www.youtube.com/watch?v=EmZvOhHF85I&feature=youtu.be&t=5 41 | -------------------------------------------------------------------------------- /frames/foo.txt: -------------------------------------------------------------------------------- 1 | helo 2 | -------------------------------------------------------------------------------- /leaky_big_training.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from keras.layers import Input, Dense, Lambda, Concatenate, Dropout, LeakyReLU 5 | from keras.models import Model, Sequential, load_model, clone_model 6 | 7 | from keras.regularizers import l2 8 | from keras.losses import mse 9 | from keras.callbacks import LambdaCallback 10 | from keras.optimizers import Adam 11 | from keras.utils import plot_model 12 | from keras import backend as K 13 | import numpy as np 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | import argparse 17 | import os 18 | import librosa 19 | import tensorflow as tf 20 | 21 | global alpha 22 | global beta 23 | beta = K.variable(3e-7) 24 | alpha = K.variable(0.3) 25 | 26 | def change_params(epoch, logs): 27 | if epoch<=5 and epoch%1==0: 28 | K.set_value(beta,K.get_value(beta)+2e-5) 29 | if epoch == 30: 30 | K.set_value(alpha,0.0) 31 | 32 | def get_arguments(): 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--filename_in', type=str) 35 | parser.add_argument('--filename_out', type=str) 36 | parser.add_argument('--net_type', type=str) 37 | parser.add_argument('--mode', type=str) 38 | parser.add_argument('--trained_model_name', type=str, default='') 39 | parser.add_argument('--n_epochs', type=int, default=5) 40 | parser.add_argument('--skip', type=bool, default=False) 41 | return parser.parse_args() 42 | 43 | class Manne: 44 | def __init__(self, args): 45 | self.frames = [] 46 | self.X_train = [] 47 | self.X_val = [] 48 | self.X_test = [] 49 | self.encoder = [] 50 | self.decoder = [] 51 | self.network = [] 52 | self.encoder_widths = [] 53 | self.decoder_widths = [] 54 | 55 | self.z_mean = K.placeholder(shape=(8,)) 56 | self.z_log_var = K.placeholder(shape=(8,)) 57 | self.beta_changer = [] 58 | 59 | self.n_epochs = args.n_epochs 60 | self.net_type = args.net_type 61 | self.skip = args.skip 62 | self.filename_in = args.filename_in 63 | self.filename_out = args.filename_out 64 | self.trained_model_name = args.trained_model_name 65 | 66 | def do_everything(self): 67 | self.load_dataset() 68 | self.define_net() 69 | self.make_net() 70 | self.train_net() 71 | self.evaluate_net() 72 | self.save_latents() 73 | 74 | def just_plot(self): 75 | self.load_dataset() 76 | self.load_net() 77 | self.make_net() 78 | adam_rate = 5e-4 79 | self.network.compile(optimizer=Adam(lr=adam_rate), loss=self.my_mse, metrics=[self.my_mse]) 80 | self.evaluate_net() 81 | self.save_latents() 82 | 83 | def sampling(self,args): 84 | self.z_mean, self.z_log_var = args 85 | batch = K.shape(self.z_mean)[0] 86 | dim = K.int_shape(self.z_mean)[1] 87 | epsilon = K.random_normal(shape=(batch,dim)) 88 | return self.z_mean + K.exp(0.5*self.z_log_var)*epsilon 89 | 90 | def get_loss(self, inputs, outputs): 91 | global beta 92 | reconstruction_loss = mse(inputs[:,:2049],outputs) 93 | kl_loss = 1+self.z_log_var-K.square(self.z_mean)-K.exp(self.z_log_var) 94 | kl_loss = K.sum(kl_loss, axis=-1) 95 | kl_loss *= -0.5*beta 96 | vae_loss = K.sum(reconstruction_loss+kl_loss) 97 | return vae_loss 98 | 99 | def my_mse(self, inputs, outputs): 100 | return mse(inputs[:,:2049],outputs) 101 | 102 | def my_kl(self, inputs, outputs): 103 | kl_loss = 1+self.z_log_var-K.square(self.z_mean)-K.exp(self.z_log_var) 104 | kl_loss = K.sum(kl_loss, axis=-1) 105 | kl_loss *= -0.5 106 | return kl_loss 107 | 108 | def load_net(self): 109 | enc_filename = os.path.join(os.getcwd(),'models/'+self.trained_model_name+'_trained_encoder.h5') 110 | print(enc_filename) 111 | self.encoder = load_model(enc_filename,custom_objects={'sampling': self.sampling}, compile=False) 112 | dec_filename = os.path.join(os.getcwd(),'models/'+self.trained_model_name+'_trained_decoder.h5') 113 | self.decoder = load_model(dec_filename,custom_objects={'sampling': self.sampling}, compile=False) 114 | 115 | def load_dataset(self): 116 | filename = 'frames/'+self.filename_in+'_frames.npy' #Static Data used for training net 117 | filepath = os.path.join(os.getcwd(),filename) 118 | orig_frames = np.load(filepath) 119 | orig_frames = np.asarray(orig_frames) 120 | len_frames = orig_frames.shape[0] 121 | 122 | chroma = np.transpose(librosa.feature.chroma_stft(S=np.transpose(orig_frames), sr=44100)) 123 | chroma = librosa.feature.chroma_stft(S=np.transpose(orig_frames), sr=44100) 124 | chroma = (chroma == chroma.max(axis=1)[:,None]).astype(int) 125 | chroma = np.transpose(chroma) 126 | augmentations = chroma 127 | 128 | self.frames = np.hstack((orig_frames,augmentations)) 129 | 130 | if args.filename_in == 'one_octave': 131 | self.X_train = self.frames[:16685,:] 132 | self.X_val = self.frames[16685:17998,:] 133 | self.X_test = self.frames[17998:,:] 134 | elif args.filename_in == 'five_octave': 135 | self.X_train = self.frames[:78991,:] 136 | self.X_val = self.frames[78991:84712,:] 137 | self.X_test = self.frames[84712:,:] 138 | elif args.filename_in == 'guitar': 139 | self.X_train = self.frames[:62018,:] 140 | self.X_val = self.frames[62018:66835,:] 141 | self.X_test = self.frames[66835:,:] 142 | elif args.filename_in == 'violin': 143 | self.X_train = self.frames[:90571,:] 144 | self.X_val = self.frames[90571:100912,:] 145 | self.X_test = self.frames[100912:,:] 146 | else: 147 | raise Exception('Unexpected filename_in') 148 | 149 | def define_net(self): 150 | if self.net_type=='vae': 151 | l2_penalty = 0 152 | else: 153 | l2_penalty = 1e-7 154 | 155 | #8 Neuron Model from the paper 156 | self.encoder_widths = [1024,512,256,128,64,32,16,8] 157 | self.decoder_widths = [16,32,64,128,256,512,1024] 158 | 159 | #Lighter weight model 160 | #self.encoder_widths = [512,256,128,64,8] 161 | #self.decoder_widths = [64,128,256,512] 162 | 163 | decoder_outdim = 2049 164 | drop = 0.0 165 | alpha_val=0.1 166 | 167 | input_spec = Input(shape=(self.frames.shape[1],)) 168 | encoded = Dense(units=self.encoder_widths[0], 169 | activation=None, 170 | kernel_regularizer=l2(l2_penalty))(input_spec) 171 | encoded = LeakyReLU(alpha=alpha_val)(encoded) 172 | for width in self.encoder_widths[1:-1]: 173 | encoded = Dense(units=width, 174 | activation=None, 175 | kernel_regularizer=l2(l2_penalty))(encoded) 176 | encoded = LeakyReLU(alpha=alpha_val)(encoded) 177 | 178 | encoded = Dense(units=self.encoder_widths[-1], activation='sigmoid', kernel_regularizer=l2(l2_penalty))(encoded) 179 | 180 | if self.net_type == 'vae': 181 | self.z_mean = Dense(self.encoder_widths[-1],input_shape=(self.encoder_widths[-1],), name='z_mean')(encoded) 182 | self.z_log_var = Dense(self.encoder_widths[-1],input_shape=(self.encoder_widths[-1],), name='z_log_var')(encoded) 183 | z = Lambda(self.sampling,output_shape=(self.encoder_widths[-1],), name='z')([self.z_mean,self.z_log_var]) 184 | self.encoder = Model(input_spec, [self.z_mean, self.z_log_var, z]) 185 | else: 186 | self.encoder = Model(input_spec, encoded) 187 | 188 | if self.skip == True: 189 | input_latent = Input(shape=(self.encoder_widths[-1]+12,)) 190 | else: 191 | input_latent = Input(shape=(self.encoder_widths[-1],)) 192 | 193 | decoded = Dense(units=self.decoder_widths[0], 194 | activation=None, 195 | kernel_regularizer=l2(l2_penalty))(input_latent) 196 | decoded = LeakyReLU(alpha=alpha_val)(decoded) 197 | for width in self.decoder_widths[1:]: 198 | decoded = Dense(units=width, 199 | activation=None, 200 | kernel_regularizer=l2(l2_penalty))(decoded) 201 | decoded = LeakyReLU(alpha=alpha_val)(decoded) 202 | decoded = Dense(units=2049, 203 | activation='relu', 204 | kernel_regularizer=l2(l2_penalty))(decoded) 205 | self.decoder = Model(input_latent,decoded) 206 | 207 | def make_net(self): 208 | auto_input = Input(shape=(self.frames.shape[1],)) 209 | encoded = self.encoder(auto_input) 210 | 211 | if self.net_type == 'vae': 212 | latents = encoded[2] 213 | else: 214 | latents = encoded 215 | 216 | if self.skip == True: 217 | chroma_input = Input(shape=(12,)) 218 | new_latents = Concatenate()([latents,chroma_input]) 219 | decoded = self.decoder(new_latents) 220 | self.network = Model(inputs=[auto_input,chroma_input], outputs=decoded) 221 | else: 222 | decoded = self.decoder(latents) 223 | self.network = Model(inputs=[auto_input], outputs=decoded) 224 | 225 | print('\n net summary \n') 226 | self.network.summary() 227 | print('\n encoder summary \n') 228 | self.encoder.summary() 229 | print('\n decoder summary \n') 230 | self.decoder.summary() 231 | 232 | def train_net(self): 233 | adam_rate = 5e-4 234 | if self.skip == True: #Handling case where Keras expects two inputs 235 | train_data = [self.X_train,self.X_train[:,-12:]] 236 | val_data = [self.X_val,self.X_val[:,-12:]] 237 | else: 238 | train_data = self.X_train 239 | val_data = self.X_val 240 | if self.net_type == 'vae': 241 | beta_changer = LambdaCallback(on_epoch_end=change_params) 242 | self.network.compile(optimizer=Adam(lr=adam_rate), loss=self.get_loss, metrics=[self.my_mse, self.my_kl]) 243 | self.network.fit(x=train_data, y=self.X_train, 244 | epochs=self.n_epochs, 245 | batch_size=200, 246 | shuffle=True, 247 | validation_data=(val_data, self.X_val), 248 | callbacks=[beta_changer] 249 | ) 250 | 251 | else: 252 | alpha_changer = LambdaCallback(on_epoch_end=change_params) 253 | self.network.compile(optimizer=Adam(lr=adam_rate), loss=self.my_mse, metrics=[self.my_mse]) 254 | self.network.fit(x=train_data, y=self.X_train, 255 | epochs=self.n_epochs, 256 | batch_size=200, 257 | shuffle=True, 258 | validation_data=(val_data, self.X_val), 259 | callbacks=[alpha_changer] 260 | ) 261 | self.encoder.save('models/'+self.net_type+'_'+self.filename_out+'_trained_encoder.h5') 262 | self.decoder.save('models/'+self.net_type+'_'+self.filename_out+'_trained_decoder.h5') 263 | 264 | def save_latents(self): 265 | 266 | indat = self.frames 267 | enc_mag = self.encoder.predict(indat,verbose=1) 268 | 269 | if self.net_type == 'vae': 270 | a = enc_mag[0] 271 | b = enc_mag[1] 272 | print(a.shape) 273 | print(b.shape) 274 | enc_mag = np.hstack((enc_mag[0],enc_mag[1])) 275 | 276 | df = pd.DataFrame(enc_mag) 277 | df.to_csv('encoded_mags.csv') 278 | 279 | 280 | def evaluate_net(self): 281 | if self.skip == True: #Handling case where Keras expects two inputs 282 | test_data = [self.X_test,self.X_test[:,-12:]] 283 | val_data = [self.X_val,self.X_val[:,-12:]] 284 | else: 285 | test_data = self.X_test 286 | val_data = self.X_val 287 | 288 | if args.filename_in == 'one_octave': 289 | mod = 1 290 | elif args.filename_in == 'five_octave' or args.filename_in == 'violin': 291 | mod = 10 292 | elif args.filename_in == 'guitar': 293 | mod = 3 294 | else: 295 | mod = 1 296 | 297 | print('\n') 298 | print('Evaluating performance on validation and test sets') 299 | a=self.network.evaluate(x=val_data,y=self.X_val,verbose=1) 300 | b=self.network.evaluate(x=test_data,y=self.X_test,verbose=1) 301 | print('\n') 302 | for idx in range(len(self.network.metrics_names)): 303 | print('Validation '+self.network.metrics_names[idx]) 304 | print(a[idx]) 305 | print('\n') 306 | for idx in range(len(self.network.metrics_names)): 307 | print('Testing '+self.network.metrics_names[idx]) 308 | print(b[idx]) 309 | print('\n') 310 | print('Plotting network reconstructions') 311 | valset_eval = self.network.predict(val_data,verbose=1) 312 | testset_eval = self.network.predict(test_data,verbose=1) 313 | frame_check = [100, 150, 200, 250, 300, 350, 400, 450, 500] 314 | 315 | for frame in frame_check: 316 | frame *= mod 317 | xx = np.arange(2049)*(22050/2049) 318 | val_yy = self.X_val[frame,0:2049] 319 | val_zz = valset_eval[frame,0:2049] 320 | test_yy = self.X_val[frame,0:2049] 321 | test_zz = valset_eval[frame,0:2049] 322 | plt.figure(1) 323 | plt.subplot(211) 324 | plt.plot(xx,val_yy) 325 | plt.ylim([0,1.2]) 326 | plt.ylabel('Spectral Magnitude') 327 | plt.xscale('log') 328 | plt.xlabel('Frequency (Hz)') 329 | plt.title('Input Spectrum') 330 | plt.subplot(212) 331 | plt.plot(xx,val_zz,color='r') 332 | plt.ylim([0,1.2]) 333 | plt.ylabel('Spectral Magnitude') 334 | plt.xscale('log') 335 | plt.xlabel('Frequency (Hz)') 336 | plt.title('Output Spectrum') 337 | plt.tight_layout() 338 | plotname = self.net_type+'_val_'+str(frame)+'.pdf' 339 | plt.savefig(plotname, format = 'pdf', bbox_inches='tight') 340 | plt.clf() 341 | 342 | plt.figure(1) 343 | plt.subplot(211) 344 | plt.plot(xx,test_yy) 345 | plt.ylim([0,1.2]) 346 | plt.ylabel('Spectral Magnitude') 347 | plt.xscale('log') 348 | plt.xlabel('Frequency (Hz)') 349 | plt.title('Input Spectrum') 350 | plt.subplot(212) 351 | plt.plot(xx,test_zz,color='r') 352 | plt.ylim([0,1.2]) 353 | plt.ylabel('Spectral Magnitude') 354 | plt.xscale('log') 355 | plt.xlabel('Frequency (Hz)') 356 | plt.title('Output Spectrum') 357 | plt.tight_layout() 358 | plotname = self.net_type+'_test_'+str(frame)+'.pdf' 359 | plt.savefig(plotname, format = 'pdf', bbox_inches='tight') 360 | plt.clf() 361 | 362 | 363 | if __name__ == '__main__': 364 | args = get_arguments() 365 | my_manne = Manne(args) 366 | if args.mode == 'train': 367 | my_manne.do_everything() 368 | else: 369 | my_manne.just_plot() 370 | 371 | 372 | 373 | 374 | 375 | 376 | -------------------------------------------------------------------------------- /manne_gui.py: -------------------------------------------------------------------------------- 1 | from Tkinter import * 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | import librosa 6 | import soundfile as sf 7 | import argparse 8 | import pyaudio 9 | import numpy as np 10 | from keras.layers import Input, Dense, LeakyReLU 11 | from keras.models import Model, load_model 12 | import tensorflow as tf 13 | import time 14 | 15 | RATE = int(44100) 16 | CHUNK = int(1024) 17 | CHANNELS = int(1) 18 | NUM_CHUNKS = 20 19 | ind = NUM_CHUNKS+1 20 | proc_ind = 1 21 | 22 | class Application(Frame): 23 | global make_sine 24 | def make_sine(seg_length,ii): 25 | global mag 26 | global phase 27 | global remember 28 | global CHUNK 29 | global encoder 30 | global enc_graph 31 | global decoder 32 | global dec_graph 33 | global scales 34 | global app 35 | 36 | print(scales) 37 | #Additional = 1 because it works, -3 because of 75% window overlap 38 | additional = 1 39 | ind_array = np.arange((seg_length*ii-3),(seg_length*(ii+1)+additional)) 40 | temp_out_mag = mag[ind_array,:] 41 | temp_phase = phase[:,ind_array] 42 | temp_remember = remember[ind_array] 43 | 44 | with enc_graph.as_default(): 45 | temp_enc_mag = encoder.predict(temp_out_mag) 46 | enc_mag = temp_enc_mag * scales #NEED TO ADD SCALE HERE 47 | with dec_graph.as_default(): 48 | temp_out_mag = decoder.predict(enc_mag) 49 | 50 | out_mag = temp_out_mag.T * temp_remember 51 | E = out_mag*np.exp(1j*temp_phase) 52 | out = np.float32(librosa.istft(E)) 53 | out = 0.8*out[3*CHUNK:] 54 | return out.reshape(((len(out)/CHUNK),CHUNK)) 55 | 56 | global callback 57 | def callback(in_data, frame_count, time_info, status): 58 | # stream.write(samples.astype(np.float32).tostring()) 59 | global ind 60 | global proc_ind 61 | global NUM_CHUNKS 62 | global all_data 63 | 64 | if ind>=NUM_CHUNKS: 65 | all_data = make_sine(NUM_CHUNKS,proc_ind) #Create new back of samples 66 | ind = 0 67 | proc_ind+=1 68 | data = all_data[ind,:] #Send a chunk to the audio buffer when it asks for one 69 | ind +=1 70 | return (data, pyaudio.paContinue) 71 | 72 | def render(self): 73 | global mag 74 | global phase 75 | global remember 76 | global CHUNK 77 | global encoder 78 | global enc_graph 79 | global decoder 80 | global dec_graph 81 | global scales 82 | global app 83 | 84 | print(scales) 85 | temp_out_mag = mag 86 | temp_phase = phase 87 | temp_remember = remember 88 | 89 | with enc_graph.as_default(): 90 | temp_enc_mag = encoder.predict(temp_out_mag) 91 | enc_mag = temp_enc_mag * scales #NEED TO ADD SCALE HERE 92 | with dec_graph.as_default(): 93 | temp_out_mag = decoder.predict(enc_mag) 94 | 95 | out_mag = temp_out_mag.T * temp_remember 96 | E = out_mag*np.exp(1j*temp_phase) 97 | out = np.float32(librosa.istft(E)) 98 | out = (0.9/np.max(np.abs(out)))*out 99 | 100 | sf.write('rendered.wav', out, 44100, subtype='PCM_16') 101 | print('done rendering') 102 | 103 | 104 | def record(self): 105 | global mag 106 | global phase 107 | global remember 108 | global CHUNK 109 | global encoder 110 | global enc_graph 111 | global decoder 112 | global dec_graph 113 | global scales 114 | global app 115 | global recorded_scales 116 | global proc_ind 117 | 118 | first_ind = 0 119 | last_ind = 0 120 | total_frames = 0 121 | if self.RECORD_var.get() == 1: 122 | first_ind = proc_ind 123 | print('Button On') 124 | self.start_net() 125 | else: 126 | last_ind = proc_ind 127 | print('Button off') 128 | self.pause_sounds() 129 | 130 | total_frames = (last_ind-first_ind)*NUM_CHUNKS 131 | out_scales = np.ones((total_frames,15)) 132 | temp_scales = np.vstack(recorded_scales) 133 | a = temp_scales.shape[0] 134 | increase_by = total_frames//a+1 135 | kurt=0 136 | for ii in range(a): 137 | the_rows = np.arange((kurt*increase_by),min(((kurt+1)*increase_by),total_frames)) 138 | out_scales[the_rows,:] = np.tile(temp_scales[ii,:],(len(the_rows),1)) 139 | kurt+=1 140 | ind_array = np.arange((first_ind),(NUM_CHUNKS*(last_ind))) 141 | temp_out_mag = mag[ind_array,:] 142 | temp_phase = phase[:,ind_array] 143 | temp_remember = remember[ind_array] 144 | 145 | with enc_graph.as_default(): 146 | temp_enc_mag = encoder.predict(temp_out_mag) 147 | enc_mag = temp_enc_mag * out_scales #NEED TO ADD SCALE HERE 148 | with dec_graph.as_default(): 149 | temp_out_mag = decoder.predict(enc_mag) 150 | 151 | out_mag = temp_out_mag.T * temp_remember 152 | E = out_mag*np.exp(1j*temp_phase) 153 | out = np.float32(librosa.istft(E)) 154 | out = (0.9/np.max(np.abs(out)))*out 155 | 156 | sf.write('recorded.wav', out, 44100, subtype='PCM_16') 157 | print('done recording') 158 | 159 | 160 | 161 | def model_to_mem(self): 162 | global encoder 163 | global enc_graph 164 | global decoder 165 | global dec_graph 166 | 167 | data_path_enc = os.path.join(os.getcwd(),self.model_name.get()+'_trained_encoder.h5') 168 | encoder = load_model(data_path_enc, compile=False) 169 | encoder._make_predict_function() 170 | enc_graph = tf.get_default_graph() 171 | data_path_dec = os.path.join(os.getcwd(),self.model_name.get()+'_trained_decoder.h5') 172 | decoder = load_model(data_path_dec, compile=False) 173 | decoder._make_predict_function() 174 | dec_graph = tf.get_default_graph() 175 | return encoder, decoder 176 | 177 | def process_track(self): 178 | global mag 179 | global phase 180 | global remember 181 | 182 | len_window = 4096 #Specified length of analysis window 183 | hop_length_ = 1024 #Specified percentage hop length between windows 184 | 185 | filename_in = self.track_name.get() 186 | data_path = os.path.join(os.getcwd(),filename_in) 187 | y, sr = librosa.load(data_path, sr=44100, mono=True) 188 | 189 | D = librosa.stft(y,n_fft=len_window, window='hann') 190 | mag = D 191 | mag = np.abs(mag) #Magnitude response of the STFT 192 | remember = mag.max(axis=0)+0.000000001 #Used for normalizing STFT frames (with addition to avoid division by zero) 193 | mag = mag / remember #Normalizing 194 | phase = np.angle(D) #Phase response of STFT 195 | mag = mag.T 196 | 197 | return mag, phase, remember 198 | 199 | 200 | 201 | def start_net(self): 202 | global p 203 | global stream 204 | self.model_to_mem() 205 | mag, phase, remember = self.process_track() 206 | 207 | p = pyaudio.PyAudio() 208 | 209 | stream = p.open(format=pyaudio.paFloat32, 210 | channels=CHANNELS, 211 | frames_per_buffer=CHUNK, 212 | rate=RATE, 213 | output=True, 214 | stream_callback=callback) 215 | 216 | 217 | stream.start_stream() 218 | time.sleep(0.1) 219 | 220 | def pause_sounds(self): 221 | global p 222 | global stream 223 | global ind 224 | global proc_ind 225 | 226 | stream.stop_stream() 227 | print('sounds paused') 228 | stream.close() 229 | p.terminate() 230 | ind = NUM_CHUNKS+1 231 | proc_ind = 0 232 | 233 | def quit(self): 234 | root.destroy() 235 | 236 | 237 | def createWidgets(self): 238 | self.QUIT = Button(self) 239 | self.QUIT["text"] = "QUIT" 240 | self.QUIT["fg"] = "red" 241 | self.QUIT["command"] = self.quit 242 | self.QUIT.pack() 243 | self.QUIT.place(relx=0.45,rely=0.9) 244 | 245 | self.model_name = Entry(self) 246 | self.model_name.pack() 247 | self.model_name.place(relx=0.4,rely=0.65) 248 | self.label_1 = Label(self,text='Model Name') 249 | self.label_1.pack() 250 | self.label_1.place(relx=0.25,rely=0.65) 251 | 252 | self.track_name = Entry(self) 253 | self.track_name.pack() 254 | self.track_name.place(relx=0.4,rely=0.6) 255 | self.label_1 = Label(self,text='Track Name') 256 | self.label_1.pack() 257 | self.label_1.place(relx=0.25,rely=0.6) 258 | 259 | self.START = Button(self) 260 | self.START["text"] = "START" 261 | self.START["fg"] = "green" 262 | self.START["command"] = lambda: self.start_net() 263 | self.START.pack() 264 | self.START.place(relx=0.45,rely=0.85) 265 | 266 | self.PAUSE = Button(self) 267 | self.PAUSE["text"] = "PAUSE" 268 | self.PAUSE["fg"] = "black" 269 | self.PAUSE["command"] = lambda: self.pause_sounds() 270 | self.PAUSE.pack() 271 | self.PAUSE.place(relx=0.45,rely=0.8) 272 | 273 | self.RECORD_var = IntVar() 274 | self.RECORD = Checkbutton(self, variable=self.RECORD_var) 275 | self.RECORD["text"] = "RECORD" 276 | self.RECORD["fg"] = "black" 277 | self.RECORD["command"] = lambda: self.record() 278 | self.RECORD.pack() 279 | self.RECORD.place(relx=0.45,rely=0.75) 280 | 281 | self.RENDER = Button(self) 282 | self.RENDER["text"] = "RENDER" 283 | self.RENDER["fg"] = "black" 284 | self.RENDER["command"] = lambda: self.render() 285 | self.RENDER.pack() 286 | self.RENDER.place(relx=0.45,rely=0.7) 287 | 288 | 289 | 290 | def createSliders(self): 291 | global scales 292 | scales = np.ones(15) 293 | self.scale_list = [] 294 | for w in range(15): 295 | scale = Scale(self,from_=40, to=-10,length=200) 296 | scale.pack() 297 | scale.place(relx=w/16.,rely=0.2) 298 | scale.set(10) 299 | scales[w]=scale.get() 300 | self.scale_list.append(scale) 301 | 302 | def update_scales(self): 303 | global scales 304 | global recorded_scales 305 | 306 | temp_scales = np.ones(15) 307 | for w in range(15): 308 | temp_scales[w]=self.scale_list[w].get()/10. 309 | scales = temp_scales 310 | if self.RECORD_var.get() == 1: 311 | recorded_scales.append(scales) 312 | self.after(250, self.update_scales) 313 | 314 | 315 | def __init__(self, master=None): 316 | global recorded_scales 317 | 318 | Frame.__init__(self, master,width=800, height=800) 319 | self.pack() 320 | self.createWidgets() 321 | self.createSliders() 322 | recorded_scales = [] 323 | self.update_scales() 324 | 325 | global app 326 | root = Tk() 327 | app = Application(master=root) 328 | app.mainloop() 329 | root.destroy() -------------------------------------------------------------------------------- /models/ae_noskip_chroma_sig_oo_trained_decoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_noskip_chroma_sig_oo_trained_decoder.h5 -------------------------------------------------------------------------------- /models/ae_noskip_chroma_sig_oo_trained_encoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_noskip_chroma_sig_oo_trained_encoder.h5 -------------------------------------------------------------------------------- /models/ae_skip_oo_trained_decoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_skip_oo_trained_decoder.h5 -------------------------------------------------------------------------------- /models/ae_skip_oo_trained_encoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_skip_oo_trained_encoder.h5 -------------------------------------------------------------------------------- /models/foo.txt: -------------------------------------------------------------------------------- 1 | hi 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.1 2 | astor==0.8.0 3 | audioread==2.1.8 4 | backports.functools-lru-cache==1.5 5 | backports.weakref==1.0.post1 6 | cffi==1.12.3 7 | cycler==0.10.0 8 | decorator==4.4.0 9 | enum34==1.1.6 10 | funcsigs==1.0.2 11 | futures==3.3.0 12 | gast==0.3.2 13 | grpcio==1.24.1 14 | h5py==2.10.0 15 | joblib==0.14.0 16 | Keras==2.2.4 17 | Keras-Applications==1.0.8 18 | Keras-Preprocessing==1.1.0 19 | kiwisolver==1.1.0 20 | librosa==0.7.0 21 | llvmlite==0.29.0 22 | Markdown==3.1.1 23 | matplotlib==2.2.4 24 | mock==3.0.5 25 | numba==0.45.1 26 | numpy==1.16.5 27 | pandas==0.24.2 28 | pkg-resources==0.0.0 29 | protobuf==3.10.0 30 | PyAudio==0.2.11 31 | pycparser==2.19 32 | pyo==1.0.0 33 | pyparsing==2.4.2 34 | python-dateutil==2.8.0 35 | pytz==2019.3 36 | PyYAML==5.1.2 37 | resampy==0.2.2 38 | scikit-learn==0.20.4 39 | scipy==1.2.2 40 | singledispatch==3.4.0.3 41 | six==1.12.0 42 | SoundFile==0.10.2 43 | subprocess32==3.5.4 44 | tensorboard==1.9.0 45 | tensorflow==1.15.2 46 | termcolor==1.1.0 47 | tqdm==4.36.1 48 | Werkzeug==0.16.0 49 | -------------------------------------------------------------------------------- /synth_manne_gui.py: -------------------------------------------------------------------------------- 1 | from Tkinter import * 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | import librosa 6 | import soundfile as sf 7 | import argparse 8 | import pyaudio 9 | import numpy as np 10 | from keras.layers import Input, Dense, LeakyReLU 11 | from keras.models import Model, load_model 12 | import tensorflow as tf 13 | import time 14 | from scipy import signal 15 | 16 | RATE = int(44100) 17 | CHUNK = int(1024) 18 | CHANNELS = int(1) 19 | NUM_CHUNKS = 5 20 | ind = NUM_CHUNKS+1 21 | proc_ind = 1 22 | crossfade_time = int(CHUNK*3) 23 | fade_in = np.log(np.linspace(1, 2.71, crossfade_time)) 24 | fade_out = np.log(np.linspace(2.71, 1, crossfade_time)) 25 | threshold = 1e-3 26 | pie = np.pi 27 | relative_height = 0.01 28 | len_window = 4096 29 | width_ = (len_window/2)/np.sqrt(-2*np.log(relative_height)) 30 | freq_time_ratio = -1*(pie/4)*(np.power(len_window,2)/np.log(relative_height)) 31 | last_three_der_frames = np.zeros((2049,3)) 32 | last_three_phase_frames = np.zeros((2049,3)) 33 | wn_phase = np.load('wn_phase.npy') 34 | print('All wn phase shape') 35 | print(wn_phase.shape) 36 | 37 | class Application(Frame): 38 | 39 | global make_sine 40 | def make_sine(seg_length,ii): 41 | global mag 42 | global phase 43 | global remember 44 | global CHUNK 45 | global encoder 46 | global enc_graph 47 | global decoder 48 | global dec_graph 49 | global scales 50 | global recorded_scales 51 | global app 52 | global POLL_TIME 53 | global RATE 54 | global fade_in 55 | global fade_out 56 | global wn_phase 57 | global chroma_choice 58 | global num_latents 59 | global skip 60 | 61 | additional = 4 62 | enc_mag = scales*np.ones((1,num_latents)) 63 | if skip=='skip': 64 | chroma_append = np.zeros((1,12)) 65 | chroma_append[0,chroma_choice] = 1 66 | enc_mag = np.hstack((enc_mag,chroma_append)) 67 | 68 | 69 | ind_array = np.arange((seg_length*ii-3),(seg_length*(ii+1)+1)) 70 | temp_phase = wn_phase[ind_array,:] 71 | with dec_graph.as_default(): 72 | temp_out_mag = decoder.predict(enc_mag) 73 | temp_out_mag = np.tile(temp_out_mag,(NUM_CHUNKS+additional,1)) 74 | E = temp_out_mag*np.exp(1j*temp_phase) 75 | _, now_out = signal.istft(E.T, fs=44100, noverlap=3*1024, nfft=4096) 76 | out = np.float32(now_out[3*CHUNK:]*(0.08/np.max(np.abs(now_out)))) 77 | final_out = out.reshape(((len(out)/CHUNK),CHUNK)) 78 | return final_out 79 | 80 | global callback 81 | def callback(in_data, frame_count, time_info, status): 82 | global ind 83 | global proc_ind 84 | global NUM_CHUNKS 85 | global all_data 86 | 87 | if ind>=(NUM_CHUNKS-1): 88 | all_data = make_sine(NUM_CHUNKS,proc_ind) 89 | ind = 0 90 | proc_ind+=1 91 | data = all_data[ind,:] #Send a chunk to the audio buffer when it asks for one 92 | ind +=1 93 | return (data, pyaudio.paContinue) 94 | 95 | def render(self): 96 | global mag 97 | global phase 98 | global remember 99 | global CHUNK 100 | global encoder 101 | global enc_graph 102 | global decoder 103 | global dec_graph 104 | global scales 105 | global app 106 | global recorded_scales 107 | 108 | print(scales) 109 | ind_array = np.arange((1000),(1000*(2))) 110 | temp_phase = wn_phase[ind_array,:] 111 | enc_mag = scales*np.ones((1,15)) 112 | with dec_graph.as_default(): 113 | temp_out_mag = decoder.predict(enc_mag) 114 | temp_out_mag = np.tile(temp_out_mag,(200,1)) 115 | E = temp_out_mag*np.exp(1j*temp_phase) 116 | _, now_out = signal.istft(E.T, fs=44100, noverlap=3*1024, nfft=4096) 117 | out = np.float32(now_out[3*CHUNK:]*(0.08/np.max(np.abs(now_out)))) 118 | 119 | sf.write('rendered.wav', out, 44100, subtype='PCM_16') 120 | print('done rendering') 121 | 122 | def record(self): 123 | global mag 124 | global phase 125 | global remember 126 | global CHUNK 127 | global encoder 128 | global enc_graph 129 | global decoder 130 | global dec_graph 131 | global scales 132 | global app 133 | global recorded_scales 134 | global proc_ind 135 | 136 | first_ind = 0 137 | last_ind = 0 138 | total_frames = 0 139 | if self.RECORD_var.get() == 1: 140 | first_ind = proc_ind 141 | print('Button On') 142 | self.start_net() 143 | else: 144 | last_ind = proc_ind 145 | print('Button off') 146 | self.pause_sounds() 147 | 148 | total_frames = (last_ind-first_ind)*NUM_CHUNKS 149 | out_scales = np.ones((total_frames,15)) 150 | temp_scales = np.vstack(recorded_scales) 151 | a = temp_scales.shape[0] 152 | increase_by = total_frames//a+1 153 | kurt=0 154 | for ii in range(a): 155 | the_rows = np.arange((kurt*increase_by),min(((kurt+1)*increase_by),total_frames)) 156 | out_scales[the_rows,:] = np.tile(temp_scales[ii,:],(len(the_rows),1)) 157 | kurt+=1 158 | ind_array = np.arange((first_ind),(NUM_CHUNKS*(last_ind))) 159 | temp_phase = wn_phase[ind_array,:] 160 | 161 | with dec_graph.as_default(): 162 | temp_out_mag = decoder.predict(out_scales) 163 | 164 | E = temp_out_mag*np.exp(1j*temp_phase) 165 | _, now_out = signal.istft(E.T, fs=44100, noverlap=3*1024, nfft=4096) 166 | out = np.float32(now_out[3*CHUNK:]*(0.8/np.max(np.abs(now_out)))) 167 | 168 | sf.write('recorded.wav', out, 44100, subtype='PCM_16') 169 | print('done recording') 170 | 171 | 172 | 173 | def model_to_mem(self): 174 | global decoder 175 | global dec_graph 176 | 177 | data_path = os.path.join(os.getcwd(),self.model_name.get()+'_trained_decoder.h5') 178 | decoder = load_model(data_path, compile=False) 179 | decoder._make_predict_function() 180 | dec_graph = tf.get_default_graph() 181 | 182 | def process_track(self): 183 | global mag 184 | global phase 185 | global remember 186 | 187 | len_window = 4096 #Specified length of analysis window 188 | hop_length_ = 1024 #Specified percentage hop length between windows 189 | 190 | filename_in = self.track_name.get() 191 | data_path = os.path.join(os.getcwd(),filename_in) 192 | y, sr = librosa.load(data_path, sr=44100, mono=True) 193 | 194 | D = librosa.stft(y,n_fft=len_window, window='hann') 195 | mag = D 196 | mag = np.abs(mag) #Magnitude response of the STFT 197 | remember = mag.max(axis=0)+0.000000001 #Used for normalizing STFT frames (with addition to avoid division by zero) 198 | mag = mag / remember #Normalizing 199 | phase = np.angle(D) #Phase response of STFT 200 | mag = mag.T 201 | 202 | return mag, phase, remember 203 | 204 | 205 | 206 | def start_net(self): 207 | global p 208 | global stream 209 | self.model_to_mem() 210 | 211 | p = pyaudio.PyAudio() 212 | 213 | stream = p.open(format=pyaudio.paFloat32, 214 | channels=CHANNELS, 215 | frames_per_buffer=CHUNK, 216 | rate=RATE, 217 | output=True, 218 | stream_callback=callback) 219 | 220 | 221 | stream.start_stream() 222 | time.sleep(0.1) 223 | 224 | def pause_sounds(self): 225 | global p 226 | global stream 227 | global ind 228 | global proc_ind 229 | 230 | stream.stop_stream() 231 | print('sounds paused') 232 | stream.close() 233 | p.terminate() 234 | ind = NUM_CHUNKS+1 235 | proc_ind = 0 236 | 237 | def quit(self): 238 | root.destroy() 239 | 240 | 241 | def createWidgets(self): 242 | self.QUIT = Button(self) 243 | self.QUIT["text"] = "QUIT" 244 | self.QUIT["fg"] = "red" 245 | self.QUIT["command"] = self.quit 246 | self.QUIT.pack() 247 | self.QUIT.place(relx=0.45,rely=0.9) 248 | 249 | self.model_name = Entry(self) 250 | self.model_name.pack() 251 | self.model_name.place(relx=0.4,rely=0.65) 252 | self.label_1 = Label(self,text='Model Name') 253 | self.label_1.pack() 254 | self.label_1.place(relx=0.25,rely=0.65) 255 | 256 | self.START = Button(self) 257 | self.START["text"] = "START" 258 | self.START["fg"] = "green" 259 | self.START["command"] = lambda: self.start_net() 260 | self.START.pack() 261 | self.START.place(relx=0.45,rely=0.85) 262 | 263 | self.PAUSE = Button(self) 264 | self.PAUSE["text"] = "PAUSE" 265 | self.PAUSE["fg"] = "black" 266 | self.PAUSE["command"] = lambda: self.pause_sounds() 267 | self.PAUSE.pack() 268 | self.PAUSE.place(relx=0.45,rely=0.8) 269 | 270 | self.RECORD_var = IntVar() 271 | self.RECORD = Checkbutton(self, variable=self.RECORD_var) 272 | self.RECORD["text"] = "RECORD" 273 | self.RECORD["fg"] = "black" 274 | self.RECORD["command"] = lambda: self.record() 275 | self.RECORD.pack() 276 | self.RECORD.place(relx=0.45,rely=0.75) 277 | 278 | self.RENDER = Button(self) 279 | self.RENDER["text"] = "RENDER" 280 | self.RENDER["fg"] = "black" 281 | self.RENDER["command"] = lambda: self.render() 282 | self.RENDER.pack() 283 | self.RENDER.place(relx=0.45,rely=0.7) 284 | 285 | 286 | 287 | def createSliders(self): 288 | global scales 289 | global num_latents 290 | scales = np.ones(num_latents) 291 | self.scale_list = [] 292 | for w in range(num_latents): 293 | scale = Scale(self,from_=110, to=-10,length=200) 294 | scale.pack() 295 | scale.place(relx=w/(float(num_latents)),rely=0.2) 296 | scale.set(0) 297 | scales[w]=scale.get() 298 | self.scale_list.append(scale) 299 | 300 | def createButtons(self): 301 | global chroma_val 302 | self.chroma_val = IntVar() 303 | self.chroma_val.set(0) 304 | NOTE_OPTIONS = [ 305 | ('C',0), 306 | ('C#',1), 307 | ('D',2), 308 | ('D#',3), 309 | ('E',4), 310 | ('F',5), 311 | ('F#',6), 312 | ('G',7), 313 | ('G#',8), 314 | ('A',9), 315 | ('A#',10), 316 | ('B',11) 317 | ] 318 | 319 | for text, val in NOTE_OPTIONS: 320 | b = Radiobutton(self, text=text, value=val, variable=self.chroma_val) 321 | b.pack() 322 | b.place(relx=0.2+val/19.,rely=0.1) 323 | 324 | 325 | def update_scales(self): 326 | global scales 327 | global recorded_scales 328 | global POLL_TIME 329 | global chroma_choice 330 | global num_latents 331 | 332 | POLL_TIME = 100 333 | chroma_choice = self.chroma_val.get() 334 | temp_scales = np.ones(8) 335 | for w in range(num_latents): 336 | temp_scales[w]=self.scale_list[w].get()/300. 337 | scales = temp_scales 338 | if self.RECORD_var.get() == 1: 339 | recorded_scales.append(scales) 340 | self.after(POLL_TIME, self.update_scales) 341 | 342 | 343 | def __init__(self, master=None): 344 | global recorded_scales 345 | 346 | Frame.__init__(self, master,width=800, height=800) 347 | self.pack() 348 | self.createWidgets() 349 | self.createButtons() 350 | self.createSliders() 351 | recorded_scales = [] 352 | self.update_scales() 353 | 354 | global app 355 | global num_latents 356 | global skip 357 | num_latents = int(sys.argv[1]) 358 | skip = sys.argv[2] 359 | root = Tk() 360 | app = Application(master=root) 361 | app.mainloop() 362 | root.destroy() -------------------------------------------------------------------------------- /wav2frames.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pickle 4 | import os 5 | import librosa 6 | import librosa.display 7 | import scipy as sci 8 | import argparse 9 | 10 | def get_arguments(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--filename_in', type=str) 13 | parser.add_argument('--filename_out', type=str) 14 | return parser.parse_args() 15 | 16 | args = get_arguments() 17 | 18 | len_window = 4096 #Specified length of analysis window 19 | hop_length_ = 1024 #Specified percentage hop length between windows 20 | 21 | filename_in = args.filename_in 22 | filename_out = args.filename_out 23 | data_path = os.path.join(os.getcwd(),filename_in) 24 | y, sr = librosa.load(data_path, sr=44100) 25 | 26 | D = librosa.stft(y,n_fft=4096, window='hann') 27 | print(D.shape) 28 | temp = D[:,:] 29 | phase = np.angle(temp) 30 | temp = np.abs(temp) 31 | temp = temp / (temp.max(axis=0)+0.000000001) 32 | print(temp.max(axis=0)) 33 | temp = np.transpose(temp) 34 | phase = np.transpose(phase) 35 | print(np.shape(temp)) 36 | output = temp[~np.all(temp == 0, axis=1)] 37 | out_phase = phase[~np.all(temp == 0, axis=1)] 38 | print(np.shape(output)) 39 | np.save(filename_out+'.npy',output) 40 | np.save(filename_out+'_phase.npy',out_phase) 41 | --------------------------------------------------------------------------------