├── README.md
├── frames
│   └── foo.txt
├── leaky_big_training.py
├── manne_gui.py
├── models
│   ├── ae_noskip_chroma_sig_oo_trained_decoder.h5
│   ├── ae_noskip_chroma_sig_oo_trained_encoder.h5
│   ├── ae_skip_oo_trained_decoder.h5
│   ├── ae_skip_oo_trained_encoder.h5
│   └── foo.txt
├── requirements.txt
├── synth_manne_gui.py
└── wav2frames.py
/README.md:
--------------------------------------------------------------------------------
# manne
Remaking My ANNe effect

Tested with Python 2.7.12

I suggest using a virtual environment to ensure that all packages are correct:

```
mkdir venv
virtualenv venv
source venv/bin/activate
pip install -r requirements.txt
```

This application also requires ffmpeg and portaudio19-dev (ffmpeg for audio decoding, portaudio19-dev to build PyAudio).

To start the program, run

```
python manne_gui.py
```

Type the relative path of the track you would like to filter into the "Track Name" box.

Type the prefix of the trained model you would like to run (in this case just ```all_frames```) into the "Model Name" box. The prefix is resolved relative to the current directory, so a prefix like ```models/ae_skip_oo``` points at ```models/ae_skip_oo_trained_encoder.h5``` and its matching decoder.

Clicking "START" will start filtering the track through the neural network and playing the audio out in real time. Change the values of the sliders to change the latent representation of the audio.

Clicking "PAUSE" will pause the audio output and freeze the track where it is. Clicking "START" again should resume the track.

To render an entire track with fixed latent activations, click "RENDER". The song will be output as "rendered.wav" in your given directory, as a mono WAV file (16-bit PCM, 44.1 kHz).

To begin a recording of you altering the latents as the track plays, click "RECORD" and begin moving the sliders.
To end a recording, click the "RECORD" button again so that it is unchecked. The recorded audio will be output as "recorded.wav" in your given directory, as a mono WAV file (16-bit PCM, 44.1 kHz).

Clicking "QUIT" will close the application.
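
## Training your own model

Training is handled by ```wav2frames.py``` and ```leaky_big_training.py``` rather than the GUI. The exact invocations aren't documented here, but judging from the scripts' argparse flags, a run looks something like this (hypothetical filenames; the dataset name must be one of ```one_octave```, ```five_octave```, ```guitar```, or ```violin```, because the train/validation/test split in ```leaky_big_training.py``` is hard-coded per dataset):

```
python wav2frames.py --filename_in my_recording.wav --filename_out frames/one_octave_frames
python leaky_big_training.py --filename_in one_octave --filename_out oo --net_type ae --mode train --n_epochs 30
```

The trained encoder and decoder are saved under ```models/``` with the ```net_type``` and ```filename_out``` joined into the prefix (here ```models/ae_oo```).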

^_^

"MANNe" pronunciation guide: https://www.youtube.com/watch?v=EmZvOhHF85I&feature=youtu.be&t=5
--------------------------------------------------------------------------------
/frames/foo.txt:
--------------------------------------------------------------------------------
helo
--------------------------------------------------------------------------------
/leaky_big_training.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from keras.layers import Input, Dense, Lambda, Concatenate, Dropout, LeakyReLU
from keras.models import Model, Sequential, load_model, clone_model

from keras.regularizers import l2
from keras.losses import mse
from keras.callbacks import LambdaCallback
from keras.optimizers import Adam
from keras.utils import plot_model
from keras import backend as K
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import argparse
import os
import librosa
import tensorflow as tf

global alpha
global beta
beta = K.variable(3e-7)  # KL weight for the beta-VAE loss, warmed up during training
alpha = K.variable(0.3)  # Set here and zeroed at epoch 30, but currently unused by the losses below

def change_params(epoch, logs):
    # Warm beta up over the first few epochs; zero alpha at epoch 30
    if epoch <= 5:
        K.set_value(beta, K.get_value(beta)+2e-5)
    if epoch == 30:
        K.set_value(alpha, 0.0)

def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename_in', type=str)
    parser.add_argument('--filename_out', type=str)
    parser.add_argument('--net_type', type=str)
    parser.add_argument('--mode', type=str)
    parser.add_argument('--trained_model_name', type=str, default='')
    parser.add_argument('--n_epochs', type=int, default=5)
    # Beware: argparse's type=bool treats any non-empty string as True
    # (including '--skip False'); omit the flag entirely to train without
    # skip connections
    parser.add_argument('--skip', type=bool, default=False)
    return parser.parse_args()
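
# Example invocation (hypothetical names; assumes frames/one_octave_frames.npy
# was produced by wav2frames.py beforehand):
#   python leaky_big_training.py --filename_in one_octave --filename_out oo \
#       --net_type vae --mode train --n_epochs 30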

class Manne:
    def __init__(self, args):
        self.frames = []
        self.X_train = []
        self.X_val = []
        self.X_test = []
        self.encoder = []
        self.decoder = []
        self.network = []
        self.encoder_widths = []
        self.decoder_widths = []

        # Placeholders sized to the 8-dimensional latent space
        self.z_mean = K.placeholder(shape=(8,))
        self.z_log_var = K.placeholder(shape=(8,))
        self.beta_changer = []

        self.n_epochs = args.n_epochs
        self.net_type = args.net_type
        self.skip = args.skip
        self.filename_in = args.filename_in
        self.filename_out = args.filename_out
        self.trained_model_name = args.trained_model_name

    def do_everything(self):
        self.load_dataset()
        self.define_net()
        self.make_net()
        self.train_net()
        self.evaluate_net()
        self.save_latents()

    def just_plot(self):
        self.load_dataset()
        self.load_net()
        self.make_net()
        adam_rate = 5e-4
        self.network.compile(optimizer=Adam(lr=adam_rate), loss=self.my_mse, metrics=[self.my_mse])
        self.evaluate_net()
        self.save_latents()

    def sampling(self, args):
        # Reparameterization trick: draw z from N(z_mean, exp(z_log_var))
        self.z_mean, self.z_log_var = args
        batch = K.shape(self.z_mean)[0]
        dim = K.int_shape(self.z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return self.z_mean + K.exp(0.5*self.z_log_var)*epsilon

    def get_loss(self, inputs, outputs):
        # Reconstruction MSE on the 2049 spectral columns plus beta-weighted KL
        global beta
        reconstruction_loss = mse(inputs[:,:2049], outputs)
        kl_loss = 1 + self.z_log_var - K.square(self.z_mean) - K.exp(self.z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5*beta
        vae_loss = K.sum(reconstruction_loss + kl_loss)
        return vae_loss

    def my_mse(self, inputs, outputs):
        return mse(inputs[:,:2049], outputs)

    def my_kl(self, inputs, outputs):
        kl_loss = 1 + self.z_log_var - K.square(self.z_mean) - K.exp(self.z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        return kl_loss
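    # A sketch of the beta-VAE objective the three methods above implement, for
    # encoder outputs mu (z_mean) and log-variance s (z_log_var):
    #   z    = mu + exp(0.5*s) * eps,  eps ~ N(0, I)      (reparameterization)
    #   loss = MSE(x[:, :2049], x_hat) + beta * (-0.5) * sum(1 + s - mu^2 - exp(s))
    # beta starts at 3e-7 and is warmed up by change_params during training.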
    def load_net(self):
        enc_filename = os.path.join(os.getcwd(), 'models/'+self.trained_model_name+'_trained_encoder.h5')
        print(enc_filename)
        self.encoder = load_model(enc_filename, custom_objects={'sampling': self.sampling}, compile=False)
        dec_filename = os.path.join(os.getcwd(), 'models/'+self.trained_model_name+'_trained_decoder.h5')
        self.decoder = load_model(dec_filename, custom_objects={'sampling': self.sampling}, compile=False)

    def load_dataset(self):
        filename = 'frames/'+self.filename_in+'_frames.npy'  # Static data used for training the net
        filepath = os.path.join(os.getcwd(), filename)
        orig_frames = np.load(filepath)
        orig_frames = np.asarray(orig_frames)
        len_frames = orig_frames.shape[0]

        # One-hot chroma vector per frame, appended as an augmentation
        chroma = librosa.feature.chroma_stft(S=np.transpose(orig_frames), sr=44100)
        chroma = (chroma == chroma.max(axis=1)[:,None]).astype(int)
        chroma = np.transpose(chroma)
        augmentations = chroma

        self.frames = np.hstack((orig_frames, augmentations))

        # Hard-coded train/validation/test splits for each dataset
        if self.filename_in == 'one_octave':
            self.X_train = self.frames[:16685,:]
            self.X_val = self.frames[16685:17998,:]
            self.X_test = self.frames[17998:,:]
        elif self.filename_in == 'five_octave':
            self.X_train = self.frames[:78991,:]
            self.X_val = self.frames[78991:84712,:]
            self.X_test = self.frames[84712:,:]
        elif self.filename_in == 'guitar':
            self.X_train = self.frames[:62018,:]
            self.X_val = self.frames[62018:66835,:]
            self.X_test = self.frames[66835:,:]
        elif self.filename_in == 'violin':
            self.X_train = self.frames[:90571,:]
            self.X_val = self.frames[90571:100912,:]
            self.X_test = self.frames[100912:,:]
        else:
            raise Exception('Unexpected filename_in')

    def define_net(self):
        if self.net_type == 'vae':
            l2_penalty = 0
        else:
            l2_penalty = 1e-7

        # 8-neuron model from the paper
        self.encoder_widths = [1024,512,256,128,64,32,16,8]
        self.decoder_widths = [16,32,64,128,256,512,1024]

        # Lighter-weight model
        #self.encoder_widths = [512,256,128,64,8]
        #self.decoder_widths = [64,128,256,512]

        decoder_outdim = 2049
        drop = 0.0
        alpha_val = 0.1

        input_spec = Input(shape=(self.frames.shape[1],))
        encoded = Dense(units=self.encoder_widths[0],
                activation=None,
                kernel_regularizer=l2(l2_penalty))(input_spec)
        encoded = LeakyReLU(alpha=alpha_val)(encoded)
        for width in self.encoder_widths[1:-1]:
            encoded = Dense(units=width,
                    activation=None,
                    kernel_regularizer=l2(l2_penalty))(encoded)
            encoded = LeakyReLU(alpha=alpha_val)(encoded)

        encoded = Dense(units=self.encoder_widths[-1], activation='sigmoid', kernel_regularizer=l2(l2_penalty))(encoded)

        if self.net_type == 'vae':
            self.z_mean = Dense(self.encoder_widths[-1], input_shape=(self.encoder_widths[-1],), name='z_mean')(encoded)
            self.z_log_var = Dense(self.encoder_widths[-1], input_shape=(self.encoder_widths[-1],), name='z_log_var')(encoded)
            z = Lambda(self.sampling, output_shape=(self.encoder_widths[-1],), name='z')([self.z_mean, self.z_log_var])
            self.encoder = Model(input_spec, [self.z_mean, self.z_log_var, z])
        else:
            self.encoder = Model(input_spec, encoded)

        if self.skip == True:
            # Skip models decode from the latents plus the 12 chroma bins
            input_latent = Input(shape=(self.encoder_widths[-1]+12,))
        else:
            input_latent = Input(shape=(self.encoder_widths[-1],))

        decoded = Dense(units=self.decoder_widths[0],
                activation=None,
                kernel_regularizer=l2(l2_penalty))(input_latent)
        decoded = LeakyReLU(alpha=alpha_val)(decoded)
        for width in self.decoder_widths[1:]:
            decoded = Dense(units=width,
                    activation=None,
                    kernel_regularizer=l2(l2_penalty))(decoded)
            decoded = LeakyReLU(alpha=alpha_val)(decoded)
        decoded = Dense(units=2049,
                activation='relu',
                kernel_regularizer=l2(l2_penalty))(decoded)
        self.decoder = Model(input_latent, decoded)

    def make_net(self):
        auto_input = Input(shape=(self.frames.shape[1],))
        encoded = self.encoder(auto_input)

        if self.net_type == 'vae':
            latents = encoded[2]
        else:
            latents = encoded

        if self.skip == True:
            chroma_input = Input(shape=(12,))
            new_latents = Concatenate()([latents, chroma_input])
            decoded = self.decoder(new_latents)
            self.network = Model(inputs=[auto_input, chroma_input], outputs=decoded)
        else:
            decoded = self.decoder(latents)
            self.network = Model(inputs=[auto_input], outputs=decoded)

        print('\n net summary \n')
        self.network.summary()
        print('\n encoder summary \n')
        self.encoder.summary()
        print('\n decoder summary \n')
        self.decoder.summary()

    def train_net(self):
        adam_rate = 5e-4
        if self.skip == True:  # Keras expects two inputs for the skip models
            train_data = [self.X_train, self.X_train[:,-12:]]
            val_data = [self.X_val, self.X_val[:,-12:]]
        else:
            train_data = self.X_train
            val_data = self.X_val
        if self.net_type == 'vae':
            beta_changer = LambdaCallback(on_epoch_end=change_params)
            self.network.compile(optimizer=Adam(lr=adam_rate), loss=self.get_loss, metrics=[self.my_mse, self.my_kl])
            self.network.fit(x=train_data, y=self.X_train,
                    epochs=self.n_epochs,
                    batch_size=200,
                    shuffle=True,
                    validation_data=(val_data, self.X_val),
                    callbacks=[beta_changer]
                    )
        else:
            alpha_changer = LambdaCallback(on_epoch_end=change_params)
            self.network.compile(optimizer=Adam(lr=adam_rate), loss=self.my_mse, metrics=[self.my_mse])
            self.network.fit(x=train_data, y=self.X_train,
                    epochs=self.n_epochs,
                    batch_size=200,
                    shuffle=True,
                    validation_data=(val_data, self.X_val),
                    callbacks=[alpha_changer]
                    )
        self.encoder.save('models/'+self.net_type+'_'+self.filename_out+'_trained_encoder.h5')
        self.decoder.save('models/'+self.net_type+'_'+self.filename_out+'_trained_decoder.h5')

    def save_latents(self):
        indat = self.frames
        enc_mag = self.encoder.predict(indat, verbose=1)

        if self.net_type == 'vae':
            a = enc_mag[0]
            b = enc_mag[1]
            print(a.shape)
            print(b.shape)
            enc_mag = np.hstack((enc_mag[0], enc_mag[1]))

        df = pd.DataFrame(enc_mag)
        df.to_csv('encoded_mags.csv')
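    # For reference: save_latents writes one row per frame to encoded_mags.csv;
    # for the VAE, the z_mean columns are followed by the z_log_var columns.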
    def evaluate_net(self):
        if self.skip == True:  # Keras expects two inputs for the skip models
            test_data = [self.X_test, self.X_test[:,-12:]]
            val_data = [self.X_val, self.X_val[:,-12:]]
        else:
            test_data = self.X_test
            val_data = self.X_val

        # Spacing factor so the plotted frames span each dataset
        if self.filename_in == 'one_octave':
            mod = 1
        elif self.filename_in == 'five_octave' or self.filename_in == 'violin':
            mod = 10
        elif self.filename_in == 'guitar':
            mod = 3
        else:
            mod = 1

        print('\n')
        print('Evaluating performance on validation and test sets')
        a = self.network.evaluate(x=val_data, y=self.X_val, verbose=1)
        b = self.network.evaluate(x=test_data, y=self.X_test, verbose=1)
        print('\n')
        for idx in range(len(self.network.metrics_names)):
            print('Validation '+self.network.metrics_names[idx])
            print(a[idx])
        print('\n')
        for idx in range(len(self.network.metrics_names)):
            print('Testing '+self.network.metrics_names[idx])
            print(b[idx])
        print('\n')
        print('Plotting network reconstructions')
        valset_eval = self.network.predict(val_data, verbose=1)
        testset_eval = self.network.predict(test_data, verbose=1)
        frame_check = [100, 150, 200, 250, 300, 350, 400, 450, 500]

        for frame in frame_check:
            frame *= mod
            xx = np.arange(2049)*(22050/2049)  # Bin index to frequency in Hz
            val_yy = self.X_val[frame,0:2049]
            val_zz = valset_eval[frame,0:2049]
            test_yy = self.X_test[frame,0:2049]
            test_zz = testset_eval[frame,0:2049]
            plt.figure(1)
            plt.subplot(211)
            plt.plot(xx, val_yy)
            plt.ylim([0,1.2])
            plt.ylabel('Spectral Magnitude')
            plt.xscale('log')
            plt.xlabel('Frequency (Hz)')
            plt.title('Input Spectrum')
            plt.subplot(212)
            plt.plot(xx, val_zz, color='r')
            plt.ylim([0,1.2])
            plt.ylabel('Spectral Magnitude')
            plt.xscale('log')
            plt.xlabel('Frequency (Hz)')
            plt.title('Output Spectrum')
            plt.tight_layout()
            plotname = self.net_type+'_val_'+str(frame)+'.pdf'
            plt.savefig(plotname, format='pdf', bbox_inches='tight')
            plt.clf()

            plt.figure(1)
            plt.subplot(211)
            plt.plot(xx, test_yy)
            plt.ylim([0,1.2])
            plt.ylabel('Spectral Magnitude')
            plt.xscale('log')
            plt.xlabel('Frequency (Hz)')
            plt.title('Input Spectrum')
            plt.subplot(212)
            plt.plot(xx, test_zz, color='r')
            plt.ylim([0,1.2])
            plt.ylabel('Spectral Magnitude')
            plt.xscale('log')
            plt.xlabel('Frequency (Hz)')
            plt.title('Output Spectrum')
            plt.tight_layout()
            plotname = self.net_type+'_test_'+str(frame)+'.pdf'
            plt.savefig(plotname, format='pdf', bbox_inches='tight')
            plt.clf()


if __name__ == '__main__':
    args = get_arguments()
    my_manne = Manne(args)
    if args.mode == 'train':
        my_manne.do_everything()
    else:
        my_manne.just_plot()
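
# A minimal sketch of reusing the saved models outside this script (assumed
# names from a run with --net_type ae --filename_out oo; note the encoder input
# is 2049 spectral bins plus 12 chroma bins = 2061 columns):
#   from keras.models import load_model
#   import numpy as np
#   encoder = load_model('models/ae_oo_trained_encoder.h5', compile=False)
#   decoder = load_model('models/ae_oo_trained_decoder.h5', compile=False)
#   frames = np.load('frames/one_octave_frames.npy')  # (n_frames, 2049)
#   # ...append the chroma one-hots as in Manne.load_dataset(), then:
#   latents = encoder.predict(frames_with_chroma)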
--------------------------------------------------------------------------------
/manne_gui.py:
--------------------------------------------------------------------------------
from Tkinter import *
import numpy as np
import pandas as pd
import os
import librosa
import soundfile as sf
import argparse
import pyaudio
from keras.layers import Input, Dense, LeakyReLU
from keras.models import Model, load_model
import tensorflow as tf
import time

RATE = int(44100)
CHUNK = int(1024)
CHANNELS = int(1)
NUM_CHUNKS = 20
ind = NUM_CHUNKS+1
proc_ind = 1

class Application(Frame):
    global make_sine
    def make_sine(seg_length, ii):
        global mag
        global phase
        global remember
        global CHUNK
        global encoder
        global enc_graph
        global decoder
        global dec_graph
        global scales
        global app

        print(scales)
        # additional = 1 because it works; the -3 accounts for the 75% window overlap
        additional = 1
        ind_array = np.arange((seg_length*ii-3), (seg_length*(ii+1)+additional))
        temp_out_mag = mag[ind_array,:]
        temp_phase = phase[:,ind_array]
        temp_remember = remember[ind_array]

        with enc_graph.as_default():
            temp_enc_mag = encoder.predict(temp_out_mag)
        enc_mag = temp_enc_mag * scales  # Apply the slider scalings in latent space
        with dec_graph.as_default():
            temp_out_mag = decoder.predict(enc_mag)

        out_mag = temp_out_mag.T * temp_remember
        E = out_mag*np.exp(1j*temp_phase)
        out = np.float32(librosa.istft(E))
        out = 0.8*out[3*CHUNK:]
        return out.reshape(((len(out)/CHUNK), CHUNK))

    global callback
    def callback(in_data, frame_count, time_info, status):
        global ind
        global proc_ind
        global NUM_CHUNKS
        global all_data

        if ind >= NUM_CHUNKS:
            all_data = make_sine(NUM_CHUNKS, proc_ind)  # Create a new batch of samples
            ind = 0
            proc_ind += 1
        data = all_data[ind,:]  # Send a chunk to the audio buffer when it asks for one
        ind += 1
        return (data, pyaudio.paContinue)
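    # Timing sketch: the STFT hop is 1024 samples = CHUNK, so each audio chunk
    # corresponds to one STFT frame; make_sine processes NUM_CHUNKS frames at a
    # time plus 3 lead-in frames for the 75% overlap, which are discarded as
    # 3*CHUNK samples after the inverse STFT.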
    def render(self):
        global mag
        global phase
        global remember
        global CHUNK
        global encoder
        global enc_graph
        global decoder
        global dec_graph
        global scales
        global app

        print(scales)
        temp_out_mag = mag
        temp_phase = phase
        temp_remember = remember

        with enc_graph.as_default():
            temp_enc_mag = encoder.predict(temp_out_mag)
        enc_mag = temp_enc_mag * scales  # Apply the slider scalings in latent space
        with dec_graph.as_default():
            temp_out_mag = decoder.predict(enc_mag)

        out_mag = temp_out_mag.T * temp_remember
        E = out_mag*np.exp(1j*temp_phase)
        out = np.float32(librosa.istft(E))
        out = (0.9/np.max(np.abs(out)))*out  # Peak-normalize to avoid clipping

        sf.write('rendered.wav', out, 44100, subtype='PCM_16')
        print('done rendering')

    def record(self):
        global mag
        global phase
        global remember
        global CHUNK
        global encoder
        global enc_graph
        global decoder
        global dec_graph
        global scales
        global app
        global recorded_scales
        global proc_ind

        first_ind = 0
        last_ind = 0
        total_frames = 0
        if self.RECORD_var.get() == 1:
            first_ind = proc_ind
            print('Button On')
            self.start_net()
        else:
            last_ind = proc_ind
            print('Button off')
            self.pause_sounds()

        # Stretch the recorded slider snapshots evenly over the recorded frames
        total_frames = (last_ind-first_ind)*NUM_CHUNKS
        out_scales = np.ones((total_frames, 15))
        temp_scales = np.vstack(recorded_scales)
        a = temp_scales.shape[0]
        increase_by = total_frames//a + 1
        kurt = 0
        for ii in range(a):
            the_rows = np.arange((kurt*increase_by), min(((kurt+1)*increase_by), total_frames))
            out_scales[the_rows,:] = np.tile(temp_scales[ii,:], (len(the_rows),1))
            kurt += 1
        # Frames spanned by the recording (NUM_CHUNKS frames per processing block)
        ind_array = np.arange((NUM_CHUNKS*first_ind), (NUM_CHUNKS*last_ind))
        temp_out_mag = mag[ind_array,:]
        temp_phase = phase[:,ind_array]
        temp_remember = remember[ind_array]

        with enc_graph.as_default():
            temp_enc_mag = encoder.predict(temp_out_mag)
        enc_mag = temp_enc_mag * out_scales  # Apply the recorded slider scalings
        with dec_graph.as_default():
            temp_out_mag = decoder.predict(enc_mag)

        out_mag = temp_out_mag.T * temp_remember
        E = out_mag*np.exp(1j*temp_phase)
        out = np.float32(librosa.istft(E))
        out = (0.9/np.max(np.abs(out)))*out

        sf.write('recorded.wav', out, 44100, subtype='PCM_16')
        print('done recording')

    def model_to_mem(self):
        global encoder
        global enc_graph
        global decoder
        global dec_graph

        data_path_enc = os.path.join(os.getcwd(), self.model_name.get()+'_trained_encoder.h5')
        encoder = load_model(data_path_enc, compile=False)
        encoder._make_predict_function()
        enc_graph = tf.get_default_graph()
        data_path_dec = os.path.join(os.getcwd(), self.model_name.get()+'_trained_decoder.h5')
        decoder = load_model(data_path_dec, compile=False)
        decoder._make_predict_function()
        dec_graph = tf.get_default_graph()
        return encoder, decoder

    def process_track(self):
        global mag
        global phase
        global remember

        len_window = 4096    # Length of the analysis window
        hop_length_ = 1024   # Hop length in samples between windows (librosa's default n_fft//4)

        filename_in = self.track_name.get()
        data_path = os.path.join(os.getcwd(), filename_in)
        y, sr = librosa.load(data_path, sr=44100, mono=True)

        D = librosa.stft(y, n_fft=len_window, window='hann')
        mag = D
        mag = np.abs(mag)  # Magnitude response of the STFT
        remember = mag.max(axis=0)+0.000000001  # Per-frame maxima used for normalizing (offset avoids division by zero)
        mag = mag / remember  # Normalizing
        phase = np.angle(D)  # Phase response of the STFT
        mag = mag.T

        return mag, phase, remember
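    # Normalization round trip: process_track divides each STFT frame by its
    # own peak ("remember") so the network only sees frames scaled to [0, 1],
    # and make_sine/render/record multiply the peaks back in before the
    # inverse STFT.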
self.START["text"] = "START" 261 | self.START["fg"] = "green" 262 | self.START["command"] = lambda: self.start_net() 263 | self.START.pack() 264 | self.START.place(relx=0.45,rely=0.85) 265 | 266 | self.PAUSE = Button(self) 267 | self.PAUSE["text"] = "PAUSE" 268 | self.PAUSE["fg"] = "black" 269 | self.PAUSE["command"] = lambda: self.pause_sounds() 270 | self.PAUSE.pack() 271 | self.PAUSE.place(relx=0.45,rely=0.8) 272 | 273 | self.RECORD_var = IntVar() 274 | self.RECORD = Checkbutton(self, variable=self.RECORD_var) 275 | self.RECORD["text"] = "RECORD" 276 | self.RECORD["fg"] = "black" 277 | self.RECORD["command"] = lambda: self.record() 278 | self.RECORD.pack() 279 | self.RECORD.place(relx=0.45,rely=0.75) 280 | 281 | self.RENDER = Button(self) 282 | self.RENDER["text"] = "RENDER" 283 | self.RENDER["fg"] = "black" 284 | self.RENDER["command"] = lambda: self.render() 285 | self.RENDER.pack() 286 | self.RENDER.place(relx=0.45,rely=0.7) 287 | 288 | 289 | 290 | def createSliders(self): 291 | global scales 292 | scales = np.ones(15) 293 | self.scale_list = [] 294 | for w in range(15): 295 | scale = Scale(self,from_=40, to=-10,length=200) 296 | scale.pack() 297 | scale.place(relx=w/16.,rely=0.2) 298 | scale.set(10) 299 | scales[w]=scale.get() 300 | self.scale_list.append(scale) 301 | 302 | def update_scales(self): 303 | global scales 304 | global recorded_scales 305 | 306 | temp_scales = np.ones(15) 307 | for w in range(15): 308 | temp_scales[w]=self.scale_list[w].get()/10. 309 | scales = temp_scales 310 | if self.RECORD_var.get() == 1: 311 | recorded_scales.append(scales) 312 | self.after(250, self.update_scales) 313 | 314 | 315 | def __init__(self, master=None): 316 | global recorded_scales 317 | 318 | Frame.__init__(self, master,width=800, height=800) 319 | self.pack() 320 | self.createWidgets() 321 | self.createSliders() 322 | recorded_scales = [] 323 | self.update_scales() 324 | 325 | global app 326 | root = Tk() 327 | app = Application(master=root) 328 | app.mainloop() 329 | root.destroy() -------------------------------------------------------------------------------- /models/ae_noskip_chroma_sig_oo_trained_decoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_noskip_chroma_sig_oo_trained_decoder.h5 -------------------------------------------------------------------------------- /models/ae_noskip_chroma_sig_oo_trained_encoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_noskip_chroma_sig_oo_trained_encoder.h5 -------------------------------------------------------------------------------- /models/ae_skip_oo_trained_decoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_skip_oo_trained_decoder.h5 -------------------------------------------------------------------------------- /models/ae_skip_oo_trained_encoder.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_skip_oo_trained_encoder.h5 -------------------------------------------------------------------------------- /models/foo.txt: 
--------------------------------------------------------------------------------
/models/ae_noskip_chroma_sig_oo_trained_decoder.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_noskip_chroma_sig_oo_trained_decoder.h5
--------------------------------------------------------------------------------
/models/ae_noskip_chroma_sig_oo_trained_encoder.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_noskip_chroma_sig_oo_trained_encoder.h5
--------------------------------------------------------------------------------
/models/ae_skip_oo_trained_decoder.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_skip_oo_trained_decoder.h5
--------------------------------------------------------------------------------
/models/ae_skip_oo_trained_encoder.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JTColonel/manne/220db1657b9a7c60514d2463dafb4892b1772363/models/ae_skip_oo_trained_encoder.h5
--------------------------------------------------------------------------------
/models/foo.txt:
--------------------------------------------------------------------------------
hi
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
absl-py==0.8.1
astor==0.8.0
audioread==2.1.8
backports.functools-lru-cache==1.5
backports.weakref==1.0.post1
cffi==1.12.3
cycler==0.10.0
decorator==4.4.0
enum34==1.1.6
funcsigs==1.0.2
futures==3.3.0
gast==0.3.2
grpcio==1.24.1
h5py==2.10.0
joblib==0.14.0
Keras==2.2.4
Keras-Applications==1.0.8
Keras-Preprocessing==1.1.0
kiwisolver==1.1.0
librosa==0.7.0
llvmlite==0.29.0
Markdown==3.1.1
matplotlib==2.2.4
mock==3.0.5
numba==0.45.1
numpy==1.16.5
pandas==0.24.2
protobuf==3.10.0
PyAudio==0.2.11
pycparser==2.19
pyo==1.0.0
pyparsing==2.4.2
python-dateutil==2.8.0
pytz==2019.3
PyYAML==5.1.2
resampy==0.2.2
scikit-learn==0.20.4
scipy==1.2.2
singledispatch==3.4.0.3
six==1.12.0
SoundFile==0.10.2
subprocess32==3.5.4
tensorboard==1.9.0
tensorflow==1.15.2
termcolor==1.1.0
tqdm==4.36.1
Werkzeug==0.16.0
--------------------------------------------------------------------------------
/synth_manne_gui.py:
--------------------------------------------------------------------------------
from Tkinter import *
import sys
import numpy as np
import pandas as pd
import os
import librosa
import soundfile as sf
import argparse
import pyaudio
from keras.layers import Input, Dense, LeakyReLU
from keras.models import Model, load_model
import tensorflow as tf
import time
from scipy import signal

RATE = int(44100)
CHUNK = int(1024)
CHANNELS = int(1)
NUM_CHUNKS = 5
ind = NUM_CHUNKS+1
proc_ind = 1
crossfade_time = int(CHUNK*3)
fade_in = np.log(np.linspace(1, 2.71, crossfade_time))
fade_out = np.log(np.linspace(2.71, 1, crossfade_time))
threshold = 1e-3
pie = np.pi
relative_height = 0.01
len_window = 4096
width_ = (len_window/2)/np.sqrt(-2*np.log(relative_height))
freq_time_ratio = -1*(pie/4)*(np.power(len_window,2)/np.log(relative_height))
last_three_der_frames = np.zeros((2049,3))
last_three_phase_frames = np.zeros((2049,3))
wn_phase = np.load('wn_phase.npy')  # Precomputed white-noise STFT phases, frames x 2049 (not included in the repo)
print('All wn phase shape')
print(wn_phase.shape)
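
# wn_phase.npy is not checked into the repo. A minimal sketch for generating
# one (assumed parameters, matching the 4096-point / 1024-hop STFT used here;
# make the noise long enough to cover playback):
#   import numpy as np
#   import librosa
#   noise = np.random.uniform(-1.0, 1.0, 44100*60)
#   P = np.angle(librosa.stft(noise, n_fft=4096, window='hann'))
#   np.save('wn_phase.npy', P.T)  # transposed so rows index frames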

class Application(Frame):

    global make_sine
    def make_sine(seg_length, ii):
        global mag
        global phase
        global remember
        global CHUNK
        global encoder
        global enc_graph
        global decoder
        global dec_graph
        global scales
        global recorded_scales
        global app
        global POLL_TIME
        global RATE
        global fade_in
        global fade_out
        global wn_phase
        global chroma_choice
        global num_latents
        global skip

        additional = 4
        enc_mag = scales*np.ones((1, num_latents))
        if skip == 'skip':
            # Append the one-hot chroma chosen by the note radiobuttons
            chroma_append = np.zeros((1,12))
            chroma_append[0, chroma_choice] = 1
            enc_mag = np.hstack((enc_mag, chroma_append))

        ind_array = np.arange((seg_length*ii-3), (seg_length*(ii+1)+1))
        temp_phase = wn_phase[ind_array,:]
        with dec_graph.as_default():
            temp_out_mag = decoder.predict(enc_mag)
        temp_out_mag = np.tile(temp_out_mag, (NUM_CHUNKS+additional, 1))
        E = temp_out_mag*np.exp(1j*temp_phase)
        _, now_out = signal.istft(E.T, fs=44100, noverlap=3*1024, nfft=4096)
        out = np.float32(now_out[3*CHUNK:]*(0.08/np.max(np.abs(now_out))))
        final_out = out.reshape(((len(out)/CHUNK), CHUNK))
        return final_out

    global callback
    def callback(in_data, frame_count, time_info, status):
        global ind
        global proc_ind
        global NUM_CHUNKS
        global all_data

        if ind >= (NUM_CHUNKS-1):
            all_data = make_sine(NUM_CHUNKS, proc_ind)
            ind = 0
            proc_ind += 1
        data = all_data[ind,:]  # Send a chunk to the audio buffer when it asks for one
        ind += 1
        return (data, pyaudio.paContinue)

    def render(self):
        global mag
        global phase
        global remember
        global CHUNK
        global encoder
        global enc_graph
        global decoder
        global dec_graph
        global scales
        global app
        global recorded_scales
        global num_latents

        print(scales)
        ind_array = np.arange((1000), (1000*(2)))
        temp_phase = wn_phase[ind_array,:]
        # Fixed slider values; 'skip' models would also need the chroma one-hot
        # appended here, as in make_sine
        enc_mag = scales*np.ones((1, num_latents))
        with dec_graph.as_default():
            temp_out_mag = decoder.predict(enc_mag)
        temp_out_mag = np.tile(temp_out_mag, (len(ind_array), 1))
        E = temp_out_mag*np.exp(1j*temp_phase)
        _, now_out = signal.istft(E.T, fs=44100, noverlap=3*1024, nfft=4096)
        out = np.float32(now_out[3*CHUNK:]*(0.08/np.max(np.abs(now_out))))

        sf.write('rendered.wav', out, 44100, subtype='PCM_16')
        print('done rendering')

    def record(self):
        global mag
        global phase
        global remember
        global CHUNK
        global encoder
        global enc_graph
        global decoder
        global dec_graph
        global scales
        global app
        global recorded_scales
        global proc_ind
        global num_latents

        first_ind = 0
        last_ind = 0
        total_frames = 0
        if self.RECORD_var.get() == 1:
            first_ind = proc_ind
            print('Button On')
            self.start_net()
        else:
            last_ind = proc_ind
            print('Button off')
            self.pause_sounds()

        # Stretch the recorded slider snapshots evenly over the recorded frames;
        # note 'skip' models would also need the chroma one-hot appended, as in
        # make_sine
        total_frames = (last_ind-first_ind)*NUM_CHUNKS
        out_scales = np.ones((total_frames, num_latents))
        temp_scales = np.vstack(recorded_scales)
        a = temp_scales.shape[0]
        increase_by = total_frames//a + 1
        kurt = 0
        for ii in range(a):
            the_rows = np.arange((kurt*increase_by), min(((kurt+1)*increase_by), total_frames))
            out_scales[the_rows,:] = np.tile(temp_scales[ii,:], (len(the_rows),1))
            kurt += 1
        ind_array = np.arange((first_ind), (NUM_CHUNKS*(last_ind)))
        temp_phase = wn_phase[ind_array,:]

        with dec_graph.as_default():
            temp_out_mag = decoder.predict(out_scales)

        E = temp_out_mag*np.exp(1j*temp_phase)
        _, now_out = signal.istft(E.T, fs=44100, noverlap=3*1024, nfft=4096)
        out = np.float32(now_out[3*CHUNK:]*(0.8/np.max(np.abs(now_out))))

        sf.write('recorded.wav', out, 44100, subtype='PCM_16')
        print('done recording')

    def model_to_mem(self):
        global decoder
        global dec_graph

        data_path = os.path.join(os.getcwd(), self.model_name.get()+'_trained_decoder.h5')
        decoder = load_model(data_path, compile=False)
        decoder._make_predict_function()
        dec_graph = tf.get_default_graph()

    def process_track(self):
        global mag
        global phase
        global remember

        len_window = 4096    # Length of the analysis window
        hop_length_ = 1024   # Hop length in samples between windows (librosa's default n_fft//4)

        filename_in = self.track_name.get()
        data_path = os.path.join(os.getcwd(), filename_in)
        y, sr = librosa.load(data_path, sr=44100, mono=True)

        D = librosa.stft(y, n_fft=len_window, window='hann')
        mag = D
        mag = np.abs(mag)  # Magnitude response of the STFT
        remember = mag.max(axis=0)+0.000000001  # Per-frame maxima used for normalizing (offset avoids division by zero)
        mag = mag / remember  # Normalizing
        phase = np.angle(D)  # Phase response of the STFT
        mag = mag.T

        return mag, phase, remember
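    # Note: process_track is carried over from manne_gui.py, but this synth
    # variant never calls it -- make_sine builds its spectra from the decoder
    # output and the precomputed white-noise phases instead of an input track.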
    def start_net(self):
        global p
        global stream
        self.model_to_mem()

        p = pyaudio.PyAudio()

        stream = p.open(format=pyaudio.paFloat32,
                channels=CHANNELS,
                frames_per_buffer=CHUNK,
                rate=RATE,
                output=True,
                stream_callback=callback)

        stream.start_stream()
        time.sleep(0.1)

    def pause_sounds(self):
        global p
        global stream
        global ind
        global proc_ind

        stream.stop_stream()
        print('sounds paused')
        stream.close()
        p.terminate()
        ind = NUM_CHUNKS+1
        proc_ind = 0

    def quit(self):
        root.destroy()

    def createWidgets(self):
        self.QUIT = Button(self)
        self.QUIT["text"] = "QUIT"
        self.QUIT["fg"] = "red"
        self.QUIT["command"] = self.quit
        self.QUIT.pack()
        self.QUIT.place(relx=0.45, rely=0.9)

        self.model_name = Entry(self)
        self.model_name.pack()
        self.model_name.place(relx=0.4, rely=0.65)
        self.label_1 = Label(self, text='Model Name')
        self.label_1.pack()
        self.label_1.place(relx=0.25, rely=0.65)

        self.START = Button(self)
        self.START["text"] = "START"
        self.START["fg"] = "green"
        self.START["command"] = lambda: self.start_net()
        self.START.pack()
        self.START.place(relx=0.45, rely=0.85)

        self.PAUSE = Button(self)
        self.PAUSE["text"] = "PAUSE"
        self.PAUSE["fg"] = "black"
        self.PAUSE["command"] = lambda: self.pause_sounds()
        self.PAUSE.pack()
        self.PAUSE.place(relx=0.45, rely=0.8)

        self.RECORD_var = IntVar()
        self.RECORD = Checkbutton(self, variable=self.RECORD_var)
        self.RECORD["text"] = "RECORD"
        self.RECORD["fg"] = "black"
        self.RECORD["command"] = lambda: self.record()
        self.RECORD.pack()
        self.RECORD.place(relx=0.45, rely=0.75)

        self.RENDER = Button(self)
        self.RENDER["text"] = "RENDER"
        self.RENDER["fg"] = "black"
        self.RENDER["command"] = lambda: self.render()
        self.RENDER.pack()
        self.RENDER.place(relx=0.45, rely=0.7)

    def createSliders(self):
        global scales
        global num_latents
        scales = np.ones(num_latents)
        self.scale_list = []
        for w in range(num_latents):
            scale = Scale(self, from_=110, to=-10, length=200)
            scale.pack()
            scale.place(relx=w/(float(num_latents)), rely=0.2)
            scale.set(0)
            scales[w] = scale.get()
            self.scale_list.append(scale)

    def createButtons(self):
        global chroma_val
        self.chroma_val = IntVar()
        self.chroma_val.set(0)
        NOTE_OPTIONS = [
            ('C',0),
            ('C#',1),
            ('D',2),
            ('D#',3),
            ('E',4),
            ('F',5),
            ('F#',6),
            ('G',7),
            ('G#',8),
            ('A',9),
            ('A#',10),
            ('B',11)
        ]

        for text, val in NOTE_OPTIONS:
            b = Radiobutton(self, text=text, value=val, variable=self.chroma_val)
            b.pack()
            b.place(relx=0.2+val/19., rely=0.1)
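    # The note radiobuttons above set chroma_val, which update_scales copies
    # into chroma_choice; make_sine turns that into the one-hot chroma vector
    # appended to the latents for 'skip' models.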
    def update_scales(self):
        global scales
        global recorded_scales
        global POLL_TIME
        global chroma_choice
        global num_latents

        POLL_TIME = 100
        chroma_choice = self.chroma_val.get()
        temp_scales = np.ones(num_latents)
        for w in range(num_latents):
            temp_scales[w] = self.scale_list[w].get()/300.
        scales = temp_scales
        if self.RECORD_var.get() == 1:
            recorded_scales.append(scales)
        self.after(POLL_TIME, self.update_scales)

    def __init__(self, master=None):
        global recorded_scales

        Frame.__init__(self, master, width=800, height=800)
        self.pack()
        self.createWidgets()
        self.createButtons()
        self.createSliders()
        recorded_scales = []
        self.update_scales()

global app
global num_latents
global skip
# Usage: python synth_manne_gui.py <num_latents> <skip|noskip>,
# e.g. python synth_manne_gui.py 8 skip
num_latents = int(sys.argv[1])
skip = sys.argv[2]
root = Tk()
app = Application(master=root)
app.mainloop()
root.destroy()
--------------------------------------------------------------------------------
/wav2frames.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import librosa
import librosa.display
import scipy as sci
import argparse

def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename_in', type=str)
    parser.add_argument('--filename_out', type=str)
    return parser.parse_args()

args = get_arguments()
# Example (hypothetical names; leaky_big_training.py expects its input at
# frames/<dataset>_frames.npy):
#   python wav2frames.py --filename_in one_octave.wav --filename_out frames/one_octave_frames

len_window = 4096   # Length of the analysis window
hop_length_ = 1024  # Hop length in samples between windows (librosa's default n_fft//4)

filename_in = args.filename_in
filename_out = args.filename_out
data_path = os.path.join(os.getcwd(), filename_in)
y, sr = librosa.load(data_path, sr=44100)

D = librosa.stft(y, n_fft=4096, window='hann')
print(D.shape)
temp = D[:,:]
phase = np.angle(temp)
temp = np.abs(temp)
temp = temp / (temp.max(axis=0)+0.000000001)  # Normalize each frame by its peak (offset avoids division by zero)
print(temp.max(axis=0))
temp = np.transpose(temp)
phase = np.transpose(phase)
print(np.shape(temp))
output = temp[~np.all(temp == 0, axis=1)]       # Drop all-silent frames
out_phase = phase[~np.all(temp == 0, axis=1)]
print(np.shape(output))
np.save(filename_out+'.npy', output)
np.save(filename_out+'_phase.npy', out_phase)
--------------------------------------------------------------------------------