├── .gitignore
├── MIRdl.py
├── README.md
├── buildArchitecture.py
├── data
│   ├── datasets
│   │   └── INFO.md
│   ├── preloaded
│   │   └── INFO.md
│   └── results
│       └── INFO.md
├── load_datasets.py
├── runMIRdl.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
*~
*.pyc
*.training
*.result
*.npz
*.param
*.pickle
venv/
data/datasets/Ballroom
data/datasets/GTZAN
test
output/
error/
build/
.idea/
.Rhistory
--------------------------------------------------------------------------------
/MIRdl.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import time, random, csv
import numpy as np

import theano, lasagne
import theano.tensor as T

import load_datasets as loadData
import buildArchitecture as buildArch

def main(parameters):
    def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
        assert len(inputs) == len(targets)
        if shuffle:
            indices = np.arange(len(inputs))
            np.random.shuffle(indices)
        for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
            if shuffle:
                excerpt = indices[start_idx:start_idx + batchsize]
            else:
                excerpt = slice(start_idx, start_idx + batchsize)
            yield inputs[excerpt], targets[excerpt]

    print("Loading data..")
    parameters['numOutputNeurons'], X_train, y_train, X_val, y_val, X_test, y_test = loadData.load_dataset(parameters)

    print("Building network..")
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    network,netLayers,parameters=buildArch.buildNet(input_var,parameters)

    print("Compiling functions..")

    def computeLoss(prediction, target_var, parameters):
        if parameters['cost']=='crossentropy':
            loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        elif parameters['cost']=='squared_error':
            loss = lasagne.objectives.squared_error(prediction, target_var)
        loss = loss.mean()
        return loss

    # define training functions
    prediction = lasagne.layers.get_output(network)
    loss=computeLoss(prediction, target_var, parameters)
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=parameters['lr'], momentum=parameters['momentum'])

    # define testing/val functions
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss=computeLoss(test_prediction, target_var, parameters)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)

    # compile training and test/val functions
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    print("Training..")
    hash = random.getrandbits(128)
    trainLoss_ans=np.inf
    for epoch in range(parameters['num_epochs']):
        # training set
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, parameters['batchSize'], shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # validation set
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, parameters['batchSize'], shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # output
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, parameters['num_epochs'], time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

        ################# JUST STORING OUTPUTS INTO FILES FOR TRACKING ##################
        name='./data/results/'+parameters['dataset']+'_'+parameters['type']+'_'+str(hash)
        # save the best model
        if train_err/train_batches
--------------------------------------------------------------------------------
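The epoch loop above tracks `trainLoss_ans` and builds a per-experiment prefix `name`, and utils.py further down reloads a `.npz` checkpoint via `lasagne.layers.set_all_param_values()` and plots a `.training` CSV with `trainingLoss`, `validationLoss` and `validationAccuracy` columns. A minimal sketch of a matching checkpoint/tracking helper, given as an assumption rather than the repository's exact code (the function name `track_epoch` is illustrative):

```python
import csv
import numpy as np
import lasagne

def track_epoch(network, name, epoch, train_loss, val_loss, val_acc, best_loss):
    'Sketch (assumption): keep the best model so far and append this epoch to the .training log.'
    if train_loss < best_loss:
        best_loss = train_loss
        # same .npz layout that utils.py reloads (arr_0, arr_1, ... as written by np.savez)
        np.savez(name + '.npz', *lasagne.layers.get_all_param_values(network))
    with open(name + '.training', 'a') as trackFile:
        writer = csv.writer(trackFile)
        if epoch == 0:
            # header columns match what utils.trainingEvolution() expects
            writer.writerow(['trainingLoss', 'validationLoss', 'validationAccuracy'])
        writer.writerow([train_loss, val_loss, val_acc])
    return best_loss
```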
/README.md:
--------------------------------------------------------------------------------
>./data/datasets/GTZAN/blues
>
>./data/datasets/GTZAN/classical
>
> (...)
>
>./data/datasets/GTZAN/rock

- **2)** Adapt the *load_datasets.py* function to work with your dataset (see the sketch below). We recommend starting with the GTZAN dataset (already implemented) to understand how it works.

- **3)** Set the *runMIRdl.py* parameters and the deep learning architecture in *buildArchitecture.py*.

- **4)** Run *runMIRdl.py*.

- **5)** *[Optional]* Visualize what the net has learned with *utils.py*.

**Future features**
- Autoencoders support.
- AcousticBrainz.org support.
--------------------------------------------------------------------------------
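Step **2)** amounts to adding one branch to `load_dataset()` plus a folder-to-label dictionary, mirroring the GTZAN and Ballroom loaders shown further down. A minimal sketch to be placed inside *load_datasets.py*, assuming a hypothetical dataset under `./data/datasets/MyDataset/` with one sub-folder per tag (the names `myDataset_classification`, `MyDataset` and the tags are illustrative, and the pickle caching used by the GTZAN loader is omitted for brevity):

```python
def myDataset_classification(parameters):
    'Sketch (assumption): minimal loader for a custom dataset, mirroring GTZAN_classification().'
    dir = './data/datasets/MyDataset'               # one sub-folder per tag to be predicted
    dict = {'tagA': 0, 'tagB': 1, 'tagC': 2}        # folder name -> integer label
    numInputs = countNumInputs(dir, parameters)     # or hard-code it once known
    return formatAudioClassification(dir, dict, numInputs, parameters)

# ...and register it in load_dataset():
#     elif parameters['dataset'] == 'myDataset':
#         if parameters['task'] == 'classification':
#             return myDataset_classification(parameters)
```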
/buildArchitecture.py:
--------------------------------------------------------------------------------
import lasagne

def buildNet(input_var, parameters):
    'Select a deep learning architecture'
    if parameters['type']=='cnn1':
        return cnn1(input_var, parameters)
    else:
        print 'Architecture NOT supported'

def cnn1(input_var,parameters):

    # set architecture parameters
    parameters['filter_size']=(12,8)
    parameters['num_filters']=15
    parameters['nonlinearity']=lasagne.nonlinearities.rectify
    parameters['W_init']=lasagne.init.GlorotUniform()
    parameters['pool_size']=(2, 1)
    parameters['dropout_p']=.5
    parameters['num_dense_units']=200

    # set convolutional neural network
    network={}
    # input layer
    network["1"] = lasagne.layers.InputLayer(shape=(None, int(parameters['numChannels']), int(parameters['melBands']), int(parameters['inputFrames'])), input_var=input_var)
    # convolutional layer
    network["2"] = lasagne.layers.Conv2DLayer(network["1"], num_filters=parameters['num_filters'], filter_size=parameters['filter_size'], nonlinearity=parameters['nonlinearity'], W=parameters['W_init'])
    # pooling layer
    network["3"] = lasagne.layers.MaxPool2DLayer(network["2"], pool_size=parameters['pool_size'])
    # feed-forward layer
    network["4"] = lasagne.layers.DenseLayer(lasagne.layers.dropout(network["3"], p=parameters['dropout_p']), num_units=parameters['num_dense_units'], nonlinearity=parameters['nonlinearity'])
    # output layer
    network["5"] = lasagne.layers.DenseLayer(lasagne.layers.dropout(network["4"], p=parameters['dropout_p']), num_units=int(parameters['numOutputNeurons']), nonlinearity=lasagne.nonlinearities.softmax)

    # return the output layer standing for the whole net (network["5"]), each layer separately (network) and the updated parameters for tracking.
    return network["5"], network, parameters
--------------------------------------------------------------------------------
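For orientation, with the default runMIRdl.py settings cnn1 sees a (batch, 1, 40, 80) input: the (12, 8) convolution with 15 filters yields 15 feature maps of 29×73, and the (2, 1) max-pooling reduces them to 14×73 before the 200-unit dense layer and the softmax output. Adding another architecture only needs one more branch in `buildNet()`; a minimal sketch, where `cnn2` and its hyper-parameters are illustrative and not part of the repository:

```python
def cnn2(input_var, parameters):
    'Sketch (assumption): a second, smaller architecture selectable via parameters["type"].'
    parameters['num_filters'] = 30
    parameters['filter_size'] = (4, 8)
    network = {}
    network["1"] = lasagne.layers.InputLayer(
        shape=(None, int(parameters['numChannels']), int(parameters['melBands']), int(parameters['inputFrames'])),
        input_var=input_var)
    network["2"] = lasagne.layers.Conv2DLayer(
        network["1"], num_filters=parameters['num_filters'], filter_size=parameters['filter_size'],
        nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform())
    # average each feature map into a single value before classifying
    network["3"] = lasagne.layers.GlobalPoolLayer(network["2"])
    network["4"] = lasagne.layers.DenseLayer(
        network["3"], num_units=int(parameters['numOutputNeurons']),
        nonlinearity=lasagne.nonlinearities.softmax)
    return network["4"], network, parameters

# ...and in buildNet():
#     elif parameters['type'] == 'cnn2':
#         return cnn2(input_var, parameters)
```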
/data/datasets/INFO.md:
--------------------------------------------------------------------------------
In this directory (./data/datasets/) the library expects the dataset to be split into folders, one folder per tag to be predicted.

For example, for the GTZAN dataset (http://marsyasweb.appspot.com/download/data_sets/) the library expects:

./data/datasets/GTZAN/blues

./data/datasets/GTZAN/classical

./data/datasets/GTZAN/country

./data/datasets/GTZAN/disco

./data/datasets/GTZAN/hiphop

./data/datasets/GTZAN/jazz

./data/datasets/GTZAN/metal

./data/datasets/GTZAN/pop

./data/datasets/GTZAN/reggae

./data/datasets/GTZAN/rock,

where each folder contains all the songs of that class, used for training, validation and testing.
--------------------------------------------------------------------------------
/data/preloaded/INFO.md:
--------------------------------------------------------------------------------
This directory contains the pickle files that store the datasets in a format readable by the library.
--------------------------------------------------------------------------------
/data/results/INFO.md:
--------------------------------------------------------------------------------
This directory stores the following files: **.result** (training and test results), **.training** (the training evolution, readable with utils.py), **.param** (all the deep learning parameters used for each experiment) and **.npz** (the best trained deep learning model).
--------------------------------------------------------------------------------
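The cache files in ./data/preloaded/ are named by concatenating every parameter value, so changing any single setting (say, hopSize) points to a different .pickle and triggers recomputation. A rough illustration of the naming scheme used by the loaders below (an assumption for illustration: only three parameters are shown, and the order of the values depends on how the parameters dictionary is iterated):

```python
parameters = {'dataset': 'GTZAN', 'melBands': 40, 'inputFrames': 80}
pickleFile = './data/preloaded/'
for key, value in parameters.items():
    pickleFile = pickleFile + '_' + str(value)
pickleFile = pickleFile + '.pickle'
print(pickleFile)  # e.g. ./data/preloaded/_GTZAN_40_80.pickle
```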
/load_datasets.py:
--------------------------------------------------------------------------------
import sys, os, pickle
import numpy as np

import essentia
from essentia.standard import *

def load_dataset(parameters):
    'Choose which dataset you want to use and for which task - only classification is supported for now.'
    if parameters['dataset']=='ballroom':
        if parameters['task'] == 'classification':
            return ballroom_classification(parameters)

    elif parameters['dataset']=='GTZAN':
        if parameters['task'] == 'classification':
            return GTZAN_classification(parameters)

    else:
        print('Dataset NOT supported')

def normalization(spect,inputNorm):
    'Normalize the input spectrograms, choose a configuration.'
    if inputNorm=='energy':
        E=sum(sum(np.power(spect, 2)))/len(spect)
        spect=spect/E

    elif inputNorm=='energy20Log':
        E=sum(sum(np.power(spect, 2)))/len(spect)
        spect=spect/E
        spect = 20*np.log10(spect+1)

    elif inputNorm=='energyLog':
        E=sum(sum(np.power(spect, 2)))/len(spect)
        spect=spect/E
        spect = np.log10(spect+1)

    elif inputNorm=='log':
        spect = np.log10(spect+1)

    elif inputNorm=='None':
        print 'No normalization!'

    else:
        print 'This normalization does not exist!'

    ##!## remove silences ?
    ##!## log10(x+1) by itself avoids nans. No need of more stuff such as: spect[spect == -np.inf]= 0 or np.finfo(float).eps
    ##!## mean 0 variance 1 ?

    return spect

def GTZAN_classification(parameters):

    # set the name for storing the pre-processed and formatted GTZAN data.
    pickleFile='./data/preloaded/'
    for key, value in parameters.iteritems():
        pickleFile=pickleFile+'_'+str(value)
    pickleFile=pickleFile+'.pickle'

    # if it was already computed, simply load it.
    if os.path.exists(pickleFile):

        print "-- Loading pre-computed spectrograms.."

        with open(pickleFile) as f:
            numOutputNeurons, X_train, X_test, X_val, y_train, y_test, y_val = pickle.load(f)

    # otherwise, compute it!
    else:

        # define where the audio files are
        dir= './data/datasets/GTZAN'

        # map the folder names (tags to be learned) to integers with a dictionary
        dict = {'blues':0,'classical':1,'country':2,'disco':3,'hiphop':4,'jazz':5,'metal':6,'pop':7,'reggae':8,'rock':9}
        # TODO: create dict automatically!

        # number of 'sliced' spectrograms
        numInputs=16000
        # if not known: numInputs=countNumInputs(dir,parameters)

        # format the audio for classification
        numOutputNeurons, X_train, y_train, X_val, y_val, X_test, y_test = formatAudioClassification(dir,dict,numInputs,parameters)

        # saving the computed features
        with open(pickleFile, 'w') as f:
            pickle.dump([numOutputNeurons, X_train, X_test, X_val, y_train, y_test, y_val], f)

    return numOutputNeurons, X_train, y_train, X_val, y_val, X_test, y_test

def ballroom_classification(parameters):
    'Defined analogously to the GTZAN loader'
    pickleFile='./data/preloaded/'
    for key, value in parameters.iteritems():
        pickleFile=pickleFile+'_'+str(value)
    pickleFile=pickleFile+'.pickle'

    if os.path.exists(pickleFile):

        print "-- Loading pre-computed features.."

        with open(pickleFile) as f:
            numOutputNeurons, X_train, X_test, X_val, y_train, y_test, y_val = pickle.load(f)

    else:

        dir= './data/datasets/Ballroom/BallroomData'
        dict = {'ChaChaCha':0,'Samba':1,'Quickstep':2,'VienneseWaltz':3,'Tango':4,'Jive':5,'Waltz':6,'Rumba':7}

        numInputs=11629
        # if not known: numInputs=countNumInputs(dir,parameters)

        numOutputNeurons, X_train, y_train, X_val, y_val, X_test, y_test = formatAudioClassification(dir,dict,numInputs,parameters)

        # saving the objects:
        with open(pickleFile, 'w') as f:
            pickle.dump([numOutputNeurons, X_train, X_test, X_val, y_train, y_test, y_val], f)

    return numOutputNeurons, X_train, y_train, X_val, y_val, X_test, y_test

def formatAudioClassification(dir,dict,num_inputs,parameters):
    'Compute the spectrograms and format them to be fed into the network'
    # init variables
    D=np.zeros(num_inputs*parameters['melBands']*parameters['inputFrames'],dtype=np.float32).reshape(num_inputs,1,parameters['melBands'],parameters['inputFrames'])
    A=np.zeros(num_inputs,dtype=np.uint8)+parameters['errorCode']
    count=0
    # walk through the directory: compute spectrograms, normalize and annotate.
    for root, dirs, files in os.walk(dir):
        for annotation in dirs:
            for r, ds, fs in os.walk(root+'/'+annotation):
                for f in fs:
                    spect = computeMelSpectrogram(root+'/'+annotation+'/'+f,parameters['frameSize'],parameters['hopSize'],parameters['windowType'],parameters['melBands'])
                    spect=normalization(spect,parameters['inputNorm'])
                    for c in chunk(spect,parameters['inputFrames']):
                        # sliced spectrogram
                        D[count][0]=c
                        # associated annotation
                        A[count]=dict[annotation]
                        count=count+1
                    print(root+'/'+annotation+'/'+f)

    # randomize order
    D,A=shuffle_in_unison_inplace(D, A)

    # split training/test/validation data
    cut_train=int(np.floor(parameters['trainSplit']*D.shape[0]))
    cut_test=int(np.floor((parameters['trainSplit']+parameters['testSplit'])*D.shape[0]))
    X_train, X_test, X_val = D[:cut_train], D[cut_train:cut_test], D[cut_test:]
    y_train, y_test, y_val = A[:cut_train], A[cut_train:cut_test], A[cut_test:]

    # number of different classes/output neurons
    numOutputNeurons=len(set(A))

    return numOutputNeurons, X_train, y_train, X_val, y_val, X_test, y_test

def countNumInputs(dir,parameters):
    'Count the number of instances (sliced spectrograms) resulting from a given dataset'
    num_inputs=0
    for root, dirs, files in os.walk(dir):
        for annotation in dirs:
            for r, ds, fs in os.walk(root+'/'+annotation):
                for f in fs:
                    spect = computeMelSpectrogram(root+'/'+annotation+'/'+f,parameters['frameSize'],parameters['hopSize'],parameters['windowType'],parameters['melBands'])
                    num_inputs=num_inputs+len(chunk(spect,parameters['inputFrames']))
    print num_inputs
    return num_inputs

def computeMelSpectrogram(file,frameSize,hopSize,windowType,melBands):
    'Compute MEL spectrogram using Essentia python bindings'
    loader = essentia.standard.MonoLoader(filename = file)
    audio = loader()
    w = Windowing(type = windowType)
    spectrum = Spectrum()
    mel = MelBands(numberBands = melBands) # 40 bands!

    melSpec = []
    for frame in FrameGenerator(audio, frameSize = frameSize, hopSize = hopSize):
        melSpec.append(mel(spectrum(w(frame))))
    # we need to convert the list to an essentia.array first (== numpy.array of floats)
    melSpec = essentia.array(melSpec).T

    return melSpec

def chunk(l, n):
    'Return successive n-sized chunks from l.'
    out=[]
    for i in xrange(0, int(l.shape[1]/n)*n, n):
        out.append(l[:,i:i+n])

    return out

def shuffle_in_unison_inplace(a, b):
    'Shuffle the data and its annotations in unison, randomizing the order.'
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]
--------------------------------------------------------------------------------
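As a sanity check on the loader: each 30-second GTZAN clip, resampled by Essentia's MonoLoader to its default 44.1 kHz, gives roughly 1290 frames at hopSize 1024, i.e. 16 non-overlapping slices of 80 frames, which is where numInputs = 16000 (1000 clips × 16 slices) comes from. A hedged usage sketch of `load_dataset()` and the shapes it returns (the exact counts depend on the splits and the slicing above):

```python
import load_datasets as loadData

params = {'dataset': 'GTZAN', 'task': 'classification',
          'frameSize': 2048, 'hopSize': 1024, 'numChannels': 1,
          'windowType': 'blackmanharris62', 'melBands': 40, 'inputFrames': 80,
          'errorCode': 999, 'inputNorm': 'energy20Log',
          'trainSplit': 0.75, 'testSplit': 0.15}

numOutputNeurons, X_train, y_train, X_val, y_val, X_test, y_test = loadData.load_dataset(params)

print(numOutputNeurons)  # 10 genre classes
print(X_train.shape)     # roughly (12000, 1, 40, 80): 75% of ~16000 sliced spectrograms
print(y_val.dtype)       # uint8 labels in 0..9
```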
/runMIRdl.py:
--------------------------------------------------------------------------------
import MIRdl

# Input data parameters
parameters={}
parameters['dataset'] = 'GTZAN'  # 'ballroom' or 'GTZAN'
parameters['frameSize'] = 2048
parameters['hopSize'] = 1024
parameters['numChannels'] = 1
parameters['windowType'] = 'blackmanharris62'
parameters['melBands'] = 40
parameters['inputFrames'] = 80
parameters['errorCode'] = 999
parameters['inputNorm'] = 'energy20Log'  # 'energy','energy20Log','energyLog','log' or 'None'

# Deep Learning architecture
parameters['type'] = 'cnn1'  # only 'cnn1'
parameters['task'] = 'classification'  # only 'classification'

# Training parameters
parameters['trainSplit'] = 0.75
parameters['testSplit'] = 0.15
parameters['valSplit'] = 1-parameters['trainSplit']-parameters['testSplit']
parameters['num_epochs'] = 2000
parameters['batchSize'] = 500
parameters['lr'] = 0.02
parameters['momentum'] = 0.9
parameters['cost'] = 'crossentropy'  # only 'crossentropy'

MIRdl.main(parameters)
--------------------------------------------------------------------------------
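MIRdl.main() writes its outputs under ./data/results/ using the prefix `<dataset>_<type>_<hash>` (see the `name` variable in MIRdl.py and the results INFO.md). One possible way to pick up the most recent experiment and hand its prefix to the utilities below; this snippet is an illustration, not part of the repository:

```python
import glob, os

# latest GTZAN/cnn1 experiment written by MIRdl.main()
candidates = glob.glob('./data/results/GTZAN_cnn1_*.training')
latest = max(candidates, key=os.path.getmtime)
name = latest[:-len('.training')]  # strip the extension to get the shared prefix

print(name)  # pass this prefix to utils.visualizeWcnn1(name) and utils.trainingEvolution(name)
```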
/utils.py:
--------------------------------------------------------------------------------
import theano, lasagne, csv
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import buildArchitecture as buildArch

def visualizeWcnn1(name):
    'Visualize weights of the convolutional layer of cnn1'

    ##!!## biases not shown !
    ##!!## deterministic W ?

    # load parameters
    with open(name+'.param', 'rb') as paramFile:
        params = csv.reader(paramFile, delimiter='-')
        count=0
        for param in params:
            if count==0:
                tmp1=param
                count=count+1
            else:
                tmp2=param
    parameters = {}
    for i in range(len(tmp2)):
        parameters[tmp1[i]] = tmp2[i]

    print("Building network..")
    input_var = T.tensor4('inputs')
    network,netLayers,parameters=buildArch.buildNet(input_var,parameters)
    # load trained network
    with np.load(name+'.npz') as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    print("Compiling functions..")
    # visualize convLayers
    conv_w = theano.function([],netLayers['2'].W)
    weights=conv_w()

    ##!!## plot considering 20log?
    ##!!## set min/max to visualize always the same?

    # plot W!
    for i in range(len(weights)):
        plt.subplot(1, len(weights), i+1)
        plt.imshow(np.squeeze(weights[i]), cmap=plt.cm.Reds, interpolation='None', aspect='auto')
        plt.colorbar()
    plt.show()

def trainingEvolution(name):
    'Plot the training evolution: training loss, validation loss and validation accuracy.'

    # load data
    df = pd.read_csv(name+'.training')
    trainingLoss = df['trainingLoss']
    validationLoss = df['validationLoss']
    validationAccuracy = df['validationAccuracy']

    # plot training evolution!
    plt.subplot(1, 2, 1)
    plt.title('Loss')
    plt.plot(range(1,len(trainingLoss)+1,1),trainingLoss, color='red',linestyle='--', marker='o',label="Training Loss")
    plt.hold(True)
    plt.plot(range(1,len(trainingLoss)+1,1),validationLoss,color='blue',linestyle='--', marker='o',label="Validation Loss")
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.title('Validation Accuracy (%)')
    plt.plot(range(1,len(trainingLoss)+1,1),validationAccuracy,color='blue',linestyle='--', marker='o')

    plt.show()

name='./data/results/ballroom_cnn1_46378760430139206020064112940399742791'
visualizeWcnn1(name)
trainingEvolution(name)
--------------------------------------------------------------------------------