├── src ├── __init__.py ├── trainer.py ├── models.py └── utility.py ├── images ├── artists.PNG ├── crnn_arch.png └── representation_313.png ├── representation_output └── 313.png ├── .gitignore ├── requirements.txt ├── metrics ├── trials_song_split │ ├── 911_score.csv │ ├── 157_score.csv │ ├── 313_score.csv │ ├── 32_pooled_score.csv │ ├── 628_score.csv │ ├── 94_score.csv │ ├── 157_pooled_score.csv │ ├── 313_pooled_score.csv │ ├── 32_score.csv │ ├── 628_pooled_score.csv │ ├── 911_pooled_score.csv │ ├── 94_pooled_score.csv │ └── summary.csv └── trials_album_split │ ├── 188_score.csv │ ├── 313_score.csv │ ├── 628_score.csv │ ├── 911_score.csv │ ├── 157_pooled_score.csv │ ├── 157_score.csv │ ├── 188_pooled_score.csv │ ├── 313_pooled_score.csv │ ├── 32_pooled_score.csv │ ├── 32_score.csv │ ├── 628_pooled_score.csv │ ├── 911_pooled_score.csv │ ├── 94_pooled_score.csv │ ├── 94_score.csv │ └── summary.csv ├── main.py ├── representation.py └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/artists.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/images/artists.PNG -------------------------------------------------------------------------------- /images/crnn_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/images/crnn_arch.png -------------------------------------------------------------------------------- /images/representation_313.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/images/representation_313.png 
-------------------------------------------------------------------------------- /representation_output/313.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/representation_output/313.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | 3 | song_data/ 4 | 5 | artists/ 6 | 7 | __pycache__/ 8 | 9 | weights/ 10 | 11 | .vscode/ 12 | 13 | .idea/ 14 | 15 | song_split/ 16 | 17 | weights_album_split/ 18 | 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dill==0.2.8.2 2 | h5py==2.8.0 3 | Keras==2.1.1 4 | librosa==0.5.1 5 | matplotlib==2.2.3 6 | numpy==1.14.5 7 | pandas==0.23.4 8 | scikit-learn==0.20.0 9 | scipy==1.1.0 10 | seaborn==0.9.0 11 | tensorflow==1.10.0 12 | -------------------------------------------------------------------------------- /metrics/trials_song_split/911_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.7714753542029786,0.7810249187225589,0.7712287712287712,1001 3 | 1,0.7602856755984815,0.7875472879313269,0.7606490872210954,986 4 | 2,0.7597119233703686,0.7813017462550562,0.75564681724846,974 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/188_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.5336579293828763,0.5527614261521365,0.53440150801131,9549 3 | 1,0.5754112627511248,0.6062769249986311,0.5813156440022111,9045 4 | 2,0.5480376088212674,0.5593886068971844,0.559202948629348,8682 5 | 
-------------------------------------------------------------------------------- /metrics/trials_album_split/313_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.5317990069658532,0.5656616496932699,0.5288323782234957,5584 3 | 1,0.5599974125230504,0.6000080968256013,0.5703745743473326,5286 4 | 2,0.5213751068019872,0.5442140772769193,0.5266167192429022,5072 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/628_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.5177083023387815,0.5440327943176324,0.5284615384615384,2600 3 | 1,0.5323742070146015,0.5766902659555373,0.5464547677261614,2454 4 | 2,0.5534135315010184,0.5805494326802327,0.5696686491079015,2354 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/911_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.6105420527168003,0.6191343930830404,0.616258218768679,1673 3 | 1,0.58778564005534,0.6061265778766737,0.6030245746691871,1587 4 | 2,0.6115240756462318,0.6308803553721625,0.6116951379763469,1522 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/157_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.7753273575553421,0.7860847795988812,0.7748499487629923,6831 3 | 1,0.7561801507833547,0.7722441849743193,0.7556346381969158,6744 4 | 2,0.778755140952637,0.7939222330670741,0.7763649962602842,6685 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/313_score.csv: 
-------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.7659461155323102,0.7731585831005727,0.7658037326911499,3322 3 | 1,0.7870843721212496,0.8135325716589318,0.7833384192859323,3277 4 | 2,0.7620437480948646,0.7747989695444815,0.7607626076260763,3252 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/32_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.92927347408281,0.9370696400625979,0.9295774647887324,142 3 | 1,0.9441284068044632,0.9552915785310151,0.9436619718309859,142 4 | 2,0.9149622513225183,0.9231873564620043,0.9154929577464789,142 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/628_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.7711982461581101,0.7831310195769803,0.7692307692307693,1547 3 | 1,0.7918580389638201,0.8154197330843526,0.7921311475409836,1525 4 | 2,0.7417110965045544,0.765184565847942,0.7389292795769994,1513 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/94_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.768399662232993,0.7724971282977493,0.7694172655641182,11549 3 | 1,0.7651360211105698,0.7789145583714601,0.7636012636012636,11396 4 | 2,0.7617449335919688,0.7661181363406381,0.7621616840615603,11306 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/157_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.661845763821731,0.6688576628535463,0.6733067729083665,251 
3 | 1,0.6434829386418183,0.7183564677235562,0.6582278481012658,237 4 | 2,0.6507806944832459,0.6800388476152669,0.6724890829694323,229 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/157_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.5445861110359475,0.5627585991416229,0.5459126053340283,11511 3 | 1,0.5496279761721112,0.5822215121871555,0.5524045521292217,10896 4 | 2,0.5152640898705974,0.5213126422530422,0.5322241346337732,10458 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/188_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.6324398063292215,0.6512175346495959,0.6414342629482072,251 3 | 1,0.7274046015856109,0.7304983694224201,0.7468354430379747,237 4 | 2,0.6711582489648036,0.6990221376363169,0.6855895196506551,229 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/313_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.6280262020511764,0.668418235151303,0.6374501992031872,251 3 | 1,0.6828608673419323,0.7534652552895014,0.7046413502109705,237 4 | 2,0.5787682154102902,0.6177462776564403,0.5982532751091703,229 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/32_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.6976793629830077,0.709440735443227,0.7131474103585658,251 3 | 1,0.7001895989658906,0.7454980439157654,0.7172995780590717,237 4 | 2,0.5251844415655749,0.5064950057385873,0.5720524017467249,229 5 | 
-------------------------------------------------------------------------------- /metrics/trials_album_split/32_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.5039120603890946,0.5250239196433978,0.505807230994909,57945 3 | 1,0.5158189416004764,0.5372896418026213,0.5247033952360992,54871 4 | 2,0.42568326049526756,0.44523432680319114,0.43406729106901987,52637 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/628_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.6009689686352416,0.6461849636403966,0.6169354838709677,248 3 | 1,0.5488787231935612,0.5749714035440219,0.5822784810126582,237 4 | 2,0.6087626196793897,0.6752419705476473,0.62882096069869,229 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/911_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.6879468274299179,0.7079761618569418,0.6963562753036437,247 3 | 1,0.6437505478649044,0.6773717229702891,0.6779661016949152,236 4 | 2,0.6914103422292879,0.7144843389038436,0.6973684210526315,228 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/94_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.6494714509753788,0.6679470595805257,0.6653386454183267,251 3 | 1,0.6495389158019867,0.6235012622940279,0.6962025316455697,237 4 | 2,0.6533845147061176,0.7167330805750641,0.6681222707423581,229 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/94_score.csv: 
-------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.5255900545652935,0.5367921071986408,0.5300610977049853,19477 3 | 1,0.5270434151546429,0.5359543268750439,0.5452475193840481,18443 4 | 2,0.4864527164650928,0.5055720879627964,0.48672016274864377,17696 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/157_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.9297395009071415,0.9411971830985916,0.9295774647887324,142 3 | 1,0.909152033872415,0.9257331870007927,0.9084507042253521,142 4 | 2,0.9156774203475465,0.9295271629778671,0.9154929577464789,142 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/313_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.914872117760968,0.9234434384082272,0.9154929577464789,142 3 | 1,0.910919412907814,0.9322965571205009,0.9084507042253521,142 4 | 2,0.8808887995921965,0.8947221206024022,0.8802816901408451,142 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/32_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.7322581509235274,0.7352414233797548,0.7336924868957484,34340 3 | 1,0.7332756752128675,0.7428611923750827,0.7337836641986962,33901 4 | 2,0.7223202438547724,0.7277711185921146,0.7215829691077216,33633 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/628_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 
0,0.8799518591047174,0.8931030628213726,0.8802816901408451,142 3 | 1,0.8705806804812605,0.9172986637775372,0.8732394366197183,142 4 | 2,0.832684276800277,0.8593459928566313,0.8297872340425532,141 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/911_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.8426366982160408,0.8562704041427446,0.8439716312056738,141 3 | 1,0.8437117678418424,0.8739672885799646,0.8380281690140845,142 4 | 2,0.8505825433547586,0.8825033319714171,0.8439716312056738,141 5 | -------------------------------------------------------------------------------- /metrics/trials_song_split/94_pooled_score.csv: -------------------------------------------------------------------------------- 1 | ,f1-score,precision,recall,support 2 | 0,0.9237024907799558,0.9350700160559318,0.9225352112676056,142 3 | 1,0.9656059350756949,0.9709926224010731,0.9647887323943662,142 4 | 2,0.9218146712031071,0.9329962095807166,0.9225352112676056,142 5 | -------------------------------------------------------------------------------- /metrics/trials_album_split/summary.csv: -------------------------------------------------------------------------------- 1 | Average F1,32,94,157,313,628,911 2 | Frame,0.482,0.513,0.536,0.538,0.534,0.603 3 | Song,0.641,0.651,0.652,0.630,0.568,0.674 4 | ,,,,,, 5 | Maximum F1,32,94,157,313,628,911 6 | Frame,0.516,0.527,0.550,0.560,0.553,0.612 7 | Song,0.700,0.653,0.662,0.683,0.609,0.691 8 | -------------------------------------------------------------------------------- /metrics/trials_song_split/summary.csv: -------------------------------------------------------------------------------- 1 | Average F1,32,94,157,313,628,911 2 | Frame,0.729,0.765,0.770,0.772,0.768,0.764 3 | Song,0.930,0.937,0.918,0.902,0.861,0.846 4 | ,,,,,, 5 | Maximum F1,32,94,157,313,628,911 6 | 
Frame,0.733,0.768,0.779,0.787,0.792,0.772 7 | Song,0.944,0.966,0.930,0.915,0.880,0.851 8 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Nov 7 13:05:16 2017 4 | Updated on Nov 14 2017 5 | @author: Zain 6 | """ 7 | import os 8 | import pandas as pd 9 | import gc 10 | 11 | import src.trainer as trainer 12 | 13 | if __name__ == '__main__': 14 | 15 | ''' 16 | 1s 32 frames 17 | 3s 94 frames 18 | 5s 157 frames 19 | 6s 188 frames 20 | 10s 313 frames 21 | 20s 628 frames 22 | 29.12s 911 frames 23 | ''' 24 | 25 | slice_lengths = [911, 628, 313, 157, 94, 32] 26 | random_state_list = [0, 21, 42] 27 | iterations = 3 28 | summary_metrics_output_folder = 'trials_song_split' 29 | for slice_len in slice_lengths: 30 | 31 | scores = [] 32 | pooling_scores = [] 33 | for i in range(iterations): 34 | score, pooling_score = trainer.train_model( 35 | nb_classes=20, 36 | slice_length=slice_len, 37 | lr=0.001, 38 | train=True, 39 | load_checkpoint=True, 40 | plots=False, 41 | album_split=False, 42 | random_states=random_state_list[i], 43 | save_metrics=True, 44 | save_metrics_folder='metrics_song_split', 45 | save_weights_folder='weights_song_split') 46 | 47 | scores.append(score['weighted avg']) 48 | pooling_scores.append(pooling_score['weighted avg']) 49 | gc.collect() 50 | 51 | os.makedirs(summary_metrics_output_folder, exist_ok=True) 52 | 53 | pd.DataFrame(scores).to_csv( 54 | '{}/{}_score.csv'.format(summary_metrics_output_folder, slice_len)) 55 | 56 | pd.DataFrame(pooling_scores).to_csv( 57 | '{}/{}_pooled_score.csv'.format( 58 | summary_metrics_output_folder, slice_len)) 59 | -------------------------------------------------------------------------------- /representation.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import os 3 | from os.path import 
isfile 4 | 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | import numpy as np 8 | import seaborn as sns 9 | from keras.optimizers import Adam 10 | from sklearn.manifold import TSNE 11 | 12 | import src.models as models 13 | import src.utility as utility 14 | 15 | if __name__ == '__main__': 16 | 17 | # set these parameters 18 | random_states = 21 19 | slice_length = 313 20 | checkpoint_path = 'weights/20_313_21' 21 | 22 | # leave as-is 23 | load_checkpoint = True 24 | nb_classes = 20 25 | folder = 'song_data' 26 | lr = 0.0001 # not used 27 | ensemble_visual = False # average out representations at the song level 28 | save_path = 'representation_output/' 29 | 30 | # Load the song data and split into train and test sets at song level 31 | print("Loading data for {}".format(slice_length)) 32 | Y, X, S = utility.load_dataset(song_folder_name=folder, 33 | nb_classes=nb_classes, 34 | random_state=random_states) 35 | X, Y, S = utility.slice_songs(X, Y, S, length=slice_length) 36 | 37 | # Reshape data as 2d convolutional tensor shape 38 | X_shape = X.shape + (1,) 39 | X = X.reshape(X_shape) 40 | 41 | # encode Y 42 | Y_original = Y 43 | Y, le, enc = utility.encode_labels(Y) 44 | 45 | # build the model 46 | model = models.CRNN2D(X.shape, nb_classes=Y.shape[1]) 47 | model.compile(loss='categorical_crossentropy', 48 | optimizer=Adam(lr=lr), 49 | metrics=['accuracy']) 50 | 51 | # Initialize weights using checkpoint if it exists 52 | if isfile(checkpoint_path): 53 | print('Checkpoint file detected. 
Loading weights.') 54 | model.load_weights(checkpoint_path) 55 | else: 56 | raise Exception('no checkpoint for {}'.format(checkpoint_path)) 57 | 58 | # drop final dense layer and activation 59 | print("Modifying model and predicting representation") 60 | model.pop() 61 | model.pop() 62 | model.summary() 63 | 64 | # predict representation 65 | print("Predicting") 66 | X_rep = model.predict(X) 67 | 68 | print("Garbage collection") 69 | del X 70 | gc.collect() 71 | 72 | if ensemble_visual: 73 | songs = np.unique(S) 74 | X_song = np.zeros((songs.shape[0], X_rep.shape[1])) 75 | Y_song = np.empty((songs.shape[0]), dtype="S10") 76 | for i, song in enumerate(songs): 77 | xs = X_rep[S == song] 78 | Y_song[i] = Y_original[S == song][0] 79 | X_song[i, :] = np.mean(xs, axis=0) 80 | 81 | X_rep = X_song 82 | Y_original = Y_song 83 | 84 | # fit tsne 85 | print("Fitting TSNE {}".format(X_rep.shape)) 86 | tsne_model = TSNE() 87 | X_2d = tsne_model.fit_transform(X_rep) 88 | 89 | # save results 90 | print("Saving results") 91 | os.makedirs(save_path, exist_ok=True) 92 | save_path += str(checkpoint_path.split('_')[1]) 93 | if ensemble_visual: 94 | save_path += '_ensemble' 95 | 96 | pd.DataFrame({'x0': X_2d[:, 0], 'x1': X_2d[:, 1], 97 | 'label': Y_original}).to_csv( 98 | save_path + '.csv', index=False) 99 | 100 | # save figure 101 | sns.set_palette("Paired", n_colors=20) 102 | plt.figure(figsize=(20, 20)) 103 | sns.scatterplot(x=X_2d[:, 0], y=X_2d[:, 1], 104 | hue=Y_original, palette=sns.color_palette(n_colors=20)) 105 | plt.savefig(save_path + '.png') 106 | 107 | del Y, S, X_rep, X_2d, Y_original 108 | -------------------------------------------------------------------------------- /src/trainer.py: -------------------------------------------------------------------------------- 1 | import src.utility as utility 2 | import src.models as models 3 | 4 | import os 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from os.path import isfile 8 | 9 | from keras.callbacks import 
ModelCheckpoint, EarlyStopping 10 | from keras.optimizers import Adam 11 | 12 | from sklearn.metrics import confusion_matrix, classification_report 13 | 14 | 15 | def train_model(nb_classes=20, 16 | slice_length=911, 17 | artist_folder='artists', 18 | song_folder='song_data', 19 | plots=True, 20 | train=True, 21 | load_checkpoint=False, 22 | save_metrics=True, 23 | save_metrics_folder='metrics', 24 | save_weights_folder='weights', 25 | batch_size=16, 26 | nb_epochs=200, 27 | early_stop=10, 28 | lr=0.0001, 29 | album_split=True, 30 | random_states=42): 31 | """ 32 | Main function for training the model and testing 33 | """ 34 | 35 | weights = os.path.join(save_weights_folder, str(nb_classes) + 36 | '_' + str(slice_length) + '_' + str(random_states)) 37 | os.makedirs(save_weights_folder, exist_ok=True) 38 | os.makedirs(save_metrics_folder, exist_ok=True) 39 | 40 | print("Loading dataset...") 41 | 42 | if not album_split: 43 | # song split 44 | Y_train, X_train, S_train, Y_test, X_test, S_test, \ 45 | Y_val, X_val, S_val = \ 46 | utility.load_dataset_song_split(song_folder_name=song_folder, 47 | artist_folder=artist_folder, 48 | nb_classes=nb_classes, 49 | random_state=random_states) 50 | else: 51 | Y_train, X_train, S_train, Y_test, X_test, S_test, \ 52 | Y_val, X_val, S_val = \ 53 | utility.load_dataset_album_split(song_folder_name=song_folder, 54 | artist_folder=artist_folder, 55 | nb_classes=nb_classes, 56 | random_state=random_states) 57 | 58 | print("Loaded and split dataset. 
Slicing songs...") 59 | 60 | # Create slices out of the songs 61 | X_train, Y_train, S_train = utility.slice_songs(X_train, Y_train, S_train, 62 | length=slice_length) 63 | X_val, Y_val, S_val = utility.slice_songs(X_val, Y_val, S_val, 64 | length=slice_length) 65 | X_test, Y_test, S_test = utility.slice_songs(X_test, Y_test, S_test, 66 | length=slice_length) 67 | 68 | print("Training set label counts:", np.unique(Y_train, return_counts=True)) 69 | 70 | # Encode the target vectors into one-hot encoded vectors 71 | Y_train, le, enc = utility.encode_labels(Y_train) 72 | Y_test, le, enc = utility.encode_labels(Y_test, le, enc) 73 | Y_val, le, enc = utility.encode_labels(Y_val, le, enc) 74 | 75 | # Reshape data as 2d convolutional tensor shape 76 | X_train = X_train.reshape(X_train.shape + (1,)) 77 | X_val = X_val.reshape(X_val.shape + (1,)) 78 | X_test = X_test.reshape(X_test.shape + (1,)) 79 | 80 | # build the model 81 | model = models.CRNN2D(X_train.shape, nb_classes=Y_train.shape[1]) 82 | model.compile(loss='categorical_crossentropy', 83 | optimizer=Adam(lr=lr), 84 | metrics=['accuracy']) 85 | model.summary() 86 | 87 | # Initialize weights using checkpoint if it exists 88 | if load_checkpoint: 89 | print("Looking for previous weights...") 90 | if isfile(weights): 91 | print('Checkpoint file detected. Loading weights.') 92 | model.load_weights(weights) 93 | else: 94 | print('No checkpoint file detected. 
Starting from scratch.') 95 | else: 96 | print('Starting from scratch (no checkpoint)') 97 | 98 | checkpointer = ModelCheckpoint(filepath=weights, 99 | verbose=1, 100 | save_best_only=True) 101 | earlystopper = EarlyStopping(monitor='val_loss', min_delta=0, 102 | patience=early_stop, verbose=0, mode='auto') 103 | 104 | # Train the model 105 | if train: 106 | print("Input Data Shape", X_train.shape) 107 | history = model.fit(X_train, Y_train, batch_size=batch_size, 108 | shuffle=True, epochs=nb_epochs, 109 | verbose=1, validation_data=(X_val, Y_val), 110 | callbacks=[checkpointer, earlystopper]) 111 | if plots: 112 | utility.plot_history(history) 113 | 114 | # Load weights that gave best performance on validation set 115 | model.load_weights(weights) 116 | filename = os.path.join(save_metrics_folder, str(nb_classes) + '_' 117 | + str(slice_length) 118 | + '_' + str(random_states) + '.txt') 119 | 120 | # Score test model 121 | score = model.evaluate(X_test, Y_test, verbose=0) 122 | y_score = model.predict_proba(X_test) 123 | 124 | # Calculate confusion matrix 125 | y_predict = np.argmax(y_score, axis=1) 126 | y_true = np.argmax(Y_test, axis=1) 127 | cm = confusion_matrix(y_true, y_predict) 128 | 129 | # Plot the confusion matrix 130 | class_names = np.arange(nb_classes) 131 | class_names_original = le.inverse_transform(class_names) 132 | plt.figure(figsize=(14, 14)) 133 | utility.plot_confusion_matrix(cm, classes=class_names_original, 134 | normalize=True, 135 | title='Confusion matrix with normalization') 136 | if save_metrics: 137 | plt.savefig(filename + '.png', bbox_inches="tight") 138 | plt.close() 139 | plt.figure(figsize=(14, 14)) 140 | 141 | # Print out metrics 142 | print('Test score/loss:', score[0]) 143 | print('Test accuracy:', score[1]) 144 | print('\nTest results on each slice:') 145 | scores = classification_report(y_true, y_predict, 146 | target_names=class_names_original) 147 | scores_dict = classification_report(y_true, y_predict, 148 | 
target_names=class_names_original, 149 | output_dict=True) 150 | print(scores) 151 | 152 | # Predict artist using pooling methodology 153 | pooling_scores, pooled_scores_dict = \ 154 | utility.predict_artist(model, X_test, Y_test, S_test, 155 | le, class_names=class_names_original, 156 | slices=None, verbose=False) 157 | 158 | # Save metrics 159 | if save_metrics: 160 | plt.savefig(filename + '_pooled.png', bbox_inches="tight") 161 | plt.close() 162 | with open(filename, 'w') as f: 163 | f.write("Training data shape:" + str(X_train.shape)) 164 | f.write('\nnb_classes: ' + str(nb_classes) + 165 | '\nslice_length: ' + str(slice_length)) 166 | f.write('\nweights: ' + weights) 167 | f.write('\nlr: ' + str(lr)) 168 | f.write('\nTest score/loss: ' + str(score[0])) 169 | f.write('\nTest accuracy: ' + str(score[1])) 170 | f.write('\nTest results on each slice:\n') 171 | f.write(str(scores)) 172 | f.write('\n\n Scores when pooling song slices:\n') 173 | f.write(str(pooling_scores)) 174 | 175 | return (scores_dict, pooled_scores_dict) 176 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Musical Artist Classification with Convolutional Recurrent Neural Networks 2 | 3 | Nasrullah, Z. and Zhao, Y., Musical Artist Classification with Convolutional Recurrent Neural Networks. *International Joint Conference on Neural Networks (IJCNN)*, 2019. 
4 | 5 | Please cite the paper as: 6 | 7 | @inproceedings{nasrullah2019music, 8 | author={Nasrullah, Zain and Zhao, Yue}, 9 | title={Musical Artist Classification with Convolutional Recurrent Neural Networks}, 10 | booktitle={2019 International Joint Conference on Neural Networks (IJCNN)}, 11 | year={2019}, 12 | month={July}, 13 | pages={1-8}, 14 | doi={10.1109/IJCNN.2019.8851988}, 15 | organization={IEEE} 16 | } 17 | 18 | [PDF for Personal Use](http://arxiv.org/abs/1901.04555) | [IEEE Xplore](https://ieeexplore.ieee.org/document/8851988) 19 | 20 | 21 | ------------ 22 | 23 | 24 | ## Introduction 25 | Previous attempts at music artist classification use frame level audio features which summarize frequency content within short intervals of time. Comparatively, more recent music information retrieval tasks take advantage of temporal structure in audio spectrograms using deep convolutional and recurrent models. This paper revisits artist classification with this new framework and empirically explores the impacts of incorporating temporal structure in the feature representation. To this end, an established classification architecture, a Convolutional Recurrent Neural Network (CRNN), is applied to the artist20 music artist identification dataset under a comprehensive set of conditions. These include audio clip length, which is a novel contribution in this work, and previously identified considerations such as dataset split and feature level. Our results improve upon baseline works, verify the influence of the producer effect on classification performance and demonstrate the trade-offs between audio length and training set size. The best performing model achieves an average F1 score of 0.937 across three independent trials which is a substantial improvement over the corresponding baseline under similar conditions. 
Additionally, to showcase the effectiveness of the CRNN's feature extraction capabilities, we visualize audio samples at the model's bottleneck layer demonstrating that learned representations segment into clusters belonging to their respective artists. 26 | 27 | 28 | ![Convolutional Recurrent Neural Network](https://github.com/ZainNasrullah/music-artist-classification-crnn/blob/master/images/crnn_arch.png) 29 | 30 | 31 | ## Dependency 32 | The experiment code is written in Python 3.6 and built on a number of Python packages including (but not limited to): 33 | - dill==0.2.8.2 34 | - h5py==2.8.0 35 | - Keras==2.1.1 36 | - librosa==0.5.1 37 | - matplotlib==2.2.3 38 | - numpy==1.14.5 39 | - pandas==0.23.4 40 | - scikit-learn==0.20.0 41 | - scipy==1.1.0 42 | - seaborn==0.9.0 43 | - tensorflow==1.10.0 44 | 45 | 46 | Batch installation is possible using the supplied "requirements.txt" with pip or conda. 47 | 48 | ````cmd 49 | pip install -r requirements.txt 50 | ```` 51 | 52 | Additional install details (recommended for replication and strong performance): 53 | - Python: 3.6.6 54 | - GPU: Nvidia GTX 1080 (Driver: 390.87) 55 | - CUDA: 8.0 56 | - CUDNN: 7.0.5 57 | - [ffmpeg](http://ffmpeg.org/download.html) is required by Librosa to convert audio files into spectrograms. 58 | 59 | 60 | ## Datasets 61 | 62 | This study primarily uses the artist20 musical artist identification dataset by Labrosa [1]. The data is accessible upon request from https://labrosa.ee.columbia.edu/projects/artistid/. 63 | 64 | The main characteristics of the dataset can be summarized as: 65 | 66 | |Property | Value | 67 | |-------------------|---------| 68 | |# of Tracks | 1,413 | 69 | |# of Artists | 20 | 70 | |Albums per Artist | 6 | 71 | |Bitrate | 32 kbps | 72 | |Sample Rate | 16 kHz | 73 | |Channels | Mono | 74 | 75 | The figure below visualizes three seconds of the mel-scaled audio spectrogram for a randomly sampled song from each artist. 
This is the primary data representation used in the paper. 76 | 77 | ![Three-second mel-scaled spectrograms for a sampled song from each artist](https://github.com/ZainNasrullah/music-artist-classification-crnn/blob/master/images/artists.PNG) 78 | 79 | ## Usage 80 | 81 | To re-create experimental results: 82 | 83 | - Prepare mel-scaled spectrograms from raw audio in the dataset. 84 | - Run src/utility.py if the dataset is stored using its original folder structure (artists/[artist]/[album]/[song].mp3) in the project root. 85 | - Use the create_dataset() utility function in src/utility.py with a custom directory if the dataset is stored elsewhere. 86 | - Run the main.py script. This will begin a training loop which runs three independent trials for each audio length in {1s, 3s, 5s, 10s, 20s, 30s}. 87 | - This script must be adjusted manually to vary whether or not to use an album split via the album_split flag in the train_model function call. When switching splits, also change summary_metrics_output_folder, save_metrics_folder and save_weights_folder in main.py; otherwise the new run overwrites the previous split's outputs and, because load_checkpoint=True, resumes from weights trained on the other split. 88 | - It should be noted that training each model is computationally expensive and can take several hours even with reliable hardware. At minimum, an Nvidia GTX 1080 GPU is recommended with at least 16GB of memory on the machine. 89 | - To reproduce the representation visualization, the representation.py script can be used but one must specify the model weight location and relevant audio clip length. 90 | 91 | The models and utility functions provided can also generically be used for any audio-based classification task where one wants to experiment with audio length. The train_model function in src/trainer.py is fairly extensive. 92 | 93 | ## Results 94 | 95 | Classification performance is evaluated using the test F1-score of three independent trials and also varying parameters such as audio length {1s, 3s, 5s, 10s, 20s, 30s}, the type of dataset split {song-level, album-level} and feature-level {frame-level, song-level}. Both the average and maximum score are reported among the trials. 
96 | 97 | As a whole, across the four base conditions resulting from the dataset split and feature level, the CRNN model outperforms the most comparable baseline for at least one audio clip length. This holds true for both the best and average case performance except for the album split with song-level features where the CRNN model only outperforms in its best run. This discrepancy may be explained by considering that Mandel's dataset contains fewer classes or because, unlike the baseline works, we are additionally reporting the average of three independent trials instead of performance on a single trial. 98 | 99 | *Test F1 Scores for Frame-level Audio Features (3 runs):* 100 | 101 | |Split | Type | 1s | 3s | 5s | 10s | 20s | 30s | 102 | |------|---------|--------|-------|-------|-------|-------|----------| 103 | |Song | Average | 0.729 | 0.765 | 0.770 | **0.787** | 0.768 | 0.764| 104 | |Song | Best | 0.733 | 0.768 | 0.779 | 0.772 | **0.792** | 0.771| 105 | |Album | Average | 0.482 | 0.513 | 0.536 | 0.538 | 0.534 | **0.603**| 106 | |Album | Best | 0.516 | 0.527 | 0.550 | 0.560 | 0.553 | **0.612**| 107 | 108 | *Test F1 Scores for Song-level Audio Features (3 runs):* 109 | 110 | |Split | Type | 1s | 3s | 5s | 10s | 20s | 30s | 111 | |------|---------|-------|-----------|-------|-------|-------|------| 112 | |Song | Average | 0.929 | **0.937** | 0.918 | 0.902 | 0.861 | 0.846| 113 | |Song | Best | 0.944 | **0.966** | 0.930 | 0.915 | 0.880 | 0.851| 114 | |Album | Average | 0.641 | 0.651 | 0.652 | 0.630 | 0.568 | **0.674**| 115 | |Album | Best | **0.700** | 0.653 | 0.662 | 0.683 | 0.609 | 0.691| 116 | 117 | Additionally, audio samples at the bottleneck layer of the network are visualized using t-SNE to demonstrate how effectively the model is able to learn to classify artists.
As can be seen below, the learned representations prior to classification separate into distinct clusters belonging to each artist, demonstrating that the convolution and recurrent layers are effective at the task. The example below is for the model trained on 10s of audio. 118 | 119 | ![Learned representations at bottleneck layer of network (10s)](https://github.com/ZainNasrullah/music-artist-classification-crnn/blob/master/images/representation_313.png) 120 | 121 | ## Conclusions 122 | This paper establishes a deep learning baseline for music artist classification on the ***artist20*** dataset and demonstrates that a Convolutional Recurrent Neural Network is able to outperform traditional baselines under a range of conditions. The results show that including additional temporal structure in an audio sample improves classification performance and also that there is a point beyond which the returns may diminish. This is attributed to a possible lack of complexity in the model or early pooling layers discarding too much information. Using the trained models, predictions are also aggregated at the song level using a majority vote to determine the artist performing a song. This leads to another substantial gain in performance and validates the feasibility of using a CRNN for industry applications such as copyright detection. The best-performing model is trained using three-second audio samples under a song dataset split and evaluated at the song level to achieve an average F1 score of 0.937 across three independent trials. Additionally, we visualize audio samples at the bottleneck layer of the network to show that learned representations cluster by artist, highlighting the model's capability as a feature extractor. Future directions include audio augmentation, model pre-training and minimizing temporal pooling as avenues for further performance improvement. 123 | 124 | ## References 125 | 126 | [1] D. Ellis (2007).
Classifying Music Audio with Timbral and Chroma Features, 127 | *Proc. Int. Conf. on Music Information Retrieval (ISMIR)*, Vienna, Austria, Sep. 2007. 128 | -------------------------------------------------------------------------------- /src/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Nov 11 11:23:13 2017 4 | Updated on Nov 14 2017 5 | @author: Zain 6 | """ 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Dropout, Activation, Reshape, Permute 10 | from keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D 11 | from keras.layers.normalization import BatchNormalization 12 | from keras.layers.recurrent import GRU, LSTM 13 | 14 | 15 | def CRNN2D(X_shape, nb_classes): 16 | ''' 17 | Model used for evaluation in paper. Inspired by K. Choi model in: 18 | https://github.com/keunwoochoi/music-auto_tagging-keras/blob/master/music_tagger_crnn.py 19 | ''' 20 | 21 | nb_layers = 4 # number of convolutional layers 22 | nb_filters = [64, 128, 128, 128] # filter sizes 23 | kernel_size = (3, 3) # convolution kernel size 24 | activation = 'elu' # activation function to use after each layer 25 | pool_size = [(2, 2), (4, 2), (4, 2), (4, 2), 26 | (4, 2)] # size of pooling area 27 | 28 | # shape of input data (frequency, time, channels) 29 | input_shape = (X_shape[1], X_shape[2], X_shape[3]) 30 | frequency_axis = 1 31 | time_axis = 2 32 | channel_axis = 3 33 | 34 | # Create sequential model and normalize along frequency axis 35 | model = Sequential() 36 | model.add(BatchNormalization(axis=frequency_axis, input_shape=input_shape)) 37 | 38 | # First convolution layer specifies shape 39 | model.add(Conv2D(nb_filters[0], kernel_size=kernel_size, padding='same', 40 | data_format="channels_last", 41 | input_shape=input_shape)) 42 | model.add(Activation(activation)) 43 | model.add(BatchNormalization(axis=channel_axis)) 44 | 
model.add(MaxPooling2D(pool_size=pool_size[0], strides=pool_size[0])) 45 | model.add(Dropout(0.1)) 46 | 47 | # Add more convolutional layers 48 | for layer in range(nb_layers - 1): 49 | # Convolutional layer 50 | model.add(Conv2D(nb_filters[layer + 1], kernel_size=kernel_size, 51 | padding='same')) 52 | model.add(Activation(activation)) 53 | model.add(BatchNormalization( 54 | axis=channel_axis)) # Improves overfitting/underfitting 55 | model.add(MaxPooling2D(pool_size=pool_size[layer + 1], 56 | strides=pool_size[layer + 1])) # Max pooling 57 | model.add(Dropout(0.1)) 58 | 59 | # Reshaping input for recurrent layer 60 | # (frequency, time, channels) --> (time, frequency, channel) 61 | model.add(Permute((time_axis, frequency_axis, channel_axis))) 62 | resize_shape = model.output_shape[2] * model.output_shape[3] 63 | model.add(Reshape((model.output_shape[1], resize_shape))) 64 | 65 | # recurrent layer 66 | model.add(GRU(32, return_sequences=True)) 67 | model.add(GRU(32, return_sequences=False)) 68 | model.add(Dropout(0.3)) 69 | 70 | # Output layer 71 | model.add(Dense(nb_classes)) 72 | model.add(Activation("softmax")) 73 | return model 74 | 75 | 76 | ############################################################################### 77 | ''' 78 | Models below this point were only pre-tested and were not presented in the paper 79 | ''' 80 | 81 | 82 | ############################################################################### 83 | 84 | def CRNN2DLarger(X_shape, nb_classes): 85 | ''' 86 | Making the previous model larger and deeper 87 | ''' 88 | nb_layers = 5 # number of convolutional layers 89 | nb_filters = [64, 128, 256, 512, 512] 90 | kernel_size = (3, 3) # convolution kernel size 91 | activation = 'elu' # activation function to use after each layer 92 | pool_size = [(2, 2), (2, 2), (2, 2), (4, 1), 93 | (4, 1)] # # size of pooling area 94 | # pool_size = [(4,2), (4,2), (4,1), (2,1)] this worked well 95 | 96 | # shape of input data (frequency, time, channels) 97 | 
input_shape = (X_shape[1], X_shape[2], X_shape[3]) 98 | frequency_axis = 1 99 | time_axis = 2 100 | channel_axis = 3 101 | 102 | # Create sequential model 103 | model = Sequential() 104 | model.add(BatchNormalization(axis=frequency_axis, input_shape=input_shape)) 105 | 106 | # First convolution layer 107 | model.add(Conv2D(nb_filters[0], kernel_size=kernel_size, padding='same', 108 | data_format="channels_last", 109 | input_shape=input_shape)) 110 | model.add(Activation(activation)) 111 | model.add(BatchNormalization( 112 | axis=channel_axis)) # Improves overfitting/underfitting 113 | model.add(MaxPooling2D(pool_size=pool_size[0], 114 | strides=pool_size[0])) # Max pooling 115 | model.add(Dropout(0.1)) # 0.2 116 | 117 | # Add more convolutional layers 118 | for layer in range(nb_layers - 1): 119 | # Convolutional layer 120 | model.add(Conv2D(nb_filters[layer + 1], kernel_size=kernel_size, 121 | padding='same')) 122 | model.add(Activation(activation)) 123 | model.add(BatchNormalization( 124 | axis=channel_axis)) # Improves overfitting/underfitting 125 | model.add(MaxPooling2D(pool_size=pool_size[layer + 1], 126 | strides=pool_size[layer + 1])) # Max pooling 127 | model.add(Dropout(0.1)) # 0.2 128 | 129 | # Reshaping input for recurrent layer 130 | # (frequency, time, channels) --> (time, frequency, channel) 131 | model.add(Permute((time_axis, frequency_axis, channel_axis))) 132 | resize_shape = model.output_shape[2] * model.output_shape[3] 133 | model.add(Reshape((model.output_shape[1], resize_shape))) 134 | 135 | # recurrent layer 136 | model.add(GRU(32, return_sequences=True)) 137 | model.add(GRU(32, return_sequences=False)) 138 | model.add(Dropout(0.3)) 139 | 140 | # Output layer 141 | model.add(Dense(nb_classes)) 142 | model.add(Activation("softmax")) 143 | return model 144 | 145 | 146 | def CRNN2DVGG(X_shape, nb_classes): 147 | ''' 148 | Based on VGG-16 Architecture 149 | ''' 150 | nb_layers = 5 # number of convolutional layers 151 | nb_filters = [64, 128, 256, 
512, 512] 152 | kernel_size = (3, 3) # convolution kernel size 153 | activation = 'elu' # activation function to use after each layer 154 | pool_size = [(2, 2), (2, 2), (2, 2), (4, 1), 155 | (4, 1)] # # size of pooling area 156 | # pool_size = [(4,2), (4,2), (4,1), (2,1)] this worked well 157 | 158 | # shape of input data (frequency, time, channels) 159 | input_shape = (X_shape[1], X_shape[2], X_shape[3]) 160 | frequency_axis = 1 161 | time_axis = 2 162 | channel_axis = 3 163 | 164 | # Create sequential model 165 | model = Sequential() 166 | model.add(BatchNormalization(axis=frequency_axis, input_shape=input_shape)) 167 | 168 | # First convolution layer 169 | model.add(Conv2D(nb_filters[0], kernel_size=kernel_size, padding='same', 170 | data_format="channels_last", 171 | input_shape=input_shape)) 172 | model.add(Activation(activation)) 173 | model.add(BatchNormalization( 174 | axis=channel_axis)) # Improves overfitting/underfitting 175 | 176 | model.add(Conv2D(nb_filters[0], kernel_size=kernel_size, padding='same', 177 | data_format="channels_last", 178 | input_shape=input_shape)) 179 | model.add(Activation(activation)) 180 | model.add(BatchNormalization( 181 | axis=channel_axis)) # Improves overfitting/underfitting 182 | 183 | model.add(MaxPooling2D(pool_size=pool_size[0], 184 | strides=pool_size[0])) # Max pooling 185 | model.add(Dropout(0.1)) # 0.2 186 | 187 | # Add more convolutional layers 188 | for layer in range(nb_layers - 1): 189 | # Convolutional layer 190 | model.add(Conv2D(nb_filters[layer + 1], kernel_size=kernel_size, 191 | padding='same')) 192 | model.add(Activation(activation)) 193 | model.add(BatchNormalization( 194 | axis=channel_axis)) # Improves overfitting/underfitting 195 | 196 | model.add(Conv2D(nb_filters[layer + 1], kernel_size=kernel_size, 197 | padding='same')) 198 | model.add(Activation(activation)) 199 | model.add(BatchNormalization( 200 | axis=channel_axis)) # Improves overfitting/underfitting 201 | 202 | if nb_filters[layer + 1] != 
128: 203 | model.add(Conv2D(nb_filters[layer + 1], kernel_size=kernel_size, 204 | padding='same')) 205 | model.add(Activation(activation)) 206 | model.add(BatchNormalization( 207 | axis=channel_axis)) # Improves overfitting/underfitting 208 | 209 | model.add(MaxPooling2D(pool_size=pool_size[layer + 1], 210 | strides=pool_size[layer + 1])) # Max pooling 211 | model.add(Dropout(0.1)) # 0.2 212 | 213 | # Reshaping input for recurrent layer 214 | # (frequency, time, channels) --> (time, frequency, channel) 215 | model.add(Permute((time_axis, frequency_axis, channel_axis))) 216 | resize_shape = model.output_shape[2] * model.output_shape[3] 217 | model.add(Reshape((model.output_shape[1], resize_shape))) 218 | 219 | # recurrent layer 220 | model.add(GRU(32, return_sequences=True)) 221 | model.add(GRU(32, return_sequences=False)) 222 | model.add(Dropout(0.2)) 223 | 224 | # Output layer 225 | model.add(Dense(nb_classes)) 226 | model.add(Activation("softmax")) 227 | return model 228 | 229 | 230 | def CRNN1D(X_shape, nb_classes): 231 | ''' 232 | Based on 1D convolution 233 | ''' 234 | 235 | nb_layers = 3 # number of convolutional layers 236 | kernel_size = 5 # convolution kernel size 237 | activation = 'relu' # activation function to use after each layer 238 | pool_size = 2 # size of pooling area 239 | 240 | # shape of input data (frequency, time, channels) 241 | input_shape = (X_shape[1], X_shape[2], X_shape[3]) 242 | frequency_axis = 1 243 | time_axis = 2 244 | channel_axis = 3 245 | 246 | # Create sequential model 247 | model = Sequential() 248 | 249 | model.add(Permute((time_axis, frequency_axis, channel_axis), 250 | input_shape=input_shape)) 251 | resize_shape = model.output_shape[2] * model.output_shape[3] 252 | model.add(Reshape((model.output_shape[1], resize_shape))) 253 | 254 | # First convolution layer 255 | model.add(Conv1D(64, kernel_size)) 256 | model.add(Activation(activation)) 257 | model.add( 258 | MaxPooling1D(pool_size=pool_size, strides=pool_size)) # Max 
pooling 259 | # model.add(Dropout(0.2)) 260 | 261 | # Add more convolutional layers 262 | for _ in range(nb_layers - 1): 263 | # Convolutional layer 264 | model.add(Conv1D(128, kernel_size)) 265 | model.add(Activation(activation)) 266 | model.add(MaxPooling1D(pool_size=pool_size, 267 | strides=pool_size)) # Max pooling 268 | 269 | model.add(GRU(64, return_sequences=True)) 270 | model.add(GRU(64, return_sequences=False)) 271 | 272 | model.add(Dense(nb_classes)) # note sure about this 273 | model.add(Activation('softmax')) 274 | 275 | # Output layer 276 | return model 277 | 278 | 279 | def RNN(X_shape, nb_classes): 280 | ''' 281 | Implementing only the RNN 282 | ''' 283 | # shape of input data (frequency, time, channels) 284 | input_shape = (X_shape[1], X_shape[2], X_shape[3]) 285 | frequency_axis = 1 286 | time_axis = 2 287 | channel_axis = 3 288 | 289 | # Create sequential model 290 | model = Sequential() 291 | 292 | model.add(Permute((time_axis, frequency_axis, channel_axis), 293 | input_shape=input_shape)) 294 | resize_shape = model.output_shape[2] * model.output_shape[3] 295 | model.add(Reshape((model.output_shape[1], resize_shape))) 296 | 297 | model.add(LSTM(64, return_sequences=True)) 298 | model.add(LSTM(64, return_sequences=False)) 299 | 300 | model.add(Dense(nb_classes)) # note sure about this 301 | model.add(Activation('softmax')) 302 | 303 | # Output layer 304 | return model 305 | -------------------------------------------------------------------------------- /src/utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dill 3 | import random 4 | import itertools 5 | 6 | import numpy as np 7 | from numpy.random import RandomState 8 | import matplotlib.pyplot as plt 9 | 10 | import librosa 11 | import librosa.display 12 | 13 | from sklearn.model_selection import train_test_split 14 | from sklearn import preprocessing 15 | from sklearn.metrics import confusion_matrix, classification_report 16 | 
from sklearn.utils import shuffle 17 | from scipy import stats 18 | 19 | 20 | def visualize_spectrogram(path, duration=None, 21 | offset=0, sr=16000, n_mels=128, n_fft=2048, 22 | hop_length=512): 23 | """This function creates a visualization of a spectrogram 24 | given the path to an audio file.""" 25 | 26 | # Make a mel-scaled power (energy-squared) spectrogram 27 | y, sr = librosa.load(path, sr=sr, duration=duration, offset=offset) 28 | S = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels, n_fft=n_fft, 29 | hop_length=hop_length) 30 | 31 | # Convert to log scale (dB) 32 | log_S = librosa.logamplitude(S, ref_power=1.0) 33 | 34 | # Render output spectrogram in the console 35 | plt.figure(figsize=(12, 5)) 36 | librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel') 37 | plt.title('mel power spectrogram') 38 | plt.colorbar(format='%+02.0f dB') 39 | plt.tight_layout() 40 | 41 | 42 | def create_dataset(artist_folder='artists', save_folder='song_data', 43 | sr=16000, n_mels=128, 44 | n_fft=2048, hop_length=512): 45 | """This function creates the dataset given a folder 46 | with the correct structure (artist_folder/artists/albums/*.mp3) 47 | and saves it to a specified folder.""" 48 | 49 | # get list of all artists 50 | os.makedirs(save_folder, exist_ok=True) 51 | artists = [path for path in os.listdir(artist_folder) if 52 | os.path.isdir(path)] 53 | 54 | # iterate through all artists, albums, songs and find mel spectrogram 55 | for artist in artists: 56 | print(artist) 57 | artist_path = os.path.join(artist_folder, artist) 58 | artist_albums = os.listdir(artist_path) 59 | 60 | for album in artist_albums: 61 | album_path = os.path.join(artist_path, album) 62 | album_songs = os.listdir(album_path) 63 | 64 | for song in album_songs: 65 | song_path = os.path.join(album_path, song) 66 | 67 | # Create mel spectrogram and convert it to the log scale 68 | y, sr = librosa.load(song_path, sr=sr) 69 | S = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels, 70 
| n_fft=n_fft, 71 | hop_length=hop_length) 72 | log_S = librosa.logamplitude(S, ref_power=1.0) 73 | data = (artist, log_S, song) 74 | 75 | # Save each song 76 | save_name = artist + '_%%-%%_' + album + '_%%-%%_' + song 77 | with open(os.path.join(save_folder, save_name), 'wb') as fp: 78 | dill.dump(data, fp) 79 | 80 | 81 | def load_dataset(song_folder_name='song_data', 82 | artist_folder='artists', 83 | nb_classes=20, random_state=42): 84 | """This function loads the dataset based on a location; 85 | it returns a list of spectrograms 86 | and their corresponding artists/song names""" 87 | 88 | # Get all songs saved as numpy arrays in the given folder 89 | song_list = os.listdir(song_folder_name) 90 | 91 | # Load the list of artists 92 | artist_list = os.listdir(artist_folder) 93 | 94 | # select the appropriate number of classes 95 | prng = RandomState(random_state) 96 | artists = prng.choice(artist_list, size=nb_classes, replace=False) 97 | 98 | # Create empty lists 99 | artist = [] 100 | spectrogram = [] 101 | song_name = [] 102 | 103 | # Load each song into memory if the artist is included and return 104 | for song in song_list: 105 | with open(os.path.join(song_folder_name, song), 'rb') as fp: 106 | loaded_song = dill.load(fp) 107 | if loaded_song[0] in artists: 108 | artist.append(loaded_song[0]) 109 | spectrogram.append(loaded_song[1]) 110 | song_name.append(loaded_song[2]) 111 | 112 | return artist, spectrogram, song_name 113 | 114 | 115 | def load_dataset_album_split(song_folder_name='song_data', 116 | artist_folder='artists', 117 | nb_classes=20, random_state=42): 118 | """ This function loads a dataset and splits it on an album level""" 119 | song_list = os.listdir(song_folder_name) 120 | 121 | # Load the list of artists 122 | artist_list = os.listdir(artist_folder) 123 | 124 | train_albums = [] 125 | test_albums = [] 126 | val_albums = [] 127 | random.seed(random_state) 128 | for artist in os.listdir(artist_folder): 129 | albums = 
os.listdir(os.path.join(artist_folder, artist)) 130 | random.shuffle(albums) 131 | test_albums.append(artist + '_%%-%%_' + albums.pop(0)) 132 | val_albums.append(artist + '_%%-%%_' + albums.pop(0)) 133 | train_albums.extend([artist + '_%%-%%_' + album for album in albums]) 134 | 135 | # select the appropriate number of classes 136 | prng = RandomState(random_state) 137 | artists = prng.choice(artist_list, size=nb_classes, replace=False) 138 | 139 | # Create empty lists 140 | Y_train, Y_test, Y_val = [], [], [] 141 | X_train, X_test, X_val = [], [], [] 142 | S_train, S_test, S_val = [], [], [] 143 | 144 | # Load each song into memory if the artist is included and return 145 | for song in song_list: 146 | with open(os.path.join(song_folder_name, song), 'rb') as fp: 147 | loaded_song = dill.load(fp) 148 | artist, album, song_name = song.split('_%%-%%_') 149 | artist_album = artist + '_%%-%%_' + album 150 | 151 | if loaded_song[0] in artists: 152 | if artist_album in train_albums: 153 | Y_train.append(loaded_song[0]) 154 | X_train.append(loaded_song[1]) 155 | S_train.append(loaded_song[2]) 156 | elif artist_album in test_albums: 157 | Y_test.append(loaded_song[0]) 158 | X_test.append(loaded_song[1]) 159 | S_test.append(loaded_song[2]) 160 | elif artist_album in val_albums: 161 | Y_val.append(loaded_song[0]) 162 | X_val.append(loaded_song[1]) 163 | S_val.append(loaded_song[2]) 164 | 165 | return Y_train, X_train, S_train, \ 166 | Y_test, X_test, S_test, \ 167 | Y_val, X_val, S_val 168 | 169 | 170 | def load_dataset_song_split(song_folder_name='song_data', 171 | artist_folder='artists', 172 | nb_classes=20, 173 | test_split_size=0.1, 174 | validation_split_size=0.1, 175 | random_state=42): 176 | Y, X, S = load_dataset(song_folder_name=song_folder_name, 177 | artist_folder=artist_folder, 178 | nb_classes=nb_classes, 179 | random_state=random_state) 180 | # train and test split 181 | X_train, X_test, Y_train, Y_test, S_train, S_test = train_test_split( 182 | X, Y, S, 
test_size=test_split_size, stratify=Y, 183 | random_state=random_state) 184 | 185 | # Create a validation to be used to track progress 186 | X_train, X_val, Y_train, Y_val, S_train, S_val = train_test_split( 187 | X_train, Y_train, S_train, test_size=validation_split_size, 188 | shuffle=True, stratify=Y_train, random_state=random_state) 189 | 190 | return Y_train, X_train, S_train, \ 191 | Y_test, X_test, S_test, \ 192 | Y_val, X_val, S_val 193 | 194 | 195 | def slice_songs(X, Y, S, length=911): 196 | """Slices the spectrogram into sub-spectrograms according to length""" 197 | 198 | # Create empty lists for train and test sets 199 | artist = [] 200 | spectrogram = [] 201 | song_name = [] 202 | 203 | # Slice up songs using the length specified 204 | for i, song in enumerate(X): 205 | slices = int(song.shape[1] / length) 206 | for j in range(slices - 1): 207 | spectrogram.append(song[:, length * j:length * (j + 1)]) 208 | artist.append(Y[i]) 209 | song_name.append(S[i]) 210 | 211 | return np.array(spectrogram), np.array(artist), np.array(song_name) 212 | 213 | 214 | def create_spectrogram_plots(artist_folder='artists', sr=16000, n_mels=128, 215 | n_fft=2048, hop_length=512): 216 | """Create a spectrogram from a randomly selected song 217 | for each artist and plot""" 218 | 219 | # get list of all artists 220 | artists = os.listdir(artist_folder) 221 | 222 | fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(14, 12), sharex=True, 223 | sharey=True) 224 | 225 | row = 0 226 | col = 0 227 | 228 | # iterate through artists, randomly select an album, 229 | # randomly select a song, and plot a spectrogram on a grid 230 | for artist in artists: 231 | print(artist) 232 | # Randomly select album and song 233 | artist_path = os.path.join(artist_folder, artist) 234 | artist_albums = os.listdir(artist_path) 235 | album = random.choice(artist_albums) 236 | album_path = os.path.join(artist_path, album) 237 | album_songs = os.listdir(album_path) 238 | song = 
random.choice(album_songs) 239 | song_path = os.path.join(album_path, song) 240 | 241 | # Create mel spectrogram 242 | y, sr = librosa.load(song_path, sr=sr, offset=60, duration=3) 243 | S = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels, 244 | n_fft=n_fft, hop_length=hop_length) 245 | log_S = librosa.logamplitude(S, ref_power=1.0) 246 | 247 | # Plot on grid 248 | plt.axes(ax[row, col]) 249 | librosa.display.specshow(log_S, sr=sr) 250 | plt.title(artist) 251 | col += 1 252 | if col == 5: 253 | row += 1 254 | col = 0 255 | 256 | fig.tight_layout() 257 | 258 | 259 | def plot_confusion_matrix(cm, classes, 260 | normalize=False, 261 | title='Confusion matrix', 262 | cmap=plt.cm.get_cmap('Blues')): 263 | """ 264 | This function prints and plots the confusion matrix. 265 | Normalization can be applied by setting `normalize=True`. 266 | """ 267 | if normalize: 268 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 269 | 270 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 271 | plt.title(title) 272 | plt.colorbar() 273 | tick_marks = np.arange(len(classes)) 274 | plt.xticks(tick_marks, classes, rotation=90) 275 | plt.yticks(tick_marks, classes) 276 | 277 | fmt = '.2f' if normalize else 'd' 278 | thresh = cm.max() / 2. 
279 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 280 | plt.text(j, i, format(cm[i, j], fmt), 281 | horizontalalignment="center", 282 | color="white" if cm[i, j] > thresh else "black") 283 | 284 | plt.tight_layout() 285 | plt.ylabel('True label') 286 | plt.xlabel('Predicted label') 287 | 288 | 289 | def plot_history(history, title="model accuracy"): 290 | """ 291 | This function plots the training and validation accuracy 292 | per epoch of training 293 | """ 294 | plt.plot(history.history['acc']) 295 | plt.plot(history.history['val_acc']) 296 | plt.title(title) 297 | plt.ylabel('accuracy') 298 | plt.xlabel('epoch') 299 | plt.legend(['train', 'test'], loc='lower right') 300 | plt.show() 301 | 302 | return 303 | 304 | 305 | def predict_artist(model, X, Y, S, 306 | le, class_names, 307 | slices=None, verbose=False, 308 | ml_mode=False): 309 | """ 310 | This function takes slices of songs and predicts their output. 311 | For each song, it votes on the most frequent artist. 
312 | """ 313 | print("Test results when pooling slices by song and voting:") 314 | # Obtain the list of songs 315 | songs = np.unique(S) 316 | 317 | prediction_list = [] 318 | actual_list = [] 319 | 320 | # Iterate through each song 321 | for song in songs: 322 | 323 | # Grab all slices related to a particular song 324 | X_song = X[S == song] 325 | Y_song = Y[S == song] 326 | 327 | # If not using full song, shuffle and take up to a number of slices 328 | if slices and slices <= X_song.shape[0]: 329 | X_song, Y_song = shuffle(X_song, Y_song) 330 | X_song = X_song[:slices] 331 | Y_song = Y_song[:slices] 332 | 333 | # Get probabilities of each class 334 | predictions = model.predict(X_song, verbose=0) 335 | 336 | if not ml_mode: 337 | # Get list of highest probability classes and their probability 338 | class_prediction = np.argmax(predictions, axis=1) 339 | class_probability = np.max(predictions, axis=1) 340 | 341 | # keep only predictions confident about; 342 | prediction_summary_trim = class_prediction[class_probability > 0.5] 343 | 344 | # deal with edge case where there is no confident class 345 | if len(prediction_summary_trim) == 0: 346 | prediction_summary_trim = class_prediction 347 | else: 348 | prediction_summary_trim = predictions 349 | 350 | # get most frequent class 351 | prediction = stats.mode(prediction_summary_trim)[0][0] 352 | actual = stats.mode(np.argmax(Y_song))[0][0] 353 | 354 | # Keeping track of overall song classification accuracy 355 | prediction_list.append(prediction) 356 | actual_list.append(actual) 357 | 358 | # Print out prediction 359 | if verbose: 360 | print(song) 361 | print("Predicted:", le.inverse_transform(prediction), "\nActual:", 362 | le.inverse_transform(actual)) 363 | print('\n') 364 | 365 | # Print overall song accuracy 366 | actual_array = np.array(actual_list) 367 | prediction_array = np.array(prediction_list) 368 | cm = confusion_matrix(actual_array, prediction_array) 369 | plot_confusion_matrix(cm, classes=class_names, 
normalize=True, 370 | title='Confusion matrix for pooled results' + 371 | ' with normalization') 372 | class_report = classification_report(actual_array, prediction_array, 373 | target_names=class_names) 374 | print(class_report) 375 | 376 | class_report_dict = classification_report(actual_array, prediction_array, 377 | target_names=class_names, 378 | output_dict=True) 379 | return (class_report, class_report_dict) 380 | 381 | 382 | def encode_labels(Y, le=None, enc=None): 383 | """Encodes target variables into numbers and then one hot encodings""" 384 | 385 | # initialize encoders 386 | N = Y.shape[0] 387 | 388 | # Encode the labels 389 | if le is None: 390 | le = preprocessing.LabelEncoder() 391 | Y_le = le.fit_transform(Y).reshape(N, 1) 392 | else: 393 | Y_le = le.transform(Y).reshape(N, 1) 394 | 395 | # convert into one hot encoding 396 | if enc is None: 397 | enc = preprocessing.OneHotEncoder() 398 | Y_enc = enc.fit_transform(Y_le).toarray() 399 | else: 400 | Y_enc = enc.transform(Y_le).toarray() 401 | 402 | # return encoders to re-use on other data 403 | return Y_enc, le, enc 404 | 405 | 406 | def simple_encoding(Y, le=None): 407 | """Encodes target variables into numbers""" 408 | 409 | # initialize encoders 410 | N = Y.shape[0] 411 | 412 | # Encode the labels 413 | if le is None: 414 | le = preprocessing.LabelEncoder() 415 | Y_le = le.fit_transform(Y) 416 | else: 417 | Y_le = le.transform(Y) 418 | 419 | # return encoders to re-use on other data 420 | return Y_le, le 421 | 422 | 423 | if __name__ == '__main__': 424 | 425 | # configuration options 426 | create_data = True 427 | create_visuals = False 428 | save_visuals = False 429 | 430 | if create_data: 431 | create_dataset(artist_folder='artists', save_folder='song_data', 432 | sr=16000, n_mels=128, n_fft=2048, 433 | hop_length=512) 434 | 435 | if create_visuals: 436 | # Create spectrogram for a specific song 437 | visualize_spectrogram( 438 | 'artists/u2/The_Joshua_Tree/' + 439 | 
'02-I_Still_Haven_t_Found_What_I_m_Looking_For.mp3', 440 | offset=60, duration=29.12) 441 | 442 | # Create spectrogram subplots 443 | create_spectrogram_plots(artist_folder='artists', sr=16000, n_mels=128, 444 | n_fft=2048, hop_length=512) 445 | if save_visuals: 446 | plt.savefig(os.path.join('spectrograms.png'), 447 | bbox_inches="tight") 448 | --------------------------------------------------------------------------------