├── src
    ├── __init__.py
    ├── trainer.py
    ├── models.py
    └── utility.py
├── images
    ├── artists.PNG
    ├── crnn_arch.png
    └── representation_313.png
├── representation_output
    └── 313.png
├── .gitignore
├── requirements.txt
├── metrics
    ├── trials_song_split
    │   ├── 911_score.csv
    │   ├── 157_score.csv
    │   ├── 313_score.csv
    │   ├── 32_pooled_score.csv
    │   ├── 628_score.csv
    │   ├── 94_score.csv
    │   ├── 157_pooled_score.csv
    │   ├── 313_pooled_score.csv
    │   ├── 32_score.csv
    │   ├── 628_pooled_score.csv
    │   ├── 911_pooled_score.csv
    │   ├── 94_pooled_score.csv
    │   └── summary.csv
    └── trials_album_split
    │   ├── 188_score.csv
    │   ├── 313_score.csv
    │   ├── 628_score.csv
    │   ├── 911_score.csv
    │   ├── 157_pooled_score.csv
    │   ├── 157_score.csv
    │   ├── 188_pooled_score.csv
    │   ├── 313_pooled_score.csv
    │   ├── 32_pooled_score.csv
    │   ├── 32_score.csv
    │   ├── 628_pooled_score.csv
    │   ├── 911_pooled_score.csv
    │   ├── 94_pooled_score.csv
    │   ├── 94_score.csv
    │   └── summary.csv
├── main.py
├── representation.py
└── README.md


/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/images/artists.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/images/artists.PNG


--------------------------------------------------------------------------------
/images/crnn_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/images/crnn_arch.png


--------------------------------------------------------------------------------
/images/representation_313.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/images/representation_313.png


--------------------------------------------------------------------------------
/representation_output/313.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZainNasrullah/music-artist-classification-crnn/HEAD/representation_output/313.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | data/
 2 | 
 3 | song_data/
 4 | 
 5 | artists/
 6 | 
 7 | __pycache__/
 8 | 
 9 | weights/
10 | 
11 | .vscode/
12 | 
13 | .idea/
14 | 
15 | song_split/
16 | 
17 | weights_album_split/
18 | 
19 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | dill==0.2.8.2
 2 | h5py==2.8.0
 3 | Keras==2.1.1
 4 | librosa==0.5.1
 5 | matplotlib==2.2.3
 6 | numpy==1.14.5
 7 | pandas==0.23.4
 8 | scikit-learn==0.20.0
 9 | scipy==1.1.0
10 | seaborn==0.9.0
11 | tensorflow==1.10.0
12 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/911_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.7714753542029786,0.7810249187225589,0.7712287712287712,1001
3 | 1,0.7602856755984815,0.7875472879313269,0.7606490872210954,986
4 | 2,0.7597119233703686,0.7813017462550562,0.75564681724846,974
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/188_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.5336579293828763,0.5527614261521365,0.53440150801131,9549
3 | 1,0.5754112627511248,0.6062769249986311,0.5813156440022111,9045
4 | 2,0.5480376088212674,0.5593886068971844,0.559202948629348,8682
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/313_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.5317990069658532,0.5656616496932699,0.5288323782234957,5584
3 | 1,0.5599974125230504,0.6000080968256013,0.5703745743473326,5286
4 | 2,0.5213751068019872,0.5442140772769193,0.5266167192429022,5072
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/628_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.5177083023387815,0.5440327943176324,0.5284615384615384,2600
3 | 1,0.5323742070146015,0.5766902659555373,0.5464547677261614,2454
4 | 2,0.5534135315010184,0.5805494326802327,0.5696686491079015,2354
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/911_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.6105420527168003,0.6191343930830404,0.616258218768679,1673
3 | 1,0.58778564005534,0.6061265778766737,0.6030245746691871,1587
4 | 2,0.6115240756462318,0.6308803553721625,0.6116951379763469,1522
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/157_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.7753273575553421,0.7860847795988812,0.7748499487629923,6831
3 | 1,0.7561801507833547,0.7722441849743193,0.7556346381969158,6744
4 | 2,0.778755140952637,0.7939222330670741,0.7763649962602842,6685
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/313_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.7659461155323102,0.7731585831005727,0.7658037326911499,3322
3 | 1,0.7870843721212496,0.8135325716589318,0.7833384192859323,3277
4 | 2,0.7620437480948646,0.7747989695444815,0.7607626076260763,3252
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/32_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.92927347408281,0.9370696400625979,0.9295774647887324,142
3 | 1,0.9441284068044632,0.9552915785310151,0.9436619718309859,142
4 | 2,0.9149622513225183,0.9231873564620043,0.9154929577464789,142
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/628_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.7711982461581101,0.7831310195769803,0.7692307692307693,1547
3 | 1,0.7918580389638201,0.8154197330843526,0.7921311475409836,1525
4 | 2,0.7417110965045544,0.765184565847942,0.7389292795769994,1513
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/94_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.768399662232993,0.7724971282977493,0.7694172655641182,11549
3 | 1,0.7651360211105698,0.7789145583714601,0.7636012636012636,11396
4 | 2,0.7617449335919688,0.7661181363406381,0.7621616840615603,11306
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/157_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.661845763821731,0.6688576628535463,0.6733067729083665,251
3 | 1,0.6434829386418183,0.7183564677235562,0.6582278481012658,237
4 | 2,0.6507806944832459,0.6800388476152669,0.6724890829694323,229
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/157_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.5445861110359475,0.5627585991416229,0.5459126053340283,11511
3 | 1,0.5496279761721112,0.5822215121871555,0.5524045521292217,10896
4 | 2,0.5152640898705974,0.5213126422530422,0.5322241346337732,10458
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/188_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.6324398063292215,0.6512175346495959,0.6414342629482072,251
3 | 1,0.7274046015856109,0.7304983694224201,0.7468354430379747,237
4 | 2,0.6711582489648036,0.6990221376363169,0.6855895196506551,229
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/313_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.6280262020511764,0.668418235151303,0.6374501992031872,251
3 | 1,0.6828608673419323,0.7534652552895014,0.7046413502109705,237
4 | 2,0.5787682154102902,0.6177462776564403,0.5982532751091703,229
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/32_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.6976793629830077,0.709440735443227,0.7131474103585658,251
3 | 1,0.7001895989658906,0.7454980439157654,0.7172995780590717,237
4 | 2,0.5251844415655749,0.5064950057385873,0.5720524017467249,229
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/32_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.5039120603890946,0.5250239196433978,0.505807230994909,57945
3 | 1,0.5158189416004764,0.5372896418026213,0.5247033952360992,54871
4 | 2,0.42568326049526756,0.44523432680319114,0.43406729106901987,52637
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/628_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.6009689686352416,0.6461849636403966,0.6169354838709677,248
3 | 1,0.5488787231935612,0.5749714035440219,0.5822784810126582,237
4 | 2,0.6087626196793897,0.6752419705476473,0.62882096069869,229
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/911_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.6879468274299179,0.7079761618569418,0.6963562753036437,247
3 | 1,0.6437505478649044,0.6773717229702891,0.6779661016949152,236
4 | 2,0.6914103422292879,0.7144843389038436,0.6973684210526315,228
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/94_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.6494714509753788,0.6679470595805257,0.6653386454183267,251
3 | 1,0.6495389158019867,0.6235012622940279,0.6962025316455697,237
4 | 2,0.6533845147061176,0.7167330805750641,0.6681222707423581,229
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/94_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.5255900545652935,0.5367921071986408,0.5300610977049853,19477
3 | 1,0.5270434151546429,0.5359543268750439,0.5452475193840481,18443
4 | 2,0.4864527164650928,0.5055720879627964,0.48672016274864377,17696
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/157_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.9297395009071415,0.9411971830985916,0.9295774647887324,142
3 | 1,0.909152033872415,0.9257331870007927,0.9084507042253521,142
4 | 2,0.9156774203475465,0.9295271629778671,0.9154929577464789,142
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/313_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.914872117760968,0.9234434384082272,0.9154929577464789,142
3 | 1,0.910919412907814,0.9322965571205009,0.9084507042253521,142
4 | 2,0.8808887995921965,0.8947221206024022,0.8802816901408451,142
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/32_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.7322581509235274,0.7352414233797548,0.7336924868957484,34340
3 | 1,0.7332756752128675,0.7428611923750827,0.7337836641986962,33901
4 | 2,0.7223202438547724,0.7277711185921146,0.7215829691077216,33633
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/628_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.8799518591047174,0.8931030628213726,0.8802816901408451,142
3 | 1,0.8705806804812605,0.9172986637775372,0.8732394366197183,142
4 | 2,0.832684276800277,0.8593459928566313,0.8297872340425532,141
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/911_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.8426366982160408,0.8562704041427446,0.8439716312056738,141
3 | 1,0.8437117678418424,0.8739672885799646,0.8380281690140845,142
4 | 2,0.8505825433547586,0.8825033319714171,0.8439716312056738,141
5 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/94_pooled_score.csv:
--------------------------------------------------------------------------------
1 | ,f1-score,precision,recall,support
2 | 0,0.9237024907799558,0.9350700160559318,0.9225352112676056,142
3 | 1,0.9656059350756949,0.9709926224010731,0.9647887323943662,142
4 | 2,0.9218146712031071,0.9329962095807166,0.9225352112676056,142
5 | 


--------------------------------------------------------------------------------
/metrics/trials_album_split/summary.csv:
--------------------------------------------------------------------------------
1 | Average F1,32,94,157,313,628,911
2 | Frame,0.482,0.513,0.536,0.538,0.534,0.603
3 | Song,0.516,0.527,0.550,0.560,0.553,0.612
4 | ,,,,,,
5 | Maximum F1,32,94,157,313,628,911
6 | Frame,0.641,0.651,0.652,0.630,0.568,0.674
7 | Song,0.700,0.653,0.662,0.683,0.609,0.691
8 | 


--------------------------------------------------------------------------------
/metrics/trials_song_split/summary.csv:
--------------------------------------------------------------------------------
1 | Average F1,32,94,157,313,628,911
2 | Frame,0.729,0.765,0.770,0.772,0.768,0.764
3 | Song,0.930,0.937,0.918,0.902,0.861,0.846
4 | ,,,,,,
5 | Maximum F1,32,94,157,313,628,911
6 | Frame,0.733,0.768,0.779,0.787,0.792,0.772
7 | Song,0.944,0.966,0.930,0.915,0.880,0.851
8 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Tue Nov  7 13:05:16 2017
 4 | Updated on Nov 14 2017
 5 | @author: Zain
 6 | """
 7 | import os
 8 | import pandas as pd
 9 | import gc
10 | 
11 | import src.trainer as trainer
12 | 
if __name__ == '__main__':

    # Clip duration -> number of spectrogram frames per slice:
    #   1s      32 frames
    #   3s      94 frames
    #   5s      157 frames
    #   6s      188 frames
    #   10s     313 frames
    #   20s     628 frames
    #   29.12s  911 frames
    frame_counts = [911, 628, 313, 157, 94, 32]
    seeds = [0, 21, 42]
    n_trials = 3
    out_dir = 'trials_song_split'

    # Options that stay the same for every training run in the sweep.
    fixed_options = dict(
        nb_classes=20,
        lr=0.001,
        train=True,
        load_checkpoint=True,
        plots=False,
        album_split=False,
        save_metrics=True,
        save_metrics_folder='metrics_song_split',
        save_weights_folder='weights_song_split',
    )

    for n_frames in frame_counts:

        # One 'weighted avg' row per trial: per-slice and pooled per song.
        slice_rows = []
        pooled_rows = []

        for trial in range(n_trials):
            report, pooled_report = trainer.train_model(
                slice_length=n_frames,
                random_states=seeds[trial],
                **fixed_options)

            slice_rows.append(report['weighted avg'])
            pooled_rows.append(pooled_report['weighted avg'])

            # Free whatever the finished run left behind before the next one.
            gc.collect()

        os.makedirs(out_dir, exist_ok=True)

        csv_prefix = '{}/{}'.format(out_dir, n_frames)
        pd.DataFrame(slice_rows).to_csv(csv_prefix + '_score.csv')
        pd.DataFrame(pooled_rows).to_csv(csv_prefix + '_pooled_score.csv')
59 | 


--------------------------------------------------------------------------------
/representation.py:
--------------------------------------------------------------------------------
  1 | import gc
  2 | import os
  3 | from os.path import isfile
  4 | 
  5 | import matplotlib.pyplot as plt
  6 | import pandas as pd
  7 | import numpy as np
  8 | import seaborn as sns
  9 | from keras.optimizers import Adam
 10 | from sklearn.manifold import TSNE
 11 | 
 12 | import src.models as models
 13 | import src.utility as utility
 14 | 
 15 | if __name__ == '__main__':
 16 | 
 17 |     # set these parameters
 18 |     random_states = 21
 19 |     slice_length = 313
 20 |     checkpoint_path = 'weights/20_313_21'
 21 | 
 22 |     # leave as-is
 23 |     load_checkpoint = True
 24 |     nb_classes = 20
 25 |     folder = 'song_data'
 26 |     lr = 0.0001  # not used
 27 |     ensemble_visual = False  # average out representations at the song level
 28 |     save_path = 'representation_output/'
 29 | 
 30 |     # Load the song data and split into train and test sets at song level
 31 |     print("Loading data for {}".format(slice_length))
 32 |     Y, X, S = utility.load_dataset(song_folder_name=folder,
 33 |                                    nb_classes=nb_classes,
 34 |                                    random_state=random_states)
 35 |     X, Y, S = utility.slice_songs(X, Y, S, length=slice_length)
 36 | 
 37 |     # Reshape data as 2d convolutional tensor shape
 38 |     X_shape = X.shape + (1,)
 39 |     X = X.reshape(X_shape)
 40 | 
 41 |     # encode Y
 42 |     Y_original = Y
 43 |     Y, le, enc = utility.encode_labels(Y)
 44 | 
 45 |     # build the model
 46 |     model = models.CRNN2D(X.shape, nb_classes=Y.shape[1])
 47 |     model.compile(loss='categorical_crossentropy',
 48 |                   optimizer=Adam(lr=lr),
 49 |                   metrics=['accuracy'])
 50 | 
 51 |     # Initialize weights using checkpoint if it exists
 52 |     if isfile(checkpoint_path):
 53 |         print('Checkpoint file detected. Loading weights.')
 54 |         model.load_weights(checkpoint_path)
 55 |     else:
 56 |         raise Exception('no checkpoint for {}'.format(checkpoint_path))
 57 | 
 58 |     # drop final dense layer and activation
 59 |     print("Modifying model and predicting representation")
 60 |     model.pop()
 61 |     model.pop()
 62 |     model.summary()
 63 | 
 64 |     # predict representation
 65 |     print("Predicting")
 66 |     X_rep = model.predict(X)
 67 | 
 68 |     print("Garbage collection")
 69 |     del X
 70 |     gc.collect()
 71 | 
 72 |     if ensemble_visual:
 73 |         songs = np.unique(S)
 74 |         X_song = np.zeros((songs.shape[0], X_rep.shape[1]))
 75 |         Y_song = np.empty((songs.shape[0]), dtype="S10")
 76 |         for i, song in enumerate(songs):
 77 |             xs = X_rep[S == song]
 78 |             Y_song[i] = Y_original[S == song][0]
 79 |             X_song[i, :] = np.mean(xs, axis=0)
 80 | 
 81 |         X_rep = X_song
 82 |         Y_original = Y_song
 83 | 
 84 |     # fit tsne
 85 |     print("Fitting TSNE {}".format(X_rep.shape))
 86 |     tsne_model = TSNE()
 87 |     X_2d = tsne_model.fit_transform(X_rep)
 88 | 
 89 |     # save results
 90 |     print("Saving results")
 91 |     os.makedirs(save_path, exist_ok=True)
 92 |     save_path += str(checkpoint_path.split('_')[1])
 93 |     if ensemble_visual:
 94 |         save_path += '_ensemble'
 95 | 
 96 |     pd.DataFrame({'x0': X_2d[:, 0], 'x1': X_2d[:, 1],
 97 |                   'label': Y_original}).to_csv(
 98 |         save_path + '.csv', index=False)
 99 | 
100 |     # save figure
101 |     sns.set_palette("Paired", n_colors=20)
102 |     plt.figure(figsize=(20, 20))
103 |     sns.scatterplot(x=X_2d[:, 0], y=X_2d[:, 1],
104 |                     hue=Y_original, palette=sns.color_palette(n_colors=20))
105 |     plt.savefig(save_path + '.png')
106 | 
107 |     del Y, S, X_rep, X_2d, Y_original
108 | 


--------------------------------------------------------------------------------
/src/trainer.py:
--------------------------------------------------------------------------------
  1 | import src.utility as utility
  2 | import src.models as models
  3 | 
  4 | import os
  5 | import numpy as np
  6 | import matplotlib.pyplot as plt
  7 | from os.path import isfile
  8 | 
  9 | from keras.callbacks import ModelCheckpoint, EarlyStopping
 10 | from keras.optimizers import Adam
 11 | 
 12 | from sklearn.metrics import confusion_matrix, classification_report
 13 | 
 14 | 
 15 | def train_model(nb_classes=20,
 16 |                 slice_length=911,
 17 |                 artist_folder='artists',
 18 |                 song_folder='song_data',
 19 |                 plots=True,
 20 |                 train=True,
 21 |                 load_checkpoint=False,
 22 |                 save_metrics=True,
 23 |                 save_metrics_folder='metrics',
 24 |                 save_weights_folder='weights',
 25 |                 batch_size=16,
 26 |                 nb_epochs=200,
 27 |                 early_stop=10,
 28 |                 lr=0.0001,
 29 |                 album_split=True,
 30 |                 random_states=42):
 31 |     """
 32 |     Main function for training the model and testing
 33 |     """
 34 | 
 35 |     weights = os.path.join(save_weights_folder, str(nb_classes) +
 36 |                            '_' + str(slice_length) + '_' + str(random_states))
 37 |     os.makedirs(save_weights_folder, exist_ok=True)
 38 |     os.makedirs(save_metrics_folder, exist_ok=True)
 39 | 
 40 |     print("Loading dataset...")
 41 | 
 42 |     if not album_split:
 43 |         # song split
 44 |         Y_train, X_train, S_train, Y_test, X_test, S_test, \
 45 |         Y_val, X_val, S_val = \
 46 |             utility.load_dataset_song_split(song_folder_name=song_folder,
 47 |                                             artist_folder=artist_folder,
 48 |                                             nb_classes=nb_classes,
 49 |                                             random_state=random_states)
 50 |     else:
 51 |         Y_train, X_train, S_train, Y_test, X_test, S_test, \
 52 |         Y_val, X_val, S_val = \
 53 |             utility.load_dataset_album_split(song_folder_name=song_folder,
 54 |                                              artist_folder=artist_folder,
 55 |                                              nb_classes=nb_classes,
 56 |                                              random_state=random_states)
 57 | 
 58 |     print("Loaded and split dataset. Slicing songs...")
 59 | 
 60 |     # Create slices out of the songs
 61 |     X_train, Y_train, S_train = utility.slice_songs(X_train, Y_train, S_train,
 62 |                                                     length=slice_length)
 63 |     X_val, Y_val, S_val = utility.slice_songs(X_val, Y_val, S_val,
 64 |                                               length=slice_length)
 65 |     X_test, Y_test, S_test = utility.slice_songs(X_test, Y_test, S_test,
 66 |                                                  length=slice_length)
 67 | 
 68 |     print("Training set label counts:", np.unique(Y_train, return_counts=True))
 69 | 
 70 |     # Encode the target vectors into one-hot encoded vectors
 71 |     Y_train, le, enc = utility.encode_labels(Y_train)
 72 |     Y_test, le, enc = utility.encode_labels(Y_test, le, enc)
 73 |     Y_val, le, enc = utility.encode_labels(Y_val, le, enc)
 74 | 
 75 |     # Reshape data as 2d convolutional tensor shape
 76 |     X_train = X_train.reshape(X_train.shape + (1,))
 77 |     X_val = X_val.reshape(X_val.shape + (1,))
 78 |     X_test = X_test.reshape(X_test.shape + (1,))
 79 | 
 80 |     # build the model
 81 |     model = models.CRNN2D(X_train.shape, nb_classes=Y_train.shape[1])
 82 |     model.compile(loss='categorical_crossentropy',
 83 |                   optimizer=Adam(lr=lr),
 84 |                   metrics=['accuracy'])
 85 |     model.summary()
 86 | 
 87 |     # Initialize weights using checkpoint if it exists
 88 |     if load_checkpoint:
 89 |         print("Looking for previous weights...")
 90 |         if isfile(weights):
 91 |             print('Checkpoint file detected. Loading weights.')
 92 |             model.load_weights(weights)
 93 |         else:
 94 |             print('No checkpoint file detected.  Starting from scratch.')
 95 |     else:
 96 |         print('Starting from scratch (no checkpoint)')
 97 | 
 98 |     checkpointer = ModelCheckpoint(filepath=weights,
 99 |                                    verbose=1,
100 |                                    save_best_only=True)
101 |     earlystopper = EarlyStopping(monitor='val_loss', min_delta=0,
102 |                                  patience=early_stop, verbose=0, mode='auto')
103 | 
104 |     # Train the model
105 |     if train:
106 |         print("Input Data Shape", X_train.shape)
107 |         history = model.fit(X_train, Y_train, batch_size=batch_size,
108 |                             shuffle=True, epochs=nb_epochs,
109 |                             verbose=1, validation_data=(X_val, Y_val),
110 |                             callbacks=[checkpointer, earlystopper])
111 |         if plots:
112 |             utility.plot_history(history)
113 | 
114 |     # Load weights that gave best performance on validation set
115 |     model.load_weights(weights)
116 |     filename = os.path.join(save_metrics_folder, str(nb_classes) + '_'
117 |                             + str(slice_length)
118 |                             + '_' + str(random_states) + '.txt')
119 | 
120 |     # Score test model
121 |     score = model.evaluate(X_test, Y_test, verbose=0)
122 |     y_score = model.predict_proba(X_test)
123 | 
124 |     # Calculate confusion matrix
125 |     y_predict = np.argmax(y_score, axis=1)
126 |     y_true = np.argmax(Y_test, axis=1)
127 |     cm = confusion_matrix(y_true, y_predict)
128 | 
129 |     # Plot the confusion matrix
130 |     class_names = np.arange(nb_classes)
131 |     class_names_original = le.inverse_transform(class_names)
132 |     plt.figure(figsize=(14, 14))
133 |     utility.plot_confusion_matrix(cm, classes=class_names_original,
134 |                                   normalize=True,
135 |                                   title='Confusion matrix with normalization')
136 |     if save_metrics:
137 |         plt.savefig(filename + '.png', bbox_inches="tight")
138 |     plt.close()
139 |     plt.figure(figsize=(14, 14))
140 | 
141 |     # Print out metrics
142 |     print('Test score/loss:', score[0])
143 |     print('Test accuracy:', score[1])
144 |     print('\nTest results on each slice:')
145 |     scores = classification_report(y_true, y_predict,
146 |                                    target_names=class_names_original)
147 |     scores_dict = classification_report(y_true, y_predict,
148 |                                         target_names=class_names_original,
149 |                                         output_dict=True)
150 |     print(scores)
151 | 
152 |     # Predict artist using pooling methodology
153 |     pooling_scores, pooled_scores_dict = \
154 |         utility.predict_artist(model, X_test, Y_test, S_test,
155 |                                le, class_names=class_names_original,
156 |                                slices=None, verbose=False)
157 | 
158 |     # Save metrics
159 |     if save_metrics:
160 |         plt.savefig(filename + '_pooled.png', bbox_inches="tight")
161 |         plt.close()
162 |         with open(filename, 'w') as f:
163 |             f.write("Training data shape:" + str(X_train.shape))
164 |             f.write('\nnb_classes: ' + str(nb_classes) +
165 |                     '\nslice_length: ' + str(slice_length))
166 |             f.write('\nweights: ' + weights)
167 |             f.write('\nlr: ' + str(lr))
168 |             f.write('\nTest score/loss: ' + str(score[0]))
169 |             f.write('\nTest accuracy: ' + str(score[1]))
170 |             f.write('\nTest results on each slice:\n')
171 |             f.write(str(scores))
172 |             f.write('\n\n Scores when pooling song slices:\n')
173 |             f.write(str(pooling_scores))
174 | 
175 |     return (scores_dict, pooled_scores_dict)
176 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Musical Artist Classification with Convolutional Recurrent Neural Networks
  2 | 
  3 | Nasrullah, Z. and Zhao, Y., Musical Artist Classification with Convolutional Recurrent Neural Networks. *International Joint Conference on Neural Networks (IJCNN)*, 2019.
  4 | 
  5 | Please cite the paper as:
  6 | 
  7 |     @inproceedings{nasrullah2019music,
  8 |       author={Nasrullah, Zain and Zhao, Yue},
  9 |       title={Musical Artist Classification with Convolutional Recurrent Neural Networks},
 10 |       booktitle={2019 International Joint Conference on Neural Networks (IJCNN)},
 11 |       year={2019},
 12 |       month={July}
 13 |       pages={1-8},
 14 |       doi={10.1109/IJCNN.2019.8851988},
 15 |       organization={IEEE}
 16 |     }
 17 |         
 18 |  [PDF for Personal Use](http://arxiv.org/abs/1901.04555) | [IEEE Xplore](https://ieeexplore.ieee.org/document/8851988)
 19 | 
 20 | 
 21 | ------------
 22 | 
 23 | 
 24 | ## Introduction
 25 | Previous attempts at music artist classification use frame level audio features which summarize frequency content within short intervals of time. Comparatively, more recent music information retrieval tasks take advantage of temporal structure in audio spectrograms using deep convolutional and recurrent models. This paper revisits artist classification with this new framework and empirically explores the impacts of incorporating temporal structure in the feature representation. To this end, an established classification architecture, a Convolutional Recurrent Neural Network (CRNN), is applied to the artist20 music artist identification dataset under a comprehensive set of conditions. These include audio clip length, which is a novel contribution in this work, and previously identified considerations such as dataset split and feature level. Our results improve upon baseline works, verify the influence of the producer effect on classification performance and demonstrate the trade-offs between audio length and training set size. The best performing model achieves an average F1 score of 0.937 across three independent trials which is a substantial improvement over the corresponding baseline under similar conditions. Additionally, to showcase the effectiveness of the CRNN's feature extraction capabilities, we visualize audio samples at the model's bottleneck layer demonstrating that learned representations segment into clusters belonging to their respective artists.
 26 | 
 27 | 
 28 | ![Convolutional Recurrent Neural Network](https://github.com/ZainNasrullah/music-artist-classification-crnn/blob/master/images/crnn_arch.png)
 29 | 
 30 | 
 31 | ## Dependency
  32 | The experiment code is written in Python 3.6 and built on a number of Python packages including (but not limited to):
  33 | - dill==0.2.8.2
  34 | - h5py==2.8.0
  35 | - Keras==2.1.1
  36 | - librosa==0.5.1
  37 | - matplotlib==2.2.3
  38 | - numpy==1.14.5
  39 | - pandas==0.23.4
  40 | - scikit-learn==0.20.0
  41 | - scipy==1.1.0
  42 | - seaborn==0.9.0
  43 | - tensorflow==1.10.0
 44 | 
 45 | 
 46 | Batch installation is possible using the supplied "requirements.txt" with pip or conda.
 47 | 
 48 | ````cmd
 49 | pip install -r requirements.txt
 50 | ````
 51 | 
 52 | Additional install details (recommended for replication and strong performance):
 53 | - Python: 3.6.6
 54 | - GPU: Nvidia GTX 1080 (Driver: 390.87)
 55 | - CUDA: 8.0
 56 | - CUDNN: 7.0.5
 57 | - [ffmpeg](http://ffmpeg.org/download.html) is required by Librosa to convert audio files into spectrograms. 
 58 | 
 59 | 
 60 | ## Datasets
 61 | 
 62 | This study primarily uses the artist20 musical artist identification dataset by Labrosa [1]. The data is accessible upon request from https://labrosa.ee.columbia.edu/projects/artistid/.
 63 | 
 64 | The main characteristics of the dataset can be summarized as:
 65 | 
 66 | |Property           | Value   |
 67 | |-------------------|---------|
 68 | |# of Tracks        | 1,413   |
 69 | |# of Artists       | 20      |
 70 | |Albums per Artist  | 6       | 
 71 | |Bitrate            | 32 kbps |
 72 | |Sample Rate        | 16 kHz  |
 73 | |Channels           | Mono    |
 74 | 
 75 | The figure below visualizes three seconds of the mel-scaled audio spectrogram for a randomly sampled song from each artist. This is the primary data representation used in the paper. 
 76 | 
  77 | ![Three seconds of the mel-scaled spectrogram for a randomly sampled song from each artist](https://github.com/ZainNasrullah/music-artist-classification-crnn/blob/master/images/artists.PNG)
 78 | 
 79 | ## Usage
 80 | 
 81 | To re-create experimental results:
 82 | 
 83 | - Prepare mel-scaled spectrograms from raw audio in the dataset.
 84 |     - Run src/utility.py if the dataset is stored using its original folder structure (artists/[artist]/[album]/[song].mp3) in the project root.
  85 |     - Use the create_dataset() utility function in src/utility.py with a custom directory if the dataset is stored elsewhere.
 86 | - Run the main.py script. This will begin a training loop which runs three independent trials for each audio length in {1s, 3s, 5s, 10s, 20s, 30s}.
 87 |     - This script must be adjusted manually to vary whether or not to use an album split via the album_split flag in the train_model function call. 
  88 |     - It should be noted that training each model is computationally expensive and can take several hours even with reliable hardware. At minimum, an Nvidia GTX 1080 GPU is recommended with at least 16GB of memory on the machine.  
 89 | - To reproduce the representation visualization, the representation.py script can be used but one must specify the model weight location and relevant audio clip length. 
 90 | 
 91 | The models and utility functions provided can also generically be used for any audio-based classification task where one wants to experiment with audio length. The train_model function in src/trainer.py is fairly extensive. 
 92 | 
 93 | ## Results
 94 | 
 95 | Classification performance is evaluated using the test F1-score of three independent trials and also varying parameters such as audio length {1s, 3s, 5s, 10s, 20s, 30s}, the type of dataset split {song-level, album-level} and feature-level {frame-level, song-level}. Both the average and maximum score are reported among the trials. 
 96 | 
  97 | As a whole, from the four base conditions resulting from audio split and level, the CRNN model outperforms the most comparable baseline for at least one audio clip length. This holds true for both the best and average case performance except for the album split with song-level features where the CRNN model only outperforms in its best-run. This discrepancy may be explained by considering that Mandel's dataset contains fewer classes or because, unlike the baseline works, we are additionally reporting the average of three independent trials instead of performance on a single trial. 
 98 | 
 99 | *Test F1 Scores for Frame-level Audio Features (3 runs):*
100 | 
101 | |Split | Type    | 1s     | 3s    | 5s    | 10s   | 20s   | 30s      | 
102 | |------|---------|--------|-------|-------|-------|-------|----------|
103 | |Song  | Average | 0.729  | 0.765 | 0.770 | **0.787** | 0.768 | 0.764|
104 | |Song  | Best    | 0.733  | 0.768 | 0.779 | 0.772 | **0.792** | 0.771|
105 | |Album | Average | 0.482  | 0.513 | 0.536 | 0.538 | 0.534 | **0.603**|
106 | |Album | Best    | 0.516  | 0.527 | 0.550 | 0.560 | 0.553 | **0.612**|
107 | 
108 | *Test F1 Scores for Song-level Audio Features (3 runs):*
109 | 
110 | |Split | Type    | 1s    | 3s        | 5s    | 10s   | 20s   | 30s  | 
111 | |------|---------|-------|-----------|-------|-------|-------|------|
112 | |Song  | Average | 0.929 | **0.937** | 0.918 | 0.902 | 0.861 | 0.846|
113 | |Song  | Best    | 0.944 | **0.966** | 0.930 | 0.915 | 0.880 | 0.851|
114 | |Album | Average | 0.641 | 0.651 | 0.652 | 0.630 | 0.568 | **0.674**|
115 | |Album | Best    | **0.700** | 0.653 | 0.662 | 0.683 | 0.609 | 0.691|
116 | 
117 | Additionally, audio samples at the bottleneck layer of the network are also visualized using t-SNE to demonstrate how effectively the model is able to learn to classify artists. As can be seen below, the learned representations prior to classification separate into distinct clusters belonging to each artist demonstrating that the convolution and recurrent layers are effective at the task. The example below is for the model trained on 10s of audio.  
118 | 
119 | ![Learned representations at bottleneck layer of network (10s)](https://github.com/ZainNasrullah/music-artist-classification-crnn/blob/master/images/representation_313.png)
120 | 
121 | ## Conclusions
 122 | This paper establishes a deep learning baseline for music artist classification on the ***artist20*** dataset and demonstrates that a Convolutional Recurrent Neural Network is able to outperform traditional baselines under a range of conditions. The results show that including additional temporal structure in an audio sample improves classification performance and also that there is a point beyond which the returns may diminish. This is attributed to a possible lack of complexity in the model or early pooling layers discarding too much information. Using the trained models, predictions are also aggregated at the song level using a majority vote to determine the artist performing a song. This leads to another substantial gain in performance and validates the feasibility of using a CRNN for industry applications such as copyright detection. The best-performing model is trained using three second audio samples under a song dataset split and evaluated at the song level to achieve an average F1 score of 0.937 across three independent trials. Additionally, we visualize audio samples at the bottleneck layer of the network to show that learned representations cluster by artist — highlighting the model's capability as a feature extractor. Future directions include audio augmentation, model pre-training and minimizing temporal pooling as avenues for further performance improvement.  
123 | 
124 | ## References
125 | 
126 | [1] D. Ellis (2007). Classifying Music Audio with Timbral and Chroma Features,
127 | *Proc. Int. Conf. on Music Information Retrieval (ISMIR)*, Vienna, Austria, Sep. 2007.
128 | 


--------------------------------------------------------------------------------
/src/models.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sat Nov 11 11:23:13 2017
  4 | Updated on Nov 14 2017
  5 | @author: Zain
  6 | """
  7 | 
  8 | from keras.models import Sequential
  9 | from keras.layers import Dense, Dropout, Activation, Reshape, Permute
 10 | from keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D
 11 | from keras.layers.normalization import BatchNormalization
 12 | from keras.layers.recurrent import GRU, LSTM
 13 | 
 14 | 
 15 | def CRNN2D(X_shape, nb_classes):
 16 |     '''
 17 |     Model used for evaluation in paper. Inspired by K. Choi model in:
 18 |     https://github.com/keunwoochoi/music-auto_tagging-keras/blob/master/music_tagger_crnn.py
 19 |     '''
 20 | 
 21 |     nb_layers = 4  # number of convolutional layers
 22 |     nb_filters = [64, 128, 128, 128]  # filter sizes
 23 |     kernel_size = (3, 3)  # convolution kernel size
 24 |     activation = 'elu'  # activation function to use after each layer
 25 |     pool_size = [(2, 2), (4, 2), (4, 2), (4, 2),
 26 |                  (4, 2)]  # size of pooling area
 27 | 
 28 |     # shape of input data (frequency, time, channels)
 29 |     input_shape = (X_shape[1], X_shape[2], X_shape[3])
 30 |     frequency_axis = 1
 31 |     time_axis = 2
 32 |     channel_axis = 3
 33 | 
 34 |     # Create sequential model and normalize along frequency axis
 35 |     model = Sequential()
 36 |     model.add(BatchNormalization(axis=frequency_axis, input_shape=input_shape))
 37 | 
 38 |     # First convolution layer specifies shape
 39 |     model.add(Conv2D(nb_filters[0], kernel_size=kernel_size, padding='same',
 40 |                      data_format="channels_last",
 41 |                      input_shape=input_shape))
 42 |     model.add(Activation(activation))
 43 |     model.add(BatchNormalization(axis=channel_axis))
 44 |     model.add(MaxPooling2D(pool_size=pool_size[0], strides=pool_size[0]))
 45 |     model.add(Dropout(0.1))
 46 | 
 47 |     # Add more convolutional layers
 48 |     for layer in range(nb_layers - 1):
 49 |         # Convolutional layer
 50 |         model.add(Conv2D(nb_filters[layer + 1], kernel_size=kernel_size,
 51 |                          padding='same'))
 52 |         model.add(Activation(activation))
 53 |         model.add(BatchNormalization(
 54 |             axis=channel_axis))  # Improves overfitting/underfitting
 55 |         model.add(MaxPooling2D(pool_size=pool_size[layer + 1],
 56 |                                strides=pool_size[layer + 1]))  # Max pooling
 57 |         model.add(Dropout(0.1))
 58 | 
 59 |         # Reshaping input for recurrent layer
 60 |     # (frequency, time, channels) --> (time, frequency, channel)
 61 |     model.add(Permute((time_axis, frequency_axis, channel_axis)))
 62 |     resize_shape = model.output_shape[2] * model.output_shape[3]
 63 |     model.add(Reshape((model.output_shape[1], resize_shape)))
 64 | 
 65 |     # recurrent layer
 66 |     model.add(GRU(32, return_sequences=True))
 67 |     model.add(GRU(32, return_sequences=False))
 68 |     model.add(Dropout(0.3))
 69 | 
 70 |     # Output layer
 71 |     model.add(Dense(nb_classes))
 72 |     model.add(Activation("softmax"))
 73 |     return model
 74 | 
 75 | 
 76 | ###############################################################################
 77 | '''
 78 | Models below this point were only pre-tested and were not presented in the paper
 79 | '''
 80 | 
 81 | 
 82 | ###############################################################################
 83 | 
 84 | def CRNN2DLarger(X_shape, nb_classes):
 85 |     '''
 86 |     Making the previous model larger and deeper
 87 |     '''
 88 |     nb_layers = 5  # number of convolutional layers
 89 |     nb_filters = [64, 128, 256, 512, 512]
 90 |     kernel_size = (3, 3)  # convolution kernel size
 91 |     activation = 'elu'  # activation function to use after each layer
 92 |     pool_size = [(2, 2), (2, 2), (2, 2), (4, 1),
 93 |                  (4, 1)]  # # size of pooling area
 94 |     # pool_size = [(4,2), (4,2), (4,1), (2,1)] this worked well
 95 | 
 96 |     # shape of input data (frequency, time, channels)
 97 |     input_shape = (X_shape[1], X_shape[2], X_shape[3])
 98 |     frequency_axis = 1
 99 |     time_axis = 2
100 |     channel_axis = 3
101 | 
102 |     # Create sequential model
103 |     model = Sequential()
104 |     model.add(BatchNormalization(axis=frequency_axis, input_shape=input_shape))
105 | 
106 |     # First convolution layer
107 |     model.add(Conv2D(nb_filters[0], kernel_size=kernel_size, padding='same',
108 |                      data_format="channels_last",
109 |                      input_shape=input_shape))
110 |     model.add(Activation(activation))
111 |     model.add(BatchNormalization(
112 |         axis=channel_axis))  # Improves overfitting/underfitting
113 |     model.add(MaxPooling2D(pool_size=pool_size[0],
114 |                            strides=pool_size[0]))  # Max pooling
115 |     model.add(Dropout(0.1))  # 0.2
116 | 
117 |     # Add more convolutional layers
118 |     for layer in range(nb_layers - 1):
119 |         # Convolutional layer
120 |         model.add(Conv2D(nb_filters[layer + 1], kernel_size=kernel_size,
121 |                          padding='same'))
122 |         model.add(Activation(activation))
123 |         model.add(BatchNormalization(
124 |             axis=channel_axis))  # Improves overfitting/underfitting
125 |         model.add(MaxPooling2D(pool_size=pool_size[layer + 1],
126 |                                strides=pool_size[layer + 1]))  # Max pooling
127 |         model.add(Dropout(0.1))  # 0.2
128 | 
129 |     # Reshaping input for recurrent layer
130 |     # (frequency, time, channels) --> (time, frequency, channel)
131 |     model.add(Permute((time_axis, frequency_axis, channel_axis)))
132 |     resize_shape = model.output_shape[2] * model.output_shape[3]
133 |     model.add(Reshape((model.output_shape[1], resize_shape)))
134 | 
135 |     # recurrent layer
136 |     model.add(GRU(32, return_sequences=True))
137 |     model.add(GRU(32, return_sequences=False))
138 |     model.add(Dropout(0.3))
139 | 
140 |     # Output layer
141 |     model.add(Dense(nb_classes))
142 |     model.add(Activation("softmax"))
143 |     return model
144 | 
145 | 
def CRNN2DVGG(X_shape, nb_classes):
    '''
    CRNN whose convolutional stack is based on the VGG-16 architecture.

    Each block stacks several conv -> activation -> batch norm units before
    a single max pool and dropout: two units for the 64- and 128-filter
    blocks, three for the wider ones.

    X_shape is the shape of the full input array; entries 1-3 are used as
    the per-sample (frequency, time, channels) input shape. nb_classes is
    the number of units in the softmax output layer. The returned
    Sequential model is not compiled.
    '''
    conv_filters = [64, 128, 256, 512, 512]  # filters per conv block
    # Pooling window per block (reused as the stride). The original author
    # noted that [(4,2), (4,2), (4,1), (2,1)] also worked well.
    conv_pools = [(2, 2), (2, 2), (2, 2), (4, 1), (4, 1)]
    kernel = (3, 3)
    nonlinearity = 'elu'

    # per-sample input shape: (frequency, time, channels)
    sample_shape = (X_shape[1], X_shape[2], X_shape[3])
    freq_ax, time_ax, chan_ax = 1, 2, 3

    net = Sequential()
    # normalize the raw input along the frequency axis
    net.add(BatchNormalization(axis=freq_ax, input_shape=sample_shape))

    # First block: both convolutions pass the input shape and data format.
    for _ in range(2):
        net.add(Conv2D(conv_filters[0], kernel_size=kernel, padding='same',
                       data_format="channels_last",
                       input_shape=sample_shape))
        net.add(Activation(nonlinearity))
        net.add(BatchNormalization(axis=chan_ax))
    net.add(MaxPooling2D(pool_size=conv_pools[0], strides=conv_pools[0]))
    net.add(Dropout(0.1))

    # Remaining blocks.
    for filters, pool in zip(conv_filters[1:], conv_pools[1:]):
        # the 128-filter block has two convolutions, wider blocks have three
        depth = 2 if filters == 128 else 3
        for _ in range(depth):
            net.add(Conv2D(filters, kernel_size=kernel, padding='same'))
            net.add(Activation(nonlinearity))
            net.add(BatchNormalization(axis=chan_ax))
        net.add(MaxPooling2D(pool_size=pool, strides=pool))
        net.add(Dropout(0.1))

    # Put time first, then merge frequency and channels so the recurrent
    # layers see a (time, features) sequence.
    net.add(Permute((time_ax, freq_ax, chan_ax)))
    _, steps, bins, channels = net.output_shape
    net.add(Reshape((steps, bins * channels)))

    # two stacked GRUs; the second keeps only its final output
    net.add(GRU(32, return_sequences=True))
    net.add(GRU(32, return_sequences=False))
    net.add(Dropout(0.2))

    # classifier
    net.add(Dense(nb_classes))
    net.add(Activation("softmax"))
    return net
228 | 
229 | 
def CRNN1D(X_shape, nb_classes):
    '''
    CRNN variant built on 1D convolutions.

    The input is first rearranged into a (time, features) sequence, passed
    through three Conv1D -> activation -> max pool stages, then through two
    stacked GRUs and a softmax classifier.

    X_shape is the shape of the full input array; entries 1-3 are used as
    the per-sample (frequency, time, channels) input shape. nb_classes is
    the number of units in the softmax output layer. The returned
    Sequential model is not compiled.
    '''
    stage_filters = (64, 128, 128)  # filters for each Conv1D stage
    kernel = 5
    nonlinearity = 'relu'
    pool = 2  # pooling window, also used as the stride

    # per-sample input shape: (frequency, time, channels)
    sample_shape = (X_shape[1], X_shape[2], X_shape[3])
    freq_ax, time_ax, chan_ax = 1, 2, 3

    net = Sequential()

    # Put time first and merge frequency with channels before convolving.
    net.add(Permute((time_ax, freq_ax, chan_ax), input_shape=sample_shape))
    _, steps, bins, channels = net.output_shape
    net.add(Reshape((steps, bins * channels)))

    # convolutional stages
    for filters in stage_filters:
        net.add(Conv1D(filters, kernel))
        net.add(Activation(nonlinearity))
        net.add(MaxPooling1D(pool_size=pool, strides=pool))

    # two stacked GRUs; the second keeps only its final output
    net.add(GRU(64, return_sequences=True))
    net.add(GRU(64, return_sequences=False))

    # classifier (the original author marked this layer as uncertain)
    net.add(Dense(nb_classes))
    net.add(Activation('softmax'))

    return net
277 | 
278 | 
def RNN(X_shape, nb_classes):
    '''
    Recurrent-only baseline: two stacked LSTMs followed by a softmax layer.

    X_shape is the shape of the full input array; entries 1-3 are used as
    the per-sample (frequency, time, channels) input shape. nb_classes is
    the number of units in the softmax output layer. The returned
    Sequential model is not compiled.
    '''
    # per-sample input shape: (frequency, time, channels)
    sample_shape = (X_shape[1], X_shape[2], X_shape[3])
    freq_ax, time_ax, chan_ax = 1, 2, 3

    net = Sequential()

    # Put time first and merge frequency with channels so the LSTMs see a
    # (time, features) sequence.
    net.add(Permute((time_ax, freq_ax, chan_ax), input_shape=sample_shape))
    _, steps, bins, channels = net.output_shape
    net.add(Reshape((steps, bins * channels)))

    # the second LSTM keeps only its final output
    net.add(LSTM(64, return_sequences=True))
    net.add(LSTM(64, return_sequences=False))

    # classifier (the original author marked this layer as uncertain)
    net.add(Dense(nb_classes))
    net.add(Activation('softmax'))

    return net
305 | 


--------------------------------------------------------------------------------
/src/utility.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import dill
  3 | import random
  4 | import itertools
  5 | 
  6 | import numpy as np
  7 | from numpy.random import RandomState
  8 | import matplotlib.pyplot as plt
  9 | 
 10 | import librosa
 11 | import librosa.display
 12 | 
 13 | from sklearn.model_selection import train_test_split
 14 | from sklearn import preprocessing
 15 | from sklearn.metrics import confusion_matrix, classification_report
 16 | from sklearn.utils import shuffle
 17 | from scipy import stats
 18 | 
 19 | 
 20 | def visualize_spectrogram(path, duration=None,
 21 |                           offset=0, sr=16000, n_mels=128, n_fft=2048,
 22 |                           hop_length=512):
 23 |     """This function creates a visualization of a spectrogram
 24 |     given the path to an audio file."""
 25 | 
 26 |     # Make a mel-scaled power (energy-squared) spectrogram
 27 |     y, sr = librosa.load(path, sr=sr, duration=duration, offset=offset)
 28 |     S = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels, n_fft=n_fft,
 29 |                                        hop_length=hop_length)
 30 | 
 31 |     # Convert to log scale (dB)
 32 |     log_S = librosa.logamplitude(S, ref_power=1.0)
 33 | 
 34 |     # Render output spectrogram in the console
 35 |     plt.figure(figsize=(12, 5))
 36 |     librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
 37 |     plt.title('mel power spectrogram')
 38 |     plt.colorbar(format='%+02.0f dB')
 39 |     plt.tight_layout()
 40 | 
 41 | 
 42 | def create_dataset(artist_folder='artists', save_folder='song_data',
 43 |                    sr=16000, n_mels=128,
 44 |                    n_fft=2048, hop_length=512):
 45 |     """This function creates the dataset given a folder
 46 |      with the correct structure (artist_folder/artists/albums/*.mp3)
 47 |     and saves it to a specified folder."""
 48 | 
 49 |     # get list of all artists
 50 |     os.makedirs(save_folder, exist_ok=True)
 51 |     artists = [path for path in os.listdir(artist_folder) if
 52 |                os.path.isdir(path)]
 53 | 
 54 |     # iterate through all artists, albums, songs and find mel spectrogram
 55 |     for artist in artists:
 56 |         print(artist)
 57 |         artist_path = os.path.join(artist_folder, artist)
 58 |         artist_albums = os.listdir(artist_path)
 59 | 
 60 |         for album in artist_albums:
 61 |             album_path = os.path.join(artist_path, album)
 62 |             album_songs = os.listdir(album_path)
 63 | 
 64 |             for song in album_songs:
 65 |                 song_path = os.path.join(album_path, song)
 66 | 
 67 |                 # Create mel spectrogram and convert it to the log scale
 68 |                 y, sr = librosa.load(song_path, sr=sr)
 69 |                 S = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels,
 70 |                                                    n_fft=n_fft,
 71 |                                                    hop_length=hop_length)
 72 |                 log_S = librosa.logamplitude(S, ref_power=1.0)
 73 |                 data = (artist, log_S, song)
 74 | 
 75 |                 # Save each song
 76 |                 save_name = artist + '_%%-%%_' + album + '_%%-%%_' + song
 77 |                 with open(os.path.join(save_folder, save_name), 'wb') as fp:
 78 |                     dill.dump(data, fp)
 79 | 
 80 | 
 81 | def load_dataset(song_folder_name='song_data',
 82 |                  artist_folder='artists',
 83 |                  nb_classes=20, random_state=42):
 84 |     """This function loads the dataset based on a location;
 85 |      it returns a list of spectrograms
 86 |      and their corresponding artists/song names"""
 87 | 
 88 |     # Get all songs saved as numpy arrays in the given folder
 89 |     song_list = os.listdir(song_folder_name)
 90 | 
 91 |     # Load the list of artists
 92 |     artist_list = os.listdir(artist_folder)
 93 | 
 94 |     # select the appropriate number of classes
 95 |     prng = RandomState(random_state)
 96 |     artists = prng.choice(artist_list, size=nb_classes, replace=False)
 97 | 
 98 |     # Create empty lists
 99 |     artist = []
100 |     spectrogram = []
101 |     song_name = []
102 | 
103 |     # Load each song into memory if the artist is included and return
104 |     for song in song_list:
105 |         with open(os.path.join(song_folder_name, song), 'rb') as fp:
106 |             loaded_song = dill.load(fp)
107 |         if loaded_song[0] in artists:
108 |             artist.append(loaded_song[0])
109 |             spectrogram.append(loaded_song[1])
110 |             song_name.append(loaded_song[2])
111 | 
112 |     return artist, spectrogram, song_name
113 | 
114 | 
def load_dataset_album_split(song_folder_name='song_data',
                             artist_folder='artists',
                             nb_classes=20, random_state=42):
    """Load the saved songs, split into train/test/validation by album.

    For every artist in `artist_folder` the albums are shuffled (after
    seeding the `random` module with `random_state`); the first shuffled
    album goes to the test set, the second to the validation set and the
    rest to the training set. `nb_classes` artists are then drawn without
    replacement with a RandomState seeded by `random_state`, and only
    their songs are kept. A song's album is read from its file name,
    which must have the form '<artist>_%%-%%_<album>_%%-%%_<song>'.

    Returns nine lists in the order
    Y_train, X_train, S_train, Y_test, X_test, S_test, Y_val, X_val, S_val
    (Y = artists, X = spectrograms, S = song names)."""
    separator = '_%%-%%_'
    saved_files = os.listdir(song_folder_name)
    known_artists = os.listdir(artist_folder)

    # Map every '<artist>_%%-%%_<album>' key to the split it belongs to.
    split_of = {}
    random.seed(random_state)
    for name in known_artists:
        shuffled = os.listdir(os.path.join(artist_folder, name))
        random.shuffle(shuffled)
        split_of[name + separator + shuffled.pop(0)] = 'test'
        split_of[name + separator + shuffled.pop(0)] = 'val'
        for remaining in shuffled:
            split_of[name + separator + remaining] = 'train'

    # pick which artists (classes) take part
    chosen = RandomState(random_state).choice(
        known_artists, size=nb_classes, replace=False)

    # per split: (artists, spectrograms, song names)
    buckets = {'train': ([], [], []),
               'test': ([], [], []),
               'val': ([], [], [])}

    for file_name in saved_files:
        with open(os.path.join(song_folder_name, file_name), 'rb') as handle:
            record = dill.load(handle)
        artist, album, _ = file_name.split(separator)

        if record[0] in chosen:
            split = split_of.get(artist + separator + album)
            if split is not None:
                labels, spectrograms, names = buckets[split]
                labels.append(record[0])
                spectrograms.append(record[1])
                names.append(record[2])

    return (*buckets['train'], *buckets['test'], *buckets['val'])
168 | 
169 | 
def load_dataset_song_split(song_folder_name='song_data',
                            artist_folder='artists',
                            nb_classes=20,
                            test_split_size=0.1,
                            validation_split_size=0.1,
                            random_state=42):
    """Load the dataset and split it into train/test/validation by song.

    The songs returned by load_dataset are first split into a test set
    (`test_split_size`) and a remainder; the remainder is then split into
    training and validation sets (`validation_split_size` is applied to
    the remainder, not the full dataset). Both splits are stratified by
    artist and use `random_state`.

    Returns nine collections in the order
    Y_train, X_train, S_train, Y_test, X_test, S_test, Y_val, X_val, S_val
    (Y = artists, X = spectrograms, S = song names)."""
    labels, spectrograms, names = load_dataset(
        song_folder_name=song_folder_name,
        artist_folder=artist_folder,
        nb_classes=nb_classes,
        random_state=random_state)

    # hold out the test songs
    X_rest, X_test, Y_rest, Y_test, S_rest, S_test = train_test_split(
        spectrograms, labels, names, test_size=test_split_size,
        stratify=labels, random_state=random_state)

    # carve a validation set out of what is left, to track progress
    X_train, X_val, Y_train, Y_val, S_train, S_val = train_test_split(
        X_rest, Y_rest, S_rest, test_size=validation_split_size,
        shuffle=True, stratify=Y_rest, random_state=random_state)

    return (Y_train, X_train, S_train,
            Y_test, X_test, S_test,
            Y_val, X_val, S_val)
193 | 
194 | 
def slice_songs(X, Y, S, length=911):
    """Slice each spectrogram into consecutive windows of `length` frames.

    `X` is a sequence of 2-D spectrograms that are cut along their second
    axis; `Y` and `S` hold the artist and song name of each spectrogram
    and are repeated once per slice. Every full-length window is kept,
    including the last one; a trailing remainder shorter than `length` is
    discarded, so a song with fewer than `length` frames yields no slices.

    Returns three numpy arrays: slices, artists and song names."""

    artist = []
    spectrogram = []
    song_name = []

    for i, song in enumerate(X):
        # number of complete windows that fit in this song
        n_slices = song.shape[1] // length
        # range(n_slices), not n_slices - 1: the final complete window is
        # valid data and short songs must not be dropped altogether.
        for j in range(n_slices):
            spectrogram.append(song[:, length * j:length * (j + 1)])
            artist.append(Y[i])
            song_name.append(S[i])

    return np.array(spectrogram), np.array(artist), np.array(song_name)
212 | 
213 | 
def create_spectrogram_plots(artist_folder='artists', sr=16000, n_mels=128,
                             n_fft=2048, hop_length=512):
    """Plot one spectrogram per artist on a 4x5 grid.

    For every entry of `artist_folder` a random album and a random song
    are chosen, three seconds of audio starting at the 60 second mark are
    loaded, and the log-scaled mel spectrogram is drawn in the next grid
    cell (filled row by row) with the artist name as its title."""

    grid_cols = 5
    artist_names = os.listdir(artist_folder)

    fig, ax = plt.subplots(nrows=4, ncols=grid_cols, figsize=(14, 12),
                           sharex=True, sharey=True)

    for position, artist in enumerate(artist_names):
        print(artist)

        # random album, then random song within it
        artist_path = os.path.join(artist_folder, artist)
        album_path = os.path.join(
            artist_path, random.choice(os.listdir(artist_path)))
        song_path = os.path.join(
            album_path, random.choice(os.listdir(album_path)))

        # log-scaled mel spectrogram of a three second excerpt
        signal, sr = librosa.load(song_path, sr=sr, offset=60, duration=3)
        mel_power = librosa.feature.melspectrogram(signal, sr=sr,
                                                   n_mels=n_mels,
                                                   n_fft=n_fft,
                                                   hop_length=hop_length)
        mel_db = librosa.logamplitude(mel_power, ref_power=1.0)

        # draw into the grid cell for this artist
        row, col = divmod(position, grid_cols)
        plt.axes(ax[row, col])
        librosa.display.specshow(mel_db, sr=sr)
        plt.title(artist)

    fig.tight_layout()
257 | 
258 | 
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.get_cmap('Blues')):
    """
    Draw the confusion matrix `cm` as an annotated heat map.

    `classes` supplies the tick labels for both axes. With
    `normalize=True` every row is divided by its sum before plotting and
    cell values are printed with two decimals; otherwise they are printed
    as integers.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=90)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # cells above half the maximum get white text so it stays readable
    half_max = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            text_color = "white" if cm[i, j] > half_max else "black"
            plt.text(j, i, format(cm[i, j], cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
287 | 
288 | 
def plot_history(history, title="model accuracy"):
    """
    Plot the per-epoch training and validation accuracy and show the figure.

    The curves are read from the 'acc' and 'val_acc' entries of
    `history.history`; the legend names them 'train' and 'test'.
    """
    for metric in ('acc', 'val_acc'):
        plt.plot(history.history[metric])

    plt.title(title)
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend(['train', 'test'], loc='lower right')
    plt.show()
303 | 
304 | 
def _most_frequent(values):
    """Return the most frequent entry of `values` (smallest one on ties)."""
    unique_values, counts = np.unique(values, return_counts=True)
    return unique_values[np.argmax(counts)]


def predict_artist(model, X, Y, S,
                   le, class_names,
                   slices=None, verbose=False,
                   ml_mode=False):
    """
    Predict one artist per song by pooling its slices and voting.

    For each distinct song name in S, the slices of that song are passed to
    `model.predict` and the most frequent predicted class becomes the song's
    prediction. The per-song results are summarised in a normalized
    confusion matrix plot and a printed classification report.

    model: object with a `predict(X, verbose=0)` method
    X: slice features, indexed along the first axis like Y and S
    Y: slice targets; either 2-D rows that are arg-maxed per slice
        (e.g. one-hot encodings) or 1-D class indices
    S: song name of every slice
    le: fitted label encoder, used only for the verbose print-out
    class_names: names used to label the plot and the report
    slices: if set and the song has at least that many slices, a random
        subset of this size is used instead of the whole song
    verbose: print predicted and actual artist for every song
    ml_mode: if True the output of `model.predict` is treated as class
        labels and voted on directly; otherwise it is treated as per-class
        probabilities and only slices with a top probability above 0.5
        vote (all slices vote when none is that confident)

    Returns a tuple of the classification report as text and as a dict.
    """
    print("Test results when pooling slices by song and voting:")
    # Obtain the list of songs
    songs = np.unique(S)

    prediction_list = []
    actual_list = []

    # Iterate through each song
    for song in songs:

        # Grab all slices related to a particular song
        song_mask = S == song
        X_song = X[song_mask]
        Y_song = Y[song_mask]

        # If not using full song, shuffle and take up to a number of slices
        if slices and slices <= X_song.shape[0]:
            X_song, Y_song = shuffle(X_song, Y_song)
            X_song = X_song[:slices]
            Y_song = Y_song[:slices]

        # Get probabilities of each class (or labels in ml_mode)
        predictions = model.predict(X_song, verbose=0)

        if not ml_mode:
            # Get list of highest probability classes and their probability
            class_prediction = np.argmax(predictions, axis=1)
            class_probability = np.max(predictions, axis=1)

            # keep only the predictions the model is confident about
            votes = class_prediction[class_probability > 0.5]

            # deal with edge case where there is no confident class
            if len(votes) == 0:
                votes = class_prediction
        else:
            votes = predictions

        # get most frequent class; computed with np.unique so the result
        # does not depend on the return shape of scipy's mode
        prediction = _most_frequent(votes)

        # The true class is taken per slice: arg-max along the class axis
        # for 2-D targets, the values themselves for 1-D targets. A plain
        # np.argmax without axis would return a position in the flattened
        # array instead of a class.
        true_labels = np.asarray(Y_song)
        if true_labels.ndim > 1:
            true_labels = np.argmax(true_labels, axis=1)
        actual = _most_frequent(true_labels)

        # Keeping track of overall song classification accuracy
        prediction_list.append(prediction)
        actual_list.append(actual)

        # Print out prediction
        if verbose:
            # inverse_transform is given a 1-element list because newer
            # scikit-learn versions reject scalar input
            print(song)
            print("Predicted:", le.inverse_transform([prediction])[0],
                  "\nActual:", le.inverse_transform([actual])[0])
            print('\n')

    # Print overall song accuracy
    actual_array = np.array(actual_list)
    prediction_array = np.array(prediction_list)
    cm = confusion_matrix(actual_array, prediction_array)
    plot_confusion_matrix(cm, classes=class_names, normalize=True,
                          title='Confusion matrix for pooled results' +
                                ' with normalization')
    class_report = classification_report(actual_array, prediction_array,
                                         target_names=class_names)
    print(class_report)

    class_report_dict = classification_report(actual_array, prediction_array,
                                              target_names=class_names,
                                              output_dict=True)
    return (class_report, class_report_dict)
380 | 
381 | 
def encode_labels(Y, le=None, enc=None):
    """Turn target labels into integer codes and then one hot rows.

    A new label encoder / one hot encoder is created and fitted on Y when
    the corresponding argument is None; an encoder that is passed in is
    only applied. Returns the one hot array together with both encoders so
    they can be re-used on other data.
    """

    n_samples = Y.shape[0]

    # labels -> integer codes
    if le is not None:
        integer_codes = le.transform(Y)
    else:
        le = preprocessing.LabelEncoder()
        integer_codes = le.fit_transform(Y)

    # the one hot encoder works on a single column of codes
    code_column = integer_codes.reshape(n_samples, 1)

    # integer codes -> dense one hot rows
    if enc is not None:
        encoded = enc.transform(code_column)
    else:
        enc = preprocessing.OneHotEncoder()
        encoded = enc.fit_transform(code_column)
    Y_enc = encoded.toarray()

    return Y_enc, le, enc
404 | 
405 | 
def simple_encoding(Y, le=None):
    """Encodes target variables into numbers.

    When `le` is None a new label encoder is created and fitted on Y;
    otherwise the given encoder is only applied. Y may be any sequence the
    encoder accepts (the input is not required to expose `.shape`).

    Returns the encoded labels and the encoder so it can be re-used on
    other data.
    """

    # Encode the labels
    if le is None:
        le = preprocessing.LabelEncoder()
        Y_le = le.fit_transform(Y)
    else:
        Y_le = le.transform(Y)

    # return encoders to re-use on other data
    return Y_le, le
421 | 
422 | 
if __name__ == '__main__':

    # switches selecting which stages of the script run
    create_data = True
    create_visuals = False
    save_visuals = False

    # spectrogram settings shared by dataset creation and the plot grid
    spectrogram_options = dict(sr=16000, n_mels=128, n_fft=2048,
                               hop_length=512)

    if create_data:
        create_dataset(artist_folder='artists', save_folder='song_data',
                       **spectrogram_options)

    if create_visuals:
        # Spectrogram of one specific song
        example_song = ('artists/u2/The_Joshua_Tree/'
                        '02-I_Still_Haven_t_Found_What_I_m_Looking_For.mp3')
        visualize_spectrogram(example_song, offset=60, duration=29.12)

        # Grid of spectrograms, one subplot per artist
        create_spectrogram_plots(artist_folder='artists',
                                 **spectrogram_options)
        if save_visuals:
            output_path = os.path.join('spectrograms.png')
            plt.savefig(output_path, bbox_inches="tight")
448 | 


--------------------------------------------------------------------------------