├── shuffled_idx_list.npz ├── README.md ├── music_classification_main.py └── music_gen_lib.py /shuffled_idx_list.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ds7711/music_genre_classification/HEAD/shuffled_idx_list.npz -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # music_genre_classification 2 | CNN achieves human-level accuracy in music genre classification 3 | This is part of the course project for Rutgers Machine Learning class (2017 Spring). 4 | 5 | We used a simple convolutional neural network to perform music genre classification and achieve 70% accuracy, which is comparable to the human accuracy. 6 | 7 | To run the code, one has to either download the GTZAN dataset (http://marsyasweb.appspot.com/download/data_sets/) and convert them into mel-spectrogram (64 mel-filters), or download the already converted data (https://drive.google.com/file/d/0B3I2KG9W0zM2YmFWbk1oMFhkU1k/view?usp=sharing). Note: the preprocessed data (.npz) file contains the mel-spectrogram of the audio and the corresponding genre. Mel-spectrogram is converted using Librosa library in python. 8 | 9 | After downloading the data, simply run the music_classification_main.py to train the model. random_seed (0-29) determines how the data are split into training, validation, and testing set. The codes were tested under ubuntu 14.04 using Keras and tensorflow with a GTX-1070. On GPU, it roughly takes 10-30 minutes for the model to converge. 10 | 11 | For the analysis of the model, run the notebook files. 12 | 13 | A detailed explanation of the code is in the PDF. 14 | -------------------------------------------------------------------------------- /music_classification_main.py: -------------------------------------------------------------------------------- 1 | # import modules 2 | import numpy as np 3 | import librosa 4 | import music_gen_lib as mgl 5 | from keras.utils import np_utils 6 | import time 7 | from keras import backend as K 8 | from keras.engine.topology import Layer 9 | import tensorflow as tf 10 | import keras 11 | from sklearn.metrics import classification_report, confusion_matrix 12 | 13 | def main(random_seed=None, visualize_label=False, ann_model=mgl.baseline_model_96): 14 | 15 | 16 | # determine the random seed so that results are reproducible 17 | # random_seed = 11 # also determines which shuffled index to use 18 | np.random.seed(random_seed) 19 | 20 | 21 | # 1st part: load all the data and organize data 22 | data_converted = True 23 | if data_converted == False: 24 | # load all the data 25 | X, SR, T = mgl.load_original_data() 26 | # data format: 27 | # x: 1d numpy array 28 | # t: 1d numpy array with numsic genre names (numeric arrays or multinomial vector?) 29 | 30 | # convert the data into mel-scale spectrogram 31 | st = time.time() 32 | newX = mgl.batch_mel_spectrogram(X, SR) 33 | print(time.time() - st) 34 | 35 | # save the data into npz 36 | np.savez_compressed("audio_sr_label.npz", X=newX, SR=SR, T=T) 37 | 38 | else: 39 | st = time.time() 40 | data = np.load("audio_sr_label.npz") 41 | X = data["X"] 42 | SR = data["SR"] 43 | T = data["T"] 44 | loading_time = time.time() - st 45 | print("Loading takes %f seconds." % (loading_time)) 46 | 47 | 48 | # Use log transformation to preserve the order but shrink the range 49 | X = np.log(X + 1) 50 | X = X[:, :, :, np.newaxis] # image channel should be the last dimension, check by using print K.image_data_format() 51 | 52 | 53 | 54 | # convert string type labels to vectors 55 | genres = np.unique(T) 56 | genres_dict = dict([[label, value] for value, label in enumerate(genres)]) 57 | T_numeric = np.asarray([genres_dict[label] for label in T]) 58 | T_vectorized = np_utils.to_categorical(T_numeric) 59 | 60 | 61 | # split data into training, cross-validation, testing data 62 | # following is used to generate random see used to split the data into different sets 63 | # split_idxes = np.asarray([0, 0.5, 0.7, 1]) 64 | # training_idxes_list, validation_idxes_list, testing_idxes_list = [], [], [] 65 | # for idx in xrange(30): 66 | # training_idxes, validation_idxes, testing_idxes = mgl.split_data(T, split_idxes) 67 | # training_idxes_list.append(training_idxes) 68 | # validation_idxes_list.append(validation_idxes) 69 | # testing_idxes_list.append(testing_idxes) 70 | # 71 | # training_idxes_list = np.asarray(training_idxes_list) 72 | # validation_idxes_list = np.asarray(validation_idxes_list) 73 | # testing_idxes_list = np.asarray(testing_idxes_list) 74 | # 75 | # np.savez_compressed("shuffled_idx_list.npz", training_idxes_list=training_idxes_list, 76 | # validation_idxes_list=validation_idxes_list, testing_idxes_list=testing_idxes_list) 77 | 78 | 79 | ## load one fixed data shuffling indexes 80 | idxes_list = np.load("shuffled_idx_list.npz") 81 | training_idxes = idxes_list["training_idxes_list"][random_seed] 82 | validation_idxes = idxes_list["validation_idxes_list"][random_seed] 83 | testing_idxes = idxes_list["testing_idxes_list"][random_seed] 84 | 85 | 86 | # shuffled_idx = np.random.permutation(num_total_data) # shuffle or not 87 | # shuffled_idx_list = np.asarray([np.random.permutation(num_total_data) for x in xrange(30)]) 88 | # np.savez_compressed("shuffled_idx_list.npz", shuffled_idx_list=shuffled_idx_list) 89 | 90 | 91 | training_X = X[training_idxes] 92 | validation_X = X[validation_idxes] 93 | testing_X = X[testing_idxes] 94 | 95 | training_T = T_vectorized[training_idxes] 96 | validation_T = T_vectorized[validation_idxes] 97 | testing_T = T_vectorized[testing_idxes] 98 | # testing_T_label = T[testing_idxes] 99 | 100 | 101 | # try to load pre-trained model 102 | saved_model_name = "mgcnn_rs_" + str(random_seed) + ".h5" 103 | # saved_model_name = "mgcnn_poisson_rs_" + str(random_seed) + ".h5" 104 | MGCNN = mgl.Music_Genre_CNN(ann_model) 105 | try: 106 | # MGCNN.load_model(saved_model_name, custom_objects={'PoissonLayer': ann_model}) 107 | MGCNN.load_model(saved_model_name) 108 | except: 109 | print("The model hasn't been trained before.") 110 | # training the model 111 | training_flag = True 112 | max_iterations = 10 113 | while training_flag and max_iterations >= 0: 114 | validation_accuracies = MGCNN.train_model(training_X, training_T, cv=True, 115 | validation_spectrograms=validation_X, 116 | validation_labels=validation_T) 117 | 118 | diff = np.mean(validation_accuracies[-10:]) - np.mean(validation_accuracies[:10]) 119 | MGCNN.backup_model() # backup in case error occurred 120 | if np.abs(diff) < 0.01: 121 | training_flag = False 122 | max_iterations -= 1 123 | 124 | MGCNN.backup_model(saved_model_name) 125 | 126 | 127 | 128 | 129 | test_accuracy, confusion_data = MGCNN.test_model(testing_X, testing_T) 130 | print("\n ****** The final test accuracy is %f. ******\n" % (test_accuracy)) 131 | 132 | with open("model_accuracy_log.txt", "a") as text_file: 133 | things2write = saved_model_name + "\t" + "accuracy: " + str(test_accuracy) + "\n" 134 | text_file.write(things2write) 135 | 136 | # analyze the confusion matrix 137 | cs = classification_summary = classification_report(confusion_data[:, 1], confusion_data[:, 0], 138 | labels=genres_dict.values(), target_names=genres_dict.keys()) 139 | cm = confusion_matrix(confusion_data[:, 1], confusion_data[:, 0]) / (len(testing_T) * 1.0 / len(genres)) 140 | 141 | # visualize 142 | if visualize_label: 143 | import matplotlib.pylab as plt 144 | plt.matshow(cm); plt.colorbar() 145 | 146 | return cs, cm 147 | 148 | main(random_seed=0, visualize_label=False, ann_model=mgl.baseline_model_64) -------------------------------------------------------------------------------- /music_gen_lib.py: -------------------------------------------------------------------------------- 1 | # store the function/object used in the project 2 | 3 | # import modules 4 | from __future__ import print_function 5 | import numpy as np 6 | import librosa 7 | import keras 8 | from keras.models import Sequential, load_model 9 | from keras.layers import Dense, Dropout, Flatten 10 | from keras.layers import Conv2D, MaxPooling2D 11 | from keras import backend as K 12 | import tensorflow as tf 13 | from keras.utils import np_utils 14 | from keras import regularizers 15 | import time 16 | from keras.engine.topology import Layer 17 | 18 | 19 | # parameters 20 | sr = 22050 # if sampling rate is different, resample it to this 21 | 22 | # parameters for calculating spectrogram in mel scale 23 | fmax = 10000 # maximum frequency considered 24 | fft_window_points = 512 25 | fft_window_dur = fft_window_points * 1.0 / sr # 23ms windows 26 | hop_size = int(fft_window_points/ 2) # 50% overlap between consecutive frames 27 | n_mels = 64 28 | 29 | # segment duration 30 | num_fft_windows = 256 # num fft windows per music segment 31 | segment_in_points = num_fft_windows * 255 # number of data points that insure the spectrogram has size: 64 * 256 32 | segment_dur = segment_in_points * 1.0 / sr 33 | 34 | num_genres=10 35 | input_shape=(64, 256, 1) 36 | 37 | 38 | def split_data(T, split_idxes): 39 | """ 40 | give the indexes of training, validation, and testing data 41 | :param T: label of all data 42 | :param split_idxes: splitting points of the data 43 | :return: 44 | """ 45 | genres = np.unique(T) 46 | training_idxes = [] 47 | validation_idxes = [] 48 | testing_idxes = [] 49 | for idx, music_genre in enumerate(genres): 50 | tmp_logidx = music_genre == T 51 | tmp_idx = np.flatnonzero(tmp_logidx) 52 | tmp_shuffled_idx = np.random.permutation(tmp_idx) 53 | tmp_num_examles = len(tmp_shuffled_idx) 54 | tmp_split_idxes = np.asarray(split_idxes * tmp_num_examles, dtype=np.int) 55 | training_idxes.append(tmp_shuffled_idx[tmp_split_idxes[0] : tmp_split_idxes[1]]) 56 | validation_idxes.append(tmp_shuffled_idx[tmp_split_idxes[1] : tmp_split_idxes[2]]) 57 | testing_idxes.append(tmp_shuffled_idx[tmp_split_idxes[2] : tmp_split_idxes[3]]) 58 | return(np.concatenate(training_idxes), np.concatenate(validation_idxes), np.concatenate(testing_idxes)) 59 | 60 | 61 | def load_original_data(): 62 | """ 63 | load original audio files 64 | :return: 65 | """ 66 | import os 67 | data_folder = "/home/md/Dropbox/Courses/2017_Spring_Machine_learning/projects/music_gen/genres" 68 | # genre_folders = [x[0] for x in os.walk(data_folder)] 69 | genre_folders = os.listdir(data_folder) 70 | X = [] 71 | T = [] 72 | SR = [] 73 | min_length = 0 74 | for sub_folder in genre_folders: 75 | genre_path = data_folder + "/" + sub_folder 76 | print(genre_path) 77 | audio_files = os.listdir(genre_path) 78 | for audio_name in audio_files: 79 | audio_path = genre_path + "/" + audio_name 80 | x, sr = librosa.core.load(audio_path) 81 | if x.shape[0] < 30 * sr: 82 | x = np.append(x, np.zeros(30*sr - x.shape[0])) # insure all files are exactly the same length 83 | if min_length < x.shape[0]: 84 | min_length = x.shape[0] # report the duration of the minimum audio clip 85 | print("This audio last %f seconds, zeros are padded at the end." % (x.shape[0]*1.0/sr)) 86 | X.append(x[:30*sr]) 87 | SR.append(sr) 88 | T.append(sub_folder) 89 | return np.asarray(X), np.asarray(SR), np.asarray(T, dtype=str) 90 | 91 | # calculate mel-spectrogram 92 | def mel_spectrogram(ys, sr, n_mels=n_mels, hop_size=hop_size, fmax=fmax, pre_emphasis=False): 93 | """ 94 | calculate the spectrogram in mel scale, refer to documentation of libriso and MFCC tutorial 95 | :param ys: 96 | :param sr: 97 | :param n_mels: 98 | :param hop_size: 99 | :param fmax: 100 | :param pre_emphasis: 101 | :return: 102 | """ 103 | if pre_emphasis: 104 | ys = np.append(ys[0], ys[1:]-pre_emphasis*ys[:-1]) 105 | return librosa.feature.melspectrogram(ys, sr, 106 | n_fft=fft_window_points, 107 | hop_length=hop_size, n_mels=n_mels, 108 | fmax=fmax) 109 | 110 | # batch convert waveform into spectrogram in mel-scale 111 | def batch_mel_spectrogram(X, SR): 112 | """ 113 | convert all waveforms in R into time * 64 spectrogram in mel scale 114 | :param X: 115 | :param SR: 116 | :return: 117 | """ 118 | melspec_list = [] 119 | for idx in xrange(X.shape[0]): 120 | tmp_melspec = mel_spectrogram(X[idx], SR[idx]) 121 | melspec_list.append(tmp_melspec) 122 | return np.asarray(melspec_list) 123 | 124 | 125 | # def segment_spectrogram(input_spectrogram, num_fft_windows=num_fft_windows): 126 | # # given a spectrogram of a music that's longer than 3 seconds, segment it into relatively independent pieces 127 | # length_in_fft = input_spectrogram.shape[1] 128 | # num_segments = int(length_in_fft / num_fft_windows) 129 | # pass 130 | 131 | 132 | def baseline_model_32(num_genres=num_genres, input_shape=input_shape): 133 | model = Sequential() 134 | model.add(Conv2D(32, kernel_size=(3, 3), 135 | activation='relu', kernel_regularizer=regularizers.l2(0.01), 136 | input_shape=input_shape)) 137 | model.add(MaxPooling2D(pool_size=(2, 4))) 138 | model.add(Conv2D(64, (3, 5), activation='relu', kernel_regularizer=regularizers.l2(0.01))) 139 | model.add(MaxPooling2D(pool_size=(2, 4))) 140 | model.add(Dropout(0.2)) 141 | model.add(Flatten()) 142 | model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02))) 143 | model.add(Dropout(0.2)) 144 | model.add(Dense(num_genres, activation='softmax')) 145 | model.compile(loss=keras.losses.categorical_crossentropy, 146 | optimizer=keras.optimizers.Adadelta(decay=1e-5), 147 | metrics=['accuracy']) 148 | return(model) 149 | 150 | def baseline_model_64(num_genres=num_genres, input_shape=input_shape): 151 | model = Sequential() 152 | model.add(Conv2D(64, kernel_size=(3, 3), 153 | activation='relu', kernel_regularizer=regularizers.l2(0.01), 154 | input_shape=input_shape)) 155 | model.add(MaxPooling2D(pool_size=(2, 4))) 156 | model.add(Conv2D(64, (3, 5), activation='relu', kernel_regularizer=regularizers.l2(0.01))) 157 | model.add(MaxPooling2D(pool_size=(2, 4))) 158 | model.add(Dropout(0.2)) 159 | model.add(Flatten()) 160 | model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02))) 161 | model.add(Dropout(0.2)) 162 | model.add(Dense(num_genres, activation='softmax')) 163 | model.compile(loss=keras.losses.categorical_crossentropy, 164 | optimizer=keras.optimizers.Adadelta(decay=1e-5), 165 | metrics=['accuracy']) 166 | return(model) 167 | 168 | def baseline_model_96(num_genres=num_genres, input_shape=input_shape): 169 | model = Sequential() 170 | model.add(Conv2D(96, kernel_size=(3, 3), 171 | activation='relu', kernel_regularizer=regularizers.l2(0.01), 172 | input_shape=input_shape)) 173 | model.add(MaxPooling2D(pool_size=(2, 4))) 174 | model.add(Conv2D(64, (3, 5), activation='relu', kernel_regularizer=regularizers.l2(0.01))) 175 | model.add(MaxPooling2D(pool_size=(2, 4))) 176 | model.add(Dropout(0.2)) 177 | model.add(Flatten()) 178 | model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02))) 179 | model.add(Dropout(0.2)) 180 | model.add(Dense(num_genres, activation='softmax')) 181 | model.compile(loss=keras.losses.categorical_crossentropy, 182 | optimizer=keras.optimizers.Adadelta(decay=1e-5), 183 | metrics=['accuracy']) 184 | return(model) 185 | 186 | 187 | class Music_Genre_CNN(object): 188 | 189 | def __init__(self, ann_model): 190 | self.model = ann_model() 191 | 192 | def load_model(self, model_path, custom_objects=None): 193 | self.model = load_model(model_path, custom_objects=custom_objects) 194 | 195 | def train_model(self, input_spectrograms, labels, cv=False, 196 | validation_spectrograms=None, validation_labels=None, 197 | small_batch_size=150, max_iteration=500, print_interval=1): 198 | """ 199 | train the CNN model 200 | :param input_spectrograms: number of training examplex * num of mel bands * number of fft windows * 1 201 | type: 4D numpy array 202 | :param labels: vectorized class labels 203 | type: 204 | :param cv: whether do cross validation 205 | :param validation_spectrograms: data used for cross validation 206 | type: as input_spectrogram 207 | :param validation_labels: used for cross validation 208 | :param small_batch_size: size of each training batch 209 | :param max_iteration: 210 | maximum number of iterations allowed for one training 211 | :return: 212 | trained model 213 | """ 214 | validation_accuracy_list = [] 215 | for iii in xrange(max_iteration): 216 | 217 | st_time = time.time() 218 | 219 | # split training data into even batches 220 | num_training_data = len(input_spectrograms) 221 | batch_idx = np.random.permutation(num_training_data) 222 | num_batches = int(num_training_data / small_batch_size) 223 | 224 | for jjj in xrange(num_batches - 1): 225 | sample_idx = np.random.randint(input_spectrograms.shape[2] - num_fft_windows) 226 | training_idx = batch_idx[jjj * small_batch_size: (jjj + 1) * small_batch_size] 227 | training_data = input_spectrograms[training_idx, :, sample_idx:sample_idx+num_fft_windows, :] 228 | training_label = labels[training_idx] 229 | self.model.train_on_batch(training_data, training_label) 230 | training_accuracy = self.model.evaluate(training_data, training_label) 231 | # print("Training accuracy is: %f" % (training_accuracy)) 232 | 233 | end_time = time.time() 234 | elapsed_time = end_time - st_time 235 | if cv: 236 | validation_accuracy = self.model.evaluate(validation_spectrograms[:, :, sample_idx:sample_idx+num_fft_windows, :], validation_labels) 237 | validation_accuracy_list.append(validation_accuracy[1]) 238 | else: 239 | validation_accuracy = [-1.0, -1.0] 240 | 241 | if iii % print_interval == 0: 242 | print("\nTime elapsed: %f; Training accuracy: %f, Validation accuracy: %f\n" % 243 | (elapsed_time, training_accuracy[1], validation_accuracy[1])) 244 | if cv: 245 | return np.asarray(validation_accuracy_list) 246 | 247 | 248 | def song_spectrogram_prediction(self, song_mel_spectrogram, overlap): 249 | """ 250 | give the predicted_probability for each class and each segment 251 | :param song_mel_spectrogram: 252 | 4D numpy array: num of time windows * mel bands * 1 (depth) 253 | :param overlap: 254 | overlap between segments, overlap = 0 means no overlap between segments 255 | :return: 256 | predictions: numpy array (number of segments * num classes) 257 | """ 258 | # 1st segment spectrogram into sizes of 64 * 256 259 | largest_idx = song_mel_spectrogram.shape[1] - num_fft_windows - 1 260 | step_size = int((1 - overlap) * num_fft_windows) 261 | num_segments = int(largest_idx / step_size) 262 | segment_edges = np.arange(num_segments) * step_size 263 | segment_list = [] 264 | for idx in segment_edges: 265 | segment = song_mel_spectrogram[:, idx : idx + num_fft_windows] 266 | segment_list.append(segment) 267 | segment_array = np.asarray(segment_list)[:, :, :, np.newaxis] 268 | predictions = self.model.predict_proba(segment_array, batch_size=len(segment_array), verbose=0) 269 | summarized_prediction = np.argmax(predictions.sum(axis=0)) 270 | return(summarized_prediction, predictions) 271 | 272 | def test_model(self, test_X, test_T, overlap=0.5): 273 | # test the accuracy of the model using testing data 274 | num_sample = len(test_T) 275 | correct_labels = np.argmax(test_T, axis=1) 276 | predicted_labels = np.zeros(num_sample) 277 | for iii in xrange(len(test_X)): 278 | song_mel_spectrogram = test_X[iii].squeeze() 279 | predicted_labels[iii], _ = self.song_spectrogram_prediction(song_mel_spectrogram, overlap=overlap) 280 | # correct_labels[iii] = np.argmax(test_T[iii]) 281 | confusion_data = np.vstack((predicted_labels, correct_labels)).T 282 | accuracy = np.sum(correct_labels == predicted_labels) * 1.0 / num_sample 283 | return(accuracy, confusion_data) 284 | 285 | def backup_model(self, model_bk_name=False): 286 | if not model_bk_name: 287 | year, month, day, hour, minute = time.strftime("%Y,%m,%d,%H,%M").split(',') 288 | model_bk_name = "mgcnn_" + month + day + hour + minute + ".h5" 289 | self.model.save(model_bk_name) 290 | 291 | def song_genre_prediction(self, song_waveform): 292 | # resample the song into single channel, 22050 sampling frequency 293 | 294 | # convert into mel-scale spectrogram 295 | 296 | # predict using trained model 297 | 298 | # 299 | 300 | pass 301 | 302 | 303 | --------------------------------------------------------------------------------