├── shuffled_idx_list.npz
├── README.md
├── music_classification_main.py
└── music_gen_lib.py


/shuffled_idx_list.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ds7711/music_genre_classification/HEAD/shuffled_idx_list.npz


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # music_genre_classification
 2 | CNN achieves human-level accuracy in music genre classification
 3 | This is part of the course project for Rutgers Machine Learning class (2017 Spring). 
 4 | 
 5 | We used a simple convolutional neural network to perform music genre classification and achieve 70% accuracy, which is comparable to the human accuracy. 
 6 | 
 7 | To run the code, one has to either download the GTZAN dataset (http://marsyasweb.appspot.com/download/data_sets/) and convert them into mel-spectrogram (64 mel-filters), or download the already converted data (https://drive.google.com/file/d/0B3I2KG9W0zM2YmFWbk1oMFhkU1k/view?usp=sharing). Note: the preprocessed data (.npz) file contains the mel-spectrogram of the audio and the corresponding genre. Mel-spectrogram is converted using Librosa library in python. 
 8 | 
 9 | After downloading the data, simply run the music_classification_main.py to train the model. random_seed (0-29) determines how the data are split into training, validation, and testing set. The codes were tested under ubuntu 14.04 using Keras and tensorflow with a GTX-1070. On GPU, it roughly takes 10-30 minutes for the model to converge. 
10 | 
11 | For the analysis of the model, run the notebook files. 
12 | 
13 | A detailed explanation of the code is in the PDF. 
14 | 


--------------------------------------------------------------------------------
/music_classification_main.py:
--------------------------------------------------------------------------------
  1 | # import modules
  2 | import numpy as np
  3 | import librosa
  4 | import music_gen_lib as mgl
  5 | from keras.utils import np_utils
  6 | import time
  7 | from keras import backend as K
  8 | from keras.engine.topology import Layer
  9 | import tensorflow as tf
 10 | import keras
 11 | from sklearn.metrics import classification_report, confusion_matrix
 12 | 
 13 | def main(random_seed=None, visualize_label=False, ann_model=mgl.baseline_model_96):
 14 | 
 15 | 
 16 |     # determine the random seed so that results are reproducible
 17 |     # random_seed = 11 # also determines which shuffled index to use
 18 |     np.random.seed(random_seed)
 19 | 
 20 | 
 21 |     # 1st part: load all the data and organize data
 22 |     data_converted = True
 23 |     if data_converted == False:
 24 |         # load all the data
 25 |         X, SR, T = mgl.load_original_data()
 26 |         # data format:
 27 |         #       x: 1d numpy array
 28 |         #       t: 1d numpy array with numsic genre names (numeric arrays or multinomial vector?)
 29 | 
 30 |         # convert the data into mel-scale spectrogram
 31 |         st = time.time()
 32 |         newX = mgl.batch_mel_spectrogram(X, SR)
 33 |         print(time.time() - st)
 34 | 
 35 |         # save the data into npz
 36 |         np.savez_compressed("audio_sr_label.npz", X=newX, SR=SR, T=T)
 37 | 
 38 |     else:
 39 |         st = time.time()
 40 |         data = np.load("audio_sr_label.npz")
 41 |         X = data["X"]
 42 |         SR = data["SR"]
 43 |         T = data["T"]
 44 |         loading_time = time.time() - st
 45 |         print("Loading takes %f seconds." % (loading_time))
 46 | 
 47 | 
 48 |     # Use log transformation to preserve the order but shrink the range
 49 |     X = np.log(X + 1)
 50 |     X = X[:, :, :, np.newaxis]  # image channel should be the last dimension, check by using print K.image_data_format()
 51 | 
 52 | 
 53 | 
 54 |     # convert string type labels to vectors
 55 |     genres = np.unique(T)
 56 |     genres_dict = dict([[label, value] for value, label in enumerate(genres)])
 57 |     T_numeric = np.asarray([genres_dict[label] for label in T])
 58 |     T_vectorized = np_utils.to_categorical(T_numeric)
 59 | 
 60 | 
 61 |     # split data into training, cross-validation,  testing data
 62 |     # following is used to generate random see used to split the data into different sets
 63 |     # split_idxes = np.asarray([0, 0.5, 0.7, 1])
 64 |     # training_idxes_list, validation_idxes_list, testing_idxes_list = [], [], []
 65 |     # for idx in xrange(30):
 66 |     #     training_idxes, validation_idxes, testing_idxes = mgl.split_data(T, split_idxes)
 67 |     #     training_idxes_list.append(training_idxes)
 68 |     #     validation_idxes_list.append(validation_idxes)
 69 |     #     testing_idxes_list.append(testing_idxes)
 70 |     #
 71 |     # training_idxes_list = np.asarray(training_idxes_list)
 72 |     # validation_idxes_list = np.asarray(validation_idxes_list)
 73 |     # testing_idxes_list = np.asarray(testing_idxes_list)
 74 |     #
 75 |     # np.savez_compressed("shuffled_idx_list.npz", training_idxes_list=training_idxes_list,
 76 |     #                     validation_idxes_list=validation_idxes_list, testing_idxes_list=testing_idxes_list)
 77 | 
 78 | 
 79 |     ## load one fixed data shuffling indexes
 80 |     idxes_list = np.load("shuffled_idx_list.npz")
 81 |     training_idxes = idxes_list["training_idxes_list"][random_seed]
 82 |     validation_idxes = idxes_list["validation_idxes_list"][random_seed]
 83 |     testing_idxes = idxes_list["testing_idxes_list"][random_seed]
 84 | 
 85 | 
 86 |     # shuffled_idx = np.random.permutation(num_total_data) # shuffle or not
 87 |     # shuffled_idx_list = np.asarray([np.random.permutation(num_total_data) for x in xrange(30)])
 88 |     # np.savez_compressed("shuffled_idx_list.npz", shuffled_idx_list=shuffled_idx_list)
 89 | 
 90 | 
 91 |     training_X = X[training_idxes]
 92 |     validation_X = X[validation_idxes]
 93 |     testing_X = X[testing_idxes]
 94 | 
 95 |     training_T = T_vectorized[training_idxes]
 96 |     validation_T = T_vectorized[validation_idxes]
 97 |     testing_T = T_vectorized[testing_idxes]
 98 |     # testing_T_label = T[testing_idxes]
 99 | 
100 | 
101 |     # try to load pre-trained model
102 |     saved_model_name = "mgcnn_rs_" + str(random_seed) + ".h5"
103 |     # saved_model_name = "mgcnn_poisson_rs_" + str(random_seed) + ".h5"
104 |     MGCNN = mgl.Music_Genre_CNN(ann_model)
105 |     try:
106 |         # MGCNN.load_model(saved_model_name, custom_objects={'PoissonLayer': ann_model})
107 |         MGCNN.load_model(saved_model_name)
108 |     except:
109 |         print("The model hasn't been trained before.")
110 |         # training the model
111 |         training_flag = True
112 |         max_iterations = 10
113 |         while training_flag and max_iterations >= 0:
114 |             validation_accuracies = MGCNN.train_model(training_X, training_T, cv=True,
115 |                                                       validation_spectrograms=validation_X,
116 |                                                       validation_labels=validation_T)
117 | 
118 |             diff = np.mean(validation_accuracies[-10:]) - np.mean(validation_accuracies[:10])
119 |             MGCNN.backup_model()  # backup in case error occurred
120 |             if np.abs(diff) < 0.01:
121 |                 training_flag = False
122 |             max_iterations -= 1
123 | 
124 |         MGCNN.backup_model(saved_model_name)
125 | 
126 | 
127 | 
128 | 
129 |     test_accuracy, confusion_data = MGCNN.test_model(testing_X, testing_T)
130 |     print("\n ****** The final test accuracy is %f. ******\n" % (test_accuracy))
131 | 
132 |     with open("model_accuracy_log.txt", "a") as text_file:
133 |         things2write = saved_model_name + "\t" + "accuracy: " + str(test_accuracy) + "\n"
134 |         text_file.write(things2write)
135 | 
136 |     # analyze the confusion matrix
137 |     cs = classification_summary = classification_report(confusion_data[:, 1], confusion_data[:, 0],
138 |                                                         labels=genres_dict.values(), target_names=genres_dict.keys())
139 |     cm = confusion_matrix(confusion_data[:, 1], confusion_data[:, 0]) / (len(testing_T) * 1.0 / len(genres))
140 | 
141 |     # visualize
142 |     if visualize_label:
143 |         import matplotlib.pylab as plt
144 |         plt.matshow(cm); plt.colorbar()
145 | 
146 |     return cs, cm
147 | 
148 | main(random_seed=0, visualize_label=False, ann_model=mgl.baseline_model_64)


--------------------------------------------------------------------------------
/music_gen_lib.py:
--------------------------------------------------------------------------------
  1 | # store the function/object used in the project
  2 | 
  3 | # import modules
  4 | from __future__ import print_function
  5 | import numpy as np
  6 | import librosa
  7 | import keras
  8 | from keras.models import Sequential, load_model
  9 | from keras.layers import Dense, Dropout, Flatten
 10 | from keras.layers import Conv2D, MaxPooling2D
 11 | from keras import backend as K
 12 | import tensorflow as tf
 13 | from keras.utils import np_utils
 14 | from keras import regularizers
 15 | import time
 16 | from keras.engine.topology import Layer
 17 | 
 18 | 
 19 | # parameters
 20 | sr = 22050 # if sampling rate is different, resample it to this
 21 | 
 22 | # parameters for calculating spectrogram in mel scale
 23 | fmax = 10000 # maximum frequency considered
 24 | fft_window_points = 512
 25 | fft_window_dur = fft_window_points * 1.0 / sr # 23ms windows
 26 | hop_size = int(fft_window_points/ 2) # 50% overlap between consecutive frames
 27 | n_mels = 64
 28 | 
 29 | # segment duration
 30 | num_fft_windows = 256 # num fft windows per music segment
 31 | segment_in_points = num_fft_windows * 255 # number of data points that insure the spectrogram has size: 64 * 256
 32 | segment_dur = segment_in_points * 1.0 / sr
 33 | 
 34 | num_genres=10
 35 | input_shape=(64, 256, 1)
 36 | 
 37 | 
 38 | def split_data(T, split_idxes):
 39 |     """
 40 |     give the indexes of training, validation, and testing data
 41 |     :param T: label of all data
 42 |     :param split_idxes: splitting points of the data
 43 |     :return:
 44 |     """
 45 |     genres = np.unique(T)
 46 |     training_idxes = []
 47 |     validation_idxes = []
 48 |     testing_idxes = []
 49 |     for idx, music_genre in enumerate(genres):
 50 |         tmp_logidx = music_genre == T
 51 |         tmp_idx = np.flatnonzero(tmp_logidx)
 52 |         tmp_shuffled_idx = np.random.permutation(tmp_idx)
 53 |         tmp_num_examles = len(tmp_shuffled_idx)
 54 |         tmp_split_idxes = np.asarray(split_idxes * tmp_num_examles, dtype=np.int)
 55 |         training_idxes.append(tmp_shuffled_idx[tmp_split_idxes[0] : tmp_split_idxes[1]])
 56 |         validation_idxes.append(tmp_shuffled_idx[tmp_split_idxes[1] : tmp_split_idxes[2]])
 57 |         testing_idxes.append(tmp_shuffled_idx[tmp_split_idxes[2] : tmp_split_idxes[3]])
 58 |     return(np.concatenate(training_idxes), np.concatenate(validation_idxes), np.concatenate(testing_idxes))
 59 | 
 60 | 
 61 | def load_original_data():
 62 |     """
 63 |     load original audio files
 64 |     :return:
 65 |     """
 66 |     import os
 67 |     data_folder = "/home/md/Dropbox/Courses/2017_Spring_Machine_learning/projects/music_gen/genres"
 68 |     # genre_folders = [x[0] for x in os.walk(data_folder)]
 69 |     genre_folders = os.listdir(data_folder)
 70 |     X = []
 71 |     T = []
 72 |     SR = []
 73 |     min_length = 0
 74 |     for sub_folder in genre_folders:
 75 |         genre_path = data_folder + "/" + sub_folder
 76 |         print(genre_path)
 77 |         audio_files = os.listdir(genre_path)
 78 |         for audio_name in audio_files:
 79 |             audio_path = genre_path + "/" + audio_name
 80 |             x, sr = librosa.core.load(audio_path)
 81 |             if x.shape[0] < 30 * sr:
 82 |                 x = np.append(x, np.zeros(30*sr - x.shape[0])) # insure all files are exactly the same length
 83 |                 if min_length < x.shape[0]:
 84 |                     min_length = x.shape[0] # report the duration of the minimum audio clip
 85 |                     print("This audio last %f seconds, zeros are padded at the end." % (x.shape[0]*1.0/sr))
 86 |             X.append(x[:30*sr])
 87 |             SR.append(sr)
 88 |             T.append(sub_folder)
 89 |     return np.asarray(X), np.asarray(SR), np.asarray(T, dtype=str)
 90 | 
 91 | # calculate mel-spectrogram
 92 | def mel_spectrogram(ys, sr, n_mels=n_mels, hop_size=hop_size, fmax=fmax, pre_emphasis=False):
 93 |     """
 94 |     calculate the spectrogram in mel scale, refer to documentation of libriso and MFCC tutorial
 95 |     :param ys:
 96 |     :param sr:
 97 |     :param n_mels:
 98 |     :param hop_size:
 99 |     :param fmax:
100 |     :param pre_emphasis:
101 |     :return:
102 |     """
103 |     if pre_emphasis:
104 |         ys = np.append(ys[0], ys[1:]-pre_emphasis*ys[:-1])
105 |     return librosa.feature.melspectrogram(ys, sr,
106 |                                           n_fft=fft_window_points,
107 |                                           hop_length=hop_size, n_mels=n_mels,
108 |                                           fmax=fmax)
109 | 
110 | # batch convert waveform into spectrogram in mel-scale
111 | def batch_mel_spectrogram(X, SR):
112 |     """
113 |     convert all waveforms in R into time * 64 spectrogram in mel scale
114 |     :param X:
115 |     :param SR:
116 |     :return:
117 |     """
118 |     melspec_list = []
119 |     for idx in xrange(X.shape[0]):
120 |         tmp_melspec = mel_spectrogram(X[idx], SR[idx])
121 |         melspec_list.append(tmp_melspec)
122 |     return np.asarray(melspec_list)
123 | 
124 | 
125 | # def segment_spectrogram(input_spectrogram, num_fft_windows=num_fft_windows):
126 | #     # given a spectrogram of a music that's longer than 3 seconds, segment it into relatively independent pieces
127 | #     length_in_fft = input_spectrogram.shape[1]
128 | #     num_segments = int(length_in_fft / num_fft_windows)
129 | #     pass
130 | 
131 | 
132 | def baseline_model_32(num_genres=num_genres, input_shape=input_shape):
133 |     model = Sequential()
134 |     model.add(Conv2D(32, kernel_size=(3, 3),
135 |                      activation='relu', kernel_regularizer=regularizers.l2(0.01),
136 |                      input_shape=input_shape))
137 |     model.add(MaxPooling2D(pool_size=(2, 4)))
138 |     model.add(Conv2D(64, (3, 5), activation='relu', kernel_regularizer=regularizers.l2(0.01)))
139 |     model.add(MaxPooling2D(pool_size=(2, 4)))
140 |     model.add(Dropout(0.2))
141 |     model.add(Flatten())
142 |     model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02)))
143 |     model.add(Dropout(0.2))
144 |     model.add(Dense(num_genres, activation='softmax'))
145 |     model.compile(loss=keras.losses.categorical_crossentropy,
146 |                   optimizer=keras.optimizers.Adadelta(decay=1e-5),
147 |                   metrics=['accuracy'])
148 |     return(model)
149 | 
150 | def baseline_model_64(num_genres=num_genres, input_shape=input_shape):
151 |     model = Sequential()
152 |     model.add(Conv2D(64, kernel_size=(3, 3),
153 |                      activation='relu', kernel_regularizer=regularizers.l2(0.01),
154 |                      input_shape=input_shape))
155 |     model.add(MaxPooling2D(pool_size=(2, 4)))
156 |     model.add(Conv2D(64, (3, 5), activation='relu', kernel_regularizer=regularizers.l2(0.01)))
157 |     model.add(MaxPooling2D(pool_size=(2, 4)))
158 |     model.add(Dropout(0.2))
159 |     model.add(Flatten())
160 |     model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02)))
161 |     model.add(Dropout(0.2))
162 |     model.add(Dense(num_genres, activation='softmax'))
163 |     model.compile(loss=keras.losses.categorical_crossentropy,
164 |                   optimizer=keras.optimizers.Adadelta(decay=1e-5),
165 |                   metrics=['accuracy'])
166 |     return(model)
167 | 
168 | def baseline_model_96(num_genres=num_genres, input_shape=input_shape):
169 |     model = Sequential()
170 |     model.add(Conv2D(96, kernel_size=(3, 3),
171 |                      activation='relu', kernel_regularizer=regularizers.l2(0.01),
172 |                      input_shape=input_shape))
173 |     model.add(MaxPooling2D(pool_size=(2, 4)))
174 |     model.add(Conv2D(64, (3, 5), activation='relu', kernel_regularizer=regularizers.l2(0.01)))
175 |     model.add(MaxPooling2D(pool_size=(2, 4)))
176 |     model.add(Dropout(0.2))
177 |     model.add(Flatten())
178 |     model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.02)))
179 |     model.add(Dropout(0.2))
180 |     model.add(Dense(num_genres, activation='softmax'))
181 |     model.compile(loss=keras.losses.categorical_crossentropy,
182 |                   optimizer=keras.optimizers.Adadelta(decay=1e-5),
183 |                   metrics=['accuracy'])
184 |     return(model)
185 | 
186 | 
187 | class Music_Genre_CNN(object):
188 | 
189 |     def __init__(self, ann_model):
190 |         self.model = ann_model()
191 | 
192 |     def load_model(self, model_path, custom_objects=None):
193 |         self.model = load_model(model_path, custom_objects=custom_objects)
194 | 
195 |     def train_model(self, input_spectrograms, labels, cv=False,
196 |                     validation_spectrograms=None, validation_labels=None,
197 |                     small_batch_size=150, max_iteration=500, print_interval=1):
198 |         """
199 |         train the CNN model
200 |         :param input_spectrograms: number of training examplex * num of mel bands * number of fft windows * 1
201 |             type: 4D numpy array
202 |         :param labels: vectorized class labels
203 |             type:
204 |         :param cv: whether do cross validation
205 |         :param validation_spectrograms: data used for cross validation
206 |             type: as input_spectrogram
207 |         :param validation_labels: used for cross validation
208 |         :param small_batch_size: size of each training batch
209 |         :param max_iteration:
210 |             maximum number of iterations allowed for one training
211 |         :return:
212 |             trained model
213 |         """
214 |         validation_accuracy_list = []
215 |         for iii in xrange(max_iteration):
216 | 
217 |             st_time = time.time()
218 | 
219 |             # split training data into even batches
220 |             num_training_data = len(input_spectrograms)
221 |             batch_idx = np.random.permutation(num_training_data)
222 |             num_batches = int(num_training_data / small_batch_size)
223 | 
224 |             for jjj in xrange(num_batches - 1):
225 |                 sample_idx = np.random.randint(input_spectrograms.shape[2] - num_fft_windows)
226 |                 training_idx = batch_idx[jjj * small_batch_size: (jjj + 1) * small_batch_size]
227 |                 training_data = input_spectrograms[training_idx, :, sample_idx:sample_idx+num_fft_windows, :]
228 |                 training_label = labels[training_idx]
229 |                 self.model.train_on_batch(training_data, training_label)
230 |                 training_accuracy = self.model.evaluate(training_data, training_label)
231 |                 # print("Training accuracy is: %f" % (training_accuracy))
232 | 
233 |             end_time = time.time()
234 |             elapsed_time = end_time - st_time
235 |             if cv:
236 |                 validation_accuracy = self.model.evaluate(validation_spectrograms[:, :, sample_idx:sample_idx+num_fft_windows, :], validation_labels)
237 |                 validation_accuracy_list.append(validation_accuracy[1])
238 |             else:
239 |                 validation_accuracy = [-1.0, -1.0]
240 | 
241 |             if iii % print_interval == 0:
242 |                 print("\nTime elapsed: %f; Training accuracy: %f, Validation accuracy: %f\n" %
243 |                       (elapsed_time, training_accuracy[1], validation_accuracy[1]))
244 |         if cv:
245 |             return np.asarray(validation_accuracy_list)
246 | 
247 | 
248 |     def song_spectrogram_prediction(self, song_mel_spectrogram, overlap):
249 |         """
250 |         give the predicted_probability for each class and each segment
251 |         :param song_mel_spectrogram:
252 |             4D numpy array: num of time windows * mel bands * 1 (depth)
253 |         :param overlap:
254 |             overlap between segments, overlap = 0 means no overlap between segments
255 |         :return:
256 |             predictions: numpy array (number of segments * num classes)
257 |         """
258 |         # 1st segment spectrogram into sizes of 64 * 256
259 |         largest_idx = song_mel_spectrogram.shape[1] - num_fft_windows - 1
260 |         step_size = int((1 - overlap) * num_fft_windows)
261 |         num_segments = int(largest_idx / step_size)
262 |         segment_edges = np.arange(num_segments) * step_size
263 |         segment_list = []
264 |         for idx in segment_edges:
265 |             segment = song_mel_spectrogram[:, idx : idx + num_fft_windows]
266 |             segment_list.append(segment)
267 |         segment_array = np.asarray(segment_list)[:, :, :, np.newaxis]
268 |         predictions = self.model.predict_proba(segment_array, batch_size=len(segment_array), verbose=0)
269 |         summarized_prediction = np.argmax(predictions.sum(axis=0))
270 |         return(summarized_prediction, predictions)
271 | 
272 |     def test_model(self, test_X, test_T, overlap=0.5):
273 |         # test the accuracy of the model using testing data
274 |         num_sample = len(test_T)
275 |         correct_labels = np.argmax(test_T, axis=1)
276 |         predicted_labels = np.zeros(num_sample)
277 |         for iii in xrange(len(test_X)):
278 |             song_mel_spectrogram = test_X[iii].squeeze()
279 |             predicted_labels[iii], _ = self.song_spectrogram_prediction(song_mel_spectrogram, overlap=overlap)
280 |             # correct_labels[iii] = np.argmax(test_T[iii])
281 |         confusion_data = np.vstack((predicted_labels, correct_labels)).T
282 |         accuracy = np.sum(correct_labels == predicted_labels) * 1.0 / num_sample
283 |         return(accuracy, confusion_data)
284 | 
285 |     def backup_model(self, model_bk_name=False):
286 |         if not model_bk_name:
287 |             year, month, day, hour, minute = time.strftime("%Y,%m,%d,%H,%M").split(',')
288 |             model_bk_name = "mgcnn_" + month + day + hour + minute + ".h5"
289 |         self.model.save(model_bk_name)
290 | 
291 |     def song_genre_prediction(self, song_waveform):
292 |         # resample the song into single channel, 22050 sampling frequency
293 | 
294 |         # convert into mel-scale spectrogram
295 | 
296 |         # predict using trained model
297 | 
298 |         #
299 | 
300 |         pass
301 | 
302 | 
303 | 


--------------------------------------------------------------------------------