├── Audio-Data-Analysis-CNN.py
├── README.md
├── audio_data_analysis_ANN.py
└── images.jpeg

/Audio-Data-Analysis-CNN.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from numpy import argmax
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import librosa.display
import IPython.display
import random
import warnings
import os
from PIL import Image
import pathlib
import csv
# sklearn preprocessing
from sklearn.model_selection import train_test_split
# Keras
import keras
warnings.filterwarnings('ignore')
from keras import layers
from keras.layers import Activation, Dense, Dropout, Conv2D, Flatten, MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling1D, AveragePooling2D, Input, Add
from keras.models import Sequential
from keras.optimizers import SGD

# Convert every audio file into a spectrogram image, one folder per genre.
cmap = plt.get_cmap('inferno')
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    pathlib.Path(f'img_data/{g}').mkdir(parents=True, exist_ok=True)
    for filename in os.listdir(f'./drive/My Drive/genres/{g}'):
        songname = f'./drive/My Drive/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=5)
        print(y.shape)
        plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
        plt.axis('off')
        plt.savefig(f'img_data/{g}/{filename[:-3].replace(".", "")}.png')
        plt.clf()

import split_folders
# To split into only a training and a validation set, pass a 2-tuple to `ratio`, e.g. (.8, .2).
split_folders.ratio('./img_data/', output="./data", seed=1337, ratio=(.8, .2))

from keras.preprocessing.image import ImageDataGenerator
train_datagen = ImageDataGenerator(
    rescale=1./255,      # rescale pixel values from 0-255 to the range (0, 1)
    shear_range=0.2,     # apply random shear transformations
    zoom_range=0.2,      # apply random zoom
    horizontal_flip=True)  # flip images horizontally at random
test_datagen = ImageDataGenerator(rescale=1./255)

training_set = train_datagen.flow_from_directory(
    './data/train',
    target_size=(64, 64),
    batch_size=32,
    class_mode='categorical',
    shuffle=False)
test_set = test_datagen.flow_from_directory(
    './data/val',
    target_size=(64, 64),
    batch_size=32,
    class_mode='categorical',
    shuffle=False)
model = Sequential()
input_shape = (64, 64, 3)
# 1st hidden layer
model.add(Conv2D(32, (3, 3), strides=(2, 2), input_shape=input_shape))
model.add(AveragePooling2D((2, 2), strides=(2, 2)))
model.add(Activation('relu'))
# 2nd hidden layer
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(AveragePooling2D((2, 2), strides=(2, 2)))
model.add(Activation('relu'))
# 3rd hidden layer
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(AveragePooling2D((2, 2), strides=(2, 2)))
model.add(Activation('relu'))
# Flatten
model.add(Flatten())
model.add(Dropout(rate=0.5))
# Fully connected layer
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(rate=0.5))
# Output layer
model.add(Dense(10))
model.add(Activation('softmax'))
model.summary()

epochs = 200
batch_size = 8
learning_rate = 0.01
decay_rate = learning_rate / epochs
momentum = 0.9
sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
model.compile(optimizer=sgd, loss="categorical_crossentropy", metrics=['accuracy'])

model.fit_generator(
    training_set,
    steps_per_epoch=100,
    epochs=50,
    validation_data=test_set,
    validation_steps=200)

# Model evaluation
model.evaluate_generator(generator=test_set, steps=50)
# OUTPUT
# [1.704445120342617, 0.33798882681564246]

test_set.reset()
pred = model.predict_generator(test_set, steps=50, verbose=1)

predicted_class_indices = np.argmax(pred, axis=1)

# Invert the class_indices mapping to recover genre names from indices.
labels = training_set.class_indices
labels = dict((v, k) for k, v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]
predictions = predictions[:200]
filenames = test_set.filenames

print(len(filenames), len(predictions))
# (200, 200)

results = pd.DataFrame({"Filename": filenames,
                        "Predictions": predictions})
results.to_csv("prediction_results.csv", index=False)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Audio Data Analysis Using Deep Learning

Audio data analysis is about analyzing and understanding audio signals captured by digital devices, with numerous applications in the enterprise, healthcare, productivity, and smart cities.

In this repository we analyze audio data and extract the relevant features from sound/audio files. We also build an Artificial Neural Network (**ANN**) and a Convolutional Neural Network (**CNN**) for classifying music genres.

![NN](https://github.com/nageshsinghc4/Audio-Data-Analysis-Using-Deep-Learning/blob/master/images.jpeg)


Prerequisites:

1. **Librosa**: analyzes audio signals in general but is geared more towards music. It includes the nuts and bolts to build a MIR (Music Information Retrieval) system.

```pip install librosa```


2. **IPython.display.Audio**: lets you play audio directly in a Jupyter notebook.

## Important terminologies:

### Spectrogram
A spectrogram is a visual way of representing the signal strength, or "loudness", of a signal over time at the various frequencies present in a particular waveform.

### Feature extraction from an audio signal
Every audio signal consists of many features, but we must extract the characteristics that are relevant to the problem we are trying to solve. The process of extracting features to use for analysis is called feature extraction. Let us study a few of the features in detail.

Frequency-domain (spectral) features are obtained by converting the time-based signal into the frequency domain using the Fourier transform. Commonly used features include (a short extraction sketch follows the list):

1. Spectral centroid
2. Spectral rolloff
3. Spectral bandwidth
4. Zero-crossing rate
5. Mel-frequency cepstral coefficients (MFCCs)
6. Chroma feature
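
As a minimal sketch of how these features can be computed with librosa (the file path here is illustrative, not part of the repository):

```
import librosa
import numpy as np

# Illustrative path -- replace with any audio file you have locally.
y, sr = librosa.load('audio.wav', mono=True, duration=30)

centroid  = librosa.feature.spectral_centroid(y=y, sr=sr)
rolloff   = librosa.feature.spectral_rolloff(y=y, sr=sr)
bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
zcr       = librosa.feature.zero_crossing_rate(y)
mfccs     = librosa.feature.mfcc(y=y, sr=sr)        # 20 coefficients by default
chroma    = librosa.feature.chroma_stft(y=y, sr=sr)

# Each call returns a (n_features, n_frames) array; a common trick
# (used in audio_data_analysis_ANN.py) is to summarise each with its mean.
print(np.mean(centroid), np.mean(zcr), mfccs.shape)
```

The same calls are used in `audio_data_analysis_ANN.py` to build the training CSV.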
Please consider reading these articles on audio data analysis and its step-by-step implementation: [part 1](https://www.theaidream.com/post/audio-data-analysis-using-deep-learning-with-python-part-1) and [part 2](https://www.theaidream.com/post/audio-data-analysis-using-deep-learning-with-python-part-2).
--------------------------------------------------------------------------------
/audio_data_analysis_ANN.py:
--------------------------------------------------------------------------------
#pip install librosa
# How to handle audio data
import librosa
audio_data = '/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav'
x, sr = librosa.load(audio_data)
print(type(x), type(sr))
print(x.shape, sr)

# Visualize an audio file
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)

# Spectrogram
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar()

# The same spectrogram on a log-frequency axis
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
plt.colorbar()

# Audio widget
import IPython.display as ipd
ipd.Audio(audio_data)

# Creating an audio signal
import numpy as np
sr = 22050  # sample rate
T = 5.0     # seconds
t = np.linspace(0, T, int(T*sr), endpoint=False)  # time variable
x = 0.5*np.sin(2*np.pi*220*t)  # pure sine wave at 220 Hz
# Playing the audio
ipd.Audio(x, rate=sr)  # load a NumPy array
# Saving the audio
librosa.output.write_wav('tone_220.wav', x, sr)

# Feature extraction
# 1. Zero-crossing rate: the rate of sign changes along a signal, i.e. the rate
# at which the signal changes from positive to negative or back.
# Load the signal
x, sr = librosa.load('/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav')
# Plot the signal
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)
# Zooming in
n0 = 9000
n1 = 9100
plt.figure(figsize=(14, 5))
plt.plot(x[n0:n1])
plt.grid()

# 2. Spectral centroid: indicates where the "centre of mass" of a sound is located,
# calculated as the weighted mean of the frequencies present in the sound.
import sklearn
spectral_centroids = librosa.feature.spectral_centroid(x, sr=sr)[0]
spectral_centroids.shape
# (775,)
# Computing the time variable for visualization
frames = range(len(spectral_centroids))
t = librosa.frames_to_time(frames)

# Normalising the spectral centroid for visualisation
plt.figure(figsize=(14, 5))
def normalize(x, axis=0):
    return sklearn.preprocessing.minmax_scale(x, axis=axis)
# Plotting the spectral centroid along the waveform
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_centroids), color='r')

# 3. Spectral rolloff: a measure of the shape of the signal. It represents the
# frequency below which a specified percentage of the total spectral energy lies.
spectral_rolloff = librosa.feature.spectral_rolloff(x+0.01, sr=sr)[0]
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_rolloff), color='r')
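
# The percentage is tunable: librosa's spectral_rolloff accepts a roll_percent
# argument (default 0.85), i.e. by default it returns the frequency below which
# 85% of the spectral energy lies. A hedged sketch of a stricter threshold:
spectral_rolloff_99 = librosa.feature.spectral_rolloff(x+0.01, sr=sr, roll_percent=0.99)[0]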
# 4. Spectral bandwidth
spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(x+0.01, sr=sr)[0]
spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(x+0.01, sr=sr, p=3)[0]
spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(x+0.01, sr=sr, p=4)[0]
plt.figure(figsize=(15, 9))
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_bandwidth_2), color='r')
plt.plot(t, normalize(spectral_bandwidth_3), color='g')
plt.plot(t, normalize(spectral_bandwidth_4), color='y')
plt.legend(('p = 2', 'p = 3', 'p = 4'))


# 5. Mel-frequency cepstral coefficients: the MFCCs of a signal are a small set
# of features (usually about 10-20) which concisely describe the overall shape
# of a spectral envelope.
x, fs = librosa.load('/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav')
librosa.display.waveplot(x, sr=fs)
mfccs = librosa.feature.mfcc(x, sr=fs)
print(mfccs.shape)
# (20, 97)
# Displaying the MFCCs
librosa.display.specshow(mfccs, sr=fs, x_axis='time')

# 6. Chroma frequencies: chroma features are an interesting and powerful
# representation for music audio in which the entire spectrum is projected onto
# 12 bins representing the 12 distinct semitones (or chroma) of the musical octave.
# Loading the file
x, sr = librosa.load('/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav')
hop_length = 512
chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length)
plt.figure(figsize=(15, 5))
librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')

"""
In this section, we will model a classifier to classify songs into different genres.
Let us assume a scenario in which, for some reason, we find a bunch of randomly named
MP3 files on our hard disk, which are assumed to contain music. Our task is to sort
them according to the music genre into different folders such as jazz, classical,
country, pop, rock, and metal.
"""
import pandas as pd
import numpy as np
from numpy import argmax
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import librosa.display
import IPython.display
import random
import warnings
import os
from PIL import Image
import pathlib
import csv
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Keras
import keras
warnings.filterwarnings('ignore')
from keras import layers
from keras.layers import Activation, Dense, Dropout, Conv1D, Conv2D, Flatten, BatchNormalization, ZeroPadding2D, MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling1D, AveragePooling2D, Input, Add
from keras.models import Sequential
from keras import regularizers
from keras.optimizers import SGD
import keras.backend as K
from keras.models import load_model
from keras.callbacks import EarlyStopping

"""
Extracting music and features

Dataset
We use the GTZAN genre collection dataset for classification.
The dataset consists of 10 genres:
blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock.
Each genre contains 100 songs. Total dataset: 1000 songs.
"""
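
# Sanity check (a hedged sketch -- the Drive path is taken from the loading code
# below and may differ on your setup): confirm all 10 genre folders are present,
# each with 100 clips.
for g in 'blues classical country disco hiphop jazz metal pop reggae rock'.split():
    print(g, len(os.listdir(f'./drive/My Drive/genres/{g}')))  # expect 100 per genre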
""" Extracting the spectrogram for every audio file """

cmap = plt.get_cmap('inferno')
plt.figure(figsize=(8, 8))
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    pathlib.Path(f'img_data/{g}').mkdir(parents=True, exist_ok=True)
    for filename in os.listdir(f'./drive/My Drive/genres/{g}'):
        songname = f'./drive/My Drive/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=5)
        print(y.shape)
        plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
        plt.axis('off')
        plt.savefig(f'img_data/{g}/{filename[:-3].replace(".", "")}.png')
        plt.clf()

"""
All the audio files get converted into their respective spectrograms. We can now
easily extract features from them.
"""
"""
Extracting features from the spectrogram:

Mel-frequency cepstral coefficients (MFCC) (20 in number)
Spectral centroid
Zero-crossing rate
Chroma frequencies
Spectral rolloff
"""

header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
for i in range(1, 21):
    header += f' mfcc{i}'
header += ' label'
header = header.split()

# We write the data to a CSV file
file = open('dataset.csv', 'w', newline='')
with file:
    writer = csv.writer(file)
    writer.writerow(header)
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    for filename in os.listdir(f'./drive/My Drive/genres/{g}'):
        songname = f'./drive/My Drive/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=30)
        rmse = librosa.feature.rmse(y=y)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'
        for e in mfcc:
            to_append += f' {np.mean(e)}'
        to_append += f' {g}'
        file = open('dataset.csv', 'a', newline='')
        with file:
            writer = csv.writer(file)
            writer.writerow(to_append.split())

# Analysing the data in pandas
data = pd.read_csv('dataset.csv')
data.head()
# Dropping unnecessary columns
data = data.drop(['filename'], axis=1)
# Encoding the labels
genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)
# Scaling the feature columns
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype=float))
# Dividing the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
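
# Shape check (illustrative values, assuming the full 1000-song GTZAN set):
# 26 feature columns (6 summary features + 20 MFCC means) after dropping
# `filename` and `label`, split 800/200 by the 0.2 test_size above.
print(X_train.shape, X_test.shape)  # e.g. (800, 26), (200, 26)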
# ANN implementation
model = Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

classifier = model.fit(X_train,
                       y_train,
                       epochs=100,
                       batch_size=128)

test_loss, test_acc = model.evaluate(X_test, y_test)
print('test_acc: ', test_acc)

# Validating our approach
x_val = X_train[:200]
partial_x_train = X_train[200:]

y_val = y_train[:200]
partial_y_train = y_train[200:]

model = Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(partial_x_train,
          partial_y_train,
          epochs=30,
          batch_size=512,
          validation_data=(x_val, y_val))
results = model.evaluate(X_test, y_test)


# Predictions on test data
predictions = model.predict(X_test)
# predict() returns softmax probabilities; take the argmax to get class indices.
predicted_classes = np.argmax(predictions, axis=1)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predicted_classes)
--------------------------------------------------------------------------------
/images.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nageshsinghc4/Audio-Data-Analysis-Using-Deep-Learning/f2823057dd02c0810706f38013dafc5830352a05/images.jpeg
--------------------------------------------------------------------------------