├── Audio-Data-Analysis-CNN.py
├── README.md
├── audio_data_analysis_ANN.py
└── images.jpeg

/Audio-Data-Analysis-CNN.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from numpy import argmax
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import librosa.display
import IPython.display
import random
import warnings
import os
from PIL import Image
import pathlib
import csv
# sklearn preprocessing
from sklearn.model_selection import train_test_split
# Keras
import keras
warnings.filterwarnings('ignore')
from keras import layers
from keras.layers import Activation, Dense, Dropout, Conv2D, Flatten, MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling1D, AveragePooling2D, Input, Add
from keras.models import Sequential
from keras.optimizers import SGD

# Convert every audio file into a spectrogram image, one folder per genre.
cmap = plt.get_cmap('inferno')
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    pathlib.Path(f'img_data/{g}').mkdir(parents=True, exist_ok=True)
    for filename in os.listdir(f'./drive/My Drive/genres/{g}'):
        songname = f'./drive/My Drive/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=5)
        print(y.shape)
        plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
        plt.axis('off')
        plt.savefig(f'img_data/{g}/{filename[:-3].replace(".", "")}.png')
        plt.clf()

import split_folders
# To split into only a training and a validation set, pass a 2-tuple to `ratio`, e.g. (.8, .2).
split_folders.ratio('./img_data/', output="./data", seed=1337, ratio=(.8, .2))

from keras.preprocessing.image import ImageDataGenerator
train_datagen = ImageDataGenerator(
    rescale=1./255,      # rescale pixel values from 0-255 to the range (0, 1)
    shear_range=0.2,     # apply random shear transformations
    zoom_range=0.2,      # apply random zoom
    horizontal_flip=True)  # flip images horizontally at random
test_datagen = ImageDataGenerator(rescale=1./255)

training_set = train_datagen.flow_from_directory(
    './data/train',
    target_size=(64, 64),
    batch_size=32,
    class_mode='categorical',
    shuffle=False)
test_set = test_datagen.flow_from_directory(
    './data/val',
    target_size=(64, 64),
    batch_size=32,
    class_mode='categorical',
    shuffle=False)
model = Sequential()
input_shape = (64, 64, 3)
# 1st hidden layer
model.add(Conv2D(32, (3, 3), strides=(2, 2), input_shape=input_shape))
model.add(AveragePooling2D((2, 2), strides=(2, 2)))
model.add(Activation('relu'))
# 2nd hidden layer
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(AveragePooling2D((2, 2), strides=(2, 2)))
model.add(Activation('relu'))
# 3rd hidden layer
model.add(Conv2D(64, (3, 3), padding="same"))
model.add(AveragePooling2D((2, 2), strides=(2, 2)))
model.add(Activation('relu'))
# Flatten
model.add(Flatten())
model.add(Dropout(rate=0.5))
# Fully connected layer
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(rate=0.5))
# Output layer
model.add(Dense(10))
model.add(Activation('softmax'))
model.summary()

epochs = 200
batch_size = 8
learning_rate = 0.01
decay_rate = learning_rate / epochs
momentum = 0.9
sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
model.compile(optimizer=sgd, loss="categorical_crossentropy", metrics=['accuracy'])

model.fit_generator(
    training_set,
    steps_per_epoch=100,
    epochs=50,
    validation_data=test_set,
    validation_steps=200)

# Model evaluation
model.evaluate_generator(generator=test_set, steps=50)
# OUTPUT
# [1.704445120342617, 0.33798882681564246]

test_set.reset()
pred = model.predict_generator(test_set, steps=50, verbose=1)

predicted_class_indices = np.argmax(pred, axis=1)

# Invert the class_indices mapping to recover genre names from indices.
labels = training_set.class_indices
labels = dict((v, k) for k, v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]
predictions = predictions[:200]
filenames = test_set.filenames

print(len(filenames), len(predictions))
# (200, 200)

results = pd.DataFrame({"Filename": filenames,
                        "Predictions": predictions})
results.to_csv("prediction_results.csv", index=False)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Audio Data Analysis Using Deep Learning

Audio data analysis is about analyzing and understanding audio signals captured by digital devices, with numerous applications in the enterprise, healthcare, productivity, and smart cities.

In this repository we analyze audio data and extract the relevant features from sound/audio files. We also build an Artificial Neural Network (**ANN**) and a Convolutional Neural Network (**CNN**) for classifying music genres.

![NN](https://github.com/nageshsinghc4/Audio-Data-Analysis-Using-Deep-Learning/blob/master/images.jpeg)


Prerequisites:

1. **Librosa**: analyzes audio signals in general but is geared more towards music. It includes the nuts and bolts to build a MIR (Music Information Retrieval) system.

```pip install librosa```


2. **IPython.display.Audio**: lets you play audio directly in a Jupyter notebook.

## Important terminologies:

### Spectrogram
A spectrogram is a visual way of representing the signal strength, or "loudness", of a signal over time at the various frequencies present in a particular waveform.

### Feature extraction from an audio signal
Every audio signal consists of many features, but we must extract the characteristics that are relevant to the problem we are trying to solve. The process of extracting features to use for analysis is called feature extraction. Let us study a few of the features in detail.

Frequency-domain (spectral) features are obtained by converting the time-based signal into the frequency domain using the Fourier transform. Commonly used features include (a short extraction sketch follows the list):

1. Spectral centroid
2. Spectral rolloff
3. Spectral bandwidth
4. Zero-crossing rate
5. Mel-frequency cepstral coefficients (MFCCs)
6. Chroma feature
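
As a minimal sketch of how these features can be computed with librosa (the file path here is illustrative, not part of the repository):

```
import librosa
import numpy as np

# Illustrative path -- replace with any audio file you have locally.
y, sr = librosa.load('audio.wav', mono=True, duration=30)

centroid  = librosa.feature.spectral_centroid(y=y, sr=sr)
rolloff   = librosa.feature.spectral_rolloff(y=y, sr=sr)
bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
zcr       = librosa.feature.zero_crossing_rate(y)
mfccs     = librosa.feature.mfcc(y=y, sr=sr)        # 20 coefficients by default
chroma    = librosa.feature.chroma_stft(y=y, sr=sr)

# Each call returns a (n_features, n_frames) array; a common trick
# (used in audio_data_analysis_ANN.py) is to summarise each with its mean.
print(np.mean(centroid), np.mean(zcr), mfccs.shape)
```

The same calls are used in `audio_data_analysis_ANN.py` to build the training CSV.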
Please consider reading these articles on audio data analysis and its step-by-step implementation: [part 1](https://www.theaidream.com/post/audio-data-analysis-using-deep-learning-with-python-part-1) and [part 2](https://www.theaidream.com/post/audio-data-analysis-using-deep-learning-with-python-part-2).
--------------------------------------------------------------------------------
/audio_data_analysis_ANN.py:
--------------------------------------------------------------------------------
#pip install librosa
# How to handle audio data
import librosa
audio_data = '/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav'
x, sr = librosa.load(audio_data)
print(type(x), type(sr))
print(x.shape, sr)

# Visualize an audio file
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)

# Spectrogram
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar()

# The same spectrogram on a log-frequency axis
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
plt.colorbar()

# Audio widget
import IPython.display as ipd
ipd.Audio(audio_data)

# Creating an audio signal
import numpy as np
sr = 22050  # sample rate
T = 5.0     # seconds
t = np.linspace(0, T, int(T*sr), endpoint=False)  # time variable
x = 0.5*np.sin(2*np.pi*220*t)  # pure sine wave at 220 Hz
# Playing the audio
ipd.Audio(x, rate=sr)  # load a NumPy array
# Saving the audio
librosa.output.write_wav('tone_220.wav', x, sr)

# Feature extraction
# 1. Zero-crossing rate: the rate of sign changes along a signal, i.e. the rate
# at which the signal changes from positive to negative or back.
# Load the signal
x, sr = librosa.load('/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav')
# Plot the signal
plt.figure(figsize=(14, 5))
librosa.display.waveplot(x, sr=sr)
# Zooming in
n0 = 9000
n1 = 9100
plt.figure(figsize=(14, 5))
plt.plot(x[n0:n1])
plt.grid()

# 2. Spectral centroid: indicates where the "centre of mass" of a sound is located,
# calculated as the weighted mean of the frequencies present in the sound.
import sklearn
spectral_centroids = librosa.feature.spectral_centroid(x, sr=sr)[0]
spectral_centroids.shape
# (775,)
# Computing the time variable for visualization
frames = range(len(spectral_centroids))
t = librosa.frames_to_time(frames)

# Normalising the spectral centroid for visualisation
plt.figure(figsize=(14, 5))
def normalize(x, axis=0):
    return sklearn.preprocessing.minmax_scale(x, axis=axis)
# Plotting the spectral centroid along the waveform
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_centroids), color='r')

# 3. Spectral rolloff: a measure of the shape of the signal. It represents the
# frequency below which a specified percentage of the total spectral energy lies.
spectral_rolloff = librosa.feature.spectral_rolloff(x+0.01, sr=sr)[0]
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_rolloff), color='r')
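
# The percentage is tunable: librosa's spectral_rolloff accepts a roll_percent
# argument (default 0.85), i.e. by default it returns the frequency below which
# 85% of the spectral energy lies. A hedged sketch of a stricter threshold:
spectral_rolloff_99 = librosa.feature.spectral_rolloff(x+0.01, sr=sr, roll_percent=0.99)[0]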
# 4. Spectral bandwidth
spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(x+0.01, sr=sr)[0]
spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(x+0.01, sr=sr, p=3)[0]
spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(x+0.01, sr=sr, p=4)[0]
plt.figure(figsize=(15, 9))
librosa.display.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_bandwidth_2), color='r')
plt.plot(t, normalize(spectral_bandwidth_3), color='g')
plt.plot(t, normalize(spectral_bandwidth_4), color='y')
plt.legend(('p = 2', 'p = 3', 'p = 4'))


# 5. Mel-frequency cepstral coefficients: the MFCCs of a signal are a small set
# of features (usually about 10-20) which concisely describe the overall shape
# of a spectral envelope.
x, fs = librosa.load('/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav')
librosa.display.waveplot(x, sr=fs)
mfccs = librosa.feature.mfcc(x, sr=fs)
print(mfccs.shape)
# (20, 97)
# Displaying the MFCCs
librosa.display.specshow(mfccs, sr=fs, x_axis='time')

# 6. Chroma frequencies: chroma features are an interesting and powerful
# representation for music audio in which the entire spectrum is projected onto
# 12 bins representing the 12 distinct semitones (or chroma) of the musical octave.
# Loading the file
x, sr = librosa.load('/Users/nageshsinghchauhan/Downloads/ML/music_classification/gruesome.wav')
hop_length = 512
chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length)
plt.figure(figsize=(15, 5))
librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')

"""
In this section, we will model a classifier to classify songs into different genres.
Let us assume a scenario in which, for some reason, we find a bunch of randomly named
MP3 files on our hard disk, which are assumed to contain music. Our task is to sort
them according to the music genre into different folders such as jazz, classical,
country, pop, rock, and metal.
"""
import pandas as pd
import numpy as np
from numpy import argmax
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import librosa.display
import IPython.display
import random
import warnings
import os
from PIL import Image
import pathlib
import csv
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Keras
import keras
warnings.filterwarnings('ignore')
from keras import layers
from keras.layers import Activation, Dense, Dropout, Conv1D, Conv2D, Flatten, BatchNormalization, ZeroPadding2D, MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling1D, AveragePooling2D, Input, Add
from keras.models import Sequential
from keras import regularizers
from keras.optimizers import SGD
import keras.backend as K
from keras.models import load_model
from keras.callbacks import EarlyStopping

"""
Extracting music and features

Dataset
We use the GTZAN genre collection dataset for classification.
The dataset consists of 10 genres:
blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock.
Each genre contains 100 songs. Total dataset: 1000 songs.
"""
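
# Sanity check (a hedged sketch -- the Drive path is taken from the loading code
# below and may differ on your setup): confirm all 10 genre folders are present,
# each with 100 clips.
for g in 'blues classical country disco hiphop jazz metal pop reggae rock'.split():
    print(g, len(os.listdir(f'./drive/My Drive/genres/{g}')))  # expect 100 per genre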
""" Extracting the spectrogram for every audio file """

cmap = plt.get_cmap('inferno')
plt.figure(figsize=(8, 8))
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    pathlib.Path(f'img_data/{g}').mkdir(parents=True, exist_ok=True)
    for filename in os.listdir(f'./drive/My Drive/genres/{g}'):
        songname = f'./drive/My Drive/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=5)
        print(y.shape)
        plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
        plt.axis('off')
        plt.savefig(f'img_data/{g}/{filename[:-3].replace(".", "")}.png')
        plt.clf()

"""
All the audio files get converted into their respective spectrograms. We can now
easily extract features from them.
"""
"""
Extracting features from the spectrogram:

Mel-frequency cepstral coefficients (MFCC) (20 in number)
Spectral centroid
Zero-crossing rate
Chroma frequencies
Spectral rolloff
"""

header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
for i in range(1, 21):
    header += f' mfcc{i}'
header += ' label'
header = header.split()

# We write the data to a CSV file
file = open('dataset.csv', 'w', newline='')
with file:
    writer = csv.writer(file)
    writer.writerow(header)
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    for filename in os.listdir(f'./drive/My Drive/genres/{g}'):
        songname = f'./drive/My Drive/genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=30)
        rmse = librosa.feature.rmse(y=y)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'
        for e in mfcc:
            to_append += f' {np.mean(e)}'
        to_append += f' {g}'
        file = open('dataset.csv', 'a', newline='')
        with file:
            writer = csv.writer(file)
            writer.writerow(to_append.split())

# Analysing the data in pandas
data = pd.read_csv('dataset.csv')
data.head()
# Dropping unnecessary columns
data = data.drop(['filename'], axis=1)
# Encoding the labels
genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)
# Scaling the feature columns
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype=float))
# Dividing the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
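
# Shape check (illustrative values, assuming the full 1000-song GTZAN set):
# 26 feature columns (6 summary features + 20 MFCC means) after dropping
# `filename` and `label`, split 800/200 by the 0.2 test_size above.
print(X_train.shape, X_test.shape)  # e.g. (800, 26), (200, 26)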
# ANN implementation
model = Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

classifier = model.fit(X_train,
                       y_train,
                       epochs=100,
                       batch_size=128)

test_loss, test_acc = model.evaluate(X_test, y_test)
print('test_acc: ', test_acc)

# Validating our approach
x_val = X_train[:200]
partial_x_train = X_train[200:]

y_val = y_train[:200]
partial_y_train = y_train[200:]

model = Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(partial_x_train,
          partial_y_train,
          epochs=30,
          batch_size=512,
          validation_data=(x_val, y_val))
results = model.evaluate(X_test, y_test)


# Predictions on test data
predictions = model.predict(X_test)
# predict() returns softmax probabilities; take the argmax to get class indices.
predicted_classes = np.argmax(predictions, axis=1)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predicted_classes)
--------------------------------------------------------------------------------
/images.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nageshsinghc4/Audio-Data-Analysis-Using-Deep-Learning/f2823057dd02c0810706f38013dafc5830352a05/images.jpeg
--------------------------------------------------------------------------------