├── .ipynb_checkpoints
│   └── urbanclap_audio_classification-checkpoint.ipynb
├── Exploring the UrbanClap Data.ipynb
├── README.md
├── urbanclap_audio_classification.ipynb
└── urbanclap_audio_classification.py

/README.md:
--------------------------------------------------------------------------------
# Audio-Classification-using-Deep-Learning
Classifying 10 different categories of Urban Sounds using Deep Learning.

The audio files can be downloaded from the following link:
https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU


## IMPORTANT: The folders should be arranged in the following manner:
Dir of train labels: sounds/labels/train.csv

Dir of test labels: sounds/labels/test.csv

Dir of train sounds: sounds/train/train_sound/ (audio files in .wav format)

Dir of test sounds: sounds/test/test_sound/ (audio files in .wav format)


### The train folder is labelled
### The test folder is not labelled

To load an audio signal into a machine-understandable format, we sample it: we simply record the signal's value at fixed time steps, splitting one continuous signal into a handful of discrete values. For example, in a 2-second audio file we might extract a value every half second.
![Alt Text](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2017/08/23210623/sound.png)
This is called sampling of audio data, and the rate at which it is sampled is called the sampling rate.

The signal can also be viewed in the frequency domain, where a mixture of different pure signals is represented by one unique value per tone (for example, three pure signals become three unique values).

There are a few more ways in which audio data can be represented, for example using MFCCs (Mel-Frequency Cepstral Coefficients). These are simply different ways of representing the same data.

Next, we extract features from these audio representations so that our Deep Learning model can work on those features and perform the task it is designed for.
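As a rough sketch of the two ideas above (sampling and MFCC features) using librosa — the file path and the number of coefficients are illustrative assumptions, not values fixed by this project:

```python
import librosa
import numpy as np

# Load a clip; librosa resamples to 22050 Hz by default, so `signal` holds
# one amplitude value per sample and `sr` is the sampling rate.
signal, sr = librosa.load('sounds/train/train_sound/1.wav')   # path is illustrative
print(signal.shape, sr)

# The same clip as MFCCs (Mel-Frequency Cepstral Coefficients); averaging over
# time gives one fixed-length feature vector per clip for the model to learn from.
mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
feature_vector = np.mean(mfccs.T, axis=0)   # shape: (40,)
```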
--------------------------------------------------------------------------------
/urbanclap_audio_classification.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[61]:


# importing necessary libraries and dependencies
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
# train_test_split lives in sklearn.model_selection (sklearn.cross_validation was removed)
from sklearn.model_selection import train_test_split
from sklearn import metrics


# In[2]:


train_data_dir = 'sounds/train/train_sound/'
test_data_dir = 'sounds/test/test_sound/'

# reading the labels
train = pd.read_csv('sounds/labels/train.csv')
test = pd.read_csv('sounds/labels/test.csv')


# In[3]:


# function to load a file and extract its features
def parser(row, data_dir):
    # setting the path of the audio file for this row
    file_name = os.path.join(data_dir, str(row.ID) + '.wav')
    print(file_name)
    # check if the file is corrupted
    try:
        # kaiser_fast is a resampling technique used for faster extraction
        # X -> audio time-series data; sample_rate -> sampling rate
        X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # extracting Mel-Frequency Cepstral Coefficients from the data
        # y -> time-series audio data; sr -> sampling rate
        # n_mfcc -> number of MFCCs to return
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

    except Exception as e:
        print("Error encountered while parsing file: ", e)
        return [None, None]

    # store the mfcc features
    feature = mfccs
    # store the respective id
    data_id = row.ID

    return [data_id, feature]


# ### Reading train.csv and storing into temp

# In[4]:


# parsing train; result_type='expand' turns the returned [ID, feature] lists
# into a two-column DataFrame instead of a Series of lists
temp = train.apply(parser, axis=1, data_dir=train_data_dir, result_type='expand')
temp.columns = ['ID', 'feature']


# In[5]:


# adding Class to 'temp'
temp['Class'] = train['Class']


# In[6]:


type(temp)


# ### Reading test.csv and storing into temp_test

# In[7]:


# parsing test
temp_test = test.apply(parser, axis=1, data_dir=test_data_dir, result_type='expand')
temp_test.columns = ['ID', 'feature']


# In[16]:


# with result_type='expand' above, temp_test is already a DataFrame with
# 'ID' and 'feature' columns, so no manual unpacking is needed here
type(temp_test)


# In[32]:


print("\n---------------------train data---------------------")
print(type(temp))
print(temp.head())

print("\n---------------------test data---------------------")
print(type(temp_test))
print(temp_test.head())


print('---------------------Checking for NONE values---------------------')
# checking for rows where parsing failed or the label is missing
print(temp[temp.feature.isnull() | temp.Class.isnull()])

# removing such rows from temp and temp_test
temp = temp[temp.feature.notnull() & temp.Class.notnull()]
temp_test = temp_test[temp_test.feature.notnull()]
#print(temp.ID[temp.label.isnull()])
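# In[ ]:


# (Illustrative sanity check, not part of the original notebook.)
# Every surviving row should now carry a 40-dimensional MFCC vector,
# matching the input_shape=(40,) used by the model further below.
print(temp.feature.iloc[0].shape)            # expected: (40,)
print(temp.shape[0], temp_test.shape[0])     # number of parsed train / test clips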
# In[37]:


temp.Class.unique()


# In[38]:


temp.Class.nunique()


# In[35]:


# Label Encoding the audio data
lb = LabelEncoder()

# converting pd.Series into np.array for faster processing
X = np.array(temp.feature.tolist())
y = np.array(temp.Class.tolist())


y = to_categorical(lb.fit_transform(y))


# In[62]:


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


# ## Building a deep learning model

# In[73]:


num_labels = y.shape[1]
filter_size = 2

def categorical_classifier():
    model = Sequential()

    # input and first hidden layer
    model.add(Dense(input_shape=(40,), units=256, activation='relu', kernel_initializer='uniform'))
    model.add(Dropout(0.5))

    # second hidden layer
    model.add(Dense(units=256, activation='relu', kernel_initializer='uniform'))
    model.add(Dropout(0.5))

    # output layer
    model.add(Dense(units=num_labels, activation='softmax'))

    # compiling our model
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

    # training the data
    #model.fit(X,y, batch_size=32, epochs=500, validation_split=0.3)
    return model


# In[77]:


# building the model and training it on the extracted MFCC features
model = categorical_classifier()
model.fit(x_train, y_train, batch_size=32, epochs=650, validation_data=(x_test, y_test))

--------------------------------------------------------------------------------
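The script above stops after training. As a possible next step — a sketch only, not part of the original code, which assumes the `model`, `lb`, and `temp_test` objects defined in the script — the trained network can be used to label the test clips:

```python
import numpy as np
import pandas as pd

# Features of the unlabelled test clips, in the same 40-dimensional MFCC format as training.
X_unlabelled = np.array(temp_test.feature.tolist())

# Predict class probabilities, keep the most likely class per clip,
# then map the integer labels back to the original class names.
probs = model.predict(X_unlabelled)
predicted_classes = lb.inverse_transform(np.argmax(probs, axis=1))

# Pair every test ID with its predicted class.
predictions = pd.DataFrame({'ID': temp_test.ID.values, 'Class': predicted_classes})
print(predictions.head())
```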