├── README
├── config.py
├── main_dnn.py
└── prepare_data.py

/README:
--------------------------------------------------------------------------------
This is the source code for DCASE 2016 Challenge Task 1, accompanying the paper
http://www.cs.tut.fi/sgn/arg/dcase2016/documents/challenge_technical_reports/Task1/Kong_2016_task1.pdf

THIS CODE IS MIT LICENSED.
AUTHOR: QIUQIANG KONG, q.kong@surrey.ac.uk
CVSSP, UNIVERSITY OF SURREY
Created: 2016.06.24
Latest modified: 2017.05.02
======================================

Prerequisite Python packages:
pip install hat [--upgrade]   # Deep learning toolbox, https://qiuqiangkong.github.io/Hat

Usage:
------ Preparation ------
* Modify the dataset paths in config.py.
* python prepare_data.py --calculate_logmel   # Calculate features.

------ Development cross validation ------
* python main_dnn.py --dev_train       # Train a model on the cross-validation folds of the development data.
* python main_dnn.py --dev_recognize   # Get predictions and print statistics.

------ Testing on private (evaluation) data ------
* python main_dnn.py --eva_train       # Train a model on the whole development data.
* python main_dnn.py --eva_recognize   # Get predictions on the private data and print statistics.
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
"""
SUMMARY:  Config file.
AUTHOR:   Qiuqiang Kong
Created:  2016.05.11
Modified: 2016.06.24 Delete unnecessary
          2016.10.09 Rename some variables
          2017.04.27 Minor fix.
--------------------------------------
"""
# hat deep learning toolbox. You need to specify the source code path here
# if hat is not installed on your computer.
hat_root = ""

# Development config
dev_wav_fd = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-development/audio"

dev_csv_fd = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-development/evaluation_setup"
dev_tr_csv = [dev_csv_fd+"/fold1_train.txt", dev_csv_fd+"/fold2_train.txt",
              dev_csv_fd+"/fold3_train.txt", dev_csv_fd+"/fold4_train.txt"]
dev_te_csv = [dev_csv_fd+"/fold1_evaluate.txt", dev_csv_fd+"/fold2_evaluate.txt",
              dev_csv_fd+"/fold3_evaluate.txt", dev_csv_fd+"/fold4_evaluate.txt"]
dev_meta_csv = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-development/meta.txt"

# Evaluation config
eva_wav_fd = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-evaluation/audio"
eva_meta_csv = "/vol/vssp/AP_datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-evaluation/meta.txt"

# Your workspace
scrap_fd = "/vol/vssp/msos/qk/DCASE2016_task1_scrap"
fe_fd = scrap_fd + "/features"
dev_fe_logmel_fd = fe_fd + "/dev/logmel"
eva_fe_logmel_fd = fe_fd + "/eva/logmel"
md_fd = scrap_fd + "/models"
dev_md_fd = scrap_fd + "/dev"
eva_md_fd = scrap_fd + "/eva"

# The 15 acoustic scene labels
labels = ['bus', 'cafe/restaurant', 'car', 'city_center', 'forest_path',
          'grocery_store', 'home', 'beach', 'library', 'metro_station',
          'office', 'residential_area', 'train', 'tram', 'park']

lb_to_id = {lb: id for id, lb in enumerate(labels)}
id_to_lb = {id: lb for id, lb in enumerate(labels)}

fs = 44100.
n_fft = 1024    # Must be an integer: it is used as a frame length by
                # np.hamming, scipy's spectrogram and librosa's mel filters.
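
# A quick sanity-check sketch (added example, not part of the original repo):
# running "python config.py" prints the label/id mapping and the frame length
# implied by fs and n_fft (1024 samples / 44100 Hz ~ 23 ms per frame, i.e.
# ~43 frames/s with the non-overlapping frames used in prepare_data.py).
if __name__ == "__main__":
    print lb_to_id['car']      # -> 2
    print id_to_lb[2]          # -> 'car'
    print float(n_fft) / fs    # -> ~0.0232 s per frame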
--------------------------------------------------------------------------------
/main_dnn.py:
--------------------------------------------------------------------------------
"""
SUMMARY:  DCASE 2016 Task 1. Scene classification.
          Train and test on the dev dataset (split into 4 cv folds).
          Training time: 17 s/epoch (GTX TitanX).
          Dev: te_frame_based_acc: 61.4% +- ?, te_clip_based_acc: 80.7% +- ?
               (train on folds 1, 2, 3 and validate on fold 0)
          Eva: te_frame_based_acc: 81.0% +- ?, te_clip_based_acc: 69.2% +- ?
          after 10 epochs.
AUTHOR:   Qiuqiang Kong
Created:  2016.05.11
Modified: 2017.05.02
--------------------------------------
"""
import numpy as np
np.random.seed(1515)
import sys

import config as cfg
import prepare_data as pp_data

sys.path.append(cfg.hat_root)
from hat.models import Sequential
from hat.layers.core import InputLayer, Flatten, Dense, Dropout
from hat.callbacks import SaveModel, Validation
from hat.optimizers import Adam


def train(tr_fe_fd, tr_csv_file, te_fe_fd, te_csv_file,
          n_concat, hop, scaler, out_md_fd):
    # Prepare data
    tr_x, tr_y = pp_data.get_matrix_format_data(
        fe_fd=tr_fe_fd,
        csv_file=tr_csv_file,
        n_concat=n_concat, hop=hop, scaler=scaler)

    te_x, te_y = pp_data.get_matrix_format_data(
        fe_fd=te_fe_fd,
        csv_file=te_csv_file,
        n_concat=n_concat, hop=hop, scaler=scaler)

    n_freq = tr_x.shape[2]
    print 'tr_x.shape:', tr_x.shape   # (n_samples, n_concat, n_freq)
    print 'tr_y.shape:', tr_y.shape   # (n_samples, n_labels)

    # Build model
    n_out = len(cfg.labels)
    seq = Sequential()
    seq.add(InputLayer((n_concat, n_freq)))
    seq.add(Flatten())
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(n_out, act='softmax'))
    md = seq.compile()
    md.summary()
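
    # Shape walk-through (added note; assumes the default n_concat=11 below
    # and the 64 mel bins written by prepare_data.calculate_logmel):
    # input (11, 64) -> Flatten -> 704 -> Dense 200 -> Dense 200
    # -> softmax over the 15 scene classes.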
    # Validation.
    # tr_err and te_err are frame based. For clip-based statistics, run
    # recognize() in prepare_data.py via "--dev_recognize" / "--eva_recognize".
    validation = Validation(tr_x=tr_x, tr_y=tr_y,
                            va_x=None, va_y=None,
                            te_x=te_x, te_y=te_y,
                            batch_size=500, call_freq=1, dump_path=None)

    # Save model
    pp_data.create_folder(out_md_fd)
    save_model = SaveModel(out_md_fd, call_freq=2)

    # Callbacks
    callbacks = [validation, save_model]

    # Optimizer
    optimizer = Adam(1e-3)

    # Fit model
    md.fit(x=tr_x, y=tr_y,
           batch_size=100,
           n_epochs=101,
           loss_func='categorical_crossentropy',
           optimizer=optimizer,
           callbacks=callbacks)


if __name__ == '__main__':
    # Hyper-params
    n_concat = 11   # Number of frames to concatenate
    hop = 5         # Step length
    fold = 0        # Can be 0, 1, 2 or 3

    dev_fe_fd = cfg.dev_fe_logmel_fd
    eva_fe_fd = cfg.eva_fe_logmel_fd

    if len(sys.argv) < 2:
        raise Exception("Usage: python main_dnn.py --dev_train | "
                        "--dev_recognize | --eva_train | --eva_recognize")

    if sys.argv[1] == "--dev_train":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_tr_csv[fold],
                                    with_mean=True,
                                    with_std=True)
        train(tr_fe_fd=dev_fe_fd,
              tr_csv_file=cfg.dev_tr_csv[fold],
              te_fe_fd=dev_fe_fd,
              te_csv_file=cfg.dev_te_csv[fold],
              n_concat=n_concat,
              hop=hop,
              scaler=scaler,
              out_md_fd=cfg.dev_md_fd)

    elif sys.argv[1] == "--dev_recognize":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_tr_csv[fold],
                                    with_mean=True,
                                    with_std=True)
        pp_data.recognize(md_path=cfg.dev_md_fd+'/md10_epochs.p',
                          te_fe_fd=dev_fe_fd,
                          te_csv_file=cfg.dev_te_csv[fold],
                          n_concat=n_concat,
                          hop=hop,
                          scaler=scaler)

    elif sys.argv[1] == "--eva_train":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_meta_csv,
                                    with_mean=True,
                                    with_std=True)
        train(tr_fe_fd=dev_fe_fd,
              tr_csv_file=cfg.dev_meta_csv,
              te_fe_fd=eva_fe_fd,
              te_csv_file=cfg.eva_meta_csv,
              n_concat=n_concat,
              hop=hop,
              scaler=scaler,
              out_md_fd=cfg.eva_md_fd)

    elif sys.argv[1] == "--eva_recognize":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_meta_csv,
                                    with_mean=True,
                                    with_std=True)
        pp_data.recognize(md_path=cfg.eva_md_fd+'/md10_epochs.p',
                          te_fe_fd=eva_fe_fd,
                          te_csv_file=cfg.eva_meta_csv,
                          n_concat=n_concat,
                          hop=hop,
                          scaler=scaler)
    else:
        raise Exception("Incorrect argv: %s" % sys.argv[1])
--------------------------------------------------------------------------------
/prepare_data.py:
--------------------------------------------------------------------------------
"""
SUMMARY:  Prepare data. Some functions are copied from py_sceneClassification2.
AUTHOR:   Qiuqiang Kong
Created:  2016.05.11
Modified: 2017.05.02
--------------------------------------
"""
import csv
import os
import sys
import cPickle

import numpy as np
import scipy.stats
from scipy import signal
import wavio
import librosa
import matplotlib.pyplot as plt   # Only needed for the commented-out debug plot below.

import config as cfg

sys.path.append(cfg.hat_root)
from hat.preprocessing import mat_2d_to_3d, sparse_to_categorical
from hat import serializations


def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)

def get_mode_value(ary):
    return scipy.stats.mode(ary)[0]
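
# Example of the majority vote used in recognize() below (added note):
#   get_mode_value(np.array([3, 3, 7, 3])) -> array([3]), int(...) -> 3,
# i.e. frame-level predictions are collapsed into one clip-level label.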
### Wave related
def readwav(path):
    Struct = wavio.read(path)
    # Normalize integer samples to [-1, 1), e.g. 16-bit audio (sampwidth=2)
    # is divided by 2**15 = 32768.
    wav = Struct.data.astype(float) / np.power(2, Struct.sampwidth*8-1)
    fs = Struct.rate
    return wav, fs

def to_mono(wav):
    if wav.ndim == 1:
        return wav
    elif wav.ndim == 2:
        return np.mean(wav, axis=-1)

### Extract feature
def calculate_logmel(wav_fd, fe_fd):
    """Calculate log mel spectrogram and write to disk.

    Args:
      wav_fd: string. Folder of wav files.
      fe_fd: string. Calculated features will be saved here.
    """
    names = [na for na in os.listdir(wav_fd) if na.endswith('.wav')]
    names = sorted(names)
    for na in names:
        print na
        path = os.path.join(wav_fd, na)
        wav, fs = readwav(path)
        wav = to_mono(wav)
        assert fs == cfg.fs
        ham_win = np.hamming(cfg.n_fft)
        [f, t, x] = signal.spectrogram(x=wav,
                                       window=ham_win,
                                       nperseg=cfg.n_fft,
                                       noverlap=0,
                                       detrend=False,
                                       return_onesided=True,
                                       mode='magnitude')
        x = x.T   # (n_frames, n_freq)

        # Mel transform matrix (computed once and cached across calls)
        if globals().get('melW') is None:
            global melW
            melW = librosa.filters.mel(sr=fs,
                                       n_fft=cfg.n_fft,
                                       n_mels=64,
                                       fmin=0.,
                                       fmax=fs / 2.)   # Nyquist frequency, 22050 Hz.
            # melW /= np.max(melW, axis=-1)[:, None]

        x = np.dot(x, melW.T)
        x = np.log(x + 1e-8)

        # # DEBUG. Plot the log mel spectrogram.
        # plt.matshow(x.T, origin='lower', aspect='auto')
        # plt.show()

        out_path = fe_fd + '/' + na[0:-4] + '.f'
        cPickle.dump(x, open(out_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)

### Data related
def get_scaler(fe_fd, csv_file, with_mean, with_std):
    """Calculate a scaler from the data listed in csv_file.

    Args:
      fe_fd: string. Feature folder.
      csv_file: string. Path of csv file.
      with_mean: bool.
      with_std: bool.

    Returns:
      scaler object.
    """
    with open(csv_file, 'rb') as f:
        reader = csv.reader(f)
        lis = list(reader)

    x_all = []
    for li in lis:
        try:
            [na, lb] = li[0].split('\t')
        except ValueError:   # Evaluation csv lines carry no label column.
            na = li[0]
        na = na.split('/')[1][0:-4]
        path = fe_fd + '/' + na + '.f'
        x = cPickle.load(open(path, 'rb'))
        x_all.append(x)

    x_all = np.concatenate(x_all, axis=0)
    from sklearn import preprocessing
    # Keyword arguments are required: StandardScaler's first positional
    # argument is `copy`, not `with_mean`.
    scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                          with_std=with_std).fit(x_all)
    return scaler
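
# For reference, a minimal sketch (an assumption, not the hat source) of what
# hat's mat_2d_to_3d is expected to do: cut a (n_frames, n_freq) matrix into
# overlapping blocks of agg_num frames, hopping `hop` frames between blocks,
# and zero-pad clips shorter than one block.
def _mat_2d_to_3d_sketch(x, agg_num, hop):
    len_x, n_in = x.shape
    if len_x < agg_num:   # Zero-pad clips shorter than one block.
        x = np.concatenate((x, np.zeros((agg_num - len_x, n_in))))
    x3d = []
    i1 = 0
    while i1 + agg_num <= len(x):
        x3d.append(x[i1:i1 + agg_num])
        i1 += hop
    return np.array(x3d)   # (n_blocks, agg_num, n_in)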

def get_matrix_format_data(fe_fd, csv_file, n_concat, hop, scaler):
    """Get training data and ground truth in matrix format.

    Args:
      fe_fd: string. Feature folder.
      csv_file: string. Path of csv file.
      n_concat: integer. Number of frames to concatenate.
      hop: integer. Number of hop frames.
      scaler: None | object.

    Returns:
      x3d_all: (n_samples, n_concat, n_freq) array.
      y_all: (n_samples, n_labels) array.
    """
    with open(csv_file, 'rb') as f:
        reader = csv.reader(f)
        lis = list(reader)

    x3d_all = []
    y_all = []

    for li in lis:
        [na, lb] = li[0].split('\t')
        na = na.split('/')[1][0:-4]
        path = fe_fd + '/' + na + '.f'
        x = cPickle.load(open(path, 'rb'))
        if scaler:
            x = scaler.transform(x)
        x3d = mat_2d_to_3d(x, n_concat, hop)   # (n_blocks, n_concat, n_freq)
        x3d_all.append(x3d)
        y_all += [cfg.lb_to_id[lb]] * len(x3d)

    x3d_all = np.concatenate(x3d_all)   # (n_samples, n_concat, n_freq)
    y_all = np.array(y_all)
    y_all = sparse_to_categorical(y_all, len(cfg.labels))   # (n_samples, n_labels)
    return x3d_all, y_all

### Recognize
def recognize(md_path, te_fe_fd, te_csv_file, n_concat, hop, scaler):
    """Recognize and get statistics.

    Args:
      md_path: string. Path of model.
      te_fe_fd: string. Folder containing testing features.
      te_csv_file: string. Path of test csv file.
      n_concat: integer. Number of frames to concatenate.
      hop: integer. Number of frames to hop.
      scaler: None | scaler object.
    """
    # Load model
    md = serializations.load(md_path)

    # Recognize and get statistics
    n_labels = len(cfg.labels)
    confuse_mat = np.zeros((n_labels, n_labels))   # Confusion matrix
    frame_based_accs = []

    # Get test file names
    with open(te_csv_file, 'rb') as f:
        reader = csv.reader(f)
        lis = list(reader)

    # Predict for each scene
    for li in lis:
        # Load data
        [na, lb] = li[0].split('\t')
        na = na.split('/')[1][0:-4]
        path = te_fe_fd + '/' + na + '.f'
        x = cPickle.load(open(path, 'rb'))
        if scaler:
            x = scaler.transform(x)
        x = mat_2d_to_3d(x, n_concat, hop)

        # Predict
        p_y_preds = md.predict(x)[0]               # (n_blocks, n_labels)
        pred_ids = np.argmax(p_y_preds, axis=-1)   # (n_blocks,)
        pred_id = int(get_mode_value(pred_ids))    # Majority vote over frames
        gt_id = cfg.lb_to_id[lb]

        # Statistics
        confuse_mat[gt_id, pred_id] += 1
        n_correct_frames = list(pred_ids).count(gt_id)
        frame_based_accs += [float(n_correct_frames) / len(pred_ids)]

    clip_based_acc = np.trace(confuse_mat) / np.sum(confuse_mat)
    frame_based_acc = np.mean(frame_based_accs)

    print 'clip_based_acc:', clip_based_acc
    print 'frame_based_acc:', frame_based_acc
    print confuse_mat

### Main
if __name__ == "__main__":
    create_folder(cfg.dev_fe_logmel_fd)
    create_folder(cfg.eva_fe_logmel_fd)

    # Calculate log mel features
    calculate_logmel(cfg.dev_wav_fd, cfg.dev_fe_logmel_fd)
    calculate_logmel(cfg.eva_wav_fd, cfg.eva_fe_logmel_fd)
--------------------------------------------------------------------------------