├── README
├── config.py
├── main_dnn.py
└── prepare_data.py

/README:
--------------------------------------------------------------------------------
This is the source code for DCASE 2016 Challenge Task 1, accompanying the paper
http://www.cs.tut.fi/sgn/arg/dcase2016/documents/challenge_technical_reports/Task1/Kong_2016_task1.pdf

THIS CODE IS MIT LICENSED.
AUTHOR: QIUQIANG KONG, q.kong@surrey.ac.uk
CVSSP, UNIVERSITY OF SURREY
Created: 2016.06.24
Latest modified: 2017.05.02
======================================

Prerequisite Python packages:
pip install hat [--upgrade]   # Deep learning toolbox, https://qiuqiangkong.github.io/Hat

Usage:
------ Preparation ------
* Modify the dataset paths in config.py.
* python prepare_data.py --calculate_logmel   # Calculate features.

------ Development cross validation ------
* python main_dnn.py --dev_train       # Train a model on the cross-validation folds of the development data.
* python main_dnn.py --dev_recognize   # Get predictions and print statistics.

------ Testing on private (evaluation) data ------
* python main_dnn.py --eva_train       # Train a model on the whole development data.
* python main_dnn.py --eva_recognize   # Get predictions on the private data and print statistics.
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
"""
SUMMARY:  Config file.
AUTHOR:   Qiuqiang Kong
Created:  2016.05.11
Modified: 2016.06.24 Delete unnecessary
          2016.10.09 Rename some variables
          2017.04.27 Minor fix.
--------------------------------------
"""
# hat deep learning toolbox. You need to specify the source code path here
# if hat is not installed on your computer.
hat_root = ""

# Development config
dev_wav_fd = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-development/audio"

dev_csv_fd = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-development/evaluation_setup"
dev_tr_csv = [dev_csv_fd+"/fold1_train.txt", dev_csv_fd+"/fold2_train.txt",
              dev_csv_fd+"/fold3_train.txt", dev_csv_fd+"/fold4_train.txt"]
dev_te_csv = [dev_csv_fd+"/fold1_evaluate.txt", dev_csv_fd+"/fold2_evaluate.txt",
              dev_csv_fd+"/fold3_evaluate.txt", dev_csv_fd+"/fold4_evaluate.txt"]
dev_meta_csv = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-development/meta.txt"

# Evaluation config
eva_wav_fd = "/vol/vssp/datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-evaluation/audio"
eva_meta_csv = "/vol/vssp/AP_datasets/audio/dcase2016/task1/TUT-acoustic-scenes-2016-evaluation/meta.txt"

# Your workspace
scrap_fd = "/vol/vssp/msos/qk/DCASE2016_task1_scrap"
fe_fd = scrap_fd + "/features"
dev_fe_logmel_fd = fe_fd + "/dev/logmel"
eva_fe_logmel_fd = fe_fd + "/eva/logmel"
md_fd = scrap_fd + "/models"
dev_md_fd = scrap_fd + "/dev"
eva_md_fd = scrap_fd + "/eva"

# The 15 acoustic scene labels
labels = ['bus', 'cafe/restaurant', 'car', 'city_center', 'forest_path',
          'grocery_store', 'home', 'beach', 'library', 'metro_station',
          'office', 'residential_area', 'train', 'tram', 'park']

lb_to_id = {lb: id for id, lb in enumerate(labels)}
id_to_lb = {id: lb for id, lb in enumerate(labels)}

fs = 44100.
n_fft = 1024    # Must be an integer: it is used as a frame length by
                # np.hamming, scipy's spectrogram and librosa's mel filters.
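
# A quick sanity-check sketch (added example, not part of the original repo):
# running "python config.py" prints the label/id mapping and the frame length
# implied by fs and n_fft (1024 samples / 44100 Hz ~ 23 ms per frame, i.e.
# ~43 frames/s with the non-overlapping frames used in prepare_data.py).
if __name__ == "__main__":
    print lb_to_id['car']      # -> 2
    print id_to_lb[2]          # -> 'car'
    print float(n_fft) / fs    # -> ~0.0232 s per frame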
--------------------------------------------------------------------------------
/main_dnn.py:
--------------------------------------------------------------------------------
"""
SUMMARY:  DCASE 2016 Task 1. Scene classification.
          Train and test on the dev dataset (split into 4 cv folds).
          Training time: 17 s/epoch (GTX TitanX).
          Dev: te_frame_based_acc: 61.4% +- ?, te_clip_based_acc: 80.7% +- ?
               (train on folds 1, 2, 3 and validate on fold 0)
          Eva: te_frame_based_acc: 81.0% +- ?, te_clip_based_acc: 69.2% +- ?
          after 10 epochs.
AUTHOR:   Qiuqiang Kong
Created:  2016.05.11
Modified: 2017.05.02
--------------------------------------
"""
import numpy as np
np.random.seed(1515)
import sys

import config as cfg
import prepare_data as pp_data

sys.path.append(cfg.hat_root)
from hat.models import Sequential
from hat.layers.core import InputLayer, Flatten, Dense, Dropout
from hat.callbacks import SaveModel, Validation
from hat.optimizers import Adam


def train(tr_fe_fd, tr_csv_file, te_fe_fd, te_csv_file,
          n_concat, hop, scaler, out_md_fd):
    # Prepare data
    tr_x, tr_y = pp_data.get_matrix_format_data(
        fe_fd=tr_fe_fd,
        csv_file=tr_csv_file,
        n_concat=n_concat, hop=hop, scaler=scaler)

    te_x, te_y = pp_data.get_matrix_format_data(
        fe_fd=te_fe_fd,
        csv_file=te_csv_file,
        n_concat=n_concat, hop=hop, scaler=scaler)

    n_freq = tr_x.shape[2]
    print 'tr_x.shape:', tr_x.shape   # (n_samples, n_concat, n_freq)
    print 'tr_y.shape:', tr_y.shape   # (n_samples, n_labels)

    # Build model
    n_out = len(cfg.labels)
    seq = Sequential()
    seq.add(InputLayer((n_concat, n_freq)))
    seq.add(Flatten())
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(200, act='relu'))
    seq.add(Dropout(0.2))
    seq.add(Dense(n_out, act='softmax'))
    md = seq.compile()
    md.summary()
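
    # Shape walk-through (added note; assumes the default n_concat=11 below
    # and the 64 mel bins written by prepare_data.calculate_logmel):
    # input (11, 64) -> Flatten -> 704 -> Dense 200 -> Dense 200
    # -> softmax over the 15 scene classes.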
    # Validation.
    # tr_err and te_err are frame based. For clip-based statistics, run
    # recognize() in prepare_data.py via "--dev_recognize" / "--eva_recognize".
    validation = Validation(tr_x=tr_x, tr_y=tr_y,
                            va_x=None, va_y=None,
                            te_x=te_x, te_y=te_y,
                            batch_size=500, call_freq=1, dump_path=None)

    # Save model
    pp_data.create_folder(out_md_fd)
    save_model = SaveModel(out_md_fd, call_freq=2)

    # Callbacks
    callbacks = [validation, save_model]

    # Optimizer
    optimizer = Adam(1e-3)

    # Fit model
    md.fit(x=tr_x, y=tr_y,
           batch_size=100,
           n_epochs=101,
           loss_func='categorical_crossentropy',
           optimizer=optimizer,
           callbacks=callbacks)


if __name__ == '__main__':
    # Hyper-params
    n_concat = 11   # Number of frames to concatenate
    hop = 5         # Step length
    fold = 0        # Can be 0, 1, 2 or 3

    dev_fe_fd = cfg.dev_fe_logmel_fd
    eva_fe_fd = cfg.eva_fe_logmel_fd

    if len(sys.argv) < 2:
        raise Exception("Usage: python main_dnn.py --dev_train | "
                        "--dev_recognize | --eva_train | --eva_recognize")

    if sys.argv[1] == "--dev_train":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_tr_csv[fold],
                                    with_mean=True,
                                    with_std=True)
        train(tr_fe_fd=dev_fe_fd,
              tr_csv_file=cfg.dev_tr_csv[fold],
              te_fe_fd=dev_fe_fd,
              te_csv_file=cfg.dev_te_csv[fold],
              n_concat=n_concat,
              hop=hop,
              scaler=scaler,
              out_md_fd=cfg.dev_md_fd)

    elif sys.argv[1] == "--dev_recognize":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_tr_csv[fold],
                                    with_mean=True,
                                    with_std=True)
        pp_data.recognize(md_path=cfg.dev_md_fd+'/md10_epochs.p',
                          te_fe_fd=dev_fe_fd,
                          te_csv_file=cfg.dev_te_csv[fold],
                          n_concat=n_concat,
                          hop=hop,
                          scaler=scaler)

    elif sys.argv[1] == "--eva_train":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_meta_csv,
                                    with_mean=True,
                                    with_std=True)
        train(tr_fe_fd=dev_fe_fd,
              tr_csv_file=cfg.dev_meta_csv,
              te_fe_fd=eva_fe_fd,
              te_csv_file=cfg.eva_meta_csv,
              n_concat=n_concat,
              hop=hop,
              scaler=scaler,
              out_md_fd=cfg.eva_md_fd)

    elif sys.argv[1] == "--eva_recognize":
        scaler = pp_data.get_scaler(fe_fd=dev_fe_fd,
                                    csv_file=cfg.dev_meta_csv,
                                    with_mean=True,
                                    with_std=True)
        pp_data.recognize(md_path=cfg.eva_md_fd+'/md10_epochs.p',
                          te_fe_fd=eva_fe_fd,
                          te_csv_file=cfg.eva_meta_csv,
                          n_concat=n_concat,
                          hop=hop,
                          scaler=scaler)
    else:
        raise Exception("Incorrect argv: %s" % sys.argv[1])
--------------------------------------------------------------------------------
/prepare_data.py:
--------------------------------------------------------------------------------
"""
SUMMARY:  Prepare data. Some functions are copied from py_sceneClassification2.
AUTHOR:   Qiuqiang Kong
Created:  2016.05.11
Modified: 2017.05.02
--------------------------------------
"""
import csv
import os
import sys
import cPickle

import numpy as np
import scipy.stats
from scipy import signal
import wavio
import librosa
import matplotlib.pyplot as plt   # Only needed for the commented-out debug plot below.

import config as cfg

sys.path.append(cfg.hat_root)
from hat.preprocessing import mat_2d_to_3d, sparse_to_categorical
from hat import serializations


def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)

def get_mode_value(ary):
    return scipy.stats.mode(ary)[0]
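
# Example of the majority vote used in recognize() below (added note):
#   get_mode_value(np.array([3, 3, 7, 3])) -> array([3]), int(...) -> 3,
# i.e. frame-level predictions are collapsed into one clip-level label.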
### Wave related
def readwav(path):
    Struct = wavio.read(path)
    # Normalize integer samples to [-1, 1), e.g. 16-bit audio (sampwidth=2)
    # is divided by 2**15 = 32768.
    wav = Struct.data.astype(float) / np.power(2, Struct.sampwidth*8-1)
    fs = Struct.rate
    return wav, fs

def to_mono(wav):
    if wav.ndim == 1:
        return wav
    elif wav.ndim == 2:
        return np.mean(wav, axis=-1)

### Extract feature
def calculate_logmel(wav_fd, fe_fd):
    """Calculate log mel spectrogram and write to disk.

    Args:
      wav_fd: string. Folder of wav files.
      fe_fd: string. Calculated features will be saved here.
    """
    names = [na for na in os.listdir(wav_fd) if na.endswith('.wav')]
    names = sorted(names)
    for na in names:
        print na
        path = os.path.join(wav_fd, na)
        wav, fs = readwav(path)
        wav = to_mono(wav)
        assert fs == cfg.fs
        ham_win = np.hamming(cfg.n_fft)
        [f, t, x] = signal.spectrogram(x=wav,
                                       window=ham_win,
                                       nperseg=cfg.n_fft,
                                       noverlap=0,
                                       detrend=False,
                                       return_onesided=True,
                                       mode='magnitude')
        x = x.T   # (n_frames, n_freq)

        # Mel transform matrix (computed once and cached across calls)
        if globals().get('melW') is None:
            global melW
            melW = librosa.filters.mel(sr=fs,
                                       n_fft=cfg.n_fft,
                                       n_mels=64,
                                       fmin=0.,
                                       fmax=fs / 2.)   # Nyquist frequency, 22050 Hz.
            # melW /= np.max(melW, axis=-1)[:, None]

        x = np.dot(x, melW.T)
        x = np.log(x + 1e-8)

        # # DEBUG. Plot the log mel spectrogram.
        # plt.matshow(x.T, origin='lower', aspect='auto')
        # plt.show()

        out_path = fe_fd + '/' + na[0:-4] + '.f'
        cPickle.dump(x, open(out_path, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)

### Data related
def get_scaler(fe_fd, csv_file, with_mean, with_std):
    """Calculate a scaler from the data listed in csv_file.

    Args:
      fe_fd: string. Feature folder.
      csv_file: string. Path of csv file.
      with_mean: bool.
      with_std: bool.

    Returns:
      scaler object.
    """
    with open(csv_file, 'rb') as f:
        reader = csv.reader(f)
        lis = list(reader)

    x_all = []
    for li in lis:
        try:
            [na, lb] = li[0].split('\t')
        except ValueError:   # Evaluation csv lines carry no label column.
            na = li[0]
        na = na.split('/')[1][0:-4]
        path = fe_fd + '/' + na + '.f'
        x = cPickle.load(open(path, 'rb'))
        x_all.append(x)

    x_all = np.concatenate(x_all, axis=0)
    from sklearn import preprocessing
    # Keyword arguments are required: StandardScaler's first positional
    # argument is `copy`, not `with_mean`.
    scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                          with_std=with_std).fit(x_all)
    return scaler
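
# For reference, a minimal sketch (an assumption, not the hat source) of what
# hat's mat_2d_to_3d is expected to do: cut a (n_frames, n_freq) matrix into
# overlapping blocks of agg_num frames, hopping `hop` frames between blocks,
# and zero-pad clips shorter than one block.
def _mat_2d_to_3d_sketch(x, agg_num, hop):
    len_x, n_in = x.shape
    if len_x < agg_num:   # Zero-pad clips shorter than one block.
        x = np.concatenate((x, np.zeros((agg_num - len_x, n_in))))
    x3d = []
    i1 = 0
    while i1 + agg_num <= len(x):
        x3d.append(x[i1:i1 + agg_num])
        i1 += hop
    return np.array(x3d)   # (n_blocks, agg_num, n_in)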

def get_matrix_format_data(fe_fd, csv_file, n_concat, hop, scaler):
    """Get training data and ground truth in matrix format.

    Args:
      fe_fd: string. Feature folder.
      csv_file: string. Path of csv file.
      n_concat: integer. Number of frames to concatenate.
      hop: integer. Number of hop frames.
      scaler: None | object.

    Returns:
      x3d_all: (n_samples, n_concat, n_freq) array.
      y_all: (n_samples, n_labels) array.
    """
    with open(csv_file, 'rb') as f:
        reader = csv.reader(f)
        lis = list(reader)

    x3d_all = []
    y_all = []

    for li in lis:
        [na, lb] = li[0].split('\t')
        na = na.split('/')[1][0:-4]
        path = fe_fd + '/' + na + '.f'
        x = cPickle.load(open(path, 'rb'))
        if scaler:
            x = scaler.transform(x)
        x3d = mat_2d_to_3d(x, n_concat, hop)   # (n_blocks, n_concat, n_freq)
        x3d_all.append(x3d)
        y_all += [cfg.lb_to_id[lb]] * len(x3d)

    x3d_all = np.concatenate(x3d_all)   # (n_samples, n_concat, n_freq)
    y_all = np.array(y_all)
    y_all = sparse_to_categorical(y_all, len(cfg.labels))   # (n_samples, n_labels)
    return x3d_all, y_all

### Recognize
def recognize(md_path, te_fe_fd, te_csv_file, n_concat, hop, scaler):
    """Recognize and get statistics.

    Args:
      md_path: string. Path of model.
      te_fe_fd: string. Folder containing testing features.
      te_csv_file: string. Path of test csv file.
      n_concat: integer. Number of frames to concatenate.
      hop: integer. Number of frames to hop.
      scaler: None | scaler object.
    """
    # Load model
    md = serializations.load(md_path)

    # Recognize and get statistics
    n_labels = len(cfg.labels)
    confuse_mat = np.zeros((n_labels, n_labels))   # Confusion matrix
    frame_based_accs = []

    # Get test file names
    with open(te_csv_file, 'rb') as f:
        reader = csv.reader(f)
        lis = list(reader)

    # Predict for each scene
    for li in lis:
        # Load data
        [na, lb] = li[0].split('\t')
        na = na.split('/')[1][0:-4]
        path = te_fe_fd + '/' + na + '.f'
        x = cPickle.load(open(path, 'rb'))
        if scaler:
            x = scaler.transform(x)
        x = mat_2d_to_3d(x, n_concat, hop)

        # Predict
        p_y_preds = md.predict(x)[0]               # (n_blocks, n_labels)
        pred_ids = np.argmax(p_y_preds, axis=-1)   # (n_blocks,)
        pred_id = int(get_mode_value(pred_ids))    # Majority vote over frames
        gt_id = cfg.lb_to_id[lb]

        # Statistics
        confuse_mat[gt_id, pred_id] += 1
        n_correct_frames = list(pred_ids).count(gt_id)
        frame_based_accs += [float(n_correct_frames) / len(pred_ids)]

    clip_based_acc = np.trace(confuse_mat) / np.sum(confuse_mat)
    frame_based_acc = np.mean(frame_based_accs)

    print 'clip_based_acc:', clip_based_acc
    print 'frame_based_acc:', frame_based_acc
    print confuse_mat

### Main
if __name__ == "__main__":
    create_folder(cfg.dev_fe_logmel_fd)
    create_folder(cfg.eva_fe_logmel_fd)

    # Calculate log mel features
    calculate_logmel(cfg.dev_wav_fd, cfg.dev_fe_logmel_fd)
    calculate_logmel(cfg.eva_wav_fd, cfg.eva_fe_logmel_fd)
--------------------------------------------------------------------------------