├── Mmnet.py ├── MusicAutoTagging ├── README.md ├── main │ ├── Test.py │ ├── Trainer.py │ ├── data_gentor.py │ ├── evaluator.py │ ├── layers.py │ ├── main.py │ └── main_te.py ├── net │ ├── CNN_2D.py │ ├── CNN_2D.pyc │ ├── FrameCNN_1D.py │ ├── FrameCNN_1D.pyc │ ├── FrameCNN_2D.py │ ├── FrameCNN_2D.pyc │ ├── FrameCNN_MS_2D.py │ ├── Mmnet.py │ ├── Mmnet.pyc │ ├── Mmnet_2D.py │ └── Mmnet_2D.pyc └── pre │ ├── ext.py │ ├── ext_wav.py │ ├── gen_ASmusic_data.py │ └── te_ext.py ├── README.md ├── Trainer.py ├── data ├── groundtruth_weak_label_testing_set.csv ├── groundtruth_weak_label_training_set.csv └── sound_event_list_17_classes.txt ├── data_generator.py ├── evaluator.py ├── feature_extractor.py └── main.py /Mmnet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | class block(nn.Module): 5 | def __init__(self, inp, out): 6 | super(block, self).__init__() 7 | self.bn1 = nn.BatchNorm2d(inp) 8 | self.conv1 = nn.Conv2d(inp, out, (1,3), padding=(0,1)) 9 | self.bn2 = nn.BatchNorm2d(out) 10 | self.conv2 = nn.Conv2d(out, out, (1,3), padding=(0,1)) 11 | self.bn3 = nn.BatchNorm2d(out) 12 | 13 | self.sk = nn.Conv2d(inp, out, (1,1), padding=(0,0)) 14 | def forward(self, x): 15 | out = self.bn1(x) 16 | bn1 = F.relu(out) 17 | out = self.conv1(out) 18 | out = self.conv2(F.relu(self.bn2(out))) 19 | out = self.bn3(out) 20 | out += self.sk(x) 21 | return out, bn1 22 | 23 | 24 | class Net(nn.Module): 25 | def __init__(self, channel, num_classes): 26 | super(Net, self).__init__() 27 | self.nc = num_classes 28 | self.model_name = 'Mmnet_1D' 29 | 30 | self.ks = (1,3) 31 | self.ps = (0,1) 32 | inp = channel 33 | 34 | self.bn = nn.BatchNorm2d(inp) 35 | 36 | self.conv1 = nn.Conv2d(inp, inp, self.ks, padding=self.ps) 37 | self.b1 = block(inp*1, inp*2) 38 | self.b2 = block(inp*2, inp*3) 39 | self.b3 = block(inp*3, inp*3) 40 | self.bnf = nn.BatchNorm2d(inp*3) 41 | 42 | self.ks = (1,1) 43 | self.ps = (0,0) 44 | self.det = nn.Conv2d(inp*3, self.nc, self.ks, padding=self.ps) 45 | self.att = nn.Conv2d(inp*3, inp*3, self.ks, padding=self.ps) 46 | 47 | self.dp = nn.Dropout(.0) 48 | dns = 512*2 49 | 50 | # linear 51 | self.den1 = nn.Linear(inp*3, dns) 52 | self.den2 = nn.Linear(dns, dns) 53 | self.dbn1 = nn.BatchNorm1d(dns) 54 | self.dbn2 = nn.BatchNorm1d(dns) 55 | self.prd = nn.Linear(dns, self.nc) 56 | 57 | def nn_apl(self, x): 58 | return F.avg_pool2d(x, x.size()[2:]).view(x.size()[0], -1) 59 | 60 | def nn_att(self, inp, att): 61 | att_out = F.softmax(att(inp), dim=3) 62 | 63 | att_sc = att_out.sum(1).view(inp.size(0), 1, inp.size(2), inp.size(3)) 64 | att_sc = att_sc.repeat(1, self.nc, 1, 1) 65 | #att_ens = F.softmax(att_sup, dim=3) 66 | 67 | return att_sc 68 | 69 | def forward(self, x, Xavg, Xstd): 70 | 71 | x = x.permute(0,2,1,3) 72 | xs = x.size() 73 | Xavg = Xavg.view(1, Xavg.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 74 | Xstd = Xstd.view(1, Xstd.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 75 | z_x = (x - Xavg)/Xstd 76 | 77 | z_x = self.bn(z_x) 78 | 79 | conv1 = self.conv1(z_x) 80 | b1, bnb1 = self.b1(conv1) 81 | mp1 = F.max_pool2d(b1, (1,4)) 82 | b2, bnb2 = self.b2(mp1) 83 | mp2 = F.max_pool2d(b2, (1,4)) 84 | b3, bnb3 = self.b3(mp2) 85 | bf = F.relu(self.bnf(b3)) 86 | 87 | # global average pooling 88 | gap = self.nn_apl(bf) 89 | 90 | #DNN 91 | den1 = F.relu(self.dbn1(self.den1(self.dp(gap)))) 92 | den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 93 | y_cls = self.prd(self.dp(den2)) 94 | 95 | # attention 96 | att = 
self.nn_att(bf, self.att) 97 | det = self.det(bf) 98 | 99 | 100 | # ensemble prediction 101 | att_ens = F.softmax(att, dim=3) 102 | y_att = (det * att_ens).sum(-1).sum(-1) 103 | y_ens = (y_cls + y_att)/2 104 | 105 | return y_ens, [[att, det]] 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /MusicAutoTagging/README.md: -------------------------------------------------------------------------------- 1 | # A M&mnet-based Network for Music Auto-tagging 2 | 3 | ## Requirements 4 | * Python 2.7 5 | * LibROSA 0.6.0 6 | * PyTorch 0.4.0 7 | * cuda-8.0 8 | * Download [PyTorch 0.4.0 for Python 2.7](https://pytorch.org) 9 | ```bash 10 | pip install http://download.pytorch.org/whl/cu90/torch-0.4.0-cp27-cp27mu-linux_x86_64.whl 11 | pip install torchvision 12 | ``` 13 | 14 | 15 | ## Usage 16 | Note: You need to modify the dataset path before you run the code. 17 | 18 | $ python ./pre/gen_ASmusic_data.py 19 | $ python ./main/main.py 20 | 21 | ## Pre-trained models 22 | Model |Dataset | 23 | :----:|:--------:| 24 | [M&mnet](https://drive.google.com/file/d/1hfNTgH4WM2qlgIKrqizxqWNp7UvWvFBs/view?usp=sharing)|[AudioSetMusic](https://research.google.com/audioset/ontology/music_1.html) 25 | 26 | -------------------------------------------------------------------------------- /MusicAutoTagging/main/Test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.utils.data import Dataset 7 | 8 | import time 9 | import sys 10 | import os 11 | sys.path.append('../net') 12 | sys.path.append('../pre') 13 | from Mmnet import * 14 | from data_gentor import * 15 | import glob 16 | from te_ext import * 17 | 18 | os.environ['CUDA_VISIBLE_DEVICES'] = '3' 19 | 20 | Xtr, Ytr, Xva, Yva, Xte, Yte, avg_std = load_data('') 21 | 22 | #a = mel(fn[0]) 23 | 24 | class HDF52Torch(Dataset): 25 | def __init__(self, X, Y, mode='Test'): 26 | self.X = X 27 | self.Y = Y 28 | self.mode = mode 29 | def __getitem__(self, index): 30 | 31 | rX = self.X[index] 32 | rY = self.Y[index] 33 | 34 | mX = torch.from_numpy(rX.astype('float32')) 35 | mY = torch.from_numpy(rY) 36 | return mX, mY 37 | 38 | def __len__(self): 39 | return len(self.X) 40 | 41 | def show_model_params(model): 42 | params = 0 43 | for i in model.parameters(): 44 | params += i.view(-1).size()[0] 45 | print 'Model:' + model.module.model_name + '\t#params:%d'%(params) 46 | 47 | 48 | class Trainer: 49 | def __init__(self, args): 50 | self.fn = glob.glob('../web_AED/static/fwave/*') 51 | model = nn.DataParallel(Net(args.mel, Ytr.shape[1]).cuda()) 52 | self.model = model 53 | self.args = args 54 | 55 | # load avg and std for Z-score 56 | Xavg = torch.tensor([avg_std[0]]) 57 | Xstd = torch.tensor([avg_std[1]]) 58 | self.Xavg, self.Xstd = Variable(Xavg.cuda()), Variable(Xstd.cuda()) 59 | 60 | self.load_pretrained_model() 61 | 62 | def load_pretrained_model(self): 63 | # pre-training 64 | if os.path.exists(self.args.pmp): 65 | pretrained_model = torch.load(self.args.pmp) 66 | self.pretrained_model = pretrained_model 67 | model_param = self.model.state_dict() 68 | for k in pretrained_model['state_dict'].keys(): 69 | try: 70 | model_param[k].copy_(pretrained_model['state_dict'][k]) 71 | except: 72 | print '[ERROR] Load pre-trained model' 73 | self.model.apply(model_init) 74 | break 75 | print 'Load Pre_trained Model : ' + self.args.pmp 76 | 77 | 
else: 78 | print 'Learning from scratch' 79 | #self.model.apply(model_init) 80 | 81 | 82 | def predictor(self): 83 | st = time.time() 84 | all_pred = [] 85 | self.model.eval() 86 | for i in self.fn: 87 | X = mel(i) 88 | X = torch.from_numpy(X.reshape(1, X.shape[0], X.shape[1])) 89 | print X.size() 90 | X = Variable( X.cuda()) 91 | clip_out, _ = self.model(X, self.Xavg, self.Xstd) 92 | all_pred.extend(F.sigmoid(clip_out).data.cpu().numpy()) 93 | 94 | print 'Prediction Time:%1f'%(time.time() - st) 95 | return np.array(all_pred) 96 | 97 | 98 | -------------------------------------------------------------------------------- /MusicAutoTagging/main/Trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.utils.data import Dataset 7 | 8 | import time 9 | import sys 10 | import os 11 | sys.path.append('../net') 12 | from data_gentor import * 13 | #from Mmnet import * 14 | from CNN_2D import * 15 | from evaluator import * 16 | 17 | os.environ['CUDA_VISIBLE_DEVICES'] = '2' 18 | 19 | Xtr, Ytr, Xva, Yva, Xte, Yte, avg_std = load_data('') 20 | 21 | class HDF52Torch(Dataset): 22 | def __init__(self, X, Y, mode='Test'): 23 | self.X = X 24 | self.Y = Y 25 | self.mode = mode 26 | def __getitem__(self, index): 27 | 28 | if self.mode == 'Training': 29 | # roll 30 | rate = np.random.randint(100, self.X[0].shape[-1] - 100) 31 | rX = np.roll(self.X[index], rate, axis=-1) 32 | rX = rX[:,:400] 33 | rY = self.Y[index] 34 | ''' 35 | #mixup 36 | midx = np.random.randint(0, len(self.X)) 37 | mrate = np.random.randint(100, self.X[0].shape[-1] - 100) 38 | rmX = np.roll(self.X[midx], mrate, axis=-1) 39 | rX = (rX + rmX)/1 40 | rY = self.Y[index] + self.Y[midx] 41 | rY[rY>1] = 1 42 | ''' 43 | else: 44 | rX = self.X[index] 45 | rY = self.Y[index] 46 | 47 | mX = torch.from_numpy(rX.astype('float32')) 48 | mY = torch.from_numpy(rY) 49 | return mX, mY 50 | 51 | def __len__(self): 52 | return len(self.X) 53 | 54 | def model_init(m): 55 | classname = m.__class__.__name__ 56 | if classname.find('Conv') != -1: 57 | init.xavier_uniform_(m.weight, gain=np.sqrt(2)) 58 | init.constant_(m.bias, 0) 59 | elif classname.find('BatchNorm') != -1: 60 | init.constant_(m.weight, 1) 61 | init.constant_(m.bias, 0) 62 | elif classname.find('Linear') != -1: 63 | init.xavier_uniform_(m.weight, gain=np.sqrt(2)) 64 | init.constant_(m.bias, 0) 65 | 66 | def show_model_params(model): 67 | params = 0 68 | for i in model.parameters(): 69 | params += i.view(-1).size()[0] 70 | print 'Model:' + model.module.model_name + '\t#params:%d'%(params) 71 | 72 | 73 | class Trainer: 74 | def __init__(self, args): 75 | model = nn.DataParallel(Net(args.mel, Ytr.shape[1]).cuda()) 76 | self.model = model 77 | self.args = args 78 | 79 | # data builder 80 | data_args = {'batch_size': args.bs, 'num_workers': 5, 'pin_memory': True} 81 | self.tr_loader = torch.utils.data.DataLoader(HDF52Torch(Xtr, Ytr, mode='Training'), 82 | shuffle=True, drop_last=True, **data_args) 83 | data_args = {'batch_size': 16, 'num_workers': 5, 'pin_memory': True} 84 | self.va_loader = torch.utils.data.DataLoader(HDF52Torch(Xva, Yva), 85 | **data_args) 86 | self.te_loader = torch.utils.data.DataLoader(HDF52Torch(Xte, Yte), 87 | **data_args) 88 | 89 | # load avg and std for Z-score 90 | Xavg = torch.tensor([avg_std[0]]) 91 | Xstd = torch.tensor([avg_std[1]]) 92 | self.Xavg, self.Xstd = 
Variable(Xavg.cuda()), Variable(Xstd.cuda()) 93 | 94 | # per-class loss weight 95 | # http://www.cs.tut.fi/sgn/arg/dcase2017/documents/workshop_presentations/the_story_of_audioset.pdf 96 | class_prior = Ytr[:].sum(0) / float(Ytr[:].sum()) 97 | mean_prior = class_prior.mean() 98 | PCLW = ( (mean_prior/ class_prior) * ((1-mean_prior)/(1-class_prior)) )**args.beta 99 | self.PCLW = torch.from_numpy(PCLW.astype('float32')).cuda() 100 | print self.PCLW 101 | self.show_dataset_model_params() 102 | self.load_pretrained_model() 103 | 104 | def load_pretrained_model(self): 105 | # pre-training 106 | if os.path.exists(self.args.pmp): 107 | pretrained_model = torch.load(self.args.pmp) 108 | model_param = self.model.state_dict() 109 | for k in pretrained_model['state_dict'].keys(): 110 | try: 111 | model_param[k].copy_(pretrained_model['state_dict'][k]) 112 | except: 113 | print '[ERROR] Load pre-trained model' 114 | self.model.apply(model_init) 115 | break 116 | print 'Load Pre_trained Model : ' + self.args.pmp 117 | 118 | else: 119 | print 'Learning from scratch' 120 | #self.model.apply(model_init) 121 | 122 | 123 | def show_dataset_model_params(self): 124 | # show model structure 125 | print self.model 126 | 127 | # show params 128 | print show_model_params(self.model) 129 | 130 | # show the size of training, validation and test set 131 | print 'Dataset : ' + self.args.dn 132 | print 'Xtr->' + str(self.tr_loader.dataset.X.shape) + '\t\tYtr->' + str(self.tr_loader.dataset.Y.shape) 133 | print 'Xva->' + str(self.va_loader.dataset.X.shape) + '\t\tYva->' + str(self.va_loader.dataset.Y.shape) 134 | print 'Xte->' + str(self.te_loader.dataset.X.shape) + '\t\tYte->' + str(self.te_loader.dataset.Y.shape) 135 | 136 | def mm_loss(self, target, macro_out, micro_out): 137 | 138 | #tar = target.data 139 | target = target.float() 140 | #iwe = 3 - self.PCLW 141 | we = self.PCLW * 1 142 | wwe = self.args.gw 143 | wwe = 1 144 | #we *= wwe 145 | loss = 0 146 | if (macro_out.size(1)/2) == target.size(1): 147 | we = we.view(1,-1).repeat(target.size(0), 1) 148 | macro_out = F.log_softmax(macro_out) 149 | l = (macro_out * target * we).sum() / target.sum() 150 | loss = -l 151 | 152 | else: 153 | #if (macro_out.size(1)/1) == target.size(1): 154 | 155 | we = we.view(1,-1).repeat(target.size(0), 1) 156 | iwe = 2 - we 157 | twe = we * target + (1 - target)*iwe 158 | loss_fn = torch.nn.BCEWithLogitsLoss(weight=twe, size_average=True) 159 | loss += loss_fn(macro_out, target) 160 | for att_sc, det in micro_out: 161 | os = det.size() 162 | fl_target = target.view(os[0], os[1], 1, 1).repeat(1,1,os[2],os[3]) 163 | #twe = we.view(os[0],os[1],1,1).repeat(1, 1, os[2], os[3]) 164 | #tiwe = iwe.view(os[0],os[1],1,1).repeat(1, 1, os[2], os[3]) 165 | #twe = att_sc.data * twe * fl_target + (1 - fl_target) * tiwe 166 | #twe = twe * fl_target + (1 - fl_target) * wwe 167 | # Note: att_sc.data is required (detach the attention weights from the graph) 168 | #loss_fn = torch.nn.BCEWithLogitsLoss(weight=twe, size_average=True) 169 | itwe = att_sc.data * twe.view(os[0], os[1], 1, 1).repeat(1,1,os[2],os[3]) 170 | loss_fn = torch.nn.BCELoss(weight=itwe, size_average=True) 171 | l = loss_fn(F.sigmoid(det), fl_target) 172 | loss += l 173 | 174 | return loss 175 | 176 | 177 | def predictor(self, loader): 178 | st = time.time() 179 | all_pred = [] 180 | self.model.eval() 181 | for data, target in loader: 182 | with torch.no_grad(): 183 | data, target = Variable(data.cuda()), Variable(target.cuda()) 184 | clip_out, _ = self.model(data, self.Xavg, self.Xstd) 185 | 
all_pred.extend(F.sigmoid(clip_out).data.cpu().numpy()) 186 | #mt = clip_out.max(1)[0] 187 | #clip_out = clip_out / (mt.view(-1, 1).repeat(1, Ytr.shape[1])) 188 | #all_pred.extend(clip_out.data.cpu().numpy()) 189 | #all_pred.extend(F.sigmoid(clip_out).data.cpu().numpy()) 190 | #all_pred.extend(F.softmax(clip_out).data.cpu().numpy()) 191 | 192 | print 'Prediction Time:%1f'%(time.time() - st) 193 | return np.array(all_pred) 194 | 195 | def fit(self): 196 | st = time.time() 197 | save_dict = {} 198 | self.model.train() 199 | for e in xrange(1, self.args.ep+1): 200 | 201 | # set optimizer (SGD) 202 | lr = self.args.lr * ( 0.1 **( e/self.args.lrde )) 203 | #lr = self.args.lr ** ((e/(self.args.lrde))+1) 204 | print '\n==> Training Epoch #%d lr=%4f'%(e, lr) 205 | self.optimizer = optim.SGD(self.model.parameters(), 206 | lr=lr, momentum=self.args.mom, weight_decay=self.args.wd) 207 | 208 | # Training 209 | for batch_idx, (data, target) in enumerate(self.tr_loader): 210 | data, target = Variable(data.cuda()), Variable(target.cuda()) 211 | 212 | macro_out, micro_out = self.model(data, self.Xavg, self.Xstd) 213 | #macro_out, micro_out = self.model(data) 214 | loss = self.mm_loss(target, macro_out, micro_out) 215 | self.optimizer.zero_grad() 216 | loss.backward() 217 | self.optimizer.step() 218 | 219 | # print training epoch, training loss and training time 220 | sys.stdout.write('\r') 221 | sys.stdout.write('| Epoch [%3d/%3d] Iter[%4d/%4d]\tLoss %4f\tTime %d' 222 | %(e, self.args.ep, batch_idx+1, len(self.tr_loader), 223 | loss.item(), time.time() - st)) 224 | sys.stdout.flush() 225 | print '\n' 226 | 227 | # evaluation 228 | all_pred = self.predictor(self.va_loader) 229 | va_class_threshold, _, va_out = evl(self.va_loader.dataset.Y, all_pred) 230 | all_pred = self.predictor(self.te_loader) 231 | _, te_result_pre_class, te_out = evl(self.te_loader.dataset.Y, all_pred, va_th=va_class_threshold) 232 | print va_class_threshold 233 | 234 | save_dict['state_dict'] = self.model.state_dict() 235 | save_dict['tr_loss'] = loss 236 | save_dict['te_out'] = te_out 237 | save_dict['va_class_threshold'] = va_class_threshold 238 | 239 | # test on evaluation set and save the results 240 | ########################## 241 | #all_pred = self.predictor(self.te_loader) 242 | #_, te_result_pre_class, te_out = evl(self.te_loader.dataset.Y, all_pred, va_th=va_class_threshold) 243 | #save_dict['te_out'] = te_out 244 | #save_dict['te_result_pre_class'] = te_result_pre_class 245 | ########################## 246 | 247 | 248 | directory = '../model/%s'%(self.model.module.model_name) 249 | if not os.path.exists(directory): 250 | os.makedirs(directory) 251 | 252 | torch.save(save_dict, directory +'/epoch_%d'%(e)) 253 | 254 | 255 | -------------------------------------------------------------------------------- /MusicAutoTagging/main/data_gentor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import SharedArray as sa 3 | import sys 4 | import h5py 5 | 6 | def f_load(m_name, fp): 7 | try: 8 | out = sa.attach(m_name) 9 | except: 10 | print 'load data : %s'%(fp) 11 | out = np.load(fp) 12 | X = sa.create(m_name, (out.shape), dtype='float32') 13 | X[:] = out 14 | return out.astype('float32') 15 | 16 | def load_data(data_name): 17 | va_Y = f_load('ASmusic_va_Y' , '../data/ASmusic_bala_valid_Y.npy') 18 | va_X = f_load('ASmusic_va_X' , '../data/ASmusic_bala_valid_X.npy') 19 | te_Y = f_load('ASmusic_te_Y' , '../data/ASmusic_eval_test_Y.npy') 20 | te_X = f_load('ASmusic_te_X' , 
'../data/ASmusic_eval_test_X.npy') 21 | tr_Y = f_load('ASmusic_tr_Y' , '../data/ASmusic_unbala_train_Y.npy') 22 | tr_X = f_load('ASmusic_tr_X' , '../data/ASmusic_unbala_train_X.npy') 23 | 24 | avg_std = np.load('../data/AS_ubl_X_avg_std.npy') 25 | 26 | print 'load data : ASmusic_music' 27 | return tr_X, tr_Y, va_X, va_Y, te_X, te_Y, avg_std 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /MusicAutoTagging/main/evaluator.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | from sklearn.metrics import roc_auc_score 4 | from sklearn.metrics import average_precision_score 5 | import multiprocessing 6 | import functools 7 | 8 | def class_F1_R_P(gru, pre, th): 9 | best = np.zeros(4) 10 | for t in th: 11 | tidx = gru==1 12 | vpred = pre.copy() 13 | vpred[vpred> t] = 1 14 | vpred[vpred<=t] = 0 15 | 16 | TP = vpred[tidx].sum() 17 | if TP == 0 : 18 | continue 19 | 20 | P = TP / float(vpred.sum()) 21 | R = TP / float(gru.sum()) 22 | F1 = 2*(P*R)/(R+P) 23 | 24 | if F1 > best[1]: 25 | best = np.array([t, F1, R, P]) 26 | return best 27 | 28 | def multi_evl_nt(i, gru, pre, th): 29 | st = time.time() 30 | evl_metrics = np.zeros(6) 31 | 32 | if gru[:,i].sum() == 0 or gru[:,i].sum()==len(gru): 33 | evl_metrics = evl_metrics -1 34 | return evl_metrics 35 | 36 | pre_tag = (np.argmax(pre[:,:,i],1)==i).astype(int) 37 | evl_metrics[:4] = class_F1_R_P(gru[:,i], pre_tag, [0]) 38 | 39 | evl_metrics[4] = average_precision_score(gru[:,i], pre[:,i,i]) 40 | evl_metrics[5] = roc_auc_score(gru[:,i], pre[:,i,i]) 41 | #print time.time() - st 42 | return evl_metrics 43 | 44 | def multi_evl(i, gru, pre, th): 45 | st = time.time() 46 | evl_metrics = np.zeros(6) 47 | 48 | if gru[:,i].sum() == 0 or gru[:,i].sum()==len(gru): 49 | evl_metrics = evl_metrics -1 50 | return evl_metrics 51 | 52 | if len(th) == 0: 53 | #th = np.arange(0, 1, 0.0001) 54 | th = np.arange(0, 1, 0.01) 55 | evl_metrics[:4] = class_F1_R_P(gru[:,i], pre[:,i], th) 56 | else: 57 | #if len(th) == 1: 58 | # evl_metrics[:4] = class_F1_R_P(gru[:,i], pre[:,i], th) 59 | #else: 60 | evl_metrics[:4] = class_F1_R_P(gru[:,i], pre[:,i], [th[i]]) 61 | 62 | evl_metrics[4] = average_precision_score(gru[:,i], pre[:,i]) 63 | evl_metrics[5] = roc_auc_score(gru[:,i], pre[:,i]) 64 | #print time.time() - st 65 | return evl_metrics 66 | 67 | def evl(gru, pre, va_th=[]): 68 | st = time.time() 69 | vate = 'TE' 70 | evl_metrics = np.zeros((pre.shape[-1], 6)) 71 | if len(va_th) == 0: 72 | vate = 'VA' 73 | 74 | if vate not in ['TE', 'VA']: 75 | multi_evl_1 = functools.partial(multi_evl, gru=gru, pre=pre, th=va_th) 76 | P = multiprocessing.Pool(30) 77 | evl_metrics = np.array(P.map(multi_evl_1, np.arange(pre.shape[-1]))) 78 | P.close() 79 | P.join() 80 | 81 | else: 82 | for i in np.arange(pre.shape[-1]): 83 | if len(pre.shape)==2: 84 | evl_metrics[i] = multi_evl(i, gru=gru, pre=pre, th=va_th) 85 | else: 86 | evl_metrics[i] = multi_evl_nt(i, gru=gru, pre=pre, th=va_th) 87 | 88 | va_th = evl_metrics[:,0].copy() 89 | evl_metrics = evl_metrics[:,1:] 90 | 91 | #print np.arange(527)[evl_metrics[:,0]!=-1] 92 | acc = evl_metrics[evl_metrics[:,0]!=-1,:].mean(axis=0) * 100 93 | #print acc 94 | #print np.arange(pre.shape[-1])[evl_metrics[:,0]==-100,:] 95 | out = '[%s] mAP:%.1f%% AUC:%.1f%% F1-CB:%.1f%% R-CB:%.1f%% P-CB:%.1f%% time:%.1f'\ 96 | % (vate, acc[3], acc[4], acc[0], acc[1], acc[2], time.time()-st) 97 | print out 98 | return va_th, evl_metrics, out 99 | 100 | 101 | 
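For reference, here is a minimal usage sketch of `evl` (the data is synthetic and the shapes are hypothetical, not from the repo): per-class thresholds are swept on the validation scores first, and the resulting `va_th` vector is then reused to score the test set.

```python
import numpy as np
from evaluator import evl  # assumes evaluator.py is importable

np.random.seed(0)
n_clips, n_classes = 200, 10                                   # hypothetical sizes
Y_va = (np.random.rand(n_clips, n_classes) > 0.8).astype(int)  # clip-level weak labels
P_va = np.random.rand(n_clips, n_classes)                      # sigmoid scores in [0, 1]

# Validation pass: va_th=[] triggers a per-class threshold sweep over [0, 1)
va_th, va_metrics, va_summary = evl(Y_va, P_va)

# Test pass: reuse the per-class thresholds picked on validation
Y_te = (np.random.rand(n_clips, n_classes) > 0.8).astype(int)
P_te = np.random.rand(n_clips, n_classes)
_, te_metrics, te_summary = evl(Y_te, P_te, va_th=va_th)
```

Note that classes with no positive (or all-positive) ground truth come back as rows of `-1` and are excluded from the averaged scores printed in the summary line.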
-------------------------------------------------------------------------------- /MusicAutoTagging/main/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | def l_tpl(x, attention=[]): 6 | # Global temporal pooling 7 | xs = x.size() 8 | # Global average pooling 9 | apl = F.avg_pool2d(x, xs[2:]).view(xs[0], -1) 10 | # Global max pooling 11 | mpl = F.max_pool2d(x, xs[2:]).view(xs[0], -1) 12 | 13 | # Global variance pooling 14 | var = (x - apl.view(xs[0], xs[1], 1, 1).repeat(1,1,xs[2],xs[3]) )**2 15 | vpl = F.avg_pool2d(var, xs[2:]).view(xs[0], -1) 16 | 17 | ''' 18 | sm_att = F.softmax(attention.permute(0,2,1,3)).permute(0,2,1,3).repeat(1,xs[1],1,1) 19 | att = x * sm_att 20 | att = att.sum(dim=-1).sum(dim=-1) 21 | ''' 22 | #return torch.cat([attention, mpl, vpl], dim=1) 23 | return torch.cat([apl, vpl, mpl], dim=1) 24 | #return att, apl, mpl, vpl 25 | 26 | -------------------------------------------------------------------------------- /MusicAutoTagging/main/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from Trainer import * 3 | 4 | # pre_trained model path 5 | #pmp = './Mmnet_DCASE17' 6 | pmp = '' 7 | 8 | # params for audio feature extraction (mel-spectrogram) 9 | parser = argparse.ArgumentParser(description= 'PyTorch M&mnet Training using AudioSet Dataset') 10 | parser.add_argument('--dn', default='AudioSet_Music', type=str, help='dataset name') 11 | parser.add_argument('--sr', default=44100, type=int, help='[fea_ext] sample rate') 12 | parser.add_argument('--ws', default=2048, type=int, help='[fea_ext] window size') 13 | parser.add_argument('--hs', default=512, type=int, help='[fea_ext] hop size') 14 | parser.add_argument('--mel', default=128, type=int, help='[fea_ext] mel bands') 15 | parser.add_argument('--msc', default=10, type=int, help='[fea_ext] max duration of audio clip (seconds)') 16 | 17 | # params for training 18 | parser.add_argument('--bs', default=64, type=int, help='[net] batch size') 19 | parser.add_argument('--gw', default=1, type=int, help='[net] global weight for both positive and negative samples') 20 | parser.add_argument('--lrde', default=30, type=int, help='[net] divide the learning rate by 10 every lrde epochs') 21 | parser.add_argument('--mom', default=0.9, type=float, help='[net] momentum') 22 | parser.add_argument('--wd', default=1e-4, type=float, help='[net] weight decay') 23 | parser.add_argument('--lr', default=0.1, type=float, help='[net] learning rate') 24 | parser.add_argument('--ep', default=100, type=int, help='[net] epoch') 25 | parser.add_argument('--beta', default=0.3, type=float, help='[net] hyperparameter for per-class loss weight') 26 | parser.add_argument('--pmp', default=pmp, type=str, help='[net] pre-trained model path') 27 | args = parser.parse_args() 28 | 29 | 30 | # build model 31 | #model = nn.DataParallel(Net(args.mel, data['Ytr'].shape[1]).cuda()) 32 | 33 | # Train 34 | Trer = Trainer(args) 35 | Trer.fit() 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /MusicAutoTagging/main/main_te.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from Test import * 3 | 4 | # pre_trained model path 5 | #pmp = './Mmnet_DCASE17' 6 | pmp = '../model/Mmnet_1D_MS_att/epoch_100' 7 | 8 | # params for audio feature extraction (mel-spectrogram) 9 | parser = 
argparse.ArgumentParser(description= 'PyTorch M&mnet Training using AudioSet Dataset') 10 | parser.add_argument('--dn', default='AudioSet_Music', type=str, help='dataset name') 11 | parser.add_argument('--sr', default=44100, type=int, help='[fea_ext] sample rate') 12 | parser.add_argument('--ws', default=2048, type=int, help='[fea_ext] window size') 13 | parser.add_argument('--hs', default=512, type=int, help='[fea_ext] hop size') 14 | parser.add_argument('--mel', default=128, type=int, help='[fea_ext] mel bands') 15 | parser.add_argument('--msc', default=10, type=int, help='[fea_ext] max duration of audio clip (seconds)') 16 | 17 | # params for training 18 | parser.add_argument('--bs', default=64, type=int, help='[net] batch size') 19 | parser.add_argument('--gw', default=1, type=int, help='[net] global weight for both positive and negative samples') 20 | parser.add_argument('--lrde', default=30, type=int, help='[net] divide the learning rate by 10 every lrde epochs') 21 | parser.add_argument('--mom', default=0.9, type=float, help='[net] momentum') 22 | parser.add_argument('--wd', default=1e-4, type=float, help='[net] weight decay') 23 | parser.add_argument('--lr', default=0.1, type=float, help='[net] learning rate') 24 | parser.add_argument('--ep', default=100, type=int, help='[net] epoch') 25 | parser.add_argument('--beta', default=0.3, type=float, help='[net] hyperparameter for per-class loss weight') 26 | parser.add_argument('--pmp', default=pmp, type=str, help='[net] pre-trained model path') 27 | args = parser.parse_args() 28 | 29 | 30 | # build model 31 | #model = nn.DataParallel(Net(args.mel, data['Ytr'].shape[1]).cuda()) 32 | 33 | # Train 34 | Trer = Trainer(args) 35 | pred_fn = '../pred/%s.npy'%(pmp.split('/')[2]) 36 | 37 | try: 38 | out = np.load(pred_fn) 39 | except: 40 | out = Trer.predictor() 41 | np.save('../pred/%s'%(pmp.split('/')[2]), out) 42 | 43 | vt = np.load('./tmp_va_the.npy') 44 | music_mood = np.arange(276,283) 45 | music_genre = np.arange(216,265) 46 | music = np.append(music_mood, music_genre) 47 | 48 | with open('/home/fearofchou/ND/m189/max/FCNN_torch/pre/csv/class_labels_indices.csv', 'r') as f: 49 | cl = f.readlines() 50 | 51 | id2tag = {} 52 | a = 0 53 | for i in music: 54 | tag = cl[i+1].split(',')[-1][1:-2] 55 | id2tag[a] = tag 56 | a+=1 57 | 58 | 59 | get_high_recall = 0.02 60 | out_tag = {} 61 | for i in xrange(len(out)): 62 | out_tag[Trer.fn[i]] = [] 63 | for j in np.arange(len(id2tag))[out[i] > (vt-get_high_recall)]: 64 | out_tag[Trer.fn[i]].append(id2tag[j]) 65 | 66 | np.save('../out_tag/%s_%s'%(pmp.split('/')[2], str(get_high_recall)[2:]), out_tag) 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /MusicAutoTagging/net/CNN_2D.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | class block(nn.Module): 5 | def __init__(self, inp, out): 6 | super(block, self).__init__() 7 | self.conv = nn.Conv2d(inp, out, (3,3), padding=(1,1), bias=False) 8 | self.bn = nn.BatchNorm2d(out) 9 | 10 | def forward(self, x): 11 | out = F.relu(self.bn(self.conv(x))) 12 | return out 13 | 14 | 15 | class Net(nn.Module): 16 | def __init__(self, channel, num_classes): 17 | super(Net, self).__init__() 18 | self.nc = num_classes 19 | self.model_name = 'CNN_2D' 20 | 21 | self.ks = (3,3) 22 | self.ps = (1,1) 23 | 24 | inp = 64*2 25 | self.b1 = block( 1, 16) 26 | self.b2 = block( 16, inp*2) 27 | self.b3 = block(inp*2, inp*4) 28 | 
29 | self.det1 = nn.Conv2d( 16, self.nc, self.ks, padding=self.ps, bias=False) 30 | self.det2 = nn.Conv2d(inp*2, self.nc, self.ks, padding=self.ps, bias=False) 31 | self.det3 = nn.Conv2d(inp*4, self.nc, self.ks, padding=self.ps, bias=False) 32 | self.att1 = nn.Conv2d( 16, 16, self.ks, padding=self.ps, bias=False) 33 | self.att2 = nn.Conv2d(inp*2, inp*2, self.ks, padding=self.ps, bias=False) 34 | self.att3 = nn.Conv2d(inp*4, inp*4, self.ks, padding=self.ps, bias=False) 35 | 36 | self.dp = nn.Dropout(.5) 37 | dns = 512*2 38 | 39 | # linear 40 | #self.den1 = nn.Linear(inp*3*2, dns) 41 | #self.den2 = nn.Linear(dns, dns) 42 | #self.dbn1 = nn.BatchNorm1d(dns) 43 | #self.dbn2 = nn.BatchNorm1d(dns) 44 | #self.prd = nn.Linear(dns, self.nc * 2) 45 | self.prd = nn.Linear(inp*4*2, self.nc * 1) 46 | 47 | def nn_apl(self, x): 48 | return F.avg_pool2d(x, x.size()[2:]).view(x.size()[0], -1) 49 | 50 | def nn_att(self, inp, att): 51 | att_out = F.softmax(att(inp).view(inp.size(0), inp.size(1), -1), dim=-1) 52 | 53 | att_sc = att_out.sum(1).view(inp.size(0), 1, inp.size(2), inp.size(3)) 54 | att_sc = att_sc.repeat(1, self.nc, 1, 1) 55 | #att_ens = F.softmax(att_sup, dim=3) 56 | 57 | return att_sc, (att_out*inp.view(inp.size(0), inp.size(1), -1)).sum(-1) 58 | 59 | def forward(self, x, Xavg, Xstd): 60 | #def forward(self, x): 61 | xs = x.size() 62 | x = x.view(xs[0], 1, xs[1], xs[2]) 63 | 64 | #x = x.permute(0,2,1,3) 65 | #Xavg = Xavg.view(1, Xavg.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 66 | #Xstd = Xstd.view(1, Xstd.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 67 | z_x = (x - Xavg)/Xstd 68 | 69 | 70 | b1 = self.b1(z_x) 71 | mp1 = F.max_pool2d(b1, (4,4)) 72 | b2 = self.b2(mp1) 73 | mp2 = F.max_pool2d(b2, (4,4)) 74 | bf = self.b3(mp2) 75 | 76 | # global average pooling 77 | gap = self.nn_apl(bf) 78 | att1, _ = self.nn_att(b1, self.att1) 79 | att2, _ = self.nn_att(b2, self.att2) 80 | att3, att_embed = self.nn_att(bf, self.att3) 81 | gap = torch.cat([gap, att_embed], dim=1) 82 | 83 | #DNN 84 | #den1 = F.relu(self.dbn1(self.den1(self.dp(gap)))) 85 | #den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 86 | y_cls = self.prd(self.dp(gap)) 87 | 88 | # attention 89 | det1 = self.det1(self.dp(b1)) 90 | det2 = self.det2(self.dp(b2)) 91 | det3 = self.det3(self.dp(bf)) 92 | 93 | 94 | # ensemble prediction 95 | att_ens = F.softmax(att3.view(att3.size(0), att3.size(1), -1), dim=-1) 96 | y_att = (det3.view(det3.size(0), det3.size(1), -1) * att_ens).sum(-1) 97 | y_ens = (y_cls + y_att)/2 98 | 99 | 100 | return y_ens, [[att1, det1], [att2, det2], [att3, det3]] 101 | #return y_ens, [] 102 | #return y_cls, [[att, det]] 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /MusicAutoTagging/net/CNN_2D.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fearofchou/mmnet/71847844df6f8467e5d331a0bd69a18007425164/MusicAutoTagging/net/CNN_2D.pyc -------------------------------------------------------------------------------- /MusicAutoTagging/net/FrameCNN_1D.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | class block(nn.Module): 5 | def __init__(self, inp, out): 6 | super(block, self).__init__() 7 | self.bn1 = nn.BatchNorm2d(inp) 8 | self.conv1 = nn.Conv2d(inp, out, (1,3), padding=(0,1)) 9 | 10 | def forward(self, x): 11 | out = self.bn1(x) 12 | bn1 = F.relu(out) 13 | out = self.conv1(out) 14 | return out, bn1 
15 | 16 | 17 | class Net(nn.Module): 18 | def __init__(self, channel, num_classes): 19 | super(Net, self).__init__() 20 | self.nc = num_classes 21 | self.model_name = 'FrameCNN_1D' 22 | 23 | self.ks = (1,3) 24 | self.ps = (0,1) 25 | inp = channel 26 | 27 | self.bn = nn.BatchNorm2d(inp) 28 | 29 | self.conv1 = nn.Conv2d(inp, inp, self.ks, padding=self.ps) 30 | self.b1 = block(inp*1, inp*2) 31 | self.b2 = block(inp*2, inp*3) 32 | self.b3 = block(inp*3, inp*3) 33 | self.bnf = nn.BatchNorm2d(inp*3) 34 | 35 | self.det = nn.Conv2d(inp*3, self.nc, self.ks, padding=self.ps) 36 | self.att = nn.Conv2d(inp*3, inp*3, self.ks, padding=self.ps) 37 | 38 | self.dp = nn.Dropout(.5) 39 | dns = 512*2 40 | 41 | # linear 42 | self.den1 = nn.Linear(inp*3, dns) 43 | self.den2 = nn.Linear(dns, dns) 44 | self.dbn1 = nn.BatchNorm1d(dns) 45 | self.dbn2 = nn.BatchNorm1d(dns) 46 | #self.prd = nn.Linear(dns, self.nc * 2) 47 | self.prd = nn.Linear(dns, self.nc * 1) 48 | 49 | def nn_apl(self, x): 50 | return F.avg_pool2d(x, x.size()[2:]).view(x.size()[0], -1) 51 | 52 | def nn_att(self, inp, att): 53 | att_out = F.softmax(att(inp), dim=3) 54 | 55 | att_sc = att_out.sum(1).view(inp.size(0), 1, inp.size(2), inp.size(3)) 56 | att_sc = att_sc.repeat(1, self.nc, 1, 1) 57 | #att_ens = F.softmax(att_sup, dim=3) 58 | 59 | return att_sc 60 | 61 | def forward(self, x, Xavg, Xstd): 62 | #def forward(self, x): 63 | xs = x.size() 64 | x = x.view(xs[0], 1, xs[1], xs[2]) 65 | 66 | x = x.permute(0,2,1,3) 67 | #Xavg = Xavg.view(1, Xavg.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 68 | #Xstd = Xstd.view(1, Xstd.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 69 | z_x = (x - Xavg)/Xstd 70 | 71 | z_x = self.bn(z_x) 72 | 73 | conv1 = self.conv1(z_x) 74 | b1, bnb1 = self.b1(conv1) 75 | mp1 = F.max_pool2d(b1, (1,4)) 76 | b2, bnb2 = self.b2(mp1) 77 | mp2 = F.max_pool2d(b2, (1,4)) 78 | b3, bnb3 = self.b3(mp2) 79 | bf = F.relu(self.bnf(b3)) 80 | 81 | # global average pooling 82 | gap = self.nn_apl(bf) 83 | 84 | #DNN 85 | den1 = F.relu(self.dbn1(self.den1(self.dp(gap)))) 86 | den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 87 | y_cls = self.prd(self.dp(den2)) 88 | 89 | # attention 90 | att = self.nn_att(bf, self.att) 91 | det = self.det(self.dp(bf)) 92 | 93 | 94 | # ensemble prediction 95 | att_ens = F.softmax(att, dim=3) 96 | y_att = (det * att_ens).sum(-1).sum(-1) 97 | y_ens = (y_cls + y_att)/2 98 | 99 | 100 | return y_ens, [[att, det]] 101 | #return y_ens, [] 102 | #return y_cls, [[att, det]] 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /MusicAutoTagging/net/FrameCNN_1D.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fearofchou/mmnet/71847844df6f8467e5d331a0bd69a18007425164/MusicAutoTagging/net/FrameCNN_1D.pyc -------------------------------------------------------------------------------- /MusicAutoTagging/net/FrameCNN_2D.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import sys 4 | sys.path.append('../fun') 5 | from layers import * 6 | 7 | class Net(nn.Module): 8 | def __init__(self, num_labels): 9 | super(Net, self).__init__() 10 | self.numl = num_labels 11 | ks = 128 12 | fs = (3,3) # 3x3 kernels with matching padding, as in the other 2-D nets 13 | ps = (1,1) 14 | self.model_name = 'FrameCNN_MS_GAP_DNN' 15 | 16 | self.conv1 = nn.Conv2d( 1, ks*1, fs, padding=ps) 17 | self.bn1 = nn.BatchNorm2d(ks*1) 18 | self.conv2 = nn.Conv2d(ks*1, ks*2, fs, padding=ps) 19 | self.bn2 = 
nn.BatchNorm2d(ks*2) 20 | self.conv3 = nn.Conv2d(ks*2, ks*3, fs, padding=ps) 21 | self.bn3 = nn.BatchNorm2d(ks*3) 22 | 23 | ''' 24 | self.dconv1 = nn.Conv2d(ks*3, ks*2, fs, padding=ps) 25 | self.db1 = nn.BatchNorm2d(ks*3) 26 | self.dconv2 = nn.Conv2d(ks*2, ks*1, fs, padding=ps) 27 | self.db2 = nn.BatchNorm2d(ks*1) 28 | ''' 29 | self.det1 = nn.Conv2d(ks*1, self.numl, fs, padding=ps) 30 | self.det2 = nn.Conv2d(ks*2, self.numl, fs, padding=ps) 31 | self.det3 = nn.Conv2d(ks*3, self.numl, fs, padding=ps) 32 | #self.det4 = nn.Conv2d(ks*2, self.numl, fs, padding=ps) 33 | #self.det5 = nn.Conv2d(ks*1, self.numl, fs, padding=ps) 34 | 35 | self.dp = nn.Dropout(.5) 36 | dns = 512*1 37 | 38 | # linear 39 | self.channel = ks*3*1 40 | self.den1 = nn.Linear(self.channel, dns) 41 | self.den2 = nn.Linear(dns, dns) 42 | self.dbn1 = nn.BatchNorm1d(dns) 43 | self.dbn2 = nn.BatchNorm1d(dns) 44 | self.channel = dns 45 | #self.channel = ks*4 46 | 47 | self.prd = nn.Linear(self.channel, self.numl) 48 | 49 | def apl(self, x): 50 | return F.avg_pool2d(x, x.size()[2:]).view(x.size()[0], -1) 51 | 52 | def GAP(self, layer, att, pool=0): 53 | # (50, 384, 56, 1) 54 | # (50, 1, 56, 1) 55 | out = att(layer).permute(0,2,1,3) 56 | # (50, 56, 1, 1) 57 | out = F.softmax(out) 58 | # (50, 56, 1, 1) 59 | out = out.permute(0,2,1,3) 60 | # (50, 1, 56, 1) 61 | #otu = att.repeat(1, bf.size(1), 1, 1) 62 | # (50, 384, 56, 1) 63 | if pool == 1: 64 | out1 = out * layer 65 | # (50, 384) 66 | out1 = out1.sum(-1).sum(-1) 67 | else: 68 | out1 = 1 69 | 70 | out = out.sum(1).view(out.size(0), 1, layer.size(2), 1) 71 | 72 | out2 = out.permute(0,2,1,3) 73 | out2 = F.softmax(out2) 74 | out2 = out2.permute(0,2,1,3) 75 | out2 = out2.repeat(1, self.numl, 1, 1) 76 | 77 | out = out.repeat(1, self.numl, 1, 1) 78 | 79 | return out1, out, out2 80 | 81 | def forward(self, x, Xavg, Xstd): 82 | 83 | #x = x.permute(0,3,2,1) 84 | 85 | xs = x.size() 86 | #Xavg = Xavg.view(1, Xavg.size()[0],1,1).repeat(xs[0], 1, xs[2], xs[3]) 87 | #Xstd = Xstd.view(1, Xstd.size()[0],1,1).repeat(xs[0], 1, xs[2], xs[3]) 88 | z_x = (x - Xavg)/Xstd 89 | 90 | z_x = z_x.view(xs[0], 1, xs[1], xs[2]) 91 | 92 | ms = (4,4) 93 | c1 = F.relu(self.bn1(self.conv1(z_x))) 94 | m1 = F.max_pool2d(c1, ms) 95 | c2 = F.relu(self.bn2(self.conv2(m1))) 96 | m2 = F.max_pool2d(c2, ms) 97 | c3 = F.relu(self.bn3(self.conv3(m2))) 98 | 99 | ''' 100 | uc3 = F.upsample(c3, scale_factor=ms, mode='nearest') 101 | c4 = F.relu(self.db1(self.dconv1(uc3))) 102 | uc4 = F.upsample(c4, scale_factor=ms, mode='nearest') 103 | c5 = F.relu(self.db2(self.dconv2(uc4))) 104 | ''' 105 | d1 = self.det1(self.dp(c1)) 106 | d2 = self.det2(self.dp(c2)) 107 | d3 = self.det3(self.dp(c3)) 108 | #d4 = self.det4(c4) 109 | #d5 = self.det5(c5) 110 | 111 | #s = F.sigmoid(self.sou(c5)) 112 | #s = F.relu(self.sou(c5)) 113 | 114 | # pooling 115 | apl = self.apl(c3) 116 | #apl = l_tpl(c3) 117 | den1 = F.relu(self.dbn1(self.den1(self.dp(apl)))) 118 | den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 119 | #den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 120 | pred = self.prd(den2) 121 | #pred = self.prd(apl) 122 | 123 | return pred, [pred, d1, d2, d3, x], [pred, d1, d2, d3], [] # d4, d5 and s belong to the decoder block commented out above 124 | #return pred, [pred, d1, d2, d3, d4, d5], [pred, d1,d2,d3,d4,d5],[] 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /MusicAutoTagging/net/FrameCNN_2D.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fearofchou/mmnet/71847844df6f8467e5d331a0bd69a18007425164/MusicAutoTagging/net/FrameCNN_2D.pyc -------------------------------------------------------------------------------- /MusicAutoTagging/net/FrameCNN_MS_2D.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import sys 4 | sys.path.append('../fun') 5 | from layers import * 6 | 7 | class Net(nn.Module): 8 | def __init__(self, channel, ks, num_labels): 9 | super(Net, self).__init__() 10 | self.numl = num_labels 11 | self.channel = channel 12 | ks = 128 13 | self.model_name = 'FrameCNN_MS_GAP_DNN' 14 | 15 | fss = 16 16 | self.conv1 = nn.Conv2d(1, fss, (3,3), padding=(1,1)) 17 | self.bn1 = nn.BatchNorm2d(fss) 18 | self.conv2 = nn.Conv2d(fss, ks*2, (3,3), padding=(1,1)) 19 | self.bn2 = nn.BatchNorm2d(ks*2) 20 | self.conv3 = nn.Conv2d(ks*2, ks*4, (3,3), padding=(1,1)) 21 | self.bn3 = nn.BatchNorm2d(ks*4) 22 | 23 | self.dconv1 = nn.Conv2d(ks*4, ks*2, (3,3), padding=(1,1)) 24 | self.db1 = nn.BatchNorm2d(ks*2) 25 | self.dconv2 = nn.Conv2d(ks*2, fss, (3,3), padding=(1,1)) 26 | self.db2 = nn.BatchNorm2d(fss) 27 | 28 | fs = (3,3) 29 | ps = (1,1) 30 | self.det1 = nn.Conv2d(fss, self.numl, fs, padding=ps) 31 | self.det2 = nn.Conv2d(ks*2, self.numl, fs, padding=ps) 32 | self.det3 = nn.Conv2d(ks*4, self.numl, fs, padding=ps) 33 | self.det4 = nn.Conv2d(ks*2, self.numl, fs, padding=ps) 34 | self.det5 = nn.Conv2d(fss, self.numl, fs, padding=ps) 35 | 36 | self.sou = nn.Conv2d(fss, 1, fs, padding=ps) 37 | 38 | self.dp = nn.Dropout(.0) 39 | dns = 512*1 40 | 41 | # linear 42 | self.channel = ks*4*1 43 | self.den1 = nn.Linear(self.channel, dns) 44 | self.den2 = nn.Linear(dns, dns) 45 | self.dbn1 = nn.BatchNorm1d(dns) 46 | self.dbn2 = nn.BatchNorm1d(dns) 47 | self.channel = dns 48 | #self.channel = ks*4 49 | 50 | self.prd = nn.Linear(self.channel, self.numl) 51 | 52 | def apl(self, x): 53 | return F.avg_pool2d(x, x.size()[2:]).view(x.size()[0], -1) 54 | 55 | def GAP(self, layer, att, pool=0): 56 | # (50, 384, 56, 1) 57 | # (50, 1, 56, 1) 58 | out = att(layer).permute(0,2,1,3) 59 | # (50, 56, 1, 1) 60 | out = F.softmax(out) 61 | # (50, 56, 1, 1) 62 | out = out.permute(0,2,1,3) 63 | # (50, 1, 56, 1) 64 | #otu = att.repeat(1, bf.size(1), 1, 1) 65 | # (50, 384, 56, 1) 66 | if pool == 1: 67 | out1 = out * layer 68 | # (50, 384) 69 | out1 = out1.sum(-1).sum(-1) 70 | else: 71 | out1 = 1 72 | 73 | out = out.sum(1).view(out.size(0), 1, layer.size(2), 1) 74 | 75 | out2 = out.permute(0,2,1,3) 76 | out2 = F.softmax(out2) 77 | out2 = out2.permute(0,2,1,3) 78 | out2 = out2.repeat(1, self.numl, 1, 1) 79 | 80 | out = out.repeat(1, self.numl, 1, 1) 81 | 82 | return out1, out, out2 83 | 84 | def forward(self, x, Xavg, Xstd): 85 | 86 | #x = x.permute(0,3,2,1) 87 | 88 | xs = x.size() 89 | Xavg = Xavg.view(1, Xavg.size()[0],1,1).repeat(xs[0], 1, xs[2], xs[3]) 90 | Xstd = Xstd.view(1, Xstd.size()[0],1,1).repeat(xs[0], 1, xs[2], xs[3]) 91 | z_x = (x - Xavg)/Xstd 92 | 93 | ms = (8,16) 94 | c1 = F.relu(self.bn1(self.conv1(z_x))) 95 | m1 = F.max_pool2d(c1, ms) 96 | c2 = F.relu(self.bn2(self.conv2(m1))) 97 | m2 = F.max_pool2d(c2, ms) 98 | c3 = F.relu(self.bn3(self.conv3(m2))) 99 | 100 | ''' 101 | uc3 = F.upsample(c3, scale_factor=ms, mode='nearest') 102 | c4 = F.relu(self.db1(self.dconv1(uc3))) 103 | uc4 = F.upsample(c4, scale_factor=ms, mode='nearest') 104 | c5 = F.relu(self.db2(self.dconv2(uc4))) 105 | ''' 106 | d1 = self.det1(c1) 107 | d2 = 
self.det2(c2) 108 | d3 = self.det3(c3) 109 | #d4 = self.det4(c4) 110 | #d5 = self.det5(c5) 111 | 112 | #s = F.sigmoid(self.sou(c5)) 113 | #s = F.relu(self.sou(c5)) 114 | 115 | # pooling 116 | apl = self.apl(c3) 117 | #apl = l_tpl(c3) 118 | den1 = F.relu(self.dbn1(self.den1(self.dp(apl)))) 119 | den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 120 | #den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 121 | pred = self.prd(den2) 122 | #pred = self.prd(apl) 123 | 124 | #return pred, [pred, d1, d2, d3, d4, d5, x], [pred, d1,d2,d3,d4,d5,s*x],[] 125 | #return pred, [pred, d1, d2, d3, d4, d5], [pred, d1,d2,d3,d4,d5],[] 126 | return pred, [pred], [pred],[] 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /MusicAutoTagging/net/Mmnet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | class block(nn.Module): 5 | def __init__(self, inp, out): 6 | super(block, self).__init__() 7 | self.bn1 = nn.BatchNorm2d(inp) 8 | self.conv1 = nn.Conv2d(inp, out, (1,3), padding=(0,1), bias=False) 9 | self.bn2 = nn.BatchNorm2d(out) 10 | self.conv2 = nn.Conv2d(out, out, (1,3), padding=(0,1), bias=False) 11 | self.bn3 = nn.BatchNorm2d(out) 12 | 13 | self.sk = nn.Conv2d(inp, out, (1,1), padding=(0,0), bias=False) 14 | def forward(self, x): 15 | out = self.bn1(x) 16 | bn1 = F.relu(out) 17 | out = self.conv1(out) 18 | out = self.conv2(F.relu(self.bn2(out))) 19 | out = self.bn3(out) 20 | out += self.sk(x) 21 | return out, bn1 22 | 23 | 24 | class Net(nn.Module): 25 | def __init__(self, channel, num_classes): 26 | super(Net, self).__init__() 27 | self.nc = num_classes 28 | self.model_name = 'Mmnet_1D_MS_att' 29 | 30 | self.ks = (1,3) 31 | self.ps = (0,1) 32 | inp = 196 33 | 34 | self.bn = nn.BatchNorm2d(128) 35 | 36 | self.conv1 = nn.Conv2d(128, inp*1, self.ks, padding=self.ps) 37 | self.b1 = block(inp*1, inp*2) 38 | self.b2 = block(inp*2, inp*3) 39 | self.b3 = block(inp*3, inp*4) 40 | self.bnf = nn.BatchNorm2d(inp*4) 41 | 42 | self.det1 = nn.Conv2d(inp*1, self.nc, self.ks, padding=self.ps, bias=False) 43 | self.det2 = nn.Conv2d(inp*2, self.nc, self.ks, padding=self.ps, bias=False) 44 | self.det3 = nn.Conv2d(inp*4, self.nc, self.ks, padding=self.ps, bias=False) 45 | self.att1 = nn.Conv2d(inp*1, inp*1, self.ks, padding=self.ps, bias=False) 46 | self.att2 = nn.Conv2d(inp*2, inp*2, self.ks, padding=self.ps, bias=False) 47 | self.att3 = nn.Conv2d(inp*4, inp*4, self.ks, padding=self.ps, bias=False) 48 | 49 | self.dp = nn.Dropout(.5) 50 | dns = 512*2 51 | 52 | # linear 53 | self.den1 = nn.Linear(inp*4*2, dns) 54 | self.den2 = nn.Linear(dns, dns) 55 | self.dbn1 = nn.BatchNorm1d(dns) 56 | self.dbn2 = nn.BatchNorm1d(dns) 57 | #self.prd = nn.Linear(dns, self.nc * 2) 58 | self.prd = nn.Linear(dns, self.nc * 1) 59 | 60 | def nn_apl(self, x): 61 | return F.avg_pool2d(x, x.size()[2:]).view(x.size()[0], -1) 62 | 63 | def nn_att(self, inp, att): 64 | att_out = F.softmax(att(inp), dim=3) 65 | 66 | att_sc = att_out.sum(1).view(inp.size(0), 1, inp.size(2), inp.size(3)) 67 | att_sc = att_sc.repeat(1, self.nc, 1, 1) 68 | #att_ens = F.softmax(att_sup, dim=3) 69 | 70 | return att_sc, (att_out*inp).sum(-1).sum(-1) 71 | 72 | def forward(self, x, Xavg, Xstd): 73 | #def forward(self, x): 74 | xs = x.size() 75 | x = x.view(xs[0], 1, xs[1], xs[2]) 76 | 77 | x = x.permute(0,2,1,3) 78 | #Xavg = Xavg.view(1, Xavg.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 79 | #Xstd = Xstd.view(1, 
Xstd.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 80 | z_x = (x - Xavg)/Xstd 81 | 82 | z_x = self.bn(z_x) 83 | 84 | conv1 = self.conv1(z_x) 85 | b1, bnb1 = self.b1(conv1) 86 | mp1 = F.max_pool2d(b1, (1,4)) 87 | b2, bnb2 = self.b2(mp1) 88 | mp2 = F.max_pool2d(b2, (1,4)) 89 | b3, bnb3 = self.b3(mp2) 90 | bf = F.relu(self.bnf(b3)) 91 | 92 | # global average pooling 93 | gap = self.nn_apl(bf) 94 | att1, _ = self.nn_att(bnb1, self.att1) 95 | att2, _ = self.nn_att(bnb2, self.att2) 96 | att3, att_embed = self.nn_att(bf, self.att3) 97 | gap = torch.cat([gap, att_embed], dim=1) 98 | 99 | #DNN 100 | den1 = F.relu(self.dbn1(self.den1(self.dp(gap)))) 101 | den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 102 | y_cls = self.prd(self.dp(den2)) 103 | 104 | # attention 105 | det1 = self.det1(self.dp(bnb1)) 106 | det2 = self.det2(self.dp(bnb2)) 107 | det3 = self.det3(self.dp(bf)) 108 | 109 | 110 | # ensemble prediction 111 | att_ens = F.softmax(att3, dim=3) 112 | y_att = (det3 * att_ens).sum(-1).sum(-1) 113 | y_ens = (y_cls + y_att)/2 114 | 115 | 116 | return y_ens, [[att1, det1], [att2, det2], [att3, det3]] 117 | #return y_ens, [] 118 | #return y_cls, [[att, det]] 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /MusicAutoTagging/net/Mmnet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fearofchou/mmnet/71847844df6f8467e5d331a0bd69a18007425164/MusicAutoTagging/net/Mmnet.pyc -------------------------------------------------------------------------------- /MusicAutoTagging/net/Mmnet_2D.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | class block(nn.Module): 5 | def __init__(self, inp, out): 6 | super(block, self).__init__() 7 | self.bn1 = nn.BatchNorm2d(inp) 8 | self.conv1 = nn.Conv2d(inp, out, (3,3), padding=(1,1), bias=False) 9 | self.bn2 = nn.BatchNorm2d(out) 10 | self.conv2 = nn.Conv2d(out, out, (3,3), padding=(1,1), bias=False) 11 | self.bn3 = nn.BatchNorm2d(out) 12 | 13 | self.sk = nn.Conv2d(inp, out, (1,1), padding=(0,0), bias=False) 14 | def forward(self, x): 15 | out = self.bn1(x) 16 | bn1 = F.relu(out) 17 | out = self.conv1(out) 18 | out = self.conv2(F.relu(self.bn2(out))) 19 | out = self.bn3(out) 20 | out += self.sk(x) 21 | return out, bn1 22 | 23 | 24 | class Net(nn.Module): 25 | def __init__(self, channel, num_classes): 26 | super(Net, self).__init__() 27 | self.nc = num_classes 28 | self.model_name = 'Mmnet_2D' 29 | 30 | self.ks = (3,3) 31 | self.ps = (1,1) 32 | 33 | self.bn = nn.BatchNorm2d(1) 34 | 35 | inp = 64/2 36 | self.conv1 = nn.Conv2d(1, 3, self.ks, padding=self.ps) 37 | self.b1 = block( 3, 16) 38 | self.b2 = block(16, inp*2) 39 | self.b3 = block(inp*2, inp*3) 40 | self.bnf = nn.BatchNorm2d(inp*3) 41 | 42 | self.det = nn.Conv2d(inp*3, self.nc, self.ks, padding=self.ps) 43 | self.att = nn.Conv2d(inp*3, inp*3, self.ks, padding=self.ps) 44 | 45 | self.dp = nn.Dropout(.5) 46 | dns = 512*2 47 | 48 | # linear 49 | self.den1 = nn.Linear(inp*3, dns) 50 | self.den2 = nn.Linear(dns, dns) 51 | self.dbn1 = nn.BatchNorm1d(dns) 52 | self.dbn2 = nn.BatchNorm1d(dns) 53 | #self.prd = nn.Linear(dns, self.nc * 2) 54 | self.prd = nn.Linear(dns, self.nc * 1) 55 | 56 | def nn_apl(self, x): 57 | return F.avg_pool2d(x, x.size()[2:]).view(x.size()[0], -1) 58 | 59 | def nn_att(self, inp, att): 60 | if inp.size(2) == 1: 61 | att_out = F.softmax(att(inp), dim=3) 62 | att_sc = 
att_out.sum(1).view(inp.size(0), 1, inp.size(2), inp.size(3)) 63 | att_sc = att_sc.repeat(1, self.nc, 1, 1) 64 | else: 65 | att_out = F.softmax(att(inp).view(inp.size(0), inp.size(1), -1), dim=-1) 66 | att_sc = att_out.sum(1).view(inp.size(0), 1, inp.size(2), inp.size(3)) 67 | att_sc = att_sc.repeat(1, self.nc, 1, 1) 68 | #att_ens = F.softmax(att_sup, dim=3) 69 | 70 | return att_sc 71 | 72 | def forward(self, x, Xavg, Xstd): 73 | #def forward(self, x): 74 | xs = x.size() 75 | x = x.view(xs[0], 1, xs[1], xs[2]) 76 | 77 | #x = x.permute(0,1,2,3) 78 | #Xavg = Xavg.view(1, Xavg.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 79 | #Xstd = Xstd.view(1, Xstd.size(0),1,1).repeat(xs[0], 1, xs[2], xs[3]) 80 | z_x = (x - Xavg)/Xstd 81 | 82 | #z_x = self.bn(z_x) 83 | 84 | conv1 = self.conv1(z_x) 85 | b1, bnb1 = self.b1(conv1) 86 | mp1 = F.max_pool2d(b1, (4,4)) 87 | b2, bnb2 = self.b2(mp1) 88 | mp2 = F.max_pool2d(b2, (4,4)) 89 | b3, bnb3 = self.b3(mp2) 90 | bf = F.relu(self.bnf(b3)) 91 | 92 | # global average pooling 93 | gap = self.nn_apl(bf) 94 | 95 | #DNN 96 | den1 = F.relu(self.dbn1(self.den1(self.dp(gap)))) 97 | den2 = F.relu(self.dbn2(self.den2(self.dp(den1)))) 98 | y_cls = self.prd(self.dp(den2)) 99 | 100 | # attention 101 | att = self.nn_att(bf, self.att) 102 | det = self.det(self.dp(bf)) 103 | 104 | 105 | # ensemble prediction 106 | att_ens = F.softmax(att, dim=3) 107 | y_att = (det * att_ens).sum(-1).sum(-1) 108 | y_ens = (y_cls + y_att)/2 109 | 110 | 111 | return y_ens, [[att, det]] 112 | #return y_ens, [] 113 | #return y_cls, [[att, det]] 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /MusicAutoTagging/net/Mmnet_2D.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fearofchou/mmnet/71847844df6f8467e5d331a0bd69a18007425164/MusicAutoTagging/net/Mmnet_2D.pyc -------------------------------------------------------------------------------- /MusicAutoTagging/pre/ext.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import os 4 | 5 | def mel(fn, sr=44100, sec=10, lv=10000, ws=2048, mel=128): 6 | 7 | fea_len = int((sr*sec/(ws/4)))+1 8 | init_mel = np.zeros((mel, fea_len)) 9 | 10 | if not os.path.isfile(fn): 11 | #print fn 12 | return init_mel 13 | 14 | y, sr = librosa.load(fn, sr) 15 | 16 | if len(y) < 4410 : 17 | print fn 18 | print len(y) 19 | return init_mel 20 | 21 | 22 | S = librosa.feature.melspectrogram(y, sr=sr, n_fft=ws, 23 | hop_length=ws/4, n_mels=mel) 24 | ss = S.shape[1] 25 | if fea_len == ss: 26 | init_mel = S.copy() 27 | if fea_len < ss: 28 | init_mel = S[:,:fea_len].copy() 29 | if fea_len > ss: 30 | init_mel[:, :ss] = S.copy() 31 | 32 | init_mel = np.log(1 + lv * init_mel) 33 | 34 | return init_mel 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /MusicAutoTagging/pre/ext_wav.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import os 4 | 5 | def mel(fn, sr=44100, sec=10, lv=10000, ws=2048, mel=128): 6 | 7 | fea_len = int((sr*sec/(ws/4)))+1 8 | init_mel = np.zeros((mel, fea_len)) 9 | 10 | if not os.path.isfile(fn): 11 | #print fn 12 | return init_mel 13 | 14 | y, sr = librosa.load(fn, sr) 15 | 16 | return y 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /MusicAutoTagging/pre/gen_ASmusic_data.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import subprocess as subp 4 | import multiprocessing 5 | import ext 6 | import functools 7 | 8 | def run_mp(args, li): 9 | lv, sec, sr, ws, mel, evl = args 10 | i = li[:-1].replace('"','').replace(' ','').split(',') 11 | wav_fn = wav_fp[evl] + 'Y'+i[0]+'_'+i[1]+'_'+i[2]+'.wav' 12 | 13 | return ext.mel(wav_fn, lv=lv, sec=sec, sr=sr, ws=ws, mel=mel) 14 | 15 | def get_tag(li): 16 | i = li[:-1].replace('"','').replace(' ','').split(',') 17 | 18 | # get Y tag 19 | Y = np.zeros(len(cl_dict)) 20 | for tag in i[3:]: 21 | Y[cl_dict[tag]] = 1 22 | 23 | return Y 24 | 25 | music_mood = np.arange(276,283) 26 | music_genre = np.arange(216,265) 27 | music = np.append(music_mood, music_genre) 28 | 29 | # load data 30 | as_csv = {} 31 | with open('/home/fearofchou/ND/m189/max/FCNN_torch/pre/csv/balanced_train_segments.csv', 'r') as f: 32 | as_csv['bala_valid'] = f.readlines() 33 | with open('/home/fearofchou/ND/m189/max/FCNN_torch/pre/csv/unbalanced_train_segments.csv', 'r') as f: 34 | as_csv['unbala_train'] = f.readlines() 35 | with open('/home/fearofchou/ND/m189/max/FCNN_torch/pre/csv/eval_segments.csv', 'r') as f: 36 | as_csv['eval_test'] = f.readlines() 37 | with open('/home/fearofchou/ND/m189/max/FCNN_torch/pre/csv/class_labels_indices.csv', 'r') as f: 38 | cl = f.readlines() 39 | 40 | # get tag dict 41 | cl_dict = {} 42 | for i in cl[1:]: 43 | i= i.split(',') 44 | iid = i[0] 45 | mask_id = i[1] 46 | cl_dict[mask_id] = int(iid) 47 | 48 | # find wav fn 49 | wav_fp = {} 50 | wav_fp['bala_valid'] = '/home/fearofchou/ND/data2/audioset_sy/audio/balance/' 51 | wav_fp['eval_test'] = '/home/fearofchou/ND/data2/audioset_sy/audio/Test/' 52 | wav_fp['unbala_train'] = '/home/fearofchou/ND/data2/audioset_sy/audio/seg/' 53 | 54 | for i in wav_fp.keys(): 55 | print i 56 | file_list = np.unique(as_csv[i])[3:] 57 | args = [10000, 10, 44100, 2048, 128, i] 58 | f_run_mp = functools.partial(run_mp, args) 59 | 60 | P = multiprocessing.Pool(20) 61 | Y = np.array(P.map(get_tag, file_list[:])) 62 | P.close() 63 | P.join() 64 | 65 | idx = np.arange(len(Y))[Y[:,music].sum(1)!=0] 66 | print 'number of files %d'%(len(idx)) 67 | X = np.zeros((len(file_list[idx]), 128, 862)) 68 | 69 | for i in xrange( (X.shape[0]/1000) +1): 70 | print 'load batch index %d %d'%(i, X.shape[0]/1000) 71 | P = multiprocessing.Pool(20) 72 | tmp_X = np.array(P.map(f_run_mp, file_list[idx[i*1000:(i+1)*1000]])) 73 | P.close() 74 | P.join() 75 | X[i*1000:(i+1)*1000] = tmp_X 76 | 77 | len_X = X.sum(1).sum(1) 78 | X_idx = np.arange(len(file_list[idx]))[len_X!=0] 79 | 80 | print 'Save file at ../data/AS_music_%s'%(args[-1]) 81 | c = Y[idx][X_idx][:,music] 82 | 83 | np.save('../data/ASmusic_%s_Y'%(args[-1]), Y[idx][X_idx][:,music]) 84 | np.save('../data/ASmusic_%s_X'%(args[-1]), X[X_idx]) 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /MusicAutoTagging/pre/te_ext.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import os 4 | 5 | def mel(fn, sr=44100, sec=10, lv=10000, ws=2048, mel=128): 6 | 7 | fea_len = int((sr*sec/(ws/4)))+1 8 | 9 | y, sr = librosa.load(fn, sr) 10 | S = librosa.feature.melspectrogram(y, sr=sr, n_fft=ws, 11 | hop_length=ws/4, n_mels=mel) 12 | return np.log(1 + lv * S).astype('float32') 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- 
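As a quick sanity check of the extraction contract implemented by `ext.mel` (used by `gen_ASmusic_data.py` above), here is a minimal sketch; the file path is hypothetical. With the defaults `sr=44100`, `sec=10`, `ws=2048` and hop `ws/4 = 512`, every clip maps to a fixed-size log-mel matrix of 128 bands by `int(44100*10/512) + 1 = 862` frames, which matches the `(128, 862)` buffers allocated in `gen_ASmusic_data.py`; shorter or missing audio is zero-padded.

```python
# Minimal sketch (hypothetical file path); requires librosa, as in ext.py.
import ext

S = ext.mel('/path/to/some_clip.wav')  # returns an all-zero matrix if the file is missing
assert S.shape == (128, 862)           # 128 mel bands x 862 frames (10 s @ 44.1 kHz, hop 512)
print('log-mel range: %.2f .. %.2f' % (S.min(), S.max()))
```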
/README.md: -------------------------------------------------------------------------------- 1 | # M&mnet (A CNN-based Network for Audio Recognition) 2 | PyTorch implementation of [Learning to Recognize Transient Sound Events Using Attentional Supervision] 3 | 4 | 5 | ## Citation 6 | If you use M&mnet in your research, please cite our paper 7 | 8 | @article{Chou2018mmnet, 9 | title={Learning to Recognize Transient Sound Events Using Attentional Supervision}, 10 | author={Szu-Yu Chou and Jyh-Shing Roger Jang and Yi-Hsuan Yang}, 11 | journal={in Proc. Int. Joint Conf. Artificial Intelligence (IJCAI)}, 12 | year={2018} 13 | } 14 | 15 | ## Requirements 16 | * Python 2.7 17 | * LibROSA 0.6.0 18 | * PyTorch 0.4.0 19 | * cuda-8.0 20 | * Download [PyTorch 0.4.0 for Python 2.7](https://pytorch.org) 21 | ```bash 22 | pip install http://download.pytorch.org/whl/cu90/torch-0.4.0-cp27-cp27mu-linux_x86_64.whl 23 | pip install torchvision 24 | ``` 25 | 26 | ## Download Task4_DCASE2017_dataset 27 | * [Training Set](https://drive.google.com/file/d/1HOQaUHbTgCRsS6Sr9I9uE6uCjiNPC3d3/view) 28 | * [Testing Set](https://drive.google.com/file/d/1GfP5JATSmCqD8p3CBIkk1J90mfJuPI-k/view) 29 | * [Password](https://groups.google.com/forum/#!searchin/dcase-discussions/own%7Csort:relevance/dcase-discussions/Lk2dTScX3A8/kvW17tlzAgAJ) 30 | 31 | 32 | ## Usage 33 | Note: You need to modify the dataset path before you run the code. 34 | 35 | $ git clone https://github.com/fearofchou/mmnet.git 36 | $ python main.py 37 | 38 | ## Pre-trained models 39 | Model |Dataset | 40 | :----:|:--------:| 41 | [M&mnet](https://drive.google.com/file/d/1cdaQNltci_9namelgMS3Vjc16kF8g8A9/view?usp=sharing)|[DCASE2017-Task4](http://www.cs.tut.fi/sgn/arg/dcase2017/challenge/task-large-scale-sound-event-detection) 42 | [M&mnet (Coming soon)](https://github.com/fearofchou/mmnet)|[AudioSet-2M](https://research.google.com/audioset/) 43 | 44 | -------------------------------------------------------------------------------- /Trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.utils.data import Dataset 7 | 8 | import time 9 | import sys 10 | import os 11 | from evaluator import * 12 | 13 | class HDF52Torch(Dataset): 14 | def __init__(self, X, Y): 15 | self.X = X 16 | self.Y = Y 17 | 18 | def __getitem__(self, index): 19 | mX = torch.from_numpy(self.X[index]) 20 | mY = torch.from_numpy(self.Y[index]) 21 | return mX, mY 22 | 23 | def __len__(self): 24 | return len(self.X) 25 | 26 | def model_init(m): 27 | classname = m.__class__.__name__ 28 | if classname.find('Conv') != -1: 29 | init.xavier_uniform_(m.weight, gain=np.sqrt(2)) 30 | init.constant_(m.bias, 0) 31 | elif classname.find('BatchNorm') != -1: 32 | init.constant_(m.weight, 1) 33 | init.constant_(m.bias, 0) 34 | elif classname.find('Linear') != -1: 35 | init.xavier_uniform_(m.weight, gain=np.sqrt(2)) 36 | init.constant_(m.bias, 0) 37 | 38 | def show_model_params(model): 39 | params = 0 40 | for i in model.parameters(): 41 | params += i.view(-1).size()[0] 42 | print 'Model:' + model.module.model_name + '\t#params:%d'%(params) 43 | 44 | 45 | class Trainer: 46 | def __init__(self, data, model, args): 47 | self.model = model 48 | self.args = args 49 | 50 | # data builder 51 | data_args = {'batch_size': args.bs, 'num_workers': 0, 'pin_memory': True} 52 | self.tr_loader = 
53 |                         shuffle=True, drop_last=True, **data_args)
54 |         self.va_loader = torch.utils.data.DataLoader(HDF52Torch(data['Xte'], data['Yte']),
55 |                         **data_args)
56 | 
57 |         # to evaluate on the official evaluation set, build a data loader for it as well
58 |         # self.te_loader = torch.utils.data.DataLoader(HDF52Torch(data['Xte'][:], data['Yte'][:]),
59 |         #**data_args)
60 | 
61 |         # load avg and std for Z-score
62 |         Xavg = torch.from_numpy(data['Xtr_avg_std'][0].astype('float32'))
63 |         Xstd = torch.from_numpy(data['Xtr_avg_std'][1].astype('float32'))
64 |         self.Xavg, self.Xstd = Variable(Xavg.cuda()), Variable(Xstd.cuda())
65 | 
66 |         # per-class loss weight (PCLW); see:
67 |         # http://www.cs.tut.fi/sgn/arg/dcase2017/documents/workshop_presentations/the_story_of_audioset.pdf
68 |         class_prior = data['Ytr'][:].sum(0) / float(data['Ytr'][:].sum())
69 |         mean_prior = class_prior.mean()
70 |         PCLW = ( (mean_prior/ class_prior) * ((1-mean_prior)/(1-class_prior)) )**args.beta
71 |         self.PCLW = torch.from_numpy(PCLW.astype('float32')).cuda()
72 |         print self.PCLW
73 |         self.show_dataset_model_params()
74 |         self.load_pretrained_model()
75 | 
76 |     def load_pretrained_model(self):
77 |         # load pre-trained weights when a model path is given
78 |         if os.path.exists(self.args.pmp):
79 |             pretrained_model = torch.load(self.args.pmp)
80 |             model_param = self.model.state_dict()
81 |             for k in pretrained_model['state_dict'].keys():
82 |                 try:
83 |                     model_param[k].copy_(pretrained_model['state_dict'][k])
84 |                 except:
85 |                     print '[ERROR] Failed to load pre-trained model; re-initializing weights'
86 |                     self.model.apply(model_init)
87 |                     break
88 |             print 'Load Pre-trained Model : ' + self.args.pmp
89 | 
90 |         else:
91 |             print 'Learning from scratch'
92 |             self.model.apply(model_init)
93 | 
94 | 
95 |     def show_dataset_model_params(self):
96 |         # show model structure
97 |         print self.model
98 | 
99 |         # show params (the function prints the count itself)
100 |         show_model_params(self.model)
101 | 
102 |         # show the size of training, validation and test set
103 |         print 'Dataset : ' + self.args.dn
104 |         print 'Xtr->' + str(self.tr_loader.dataset.X.shape) + '\t\tYtr->' + str(self.tr_loader.dataset.Y.shape)
105 |         print 'Xva->' + str(self.va_loader.dataset.X.shape) + '\t\tYva->' + str(self.va_loader.dataset.Y.shape)
106 |         #print 'Xte->' + str(self.te_loader.dataset.X.shape) + '\t\tYte->' + str(self.te_loader.dataset.Y.shape)
107 | 
108 |     def mm_loss(self, target, macro_out, micro_out):
109 | 
110 |         #tar = target.data
111 |         target = target.float()
112 |         we = self.PCLW
113 |         wwe = self.args.gw
114 |         we = we * wwe   # multiply into a copy; an in-place *= would rescale self.PCLW on every call
115 |         loss = 0
116 |         we = we.view(1,-1).repeat(target.size(0), 1)
117 | 
118 |         twe = we * target + (1 - target)*wwe
119 | 
120 |         loss_fn = torch.nn.BCEWithLogitsLoss(weight=twe, size_average=True)
121 |         loss += loss_fn(macro_out, target)
122 | 
123 |         for att_sc, det in micro_out:
124 |             ds = det.size()   # micro detection map size: (batch, classes, h, w)
125 |             fl_target = target.view(ds[0], ds[1], 1, 1).repeat(1,1,ds[2],ds[3])
126 |             twe = we.view(ds[0],ds[1],1,1).repeat(1, 1, ds[2], ds[3])
127 |             twe = att_sc.data * twe * fl_target + (1 - fl_target) * wwe
128 |             # Note: att_sc.data is required so no gradient flows through the attention scores
129 | 
130 |             loss_fn = torch.nn.BCEWithLogitsLoss(weight=twe, size_average=True)
131 |             loss += loss_fn(det, fl_target)
132 | 
133 |         return loss
134 | 
135 | 
136 |     def predictor(self, loader):
137 |         st = time.time()
138 |         all_pred = []
139 |         self.model.eval()
140 |         for data, target in loader:
141 |             with torch.no_grad():
142 |                 data, target = Variable(data.cuda()), Variable(target.cuda())
143 |                 clip_out, _ = self.model(data, self.Xavg, self.Xstd)
144 |                 all_pred.extend(F.sigmoid(clip_out).data.cpu().numpy())
145 | 
146 |         print 'Prediction Time: %.1f'%(time.time() - st)
147 |         return np.array(all_pred)
148 | 
149 |     def fit(self):
150 |         st = time.time()
151 |         save_dict = {}
152 |         for e in xrange(1, self.args.ep+1):
153 | 
154 |             # set optimizer (SGD)
155 |             lr = self.args.lr ** ((e/(self.args.lrde))+1)   # integer division: with lr=0.1 this divides the rate by 10 every lrde epochs
156 |             print '\n==> Training Epoch #%d lr=%.4f'%(e, lr)
157 |             self.optimizer = optim.SGD(self.model.parameters(),
158 |                             lr=lr, momentum=self.args.mom, weight_decay=self.args.wd)
159 | 
160 |             # Training
161 |             for batch_idx, (data, target) in enumerate(self.tr_loader):
162 |                 data, target = Variable(data.cuda()), Variable(target.cuda())
163 |                 self.model.train()
164 | 
165 |                 macro_out, micro_out = self.model(data, self.Xavg, self.Xstd)
166 |                 loss = self.mm_loss(target, macro_out, micro_out)
167 |                 self.optimizer.zero_grad()
168 |                 loss.backward()
169 |                 self.optimizer.step()
170 | 
171 |                 # print training epoch, training loss and training time
172 |                 sys.stdout.write('\r')
173 |                 sys.stdout.write('| Epoch [%3d/%3d] Iter[%4d/%4d]\tLoss %.4f\tTime %d'
174 |                         %(e, self.args.ep, batch_idx+1, len(self.tr_loader),
175 |                         loss.item(), time.time() - st))
176 |                 sys.stdout.flush()
177 |             print '\n'
178 | 
179 |             # evaluation
180 |             all_pred = self.predictor(self.va_loader)
181 |             va_class_threshold, _, va_out = evl(self.va_loader.dataset.Y, all_pred)
182 | 
183 |             save_dict['state_dict'] = self.model.state_dict()
184 |             save_dict['tr_loss'] = loss.item()   # store a plain float rather than the CUDA tensor
185 |             save_dict['va_out'] = va_out
186 |             save_dict['va_class_threshold'] = va_class_threshold
187 | 
188 | 
189 |             # test on evaluation set and save the results
190 |             ##########################
191 |             #all_pred = self.predictor(self.te_loader)
192 |             #_, te_result_pre_class, te_out = evl(self.te_loader.dataset.Y, all_pred, va_th=va_class_threshold)
193 |             #save_dict['te_out'] = te_out
194 |             #save_dict['te_result_pre_class'] = te_result_pre_class
195 |             ##########################
196 | 
197 | 
198 |             directory = './data/model/%s'%(self.model.module.model_name)
199 |             if not os.path.exists(directory):
200 |                 os.makedirs(directory)
201 | 
202 |             torch.save(save_dict, directory +'/epoch_%d'%(e))
203 | 
204 | 
205 | 
-------------------------------------------------------------------------------- /data/groundtruth_weak_label_testing_set.csv: --------------------------------------------------------------------------------
1 | -5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train horn
2 | -E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train horn
3 | -GCwoyCnYsY_0.000_10.000.wav 0.000 10.000 Train horn
4 | -Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train horn
5 | -Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train horn
6 | -Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train horn
7 | -Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train horn
8 | -jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Train horn
9 | -nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train horn
10 | -u9BxBNcrw4_30.000_40.000.wav 30.000 40.000 Train horn
11 | -zqW9xCZd80_260.000_270.000.wav 260.000 270.000 Train horn
12 | 02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train horn
13 | 0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train horn
14 | 0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train horn
15 | 0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train horn
16 | 0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train horn
17 | 0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train horn
18 | 10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train horn
19 | 1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train horn
20 | 
1S5WKCcf-wU_40.000_50.000.wav 40.000 50.000 Train horn 21 | 1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train horn 22 | 1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train horn 23 | 1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train horn 24 | 1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train horn 25 | 1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train horn 26 | 26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train horn 27 | 2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train horn 28 | 2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train horn 29 | 2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train horn 30 | 2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train horn 31 | -8baTnilyjs_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 32 | -Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 33 | -jG26jT3fP8_230.000_240.000.wav 230.000 240.000 Air horn, truck horn 34 | -jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Air horn, truck horn 35 | -v7cUxke-f4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 36 | -yeWlsEpcpA_15.000_25.000.wav 15.000 25.000 Air horn, truck horn 37 | 04KOunVOkSA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 38 | 08y2LHhxmsM_400.000_410.000.wav 400.000 410.000 Air horn, truck horn 39 | 0G73yqtBwgE_11.000_21.000.wav 11.000 21.000 Air horn, truck horn 40 | 0UPY7ws-VFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn 41 | 0euD32aKYUs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn 42 | 1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 43 | 1iRgwn7p0DA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 44 | 1myTsHAIvYc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 45 | 1z0XoG6GEv4_420.000_430.000.wav 420.000 430.000 Air horn, truck horn 46 | 26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Air horn, truck horn 47 | 2KmSuPb9gwA_24.000_34.000.wav 24.000 34.000 Air horn, truck horn 48 | 2Vy5NCEkg2I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 49 | 2ZciT0XrifM_0.000_8.000.wav 0.000 8.000 Air horn, truck horn 50 | 2jOzX06bzuA_16.000_26.000.wav 16.000 26.000 Air horn, truck horn 51 | 35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 52 | 3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Air horn, truck horn 53 | 3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Air horn, truck horn 54 | 3rGOv4evODE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn 55 | 42U7xIucU68_20.000_30.000.wav 20.000 30.000 Air horn, truck horn 56 | 46r7mO2k6zY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 57 | 4EBnb2DN3Yg_13.000_23.000.wav 13.000 23.000 Air horn, truck horn 58 | 4NTjS5pFfSc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 59 | 4bvfOnX7BIE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 60 | 4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn 61 | -ajCLjpfGKI_83.000_93.000.wav 83.000 93.000 Car alarm 62 | -hLSc9aPOms_13.000_23.000.wav 13.000 23.000 Car alarm 63 | -rgDWfvxxqw_30.000_40.000.wav 30.000 40.000 Car alarm 64 | 0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Car alarm 65 | 0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car alarm 66 | 0ZPafgZftWk_80.000_90.000.wav 80.000 90.000 Car alarm 67 | 0npLQ4LzD0c_40.000_50.000.wav 40.000 50.000 Car alarm 68 | 17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Car alarm 69 | 3HxQ83IMyw4_70.000_80.000.wav 70.000 80.000 Car alarm 70 | 3z05luLEc_Q_0.000_10.000.wav 0.000 10.000 Car alarm 71 | 4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Car alarm 72 | 4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car alarm 73 | 4h01lBkTVQY_18.000_28.000.wav 18.000 28.000 Car alarm 
74 | 5-SzZotiaBU_30.000_40.000.wav 30.000 40.000 Car alarm 75 | 54PbkldEp9M_30.000_40.000.wav 30.000 40.000 Car alarm 76 | 5P6YYsMaIH4_30.000_40.000.wav 30.000 40.000 Car alarm 77 | 5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car alarm 78 | 7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Car alarm 79 | 7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car alarm 80 | 7NZ0kMj2HSI_54.000_64.000.wav 54.000 64.000 Car alarm 81 | 7RQpt1_1ZzU_30.000_40.000.wav 30.000 40.000 Car alarm 82 | 7ee54nr6jG8_30.000_40.000.wav 30.000 40.000 Car alarm 83 | 8OajsyPSNt8_40.000_50.000.wav 40.000 50.000 Car alarm 84 | 9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car alarm 85 | 9fzeD7CeI7Y_110.000_120.000.wav 110.000 120.000 Car alarm 86 | 9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car alarm 87 | A-GNszKtjJc_93.000_103.000.wav 93.000 103.000 Car alarm 88 | A437a4Y_xag_230.000_240.000.wav 230.000 240.000 Car alarm 89 | APMPW2YI-Zk_20.000_30.000.wav 20.000 30.000 Car alarm 90 | AR-KmtlXg4Y_70.000_80.000.wav 70.000 80.000 Car alarm 91 | -60XojQWWoc_30.000_40.000.wav 30.000 40.000 Reversing beeps 92 | -6d-zxMvC5E_30.000_40.000.wav 30.000 40.000 Reversing beeps 93 | -6qSMlbJJ58_30.000_40.000.wav 30.000 40.000 Reversing beeps 94 | -8OITuFZha8_30.000_40.000.wav 30.000 40.000 Reversing beeps 95 | -8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Reversing beeps 96 | -AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Reversing beeps 97 | -AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Reversing beeps 98 | -AXDeY-N2_M_30.000_40.000.wav 30.000 40.000 Reversing beeps 99 | -B1uzsLG0Dk_30.000_40.000.wav 30.000 40.000 Reversing beeps 100 | -BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Reversing beeps 101 | -Em3OpyaefM_30.000_40.000.wav 30.000 40.000 Reversing beeps 102 | -FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Reversing beeps 103 | -SP7KWmTRUU_30.000_40.000.wav 30.000 40.000 Reversing beeps 104 | -h4or05bj_I_30.000_40.000.wav 30.000 40.000 Reversing beeps 105 | -oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Reversing beeps 106 | -r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Reversing beeps 107 | -s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Reversing beeps 108 | -uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Reversing beeps 109 | -x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Reversing beeps 110 | -xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Reversing beeps 111 | -zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Reversing beeps 112 | 03xMfqt4fZI_24.000_34.000.wav 24.000 34.000 Reversing beeps 113 | 0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Reversing beeps 114 | 0FQo-2xRJ0E_30.000_40.000.wav 30.000 40.000 Reversing beeps 115 | 0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Reversing beeps 116 | 0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Reversing beeps 117 | 0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Reversing beeps 118 | 0P-YGHC5cBU_30.000_40.000.wav 30.000 40.000 Reversing beeps 119 | 0QKet-tdquc_30.000_40.000.wav 30.000 40.000 Reversing beeps 120 | 0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Reversing beeps 121 | -5px8DVPl8A_28.000_38.000.wav 28.000 38.000 Bicycle 122 | -D08wyQwDPQ_10.000_20.000.wav 10.000 20.000 Bicycle 123 | -F1_Gh78vJ0_30.000_40.000.wav 30.000 40.000 Bicycle 124 | -FZQIkX44Pk_10.000_20.000.wav 10.000 20.000 Bicycle 125 | -FsvS99nWTc_30.000_40.000.wav 30.000 40.000 Bicycle 126 | -Holdef_BZ0_30.000_40.000.wav 30.000 40.000 Bicycle 127 | -Inn26beF70_30.000_40.000.wav 30.000 40.000 Bicycle 128 | -Jq9HNSs_ns_14.000_24.000.wav 14.000 24.000 Bicycle 129 | -KlN_AXMM0Q_30.000_40.000.wav 30.000 40.000 Bicycle 130 | -NCcqKWiGus_30.000_40.000.wav 
30.000 40.000 Bicycle 131 | -NNC_TqWfGw_30.000_40.000.wav 30.000 40.000 Bicycle 132 | -OGFiXvmldM_30.000_40.000.wav 30.000 40.000 Bicycle 133 | -RFpDUZhN-g_13.000_23.000.wav 13.000 23.000 Bicycle 134 | -XUfeRTw3b4_0.000_6.000.wav 0.000 6.000 Bicycle 135 | -XoATxJ-Qcg_30.000_40.000.wav 30.000 40.000 Bicycle 136 | -bFNxvFwDts_470.000_480.000.wav 470.000 480.000 Bicycle 137 | -e5PokL6Cyo_30.000_40.000.wav 30.000 40.000 Bicycle 138 | -fNyOf9zIU0_30.000_40.000.wav 30.000 40.000 Bicycle 139 | -fhpkRyZL90_30.000_40.000.wav 30.000 40.000 Bicycle 140 | -fo3m0hiZbg_30.000_40.000.wav 30.000 40.000 Bicycle 141 | -ikJkNwcmkA_27.000_37.000.wav 27.000 37.000 Bicycle 142 | -k2nMcxAjWE_30.000_40.000.wav 30.000 40.000 Bicycle 143 | -k80ibA-fyw_30.000_40.000.wav 30.000 40.000 Bicycle 144 | -lBcEVa_NKw_30.000_40.000.wav 30.000 40.000 Bicycle 145 | -mQyAYU_Bd4_50.000_60.000.wav 50.000 60.000 Bicycle 146 | -ngrinYHF4c_30.000_40.000.wav 30.000 40.000 Bicycle 147 | -nqm_RJ2xj8_40.000_50.000.wav 40.000 50.000 Bicycle 148 | -oAw5iTeT1g_40.000_50.000.wav 40.000 50.000 Bicycle 149 | -p2EMzpTE38_4.000_14.000.wav 4.000 14.000 Bicycle 150 | -qmfWP_yzn4_30.000_40.000.wav 30.000 40.000 Bicycle 151 | -0DIFwkUpjQ_50.000_60.000.wav 50.000 60.000 Skateboard 152 | -53qltVyjpc_180.000_190.000.wav 180.000 190.000 Skateboard 153 | -5y4jb9eUWs_110.000_120.000.wav 110.000 120.000 Skateboard 154 | -81kolkG8M0_0.000_8.000.wav 0.000 8.000 Skateboard 155 | -9dwTSq6JZg_70.000_80.000.wav 70.000 80.000 Skateboard 156 | -9oKZsjjf_0_20.000_30.000.wav 20.000 30.000 Skateboard 157 | -AFGfu5zOzQ_30.000_40.000.wav 30.000 40.000 Skateboard 158 | -DHGwygUsQc_30.000_40.000.wav 30.000 40.000 Skateboard 159 | -DkuTmIs7_Q_30.000_40.000.wav 30.000 40.000 Skateboard 160 | -E1E17R7UBA_260.000_270.000.wav 260.000 270.000 Skateboard 161 | -E1aIXhB4YU_30.000_40.000.wav 30.000 40.000 Skateboard 162 | -McJLXNN3-o_50.000_60.000.wav 50.000 60.000 Skateboard 163 | -N7nQ4CXGsY_170.000_180.000.wav 170.000 180.000 Skateboard 164 | -O5vrHFRzcY_30.000_40.000.wav 30.000 40.000 Skateboard 165 | -Plh9jAN_Eo_0.000_2.000.wav 0.000 2.000 Skateboard 166 | -Qd_dXTbgK0_30.000_40.000.wav 30.000 40.000 Skateboard 167 | -aVZ-H92M_s_0.000_4.000.wav 0.000 4.000 Skateboard 168 | -cd-Zn8qFxU_90.000_100.000.wav 90.000 100.000 Skateboard 169 | -esP4loyvjM_60.000_70.000.wav 60.000 70.000 Skateboard 170 | -iB3a71aPew_30.000_40.000.wav 30.000 40.000 Skateboard 171 | -lZapwtvwlg_0.000_10.000.wav 0.000 10.000 Skateboard 172 | -mxMaMJCXL8_180.000_190.000.wav 180.000 190.000 Skateboard 173 | -nYGTw9Sypg_20.000_30.000.wav 20.000 30.000 Skateboard 174 | -oS19KshdlM_30.000_40.000.wav 30.000 40.000 Skateboard 175 | -s6uxc77NWo_40.000_50.000.wav 40.000 50.000 Skateboard 176 | -sCrXS2kJlA_30.000_40.000.wav 30.000 40.000 Skateboard 177 | -saCvPTdQ7s_30.000_40.000.wav 30.000 40.000 Skateboard 178 | -sb-knLiDic_20.000_30.000.wav 20.000 30.000 Skateboard 179 | -tSwRvqaKWg_90.000_100.000.wav 90.000 100.000 Skateboard 180 | -x_jV34hVq4_30.000_40.000.wav 30.000 40.000 Skateboard 181 | --ljM2Kojag_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 182 | -4F1TX-T6T4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 183 | -7HVWUwyMig_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 184 | -9pUUT-6o8U_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 185 | -Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Ambulance (siren) 186 | -LGTb-xyjzA_11.000_21.000.wav 11.000 21.000 Ambulance (siren) 187 | -Y1qiiugnk8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 188 | -YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 
Ambulance (siren) 189 | -ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren) 190 | -d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren) 191 | -dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren) 192 | -fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren) 193 | -fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren) 194 | -hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren) 195 | -jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 196 | -k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 197 | -kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren) 198 | -lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 199 | -od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 200 | -pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 201 | -w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 202 | -woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren) 203 | -xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren) 204 | -yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 205 | -yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 206 | -z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 207 | 00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 208 | 02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren) 209 | 06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren) 210 | 0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 211 | -0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren) 212 | -2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 213 | -45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 214 | -4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren) 215 | -6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren) 216 | -8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 217 | -Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 218 | -KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren) 219 | -PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren) 220 | -QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 221 | -QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren) 222 | -RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 223 | -SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 224 | -T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren) 225 | -USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 226 | -X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 227 | -Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 228 | -ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 229 | -cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren) 230 | -cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren) 231 | -eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 232 | -hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren) 233 | -hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren) 234 | -nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 235 | -oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire 
truck (siren) 236 | -oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 237 | -pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren) 238 | -qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 239 | -sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 240 | -sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 241 | -09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren 242 | -3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren 243 | -4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren 244 | -60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren 245 | -6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren 246 | -6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren 247 | -6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren 248 | -75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren 249 | -E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren 250 | -FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren 251 | -JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren 252 | -OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren 253 | -S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren 254 | -YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren 255 | -c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren 256 | -j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren 257 | -t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren 258 | -uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren 259 | -wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren 260 | -zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren 261 | 03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren 262 | 0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren 263 | 06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren 264 | 0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren 265 | 0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren 266 | 0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren 267 | 0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren 268 | 0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren 269 | 0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren 270 | 0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren 271 | -122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren) 272 | -1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren) 273 | -2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren) 274 | -6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren) 275 | -AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren) 276 | -AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren) 277 | -F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren) 278 | -GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren) 279 | -Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren) 280 | -KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren) 281 | -MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren) 282 | -Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren) 283 | -UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren) 284 | -VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren) 285 | -XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police 
car (siren) 286 | -XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren) 287 | -YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren) 288 | -ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren) 289 | -_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren) 290 | -az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren) 291 | -bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren) 292 | -dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren) 293 | -gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren) 294 | -hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Police car (siren) 295 | -haSUR_IUto_30.000_40.000.wav 30.000 40.000 Police car (siren) 296 | -l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren) 297 | -lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren) 298 | -lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren) 299 | -rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren) 300 | -rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren) 301 | -20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming 302 | -3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming 303 | -4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming 304 | -7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming 305 | -7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming 306 | -GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming 307 | -MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming 308 | -WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming 309 | -YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming 310 | -g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming 311 | -gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming 312 | -iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming 313 | -nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming 314 | -pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming 315 | -pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming 316 | -sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming 317 | -uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming 318 | -vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming 319 | 0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming 320 | 09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming 321 | 0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming 322 | 0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming 323 | 0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming 324 | 0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming 325 | 0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming 326 | 0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming 327 | 0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming 328 | 0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming 329 | 0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming 330 | 0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming 331 | ---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car 332 | --330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car 333 | --8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car 334 | --9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car 335 | --F70LWypIg_30.000_40.000.wav 30.000 40.000 Car 336 | --P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car 337 | --QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car 338 | --SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car 339 | --Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car 340 | --e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car 341 | --iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car 342 | --jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car 343 | --jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car 344 | 
--v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car 345 | --xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car 346 | --yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car 347 | --zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car 348 | -0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car 349 | -03ld83JliM_29.000_39.000.wav 29.000 39.000 Car 350 | -0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car 351 | -0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car 352 | -0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car 353 | -0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car 354 | -0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car 355 | -0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car 356 | -0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car 357 | -0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car 358 | -0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car 359 | -0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car 360 | -0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car 361 | ---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car passing by 362 | --P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car passing by 363 | --xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by 364 | --zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by 365 | --zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by 366 | -0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by 367 | -0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by 368 | -0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by 369 | -0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by 370 | -10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by 371 | -14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by 372 | -15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by 373 | -19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by 374 | -1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by 375 | -1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by 376 | -1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by 377 | -1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by 378 | -2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by 379 | -21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by 380 | -25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by 381 | -2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by 382 | -2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by 383 | -2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by 384 | -31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by 385 | -35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by 386 | -3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by 387 | -3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by 388 | -3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by 389 | -3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by 390 | -3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by 391 | --0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus 392 | -0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus 393 | -5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus 394 | -5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus 395 | -79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus 396 | -7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus 397 | -8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus 398 | -93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus 399 | -9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus 400 | -9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus 401 | -AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus 402 | -AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus 403 | -BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus 404 | 
-CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus 405 | -CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus 406 | -CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus 407 | -DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus 408 | -FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus 409 | -Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus 410 | -GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus 411 | -Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus 412 | -H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus 413 | -HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus 414 | -HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus 415 | -I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus 416 | -II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus 417 | -LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus 418 | -OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus 419 | -P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus 420 | -PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus 421 | --12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck 422 | --73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck 423 | --J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck 424 | --bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck 425 | --ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck 426 | --wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck 427 | -0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck 428 | -0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck 429 | -0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck 430 | -0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck 431 | -0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck 432 | -0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck 433 | -14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck 434 | -1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck 435 | -1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck 436 | -1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck 437 | -1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck 438 | -1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck 439 | -26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck 440 | -2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck 441 | -2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck 442 | -2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck 443 | -2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck 444 | -2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck 445 | -2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck 446 | -39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck 447 | -3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck 448 | -3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck 449 | -3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck 450 | -3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck 451 | --p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle 452 | -1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle 453 | -1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle 454 | -3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle 455 | -4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle 456 | -4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle 457 | -5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle 458 | -6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle 459 | -6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle 460 | -7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle 461 | -7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle 462 | -85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle 463 | -9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle 464 | -9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle 465 | -BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle 466 | -DdiduB5B_w_190.000_200.000.wav 190.000 
200.000 Motorcycle 467 | -HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle 468 | -H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle 469 | -HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle 470 | -IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle 471 | -Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle 472 | -J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle 473 | -KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle 474 | -KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle 475 | -P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle 476 | -QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle 477 | -S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle 478 | -SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle 479 | -US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle 480 | -VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle 481 | --H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train 482 | -1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train 483 | -1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train 484 | -3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train 485 | -5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train 486 | -6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train 487 | -97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train 488 | -9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train 489 | -CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train 490 | -E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train 491 | -Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train 492 | -JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train 493 | -K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train 494 | -KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train 495 | -NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train 496 | -PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train 497 | -P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train 498 | -Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train 499 | -QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train 500 | -Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train 501 | -RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train 502 | -VH414svzI0_30.000_40.000.wav 30.000 40.000 Train 503 | -WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train 504 | -Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train 505 | -XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train 506 | -Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train 507 | -ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train 508 | -Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train 509 | -aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train 510 | -abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train 511 | 1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 512 | 7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren) 513 | -z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 514 | 00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 515 | 0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 516 | 3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren) 517 | 4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 518 | 35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren 519 | 06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren) 520 | 0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren) 521 | 0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren) 522 | 17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren) 523 | 4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren) 524 | 
-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car 525 | -122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car 526 | -14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car 527 | -1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car 528 | -1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car 529 | -1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car 530 | -1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car 531 | -25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car 532 | -2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car 533 | -2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car 534 | -2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car 535 | -2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car 536 | -31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car 537 | -35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car 538 | -3929cmVE20_30.000_40.000.wav 30.000 40.000 Car 539 | -3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car 540 | -3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car 541 | -3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car 542 | -AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car 543 | -Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car 544 | -VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car 545 | -cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car 546 | 06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car 547 | 0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car 548 | 0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car 549 | 4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car 550 | 5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car 551 | 7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car 552 | 9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car 553 | 9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car 554 | -l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by 555 | 9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by 556 | -jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus 557 | -45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck 558 | -4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck 559 | -60XojQWWoc_30.000_40.000.wav 30.000 40.000 Truck 560 | -6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Truck 561 | -8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck 562 | -8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck 563 | -AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck 564 | -AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck 565 | -BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck 566 | -Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck 567 | -FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck 568 | -Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck 569 | -PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck 570 | -X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck 571 | -cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck 572 | -oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck 573 | -oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck 574 | -qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck 575 | -r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck 576 | -s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck 577 | -uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck 578 | -x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck 579 | -xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck 580 | -zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck 581 | 0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck 582 | 0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck 583 | 0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck 584 | 0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck 585 | 0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck 586 | 3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck 587 | -nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train 588 | 
02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train 589 | 0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train 590 | 0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train 591 | 0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train 592 | 0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train 593 | 0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train 594 | 10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train 595 | 1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train 596 | 1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train 597 | 1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train 598 | 1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train 599 | 1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train 600 | 1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train 601 | 26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train 602 | 2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train 603 | 2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train 604 | 2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train 605 | 2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train 606 | 3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train 607 | -------------------------------------------------------------------------------- /data/sound_event_list_17_classes.txt: -------------------------------------------------------------------------------- 1 | Class_ID ConfidenceofAnnotation(by Google) Class_Label 2 | /m/0284vy3 10 Train horn 3 | /m/05x_td 9 Air horn, truck horn 4 | /m/02mfyn 7 Car alarm 5 | /m/02rhddq 5 Reversing beeps 6 | /m/0199g 7 Bicycle 7 | /m/06_fw 8 Skateboard 8 | /m/012n7d 6 Ambulance (siren) 9 | /m/012ndj 8 Fire engine, fire truck (siren) 10 | /m/0dgbq 9 Civil defense siren 11 | /m/04qvtq 8 Police car (siren) 12 | /m/03qc9zr 8 Screaming 13 | /m/0k4j 10 Car 14 | /t/dd00134 5 Car passing by 15 | /m/01bjv 7 Bus 16 | /m/07r04 8 Truck 17 | /m/04_sv 9 Motorcycle 18 | /m/07jdr 6 Train 19 | -------------------------------------------------------------------------------- /data_generator.py: -------------------------------------------------------------------------------- 1 | from feature_extractor import * 2 | import multiprocessing as mu 3 | from functools import partial 4 | import h5py 5 | import time 6 | import os 7 | import sys 8 | 9 | def get_tr_set_avg_std(data): 10 | st = time.time() 11 | print 'Get std and average of training set' 12 | 13 | common_sum = 0 14 | square_sum = 0 15 | 16 | # data.shape -> (1, 128, 896) 17 | number_non_zero_len = 0 18 | for i in xrange(len(data)): 19 | # ignore zero padding 20 | number_non_zero_len += (data[i].sum((0,1))!=0).astype(int).sum() 21 | common_sum += data[i].sum(0).sum(-1) 22 | square_sum += (data[i]**2).sum(0).sum(-1) 23 | 24 | common_avg = common_sum / number_non_zero_len 25 | square_avg = square_sum / number_non_zero_len 26 | 27 | std = np.sqrt( square_avg - common_avg**2 ) 28 | print 'length of std' + str(std.shape) 29 | print time.time() - st 30 | return np.array([common_avg, std]) 31 | 32 | def get_csv_file(fn): 33 | with open(fn) as f: 34 | data = f.readlines() 35 | return data 36 | 37 | def get_tag2idx(fn): 38 | #idx2tag = {} 39 | tag2idx = {} 40 | classes = get_csv_file(fn) 41 | for idx, val in enumerate(classes[1:]): 42 | mask_tag_id, confidence, tag_name = val[:-1].split('\t') 43 | tag2idx[tag_name] = idx 44 | 45 | return tag2idx 46 | 47 | def get_fn2target(fn, tag2idx, wav_path): 48 | # target.shape -> (number of files, 17) binary vectors 49 | fn2target = {} 50 | for i in get_csv_file(fn): 51 | wav_fn, start_time, end_time, tag_name = i[:-1].split('\t') 52 | wav_fn = wav_path + '/Y' + wav_fn 
53 |             try:
54 |                 fn2target[wav_fn][tag2idx[tag_name]] = 1
55 |             except:   # first label for this file: create its multi-hot vector
56 |                 fn2target[wav_fn] = np.zeros(len(tag2idx))
57 |                 fn2target[wav_fn][tag2idx[tag_name]] = 1
58 |     return fn2target
59 | 
60 | def get_create_h5_data(h5, set_name, data_len, num_classes, args):
61 |     max_time_len = int((args.sr*args.msc)/args.hs)
62 |     X = h5.create_dataset('X'+set_name,
63 |             shape=(data_len, 1, args.mel, max_time_len),
64 |             maxshape=(None, 1, args.mel, max_time_len),
65 |             chunks=(1, 1, args.mel, max_time_len), dtype='float32')
66 |     Y = h5.create_dataset('Y'+set_name, (data_len, num_classes), dtype='int')
67 |     return X, Y
68 | 
69 | def get_data_fea_target_to_h5(h5, data2target, set_name, num_classes, args):
70 |     X, Y = get_create_h5_data(h5, set_name, len(data2target), num_classes, args)
71 | 
72 |     # put target (Y) to h5
73 |     file_list = np.array(data2target.keys())
74 |     Y[:] = np.array(data2target.values()).copy()
75 | 
76 |     # put mel spect (X) to h5
77 |     process_files = 0
78 |     multi_read_files = 488
79 |     st = time.time()
80 |     get_mel_spect_partial = partial(get_mel_spect, args=args)
81 |     for i in xrange(np.ceil(len(data2target)/float(multi_read_files)).astype(int)):
82 |         #P = mu.Pool(mu.cpu_count())
83 |         P = mu.Pool(30)
84 |         out = np.array(P.map(get_mel_spect_partial, file_list[i*multi_read_files:(i+1)*multi_read_files]))
85 |         P.close()
86 |         P.join()
87 |         X[i*multi_read_files:(i+1)*multi_read_files] = out
88 |         process_files += len(out)
89 |         sys.stdout.write('\r')
90 |         sys.stdout.write('Extract feature for each %s example [%7d/%7d] Time %d'
91 |                 %(set_name, process_files, len(file_list), time.time()-st))
92 |         sys.stdout.flush()
93 | 
94 |     #print '\n'
95 |     print '\nExtract feature for %s set: Done'%(set_name)
96 |     return X, Y
97 | 
98 | 
99 | def get_h5_dataset(h5_fn, tr_csv_fn, te_csv_fn, tr_wav_fp, te_wav_fp, classes_fn, args):
100 | 
101 |     # Create h5 file
102 |     if os.path.isfile(h5_fn):
103 |         print '[File Exists] Read file : %s'%(h5_fn)
104 |         return h5py.File(h5_fn, 'r')
105 | 
106 |     # get tag index
107 |     tag2idx = get_tag2idx(classes_fn)
108 |     target_len = len(tag2idx)
109 | 
110 |     tr2target = get_fn2target(tr_csv_fn, tag2idx, tr_wav_fp)
111 |     te2target = get_fn2target(te_csv_fn, tag2idx, te_wav_fp)
112 | 
113 |     h5f = h5py.File(h5_fn, 'w')
114 |     Xte, Yte = get_data_fea_target_to_h5(h5f, te2target, 'te', len(tag2idx), args)
115 |     Xtr, Ytr = get_data_fea_target_to_h5(h5f, tr2target, 'tr', len(tag2idx), args)
116 | 
117 |     # get training set std&avg
118 |     avg_std = h5f.create_dataset('Xtr_avg_std',
119 |             data=get_tr_set_avg_std(Xtr), dtype='float32')
120 |     h5f.close()
121 | 
122 |     print 'H5 File Path %s'%(h5_fn)
123 |     return h5py.File(h5_fn, 'r')
124 | 
125 | 
126 | 
-------------------------------------------------------------------------------- /evaluator.py: --------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | from sklearn.metrics import roc_auc_score
4 | from sklearn.metrics import average_precision_score
5 | import multiprocessing
6 | import functools
7 | 
8 | def class_F1_R_P(gru, pre, th):
9 |     best = np.zeros(4)
10 |     for t in th:
11 |         tidx = gru==1
12 |         vpred = pre.copy()
13 |         vpred[vpred > t] = 1
14 |         vpred[vpred <= t] = 0
15 | 
16 |         TP = vpred[tidx].sum()
17 |         if TP == 0:
18 |             continue
19 | 
20 |         P = TP / float(vpred.sum())
21 |         R = TP / float(gru.sum())
22 |         F1 = 2*(P*R)/(R+P)
23 | 
24 |         if F1 > best[1]:   # keep the threshold with the best F1
25 |             best = np.array([t, F1, R, P])
26 |     return best
27 | 
28 | def multi_evl_nt(i, gru, pre, th):
29 |     st = time.time()
30 |     evl_metrics = np.zeros(6)
31 | 
32 |     if gru[:,i].sum() == 0 or gru[:,i].sum()==len(gru):
33 |         evl_metrics = evl_metrics - 1   # flag: class has no positives (or only positives); excluded from averages
34 |         return evl_metrics
35 | 
36 |     pre_tag = (np.argmax(pre[:,:,i],1)==i).astype(int)
37 |     evl_metrics[:4] = class_F1_R_P(gru[:,i], pre_tag, [0])
38 | 
39 |     evl_metrics[4] = average_precision_score(gru[:,i], pre[:,i,i])
40 |     evl_metrics[5] = roc_auc_score(gru[:,i], pre[:,i,i])
41 |     #print time.time() - st
42 |     return evl_metrics
43 | 
44 | def multi_evl(i, gru, pre, th):
45 |     st = time.time()
46 |     evl_metrics = np.zeros(6)
47 | 
48 |     if gru[:,i].sum() == 0 or gru[:,i].sum()==len(gru):
49 |         evl_metrics = evl_metrics - 1
50 |         return evl_metrics
51 | 
52 |     if len(th) == 0:
53 |         #th = np.arange(0, 1, 0.0001)
54 |         th = np.arange(0, 1, 0.01)   # search thresholds on the validation set
55 |         evl_metrics[:4] = class_F1_R_P(gru[:,i], pre[:,i], th)
56 |     else:
57 |         #if len(th) == 1:
58 |         #    evl_metrics[:4] = class_F1_R_P(gru[:,i], pre[:,i], th)
59 |         #else:
60 |         evl_metrics[:4] = class_F1_R_P(gru[:,i], pre[:,i], [th[i]])   # reuse the per-class threshold found on validation
61 | 
62 |     evl_metrics[4] = average_precision_score(gru[:,i], pre[:,i])
63 |     evl_metrics[5] = roc_auc_score(gru[:,i], pre[:,i])
64 |     #print time.time() - st
65 |     return evl_metrics
66 | 
67 | def evl(gru, pre, va_th=[]):
68 |     st = time.time()
69 |     vate = 'TE'
70 |     evl_metrics = np.zeros((pre.shape[-1], 6))
71 |     if len(va_th) == 0:
72 |         vate = 'VA'
73 | 
74 |     if len(pre.shape) == 2:   # 2-D (clip, class) scores: evaluate all classes in parallel
75 |         multi_evl_1 = functools.partial(multi_evl, gru=gru, pre=pre, th=va_th)
76 |         P = multiprocessing.Pool(30)
77 |         evl_metrics = np.array(P.map(multi_evl_1, np.arange(pre.shape[-1])))
78 |         P.close()
79 |         P.join()
80 | 
81 |     else:
82 |         for i in np.arange(pre.shape[-1]):
83 |             if len(pre.shape)==2:
84 |                 evl_metrics[i] = multi_evl(i, gru=gru, pre=pre, th=va_th)
85 |             else:
86 |                 evl_metrics[i] = multi_evl_nt(i, gru=gru, pre=pre, th=va_th)
87 | 
88 |     va_th = evl_metrics[:,0].copy()
89 |     evl_metrics = evl_metrics[:,1:]
90 | 
91 |     #print np.arange(527)[evl_metrics[:,0]!=-1]
92 |     acc = evl_metrics[evl_metrics[:,0]!=-1,:].mean(axis=0) * 100
93 |     #print acc
94 |     #print np.arange(pre.shape[-1])[evl_metrics[:,0]==-100,:]
95 |     out = '[%s] mAP:%.1f%% AUC:%.1f%% F1-CB:%.1f%% R-CB:%.1f%% P-CB:%.1f%% time:%.1f'\
96 |             % (vate, acc[3], acc[4], acc[0], acc[1], acc[2], time.time()-st)
97 |     print out
98 |     return va_th, evl_metrics, out
99 | 
100 | 
-------------------------------------------------------------------------------- /feature_extractor.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | import subprocess as subp
4 | 
5 | def get_wav_ffmpeg(fn, sr):
6 |     command = ['ffmpeg', '-i', fn, '-f', 'f32le',
7 |                '-acodec', 'pcm_f32le', '-ar', str(sr),
8 |                '-loglevel', 'quiet', '-ac', str(1), '-']
9 |     pipe = subp.Popen(command, stdout=subp.PIPE,
10 |                startupinfo=None)
11 |     raw_audio = pipe.stdout.read()
12 |     return np.fromstring(raw_audio, dtype="float32")   # decode the raw PCM stream ffmpeg wrote to stdout
13 | 
14 | def get_wav_librosa(fn, sr):
15 |     y, _ = librosa.core.load(fn, sr)
16 |     return y
17 | 
18 | def get_padded_fea(fea, max_time_len):
19 |     fea_time_len = fea.shape[1]
20 | 
21 |     if fea_time_len == max_time_len:
22 |         return fea
23 | 
24 |     if fea_time_len > max_time_len:
25 |         return fea[:,:max_time_len]
26 | 
27 |     if fea_time_len < max_time_len:
28 |         # only pad time axis
29 |         return np.pad(fea, ((0,0),(0,max_time_len - fea_time_len)), 'constant')
30 | 
31 | def get_mel_spect(fn, args):
32 |     max_time_len = int((args.sr*args.msc)/args.hs)
33 | 
34 |     wav = get_wav_librosa(fn, args.sr)
35 | 
36 |     if len(wav) == 0:
37 |         print 'Read Empty Wave File %s'%(fn)
38 |         return np.zeros((1, args.mel, max_time_len), dtype='float32')   # empty file: all-zero features keep the batched h5 write shape-consistent (zero frames are already ignored in avg/std)
39 |     else:
40 |         fea = librosa.feature.melspectrogram(wav, sr=args.sr, n_fft=args.ws,
41 |                 hop_length=args.hs, n_mels=args.mel)
42 |         pad_fea = get_padded_fea(fea, max_time_len)
43 | 
44 |         return np.log(1 + 10000 * pad_fea).reshape(1, args.mel, max_time_len)
45 | 
46 | 
-------------------------------------------------------------------------------- /main.py: --------------------------------------------------------------------------------
1 | from data_generator import *
2 | import argparse
3 | from Mmnet import *
4 | from Trainer import *
5 | 
6 | import os
7 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
8 | 
9 | # DCASE metadata file path
10 | classes_fn = './data/sound_event_list_17_classes.txt'
11 | training_set_csv_fn = './data/groundtruth_weak_label_training_set.csv'
12 | test_set_csv_fn = './data/groundtruth_weak_label_testing_set.csv'
13 | 
14 | # DCASE wave file path
15 | #training_set_wav_fp = './Task_4_DCASE_2017_training_set/unbalanced_train_segments_training_set_audio_formatted_and_segmented_downloads'
16 | training_set_wav_fp = '../../dataset/DCA17_4/Task_4_DCASE_2017_training_set/unbalanced_train_segments_training_set_audio_formatted_and_segmented_downloads/'
17 | test_set_wav_fp = '../../dataset/DCA17_4/Task_4_DCASE_2017_testing_set/unbalanced_train_segments_testing_set_audio_formatted_and_segmented_downloads/'
18 | #test_set_wav_fp = './DCA17_4/Task_4_DCASE_2017_testing_set/unbalanced_train_segments_testing_set_audio_formatted_and_segmented_downloads'
19 | 
20 | # pre-trained model path
21 | #pmp = './Mmnet_DCASE17'
22 | pmp = ''
23 | 
24 | # params for audio feature extraction (mel-spectrogram)
25 | parser = argparse.ArgumentParser(description='PyTorch M&mnet training on the DCASE2017 dataset')
26 | parser.add_argument('--dn', default='DCASE17_task4', type=str, help='dataset name')
27 | parser.add_argument('--sr', default=44100, type=int, help='[fea_ext] sample rate')
28 | parser.add_argument('--ws', default=2048, type=int, help='[fea_ext] window size')
29 | parser.add_argument('--hs', default=492, type=int, help='[fea_ext] hop size')
30 | parser.add_argument('--mel', default=128, type=int, help='[fea_ext] mel bands')
31 | parser.add_argument('--msc', default=10, type=int, help='[fea_ext] max duration of an audio clip in seconds')
32 | 
33 | # params for training
34 | parser.add_argument('--bs', default=64, type=int, help='[net] batch size')
35 | parser.add_argument('--gw', default=1, type=int, help='[net] global weight for both positive and negative samples')
36 | parser.add_argument('--lrde', default=30, type=int, help='[net] divide the learning rate by 10 every lrde epochs')
37 | parser.add_argument('--mom', default=0.9, type=float, help='[net] momentum')
38 | parser.add_argument('--wd', default=1e-4, type=float, help='[net] weight decay')
39 | parser.add_argument('--lr', default=0.1, type=float, help='[net] learning rate')
40 | parser.add_argument('--ep', default=100, type=int, help='[net] epoch')
41 | parser.add_argument('--beta', default=0.3, type=float, help='[net] hyperparameter for per-class loss weight')
42 | parser.add_argument('--pmp', default=pmp, type=str, help='[net] pre-trained model path')
43 | args = parser.parse_args()
44 | 
45 | # Read (if it exists) or generate data for training
46 | h5_fn = './data/%s_%d_%d_%d_%d.h5'%(args.dn, args.sr, args.ws, args.hs, args.mel)
47 | #h5_fn = '/home/fearofchou/%s_%d_%d_%d_%d.h5'%(args.dn, args.sr, args.ws, args.hs, args.mel)   # optional override with a custom location
48 | data = get_h5_dataset(h5_fn, training_set_csv_fn, test_set_csv_fn,
49 | 
training_set_wav_fp, test_set_wav_fp, classes_fn, args) 50 | 51 | 52 | # build model 53 | model = nn.DataParallel(Net(args.mel, data['Ytr'].shape[1]).cuda()) 54 | 55 | # Train 56 | Trer = Trainer(data, model, args) 57 | Trer.fit() 58 | 59 | 60 | 61 | --------------------------------------------------------------------------------
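A closing note on the loss weighting used by Trainer.mm_loss above: the per-class loss weight (PCLW) built in Trainer.__init__ upweights rare classes and downweights frequent ones before the weighted BCEWithLogitsLoss is applied. A minimal NumPy sketch of the same formula, assuming Ytr is an (N, C) binary label matrix (the function name is illustrative):

import numpy as np

def per_class_loss_weight(Ytr, beta=0.3):
    # fraction of all positive labels that belong to each class
    class_prior = Ytr.sum(0) / float(Ytr.sum())
    mean_prior = class_prior.mean()
    # classes rarer than the mean prior get weights > 1, frequent ones < 1;
    # beta (the --beta flag, default 0.3) softens the ratio
    return ((mean_prior / class_prior) * ((1 - mean_prior) / (1 - class_prior))) ** beta

In mm_loss these weights scale only the positive targets (we * target), while negatives keep the global weight --gw, so training pushes hardest on rare, easily missed classes.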