├── pytorch
│   ├── ReadMe.md
│   ├── losses.py
│   ├── pytorch_utils.py
│   ├── evaluate.py
│   ├── main.py
│   └── models.py
├── utils
│   ├── ReadMe.md
│   ├── config.py
│   ├── gtg_example.py
│   ├── gtg.py
│   ├── utilities.py
│   ├── plot_results.py
│   ├── features.py
│   └── data_generator.py
├── workspace
│   └── ReadMe.md
├── runme.sh
└── README.md
--------------------------------------------------------------------------------
/pytorch/ReadMe.md:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/utils/ReadMe.md:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/workspace/ReadMe.md:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/pytorch/losses.py:
--------------------------------------------------------------------------------
import torch
import torch.nn.functional as F


def nll_loss(output, target):
    '''Negative log-likelihood loss. The output should be obtained with
    F.log_softmax(x).

    Args:
      output: (N, classes_num)
      target: (N, classes_num), one-hot targets
    '''
    loss = -torch.mean(target * output)
    return loss
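
# Hedged usage sketch (added for illustration; not in the original file). With
# one-hot targets this averages over all N * classes_num entries, so it equals
# the usual mean cross entropy divided by classes_num.
if __name__ == '__main__':
    logits = torch.randn(4, 10)                                   # (N, classes_num)
    log_prob = F.log_softmax(logits, dim=-1)
    target = F.one_hot(torch.tensor([1, 3, 5, 7]), num_classes=10).float()
    print(nll_loss(log_prob, target))                             # scalar tensor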
--------------------------------------------------------------------------------
/utils/config.py:
--------------------------------------------------------------------------------
sample_rate = 44100
window_size = 2048
hop_size = 512    # 44100 // 512 = 86 frames per second
mel_bins = 40
fmin = 50    # Hz
fmax = 14000    # Hz

# MFCC settings
mfcc_frames = 431
n_mfcc = 40
mfcc_hop_size = 512

# Gammatonegram settings
gamm_frames = 499
n_gamm = 64

frames_per_second = sample_rate // hop_size
audio_duration = 10    # Audio recordings in DCASE2019 Task1 are all 10 seconds
frames_num = frames_per_second * audio_duration
total_samples = sample_rate * audio_duration
total_frames = (sample_rate * audio_duration) // hop_size

labels = ['airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
    'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park', 'unknown']

# classes_num = len(labels)
lb_to_idx = {lb: idx for idx, lb in enumerate(labels)}
idx_to_lb = {idx: lb for idx, lb in enumerate(labels)}
--------------------------------------------------------------------------------
/runme.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# You need to modify this path to your downloaded dataset directory
DATASET_DIR='data/'

# You need to modify this path to your workspace to store features and models
WORKSPACE='workspace/'

# Hyper-parameters
GPU_ID=0
MODEL_TYPE='Logmel_Cnn'
# MODEL_TYPE='Cqt_Cnn'
# MODEL_TYPE='Gamm_Cnn'
# MODEL_TYPE='Mfcc_Cnn'
# MODEL_TYPE='DCMR'
# MODEL_TYPE='Logmel_DCMF'
# MODEL_TYPE='Cqt_DCMF'
# MODEL_TYPE='Gamm_DCMF'
# MODEL_TYPE='Mfcc_DCMF'
# MODEL_TYPE='Logmel_DCMT'
# MODEL_TYPE='Cqt_DCMT'
# MODEL_TYPE='Gamm_DCMT'
# MODEL_TYPE='Mfcc_DCMT'
# MODEL_TYPE='DCMR_DCMF'
# MODEL_TYPE='DCMR_DCMT'
# MODEL_TYPE='DCMR_DCMF_DCMT'


#### Original training (other models)
BATCH_SIZE=128
ITE_TRAIN=15000
ITE_EVA=12000
ITE_STORE=12000

############ Train and validate on development dataset ############
# Calculate features
python utils/features.py calculate_feature_for_all_audio_files --dataset_dir=$DATASET_DIR --subtask='a' --data_type='development' --workspace=$WORKSPACE

# Calculate scalar
python utils/features.py calculate_scalar --subtask='a' --data_type='development' --workspace=$WORKSPACE

# Subtask A
CUDA_VISIBLE_DEVICES=$GPU_ID python pytorch/main.py train --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --subtask='a' --data_type='development' --holdout_fold=1 --model_type=$MODEL_TYPE --batch_size=$BATCH_SIZE --ite_train=$ITE_TRAIN --ite_eva=$ITE_EVA --ite_store=$ITE_STORE --cuda

CUDA_VISIBLE_DEVICES=$GPU_ID python pytorch/main.py inference_validation --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --subtask='a' --data_type='development' --holdout_fold=1 --model_type=$MODEL_TYPE --iteration=$ITE_TRAIN --batch_size=$BATCH_SIZE --cuda
--------------------------------------------------------------------------------
/pytorch/pytorch_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch


def move_data_to_gpu(x, cuda):
    if 'float' in str(x.dtype):
        x = torch.Tensor(x)
    elif 'int' in str(x.dtype):
        x = torch.LongTensor(x)
    else:
        raise Exception('Unsupported dtype: {}'.format(x.dtype))

    if cuda:
        x = x.cuda()

    return x


def append_to_dict(d, key, value):
    if key in d:
        d[key].append(value)
    else:
        d[key] = [value]


def forward(model, generate_func, cuda, return_input=False,
        return_target=False):
    '''Forward data to model in mini-batches.

    Args:
      model: object
      generate_func: function
      cuda: bool
      return_input: bool
      return_target: bool
    '''
    output_dict = {}

    # Evaluate on mini-batches
    for batch_data_dict in generate_func:

        # Predict
        batch_feature = move_data_to_gpu(batch_data_dict['feature'], cuda)
        batch_feature_mfcc = move_data_to_gpu(batch_data_dict['feature_mfcc'], cuda)
        batch_feature_gamm = move_data_to_gpu(batch_data_dict['feature_gamm'], cuda)
        batch_feature_panns = move_data_to_gpu(batch_data_dict['feature_panns'], cuda)

        with torch.no_grad():
            model.eval()
            batch_output, batch_loss = model(batch_feature, batch_feature_gamm,
                batch_feature_mfcc, batch_feature_panns)

        append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name'])

        append_to_dict(output_dict, 'output', batch_output.data.cpu().numpy())

        append_to_dict(output_dict, 'loss', batch_loss.data.cpu().numpy())

        if return_input:
            append_to_dict(output_dict, 'feature', batch_data_dict['feature'])

        if return_target:
            if 'target' in batch_data_dict.keys():
                append_to_dict(output_dict, 'target', batch_data_dict['target'])

    for key in output_dict.keys():
        output_dict[key] = np.concatenate(output_dict[key], axis=0)

    return output_dict
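
# Hedged usage sketch (added for illustration; not in the original file). The
# dummy model and feature shapes below are placeholders; the real models are
# defined in models.py and the real shapes come from data_generator.py.
if __name__ == '__main__':
    import torch.nn as nn

    class DummyModel(nn.Module):
        def __init__(self, classes_num=10):
            super(DummyModel, self).__init__()
            self.fc = nn.Linear(64, classes_num)

        def forward(self, feat, feat_gamm, feat_mfcc, feat_panns):
            output = torch.log_softmax(self.fc(feat.mean(dim=1)), dim=-1)
            return output, -output    # (output, per-class loss) placeholder

    def toy_generate_func(batches_num=2):
        for _ in range(batches_num):
            yield {'audio_name': np.array(['a.wav', 'b.wav']),
                'feature': np.random.randn(2, 100, 64).astype(np.float32),
                'feature_gamm': np.random.randn(2, 100, 64).astype(np.float32),
                'feature_mfcc': np.random.randn(2, 100, 64).astype(np.float32),
                'feature_panns': np.random.randn(2, 320000).astype(np.float32)}

    output_dict = forward(DummyModel(), toy_generate_func(), cuda=False)
    print({key: output_dict[key].shape for key in output_dict})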
--------------------------------------------------------------------------------
/utils/gtg_example.py:
--------------------------------------------------------------------------------
from gtg import gammatonegram
import numpy as np
from scipy.stats import norm as ssn
import scipy.io.wavfile as wf
import matplotlib.pyplot as plt

"""
Example of using the Python gammatonegram code.
mcusi@mit.edu, 2018 Sept 24
"""

def gtg_in_dB(sound, sampling_rate=44100, log_constant=1e-80, dB_threshold=-50.0):
    """Convert sound into a gammatonegram, with amplitude in decibels."""
    sxx, center_frequencies = gammatonegram(sound, sr=sampling_rate, fmin=20,
        fmax=int(sampling_rate/2.))
    sxx[sxx == 0] = log_constant
    sxx = 20.*np.log10(sxx)    # Convert to dB
    sxx[sxx < dB_threshold] = dB_threshold
    return sxx, center_frequencies

"""
def loglikelihood(sxx_observation, sxx_hypothesis):
    likelihood_weighting = 5.0    # Free parameter!
    loglikelihood = likelihood_weighting*np.sum(ssn.logpdf(sxx_observation,
        loc=sxx_hypothesis, scale=1))
    return loglikelihood
"""

def gtgplot(sxx, center_frequencies, sample_duration, sampling_rate,
        dB_threshold=-50.0, dB_max=10.0, t_space=50, f_space=10):
    """Plot a gammatonegram."""
    fig, ax = plt.subplots(1, 1)

    time_per_pixel = sample_duration/(1.*sampling_rate*sxx.shape[1])
    t = time_per_pixel * np.arange(sxx.shape[1])

    plt.pcolormesh(sxx, vmin=dB_threshold, vmax=dB_max, cmap='Blues')
    ax.set_ylabel('Frequency (Hz)', fontsize=16)
    ax.set_xlabel('Time (s)', fontsize=16)
    ax.xaxis.set_ticks(range(sxx.shape[1])[::t_space])
    ax.xaxis.set_ticklabels((t[::t_space]*100.).astype(int)/100., fontsize=16)
    ax.set_xbound(0, sxx.shape[1])
    ax.yaxis.set_ticks(range(len(center_frequencies))[::f_space])
    ax.yaxis.set_ticklabels(center_frequencies.astype(int)[::f_space], fontsize=16)
    ax.set_ybound(0, len(center_frequencies))
    cbar = plt.colorbar(pad=0.01)
    cbar.ax.tick_params(labelsize=16)
    cbar.ax.set_ylabel('Amplitude (dB)', rotation=270, labelpad=15, fontsize=16)
    plt.show()
    plt.close()

if __name__ == '__main__':
    sampling_rate, sound = wf.read('1-7973-A-7.wav')
    sxx, center_frequencies = gtg_in_dB(sound, sampling_rate)
    print(sxx.shape)
    print(center_frequencies.shape)
    gtgplot(sxx, center_frequencies, len(sound), sampling_rate)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DCASE-2020-Task1A-Code
A PyTorch implementation of the paper ["Acoustic Scene Classification with Spectrogram Processing Strategies"](http://dcase.community/documents/workshop2020/proceedings/DCASE2020Workshop_Wang_58.pdf).

## Paper results

Note that the test results are obtained only on device A of the DCASE2020 Task1A development set.

Model | Accuracy | Log loss | Model size
-|-|-|-
DCASE2020 Task 1 Baseline|70.6%|1.356|19.1 MB
Log-Mel CNN|72.1%|0.879|18.9 MB
Gamma CNN|76.1%|0.762|18.9 MB
MFCC CNN|63.6%|1.029|18.9 MB
SPSMR|79.4%|0.696|75.5 MB
Log-Mel CNN + SPSMF|75.5%|1.135|94.4 MB
CQT CNN + SPSMF|74.5%|1.185|94.4 MB
Gamma CNN + SPSMF|78.8%|1.169|94.4 MB
MFCC CNN + SPSMF|60.9%|1.801|94.4 MB
SPSMR + SPSMF|80.9%|0.737|377.6 MB
Log-Mel CNN + SPSMT|74.5%|0.987|18.9 MB
CQT CNN + SPSMT|73.3%|1.032|18.9 MB
Gamma CNN + SPSMT|78.2%|0.866|18.9 MB
MFCC CNN + SPSMT|67.6%|1.081|18.9 MB
SPSMR + SPSMT|79.7%|0.701|75.5 MB
SPSMR + SPSMF + SPSMT|81.8%|0.694|453.1 MB
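
The accuracy and log loss above are class-averaged accuracy and multi-class log loss, computed as in `pytorch/evaluate.py`. A minimal, self-contained sketch of the same computation on dummy predictions (the random inputs are placeholders):

```python
import numpy as np
from sklearn.metrics import confusion_matrix, log_loss

rng = np.random.default_rng(0)
y_true = rng.integers(0, 10, size=1000)         # integer scene labels
prob = rng.dirichlet(np.ones(10), size=1000)    # predicted class probabilities

cm = confusion_matrix(y_true, prob.argmax(axis=-1), labels=np.arange(10))
accuracy = np.mean(np.diag(cm) / cm.sum(axis=-1))    # class-averaged accuracy
print(accuracy, log_loss(y_true, prob, labels=np.arange(10)))
```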

## Citation
If this code is helpful, please feel free to cite the following papers:
```
@inproceedings{Wang2020,
    author = "Wang, Helin and Zou, Yuexian and Chong, DaDing",
    title = "Acoustic Scene Classification with Spectrogram Processing Strategies",
    booktitle = "Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)",
    address = "Tokyo, Japan",
    month = "November",
    year = "2020",
    pages = "210--214",
    abstract = "Recently, convolutional neural networks (CNN) have achieved the state-of-the-art performance in acoustic scene classification (ASC) task. The audio data is often transformed into two-dimensional spectrogram representations, which are then fed to the neural networks. In this paper, we study the problem of efficiently taking advantage of different spectrogram representations through discriminative processing strategies. There are two main contributions. The first contribution is exploring the impact of the combination of multiple spectrogram representations at different stages, which provides a meaningful reference for the effective spectrogram fusion. The second contribution is that the processing strategies in multiple frequency bands and multiple temporal frames are proposed to make fully use of a single spectrogram representation. The proposed spectrogram processing strategies can be easily transferred to any network structures. The experiments are carried out on the DCASE 2020 Task1 datasets, and the results show that our method could achieve the accuracy of 81.8\% (official baseline: 54.1\%) and 92.1\% (official baseline: 87.3\%) on the officially provided fold 1 evaluation dataset of Task1A and Task1B, respectively."
}
```

## Acknowledgment
This code builds on the base code provided by https://github.com/qiuqiangkong/dcase2019_task1/.

```
@article{kong2019cross,
    title={Cross-task learning for audio tagging, sound event detection and spatial localization: DCASE 2019 baseline systems},
    author={Kong, Qiuqiang and Cao, Yin and Iqbal, Turab and Xu, Yong and Wang, Wenwu and Plumbley, Mark D},
    journal={arXiv preprint arXiv:1904.03476},
    year={2019}
}
```

## Contact
If you have any problems with our code, feel free to contact
- wanghl15@pku.edu.cn

or describe your problem in Issues.
--------------------------------------------------------------------------------
/utils/gtg.py:
--------------------------------------------------------------------------------
"""
Created on Sat May 27 15:37:50 2017
Python version of:
D. P. W. Ellis (2009). "Gammatone-like spectrograms", web resource.
http://www.ee.columbia.edu/~dpwe/resources/matlab/gammatonegram/
On the corresponding webpage, Dan notes that he would be grateful if you cited him if you use his work (as above).
This Python code does not contain all features present in the MATLAB code.
Sat May 27 15:37:50 2017, Maddie Cusimano, mcusi@mit.edu
"""

from __future__ import division
import numpy as np
import scipy.signal as sps
import scipy.io.wavfile as wf

def fft2gammatonemx(nfft, sr=20000, nfilts=64, width=1.0, minfreq=100,
        maxfreq=10000, maxlen=1024):
    """
    # Ellis' description in MATLAB:
    # [wts,cfreqa] = fft2gammatonemx(nfft, sr, nfilts, width, minfreq, maxfreq, maxlen)
    # Generate a matrix of weights to combine FFT bins into
    # Gammatone bins. nfft defines the source FFT size at
    # sampling rate sr. Optional nfilts specifies the number of
    # output bands required (default 64), and width is the
    # constant width of each band in Bark (default 1).
    # minfreq, maxfreq specify range covered in Hz (100, sr/2).
    # While wts has nfft columns, the second half are all zero.
    # Hence, aud spectrum is
    # fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft));
    # maxlen truncates the rows to this many bins.
    # cfreqs returns the actual center frequencies of each
    # gammatone band in Hz.
    #
    # 2009/02/22 02:29:25 Dan Ellis dpwe@ee.columbia.edu based on rastamat/audspec.m
    # Sat May 27 15:37:50 2017 Maddie Cusimano, mcusi@mit.edu: converted to Python
    """

    wts = np.zeros([nfilts, nfft])

    # After Slaney's MakeERBFilters
    EarQ = 9.26449
    minBW = 24.7
    order = 1

    nFr = np.array(range(nfilts)) + 1
    em = EarQ*minBW
    cfreqs = (maxfreq+em)*np.exp(nFr*(-np.log(maxfreq + em)+np.log(minfreq + em))/nfilts)-em
    cfreqs = cfreqs[::-1]

    GTord = 4
    ucircArray = np.array(range(int(nfft/2 + 1)))
    ucirc = np.exp(1j*2*np.pi*ucircArray/nfft)
    # justpoles = 0: taking out the 'if' corresponding to this.

    ERB = width*np.power(np.power(cfreqs/EarQ, order) + np.power(minBW, order), 1/order)
    B = 1.019*2*np.pi*ERB
    r = np.exp(-B/sr)
    theta = 2*np.pi*cfreqs/sr
    pole = r*np.exp(1j*theta)
    T = 1/sr
    ebt = np.exp(B*T)
    cpt = 2*cfreqs*np.pi*T
    ccpt = 2*T*np.cos(cpt)
    scpt = 2*T*np.sin(cpt)
    A11 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3+2**1.5)*scpt, ebt), 2)
    A12 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3+2**1.5)*scpt, ebt), 2)
    A13 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3-2**1.5)*scpt, ebt), 2)
    A14 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3-2**1.5)*scpt, ebt), 2)
    zros = -np.array([A11, A12, A13, A14])/T
    wIdx = range(int(nfft/2 + 1))
    gain = np.abs((-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) - np.sqrt(3 - 2**(3/2))* np.sin(2*cfreqs*np.pi*T))) *(-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) + np.sqrt(3 - 2**(3/2)) * np.sin(2*cfreqs*np.pi*T)))*(-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) - np.sqrt(3 + 2**(3/2))*np.sin(2*cfreqs*np.pi*T))) *(-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) + np.sqrt(3 + 2**(3/2))*np.sin(2*cfreqs*np.pi*T))) /(-2 / np.exp(2*B*T) - 2*np.exp(4*1j*cfreqs*np.pi*T) + 2*(1 + np.exp(4*1j*cfreqs*np.pi*T))/np.exp(B*T))**4)
    # In MATLAB, there used to be 64 where here it says nfilts:
    wts[:, wIdx] = ((T**4)/np.reshape(gain, (nfilts, 1))) * np.abs(ucirc-np.reshape(zros[0], (nfilts, 1)))*np.abs(ucirc-np.reshape(zros[1], (nfilts, 1)))*np.abs(ucirc-np.reshape(zros[2], (nfilts, 1)))*np.abs(ucirc-np.reshape(zros[3], (nfilts, 1)))*(np.abs(np.power(np.multiply(np.reshape(pole, (nfilts, 1))-ucirc, np.conj(np.reshape(pole, (nfilts, 1)))-ucirc), -GTord)))
    wts = wts[:, range(maxlen)]

    return wts, cfreqs
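
def _demo_fft2gammatonemx():
    '''Hedged shape check (added for illustration; not in the original file):
    build the FFT-to-gammatone weight matrix that gammatonegram() uses and
    confirm its dimensions.'''
    nfft = 4096
    wts, cfreqs = fft2gammatonemx(nfft, sr=44100, nfilts=64, width=1.0,
        minfreq=50, maxfreq=14000, maxlen=int(nfft/2 + 1))
    return wts.shape, cfreqs.shape    # (64, 2049), (64,)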

def gammatonegram(x, sr=44100, twin=0.040, thop=0.020, N=64,
        fmin=50, fmax=14000, width=1.0):
    """
    # Ellis' description in MATLAB:
    # [Y,F] = gammatonegram(X,SR,N,TWIN,THOP,FMIN,FMAX,USEFFT,WIDTH)
    # Calculate a spectrogram-like time frequency magnitude array
    # based on Gammatone subband filters. Waveform X (at sample
    # rate SR) is passed through an N (default 64) channel gammatone
    # auditory model filterbank, with lowest frequency FMIN (50)
    # and highest frequency FMAX (SR/2). The outputs of each band
    # then have their energy integrated over windows of TWIN secs
    # (0.025), advancing by THOP secs (0.010) for successive
    # columns. These magnitudes are returned as an N-row
    # nonnegative real matrix, Y.
    # WIDTH (default 1.0) is how to scale bandwidth of filters
    # relative to ERB default (for fast method only).
    # F returns the center frequencies in Hz of each row of Y
    # (uniformly spaced on a Bark scale).

    # 2009/02/23 DAn Ellis dpwe@ee.columbia.edu
    # Sat May 27 15:37:50 2017 Maddie Cusimano mcusi@mit.edu, converted to Python
    """

    # Entirely skipping Malcolm's function, because it would require
    # altering the ERBFilterBank code as well.
    # i.e., in Ellis' code: usefft = 1
    assert x.dtype == 'int32'

    # How long a window to use relative to the integration window requested
    winext = 1
    twinmod = winext*twin
    nfft = int(2**(np.ceil(np.log(2*twinmod*sr)/np.log(2))))
    nhop = int(np.round(thop*sr))
    nwin = int(np.round(twinmod*sr))
    [gtm, f] = fft2gammatonemx(nfft, sr, N, width, fmin, fmax, int(nfft/2+1))
    # Perform FFT and weighting in the amplitude domain.
    # Note: in MATLAB, abs(spectrogram(X, hanning(nwin), nwin-nhop, nfft, SR))
    #   = abs(specgram(X,nfft,SR,nwin,nwin-nhop));
    # the Python equivalent is sps.spectrogram(..., mode='magnitude').
    plotF, plotT, Sxx = sps.spectrogram(x, fs=sr, window='hann', nperseg=nwin,
        noverlap=nwin-nhop, nfft=nfft, detrend=False,
        scaling='density', mode='magnitude')
    y = (1/nfft)*np.dot(gtm, Sxx)

    return y, f
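
# Hedged smoke test (added for illustration; not in the original file): run the
# gammatonegram on one second of a synthetic int32 sine tone. Expect N = 64
# rows and roughly duration / thop columns.
if __name__ == '__main__':
    sr = 44100
    t = np.arange(sr) / sr
    x = (0.5*np.iinfo(np.int32).max*np.sin(2*np.pi*440*t)).astype(np.int32)
    y, f = gammatonegram(x, sr=sr)
    print(y.shape, f.shape)    # (64, ~49), (64,)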
--------------------------------------------------------------------------------
/utils/utilities.py:
--------------------------------------------------------------------------------
import os
import sys
import numpy as np
import soundfile
import librosa
import h5py
import math
import pandas as pd
from sklearn import metrics
import logging
import matplotlib.pyplot as plt
import torch
import config
import scipy.io.wavfile as wf
from pydub import AudioSegment
import csv

'''
def mixup_data(x, y, alpha=0.2):
    if alpha > 0.:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).cuda()
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam
'''
def mixup_data(x1, x2, x3, x4, y, alpha=0.2):
    '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda.'''
    if alpha > 0.:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.
    batch_size = x1.size()[0]
    index = torch.randperm(batch_size).cuda()
    mixed_x1 = lam * x1 + (1 - lam) * x1[index, :]
    mixed_x2 = lam * x2 + (1 - lam) * x2[index, :]
    mixed_x3 = lam * x3 + (1 - lam) * x3[index, :]
    mixed_x4 = lam * x4 + (1 - lam) * x4[index, :]
    y_a, y_b = y, y[index]
    return mixed_x1, mixed_x2, mixed_x3, mixed_x4, y_a, y_b, lam

def mixup_criterion(class_criterion, pred, y_a, y_b, lam):
    return lam * class_criterion(pred, y_a) + (1 - lam) * class_criterion(pred, y_b)
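
def _demo_mixup_step():
    '''Hedged usage sketch (added for illustration; not in the original file):
    one mixup training step. The random tensors stand in for the model inputs
    and predictions built in pytorch/main.py. Requires a CUDA device, because
    mixup_data draws its permutation with .cuda().'''
    x1 = torch.randn(8, 100, 40).cuda()    # e.g. log-mel batch
    x2 = torch.randn(8, 499, 64).cuda()    # e.g. gammatonegram batch
    x3 = torch.randn(8, 431, 40).cuda()    # e.g. MFCC batch
    x4 = torch.randn(8, 320000).cuda()     # e.g. raw waveform batch
    y = torch.randint(0, 10, (8,)).cuda()
    mx1, mx2, mx3, mx4, y_a, y_b, lam = mixup_data(x1, x2, x3, x4, y)
    pred = torch.randn(8, 10).cuda()       # stand-in for model(mx1, mx2, mx3, mx4)
    return mixup_criterion(torch.nn.functional.cross_entropy, pred, y_a, y_b, lam)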

def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)

def get_filename(path):
    path = os.path.realpath(path)
    name_ext = path.split('/')[-1]
    name = os.path.splitext(name_ext)[0]
    return name

def create_logging(log_dir, filemode):
    create_folder(log_dir)
    i1 = 0

    while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))):
        i1 += 1

    log_path = os.path.join(log_dir, '{:04d}.log'.format(i1))
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=log_path,
        filemode=filemode)
    # Print to console
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logger = logging.getLogger('')
    while logger.handlers:
        logger.handlers.pop()
    logger.addHandler(console)

    return logging

def read_audio(audio_path, target_fs=None):
    (audio, fs) = soundfile.read(audio_path)

    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    if target_fs is not None and fs != target_fs:
        audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs)
        fs = target_fs

    return audio, fs

def read_audio_gamm(audio_path, target_fs=None):
    audio1 = 'temp_16.wav'
    s = AudioSegment.from_file(audio_path, format="wav")
    s.export(audio1, bitrate="16", format="wav")
    fs, audio = wf.read(audio1)
    return audio, fs

def pad_truncate_sequence(x, max_len):
    if len(x) < max_len:
        return np.concatenate((x, np.zeros(max_len - len(x))))
    else:
        return x[0 : max_len]


def calculate_scalar_of_tensor(x):
    if x.ndim == 2:
        axis = 0
    elif x.ndim == 3:
        axis = (0, 1)

    mean = np.mean(x, axis=axis)
    std = np.std(x, axis=axis)

    return mean, std


def load_scalar(scalar_path):
    with h5py.File(scalar_path, 'r') as hf:
        mean = hf['mean'][:]
        std = hf['std'][:]
        mean_gamm = hf['mean_gamm'][:]
        std_gamm = hf['std_gamm'][:]
        mean_mfcc = hf['mean_mfcc'][:]
        std_mfcc = hf['std_mfcc'][:]
        mean_panns = hf['mean_panns'][:]
        std_panns = hf['std_panns'][:]

    scalar = {'mean': mean, 'std': std,
        'mean_gamm': mean_gamm, 'std_gamm': std_gamm,
        'mean_mfcc': mean_mfcc, 'std_mfcc': std_mfcc,
        'mean_panns': mean_panns, 'std_panns': std_panns}
    return scalar


def scale(x, mean, std):
    return (x - mean) / std


def inverse_scale(x, mean, std):
    return x * std + mean


def get_subdir(subtask, data_type):
    if subtask == 'a':
        subdir = 'TAU-urban-acoustic-scenes-2020-mobile-{}'.format(data_type)
    elif subtask == 'b':
        subdir = 'TAU-urban-acoustic-scenes-2019-mobile-{}'.format(data_type)
    elif subtask == 'c':
        subdir = 'TAU-urban-acoustic-scenes-2019-openset-{}'.format(data_type)
    else:
        raise Exception('Incorrect argument!')

    return subdir


def read_metadata(metadata_path):
    '''Read metadata from a csv file.

    Returns:
      meta_dict: dict of meta data, e.g.:
        {'audio_name': np.array(['a.wav', 'b.wav', ...]),
         'scene_label': np.array(['airport', 'bus', ...]),
         ...}
    '''
    df = pd.read_csv(metadata_path, sep='\t')

    meta_dict = {}

    meta_dict['audio_name'] = np.array(
        [name.split('/')[1] for name in df['filename'].tolist()])

    if 'scene_label' in df.keys():
        meta_dict['scene_label'] = np.array(df['scene_label'])

    if 'identifier' in df.keys():
        meta_dict['identifier'] = np.array(df['identifier'])

    if 'source_label' in df.keys():
        meta_dict['source_label'] = np.array(df['source_label'])

    return meta_dict


def sparse_to_categorical(x, n_out):
    x = x.astype(int)
    shape = x.shape
    x = x.flatten()
    N = len(x)
    x_categ = np.zeros((N, n_out))
    x_categ[np.arange(N), x] = 1
    return x_categ.reshape((shape)+(n_out,))


def get_sources(subtask):
    if subtask in ['a', 'c']:
        return ['a', 'b', 'c', 's1', 's2', 's3']
    elif subtask == 'b':
        return ['a', 'b', 'c']
    else:
        raise Exception('Incorrect argument!')


def write_submission(output_dict, subtask, data_type, submission_path):
    '''Write output to a submission file.

    Args:
      output_dict:
        {'audio_name': (audios_num,), 'output': (audios_num, classes_num)}
      subtask: 'a' | 'b' | 'c'
      data_type: 'leaderboard' | 'evaluation'
      submission_path: string
    '''
    fw = open(submission_path, 'w')
    csv_writer = csv.writer(fw, delimiter='\t')
    csv_writer.writerow(['filename', 'scene_label', 'airport', 'bus', 'metro',
        'metro_station', 'park', 'public_square', 'shopping_mall',
        'street_pedestrian', 'street_traffic', 'tram'])
    # Reorder the per-class scores from the config.labels order to the
    # alphabetical header order above: 0,7,8,2,9,4,1,3,5,6

    for n in range(len(output_dict['audio_name'])):

        audio_name = output_dict['audio_name'][n]
        pred_label = config.idx_to_lb[np.argmax(output_dict['output'][n])]
        logloss = output_dict['loss'][n]
        print(n, pred_label, logloss)
        csv_writer.writerow([audio_name, pred_label, logloss[0], logloss[7],
            logloss[8], logloss[2], logloss[9], logloss[4], logloss[1],
            logloss[3], logloss[5], logloss[6]])
    fw.close()
    logging.info('Write submission to {}'.format(submission_path))
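
def _demo_sparse_to_categorical():
    '''Hedged example (added for illustration; not in the original file):
    one-hot encode integer class indexes, as the data generator does for
    training targets.'''
    return sparse_to_categorical(np.array([0, 2, 1]), n_out=3)
    # array([[1., 0., 0.],
    #        [0., 0., 1.],
    #        [0., 1., 0.]])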
--------------------------------------------------------------------------------
/utils/plot_results.py:
--------------------------------------------------------------------------------
import argparse
import os
import matplotlib.pyplot as plt
import _pickle as cPickle
import numpy as np

from utilities import get_subdir
import config


def plot_results(args):
    '''Plot statistics curves of different models.

    Args:
      workspace: string, directory of workspace
      subtask: 'a' | 'b' | 'c', corresponds to 3 subtasks in DCASE2019 Task1
    '''

    # Arguments & parameters
    workspace = args.workspace
    subtask = args.subtask

    filename = 'main'
    prefix = ''
    frames_per_second = config.frames_per_second
    mel_bins = config.mel_bins
    holdout_fold = 1
    max_plot_iteration = 5000
    data_type = 'development'

    iterations = np.arange(0, max_plot_iteration, 200)

    def _load_stat(model_type, subtask, source):
        sub_dir = get_subdir(subtask, data_type)

        validate_statistics_path = os.path.join(workspace, 'statistics', filename,
            '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
            '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold),
            model_type, 'validate_statistics.pickle')

        validate_statistics_dict = cPickle.load(open(validate_statistics_path, 'rb'))

        accuracy_matrix = np.array([stat['accuracy'] for
            stat in validate_statistics_dict[source]])

        confusion_matrix = np.array([stat['confusion_matrix'] for
            stat in validate_statistics_dict[source]])

        legend = '{}'.format(model_type)

        if subtask in ['a', 'b']:
            accuracy = np.mean(accuracy_matrix, axis=-1)
            results = {'accuracy': accuracy, 'legend': legend}
            print('Subtask: {}, Source: {}, Model: {} accuracy: {:.3f}'.format(
                subtask, source, model_type, accuracy[-1]))

        elif subtask == 'c':
            accuracy = np.mean(accuracy_matrix[:, 0 : -1], axis=-1)
            unknown_accuracy = accuracy_matrix[:, -1]
            results = {
                'accuracy': accuracy,
                'unknown_accuracy': unknown_accuracy,
                'legend': legend}

            print('Subtask: {}, Source: {}, Model: {}, accuracy: {:.3f}, '
                'Unknown accuracy: {:.3f}'.format(subtask, source, model_type,
                accuracy[-1], unknown_accuracy[-1]))

        return results

    if subtask == 'a':
        source = 'a'

        fig, ax = plt.subplots(1, 1, figsize=(6, 4))
        lines = []

        results = _load_stat('Cnn_5layers_AvgPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_AvgPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_MaxPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_13layers_AvgPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        ax.set_title('Device: {}'.format(source))
        ax.legend(handles=lines, loc=4)
        ax.set_ylim(0, 1.0)
        ax.set_xlabel('Iterations')
        ax.set_ylabel('Accuracy')
        ax.grid(color='b', linestyle='solid', linewidth=0.2)
        ax.xaxis.set_ticks(np.arange(0, len(iterations) + 1, len(iterations) // 4))
        ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration + 1, max_plot_iteration // 4))

        plt.tight_layout()

    elif subtask == 'b':
        sources = ['a', 'b', 'c']

        fig, axs = plt.subplots(2, 2, figsize=(12, 8))

        for n in range(3):
            lines = []

            results = _load_stat('Cnn_5layers_AvgPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            results = _load_stat('Cnn_9layers_AvgPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            results = _load_stat('Cnn_9layers_MaxPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            results = _load_stat('Cnn_13layers_AvgPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            axs[n // 2, n % 2].set_title('Device: {}'.format(sources[n]))
            axs[n // 2, n % 2].legend(handles=lines, loc=4)
            axs[n // 2, n % 2].set_ylim(0, 1.0)
            axs[n // 2, n % 2].set_xlabel('Iterations')
            axs[n // 2, n % 2].set_ylabel('Accuracy')
            axs[n // 2, n % 2].grid(color='b', linestyle='solid', linewidth=0.2)
            axs[n // 2, n % 2].xaxis.set_ticks(np.arange(0, len(iterations) + 1, len(iterations) // 4))
            axs[n // 2, n % 2].xaxis.set_ticklabels(np.arange(0, max_plot_iteration + 1, max_plot_iteration // 4))

        axs[1, 1].set_visible(False)
        plt.tight_layout()

    elif subtask == 'c':
        source = 'a'

        fig, axs = plt.subplots(1, 2, figsize=(12, 4))

        axs[0].set_title('Device: {}, known'.format(source))
        axs[1].set_title('Device: {}, unknown'.format(source))

        lines = []

        results = _load_stat('Cnn_5layers_AvgPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_AvgPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_MaxPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_13layers_AvgPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        for n in range(2):
            axs[n].legend(handles=lines, loc=4)
            axs[n].set_ylim(0, 1.0)
            axs[n].set_xlabel('Iterations')
            axs[n].set_ylabel('Accuracy')
            axs[n].grid(color='b', linestyle='solid', linewidth=0.2)
            axs[n].xaxis.set_ticks(np.arange(0, len(iterations) + 1, len(iterations) // 4))
            axs[n].xaxis.set_ticklabels(np.arange(0, max_plot_iteration + 1, max_plot_iteration // 4))

        plt.tight_layout()

    fig_path = 'result_subtask_{}.png'.format(subtask)
    plt.savefig(fig_path)
    print('Save result figure to {}'.format(fig_path))
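
def _demo_statistics_structure():
    '''Hedged example (added for illustration; not in the original file): the
    structure of validate_statistics.pickle that _load_stat expects, as written
    by StatisticsContainer in pytorch/evaluate.py -- keyed by device, with one
    dict per evaluated iteration.'''
    return {'a': [{'iteration': 200,
                   'accuracy': np.ones(10),
                   'confusion_matrix': np.eye(10)}]}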

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.')
    parser.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Corresponds to the 3 subtasks in DCASE2019 Task1.')

    args = parser.parse_args()

    plot_results(args)
--------------------------------------------------------------------------------
/pytorch/evaluate.py:
--------------------------------------------------------------------------------
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))

import numpy as np
import time
import logging
import matplotlib.pyplot as plt
from sklearn import metrics
import datetime
import _pickle as cPickle
#import sed_eval

from utilities import get_filename, inverse_scale
from pytorch_utils import forward
import config
from sklearn.metrics import log_loss


class Evaluator(object):
    def __init__(self, model, data_generator, subtask, cuda=True):
        '''Evaluator to evaluate prediction performance.

        Args:
          model: object
          data_generator: object
          subtask: 'a' | 'b' | 'c'
          cuda: bool
        '''

        self.model = model
        self.data_generator = data_generator
        self.subtask = subtask
        self.cuda = cuda

        self.frames_per_second = config.frames_per_second
        self.labels = config.labels
        self.in_domain_classes_num = len(config.labels) - 1
        self.all_classes_num = len(config.labels) - 1
        self.idx_to_lb = config.idx_to_lb
        self.lb_to_idx = config.lb_to_idx

    def evaluate(self, data_type, source, max_iteration=None, verbose=False):
        '''Evaluate the performance.

        Args:
          data_type: 'train' | 'validate'
          source: 'a' | 'b' | 'c' | 's1' | 's2' | 's3'
          max_iteration: None | int, maximum iteration to run to speed up evaluation
          verbose: bool
        '''

        generate_func = self.data_generator.generate_validate(
            data_type=data_type,
            source=source,
            max_iteration=max_iteration)

        # Forward
        output_dict = forward(
            model=self.model,
            generate_func=generate_func,
            cuda=self.cuda,
            return_target=True)

        if output_dict['output'].ndim == 2:    # Single-scale models
            output = output_dict['output']    # (audios_num, in_domain_classes_num)
            target = output_dict['target']    # (audios_num, in_domain_classes_num)
            loss = output_dict['loss']
            prob = np.exp(output)
            # Evaluate
            y_true = np.argmax(target, axis=-1)
            y_pred = np.argmax(prob, axis=-1)
            confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=np.arange(self.in_domain_classes_num))
            classwise_accuracy = np.diag(confusion_matrix) \
                / np.sum(confusion_matrix, axis=-1)
            logging.info('Single-Classifier:')
            logging.info('Data type: {}'.format(data_type))
            logging.info('    Average accuracy: {:.3f}'.format(np.mean(classwise_accuracy)))
            logging.info('    Log loss: {:.3f}'.format(log_loss(y_true, loss)))
        else:
            for i in range(output_dict['output'].shape[1] - 1):
                output = output_dict['output'][:, i, :]    # (audios_num, in_domain_classes_num)
                target = output_dict['target']    # (audios_num, in_domain_classes_num)
                loss = output_dict['loss'][:, i, :]
                prob = np.exp(output)
                # Evaluate
                y_true = np.argmax(target, axis=-1)
                y_pred = np.argmax(prob, axis=-1)
                confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=np.arange(self.in_domain_classes_num))
                classwise_accuracy = np.diag(confusion_matrix) \
                    / np.sum(confusion_matrix, axis=-1)
                logging.info('Scale' + str(i + 1) + '-Classifier:')
                logging.info('Data type: {}'.format(data_type))
                logging.info('    Average accuracy: {:.3f}'.format(np.mean(classwise_accuracy)))
                logging.info('    Log loss: {:.3f}'.format(log_loss(y_true, loss)))

            output = output_dict['output'][:, -1, :]    # (audios_num, in_domain_classes_num)
            target = output_dict['target']    # (audios_num, in_domain_classes_num)
            loss = output_dict['loss'][:, -1, :]
            prob = np.exp(output)
            # Evaluate
            y_true = np.argmax(target, axis=-1)
            y_pred = np.argmax(prob, axis=-1)
            confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=np.arange(self.in_domain_classes_num))
            classwise_accuracy = np.diag(confusion_matrix) \
                / np.sum(confusion_matrix, axis=-1)
            logging.info('Global-Classifier:')
            logging.info('Data type: {}'.format(data_type))
            logging.info('    Average accuracy: {:.3f}'.format(np.mean(classwise_accuracy)))
            logging.info('    Log loss: {:.3f}'.format(log_loss(y_true, loss)))


        if verbose:
            classes_num = len(classwise_accuracy)
            for n in range(classes_num):
                logging.info('{:<20}{:.3f}'.format(self.labels[n],
                    classwise_accuracy[n]))

            logging.info(confusion_matrix)

        statistics = {
            'accuracy': classwise_accuracy,
            'confusion_matrix': confusion_matrix}

        return statistics

    def visualize(self, data_type, source, max_iteration=None):
        '''Visualize log mel spectrograms of different sound classes.

        Args:
          data_type: 'train' | 'validate'
          source: 'a' | 'b' | 'c' | 's1' | 's2' | 's3'
          max_iteration: None | int, maximum iteration to run to speed up evaluation
        '''
        mel_bins = config.mel_bins
        audio_duration = config.audio_duration
        frames_num = config.frames_num
        labels = config.labels
        in_domain_classes_num = len(config.labels) - 1
        idx_to_lb = config.idx_to_lb

        generate_func = self.data_generator.generate_validate(
            data_type=data_type,
            source=source,
            max_iteration=max_iteration)

        # Forward
        output_dict = forward(
            model=self.model,
            generate_func=generate_func,
            cuda=self.cuda,
            return_input=True,
            return_target=True)

        # Plot log mel spectrograms of different sound classes
        rows_num = 3
        cols_num = 4

        fig, axs = plt.subplots(rows_num, cols_num, figsize=(10, 5))

        for k in range(in_domain_classes_num):
            for n, audio_name in enumerate(output_dict['audio_name']):
                if output_dict['target'][n, k] == 1:
                    title = idx_to_lb[k]
                    row = k // cols_num
                    col = k % cols_num
                    axs[row, col].set_title(title, color='r')
                    logmel = inverse_scale(output_dict['feature'][n],
                        self.data_generator.scalar['mean'],
                        self.data_generator.scalar['std'])
                    axs[row, col].matshow(logmel.T, origin='lower', aspect='auto', cmap='jet')
                    axs[row, col].set_xticks([0, frames_num])
                    axs[row, col].set_xticklabels(['0', '{:.1f} s'.format(audio_duration)])
                    axs[row, col].xaxis.set_ticks_position('bottom')
                    axs[row, col].set_ylabel('Mel bins')
                    axs[row, col].set_yticks([])
                    break

        for k in range(in_domain_classes_num, rows_num * cols_num):
            row = k // cols_num
            col = k % cols_num
            axs[row, col].set_visible(False)

        fig.tight_layout(pad=0, w_pad=0, h_pad=0)
        plt.show()


class StatisticsContainer(object):
    def __init__(self, statistics_path):
        '''Container of statistics during training.

        Args:
          statistics_path: string, path to write out
        '''
        self.statistics_path = statistics_path

        self.backup_statistics_path = '{}_{}.pickle'.format(
            os.path.splitext(self.statistics_path)[0],
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

        # Statistics of devices 'a', 'b', 'c', 's1', 's2' and 's3'
        self.statistics_dict = {'a': [], 'b': [], 'c': [], 's1': [], 's2': [], 's3': []}

    def append_and_dump(self, iteration, source, statistics):
        '''Append statistics to the container and dump the container.

        Args:
          iteration: int
          source: 'a' | 'b' | 'c' | 's1' | 's2' | 's3', device
          statistics: dict of statistics
        '''
        statistics['iteration'] = iteration
        self.statistics_dict[source].append(statistics)

        cPickle.dump(self.statistics_dict, open(self.statistics_path, 'wb'))
        cPickle.dump(self.statistics_dict, open(self.backup_statistics_path, 'wb'))
        logging.info('    Dump statistics to {}'.format(self.statistics_path))
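
# Hedged usage sketch (added for illustration; not in the original file):
# append one statistics dict and dump it to a temporary path.
if __name__ == '__main__':
    import tempfile
    container = StatisticsContainer(
        os.path.join(tempfile.gettempdir(), 'validate_statistics.pickle'))
    container.append_and_dump(iteration=200, source='a',
        statistics={'accuracy': np.ones(10), 'confusion_matrix': np.eye(10)})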
--------------------------------------------------------------------------------
/utils/features.py:
--------------------------------------------------------------------------------
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], 'utils'))
import numpy as np
import argparse
import h5py
import librosa
from scipy import signal
import matplotlib.pyplot as plt
import time
import math
import pandas as pd
import random
from gtg_example import gtg_in_dB
from utilities import (create_folder, read_audio, calculate_scalar_of_tensor,
    pad_truncate_sequence, get_subdir, read_metadata, read_audio_gamm)
import config


class LogMelExtractor(object):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax):
        '''Log mel feature extractor.

        Args:
          sample_rate: int
          window_size: int
          hop_size: int
          mel_bins: int
          fmin: int, minimum frequency of mel filter banks
          fmax: int, maximum frequency of mel filter banks
        '''

        self.window_size = window_size
        self.hop_size = hop_size
        self.window_func = np.hanning(window_size)

        self.melW = librosa.filters.mel(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax).T
        '''(n_fft // 2 + 1, mel_bins)'''

    def transform(self, audio):
        '''Extract features of a single-channel audio clip.

        Args:
          audio: (samples,)

        Returns:
          feature: (frames_num, freq_bins)
        '''

        window_size = self.window_size
        hop_size = self.hop_size
        window_func = self.window_func

        # Compute short-time Fourier transform
        stft_matrix = librosa.core.stft(
            y=audio,
            n_fft=window_size,
            hop_length=hop_size,
            window=window_func,
            center=True,
            dtype=np.complex64,
            pad_mode='reflect').T
        '''(N, n_fft // 2 + 1)'''

        # Mel spectrogram
        mel_spectrogram = np.dot(np.abs(stft_matrix) ** 2, self.melW)

        # Log mel spectrogram
        logmel_spectrogram = librosa.core.power_to_db(
            mel_spectrogram, ref=1.0, amin=1e-10,
            top_db=None)

        logmel_spectrogram = logmel_spectrogram.astype(np.float32)

        return logmel_spectrogram
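
def _demo_logmel_shape():
    '''Hedged shape check (added for illustration; not in the original file):
    one second of noise at the config sample rate yields roughly
    sample_rate / hop_size frames.'''
    extractor = LogMelExtractor(
        sample_rate=config.sample_rate,
        window_size=config.window_size,
        hop_size=config.hop_size,
        mel_bins=config.mel_bins,
        fmin=config.fmin,
        fmax=config.fmax)
    logmel = extractor.transform(np.random.randn(config.sample_rate).astype(np.float32))
    return logmel.shape    # (87, 40) with the default config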

def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write the features out to a hdf5 file.

    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    mfcc_frames = config.mfcc_frames
    n_mfcc = config.n_mfcc
    mfcc_hop_size = config.mfcc_hop_size
    gamm_frames = config.gamm_frames
    n_gamm = config.n_gamm

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 'test.csv')
    elif data_type == 'evaluation':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 'fold1_test.csv')
    else:
        raise Exception('Incorrect data_type!')

    feature_path = os.path.join(workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(
            name='scene_label',
            data=[scene_label.encode() for scene_label in meta_dict['scene_label']],
            dtype='S24')

    if 'identifier' in meta_dict.keys():
        hf.create_dataset(
            name='identifier',
            data=[identifier.encode() for identifier in meta_dict['identifier']],
            dtype='S24')

    if 'source_label' in meta_dict.keys():
        hf.create_dataset(
            name='source_label',
            data=[source_label.encode() for source_label in meta_dict['source_label']],
            dtype='S8')

    hf.create_dataset(
        name='feature',
        shape=(0, total_samples),
        maxshape=(None, total_samples),
        dtype=np.float32)
    hf.create_dataset(
        name='feature_gamm',
        shape=(0, gamm_frames, n_gamm),
        maxshape=(None, gamm_frames, n_gamm),
        dtype=np.float32)
    hf.create_dataset(
        name='feature_mfcc',
        shape=(0, mfcc_frames, n_mfcc),
        maxshape=(None, mfcc_frames, n_mfcc),
        dtype=np.float32)
    hf.create_dataset(
        name='feature_panns',
        shape=(0, 320000),
        maxshape=(None, 320000),
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(
            audio_path=audio_path,
            target_fs=sample_rate)
        audio = audio[:sample_rate*10]
        (audio_gamm, _) = read_audio_gamm(
            audio_path=audio_path,
            target_fs=sample_rate)
        fea_gamm, _ = gtg_in_dB(audio_gamm, sample_rate)
        fea_gamm = fea_gamm.transpose(1, 0)
        sound, fs = librosa.load(audio_path)
        fea_mfcc = librosa.feature.mfcc(y=sound, sr=fs, hop_length=mfcc_hop_size, n_mfcc=n_mfcc)
        fea_mfcc = fea_mfcc.transpose(1, 0)
        (waveform, _) = librosa.core.load(audio_path, sr=32000, mono=True)
        waveform = waveform[:320000]

        hf['feature'].resize((n + 1, total_samples))
        hf['feature'][n] = audio
        hf['feature_gamm'].resize((n + 1, gamm_frames, n_gamm))
        hf['feature_gamm'][n] = fea_gamm
        hf['feature_mfcc'].resize((n + 1, mfcc_frames, n_mfcc))
        hf['feature_mfcc'][n] = fea_mfcc
        hf['feature_panns'].resize((n + 1, 320000))
        hf['feature_panns'][n] = waveform

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
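
def _demo_hdf5_append(hdf5_path, frames_num=100, bins_num=64):
    '''Hedged sketch (added for illustration; not in the original file) of the
    hdf5 append pattern used above: create a dataset with zero rows and a
    None maxshape, then resize by one row per audio clip.'''
    with h5py.File(hdf5_path, 'w') as hf:
        hf.create_dataset(name='feature', shape=(0, frames_num, bins_num),
            maxshape=(None, frames_num, bins_num), dtype=np.float32)
        for n in range(3):
            hf['feature'].resize((n + 1, frames_num, bins_num))
            hf['feature'][n] = np.random.randn(frames_num, bins_num)
        return hf['feature'].shape    # (3, frames_num, bins_num)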

def calculate_scalar(args):
    '''Calculate and write out the scalar (mean and std) of features.

    Args:
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'train'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    mel_bins = config.mel_bins
    frames_per_second = config.frames_per_second

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)

    feature_path = os.path.join(workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))

    scalar_path = os.path.join(workspace, 'scalars',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(scalar_path))

    # Load data
    load_time = time.time()

    with h5py.File(feature_path, 'r') as hf:
        features = hf['feature'][:]
        features_gamm = hf['feature_gamm'][:]
        features_mfcc = hf['feature_mfcc'][:]
        features_panns = hf['feature_panns'][:]

    # Calculate scalars
    features = np.concatenate(features[None, :], axis=0)
    (mean, std) = calculate_scalar_of_tensor(features)
    features_gamm = np.concatenate(features_gamm, axis=0)
    (mean_gamm, std_gamm) = calculate_scalar_of_tensor(features_gamm)
    features_mfcc = np.concatenate(features_mfcc, axis=0)
    (mean_mfcc, std_mfcc) = calculate_scalar_of_tensor(features_mfcc)
    features_panns = np.concatenate(features_panns[None, :], axis=0)
    (mean_panns, std_panns) = calculate_scalar_of_tensor(features_panns)

    with h5py.File(scalar_path, 'w') as hf:
        hf.create_dataset('mean', data=mean, dtype=np.float32)
        hf.create_dataset('std', data=std, dtype=np.float32)
        hf.create_dataset('mean_gamm', data=mean_gamm, dtype=np.float32)
        hf.create_dataset('std_gamm', data=std_gamm, dtype=np.float32)
        hf.create_dataset('mean_mfcc', data=mean_mfcc, dtype=np.float32)
        hf.create_dataset('std_mfcc', data=std_mfcc, dtype=np.float32)
        hf.create_dataset('mean_panns', data=mean_panns, dtype=np.float32)
        hf.create_dataset('std_panns', data=std_panns, dtype=np.float32)

    print('Write out scalar to {}'.format(scalar_path))
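
def _demo_scalar_shapes():
    '''Hedged example (added for illustration; not in the original file):
    calculate_scalar_of_tensor averages over all but the last axis, so a
    (frames, bins) input yields one mean and std per bin.'''
    mean, std = calculate_scalar_of_tensor(np.random.randn(1000, 64))
    return mean.shape, std.shape    # (64,), (64,)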

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='')
    subparsers = parser.add_subparsers(dest='mode')

    # Calculate features for all audio files
    parser_logmel = subparsers.add_parser('calculate_feature_for_all_audio_files')
    parser_logmel.add_argument('--dataset_dir', type=str, required=True, help='Directory of dataset.')
    parser_logmel.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.')
    parser_logmel.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Corresponds to the 3 subtasks in DCASE2019 Task1.')
    parser_logmel.add_argument('--data_type', type=str, choices=['development', 'leaderboard', 'evaluation'], required=True)
    parser_logmel.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.')

    # Calculate scalar
    parser_scalar = subparsers.add_parser('calculate_scalar')
    parser_scalar.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.')
    parser_scalar.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Corresponds to the 3 subtasks in DCASE2019 Task1.')
    parser_scalar.add_argument('--data_type', type=str, choices=['development', 'evaluation'], required=True)
    parser_scalar.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.')

    # Parse arguments
    args = parser.parse_args()

    if args.mode == 'calculate_feature_for_all_audio_files':
        calculate_feature_for_all_audio_files(args)

    elif args.mode == 'calculate_scalar':
        calculate_scalar(args)

    else:
        raise Exception('Incorrect arguments!')
--------------------------------------------------------------------------------
/utils/data_generator.py:
--------------------------------------------------------------------------------
import numpy as np
import h5py
import csv
import time
import logging
import os
#import glob
import matplotlib.pyplot as plt

from utilities import scale, read_metadata, sparse_to_categorical
import config


class Base(object):

    def __init__(self):
        '''Base class for data generators.
        '''
        pass

    def load_hdf5(self, hdf5_path):
        '''Load hdf5 file.

        Returns:
          data_dict: dict of data, e.g.:
            {'audio_name': np.array(['a.wav', 'b.wav', ...]),
             'feature': (audios_num, frames_num, mel_bins),
             'target': (audios_num,),
             ...}
        '''
        data_dict = {}

        with h5py.File(hdf5_path, 'r') as hf:
            data_dict['audio_name'] = np.array(
                [audio_name.decode() for audio_name in hf['audio_name'][:]])

            data_dict['feature'] = hf['feature'][:].astype(np.float32)
            data_dict['feature_gamm'] = hf['feature_gamm'][:].astype(np.float32)
            data_dict['feature_mfcc'] = hf['feature_mfcc'][:].astype(np.float32)
            data_dict['feature_panns'] = hf['feature_panns'][:].astype(np.float32)

            if 'scene_label' in hf.keys():
                data_dict['target'] = np.array(
                    [self.lb_to_idx[scene_label.decode()] \
                        for scene_label in hf['scene_label'][:]])

            if 'identifier' in hf.keys():
                data_dict['identifier'] = np.array(
                    [identifier.decode() for identifier in hf['identifier'][:]])

            if 'source_label' in hf.keys():
                data_dict['source_label'] = np.array(
                    [source_label.decode() \
                        for source_label in hf['source_label'][:]])

        return data_dict

    def transform(self, x):
        return scale(x, self.scalar['mean'], self.scalar['std'])

    def transform_gamm(self, x):
        return scale(x, self.scalar['mean_gamm'], self.scalar['std_gamm'])

    def transform_mfcc(self, x):
        return scale(x, self.scalar['mean_mfcc'], self.scalar['std_mfcc'])

    def transform_panns(self, x):
        return scale(x, self.scalar['mean_panns'], self.scalar['std_panns'])
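
# Hedged usage sketch (added for illustration; not in the original file):
# pytorch/main.py consumes generate_train() as an infinite iterator, roughly
# like this. `data_generator` is an initialized DataGenerator (defined below).
def _demo_training_loop(data_generator, train_steps=3):
    for iteration, batch_data_dict in enumerate(data_generator.generate_train()):
        print(iteration, batch_data_dict['feature'].shape,
            batch_data_dict['target'].shape)
        if iteration + 1 >= train_steps:
            break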
74 | 75 | Args: 76 | feature_hdf5_path: string, path of hdf5 feature file 77 | train_csv: string, path of train csv file 78 | validate_csv: string, path of validate csv file 79 | holdout_fold: set 1 for development and none for training 80 | on all data without validation 81 | scalar: object, containing mean and std value 82 | batch_size: int 83 | seed: int, random seed 84 | ''' 85 | 86 | self.scalar = scalar 87 | self.batch_size = batch_size 88 | self.random_state = np.random.RandomState(seed) 89 | 90 | # self.classes_num = classes_num 91 | self.in_domain_classes_num = len(config.labels) - 1 92 | self.all_classes_num = len(config.labels) - 1 93 | self.lb_to_idx = config.lb_to_idx 94 | self.idx_to_lb = config.idx_to_lb 95 | 96 | # Load training data 97 | load_time = time.time() 98 | 99 | self.data_dict = self.load_hdf5(feature_hdf5_path) 100 | 101 | train_meta = read_metadata(train_csv) 102 | validate_meta = read_metadata(validate_csv) 103 | 104 | self.train_audio_indexes = self.get_audio_indexes( 105 | train_meta, self.data_dict, holdout_fold, 'train') 106 | 107 | self.validate_audio_indexes = self.get_audio_indexes( 108 | validate_meta, self.data_dict, holdout_fold, 'validate') 109 | 110 | if holdout_fold == 'none': 111 | self.train_audio_indexes = np.concatenate( 112 | (self.train_audio_indexes, self.validate_audio_indexes), axis=0) 113 | 114 | self.validate_audio_indexes = np.array([]) 115 | 116 | logging.info('Load data time: {:.3f} s'.format(time.time() - load_time)) 117 | logging.info('Training audio num: {}'.format(len(self.train_audio_indexes))) 118 | logging.info('Validation audio num: {}'.format(len(self.validate_audio_indexes))) 119 | 120 | self.random_state.shuffle(self.train_audio_indexes) 121 | self.pointer = 0 122 | 123 | def get_audio_indexes(self, meta_data, data_dict, holdout_fold, data_type): 124 | '''Get train or validate indexes. 125 | ''' 126 | audio_indexes = [] 127 | 128 | for name in meta_data['audio_name']: 129 | loct = np.argwhere(data_dict['audio_name'] == name) 130 | 131 | if len(loct) > 0: 132 | index = loct[0, 0] 133 | label = self.idx_to_lb[self.data_dict['target'][index]] 134 | if holdout_fold != 'none': 135 | if data_type == 'train' and label != 'unknown': 136 | audio_indexes.append(index) 137 | elif data_type == 'validate': 138 | audio_indexes.append(index) 139 | else: 140 | if label != 'unknown': 141 | audio_indexes.append(index) 142 | 143 | return np.array(audio_indexes) 144 | 145 | def generate_train(self): 146 | '''Generate mini-batch data for training. 
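        Loops indefinitely, reshuffling the training indexes after each full
        pass; targets are one-hot vectors over the in-domain classes (the
        'unknown' label is excluded from training batches).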
147 | 148 | Returns: 149 | batch_data_dict: dict containing audio_name, feature and target 150 | ''' 151 | 152 | while True: 153 | # Reset pointer 154 | if self.pointer >= len(self.train_audio_indexes): 155 | self.pointer = 0 156 | self.random_state.shuffle(self.train_audio_indexes) 157 | 158 | # Get batch audio_indexes 159 | batch_audio_indexes = self.train_audio_indexes[ 160 | self.pointer: self.pointer + self.batch_size] 161 | 162 | self.pointer += self.batch_size 163 | 164 | batch_data_dict = {} 165 | 166 | batch_data_dict['audio_name'] = \ 167 | self.data_dict['audio_name'][batch_audio_indexes] 168 | 169 | batch_feature = self.data_dict['feature'][batch_audio_indexes] 170 | batch_feature = self.transform(batch_feature) 171 | batch_data_dict['feature'] = batch_feature 172 | 173 | batch_feature_gamm = self.data_dict['feature_gamm'][batch_audio_indexes] 174 | batch_feature_gamm = self.transform_gamm(batch_feature_gamm) 175 | batch_data_dict['feature_gamm'] = batch_feature_gamm 176 | 177 | batch_feature_mfcc = self.data_dict['feature_mfcc'][batch_audio_indexes] 178 | batch_feature_mfcc = self.transform_mfcc(batch_feature_mfcc) 179 | batch_data_dict['feature_mfcc'] = batch_feature_mfcc 180 | 181 | batch_feature_panns = self.data_dict['feature_panns'][batch_audio_indexes] 182 | batch_feature_panns = self.transform_panns(batch_feature_panns) 183 | batch_data_dict['feature_panns'] = batch_feature_panns 184 | 185 | sparse_target = self.data_dict['target'][batch_audio_indexes] 186 | batch_data_dict['target'] = sparse_to_categorical( 187 | sparse_target, self.in_domain_classes_num) 188 | 189 | yield batch_data_dict 190 | 191 | def get_source_indexes(self, indexes, data_dict, source): 192 | '''Get indexes of a specific source. 193 | ''' 194 | source_indexes = np.array([index for index in indexes \ 195 | if data_dict['source_label'][index] == source]) 196 | 197 | return source_indexes 198 | 199 | def generate_validate(self, data_type, source, max_iteration=None): 200 | '''Generate mini-batch data for validation. 
201 | 202 | Args: 203 | data_type: 'train' | 'validate' 204 | source: 'a' | 'b' | 'c' 205 | max_iteration: int, maximum number of mini-batches, to speed up validation 206 | 207 | Returns: 208 | batch_data_dict: dict containing audio_name, feature and target 209 | ''' 210 | 211 | batch_size = self.batch_size 212 | 213 | if data_type == 'train': 214 | audio_indexes = np.array(self.train_audio_indexes) 215 | elif data_type == 'validate': 216 | audio_indexes = np.array(self.validate_audio_indexes) 217 | else: 218 | raise Exception('Incorrect argument!') 219 | 220 | audio_indexes = self.get_source_indexes( 221 | audio_indexes, self.data_dict, source) 222 | 223 | iteration = 0 224 | pointer = 0 225 | 226 | while True: 227 | if iteration == max_iteration: 228 | break 229 | 230 | # Stop after one pass over the data 231 | if pointer >= len(audio_indexes): 232 | break 233 | 234 | # Get batch audio_indexes 235 | batch_audio_indexes = audio_indexes[pointer: pointer + batch_size] 236 | pointer += batch_size 237 | iteration += 1 238 | 239 | batch_data_dict = {} 240 | 241 | batch_data_dict['audio_name'] = \ 242 | self.data_dict['audio_name'][batch_audio_indexes] 243 | 244 | batch_feature = self.data_dict['feature'][batch_audio_indexes] 245 | batch_feature = self.transform(batch_feature) 246 | batch_data_dict['feature'] = batch_feature 247 | 248 | batch_feature_gamm = self.data_dict['feature_gamm'][batch_audio_indexes] 249 | batch_feature_gamm = self.transform_gamm(batch_feature_gamm) 250 | batch_data_dict['feature_gamm'] = batch_feature_gamm 251 | 252 | batch_feature_mfcc = self.data_dict['feature_mfcc'][batch_audio_indexes] 253 | batch_feature_mfcc = self.transform_mfcc(batch_feature_mfcc) 254 | batch_data_dict['feature_mfcc'] = batch_feature_mfcc 255 | 256 | batch_feature_panns = self.data_dict['feature_panns'][batch_audio_indexes] 257 | batch_feature_panns = self.transform_panns(batch_feature_panns) 258 | batch_data_dict['feature_panns'] = batch_feature_panns 259 | 260 | sparse_target = self.data_dict['target'][batch_audio_indexes] 261 | batch_data_dict['target'] = sparse_to_categorical( 262 | sparse_target, self.all_classes_num) 263 | 264 | yield batch_data_dict 265 | 266 | 267 | class EvaluationDataGenerator(Base): 268 | def __init__(self, feature_hdf5_path, scalar, batch_size, seed=1234): 269 | '''Data generator for evaluation. 270 | 271 | Args: 272 | feature_hdf5_path: string, path of hdf5 feature file 273 | scalar: object, containing mean and std value 274 | batch_size: int 275 | seed: int, random seed 276 | ''' 277 | self.scalar = scalar 278 | self.batch_size = batch_size 279 | 280 | self.data_dict = self.load_hdf5(feature_hdf5_path) 281 | 282 | def generate_evaluation(self, data_type, max_iteration=None): 283 | '''Generate mini-batch data for evaluation. 
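        Iterates over every clip in the hdf5 file; no targets are yielded
        (the evaluation clips are unlabeled here), and the data_type argument
        is accepted only for interface symmetry with generate_validate.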
284 | 285 | Args: 286 | data_type: 'train' | 'validate' 287 | max_iteration: int, maximum iteration to validate to speed up validation 288 | 289 | Returns: 290 | batch_data_dict: dict containing audio_name, feature and target 291 | ''' 292 | 293 | batch_size = self.batch_size 294 | audio_indexes = np.arange(len(self.data_dict['audio_name'])) 295 | 296 | iteration = 0 297 | pointer = 0 298 | 299 | while True: 300 | if iteration == max_iteration: 301 | break 302 | 303 | # Reset pointer 304 | if pointer >= len(audio_indexes): 305 | break 306 | 307 | # Get batch audio_indexes 308 | batch_audio_indexes = audio_indexes[pointer: pointer + batch_size] 309 | pointer += batch_size 310 | iteration += 1 311 | 312 | batch_data_dict = {} 313 | 314 | batch_data_dict['audio_name'] = \ 315 | self.data_dict['audio_name'][batch_audio_indexes] 316 | 317 | batch_feature = self.data_dict['feature'][batch_audio_indexes] 318 | batch_feature = self.transform(batch_feature) 319 | batch_data_dict['feature'] = batch_feature 320 | 321 | batch_feature_gamm = self.data_dict['feature_gamm'][batch_audio_indexes] 322 | batch_feature_gamm = self.transform_gamm(batch_feature_gamm) 323 | batch_data_dict['feature_gamm'] = batch_feature_gamm 324 | 325 | batch_feature_panns = self.data_dict['feature_panns'][batch_audio_indexes] 326 | batch_feature_panns = self.transform_panns(batch_feature_panns) 327 | batch_data_dict['feature_panns'] = batch_feature_panns 328 | 329 | batch_feature_mfcc = self.data_dict['feature_mfcc'][batch_audio_indexes] 330 | batch_feature_mfcc = self.transform_mfcc(batch_feature_mfcc) 331 | batch_data_dict['feature_mfcc'] = batch_feature_mfcc 332 | 333 | yield batch_data_dict 334 | 335 | def transform(self, x): 336 | return scale(x, self.scalar['mean'], self.scalar['std']) 337 | def transform_gamm(self, x): 338 | return scale(x, self.scalar['mean_gamm'], self.scalar['std_gamm']) 339 | def transform_mfcc(self, x): 340 | return scale(x, self.scalar['mean_mfcc'], self.scalar['std_mfcc']) 341 | def transform_panns(self, x): 342 | return scale(x, self.scalar['mean_panns'], self.scalar['std_panns']) 343 | -------------------------------------------------------------------------------- /pytorch/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(1, os.path.join(sys.path[0], '../utils')) 4 | import numpy as np 5 | import argparse 6 | import h5py 7 | import math 8 | import time 9 | import logging 10 | import matplotlib.pyplot as plt 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | 16 | from utilities import (create_folder, get_filename, create_logging, load_scalar, 17 | get_subdir, get_sources, write_submission, mixup_data, mixup_criterion) 18 | from data_generator import DataGenerator, EvaluationDataGenerator 19 | from models import * 20 | from losses import nll_loss 21 | from evaluate import Evaluator, StatisticsContainer 22 | from pytorch_utils import move_data_to_gpu, forward 23 | import config 24 | 25 | 26 | def train(args): 27 | '''Training. Model will be saved after several iterations. 28 | 29 | Args: 30 | dataset_dir: string, directory of dataset 31 | workspace: string, directory of workspace 32 | subtask: 'a' | 'b' | 'c', corresponds to 3 subtasks in DCASE2019 Task1 33 | data_type: 'development' | 'evaluation' 34 | holdout_fold: '1' | 'none', set 1 for development and none for training 35 | on all data without validation 36 | model_type: string, e.g. 
'Cnn_9layers_AvgPooling' 37 | batch_size: int 38 | cuda: bool 39 | mini_data: bool, set True for debugging on a small part of data 40 | ''' 41 | 42 | # Arugments & parameters 43 | dataset_dir = args.dataset_dir 44 | workspace = args.workspace 45 | subtask = args.subtask 46 | data_type = args.data_type 47 | holdout_fold = args.holdout_fold 48 | model_type = args.model_type 49 | batch_size = args.batch_size 50 | cuda = args.cuda and torch.cuda.is_available() 51 | mini_data = args.mini_data 52 | filename = args.filename 53 | ite_train = args.ite_train 54 | ite_eva = args.ite_eva 55 | ite_store = args.ite_store 56 | 57 | mel_bins = config.mel_bins 58 | frames_per_second = config.frames_per_second 59 | max_iteration = None # Number of mini-batches to evaluate on training data 60 | reduce_lr = True 61 | 62 | sources_to_evaluate = get_sources(subtask) 63 | in_domain_classes_num = len(config.labels) - 1 64 | 65 | # Paths 66 | if mini_data: 67 | prefix = 'minidata_' 68 | else: 69 | prefix = '' 70 | 71 | sub_dir = get_subdir(subtask, data_type) 72 | 73 | train_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 74 | 'fold1_train.csv') 75 | 76 | validate_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 77 | 'fold1_evaluate.csv') 78 | 79 | feature_hdf5_path = os.path.join(workspace, 'features', 80 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 81 | '{}.h5'.format(sub_dir)) 82 | 83 | scalar_path = os.path.join(workspace, 'scalars', 84 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 85 | '{}.h5'.format(sub_dir)) 86 | 87 | checkpoints_dir = os.path.join(workspace, 'checkpoints', filename, 88 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 89 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 90 | model_type) 91 | create_folder(checkpoints_dir) 92 | 93 | validate_statistics_path = os.path.join(workspace, 'statistics', filename, 94 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 95 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 96 | model_type, 'validate_statistics.pickle') 97 | 98 | create_folder(os.path.dirname(validate_statistics_path)) 99 | 100 | logs_dir = os.path.join(workspace, 'logs', filename, args.mode, 101 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 102 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), model_type) 103 | create_logging(logs_dir, 'w') 104 | logging.info(args) 105 | 106 | if cuda: 107 | logging.info('Using GPU.') 108 | else: 109 | logging.info('Using CPU. 
Set --cuda flag to use GPU.') 110 | 111 | # Load scalar 112 | scalar = load_scalar(scalar_path) 113 | 114 | # Model 115 | Model = eval(model_type) 116 | 117 | if subtask in ['a', 'b']: 118 | model = Model(in_domain_classes_num, activation='logsoftmax') 119 | loss_func = nll_loss 120 | 121 | elif subtask == 'c': 122 | model = Model(in_domain_classes_num, activation='sigmoid') 123 | loss_func = F.binary_cross_entropy 124 | 125 | if cuda: 126 | model.cuda() 127 | 128 | optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), 129 | eps=1e-08, weight_decay=0., amsgrad=True) 130 | 131 | # Data generator 132 | data_generator = DataGenerator( 133 | feature_hdf5_path=feature_hdf5_path, 134 | train_csv=train_csv, 135 | validate_csv=validate_csv, 136 | holdout_fold=holdout_fold, 137 | scalar=scalar, 138 | batch_size=batch_size) 139 | 140 | # Evaluator 141 | evaluator = Evaluator( 142 | model=model, 143 | data_generator=data_generator, 144 | subtask=subtask, 145 | cuda=cuda) 146 | 147 | # Statistics 148 | validate_statistics_container = StatisticsContainer(validate_statistics_path) 149 | 150 | train_bgn_time = time.time() 151 | iteration = 0 152 | 153 | # Train on mini batches 154 | for batch_data_dict in data_generator.generate_train(): 155 | 156 | # Evaluate 157 | #1800 158 | if iteration % 200 == 0: 159 | print(iteration) 160 | if iteration % 200 == 0 and iteration > ite_eva: 161 | logging.info('------------------------------------') 162 | logging.info('Iteration: {}'.format(iteration)) 163 | 164 | train_fin_time = time.time() 165 | 166 | for source in sources_to_evaluate: 167 | train_statistics = evaluator.evaluate( 168 | data_type='train', 169 | source=source, 170 | max_iteration=None, 171 | verbose=False) 172 | 173 | if holdout_fold != 'none': 174 | for source in sources_to_evaluate: 175 | validate_statistics = evaluator.evaluate( 176 | data_type='validate', 177 | source=source, 178 | max_iteration=None, 179 | verbose=False) 180 | 181 | validate_statistics_container.append_and_dump( 182 | iteration, source, validate_statistics) 183 | 184 | train_time = train_fin_time - train_bgn_time 185 | validate_time = time.time() - train_fin_time 186 | 187 | logging.info( 188 | 'Train time: {:.3f} s, validate time: {:.3f} s' 189 | ''.format(train_time, validate_time)) 190 | 191 | train_bgn_time = time.time() 192 | 193 | # Save model 194 | if iteration % 200 == 0 and iteration > ite_store: 195 | checkpoint = { 196 | 'iteration': iteration, 197 | 'model': model.state_dict(), 198 | 'optimizer': optimizer.state_dict()} 199 | 200 | checkpoint_path = os.path.join( 201 | checkpoints_dir, '{}_iterations.pth'.format(iteration)) 202 | 203 | torch.save(checkpoint, checkpoint_path) 204 | logging.info('Model saved to {}'.format(checkpoint_path)) 205 | 206 | # Reduce learning rate 207 | if reduce_lr and iteration % 200 == 0 and iteration > 0: 208 | for param_group in optimizer.param_groups: 209 | param_group['lr'] *= 0.91 210 | 211 | # Move data to GPU 212 | for key in batch_data_dict.keys(): 213 | if key in ['feature', 'feature_gamm', 'feature_mfcc', 'feature_panns', 'target']: 214 | batch_data_dict[key] = move_data_to_gpu(batch_data_dict[key], cuda) 215 | 216 | # Train 217 | # batch_output,batch_loss = model(batch_data_dict['feature'], batch_data_dict['feature_gamm'], batch_data_dict['feature_mfcc'], batch_data_dict['feature_panns']) 218 | # loss = loss_func(batch_output, batch_data_dict['target']) 219 | 220 | # Using Mixup 221 | model.train() 222 | mixed_x1, mixed_x2, mixed_x3, mixed_x4, y_a, y_b, lam = 
mixup_data(x1=batch_data_dict['feature'], x2=batch_data_dict['feature_gamm'], x3=batch_data_dict['feature_mfcc'], x4=batch_data_dict['feature_panns'], y=batch_data_dict['target'], alpha=0.2) 223 | batch_output,batch_loss = model(mixed_x1, mixed_x2, mixed_x3, mixed_x4) 224 | 225 | if batch_output.shape[1] == 10: # single scale models 226 | loss = mixup_criterion(loss_func, batch_output, y_a, y_b, lam) 227 | else: # multi scale models 228 | losses = [] 229 | for ite in range(batch_output.shape[1]-1): 230 | loss = mixup_criterion(loss_func, batch_output[:,ite,:], y_a, y_b, lam) 231 | losses.append(loss) 232 | loss = sum(losses) 233 | 234 | # Backward 235 | optimizer.zero_grad() 236 | loss.backward() 237 | optimizer.step() 238 | 239 | # Stop learning 240 | # 12000 for scratch 241 | if iteration == ite_train: 242 | break 243 | 244 | iteration += 1 245 | 246 | 247 | def inference_validation(args): 248 | '''Inference and calculate metrics on validation data. 249 | 250 | Args: 251 | dataset_dir: string, directory of dataset 252 | subtask: 'a' | 'b' | 'c', corresponds to 3 subtasks in DCASE2019 Task1 253 | data_type: 'development' 254 | workspace: string, directory of workspace 255 | model_type: string, e.g. 'Cnn_9layers' 256 | iteration: int 257 | batch_size: int 258 | cuda: bool 259 | mini_data: bool, set True for debugging on a small part of data 260 | visualize: bool 261 | ''' 262 | # Arugments & parameters 263 | dataset_dir = args.dataset_dir 264 | subtask = args.subtask 265 | data_type = args.data_type 266 | workspace = args.workspace 267 | model_type = args.model_type 268 | holdout_fold = args.holdout_fold 269 | iteration = args.iteration 270 | batch_size = args.batch_size 271 | cuda = args.cuda and torch.cuda.is_available() 272 | mini_data = args.mini_data 273 | visualize = args.visualize 274 | filename = args.filename 275 | 276 | mel_bins = config.mel_bins 277 | frames_per_second = config.frames_per_second 278 | 279 | sources = get_sources(subtask) 280 | in_domain_classes_num = len(config.labels) - 1 281 | 282 | # Paths 283 | if mini_data: 284 | prefix = 'minidata_' 285 | else: 286 | prefix = '' 287 | 288 | sub_dir = get_subdir(subtask, data_type) 289 | 290 | train_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 291 | 'fold1_train.csv') 292 | 293 | validate_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 294 | 'fold1_evaluate.csv') 295 | 296 | feature_hdf5_path = os.path.join(workspace, 'features', 297 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 298 | '{}.h5'.format(sub_dir)) 299 | 300 | scalar_path = os.path.join(workspace, 'scalars', 301 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 302 | '{}.h5'.format(sub_dir)) 303 | 304 | checkpoint_path = os.path.join(workspace, 'checkpoints', filename, 305 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 306 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 307 | model_type, '{}_iterations.pth'.format(iteration)) 308 | 309 | logs_dir = os.path.join(workspace, 'logs', filename, args.mode, 310 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 311 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 312 | model_type) 313 | create_logging(logs_dir, 'w') 314 | logging.info(args) 315 | 316 | # Load scalar 317 | scalar = load_scalar(scalar_path) 318 | 319 | # Load model 320 | Model = eval(model_type) 321 | 322 | if subtask in ['a', 'b']: 323 | model = Model(in_domain_classes_num, 
activation='logsoftmax') 324 | loss_func = nll_loss 325 | 326 | elif subtask == 'c': 327 | model = Model(in_domain_classes_num, activation='sigmoid') 328 | loss_func = F.binary_cross_entropy 329 | 330 | checkpoint = torch.load(checkpoint_path) 331 | model.load_state_dict(checkpoint['model']) 332 | 333 | if cuda: 334 | model.cuda() 335 | 336 | # Data generator 337 | data_generator = DataGenerator( 338 | feature_hdf5_path=feature_hdf5_path, 339 | train_csv=train_csv, 340 | validate_csv=validate_csv, 341 | holdout_fold=holdout_fold, 342 | scalar=scalar, 343 | batch_size=batch_size) 344 | 345 | # Evaluator 346 | evaluator = Evaluator( 347 | model=model, 348 | data_generator=data_generator, 349 | subtask=subtask, 350 | cuda=cuda) 351 | 352 | if subtask in ['a', 'c']: 353 | evaluator.evaluate(data_type='validate', source='a', verbose=True) 354 | evaluator.evaluate(data_type='validate', source='b', verbose=True) 355 | evaluator.evaluate(data_type='validate', source='c', verbose=True) 356 | evaluator.evaluate(data_type='validate', source='s1', verbose=True) 357 | evaluator.evaluate(data_type='validate', source='s2', verbose=True) 358 | evaluator.evaluate(data_type='validate', source='s3', verbose=True) 359 | 360 | elif subtask == 'b': 361 | evaluator.evaluate(data_type='validate', source='a', verbose=True) 362 | evaluator.evaluate(data_type='validate', source='b', verbose=True) 363 | evaluator.evaluate(data_type='validate', source='c', verbose=True) 364 | 365 | # Visualize log mel spectrogram 366 | if visualize: 367 | evaluator.visualize(data_type='validate', source='a') 368 | 369 | 370 | 371 | if __name__ == '__main__': 372 | parser = argparse.ArgumentParser(description='Example of parser. ') 373 | subparsers = parser.add_subparsers(dest='mode') 374 | 375 | # Train 376 | parser_train = subparsers.add_parser('train') 377 | parser_train.add_argument('--dataset_dir', type=str, required=True, help='Directory of dataset.') 378 | parser_train.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.') 379 | parser_train.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Correspond to 3 subtasks in DCASE2019 Task1.') 380 | parser_train.add_argument('--data_type', type=str, choices=['development', 'evaluation'], required=True) 381 | parser_train.add_argument('--holdout_fold', type=str, choices=['1', 'none'], required=True, help='Set 1 for development and none for training on all data without validation.') 382 | parser_train.add_argument('--model_type', type=str, required=True, help='E.g., Cnn_9layers_AvgPooling.') 383 | parser_train.add_argument('--batch_size', type=int, required=True) 384 | parser_train.add_argument('--ite_train', type=int, required=True) 385 | parser_train.add_argument('--ite_eva', type=int, required=True) 386 | parser_train.add_argument('--ite_store', type=int, required=True) 387 | parser_train.add_argument('--cuda', action='store_true', default=False) 388 | parser_train.add_argument('--fixed', type=str, default=False) 389 | parser_train.add_argument('--finetune', type=str, default=False) 390 | parser_train.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.') 391 | 392 | # Inference validation data 393 | parser_inference_validation = subparsers.add_parser('inference_validation') 394 | parser_inference_validation.add_argument('--dataset_dir', type=str, required=True, help='Directory of dataset.') 395 | parser_inference_validation.add_argument('--workspace', 
type=str, required=True, help='Directory of your workspace.') 396 | parser_inference_validation.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Correspond to 3 subtasks in DCASE2019 Task1.') 397 | parser_inference_validation.add_argument('--data_type', type=str, choices=['development'], required=True) 398 | parser_inference_validation.add_argument('--holdout_fold', type=str, choices=['1'], required=True) 399 | parser_inference_validation.add_argument('--model_type', type=str, required=True, help='E.g., Cnn_9layers_AvgPooling.') 400 | parser_inference_validation.add_argument('--iteration', type=int, required=True, help='Load model of this iteration.') 401 | parser_inference_validation.add_argument('--batch_size', type=int, required=True) 402 | parser_inference_validation.add_argument('--cuda', action='store_true', default=False) 403 | parser_inference_validation.add_argument('--visualize', action='store_true', default=False, help='Visualize log mel spectrogram of different sound classes.') 404 | parser_inference_validation.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.') 405 | 406 | # Inference evaluation data 407 | parser_inference_validation = subparsers.add_parser('inference_evaluation') 408 | parser_inference_validation.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.') 409 | parser_inference_validation.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Correspond to 3 subtasks in DCASE2019 Task1.') 410 | parser_inference_validation.add_argument('--data_type', type=str, choices=['leaderboard', 'evaluation'], required=True) 411 | parser_inference_validation.add_argument('--model_type', type=str, required=True, help='E.g., Cnn_9layers_AvgPooling.') 412 | parser_inference_validation.add_argument('--iteration', type=int, required=True, help='Load model of this iteration.') 413 | parser_inference_validation.add_argument('--batch_size', type=int, required=True) 414 | parser_inference_validation.add_argument('--cuda', action='store_true', default=False) 415 | parser_inference_validation.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.') 416 | 417 | # Parse arguments 418 | args = parser.parse_args() 419 | args.filename = get_filename(__file__) 420 | 421 | if args.mode == 'train': 422 | train(args) 423 | 424 | elif args.mode == 'inference_validation': 425 | inference_validation(args) 426 | 427 | elif args.mode == 'inference_evaluation': 428 | inference_evaluation(args) 429 | 430 | else: 431 | raise Exception('Error argument!') 432 | -------------------------------------------------------------------------------- /pytorch/models.py: -------------------------------------------------------------------------------- 1 | import math 2 | from torch.autograd import Variable 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from nnAudio import Spectrogram as Spec 7 | from torchlibrosa.augmentation import SpecAugmentation 8 | import config 9 | 10 | def init_layer(layer, nonlinearity='leaky_relu'): 11 | """Initialize a Linear or Convolutional layer. """ 12 | nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity) 13 | 14 | if hasattr(layer, 'bias'): 15 | if layer.bias is not None: 16 | layer.bias.data.fill_(0.) 17 | 18 | 19 | def init_bn(bn): 20 | """Initialize a Batchnorm layer. """ 21 | 22 | bn.bias.data.fill_(0.) 
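    # The zeroed bias above, together with the zeroed running mean and unit
    # weight/variance set below, give the batch-norm layer a neutral starting
    # state at initialization.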
23 | bn.running_mean.data.fill_(0.) 24 | bn.weight.data.fill_(1.) 25 | bn.running_var.data.fill_(1.) 26 | 27 | def conv3x3(in_planes, out_planes, stride=1): 28 | """3x3 convolution with padding""" 29 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 30 | padding=1, bias=False) 31 | 32 | def conv1x1(in_planes, out_planes, stride=1): 33 | """1x1 convolution""" 34 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 35 | 36 | class ConvBlock(nn.Module): 37 | def __init__(self, in_channels, out_channels): 38 | 39 | super(ConvBlock, self).__init__() 40 | 41 | self.conv1 = nn.Conv2d(in_channels=in_channels, 42 | out_channels=out_channels, 43 | kernel_size=(3, 3), stride=(1, 1), 44 | padding=(1, 1), bias=False) 45 | 46 | self.conv2 = nn.Conv2d(in_channels=out_channels, 47 | out_channels=out_channels, 48 | kernel_size=(3, 3), stride=(1, 1), 49 | padding=(1, 1), bias=False) 50 | 51 | self.bn1 = nn.BatchNorm2d(out_channels) 52 | self.bn2 = nn.BatchNorm2d(out_channels) 53 | 54 | self.init_weights() 55 | 56 | def init_weights(self): 57 | 58 | init_layer(self.conv1) 59 | init_layer(self.conv2) 60 | init_bn(self.bn1) 61 | init_bn(self.bn2) 62 | 63 | def forward(self, input, pool_size=(2, 2), pool_type='avg'): 64 | 65 | x = input 66 | x = F.relu_(self.bn1(self.conv1(x))) 67 | x = F.relu_(self.bn2(self.conv2(x))) 68 | if pool_type == 'max': 69 | x = F.max_pool2d(x, kernel_size=pool_size) 70 | elif pool_type == 'avg': 71 | x = F.avg_pool2d(x, kernel_size=pool_size) 72 | else: 73 | raise Exception('Incorrect argument!') 74 | 75 | return x 76 | 77 | class Logmel_Cnn(nn.Module): 78 | 79 | def __init__(self, classes_num, activation): 80 | super(Logmel_Cnn, self).__init__() 81 | 82 | self.activation = activation 83 | self.spec_layer = Spec.MelSpectrogram(sr=config.sample_rate, n_fft=config.window_size, n_mels=config.mel_bins, hop_length=config.hop_size, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False) 84 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=5, freq_stripes_num=2) 85 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 86 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 87 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 88 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 89 | self.fc2 = nn.Linear(512, 512, bias=True) 90 | self.fc = nn.Linear(512, classes_num, bias=True) 91 | 92 | self.init_weights() 93 | 94 | def init_weights(self): 95 | 96 | init_layer(self.fc) 97 | init_layer(self.fc2) 98 | 99 | def forward(self, x1,x2,x3,x4): 100 | ''' 101 | Input: (batch_size, times_steps, freq_bins)''' 102 | 103 | x = x1 104 | x = self.spec_layer(x) 105 | x = 10*torch.log10(x) 106 | x = torch.clamp(x, min=-100.) 
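        # The two lines above convert the mel power spectrogram to decibels
        # (10*log10) and floor it at -100 dB; the clamp also maps the -inf
        # that log10(0) produces on silent frames back onto that floor.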
107 | x = x.transpose(1,2) 108 | x = x[:,None,:,:] 109 | #if self.training: 110 | #x = self.spec_augmenter(x) 111 | '''(batch_size, 1, times_steps, freq_bins)''' 112 | 113 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 114 | #x = F.dropout(x, p=0.2, training=self.training) 115 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 116 | #x = F.dropout(x, p=0.2, training=self.training) 117 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 118 | #x = F.dropout(x, p=0.2, training=self.training) 119 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 120 | #x = F.dropout(x, p=0.2, training=self.training) 121 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 122 | 123 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 124 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 125 | x = F.relu_(self.fc2(x)) 126 | x = F.dropout(x, p=0.5, training=self.training) 127 | x = self.fc(x) 128 | loss = torch.softmax(x,-1) 129 | if self.activation == 'logsoftmax': 130 | output = F.log_softmax(x, dim=-1) 131 | 132 | elif self.activation == 'sigmoid': 133 | output = torch.sigmoid(x) 134 | 135 | return output,loss 136 | 137 | class Cqt_Cnn(nn.Module): 138 | 139 | def __init__(self, classes_num, activation): 140 | super(Cqt_Cnn, self).__init__() 141 | 142 | self.activation = activation 143 | self.spec_layer = Spec.CQT(sr=config.sample_rate, hop_length=config.hop_size, fmin=220, fmax=None, n_bins=64, bins_per_octave=12, norm=1, window='hann', center=True, pad_mode='reflect') 144 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=5, freq_stripes_num=2) 145 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 146 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 147 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 148 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 149 | self.fc2 = nn.Linear(512, 512, bias=True) 150 | self.fc = nn.Linear(512, classes_num, bias=True) 151 | 152 | self.init_weights() 153 | 154 | def init_weights(self): 155 | 156 | init_layer(self.fc) 157 | init_layer(self.fc2) 158 | 159 | def forward(self, x1,x2,x3,x4): 160 | ''' 161 | Input: (batch_size, times_steps, freq_bins)''' 162 | 163 | x = x1 164 | x = self.spec_layer(x) 165 | x = 10*torch.log10(x) 166 | x = torch.clamp(x, min=-100.) 
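        # Shape note (assuming nnAudio returns (batch_size, freq_bins, time_steps)):
        # the transpose below gives (batch_size, time_steps, freq_bins), and the
        # None index then adds the channel axis that the conv blocks expect.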
167 | x = x.transpose(1,2) 168 | x = x[:,None,:,:] 169 | #if self.training: 170 | #x = self.spec_augmenter(x) 171 | '''(batch_size, 1, times_steps, freq_bins)''' 172 | 173 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 174 | # x = F.dropout(x, p=0.2, training=self.training) 175 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 176 | # x = F.dropout(x, p=0.2, training=self.training) 177 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 178 | # x = F.dropout(x, p=0.2, training=self.training) 179 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 180 | # x = F.dropout(x, p=0.2, training=self.training) 181 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 182 | 183 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 184 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 185 | x = F.relu_(self.fc2(x)) 186 | x = F.dropout(x, p=0.5, training=self.training) 187 | x = self.fc(x) 188 | loss = torch.softmax(x,-1) 189 | if self.activation == 'logsoftmax': 190 | output = F.log_softmax(x, dim=-1) 191 | 192 | elif self.activation == 'sigmoid': 193 | output = torch.sigmoid(x) 194 | 195 | return output,loss 196 | 197 | class Gamm_Cnn(nn.Module): 198 | 199 | def __init__(self, classes_num, activation): 200 | super(Gamm_Cnn, self).__init__() 201 | 202 | self.activation = activation 203 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=5, freq_stripes_num=2) 204 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 205 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 206 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 207 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 208 | self.fc2 = nn.Linear(512, 512, bias=True) 209 | self.fc = nn.Linear(512, classes_num, bias=True) 210 | 211 | self.init_weights() 212 | 213 | def init_weights(self): 214 | 215 | init_layer(self.fc) 216 | init_layer(self.fc2) 217 | 218 | def forward(self, x1,x2,x3,x4): 219 | ''' 220 | Input: (batch_size, times_steps, freq_bins)''' 221 | 222 | x = x2 223 | x = x[:,None,:,:] 224 | #if self.training: 225 | #x = self.spec_augmenter(x) 226 | '''(batch_size, 1, times_steps, freq_bins)''' 227 | 228 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 229 | # x = F.dropout(x, p=0.2, training=self.training) 230 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 231 | # x = F.dropout(x, p=0.2, training=self.training) 232 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 233 | # x = F.dropout(x, p=0.2, training=self.training) 234 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 235 | # x = F.dropout(x, p=0.2, training=self.training) 236 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 237 | 238 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 239 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 240 | x = F.relu_(self.fc2(x)) 241 | x = F.dropout(x, p=0.5, training=self.training) 242 | x = self.fc(x) 243 | loss = torch.softmax(x,-1) 244 | if self.activation == 'logsoftmax': 245 | output = F.log_softmax(x, dim=-1) 246 | 247 | elif self.activation == 'sigmoid': 248 | output = torch.sigmoid(x) 249 | 250 | return output,loss 251 | 252 | class Mfcc_Cnn(nn.Module): 253 | 254 | def __init__(self, classes_num, activation): 255 | super(Mfcc_Cnn, self).__init__() 256 | 257 | self.activation = activation 258 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
freq_drop_width=5, freq_stripes_num=2) 259 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 260 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 261 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 262 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 263 | self.fc2 = nn.Linear(512, 512, bias=True) 264 | self.fc = nn.Linear(512, classes_num, bias=True) 265 | 266 | self.init_weights() 267 | 268 | def init_weights(self): 269 | 270 | init_layer(self.fc) 271 | init_layer(self.fc2) 272 | 273 | def forward(self, x1,x2,x3,x4): 274 | ''' 275 | Input: (batch_size, times_steps, freq_bins)''' 276 | 277 | x = x3 278 | x = x[:,None,:,:] 279 | #if self.training: 280 | #x = self.spec_augmenter(x) 281 | '''(batch_size, 1, times_steps, freq_bins)''' 282 | 283 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 284 | # x = F.dropout(x, p=0.2, training=self.training) 285 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 286 | # x = F.dropout(x, p=0.2, training=self.training) 287 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 288 | # x = F.dropout(x, p=0.2, training=self.training) 289 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 290 | # x = F.dropout(x, p=0.2, training=self.training) 291 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 292 | 293 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 294 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 295 | x = F.relu_(self.fc2(x)) 296 | x = F.dropout(x, p=0.5, training=self.training) 297 | x = self.fc(x) 298 | loss = torch.softmax(x,-1) 299 | if self.activation == 'logsoftmax': 300 | output = F.log_softmax(x, dim=-1) 301 | 302 | elif self.activation == 'sigmoid': 303 | output = torch.sigmoid(x) 304 | 305 | return output,loss 306 | 307 | class DCMR(nn.Module): 308 | 309 | def __init__(self, classes_num, activation): 310 | super(DCMR, self).__init__() 311 | 312 | self.activation = activation 313 | self.logmel_cnn = Logmel_Cnn(classes_num, activation) 314 | self.cqt_cnn = Cqt_Cnn(classes_num, activation) 315 | self.gamm_cnn = Gamm_Cnn(classes_num, activation) 316 | self.mfcc_cnn = Mfcc_Cnn(classes_num, activation) 317 | self.ite = 15000 # set your pre-trained model iteration 318 | self.model_path = 'workspace/checkpoints/main_eva/logmel_86frames_40melbins/TAU-urban-acoustic-scenes-2020-mobile-development/holdout_fold=1/' # set your model saved path. 
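        # DCMR late-fuses the four single-feature CNNs defined above:
        # init_weights() loads a '{model}/{ite}_iterations.pth' checkpoint for
        # each of them from model_path, so Logmel_Cnn, Cqt_Cnn, Gamm_Cnn and
        # Mfcc_Cnn must be trained before this model is instantiated.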
319 | self.init_weights() 320 | 321 | def init_weights(self): 322 | logmel_path = self.model_path +'Logmel_Cnn/'+str(self.ite)+'_iterations.pth' 323 | cqt_path = self.model_path +'Cqt_Cnn/'+str(self.ite)+'_iterations.pth' 324 | gamm_path = self.model_path +'Gamm_Cnn/'+str(self.ite)+'_iterations.pth' 325 | mfcc_path = self.model_path +'Mfcc_Cnn/'+str(self.ite)+'_iterations.pth' 326 | 327 | logmel_ch = torch.load(logmel_path) 328 | cqt_ch = torch.load(cqt_path) 329 | gamm_ch = torch.load(gamm_path) 330 | mfcc_ch = torch.load(mfcc_path) 331 | self.logmel_cnn.load_state_dict(logmel_ch['model']) 332 | self.cqt_cnn.load_state_dict(cqt_ch['model']) 333 | self.gamm_cnn.load_state_dict(gamm_ch['model']) 334 | self.mfcc_cnn.load_state_dict(mfcc_ch['model']) 335 | 336 | def forward(self, input1,input2,input3,input4): 337 | ''' 338 | Input: (batch_size, total_frames)''' 339 | 340 | x1,loss1 = self.logmel_cnn(input1,input2,input3,input4) 341 | x2,loss2 = self.cqt_cnn(input1,input2,input3,input4) 342 | x3,loss3 = self.gamm_cnn(input1,input2,input3,input4) 343 | x4,loss4 = self.mfcc_cnn(input1,input2,input3,input4) 344 | x = x1 + x2 + x3 + x4 345 | loss = (loss1+loss2+loss3+loss4)/4. 346 | return x,loss 347 | 348 | class Logmel_DCMF(nn.Module): 349 | 350 | def __init__(self, classes_num, activation): 351 | super(Logmel_DCMF, self).__init__() 352 | 353 | self.activation = activation 354 | self.spec_layer = Spec.MelSpectrogram(sr=config.sample_rate, n_fft=config.window_size, n_mels=config.mel_bins, hop_length=config.hop_size, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False) 355 | self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64) 356 | self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128) 357 | self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256) 358 | self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512) 359 | 360 | self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64) 361 | self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128) 362 | self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256) 363 | self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512) 364 | 365 | self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64) 366 | self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128) 367 | self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256) 368 | self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512) 369 | 370 | self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64) 371 | self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128) 372 | self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256) 373 | self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512) 374 | 375 | 376 | self.fc1_1 = nn.Linear(512, 512, bias=True) 377 | self.dropout1 = nn.Dropout(p=0.5) 378 | self.fc1_2 = nn.Linear(512, classes_num, bias=True) 379 | 380 | self.fc2_1 = nn.Linear(512, 512, bias=True) 381 | self.dropout2 = nn.Dropout(p=0.5) 382 | self.fc2_2 = nn.Linear(512, classes_num, bias=True) 383 | 384 | self.fc3_1 = nn.Linear(512, 512, bias=True) 385 | self.dropout3 = nn.Dropout(p=0.5) 386 | self.fc3_2 = nn.Linear(512, classes_num, bias=True) 387 | 388 | self.fc4_1 = nn.Linear(512, 512, bias=True) 389 | self.dropout4 = nn.Dropout(p=0.5) 390 | self.fc4_2 = nn.Linear(512, classes_num, bias=True) 391 | 392 | self.init_weights() 393 | 394 | def init_weights(self): 395 | 396 | init_layer(self.fc1_1) 397 | 
init_layer(self.fc1_2) 398 | init_layer(self.fc2_1) 399 | init_layer(self.fc2_2) 400 | init_layer(self.fc3_1) 401 | init_layer(self.fc3_2) 402 | init_layer(self.fc4_1) 403 | init_layer(self.fc4_2) 404 | 405 | def forward(self, x1,x2,x3,x4): 406 | ''' 407 | Input: (batch_size, scale_num, total_frames)''' 408 | outputs = [] 409 | 410 | x = x1 411 | x = self.spec_layer(x) 412 | x = 10*torch.log10(x) 413 | x = torch.clamp(x, min=-100.) 414 | x = x.transpose(1,2) 415 | x = x[:,None,:,:] 416 | '''(batch_size, 1, time_steps, freq_bins)''' 417 | temp = x.clone() 418 | count = 0. 419 | 420 | x = temp[:,:,:,0:16] 421 | x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg') 422 | x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg') 423 | x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg') 424 | x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg') 425 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 426 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 427 | x = F.relu_(self.fc1_1(x)) 428 | x = self.dropout1(x) 429 | x = self.fc1_2(x) 430 | x = x.view(x.shape[0], 1, x.shape[1]) 431 | co_output = x 432 | count += 1. 433 | outputs.append(x) 434 | 435 | 436 | x = temp[:,:,:,8:24] 437 | x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg') 438 | x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg') 439 | x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg') 440 | x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg') 441 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 442 | x = F.relu_(self.fc2_1(x)) 443 | x = self.dropout2(x) 444 | x = self.fc2_2(x) 445 | x = x.view(x.shape[0], 1, x.shape[1]) 446 | co_output = x + co_output 447 | count += 1. 448 | outputs.append(x) 449 | 450 | x = temp[:,:,:,16:32] 451 | x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg') 452 | x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg') 453 | x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg') 454 | x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg') 455 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 456 | x = F.relu_(self.fc3_1(x)) 457 | x = self.dropout3(x) 458 | x = self.fc3_2(x) 459 | x = x.view(x.shape[0], 1, x.shape[1]) 460 | co_output = x + co_output 461 | count += 1. 462 | outputs.append(x) 463 | 464 | x = temp[:,:,:,24:40] 465 | x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg') 466 | x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg') 467 | x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg') 468 | x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg') 469 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 470 | x = F.relu_(self.fc4_1(x)) 471 | x = self.dropout4(x) 472 | x = self.fc4_2(x) 473 | x = x.view(x.shape[0], 1, x.shape[1]) 474 | co_output = x + co_output 475 | count += 1. 
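            # The four slices above (0:16, 8:24, 16:32, 24:40) are 16-bin
            # frequency bands with 50% overlap that together cover all 40 mel
            # bins; each band gets its own conv stack and classifier head.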
476 | outputs.append(x) 477 | 478 | outputs.append(co_output/count) 479 | outputs = torch.cat((outputs), 1) 480 | loss = torch.softmax(outputs,-1) 481 | if self.activation == 'logsoftmax': 482 | output = F.log_softmax(outputs, dim=-1) 483 | 484 | elif self.activation == 'sigmoid': 485 | output = torch.sigmoid(outputs) 486 | 487 | return output,loss 488 | 489 | class Mfcc_DCMF(nn.Module): 490 | 491 | def __init__(self, classes_num, activation): 492 | super(Mfcc_DCMF, self).__init__() 493 | 494 | self.activation = activation 495 | 496 | self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64) 497 | self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128) 498 | self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256) 499 | self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512) 500 | 501 | self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64) 502 | self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128) 503 | self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256) 504 | self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512) 505 | 506 | self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64) 507 | self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128) 508 | self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256) 509 | self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512) 510 | 511 | self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64) 512 | self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128) 513 | self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256) 514 | self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512) 515 | 516 | 517 | self.fc1_1 = nn.Linear(512, 512, bias=True) 518 | self.dropout1 = nn.Dropout(p=0.5) 519 | self.fc1_2 = nn.Linear(512, classes_num, bias=True) 520 | 521 | self.fc2_1 = nn.Linear(512, 512, bias=True) 522 | self.dropout2 = nn.Dropout(p=0.5) 523 | self.fc2_2 = nn.Linear(512, classes_num, bias=True) 524 | 525 | self.fc3_1 = nn.Linear(512, 512, bias=True) 526 | self.dropout3 = nn.Dropout(p=0.5) 527 | self.fc3_2 = nn.Linear(512, classes_num, bias=True) 528 | 529 | self.fc4_1 = nn.Linear(512, 512, bias=True) 530 | self.dropout4 = nn.Dropout(p=0.5) 531 | self.fc4_2 = nn.Linear(512, classes_num, bias=True) 532 | 533 | self.init_weights() 534 | 535 | def init_weights(self): 536 | 537 | init_layer(self.fc1_1) 538 | init_layer(self.fc1_2) 539 | init_layer(self.fc2_1) 540 | init_layer(self.fc2_2) 541 | init_layer(self.fc3_1) 542 | init_layer(self.fc3_2) 543 | init_layer(self.fc4_1) 544 | init_layer(self.fc4_2) 545 | 546 | def forward(self, x1,x2,x3,x4): 547 | ''' 548 | Input: (batch_size, scale_num, total_frames)''' 549 | outputs = [] 550 | 551 | x = x3 552 | x = x[:,None,:,:] 553 | '''(batch_size, 1, times_steps, freq_bins)''' 554 | temp = x.clone() 555 | count = 0. 556 | 557 | x = temp[:,:,:,0:16] 558 | x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg') 559 | x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg') 560 | x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg') 561 | x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg') 562 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 563 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 564 | x = F.relu_(self.fc1_1(x)) 565 | x = self.dropout1(x) 566 | x = self.fc1_2(x) 567 | x = x.view(x.shape[0], 1, x.shape[1]) 568 | co_output = x 569 | count += 1. 
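            # As in Logmel_DCMF, each band contributes one (batch_size, 1,
            # classes_num) slice to outputs; the averaged co_output is appended
            # last, and main.py sums the mixup loss over every column except
            # that final averaged one.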
570 | outputs.append(x) 571 | 572 | 573 | x = temp[:,:,:,8:24] 574 | x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg') 575 | x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg') 576 | x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg') 577 | x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg') 578 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 579 | x = F.relu_(self.fc2_1(x)) 580 | x = self.dropout2(x) 581 | x = self.fc2_2(x) 582 | x = x.view(x.shape[0], 1, x.shape[1]) 583 | co_output = x + co_output 584 | count += 1. 585 | outputs.append(x) 586 | 587 | x = temp[:,:,:,16:32] 588 | x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg') 589 | x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg') 590 | x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg') 591 | x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg') 592 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 593 | x = F.relu_(self.fc3_1(x)) 594 | x = self.dropout3(x) 595 | x = self.fc3_2(x) 596 | x = x.view(x.shape[0], 1, x.shape[1]) 597 | co_output = x + co_output 598 | count += 1. 599 | outputs.append(x) 600 | 601 | x = temp[:,:,:,24:40] 602 | x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg') 603 | x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg') 604 | x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg') 605 | x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg') 606 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 607 | x = F.relu_(self.fc4_1(x)) 608 | x = self.dropout4(x) 609 | x = self.fc4_2(x) 610 | x = x.view(x.shape[0], 1, x.shape[1]) 611 | co_output = x + co_output 612 | count += 1. 613 | outputs.append(x) 614 | 615 | outputs.append(co_output/count) 616 | outputs = torch.cat((outputs), 1) 617 | loss = torch.softmax(outputs,-1) 618 | if self.activation == 'logsoftmax': 619 | output = F.log_softmax(outputs, dim=-1) 620 | 621 | elif self.activation == 'sigmoid': 622 | output = torch.sigmoid(outputs) 623 | 624 | return output,loss 625 | 626 | class Cqt_DCMF(nn.Module): 627 | 628 | def __init__(self, classes_num, activation): 629 | super(Cqt_DCMF, self).__init__() 630 | 631 | self.activation = activation 632 | self.spec_layer = Spec.CQT(sr=config.sample_rate, hop_length=config.hop_size, fmin=220, fmax=None, n_bins=64, bins_per_octave=12, norm=1, window='hann', center=True, pad_mode='reflect') 633 | self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64) 634 | self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128) 635 | self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256) 636 | self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512) 637 | 638 | self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64) 639 | self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128) 640 | self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256) 641 | self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512) 642 | 643 | self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64) 644 | self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128) 645 | self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256) 646 | self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512) 647 | 648 | self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64) 649 | self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128) 650 | self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256) 651 | 
self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512) 652 | 653 | self.conv_block5_1 = ConvBlock(in_channels=1, out_channels=64) 654 | self.conv_block5_2 = ConvBlock(in_channels=64, out_channels=128) 655 | self.conv_block5_3 = ConvBlock(in_channels=128, out_channels=256) 656 | self.conv_block5_4 = ConvBlock(in_channels=256, out_channels=512) 657 | 658 | self.conv_block6_1 = ConvBlock(in_channels=1, out_channels=64) 659 | self.conv_block6_2 = ConvBlock(in_channels=64, out_channels=128) 660 | self.conv_block6_3 = ConvBlock(in_channels=128, out_channels=256) 661 | self.conv_block6_4 = ConvBlock(in_channels=256, out_channels=512) 662 | 663 | self.conv_block7_1 = ConvBlock(in_channels=1, out_channels=64) 664 | self.conv_block7_2 = ConvBlock(in_channels=64, out_channels=128) 665 | self.conv_block7_3 = ConvBlock(in_channels=128, out_channels=256) 666 | self.conv_block7_4 = ConvBlock(in_channels=256, out_channels=512) 667 | 668 | 669 | self.fc1_1 = nn.Linear(512, 512, bias=True) 670 | self.dropout1 = nn.Dropout(p=0.5) 671 | self.fc1_2 = nn.Linear(512, classes_num, bias=True) 672 | 673 | self.fc2_1 = nn.Linear(512, 512, bias=True) 674 | self.dropout2 = nn.Dropout(p=0.5) 675 | self.fc2_2 = nn.Linear(512, classes_num, bias=True) 676 | 677 | self.fc3_1 = nn.Linear(512, 512, bias=True) 678 | self.dropout3 = nn.Dropout(p=0.5) 679 | self.fc3_2 = nn.Linear(512, classes_num, bias=True) 680 | 681 | self.fc4_1 = nn.Linear(512, 512, bias=True) 682 | self.dropout4 = nn.Dropout(p=0.5) 683 | self.fc4_2 = nn.Linear(512, classes_num, bias=True) 684 | 685 | self.fc5_1 = nn.Linear(512, 512, bias=True) 686 | self.dropout5 = nn.Dropout(p=0.5) 687 | self.fc5_2 = nn.Linear(512, classes_num, bias=True) 688 | 689 | self.fc6_1 = nn.Linear(512, 512, bias=True) 690 | self.dropout6 = nn.Dropout(p=0.5) 691 | self.fc6_2 = nn.Linear(512, classes_num, bias=True) 692 | 693 | self.fc7_1 = nn.Linear(512, 512, bias=True) 694 | self.dropout7 = nn.Dropout(p=0.5) 695 | self.fc7_2 = nn.Linear(512, classes_num, bias=True) 696 | 697 | self.init_weights() 698 | 699 | def init_weights(self): 700 | 701 | init_layer(self.fc1_1) 702 | init_layer(self.fc1_2) 703 | init_layer(self.fc2_1) 704 | init_layer(self.fc2_2) 705 | init_layer(self.fc3_1) 706 | init_layer(self.fc3_2) 707 | init_layer(self.fc4_1) 708 | init_layer(self.fc4_2) 709 | init_layer(self.fc5_1) 710 | init_layer(self.fc5_2) 711 | init_layer(self.fc6_1) 712 | init_layer(self.fc6_2) 713 | init_layer(self.fc7_1) 714 | init_layer(self.fc7_2) 715 | 716 | def forward(self, x1,x2,x3,x4): 717 | ''' 718 | Input: (batch_size, scale_num, total_frames)''' 719 | outputs = [] 720 | 721 | x = x1 722 | x = self.spec_layer(x) 723 | x = 10*torch.log10(x) 724 | x = torch.clamp(x, min=-100.) 725 | x = x.transpose(1,2) 726 | x = x[:,None,:,:] 727 | '''(batch_size, 1, time_steps, freq_bins)''' 728 | temp = x.clone() 729 | count = 0. 730 | 731 | x = temp[:,:,:,0:16] 732 | x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg') 733 | x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg') 734 | x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg') 735 | x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg') 736 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 737 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 738 | x = F.relu_(self.fc1_1(x)) 739 | x = self.dropout1(x) 740 | x = self.fc1_2(x) 741 | x = x.view(x.shape[0], 1, x.shape[1]) 742 | co_output = x 743 | count += 1. 
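            # Cqt_DCMF widens the scheme to seven overlapping 16-bin bands
            # (0:16, 8:24, ..., 48:64) so that all 64 CQT bins are covered,
            # where the 40-bin log-mel/MFCC variants use four bands.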
744 | outputs.append(x) 745 | 746 | 747 | x = temp[:,:,:,8:24] 748 | x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg') 749 | x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg') 750 | x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg') 751 | x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg') 752 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 753 | x = F.relu_(self.fc2_1(x)) 754 | x = self.dropout2(x) 755 | x = self.fc2_2(x) 756 | x = x.view(x.shape[0], 1, x.shape[1]) 757 | co_output = x + co_output 758 | count += 1. 759 | outputs.append(x) 760 | 761 | x = temp[:,:,:,16:32] 762 | x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg') 763 | x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg') 764 | x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg') 765 | x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg') 766 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 767 | x = F.relu_(self.fc3_1(x)) 768 | x = self.dropout3(x) 769 | x = self.fc3_2(x) 770 | x = x.view(x.shape[0], 1, x.shape[1]) 771 | co_output = x + co_output 772 | count += 1. 773 | outputs.append(x) 774 | 775 | x = temp[:,:,:,24:40] 776 | x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg') 777 | x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg') 778 | x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg') 779 | x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg') 780 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 781 | x = F.relu_(self.fc4_1(x)) 782 | x = self.dropout4(x) 783 | x = self.fc4_2(x) 784 | x = x.view(x.shape[0], 1, x.shape[1]) 785 | co_output = x + co_output 786 | count += 1. 787 | outputs.append(x) 788 | 789 | x = temp[:,:,:,32:48] 790 | x = self.conv_block5_1(x, pool_size=(4, 2), pool_type='avg') 791 | x = self.conv_block5_2(x, pool_size=(4, 2), pool_type='avg') 792 | x = self.conv_block5_3(x, pool_size=(2, 2), pool_type='avg') 793 | x = self.conv_block5_4(x, pool_size=(2, 2), pool_type='avg') 794 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 795 | x = F.relu_(self.fc5_1(x)) 796 | x = self.dropout5(x) 797 | x = self.fc5_2(x) 798 | x = x.view(x.shape[0], 1, x.shape[1]) 799 | co_output = x + co_output 800 | count += 1. 801 | outputs.append(x) 802 | 803 | x = temp[:,:,:,40:56] 804 | x = self.conv_block6_1(x, pool_size=(4, 2), pool_type='avg') 805 | x = self.conv_block6_2(x, pool_size=(4, 2), pool_type='avg') 806 | x = self.conv_block6_3(x, pool_size=(2, 2), pool_type='avg') 807 | x = self.conv_block6_4(x, pool_size=(2, 2), pool_type='avg') 808 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 809 | x = F.relu_(self.fc6_1(x)) 810 | x = self.dropout6(x) 811 | x = self.fc6_2(x) 812 | x = x.view(x.shape[0], 1, x.shape[1]) 813 | co_output = x + co_output 814 | count += 1. 815 | outputs.append(x) 816 | 817 | x = temp[:,:,:,48:64] 818 | x = self.conv_block7_1(x, pool_size=(4, 2), pool_type='avg') 819 | x = self.conv_block7_2(x, pool_size=(4, 2), pool_type='avg') 820 | x = self.conv_block7_3(x, pool_size=(2, 2), pool_type='avg') 821 | x = self.conv_block7_4(x, pool_size=(2, 2), pool_type='avg') 822 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 823 | x = F.relu_(self.fc7_1(x)) 824 | x = self.dropout7(x) 825 | x = self.fc7_2(x) 826 | x = x.view(x.shape[0], 1, x.shape[1]) 827 | co_output = x + co_output 828 | count += 1. 
829 |         outputs.append(x)
830 | 
831 |         outputs.append(co_output / count)
832 |         outputs = torch.cat(outputs, 1)
833 |         loss = torch.softmax(outputs, -1)  # per-band probabilities (despite the name, not a loss value)
834 |         if self.activation == 'logsoftmax':
835 |             output = F.log_softmax(outputs, dim=-1)
836 | 
837 |         elif self.activation == 'sigmoid':
838 |             output = torch.sigmoid(outputs)
839 | 
840 |         return output, loss
841 | 
842 | class Gamm_DCMF(nn.Module):
843 | 
844 |     def __init__(self, classes_num, activation, fixed=False):  # note: 'fixed' is not used in this class
845 |         super(Gamm_DCMF, self).__init__()
846 | 
847 |         self.activation = activation
848 |         self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64)
849 |         self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128)
850 |         self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256)
851 |         self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512)
852 | 
853 |         self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64)
854 |         self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128)
855 |         self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256)
856 |         self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512)
857 | 
858 |         self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64)
859 |         self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128)
860 |         self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256)
861 |         self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512)
862 | 
863 |         self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64)
864 |         self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128)
865 |         self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256)
866 |         self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512)
867 | 
868 |         self.conv_block5_1 = ConvBlock(in_channels=1, out_channels=64)
869 |         self.conv_block5_2 = ConvBlock(in_channels=64, out_channels=128)
870 |         self.conv_block5_3 = ConvBlock(in_channels=128, out_channels=256)
871 |         self.conv_block5_4 = ConvBlock(in_channels=256, out_channels=512)
872 | 
873 |         self.conv_block6_1 = ConvBlock(in_channels=1, out_channels=64)
874 |         self.conv_block6_2 = ConvBlock(in_channels=64, out_channels=128)
875 |         self.conv_block6_3 = ConvBlock(in_channels=128, out_channels=256)
876 |         self.conv_block6_4 = ConvBlock(in_channels=256, out_channels=512)
877 | 
878 |         self.conv_block7_1 = ConvBlock(in_channels=1, out_channels=64)
879 |         self.conv_block7_2 = ConvBlock(in_channels=64, out_channels=128)
880 |         self.conv_block7_3 = ConvBlock(in_channels=128, out_channels=256)
881 |         self.conv_block7_4 = ConvBlock(in_channels=256, out_channels=512)
882 | 
883 | 
884 |         self.fc1_1 = nn.Linear(512, 512, bias=True)
885 |         self.dropout1 = nn.Dropout(p=0.5)
886 |         self.fc1_2 = nn.Linear(512, classes_num, bias=True)
887 | 
888 |         self.fc2_1 = nn.Linear(512, 512, bias=True)
889 |         self.dropout2 = nn.Dropout(p=0.5)
890 |         self.fc2_2 = nn.Linear(512, classes_num, bias=True)
891 | 
892 |         self.fc3_1 = nn.Linear(512, 512, bias=True)
893 |         self.dropout3 = nn.Dropout(p=0.5)
894 |         self.fc3_2 = nn.Linear(512, classes_num, bias=True)
895 | 
896 |         self.fc4_1 = nn.Linear(512, 512, bias=True)
897 |         self.dropout4 = nn.Dropout(p=0.5)
898 |         self.fc4_2 = nn.Linear(512, classes_num, bias=True)
899 | 
900 |         self.fc5_1 = nn.Linear(512, 512, bias=True)
901 |         self.dropout5 = nn.Dropout(p=0.5)
902 |         self.fc5_2 = nn.Linear(512, classes_num, bias=True)
903 | 
904 |         self.fc6_1 = nn.Linear(512, 512, bias=True)
905 |         self.dropout6 = nn.Dropout(p=0.5)
906 |         self.fc6_2 = nn.Linear(512, classes_num, bias=True)
907 | 
908 |         self.fc7_1 = nn.Linear(512, 512, bias=True)
909 |         self.dropout7 = nn.Dropout(p=0.5)
910 |         self.fc7_2 = nn.Linear(512, classes_num, bias=True)
911 | 
912 |         self.init_weights()
913 | 
914 |     def init_weights(self):
915 | 
916 |         init_layer(self.fc1_1)
917 |         init_layer(self.fc1_2)
918 |         init_layer(self.fc2_1)
919 |         init_layer(self.fc2_2)
920 |         init_layer(self.fc3_1)
921 |         init_layer(self.fc3_2)
922 |         init_layer(self.fc4_1)
923 |         init_layer(self.fc4_2)
924 |         init_layer(self.fc5_1)
925 |         init_layer(self.fc5_2)
926 |         init_layer(self.fc6_1)
927 |         init_layer(self.fc6_2)
928 |         init_layer(self.fc7_1)
929 |         init_layer(self.fc7_2)
930 | 
931 |     def forward(self, x1, x2, x3, x4):
932 |         '''
933 |         x2: gammatonegram, (batch_size, time_steps, freq_bins); only x2 is used by this model.'''
934 |         outputs = []
935 | 
936 |         x = x2
937 |         x = x[:, None, :, :]
938 |         '''(batch_size, 1, time_steps, freq_bins)'''
939 |         temp = x.clone()
940 |         count = 0.
941 | 
942 |         x = temp[:, :, :, 0:16]
943 |         x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg')
944 |         x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg')
945 |         x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg')
946 |         x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg')
947 |         '''(batch_size, feature_maps, time_steps, freq_bins)'''
948 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
949 |         x = F.relu_(self.fc1_1(x))
950 |         x = self.dropout1(x)
951 |         x = self.fc1_2(x)
952 |         x = x.view(x.shape[0], 1, x.shape[1])
953 |         co_output = x
954 |         count += 1.
955 |         outputs.append(x)
956 | 
957 | 
958 |         x = temp[:, :, :, 8:24]
959 |         x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg')
960 |         x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg')
961 |         x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg')
962 |         x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg')
963 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
964 |         x = F.relu_(self.fc2_1(x))
965 |         x = self.dropout2(x)
966 |         x = self.fc2_2(x)
967 |         x = x.view(x.shape[0], 1, x.shape[1])
968 |         co_output = x + co_output
969 |         count += 1.
970 |         outputs.append(x)
971 | 
972 |         x = temp[:, :, :, 16:32]
973 |         x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg')
974 |         x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg')
975 |         x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg')
976 |         x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg')
977 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
978 |         x = F.relu_(self.fc3_1(x))
979 |         x = self.dropout3(x)
980 |         x = self.fc3_2(x)
981 |         x = x.view(x.shape[0], 1, x.shape[1])
982 |         co_output = x + co_output
983 |         count += 1.
984 |         outputs.append(x)
985 | 
986 |         x = temp[:, :, :, 24:40]
987 |         x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg')
988 |         x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg')
989 |         x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg')
990 |         x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg')
991 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
992 |         x = F.relu_(self.fc4_1(x))
993 |         x = self.dropout4(x)
994 |         x = self.fc4_2(x)
995 |         x = x.view(x.shape[0], 1, x.shape[1])
996 |         co_output = x + co_output
997 |         count += 1.
998 |         outputs.append(x)
999 | 
1000 |         x = temp[:, :, :, 32:48]
1001 |         x = self.conv_block5_1(x, pool_size=(4, 2), pool_type='avg')
1002 |         x = self.conv_block5_2(x, pool_size=(4, 2), pool_type='avg')
1003 |         x = self.conv_block5_3(x, pool_size=(2, 2), pool_type='avg')
1004 |         x = self.conv_block5_4(x, pool_size=(2, 2), pool_type='avg')
1005 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
1006 |         x = F.relu_(self.fc5_1(x))
1007 |         x = self.dropout5(x)
1008 |         x = self.fc5_2(x)
1009 |         x = x.view(x.shape[0], 1, x.shape[1])
1010 |         co_output = x + co_output
1011 |         count += 1.
1012 |         outputs.append(x)
1013 | 
1014 |         x = temp[:, :, :, 40:56]
1015 |         x = self.conv_block6_1(x, pool_size=(4, 2), pool_type='avg')
1016 |         x = self.conv_block6_2(x, pool_size=(4, 2), pool_type='avg')
1017 |         x = self.conv_block6_3(x, pool_size=(2, 2), pool_type='avg')
1018 |         x = self.conv_block6_4(x, pool_size=(2, 2), pool_type='avg')
1019 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
1020 |         x = F.relu_(self.fc6_1(x))
1021 |         x = self.dropout6(x)
1022 |         x = self.fc6_2(x)
1023 |         x = x.view(x.shape[0], 1, x.shape[1])
1024 |         co_output = x + co_output
1025 |         count += 1.
1026 |         outputs.append(x)
1027 | 
1028 |         x = temp[:, :, :, 48:64]
1029 |         x = self.conv_block7_1(x, pool_size=(4, 2), pool_type='avg')
1030 |         x = self.conv_block7_2(x, pool_size=(4, 2), pool_type='avg')
1031 |         x = self.conv_block7_3(x, pool_size=(2, 2), pool_type='avg')
1032 |         x = self.conv_block7_4(x, pool_size=(2, 2), pool_type='avg')
1033 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
1034 |         x = F.relu_(self.fc7_1(x))
1035 |         x = self.dropout7(x)
1036 |         x = self.fc7_2(x)
1037 |         x = x.view(x.shape[0], 1, x.shape[1])
1038 |         co_output = x + co_output
1039 |         count += 1.
1040 |         outputs.append(x)
1041 | 
1042 |         outputs.append(co_output / count)
1043 |         outputs = torch.cat(outputs, 1)
1044 |         loss = torch.softmax(outputs, -1)  # per-band probabilities (despite the name, not a loss value)
1045 |         if self.activation == 'logsoftmax':
1046 |             output = F.log_softmax(outputs, dim=-1)
1047 | 
1048 |         elif self.activation == 'sigmoid':
1049 |             output = torch.sigmoid(outputs)
1050 | 
1051 |         return output, loss
1052 | 
1053 | class Logmel_DCMT(nn.Module):
1054 | 
1055 |     def __init__(self, classes_num, activation):
1056 |         super(Logmel_DCMT, self).__init__()
1057 | 
1058 |         self.activation = activation
1059 |         self.spec_layer = Spec.MelSpectrogram(sr=config.sample_rate, n_fft=config.window_size, n_mels=config.mel_bins, hop_length=config.hop_size, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False)
1060 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1061 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1062 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1063 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1064 |         self.fc1 = nn.Linear(512, 512, bias=True)
1065 |         self.dropout = nn.Dropout(p=0.5)
1066 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1067 | 
1068 |         self.init_weights()
1069 | 
1070 |     def init_weights(self):
1071 | 
1072 |         init_layer(self.fc1)
1073 |         init_layer(self.fc2)
1074 | 
1075 |     def forward(self, x1, x2, x3, x4):
1076 |         '''
1077 |         x1: waveform, (batch_size, total_samples); only x1 is used by this model.'''
1078 |         outputs = []
1079 | 
1080 |         # compute the log-mel spectrogram from the raw waveform x1
1081 |         x = self.spec_layer(x1)
1082 |         x = 10 * torch.log10(x)
1083 |         x = torch.clamp(x, min=-100.)
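        # The three lines above are a hand-rolled power-to-dB conversion:
        # nnAudio's MelSpectrogram appears to return a power spectrogram by
        # default, so silent frames would give log10(0) = -inf, and the clamp
        # floors them at -100 dB. An equivalent standalone helper (hypothetical,
        # not part of this repo) would be:
        #     def power_to_db(p, floor=-100.):
        #         return torch.clamp(10. * torch.log10(p), min=floor)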
1084 |         x = x.transpose(1, 2)
1085 |         x = x[:, None, :, :]
1086 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1087 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1088 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1089 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1090 |         x = torch.mean(x, dim=3)
1091 |         temp = x.clone()
1092 | 
1093 |         count = 0.
1094 |         x = temp[:, :, 0]
1095 |         x = F.relu_(self.fc1(x))
1096 |         x = self.dropout(x)
1097 |         x = self.fc2(x)
1098 |         x = x.view(x.shape[0], 1, x.shape[1])
1099 |         co_output = x
1100 |         count += 1.
1101 |         outputs.append(x)
1102 | 
1103 |         for i in range(1, temp.shape[2]):
1104 |             x = temp[:, :, i]
1105 |             x = F.relu_(self.fc1(x))
1106 |             x = self.dropout(x)
1107 |             x = self.fc2(x)
1108 |             x = x.view(x.shape[0], 1, x.shape[1])
1109 |             co_output = x + co_output
1110 |             count += 1.
1111 |             outputs.append(x)
1112 | 
1113 |         outputs.append(co_output / count)
1114 |         outputs = torch.cat(outputs, 1)
1115 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1116 |         if self.activation == 'logsoftmax':
1117 |             output = F.log_softmax(outputs, dim=-1)
1118 | 
1119 |         elif self.activation == 'sigmoid':
1120 |             output = torch.sigmoid(outputs)
1121 | 
1122 |         return output, loss
1123 | 
1124 | class Cqt_DCMT(nn.Module):
1125 | 
1126 |     def __init__(self, classes_num, activation):
1127 |         super(Cqt_DCMT, self).__init__()
1128 | 
1129 |         self.activation = activation
1130 |         self.spec_layer = Spec.CQT(sr=config.sample_rate, hop_length=config.hop_size, fmin=220, fmax=None, n_bins=64, bins_per_octave=12, norm=1, window='hann', center=True, pad_mode='reflect')
1131 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1132 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1133 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1134 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1135 |         self.fc1 = nn.Linear(512, 512, bias=True)
1136 |         self.dropout = nn.Dropout(p=0.5)
1137 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1138 | 
1139 |         self.init_weights()
1140 | 
1141 |     def init_weights(self):
1142 | 
1143 |         init_layer(self.fc1)
1144 |         init_layer(self.fc2)
1145 | 
1146 |     def forward(self, x1, x2, x3, x4):
1147 |         '''
1148 |         x1: waveform, (batch_size, total_samples); only x1 is used by this model.'''
1149 |         outputs = []
1150 | 
1151 |         # compute the log-power CQT spectrogram from the raw waveform x1
1152 |         x = self.spec_layer(x1)
1153 |         x = 10 * torch.log10(x)
1154 |         x = torch.clamp(x, min=-100.)
1155 |         x = x.transpose(1, 2)
1156 |         x = x[:, None, :, :]
1157 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1158 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1159 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1160 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1161 |         x = torch.mean(x, dim=3)
1162 |         temp = x.clone()
1163 | 
1164 |         count = 0.
1165 |         x = temp[:, :, 0]
1166 |         x = F.relu_(self.fc1(x))
1167 |         x = self.dropout(x)
1168 |         x = self.fc2(x)
1169 |         x = x.view(x.shape[0], 1, x.shape[1])
1170 |         co_output = x
1171 |         count += 1.
1172 |         outputs.append(x)
1173 | 
1174 |         for i in range(1, temp.shape[2]):
1175 |             x = temp[:, :, i]
1176 |             x = F.relu_(self.fc1(x))
1177 |             x = self.dropout(x)
1178 |             x = self.fc2(x)
1179 |             x = x.view(x.shape[0], 1, x.shape[1])
1180 |             co_output = x + co_output
1181 |             count += 1.
1182 |             outputs.append(x)
1183 | 
1184 |         outputs.append(co_output / count)
1185 |         outputs = torch.cat(outputs, 1)
1186 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1187 |         if self.activation == 'logsoftmax':
1188 |             output = F.log_softmax(outputs, dim=-1)
1189 | 
1190 |         elif self.activation == 'sigmoid':
1191 |             output = torch.sigmoid(outputs)
1192 | 
1193 |         return output, loss
1194 | 
1195 | class Gamm_DCMT(nn.Module):
1196 | 
1197 |     def __init__(self, classes_num, activation):
1198 |         super(Gamm_DCMT, self).__init__()
1199 | 
1200 |         self.activation = activation
1201 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1202 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1203 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1204 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1205 |         self.fc1 = nn.Linear(512, 512, bias=True)
1206 |         self.dropout = nn.Dropout(p=0.5)
1207 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1208 | 
1209 |         self.init_weights()
1210 | 
1211 |     def init_weights(self):
1212 | 
1213 |         init_layer(self.fc1)
1214 |         init_layer(self.fc2)
1215 | 
1216 |     def forward(self, x1, x2, x3, x4):
1217 |         '''
1218 |         x2: gammatonegram, (batch_size, time_steps, freq_bins); only x2 is used by this model.'''
1219 |         outputs = []
1220 | 
1221 |         x = x2
1222 |         x = x[:, None, :, :]
1223 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1224 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1225 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1226 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1227 |         x = torch.mean(x, dim=3)
1228 |         temp = x.clone()
1229 | 
1230 |         count = 0.
1231 |         x = temp[:, :, 0]
1232 |         x = F.relu_(self.fc1(x))
1233 |         x = self.dropout(x)
1234 |         x = self.fc2(x)
1235 |         x = x.view(x.shape[0], 1, x.shape[1])
1236 |         co_output = x
1237 |         count += 1.
1238 |         outputs.append(x)
1239 | 
1240 |         for i in range(1, temp.shape[2]):
1241 |             x = temp[:, :, i]
1242 |             x = F.relu_(self.fc1(x))
1243 |             x = self.dropout(x)
1244 |             x = self.fc2(x)
1245 |             x = x.view(x.shape[0], 1, x.shape[1])
1246 |             co_output = x + co_output
1247 |             count += 1.
1248 |             outputs.append(x)
1249 | 
1250 |         outputs.append(co_output / count)
1251 |         outputs = torch.cat(outputs, 1)
1252 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1253 |         if self.activation == 'logsoftmax':
1254 |             output = F.log_softmax(outputs, dim=-1)
1255 | 
1256 |         elif self.activation == 'sigmoid':
1257 |             output = torch.sigmoid(outputs)
1258 | 
1259 |         return output, loss
1260 | 
1261 | class Mfcc_DCMT(nn.Module):
1262 | 
1263 |     def __init__(self, classes_num, activation):
1264 |         super(Mfcc_DCMT, self).__init__()
1265 | 
1266 |         self.activation = activation
1267 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1268 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1269 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1270 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1271 |         self.fc1 = nn.Linear(512, 512, bias=True)
1272 |         self.dropout = nn.Dropout(p=0.5)
1273 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1274 | 
1275 |         self.init_weights()
1276 | 
1277 |     def init_weights(self):
1278 | 
1279 |         init_layer(self.fc1)
1280 |         init_layer(self.fc2)
1281 | 
1282 |     def forward(self, x1, x2, x3, x4):
1283 |         '''
1284 |         x3: MFCC, (batch_size, time_steps, n_mfcc); only x3 is used by this model.'''
1285 |         outputs = []
1286 | 
1287 |         x = x3
1288 |         x = x[:, None, :, :]
1289 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1290 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1291 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1292 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1293 |         x = torch.mean(x, dim=3)
1294 |         temp = x.clone()
1295 | 
1296 |         count = 0.
1297 |         x = temp[:, :, 0]
1298 |         x = F.relu_(self.fc1(x))
1299 |         x = self.dropout(x)
1300 |         x = self.fc2(x)
1301 |         x = x.view(x.shape[0], 1, x.shape[1])
1302 |         co_output = x
1303 |         count += 1.
1304 |         outputs.append(x)
1305 | 
1306 |         for i in range(1, temp.shape[2]):
1307 |             x = temp[:, :, i]
1308 |             x = F.relu_(self.fc1(x))
1309 |             x = self.dropout(x)
1310 |             x = self.fc2(x)
1311 |             x = x.view(x.shape[0], 1, x.shape[1])
1312 |             co_output = x + co_output
1313 |             count += 1.
1314 |             outputs.append(x)
1315 | 
1316 |         outputs.append(co_output / count)
1317 |         outputs = torch.cat(outputs, 1)
1318 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1319 |         if self.activation == 'logsoftmax':
1320 |             output = F.log_softmax(outputs, dim=-1)
1321 | 
1322 |         elif self.activation == 'sigmoid':
1323 |             output = torch.sigmoid(outputs)
1324 | 
1325 |         return output, loss
1326 | 
1327 | class DCMR_DCMT(nn.Module):
1328 | 
1329 |     def __init__(self, classes_num, activation):
1330 |         super(DCMR_DCMT, self).__init__()
1331 | 
1332 |         self.activation = activation
1333 |         self.model1 = Logmel_DCMT(classes_num, activation)
1334 |         self.model2 = Cqt_DCMT(classes_num, activation)
1335 |         self.model3 = Gamm_DCMT(classes_num, activation)
1336 |         self.model4 = Mfcc_DCMT(classes_num, activation)
1337 | 
1338 |         self.ite = 15000  # set your pre-trained model iteration
1339 |         self.model_path = 'workspace/checkpoints/main_eva/logmel_86frames_40melbins/TAU-urban-acoustic-scenes-2020-mobile-development/holdout_fold=1/'  # set your model saved path.
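        # init_weights() below expects the four single-feature DCMT models to
        # have been trained and saved already; each .pth checkpoint is assumed
        # to be a dict whose 'model' key holds the corresponding state_dict
        # (see the torch.load(...)['model'] calls).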
1340 |         self.init_weights()
1341 | 
1342 |     def init_weights(self):
1343 |         model_path1 = self.model_path + 'Logmel_DCMT/' + str(self.ite) + '_iterations.pth'
1344 |         model_path2 = self.model_path + 'Cqt_DCMT/' + str(self.ite) + '_iterations.pth'
1345 |         model_path3 = self.model_path + 'Gamm_DCMT/' + str(self.ite) + '_iterations.pth'
1346 |         model_path4 = self.model_path + 'Mfcc_DCMT/' + str(self.ite) + '_iterations.pth'
1347 | 
1348 |         model_ch1 = torch.load(model_path1)
1349 |         model_ch2 = torch.load(model_path2)
1350 |         model_ch3 = torch.load(model_path3)
1351 |         model_ch4 = torch.load(model_path4)
1352 | 
1353 |         self.model1.load_state_dict(model_ch1['model'])
1354 |         self.model2.load_state_dict(model_ch2['model'])
1355 |         self.model3.load_state_dict(model_ch3['model'])
1356 |         self.model4.load_state_dict(model_ch4['model'])
1357 | 
1358 | 
1359 |     def forward(self, input1, input2, input3, input4):
1360 |         '''
1361 |         input1 ... input4: the four feature streams, passed unchanged to every sub-model.'''
1362 | 
1363 |         x1, loss1 = self.model1(input1, input2, input3, input4)
1364 |         x2, loss2 = self.model2(input1, input2, input3, input4)
1365 |         x3, loss3 = self.model3(input1, input2, input3, input4)
1366 |         x4, loss4 = self.model4(input1, input2, input3, input4)
1367 | 
1368 |         x = x1[:, -1] + x2[:, -1] + x3[:, -1] + x4[:, -1]
1369 |         loss = (loss1[:, -1] + loss2[:, -1] + loss3[:, -1] + loss4[:, -1]) / 4.
1370 |         return x, loss
1371 | 
1372 | class DCMR_DCMF(nn.Module):
1373 | 
1374 |     def __init__(self, classes_num, activation):
1375 |         super(DCMR_DCMF, self).__init__()
1376 | 
1377 |         self.activation = activation
1378 |         self.model1 = Logmel_DCMF(classes_num, activation)
1379 |         self.model2 = Cqt_DCMF(classes_num, activation)
1380 |         self.model3 = Gamm_DCMF(classes_num, activation)
1381 |         self.model4 = Mfcc_DCMF(classes_num, activation)
1382 |         self.ite = 15000  # set your pre-trained model iteration
1383 |         self.model_path = 'workspace/checkpoints/main_eva/logmel_86frames_40melbins/TAU-urban-acoustic-scenes-2020-mobile-development/holdout_fold=1/'  # set your model saved path.
1384 |         self.init_weights()
1385 | 
1386 |     def init_weights(self):
1387 |         model_path1 = self.model_path + 'Logmel_DCMF/' + str(self.ite) + '_iterations.pth'
1388 |         model_path2 = self.model_path + 'Cqt_DCMF/' + str(self.ite) + '_iterations.pth'
1389 |         model_path3 = self.model_path + 'Gamm_DCMF/' + str(self.ite) + '_iterations.pth'
1390 |         model_path4 = self.model_path + 'Mfcc_DCMF/' + str(self.ite) + '_iterations.pth'
1391 | 
1392 |         model_ch1 = torch.load(model_path1)
1393 |         model_ch2 = torch.load(model_path2)
1394 |         model_ch3 = torch.load(model_path3)
1395 |         model_ch4 = torch.load(model_path4)
1396 | 
1397 |         self.model1.load_state_dict(model_ch1['model'])
1398 |         self.model2.load_state_dict(model_ch2['model'])
1399 |         self.model3.load_state_dict(model_ch3['model'])
1400 |         self.model4.load_state_dict(model_ch4['model'])
1401 | 
1402 | 
1403 |     def forward(self, input1, input2, input3, input4):
1404 |         '''
1405 |         input1 ... input4: the four feature streams, passed unchanged to every sub-model.'''
1406 | 
1407 |         x1, loss1 = self.model1(input1, input2, input3, input4)
1408 |         x2, loss2 = self.model2(input1, input2, input3, input4)
1409 |         x3, loss3 = self.model3(input1, input2, input3, input4)
1410 |         x4, loss4 = self.model4(input1, input2, input3, input4)
1411 | 
1412 |         x = x1[:, -1] + x2[:, -1] + x3[:, -1] + x4[:, -1]
1413 |         loss = (loss1[:, -1] + loss2[:, -1] + loss3[:, -1] + loss4[:, -1]) / 4.
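        # Late fusion: [:, -1] picks each sub-model's averaged co-output (the
        # last slice along dim 1), so the ensemble sums the four log-probability
        # vectors and averages the four probability vectors.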
1414 |         return x, loss
1415 | 
1416 | class DCMR_DCMF_DCMT(nn.Module):
1417 | 
1418 |     def __init__(self, classes_num, activation):
1419 |         super(DCMR_DCMF_DCMT, self).__init__()
1420 | 
1421 |         self.activation = activation
1422 |         self.model1 = DCMR_DCMT(classes_num, activation)
1423 |         self.model2 = DCMR_DCMF(classes_num, activation)
1424 | 
1425 |     def forward(self, input1, input2, input3, input4):
1426 |         '''
1427 |         input1 ... input4: the four feature streams, passed unchanged to both sub-ensembles.'''
1428 | 
1429 |         x1, loss1 = self.model1(input1, input2, input3, input4)
1430 |         x2, loss2 = self.model2(input1, input2, input3, input4)
1431 | 
1432 |         x = x1 + x2
1433 |         loss = (loss1 + loss2) / 2.
1434 | 
1435 |         return x, loss
1436 | 
--------------------------------------------------------------------------------
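
A minimal smoke-test sketch for the single-feature DCMT branch above (not part of the original repo). It assumes the script is run from inside pytorch/ with utils/ importable (models.py itself needs config and nnAudio's Spectrogram module), and that PyTorch and nnAudio are installed; the tensor shapes follow utils/config.py, and names such as batch_size are illustrative only.

import torch
from models import Logmel_DCMT   # path assumption: running inside pytorch/

classes_num = 11                                   # len(config.labels), incl. 'unknown'
model = Logmel_DCMT(classes_num, activation='logsoftmax')
model.eval()

batch_size = 2
x1 = torch.randn(batch_size, 441000)               # raw waveform: 10 s at 44.1 kHz
x2 = torch.randn(batch_size, 499, 64)              # gammatonegram (ignored by this model)
x3 = torch.randn(batch_size, 431, 40)              # MFCC (ignored by this model)

with torch.no_grad():
    output, prob = model(x1, x2, x3, None)         # x4 is never read

# output: (batch, pooled_time_steps + 1, classes_num) log-probabilities;
# output[:, -1] is the averaged co-output that the DCMR ensembles consume.
print(output.shape, prob.shape)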