├── pytorch
│   ├── ReadMe.md
│   ├── losses.py
│   ├── pytorch_utils.py
│   ├── evaluate.py
│   ├── main.py
│   └── models.py
├── utils
│   ├── ReadMe.md
│   ├── config.py
│   ├── gtg_example.py
│   ├── gtg.py
│   ├── utilities.py
│   ├── plot_results.py
│   ├── features.py
│   └── data_generator.py
├── workspace
│   └── ReadMe.md
├── runme.sh
└── README.md
--------------------------------------------------------------------------------
/pytorch/ReadMe.md:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/utils/ReadMe.md:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/workspace/ReadMe.md:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/pytorch/losses.py:
--------------------------------------------------------------------------------
import torch
import torch.nn.functional as F


def nll_loss(output, target):
    '''Negative log-likelihood loss. The output should be obtained with
    F.log_softmax(x).

    Args:
      output: (N, classes_num)
      target: (N, classes_num), one-hot targets
    '''
    loss = -torch.mean(target * output)
    return loss
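
# Hedged usage sketch (added for illustration; not in the original file). With
# one-hot targets this averages over all N * classes_num entries, so it equals
# the usual mean cross entropy divided by classes_num.
if __name__ == '__main__':
    logits = torch.randn(4, 10)                                   # (N, classes_num)
    log_prob = F.log_softmax(logits, dim=-1)
    target = F.one_hot(torch.tensor([1, 3, 5, 7]), num_classes=10).float()
    print(nll_loss(log_prob, target))                             # scalar tensor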
--------------------------------------------------------------------------------
/utils/config.py:
--------------------------------------------------------------------------------
sample_rate = 44100
window_size = 2048
hop_size = 512    # 44100 // 512 = 86 frames per second
mel_bins = 40
fmin = 50    # Hz
fmax = 14000    # Hz

# MFCC settings
mfcc_frames = 431
n_mfcc = 40
mfcc_hop_size = 512

# Gammatonegram settings
gamm_frames = 499
n_gamm = 64

frames_per_second = sample_rate // hop_size
audio_duration = 10    # Audio recordings in DCASE2019 Task1 are all 10 seconds
frames_num = frames_per_second * audio_duration
total_samples = sample_rate * audio_duration
total_frames = (sample_rate * audio_duration) // hop_size

labels = ['airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
    'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park', 'unknown']

# classes_num = len(labels)
lb_to_idx = {lb: idx for idx, lb in enumerate(labels)}
idx_to_lb = {idx: lb for idx, lb in enumerate(labels)}
--------------------------------------------------------------------------------
/runme.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# You need to modify this path to your downloaded dataset directory
DATASET_DIR='data/'

# You need to modify this path to your workspace to store features and models
WORKSPACE='workspace/'

# Hyper-parameters
GPU_ID=0
MODEL_TYPE='Logmel_Cnn'
# MODEL_TYPE='Cqt_Cnn'
# MODEL_TYPE='Gamm_Cnn'
# MODEL_TYPE='Mfcc_Cnn'
# MODEL_TYPE='DCMR'
# MODEL_TYPE='Logmel_DCMF'
# MODEL_TYPE='Cqt_DCMF'
# MODEL_TYPE='Gamm_DCMF'
# MODEL_TYPE='Mfcc_DCMF'
# MODEL_TYPE='Logmel_DCMT'
# MODEL_TYPE='Cqt_DCMT'
# MODEL_TYPE='Gamm_DCMT'
# MODEL_TYPE='Mfcc_DCMT'
# MODEL_TYPE='DCMR_DCMF'
# MODEL_TYPE='DCMR_DCMT'
# MODEL_TYPE='DCMR_DCMF_DCMT'


#### Original training (other models)
BATCH_SIZE=128
ITE_TRAIN=15000
ITE_EVA=12000
ITE_STORE=12000

############ Train and validate on development dataset ############
# Calculate features
python utils/features.py calculate_feature_for_all_audio_files --dataset_dir=$DATASET_DIR --subtask='a' --data_type='development' --workspace=$WORKSPACE

# Calculate scalar
python utils/features.py calculate_scalar --subtask='a' --data_type='development' --workspace=$WORKSPACE

# Subtask A
CUDA_VISIBLE_DEVICES=$GPU_ID python pytorch/main.py train --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --subtask='a' --data_type='development' --holdout_fold=1 --model_type=$MODEL_TYPE --batch_size=$BATCH_SIZE --ite_train=$ITE_TRAIN --ite_eva=$ITE_EVA --ite_store=$ITE_STORE --cuda

CUDA_VISIBLE_DEVICES=$GPU_ID python pytorch/main.py inference_validation --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE --subtask='a' --data_type='development' --holdout_fold=1 --model_type=$MODEL_TYPE --iteration=$ITE_TRAIN --batch_size=$BATCH_SIZE --cuda
--------------------------------------------------------------------------------
/pytorch/pytorch_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import torch


def move_data_to_gpu(x, cuda):
    if 'float' in str(x.dtype):
        x = torch.Tensor(x)
    elif 'int' in str(x.dtype):
        x = torch.LongTensor(x)
    else:
        raise Exception('Unsupported dtype: {}'.format(x.dtype))

    if cuda:
        x = x.cuda()

    return x


def append_to_dict(d, key, value):
    if key in d:
        d[key].append(value)
    else:
        d[key] = [value]


def forward(model, generate_func, cuda, return_input=False,
        return_target=False):
    '''Forward data to model in mini-batches.

    Args:
      model: object
      generate_func: function
      cuda: bool
      return_input: bool
      return_target: bool
    '''
    output_dict = {}

    # Evaluate on mini-batches
    for batch_data_dict in generate_func:

        # Predict
        batch_feature = move_data_to_gpu(batch_data_dict['feature'], cuda)
        batch_feature_mfcc = move_data_to_gpu(batch_data_dict['feature_mfcc'], cuda)
        batch_feature_gamm = move_data_to_gpu(batch_data_dict['feature_gamm'], cuda)
        batch_feature_panns = move_data_to_gpu(batch_data_dict['feature_panns'], cuda)

        with torch.no_grad():
            model.eval()
            batch_output, batch_loss = model(batch_feature, batch_feature_gamm,
                batch_feature_mfcc, batch_feature_panns)

        append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name'])

        append_to_dict(output_dict, 'output', batch_output.data.cpu().numpy())

        append_to_dict(output_dict, 'loss', batch_loss.data.cpu().numpy())

        if return_input:
            append_to_dict(output_dict, 'feature', batch_data_dict['feature'])

        if return_target:
            if 'target' in batch_data_dict.keys():
                append_to_dict(output_dict, 'target', batch_data_dict['target'])

    for key in output_dict.keys():
        output_dict[key] = np.concatenate(output_dict[key], axis=0)

    return output_dict
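
# Hedged usage sketch (added for illustration; not in the original file). The
# dummy model and feature shapes below are placeholders; the real models are
# defined in models.py and the real shapes come from data_generator.py.
if __name__ == '__main__':
    import torch.nn as nn

    class DummyModel(nn.Module):
        def __init__(self, classes_num=10):
            super(DummyModel, self).__init__()
            self.fc = nn.Linear(64, classes_num)

        def forward(self, feat, feat_gamm, feat_mfcc, feat_panns):
            output = torch.log_softmax(self.fc(feat.mean(dim=1)), dim=-1)
            return output, -output    # (output, per-class loss) placeholder

    def toy_generate_func(batches_num=2):
        for _ in range(batches_num):
            yield {'audio_name': np.array(['a.wav', 'b.wav']),
                'feature': np.random.randn(2, 100, 64).astype(np.float32),
                'feature_gamm': np.random.randn(2, 100, 64).astype(np.float32),
                'feature_mfcc': np.random.randn(2, 100, 64).astype(np.float32),
                'feature_panns': np.random.randn(2, 320000).astype(np.float32)}

    output_dict = forward(DummyModel(), toy_generate_func(), cuda=False)
    print({key: output_dict[key].shape for key in output_dict})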
--------------------------------------------------------------------------------
/utils/gtg_example.py:
--------------------------------------------------------------------------------
from gtg import gammatonegram
import numpy as np
from scipy.stats import norm as ssn
import scipy.io.wavfile as wf
import matplotlib.pyplot as plt

"""
Example of using the Python gammatonegram code.
mcusi@mit.edu, 2018 Sept 24
"""

def gtg_in_dB(sound, sampling_rate=44100, log_constant=1e-80, dB_threshold=-50.0):
    """Convert sound into a gammatonegram, with amplitude in decibels."""
    sxx, center_frequencies = gammatonegram(sound, sr=sampling_rate, fmin=20,
        fmax=int(sampling_rate/2.))
    sxx[sxx == 0] = log_constant
    sxx = 20.*np.log10(sxx)    # Convert to dB
    sxx[sxx < dB_threshold] = dB_threshold
    return sxx, center_frequencies

"""
def loglikelihood(sxx_observation, sxx_hypothesis):
    likelihood_weighting = 5.0    # Free parameter!
    loglikelihood = likelihood_weighting*np.sum(ssn.logpdf(sxx_observation,
        loc=sxx_hypothesis, scale=1))
    return loglikelihood
"""

def gtgplot(sxx, center_frequencies, sample_duration, sampling_rate,
        dB_threshold=-50.0, dB_max=10.0, t_space=50, f_space=10):
    """Plot a gammatonegram."""
    fig, ax = plt.subplots(1, 1)

    time_per_pixel = sample_duration/(1.*sampling_rate*sxx.shape[1])
    t = time_per_pixel * np.arange(sxx.shape[1])

    plt.pcolormesh(sxx, vmin=dB_threshold, vmax=dB_max, cmap='Blues')
    ax.set_ylabel('Frequency (Hz)', fontsize=16)
    ax.set_xlabel('Time (s)', fontsize=16)
    ax.xaxis.set_ticks(range(sxx.shape[1])[::t_space])
    ax.xaxis.set_ticklabels((t[::t_space]*100.).astype(int)/100., fontsize=16)
    ax.set_xbound(0, sxx.shape[1])
    ax.yaxis.set_ticks(range(len(center_frequencies))[::f_space])
    ax.yaxis.set_ticklabels(center_frequencies.astype(int)[::f_space], fontsize=16)
    ax.set_ybound(0, len(center_frequencies))
    cbar = plt.colorbar(pad=0.01)
    cbar.ax.tick_params(labelsize=16)
    cbar.ax.set_ylabel('Amplitude (dB)', rotation=270, labelpad=15, fontsize=16)
    plt.show()
    plt.close()

if __name__ == '__main__':
    sampling_rate, sound = wf.read('1-7973-A-7.wav')
    sxx, center_frequencies = gtg_in_dB(sound, sampling_rate)
    print(sxx.shape)
    print(center_frequencies.shape)
    gtgplot(sxx, center_frequencies, len(sound), sampling_rate)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DCASE-2020-Task1A-Code
A PyTorch implementation of the paper ["Acoustic Scene Classification with Spectrogram Processing Strategies"](http://dcase.community/documents/workshop2020/proceedings/DCASE2020Workshop_Wang_58.pdf).

## Paper results

Note that the test results are obtained only on device A of the DCASE2020 Task1A development set.

Model | Accuracy | Log loss | Model size
-|-|-|-
DCASE2020 Task 1 Baseline|70.6%|1.356|19.1 MB
Log-Mel CNN|72.1%|0.879|18.9 MB
Gamma CNN|76.1%|0.762|18.9 MB
MFCC CNN|63.6%|1.029|18.9 MB
SPSMR|79.4%|0.696|75.5 MB
Log-Mel CNN + SPSMF|75.5%|1.135|94.4 MB
CQT CNN + SPSMF|74.5%|1.185|94.4 MB
Gamma CNN + SPSMF|78.8%|1.169|94.4 MB
MFCC CNN + SPSMF|60.9%|1.801|94.4 MB
SPSMR + SPSMF|80.9%|0.737|377.6 MB
Log-Mel CNN + SPSMT|74.5%|0.987|18.9 MB
CQT CNN + SPSMT|73.3%|1.032|18.9 MB
Gamma CNN + SPSMT|78.2%|0.866|18.9 MB
MFCC CNN + SPSMT|67.6%|1.081|18.9 MB
SPSMR + SPSMT|79.7%|0.701|75.5 MB
SPSMR + SPSMF + SPSMT|81.8%|0.694|453.1 MB
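
The accuracy and log loss above are class-averaged accuracy and multi-class log loss, computed as in `pytorch/evaluate.py`. A minimal, self-contained sketch of the same computation on dummy predictions (the random inputs are placeholders):

```python
import numpy as np
from sklearn.metrics import confusion_matrix, log_loss

rng = np.random.default_rng(0)
y_true = rng.integers(0, 10, size=1000)         # integer scene labels
prob = rng.dirichlet(np.ones(10), size=1000)    # predicted class probabilities

cm = confusion_matrix(y_true, prob.argmax(axis=-1), labels=np.arange(10))
accuracy = np.mean(np.diag(cm) / cm.sum(axis=-1))    # class-averaged accuracy
print(accuracy, log_loss(y_true, prob, labels=np.arange(10)))
```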

## Citation
If this code is helpful, please feel free to cite the following papers:
```
@inproceedings{Wang2020,
    author = "Wang, Helin and Zou, Yuexian and Chong, DaDing",
    title = "Acoustic Scene Classification with Spectrogram Processing Strategies",
    booktitle = "Proceedings of the Detection and Classification of Acoustic Scenes and Events 2020 Workshop (DCASE2020)",
    address = "Tokyo, Japan",
    month = "November",
    year = "2020",
    pages = "210--214",
    abstract = "Recently, convolutional neural networks (CNN) have achieved the state-of-the-art performance in acoustic scene classification (ASC) task. The audio data is often transformed into two-dimensional spectrogram representations, which are then fed to the neural networks. In this paper, we study the problem of efficiently taking advantage of different spectrogram representations through discriminative processing strategies. There are two main contributions. The first contribution is exploring the impact of the combination of multiple spectrogram representations at different stages, which provides a meaningful reference for the effective spectrogram fusion. The second contribution is that the processing strategies in multiple frequency bands and multiple temporal frames are proposed to make fully use of a single spectrogram representation. The proposed spectrogram processing strategies can be easily transferred to any network structures. The experiments are carried out on the DCASE 2020 Task1 datasets, and the results show that our method could achieve the accuracy of 81.8\% (official baseline: 54.1\%) and 92.1\% (official baseline: 87.3\%) on the officially provided fold 1 evaluation dataset of Task1A and Task1B, respectively."
}
```

## Acknowledgment
This code builds on the base code provided by https://github.com/qiuqiangkong/dcase2019_task1/.

```
@article{kong2019cross,
    title={Cross-task learning for audio tagging, sound event detection and spatial localization: DCASE 2019 baseline systems},
    author={Kong, Qiuqiang and Cao, Yin and Iqbal, Turab and Xu, Yong and Wang, Wenwu and Plumbley, Mark D},
    journal={arXiv preprint arXiv:1904.03476},
    year={2019}
}
```

## Contact
If you have any problems with our code, feel free to contact
- wanghl15@pku.edu.cn

or describe your problem in Issues.
--------------------------------------------------------------------------------
/utils/gtg.py:
--------------------------------------------------------------------------------
"""
Created on Sat May 27 15:37:50 2017
Python version of:
D. P. W. Ellis (2009). "Gammatone-like spectrograms", web resource.
http://www.ee.columbia.edu/~dpwe/resources/matlab/gammatonegram/
On the corresponding webpage, Dan notes that he would be grateful if you cited him if you use his work (as above).
This Python code does not contain all features present in the MATLAB code.
Sat May 27 15:37:50 2017, Maddie Cusimano, mcusi@mit.edu
"""

from __future__ import division
import numpy as np
import scipy.signal as sps
import scipy.io.wavfile as wf

def fft2gammatonemx(nfft, sr=20000, nfilts=64, width=1.0, minfreq=100,
        maxfreq=10000, maxlen=1024):
    """
    # Ellis' description in MATLAB:
    # [wts,cfreqa] = fft2gammatonemx(nfft, sr, nfilts, width, minfreq, maxfreq, maxlen)
    # Generate a matrix of weights to combine FFT bins into
    # Gammatone bins. nfft defines the source FFT size at
    # sampling rate sr. Optional nfilts specifies the number of
    # output bands required (default 64), and width is the
    # constant width of each band in Bark (default 1).
    # minfreq, maxfreq specify range covered in Hz (100, sr/2).
    # While wts has nfft columns, the second half are all zero.
    # Hence, aud spectrum is
    # fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft));
    # maxlen truncates the rows to this many bins.
    # cfreqs returns the actual center frequencies of each
    # gammatone band in Hz.
    #
    # 2009/02/22 02:29:25 Dan Ellis dpwe@ee.columbia.edu based on rastamat/audspec.m
    # Sat May 27 15:37:50 2017 Maddie Cusimano, mcusi@mit.edu: converted to Python
    """

    wts = np.zeros([nfilts, nfft])

    # After Slaney's MakeERBFilters
    EarQ = 9.26449
    minBW = 24.7
    order = 1

    nFr = np.array(range(nfilts)) + 1
    em = EarQ*minBW
    cfreqs = (maxfreq+em)*np.exp(nFr*(-np.log(maxfreq + em)+np.log(minfreq + em))/nfilts)-em
    cfreqs = cfreqs[::-1]

    GTord = 4
    ucircArray = np.array(range(int(nfft/2 + 1)))
    ucirc = np.exp(1j*2*np.pi*ucircArray/nfft)
    # justpoles = 0: taking out the 'if' corresponding to this.

    ERB = width*np.power(np.power(cfreqs/EarQ, order) + np.power(minBW, order), 1/order)
    B = 1.019*2*np.pi*ERB
    r = np.exp(-B/sr)
    theta = 2*np.pi*cfreqs/sr
    pole = r*np.exp(1j*theta)
    T = 1/sr
    ebt = np.exp(B*T)
    cpt = 2*cfreqs*np.pi*T
    ccpt = 2*T*np.cos(cpt)
    scpt = 2*T*np.sin(cpt)
    A11 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3+2**1.5)*scpt, ebt), 2)
    A12 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3+2**1.5)*scpt, ebt), 2)
    A13 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3-2**1.5)*scpt, ebt), 2)
    A14 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3-2**1.5)*scpt, ebt), 2)
    zros = -np.array([A11, A12, A13, A14])/T
    wIdx = range(int(nfft/2 + 1))
    gain = np.abs((-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) - np.sqrt(3 - 2**(3/2))* np.sin(2*cfreqs*np.pi*T))) *(-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) + np.sqrt(3 - 2**(3/2)) * np.sin(2*cfreqs*np.pi*T)))*(-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) - np.sqrt(3 + 2**(3/2))*np.sin(2*cfreqs*np.pi*T))) *(-2*np.exp(4*1j*cfreqs*np.pi*T)*T + 2*np.exp(-(B*T) + 2*1j*cfreqs*np.pi*T)*T* (np.cos(2*cfreqs*np.pi*T) + np.sqrt(3 + 2**(3/2))*np.sin(2*cfreqs*np.pi*T))) /(-2 / np.exp(2*B*T) - 2*np.exp(4*1j*cfreqs*np.pi*T) + 2*(1 + np.exp(4*1j*cfreqs*np.pi*T))/np.exp(B*T))**4)
    # In MATLAB, there used to be 64 where here it says nfilts:
    wts[:, wIdx] = ((T**4)/np.reshape(gain, (nfilts, 1))) * np.abs(ucirc-np.reshape(zros[0], (nfilts, 1)))*np.abs(ucirc-np.reshape(zros[1], (nfilts, 1)))*np.abs(ucirc-np.reshape(zros[2], (nfilts, 1)))*np.abs(ucirc-np.reshape(zros[3], (nfilts, 1)))*(np.abs(np.power(np.multiply(np.reshape(pole, (nfilts, 1))-ucirc, np.conj(np.reshape(pole, (nfilts, 1)))-ucirc), -GTord)))
    wts = wts[:, range(maxlen)]

    return wts, cfreqs
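
def _demo_fft2gammatonemx():
    '''Hedged shape check (added for illustration; not in the original file):
    build the FFT-to-gammatone weight matrix that gammatonegram() uses and
    confirm its dimensions.'''
    nfft = 4096
    wts, cfreqs = fft2gammatonemx(nfft, sr=44100, nfilts=64, width=1.0,
        minfreq=50, maxfreq=14000, maxlen=int(nfft/2 + 1))
    return wts.shape, cfreqs.shape    # (64, 2049), (64,)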

def gammatonegram(x, sr=44100, twin=0.040, thop=0.020, N=64,
        fmin=50, fmax=14000, width=1.0):
    """
    # Ellis' description in MATLAB:
    # [Y,F] = gammatonegram(X,SR,N,TWIN,THOP,FMIN,FMAX,USEFFT,WIDTH)
    # Calculate a spectrogram-like time frequency magnitude array
    # based on Gammatone subband filters. Waveform X (at sample
    # rate SR) is passed through an N (default 64) channel gammatone
    # auditory model filterbank, with lowest frequency FMIN (50)
    # and highest frequency FMAX (SR/2). The outputs of each band
    # then have their energy integrated over windows of TWIN secs
    # (0.025), advancing by THOP secs (0.010) for successive
    # columns. These magnitudes are returned as an N-row
    # nonnegative real matrix, Y.
    # WIDTH (default 1.0) is how to scale bandwidth of filters
    # relative to ERB default (for fast method only).
    # F returns the center frequencies in Hz of each row of Y
    # (uniformly spaced on a Bark scale).

    # 2009/02/23 DAn Ellis dpwe@ee.columbia.edu
    # Sat May 27 15:37:50 2017 Maddie Cusimano mcusi@mit.edu, converted to Python
    """

    # Entirely skipping Malcolm's function, because it would require
    # altering the ERBFilterBank code as well.
    # i.e., in Ellis' code: usefft = 1
    assert x.dtype == 'int32'

    # How long a window to use relative to the integration window requested
    winext = 1
    twinmod = winext*twin
    nfft = int(2**(np.ceil(np.log(2*twinmod*sr)/np.log(2))))
    nhop = int(np.round(thop*sr))
    nwin = int(np.round(twinmod*sr))
    [gtm, f] = fft2gammatonemx(nfft, sr, N, width, fmin, fmax, int(nfft/2+1))
    # Perform FFT and weighting in the amplitude domain.
    # Note: in MATLAB, abs(spectrogram(X, hanning(nwin), nwin-nhop, nfft, SR))
    #   = abs(specgram(X,nfft,SR,nwin,nwin-nhop));
    # the Python equivalent is sps.spectrogram(..., mode='magnitude').
    plotF, plotT, Sxx = sps.spectrogram(x, fs=sr, window='hann', nperseg=nwin,
        noverlap=nwin-nhop, nfft=nfft, detrend=False,
        scaling='density', mode='magnitude')
    y = (1/nfft)*np.dot(gtm, Sxx)

    return y, f
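
# Hedged smoke test (added for illustration; not in the original file): run the
# gammatonegram on one second of a synthetic int32 sine tone. Expect N = 64
# rows and roughly duration / thop columns.
if __name__ == '__main__':
    sr = 44100
    t = np.arange(sr) / sr
    x = (0.5*np.iinfo(np.int32).max*np.sin(2*np.pi*440*t)).astype(np.int32)
    y, f = gammatonegram(x, sr=sr)
    print(y.shape, f.shape)    # (64, ~49), (64,)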
--------------------------------------------------------------------------------
/utils/utilities.py:
--------------------------------------------------------------------------------
import os
import sys
import numpy as np
import soundfile
import librosa
import h5py
import math
import pandas as pd
from sklearn import metrics
import logging
import matplotlib.pyplot as plt
import torch
import config
import scipy.io.wavfile as wf
from pydub import AudioSegment
import csv

'''
def mixup_data(x, y, alpha=0.2):
    if alpha > 0.:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).cuda()
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam
'''
def mixup_data(x1, x2, x3, x4, y, alpha=0.2):
    '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda.'''
    if alpha > 0.:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.
    batch_size = x1.size()[0]
    index = torch.randperm(batch_size).cuda()
    mixed_x1 = lam * x1 + (1 - lam) * x1[index, :]
    mixed_x2 = lam * x2 + (1 - lam) * x2[index, :]
    mixed_x3 = lam * x3 + (1 - lam) * x3[index, :]
    mixed_x4 = lam * x4 + (1 - lam) * x4[index, :]
    y_a, y_b = y, y[index]
    return mixed_x1, mixed_x2, mixed_x3, mixed_x4, y_a, y_b, lam

def mixup_criterion(class_criterion, pred, y_a, y_b, lam):
    return lam * class_criterion(pred, y_a) + (1 - lam) * class_criterion(pred, y_b)
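
def _demo_mixup_step():
    '''Hedged usage sketch (added for illustration; not in the original file):
    one mixup training step. The random tensors stand in for the model inputs
    and predictions built in pytorch/main.py. Requires a CUDA device, because
    mixup_data draws its permutation with .cuda().'''
    x1 = torch.randn(8, 100, 40).cuda()    # e.g. log-mel batch
    x2 = torch.randn(8, 499, 64).cuda()    # e.g. gammatonegram batch
    x3 = torch.randn(8, 431, 40).cuda()    # e.g. MFCC batch
    x4 = torch.randn(8, 320000).cuda()     # e.g. raw waveform batch
    y = torch.randint(0, 10, (8,)).cuda()
    mx1, mx2, mx3, mx4, y_a, y_b, lam = mixup_data(x1, x2, x3, x4, y)
    pred = torch.randn(8, 10).cuda()       # stand-in for model(mx1, mx2, mx3, mx4)
    return mixup_criterion(torch.nn.functional.cross_entropy, pred, y_a, y_b, lam)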

def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)

def get_filename(path):
    path = os.path.realpath(path)
    name_ext = path.split('/')[-1]
    name = os.path.splitext(name_ext)[0]
    return name

def create_logging(log_dir, filemode):
    create_folder(log_dir)
    i1 = 0

    while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))):
        i1 += 1

    log_path = os.path.join(log_dir, '{:04d}.log'.format(i1))
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=log_path,
        filemode=filemode)
    # Print to console
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logger = logging.getLogger('')
    while logger.handlers:
        logger.handlers.pop()
    logger.addHandler(console)

    return logging

def read_audio(audio_path, target_fs=None):
    (audio, fs) = soundfile.read(audio_path)

    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    if target_fs is not None and fs != target_fs:
        audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs)
        fs = target_fs

    return audio, fs

def read_audio_gamm(audio_path, target_fs=None):
    audio1 = 'temp_16.wav'
    s = AudioSegment.from_file(audio_path, format="wav")
    s.export(audio1, bitrate="16", format="wav")
    fs, audio = wf.read(audio1)
    return audio, fs

def pad_truncate_sequence(x, max_len):
    if len(x) < max_len:
        return np.concatenate((x, np.zeros(max_len - len(x))))
    else:
        return x[0 : max_len]


def calculate_scalar_of_tensor(x):
    if x.ndim == 2:
        axis = 0
    elif x.ndim == 3:
        axis = (0, 1)

    mean = np.mean(x, axis=axis)
    std = np.std(x, axis=axis)

    return mean, std


def load_scalar(scalar_path):
    with h5py.File(scalar_path, 'r') as hf:
        mean = hf['mean'][:]
        std = hf['std'][:]
        mean_gamm = hf['mean_gamm'][:]
        std_gamm = hf['std_gamm'][:]
        mean_mfcc = hf['mean_mfcc'][:]
        std_mfcc = hf['std_mfcc'][:]
        mean_panns = hf['mean_panns'][:]
        std_panns = hf['std_panns'][:]

    scalar = {'mean': mean, 'std': std,
        'mean_gamm': mean_gamm, 'std_gamm': std_gamm,
        'mean_mfcc': mean_mfcc, 'std_mfcc': std_mfcc,
        'mean_panns': mean_panns, 'std_panns': std_panns}
    return scalar


def scale(x, mean, std):
    return (x - mean) / std


def inverse_scale(x, mean, std):
    return x * std + mean


def get_subdir(subtask, data_type):
    if subtask == 'a':
        subdir = 'TAU-urban-acoustic-scenes-2020-mobile-{}'.format(data_type)
    elif subtask == 'b':
        subdir = 'TAU-urban-acoustic-scenes-2019-mobile-{}'.format(data_type)
    elif subtask == 'c':
        subdir = 'TAU-urban-acoustic-scenes-2019-openset-{}'.format(data_type)
    else:
        raise Exception('Incorrect argument!')

    return subdir


def read_metadata(metadata_path):
    '''Read metadata from a csv file.

    Returns:
      meta_dict: dict of meta data, e.g.:
        {'audio_name': np.array(['a.wav', 'b.wav', ...]),
         'scene_label': np.array(['airport', 'bus', ...]),
         ...}
    '''
    df = pd.read_csv(metadata_path, sep='\t')

    meta_dict = {}

    meta_dict['audio_name'] = np.array(
        [name.split('/')[1] for name in df['filename'].tolist()])

    if 'scene_label' in df.keys():
        meta_dict['scene_label'] = np.array(df['scene_label'])

    if 'identifier' in df.keys():
        meta_dict['identifier'] = np.array(df['identifier'])

    if 'source_label' in df.keys():
        meta_dict['source_label'] = np.array(df['source_label'])

    return meta_dict


def sparse_to_categorical(x, n_out):
    x = x.astype(int)
    shape = x.shape
    x = x.flatten()
    N = len(x)
    x_categ = np.zeros((N, n_out))
    x_categ[np.arange(N), x] = 1
    return x_categ.reshape((shape)+(n_out,))


def get_sources(subtask):
    if subtask in ['a', 'c']:
        return ['a', 'b', 'c', 's1', 's2', 's3']
    elif subtask == 'b':
        return ['a', 'b', 'c']
    else:
        raise Exception('Incorrect argument!')


def write_submission(output_dict, subtask, data_type, submission_path):
    '''Write output to a submission file.

    Args:
      output_dict:
        {'audio_name': (audios_num,), 'output': (audios_num, classes_num)}
      subtask: 'a' | 'b' | 'c'
      data_type: 'leaderboard' | 'evaluation'
      submission_path: string
    '''
    fw = open(submission_path, 'w')
    csv_writer = csv.writer(fw, delimiter='\t')
    csv_writer.writerow(['filename', 'scene_label', 'airport', 'bus', 'metro',
        'metro_station', 'park', 'public_square', 'shopping_mall',
        'street_pedestrian', 'street_traffic', 'tram'])
    # Reorder the per-class scores from the config.labels order to the
    # alphabetical header order above: 0,7,8,2,9,4,1,3,5,6

    for n in range(len(output_dict['audio_name'])):

        audio_name = output_dict['audio_name'][n]
        pred_label = config.idx_to_lb[np.argmax(output_dict['output'][n])]
        logloss = output_dict['loss'][n]
        print(n, pred_label, logloss)
        csv_writer.writerow([audio_name, pred_label, logloss[0], logloss[7],
            logloss[8], logloss[2], logloss[9], logloss[4], logloss[1],
            logloss[3], logloss[5], logloss[6]])
    fw.close()
    logging.info('Write submission to {}'.format(submission_path))
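
def _demo_sparse_to_categorical():
    '''Hedged example (added for illustration; not in the original file):
    one-hot encode integer class indexes, as the data generator does for
    training targets.'''
    return sparse_to_categorical(np.array([0, 2, 1]), n_out=3)
    # array([[1., 0., 0.],
    #        [0., 0., 1.],
    #        [0., 1., 0.]])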
--------------------------------------------------------------------------------
/utils/plot_results.py:
--------------------------------------------------------------------------------
import argparse
import os
import matplotlib.pyplot as plt
import _pickle as cPickle
import numpy as np

from utilities import get_subdir
import config


def plot_results(args):
    '''Plot statistics curves of different models.

    Args:
      workspace: string, directory of workspace
      subtask: 'a' | 'b' | 'c', corresponds to 3 subtasks in DCASE2019 Task1
    '''

    # Arguments & parameters
    workspace = args.workspace
    subtask = args.subtask

    filename = 'main'
    prefix = ''
    frames_per_second = config.frames_per_second
    mel_bins = config.mel_bins
    holdout_fold = 1
    max_plot_iteration = 5000
    data_type = 'development'

    iterations = np.arange(0, max_plot_iteration, 200)

    def _load_stat(model_type, subtask, source):
        sub_dir = get_subdir(subtask, data_type)

        validate_statistics_path = os.path.join(workspace, 'statistics', filename,
            '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
            '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold),
            model_type, 'validate_statistics.pickle')

        validate_statistics_dict = cPickle.load(open(validate_statistics_path, 'rb'))

        accuracy_matrix = np.array([stat['accuracy'] for
            stat in validate_statistics_dict[source]])

        confusion_matrix = np.array([stat['confusion_matrix'] for
            stat in validate_statistics_dict[source]])

        legend = '{}'.format(model_type)

        if subtask in ['a', 'b']:
            accuracy = np.mean(accuracy_matrix, axis=-1)
            results = {'accuracy': accuracy, 'legend': legend}
            print('Subtask: {}, Source: {}, Model: {} accuracy: {:.3f}'.format(
                subtask, source, model_type, accuracy[-1]))

        elif subtask == 'c':
            accuracy = np.mean(accuracy_matrix[:, 0 : -1], axis=-1)
            unknown_accuracy = accuracy_matrix[:, -1]
            results = {
                'accuracy': accuracy,
                'unknown_accuracy': unknown_accuracy,
                'legend': legend}

            print('Subtask: {}, Source: {}, Model: {}, accuracy: {:.3f}, '
                'Unknown accuracy: {:.3f}'.format(subtask, source, model_type,
                accuracy[-1], unknown_accuracy[-1]))

        return results

    if subtask == 'a':
        source = 'a'

        fig, ax = plt.subplots(1, 1, figsize=(6, 4))
        lines = []

        results = _load_stat('Cnn_5layers_AvgPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_AvgPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_MaxPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_13layers_AvgPooling', subtask, source=source)
        line, = ax.plot(results['accuracy'], label=results['legend'])
        lines.append(line)

        ax.set_title('Device: {}'.format(source))
        ax.legend(handles=lines, loc=4)
        ax.set_ylim(0, 1.0)
        ax.set_xlabel('Iterations')
        ax.set_ylabel('Accuracy')
        ax.grid(color='b', linestyle='solid', linewidth=0.2)
        ax.xaxis.set_ticks(np.arange(0, len(iterations) + 1, len(iterations) // 4))
        ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration + 1, max_plot_iteration // 4))

        plt.tight_layout()

    elif subtask == 'b':
        sources = ['a', 'b', 'c']

        fig, axs = plt.subplots(2, 2, figsize=(12, 8))

        for n in range(3):
            lines = []

            results = _load_stat('Cnn_5layers_AvgPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            results = _load_stat('Cnn_9layers_AvgPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            results = _load_stat('Cnn_9layers_MaxPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            results = _load_stat('Cnn_13layers_AvgPooling', subtask, source=sources[n])
            line, = axs[n // 2, n % 2].plot(results['accuracy'], label=results['legend'])
            lines.append(line)

            axs[n // 2, n % 2].set_title('Device: {}'.format(sources[n]))
            axs[n // 2, n % 2].legend(handles=lines, loc=4)
            axs[n // 2, n % 2].set_ylim(0, 1.0)
            axs[n // 2, n % 2].set_xlabel('Iterations')
            axs[n // 2, n % 2].set_ylabel('Accuracy')
            axs[n // 2, n % 2].grid(color='b', linestyle='solid', linewidth=0.2)
            axs[n // 2, n % 2].xaxis.set_ticks(np.arange(0, len(iterations) + 1, len(iterations) // 4))
            axs[n // 2, n % 2].xaxis.set_ticklabels(np.arange(0, max_plot_iteration + 1, max_plot_iteration // 4))

        axs[1, 1].set_visible(False)
        plt.tight_layout()

    elif subtask == 'c':
        source = 'a'

        fig, axs = plt.subplots(1, 2, figsize=(12, 4))

        axs[0].set_title('Device: {}, known'.format(source))
        axs[1].set_title('Device: {}, unknown'.format(source))

        lines = []

        results = _load_stat('Cnn_5layers_AvgPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_AvgPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_9layers_MaxPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        results = _load_stat('Cnn_13layers_AvgPooling', subtask, source=source)
        line, = axs[0].plot(results['accuracy'], label=results['legend'])
        line, = axs[1].plot(results['unknown_accuracy'], label=results['legend'])
        lines.append(line)

        for n in range(2):
            axs[n].legend(handles=lines, loc=4)
            axs[n].set_ylim(0, 1.0)
            axs[n].set_xlabel('Iterations')
            axs[n].set_ylabel('Accuracy')
            axs[n].grid(color='b', linestyle='solid', linewidth=0.2)
            axs[n].xaxis.set_ticks(np.arange(0, len(iterations) + 1, len(iterations) // 4))
            axs[n].xaxis.set_ticklabels(np.arange(0, max_plot_iteration + 1, max_plot_iteration // 4))

        plt.tight_layout()

    fig_path = 'result_subtask_{}.png'.format(subtask)
    plt.savefig(fig_path)
    print('Save result figure to {}'.format(fig_path))
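
def _demo_statistics_structure():
    '''Hedged example (added for illustration; not in the original file): the
    structure of validate_statistics.pickle that _load_stat expects, as written
    by StatisticsContainer in pytorch/evaluate.py -- keyed by device, with one
    dict per evaluated iteration.'''
    return {'a': [{'iteration': 200,
                   'accuracy': np.ones(10),
                   'confusion_matrix': np.eye(10)}]}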

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.')
    parser.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Corresponds to the 3 subtasks in DCASE2019 Task1.')

    args = parser.parse_args()

    plot_results(args)
--------------------------------------------------------------------------------
/pytorch/evaluate.py:
--------------------------------------------------------------------------------
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))

import numpy as np
import time
import logging
import matplotlib.pyplot as plt
from sklearn import metrics
import datetime
import _pickle as cPickle
#import sed_eval

from utilities import get_filename, inverse_scale
from pytorch_utils import forward
import config
from sklearn.metrics import log_loss


class Evaluator(object):
    def __init__(self, model, data_generator, subtask, cuda=True):
        '''Evaluator to evaluate prediction performance.

        Args:
          model: object
          data_generator: object
          subtask: 'a' | 'b' | 'c'
          cuda: bool
        '''

        self.model = model
        self.data_generator = data_generator
        self.subtask = subtask
        self.cuda = cuda

        self.frames_per_second = config.frames_per_second
        self.labels = config.labels
        self.in_domain_classes_num = len(config.labels) - 1
        self.all_classes_num = len(config.labels) - 1
        self.idx_to_lb = config.idx_to_lb
        self.lb_to_idx = config.lb_to_idx

    def evaluate(self, data_type, source, max_iteration=None, verbose=False):
        '''Evaluate the performance.

        Args:
          data_type: 'train' | 'validate'
          source: 'a' | 'b' | 'c' | 's1' | 's2' | 's3'
          max_iteration: None | int, maximum iteration to run to speed up evaluation
          verbose: bool
        '''

        generate_func = self.data_generator.generate_validate(
            data_type=data_type,
            source=source,
            max_iteration=max_iteration)

        # Forward
        output_dict = forward(
            model=self.model,
            generate_func=generate_func,
            cuda=self.cuda,
            return_target=True)

        if output_dict['output'].ndim == 2:    # Single-scale models
            output = output_dict['output']    # (audios_num, in_domain_classes_num)
            target = output_dict['target']    # (audios_num, in_domain_classes_num)
            loss = output_dict['loss']
            prob = np.exp(output)
            # Evaluate
            y_true = np.argmax(target, axis=-1)
            y_pred = np.argmax(prob, axis=-1)
            confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=np.arange(self.in_domain_classes_num))
            classwise_accuracy = np.diag(confusion_matrix) \
                / np.sum(confusion_matrix, axis=-1)
            logging.info('Single-Classifier:')
            logging.info('Data type: {}'.format(data_type))
            logging.info('    Average accuracy: {:.3f}'.format(np.mean(classwise_accuracy)))
            logging.info('    Log loss: {:.3f}'.format(log_loss(y_true, loss)))
        else:
            for i in range(output_dict['output'].shape[1] - 1):
                output = output_dict['output'][:, i, :]    # (audios_num, in_domain_classes_num)
                target = output_dict['target']    # (audios_num, in_domain_classes_num)
                loss = output_dict['loss'][:, i, :]
                prob = np.exp(output)
                # Evaluate
                y_true = np.argmax(target, axis=-1)
                y_pred = np.argmax(prob, axis=-1)
                confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=np.arange(self.in_domain_classes_num))
                classwise_accuracy = np.diag(confusion_matrix) \
                    / np.sum(confusion_matrix, axis=-1)
                logging.info('Scale' + str(i + 1) + '-Classifier:')
                logging.info('Data type: {}'.format(data_type))
                logging.info('    Average accuracy: {:.3f}'.format(np.mean(classwise_accuracy)))
                logging.info('    Log loss: {:.3f}'.format(log_loss(y_true, loss)))

            output = output_dict['output'][:, -1, :]    # (audios_num, in_domain_classes_num)
            target = output_dict['target']    # (audios_num, in_domain_classes_num)
            loss = output_dict['loss'][:, -1, :]
            prob = np.exp(output)
            # Evaluate
            y_true = np.argmax(target, axis=-1)
            y_pred = np.argmax(prob, axis=-1)
            confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=np.arange(self.in_domain_classes_num))
            classwise_accuracy = np.diag(confusion_matrix) \
                / np.sum(confusion_matrix, axis=-1)
            logging.info('Global-Classifier:')
            logging.info('Data type: {}'.format(data_type))
            logging.info('    Average accuracy: {:.3f}'.format(np.mean(classwise_accuracy)))
            logging.info('    Log loss: {:.3f}'.format(log_loss(y_true, loss)))


        if verbose:
            classes_num = len(classwise_accuracy)
            for n in range(classes_num):
                logging.info('{:<20}{:.3f}'.format(self.labels[n],
                    classwise_accuracy[n]))

            logging.info(confusion_matrix)

        statistics = {
            'accuracy': classwise_accuracy,
            'confusion_matrix': confusion_matrix}

        return statistics

    def visualize(self, data_type, source, max_iteration=None):
        '''Visualize log mel spectrograms of different sound classes.

        Args:
          data_type: 'train' | 'validate'
          source: 'a' | 'b' | 'c' | 's1' | 's2' | 's3'
          max_iteration: None | int, maximum iteration to run to speed up evaluation
        '''
        mel_bins = config.mel_bins
        audio_duration = config.audio_duration
        frames_num = config.frames_num
        labels = config.labels
        in_domain_classes_num = len(config.labels) - 1
        idx_to_lb = config.idx_to_lb

        generate_func = self.data_generator.generate_validate(
            data_type=data_type,
            source=source,
            max_iteration=max_iteration)

        # Forward
        output_dict = forward(
            model=self.model,
            generate_func=generate_func,
            cuda=self.cuda,
            return_input=True,
            return_target=True)

        # Plot log mel spectrograms of different sound classes
        rows_num = 3
        cols_num = 4

        fig, axs = plt.subplots(rows_num, cols_num, figsize=(10, 5))

        for k in range(in_domain_classes_num):
            for n, audio_name in enumerate(output_dict['audio_name']):
                if output_dict['target'][n, k] == 1:
                    title = idx_to_lb[k]
                    row = k // cols_num
                    col = k % cols_num
                    axs[row, col].set_title(title, color='r')
                    logmel = inverse_scale(output_dict['feature'][n],
                        self.data_generator.scalar['mean'],
                        self.data_generator.scalar['std'])
                    axs[row, col].matshow(logmel.T, origin='lower', aspect='auto', cmap='jet')
                    axs[row, col].set_xticks([0, frames_num])
                    axs[row, col].set_xticklabels(['0', '{:.1f} s'.format(audio_duration)])
                    axs[row, col].xaxis.set_ticks_position('bottom')
                    axs[row, col].set_ylabel('Mel bins')
                    axs[row, col].set_yticks([])
                    break

        for k in range(in_domain_classes_num, rows_num * cols_num):
            row = k // cols_num
            col = k % cols_num
            axs[row, col].set_visible(False)

        fig.tight_layout(pad=0, w_pad=0, h_pad=0)
        plt.show()


class StatisticsContainer(object):
    def __init__(self, statistics_path):
        '''Container of statistics during training.

        Args:
          statistics_path: string, path to write out
        '''
        self.statistics_path = statistics_path

        self.backup_statistics_path = '{}_{}.pickle'.format(
            os.path.splitext(self.statistics_path)[0],
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

        # Statistics of devices 'a', 'b', 'c', 's1', 's2' and 's3'
        self.statistics_dict = {'a': [], 'b': [], 'c': [], 's1': [], 's2': [], 's3': []}

    def append_and_dump(self, iteration, source, statistics):
        '''Append statistics to the container and dump the container.

        Args:
          iteration: int
          source: 'a' | 'b' | 'c' | 's1' | 's2' | 's3', device
          statistics: dict of statistics
        '''
        statistics['iteration'] = iteration
        self.statistics_dict[source].append(statistics)

        cPickle.dump(self.statistics_dict, open(self.statistics_path, 'wb'))
        cPickle.dump(self.statistics_dict, open(self.backup_statistics_path, 'wb'))
        logging.info('    Dump statistics to {}'.format(self.statistics_path))
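
# Hedged usage sketch (added for illustration; not in the original file):
# append one statistics dict and dump it to a temporary path.
if __name__ == '__main__':
    import tempfile
    container = StatisticsContainer(
        os.path.join(tempfile.gettempdir(), 'validate_statistics.pickle'))
    container.append_and_dump(iteration=200, source='a',
        statistics={'accuracy': np.ones(10), 'confusion_matrix': np.eye(10)})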
--------------------------------------------------------------------------------
/utils/features.py:
--------------------------------------------------------------------------------
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], 'utils'))
import numpy as np
import argparse
import h5py
import librosa
from scipy import signal
import matplotlib.pyplot as plt
import time
import math
import pandas as pd
import random
from gtg_example import gtg_in_dB
from utilities import (create_folder, read_audio, calculate_scalar_of_tensor,
    pad_truncate_sequence, get_subdir, read_metadata, read_audio_gamm)
import config


class LogMelExtractor(object):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax):
        '''Log mel feature extractor.

        Args:
          sample_rate: int
          window_size: int
          hop_size: int
          mel_bins: int
          fmin: int, minimum frequency of mel filter banks
          fmax: int, maximum frequency of mel filter banks
        '''

        self.window_size = window_size
        self.hop_size = hop_size
        self.window_func = np.hanning(window_size)

        self.melW = librosa.filters.mel(
            sr=sample_rate,
            n_fft=window_size,
            n_mels=mel_bins,
            fmin=fmin,
            fmax=fmax).T
        '''(n_fft // 2 + 1, mel_bins)'''

    def transform(self, audio):
        '''Extract features of a single-channel audio clip.

        Args:
          audio: (samples,)

        Returns:
          feature: (frames_num, freq_bins)
        '''

        window_size = self.window_size
        hop_size = self.hop_size
        window_func = self.window_func

        # Compute short-time Fourier transform
        stft_matrix = librosa.core.stft(
            y=audio,
            n_fft=window_size,
            hop_length=hop_size,
            window=window_func,
            center=True,
            dtype=np.complex64,
            pad_mode='reflect').T
        '''(N, n_fft // 2 + 1)'''

        # Mel spectrogram
        mel_spectrogram = np.dot(np.abs(stft_matrix) ** 2, self.melW)

        # Log mel spectrogram
        logmel_spectrogram = librosa.core.power_to_db(
            mel_spectrogram, ref=1.0, amin=1e-10,
            top_db=None)

        logmel_spectrogram = logmel_spectrogram.astype(np.float32)

        return logmel_spectrogram
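
def _demo_logmel_shape():
    '''Hedged shape check (added for illustration; not in the original file):
    one second of noise at the config sample rate yields roughly
    sample_rate / hop_size frames.'''
    extractor = LogMelExtractor(
        sample_rate=config.sample_rate,
        window_size=config.window_size,
        hop_size=config.hop_size,
        mel_bins=config.mel_bins,
        fmin=config.fmin,
        fmax=config.fmax)
    logmel = extractor.transform(np.random.randn(config.sample_rate).astype(np.float32))
    return logmel.shape    # (87, 40) with the default config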

def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write the features out to a hdf5 file.

    Args:
      dataset_dir: string
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'development' | 'evaluation'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    mfcc_frames = config.mfcc_frames
    n_mfcc = config.n_mfcc
    mfcc_hop_size = config.mfcc_hop_size
    gamm_frames = config.gamm_frames
    n_gamm = config.n_gamm

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)
    audios_dir = os.path.join(dataset_dir, sub_dir, 'audio')

    if data_type == 'development':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'meta.csv')
    elif data_type == 'leaderboard':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 'test.csv')
    elif data_type == 'evaluation':
        metadata_path = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 'fold1_test.csv')
    else:
        raise Exception('Incorrect data_type!')

    feature_path = os.path.join(workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate,
        window_size=window_size,
        hop_size=hop_size,
        mel_bins=mel_bins,
        fmin=fmin,
        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['audio_name'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='audio_name',
        data=[audio_name.encode() for audio_name in meta_dict['audio_name']],
        dtype='S80')

    if 'scene_label' in meta_dict.keys():
        hf.create_dataset(
            name='scene_label',
            data=[scene_label.encode() for scene_label in meta_dict['scene_label']],
            dtype='S24')

    if 'identifier' in meta_dict.keys():
        hf.create_dataset(
            name='identifier',
            data=[identifier.encode() for identifier in meta_dict['identifier']],
            dtype='S24')

    if 'source_label' in meta_dict.keys():
        hf.create_dataset(
            name='source_label',
            data=[source_label.encode() for source_label in meta_dict['source_label']],
            dtype='S8')

    hf.create_dataset(
        name='feature',
        shape=(0, total_samples),
        maxshape=(None, total_samples),
        dtype=np.float32)
    hf.create_dataset(
        name='feature_gamm',
        shape=(0, gamm_frames, n_gamm),
        maxshape=(None, gamm_frames, n_gamm),
        dtype=np.float32)
    hf.create_dataset(
        name='feature_mfcc',
        shape=(0, mfcc_frames, n_mfcc),
        maxshape=(None, mfcc_frames, n_mfcc),
        dtype=np.float32)
    hf.create_dataset(
        name='feature_panns',
        shape=(0, 320000),
        maxshape=(None, 320000),
        dtype=np.float32)

    for (n, audio_name) in enumerate(meta_dict['audio_name']):
        audio_path = os.path.join(audios_dir, audio_name)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(
            audio_path=audio_path,
            target_fs=sample_rate)
        audio = audio[:sample_rate*10]
        (audio_gamm, _) = read_audio_gamm(
            audio_path=audio_path,
            target_fs=sample_rate)
        fea_gamm, _ = gtg_in_dB(audio_gamm, sample_rate)
        fea_gamm = fea_gamm.transpose(1, 0)
        sound, fs = librosa.load(audio_path)
        fea_mfcc = librosa.feature.mfcc(y=sound, sr=fs, hop_length=mfcc_hop_size, n_mfcc=n_mfcc)
        fea_mfcc = fea_mfcc.transpose(1, 0)
        (waveform, _) = librosa.core.load(audio_path, sr=32000, mono=True)
        waveform = waveform[:320000]

        hf['feature'].resize((n + 1, total_samples))
        hf['feature'][n] = audio
        hf['feature_gamm'].resize((n + 1, gamm_frames, n_gamm))
        hf['feature_gamm'][n] = fea_gamm
        hf['feature_mfcc'].resize((n + 1, mfcc_frames, n_mfcc))
        hf['feature_mfcc'][n] = fea_mfcc
        hf['feature_panns'].resize((n + 1, 320000))
        hf['feature_panns'][n] = waveform

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
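
def _demo_hdf5_append(hdf5_path, frames_num=100, bins_num=64):
    '''Hedged sketch (added for illustration; not in the original file) of the
    hdf5 append pattern used above: create a dataset with zero rows and a
    None maxshape, then resize by one row per audio clip.'''
    with h5py.File(hdf5_path, 'w') as hf:
        hf.create_dataset(name='feature', shape=(0, frames_num, bins_num),
            maxshape=(None, frames_num, bins_num), dtype=np.float32)
        for n in range(3):
            hf['feature'].resize((n + 1, frames_num, bins_num))
            hf['feature'][n] = np.random.randn(frames_num, bins_num)
        return hf['feature'].shape    # (3, frames_num, bins_num)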

def calculate_scalar(args):
    '''Calculate and write out the scalar (mean and std) of features.

    Args:
      workspace: string
      subtask: 'a' | 'b' | 'c'
      data_type: 'train'
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    workspace = args.workspace
    subtask = args.subtask
    data_type = args.data_type
    mini_data = args.mini_data

    mel_bins = config.mel_bins
    frames_per_second = config.frames_per_second

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    sub_dir = get_subdir(subtask, data_type)

    feature_path = os.path.join(workspace, 'features',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))

    scalar_path = os.path.join(workspace, 'scalars',
        '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins),
        '{}.h5'.format(sub_dir))
    create_folder(os.path.dirname(scalar_path))

    # Load data
    load_time = time.time()

    with h5py.File(feature_path, 'r') as hf:
        features = hf['feature'][:]
        features_gamm = hf['feature_gamm'][:]
        features_mfcc = hf['feature_mfcc'][:]
        features_panns = hf['feature_panns'][:]

    # Calculate scalars
    features = np.concatenate(features[None, :], axis=0)
    (mean, std) = calculate_scalar_of_tensor(features)
    features_gamm = np.concatenate(features_gamm, axis=0)
    (mean_gamm, std_gamm) = calculate_scalar_of_tensor(features_gamm)
    features_mfcc = np.concatenate(features_mfcc, axis=0)
    (mean_mfcc, std_mfcc) = calculate_scalar_of_tensor(features_mfcc)
    features_panns = np.concatenate(features_panns[None, :], axis=0)
    (mean_panns, std_panns) = calculate_scalar_of_tensor(features_panns)

    with h5py.File(scalar_path, 'w') as hf:
        hf.create_dataset('mean', data=mean, dtype=np.float32)
        hf.create_dataset('std', data=std, dtype=np.float32)
        hf.create_dataset('mean_gamm', data=mean_gamm, dtype=np.float32)
        hf.create_dataset('std_gamm', data=std_gamm, dtype=np.float32)
        hf.create_dataset('mean_mfcc', data=mean_mfcc, dtype=np.float32)
        hf.create_dataset('std_mfcc', data=std_mfcc, dtype=np.float32)
        hf.create_dataset('mean_panns', data=mean_panns, dtype=np.float32)
        hf.create_dataset('std_panns', data=std_panns, dtype=np.float32)

    print('Write out scalar to {}'.format(scalar_path))
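
def _demo_scalar_shapes():
    '''Hedged example (added for illustration; not in the original file):
    calculate_scalar_of_tensor averages over all but the last axis, so a
    (frames, bins) input yields one mean and std per bin.'''
    mean, std = calculate_scalar_of_tensor(np.random.randn(1000, 64))
    return mean.shape, std.shape    # (64,), (64,)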

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='')
    subparsers = parser.add_subparsers(dest='mode')

    # Calculate features for all audio files
    parser_logmel = subparsers.add_parser('calculate_feature_for_all_audio_files')
    parser_logmel.add_argument('--dataset_dir', type=str, required=True, help='Directory of dataset.')
    parser_logmel.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.')
    parser_logmel.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Corresponds to the 3 subtasks in DCASE2019 Task1.')
    parser_logmel.add_argument('--data_type', type=str, choices=['development', 'leaderboard', 'evaluation'], required=True)
    parser_logmel.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.')

    # Calculate scalar
    parser_scalar = subparsers.add_parser('calculate_scalar')
    parser_scalar.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.')
    parser_scalar.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Corresponds to the 3 subtasks in DCASE2019 Task1.')
    parser_scalar.add_argument('--data_type', type=str, choices=['development', 'evaluation'], required=True)
    parser_scalar.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.')

    # Parse arguments
    args = parser.parse_args()

    if args.mode == 'calculate_feature_for_all_audio_files':
        calculate_feature_for_all_audio_files(args)

    elif args.mode == 'calculate_scalar':
        calculate_scalar(args)

    else:
        raise Exception('Incorrect arguments!')
--------------------------------------------------------------------------------
/utils/data_generator.py:
--------------------------------------------------------------------------------
import numpy as np
import h5py
import csv
import time
import logging
import os
#import glob
import matplotlib.pyplot as plt

from utilities import scale, read_metadata, sparse_to_categorical
import config


class Base(object):

    def __init__(self):
        '''Base class for data generators.
        '''
        pass

    def load_hdf5(self, hdf5_path):
        '''Load hdf5 file.

        Returns:
          data_dict: dict of data, e.g.:
            {'audio_name': np.array(['a.wav', 'b.wav', ...]),
             'feature': (audios_num, frames_num, mel_bins),
             'target': (audios_num,),
             ...}
        '''
        data_dict = {}

        with h5py.File(hdf5_path, 'r') as hf:
            data_dict['audio_name'] = np.array(
                [audio_name.decode() for audio_name in hf['audio_name'][:]])

            data_dict['feature'] = hf['feature'][:].astype(np.float32)
            data_dict['feature_gamm'] = hf['feature_gamm'][:].astype(np.float32)
            data_dict['feature_mfcc'] = hf['feature_mfcc'][:].astype(np.float32)
            data_dict['feature_panns'] = hf['feature_panns'][:].astype(np.float32)

            if 'scene_label' in hf.keys():
                data_dict['target'] = np.array(
                    [self.lb_to_idx[scene_label.decode()] \
                        for scene_label in hf['scene_label'][:]])

            if 'identifier' in hf.keys():
                data_dict['identifier'] = np.array(
                    [identifier.decode() for identifier in hf['identifier'][:]])

            if 'source_label' in hf.keys():
                data_dict['source_label'] = np.array(
                    [source_label.decode() \
                        for source_label in hf['source_label'][:]])

        return data_dict

    def transform(self, x):
        return scale(x, self.scalar['mean'], self.scalar['std'])

    def transform_gamm(self, x):
        return scale(x, self.scalar['mean_gamm'], self.scalar['std_gamm'])

    def transform_mfcc(self, x):
        return scale(x, self.scalar['mean_mfcc'], self.scalar['std_mfcc'])

    def transform_panns(self, x):
        return scale(x, self.scalar['mean_panns'], self.scalar['std_panns'])
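
# Hedged usage sketch (added for illustration; not in the original file):
# pytorch/main.py consumes generate_train() as an infinite iterator, roughly
# like this. `data_generator` is an initialized DataGenerator (defined below).
def _demo_training_loop(data_generator, train_steps=3):
    for iteration, batch_data_dict in enumerate(data_generator.generate_train()):
        print(iteration, batch_data_dict['feature'].shape,
            batch_data_dict['target'].shape)
        if iteration + 1 >= train_steps:
            break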
74 | 75 | Args: 76 | feature_hdf5_path: string, path of hdf5 feature file 77 | train_csv: string, path of train csv file 78 | validate_csv: string, path of validate csv file 79 | holdout_fold: set 1 for development and none for training 80 | on all data without validation 81 | scalar: object, containing mean and std value 82 | batch_size: int 83 | seed: int, random seed 84 | ''' 85 | 86 | self.scalar = scalar 87 | self.batch_size = batch_size 88 | self.random_state = np.random.RandomState(seed) 89 | 90 | # self.classes_num = classes_num 91 | self.in_domain_classes_num = len(config.labels) - 1 92 | self.all_classes_num = len(config.labels) - 1 93 | self.lb_to_idx = config.lb_to_idx 94 | self.idx_to_lb = config.idx_to_lb 95 | 96 | # Load training data 97 | load_time = time.time() 98 | 99 | self.data_dict = self.load_hdf5(feature_hdf5_path) 100 | 101 | train_meta = read_metadata(train_csv) 102 | validate_meta = read_metadata(validate_csv) 103 | 104 | self.train_audio_indexes = self.get_audio_indexes( 105 | train_meta, self.data_dict, holdout_fold, 'train') 106 | 107 | self.validate_audio_indexes = self.get_audio_indexes( 108 | validate_meta, self.data_dict, holdout_fold, 'validate') 109 | 110 | if holdout_fold == 'none': 111 | self.train_audio_indexes = np.concatenate( 112 | (self.train_audio_indexes, self.validate_audio_indexes), axis=0) 113 | 114 | self.validate_audio_indexes = np.array([]) 115 | 116 | logging.info('Load data time: {:.3f} s'.format(time.time() - load_time)) 117 | logging.info('Training audio num: {}'.format(len(self.train_audio_indexes))) 118 | logging.info('Validation audio num: {}'.format(len(self.validate_audio_indexes))) 119 | 120 | self.random_state.shuffle(self.train_audio_indexes) 121 | self.pointer = 0 122 | 123 | def get_audio_indexes(self, meta_data, data_dict, holdout_fold, data_type): 124 | '''Get train or validate indexes. 125 | ''' 126 | audio_indexes = [] 127 | 128 | for name in meta_data['audio_name']: 129 | loct = np.argwhere(data_dict['audio_name'] == name) 130 | 131 | if len(loct) > 0: 132 | index = loct[0, 0] 133 | label = self.idx_to_lb[self.data_dict['target'][index]] 134 | if holdout_fold != 'none': 135 | if data_type == 'train' and label != 'unknown': 136 | audio_indexes.append(index) 137 | elif data_type == 'validate': 138 | audio_indexes.append(index) 139 | else: 140 | if label != 'unknown': 141 | audio_indexes.append(index) 142 | 143 | return np.array(audio_indexes) 144 | 145 | def generate_train(self): 146 | '''Generate mini-batch data for training. 
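        Loops indefinitely, reshuffling the training indexes after each full
        pass; targets are one-hot vectors over the in-domain classes (the
        'unknown' label is excluded from training batches).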
147 | 148 | Returns: 149 | batch_data_dict: dict containing audio_name, feature and target 150 | ''' 151 | 152 | while True: 153 | # Reset pointer 154 | if self.pointer >= len(self.train_audio_indexes): 155 | self.pointer = 0 156 | self.random_state.shuffle(self.train_audio_indexes) 157 | 158 | # Get batch audio_indexes 159 | batch_audio_indexes = self.train_audio_indexes[ 160 | self.pointer: self.pointer + self.batch_size] 161 | 162 | self.pointer += self.batch_size 163 | 164 | batch_data_dict = {} 165 | 166 | batch_data_dict['audio_name'] = \ 167 | self.data_dict['audio_name'][batch_audio_indexes] 168 | 169 | batch_feature = self.data_dict['feature'][batch_audio_indexes] 170 | batch_feature = self.transform(batch_feature) 171 | batch_data_dict['feature'] = batch_feature 172 | 173 | batch_feature_gamm = self.data_dict['feature_gamm'][batch_audio_indexes] 174 | batch_feature_gamm = self.transform_gamm(batch_feature_gamm) 175 | batch_data_dict['feature_gamm'] = batch_feature_gamm 176 | 177 | batch_feature_mfcc = self.data_dict['feature_mfcc'][batch_audio_indexes] 178 | batch_feature_mfcc = self.transform_mfcc(batch_feature_mfcc) 179 | batch_data_dict['feature_mfcc'] = batch_feature_mfcc 180 | 181 | batch_feature_panns = self.data_dict['feature_panns'][batch_audio_indexes] 182 | batch_feature_panns = self.transform_panns(batch_feature_panns) 183 | batch_data_dict['feature_panns'] = batch_feature_panns 184 | 185 | sparse_target = self.data_dict['target'][batch_audio_indexes] 186 | batch_data_dict['target'] = sparse_to_categorical( 187 | sparse_target, self.in_domain_classes_num) 188 | 189 | yield batch_data_dict 190 | 191 | def get_source_indexes(self, indexes, data_dict, source): 192 | '''Get indexes of a specific source. 193 | ''' 194 | source_indexes = np.array([index for index in indexes \ 195 | if data_dict['source_label'][index] == source]) 196 | 197 | return source_indexes 198 | 199 | def generate_validate(self, data_type, source, max_iteration=None): 200 | '''Generate mini-batch data for validation. 
201 | 202 | Args: 203 | data_type: 'train' | 'validate' 204 | source: 'a' | 'b' | 'c' 205 | max_iteration: int, maximum number of mini-batches, to speed up validation 206 | 207 | Returns: 208 | batch_data_dict: dict containing audio_name, feature and target 209 | ''' 210 | 211 | batch_size = self.batch_size 212 | 213 | if data_type == 'train': 214 | audio_indexes = np.array(self.train_audio_indexes) 215 | elif data_type == 'validate': 216 | audio_indexes = np.array(self.validate_audio_indexes) 217 | else: 218 | raise Exception('Incorrect argument!') 219 | 220 | audio_indexes = self.get_source_indexes( 221 | audio_indexes, self.data_dict, source) 222 | 223 | iteration = 0 224 | pointer = 0 225 | 226 | while True: 227 | if iteration == max_iteration: 228 | break 229 | 230 | # Stop after one pass over the data 231 | if pointer >= len(audio_indexes): 232 | break 233 | 234 | # Get batch audio_indexes 235 | batch_audio_indexes = audio_indexes[pointer: pointer + batch_size] 236 | pointer += batch_size 237 | iteration += 1 238 | 239 | batch_data_dict = {} 240 | 241 | batch_data_dict['audio_name'] = \ 242 | self.data_dict['audio_name'][batch_audio_indexes] 243 | 244 | batch_feature = self.data_dict['feature'][batch_audio_indexes] 245 | batch_feature = self.transform(batch_feature) 246 | batch_data_dict['feature'] = batch_feature 247 | 248 | batch_feature_gamm = self.data_dict['feature_gamm'][batch_audio_indexes] 249 | batch_feature_gamm = self.transform_gamm(batch_feature_gamm) 250 | batch_data_dict['feature_gamm'] = batch_feature_gamm 251 | 252 | batch_feature_mfcc = self.data_dict['feature_mfcc'][batch_audio_indexes] 253 | batch_feature_mfcc = self.transform_mfcc(batch_feature_mfcc) 254 | batch_data_dict['feature_mfcc'] = batch_feature_mfcc 255 | 256 | batch_feature_panns = self.data_dict['feature_panns'][batch_audio_indexes] 257 | batch_feature_panns = self.transform_panns(batch_feature_panns) 258 | batch_data_dict['feature_panns'] = batch_feature_panns 259 | 260 | sparse_target = self.data_dict['target'][batch_audio_indexes] 261 | batch_data_dict['target'] = sparse_to_categorical( 262 | sparse_target, self.all_classes_num) 263 | 264 | yield batch_data_dict 265 | 266 | 267 | class EvaluationDataGenerator(Base): 268 | def __init__(self, feature_hdf5_path, scalar, batch_size, seed=1234): 269 | '''Data generator for evaluation. 270 | 271 | Args: 272 | feature_hdf5_path: string, path of hdf5 feature file 273 | scalar: object, containing mean and std value 274 | batch_size: int 275 | seed: int, random seed 276 | ''' 277 | self.scalar = scalar 278 | self.batch_size = batch_size 279 | 280 | self.data_dict = self.load_hdf5(feature_hdf5_path) 281 | 282 | def generate_evaluation(self, data_type, max_iteration=None): 283 | '''Generate mini-batch data for evaluation. 
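        Iterates over every clip in the hdf5 file; no targets are yielded
        (the evaluation clips are unlabeled here), and the data_type argument
        is accepted only for interface symmetry with generate_validate.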
284 | 285 | Args: 286 | data_type: 'train' | 'validate' 287 | max_iteration: int, maximum iteration to validate to speed up validation 288 | 289 | Returns: 290 | batch_data_dict: dict containing audio_name, feature and target 291 | ''' 292 | 293 | batch_size = self.batch_size 294 | audio_indexes = np.arange(len(self.data_dict['audio_name'])) 295 | 296 | iteration = 0 297 | pointer = 0 298 | 299 | while True: 300 | if iteration == max_iteration: 301 | break 302 | 303 | # Reset pointer 304 | if pointer >= len(audio_indexes): 305 | break 306 | 307 | # Get batch audio_indexes 308 | batch_audio_indexes = audio_indexes[pointer: pointer + batch_size] 309 | pointer += batch_size 310 | iteration += 1 311 | 312 | batch_data_dict = {} 313 | 314 | batch_data_dict['audio_name'] = \ 315 | self.data_dict['audio_name'][batch_audio_indexes] 316 | 317 | batch_feature = self.data_dict['feature'][batch_audio_indexes] 318 | batch_feature = self.transform(batch_feature) 319 | batch_data_dict['feature'] = batch_feature 320 | 321 | batch_feature_gamm = self.data_dict['feature_gamm'][batch_audio_indexes] 322 | batch_feature_gamm = self.transform_gamm(batch_feature_gamm) 323 | batch_data_dict['feature_gamm'] = batch_feature_gamm 324 | 325 | batch_feature_panns = self.data_dict['feature_panns'][batch_audio_indexes] 326 | batch_feature_panns = self.transform_panns(batch_feature_panns) 327 | batch_data_dict['feature_panns'] = batch_feature_panns 328 | 329 | batch_feature_mfcc = self.data_dict['feature_mfcc'][batch_audio_indexes] 330 | batch_feature_mfcc = self.transform_mfcc(batch_feature_mfcc) 331 | batch_data_dict['feature_mfcc'] = batch_feature_mfcc 332 | 333 | yield batch_data_dict 334 | 335 | def transform(self, x): 336 | return scale(x, self.scalar['mean'], self.scalar['std']) 337 | def transform_gamm(self, x): 338 | return scale(x, self.scalar['mean_gamm'], self.scalar['std_gamm']) 339 | def transform_mfcc(self, x): 340 | return scale(x, self.scalar['mean_mfcc'], self.scalar['std_mfcc']) 341 | def transform_panns(self, x): 342 | return scale(x, self.scalar['mean_panns'], self.scalar['std_panns']) 343 | -------------------------------------------------------------------------------- /pytorch/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(1, os.path.join(sys.path[0], '../utils')) 4 | import numpy as np 5 | import argparse 6 | import h5py 7 | import math 8 | import time 9 | import logging 10 | import matplotlib.pyplot as plt 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | 16 | from utilities import (create_folder, get_filename, create_logging, load_scalar, 17 | get_subdir, get_sources, write_submission, mixup_data, mixup_criterion) 18 | from data_generator import DataGenerator, EvaluationDataGenerator 19 | from models import * 20 | from losses import nll_loss 21 | from evaluate import Evaluator, StatisticsContainer 22 | from pytorch_utils import move_data_to_gpu, forward 23 | import config 24 | 25 | 26 | def train(args): 27 | '''Training. Model will be saved after several iterations. 28 | 29 | Args: 30 | dataset_dir: string, directory of dataset 31 | workspace: string, directory of workspace 32 | subtask: 'a' | 'b' | 'c', corresponds to 3 subtasks in DCASE2019 Task1 33 | data_type: 'development' | 'evaluation' 34 | holdout_fold: '1' | 'none', set 1 for development and none for training 35 | on all data without validation 36 | model_type: string, e.g. 
'Cnn_9layers_AvgPooling' 37 | batch_size: int 38 | cuda: bool 39 | mini_data: bool, set True for debugging on a small part of data 40 | ''' 41 | 42 | # Arugments & parameters 43 | dataset_dir = args.dataset_dir 44 | workspace = args.workspace 45 | subtask = args.subtask 46 | data_type = args.data_type 47 | holdout_fold = args.holdout_fold 48 | model_type = args.model_type 49 | batch_size = args.batch_size 50 | cuda = args.cuda and torch.cuda.is_available() 51 | mini_data = args.mini_data 52 | filename = args.filename 53 | ite_train = args.ite_train 54 | ite_eva = args.ite_eva 55 | ite_store = args.ite_store 56 | 57 | mel_bins = config.mel_bins 58 | frames_per_second = config.frames_per_second 59 | max_iteration = None # Number of mini-batches to evaluate on training data 60 | reduce_lr = True 61 | 62 | sources_to_evaluate = get_sources(subtask) 63 | in_domain_classes_num = len(config.labels) - 1 64 | 65 | # Paths 66 | if mini_data: 67 | prefix = 'minidata_' 68 | else: 69 | prefix = '' 70 | 71 | sub_dir = get_subdir(subtask, data_type) 72 | 73 | train_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 74 | 'fold1_train.csv') 75 | 76 | validate_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 77 | 'fold1_evaluate.csv') 78 | 79 | feature_hdf5_path = os.path.join(workspace, 'features', 80 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 81 | '{}.h5'.format(sub_dir)) 82 | 83 | scalar_path = os.path.join(workspace, 'scalars', 84 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 85 | '{}.h5'.format(sub_dir)) 86 | 87 | checkpoints_dir = os.path.join(workspace, 'checkpoints', filename, 88 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 89 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 90 | model_type) 91 | create_folder(checkpoints_dir) 92 | 93 | validate_statistics_path = os.path.join(workspace, 'statistics', filename, 94 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 95 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 96 | model_type, 'validate_statistics.pickle') 97 | 98 | create_folder(os.path.dirname(validate_statistics_path)) 99 | 100 | logs_dir = os.path.join(workspace, 'logs', filename, args.mode, 101 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 102 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), model_type) 103 | create_logging(logs_dir, 'w') 104 | logging.info(args) 105 | 106 | if cuda: 107 | logging.info('Using GPU.') 108 | else: 109 | logging.info('Using CPU. 
Set --cuda flag to use GPU.') 110 | 111 | # Load scalar 112 | scalar = load_scalar(scalar_path) 113 | 114 | # Model 115 | Model = eval(model_type) 116 | 117 | if subtask in ['a', 'b']: 118 | model = Model(in_domain_classes_num, activation='logsoftmax') 119 | loss_func = nll_loss 120 | 121 | elif subtask == 'c': 122 | model = Model(in_domain_classes_num, activation='sigmoid') 123 | loss_func = F.binary_cross_entropy 124 | 125 | if cuda: 126 | model.cuda() 127 | 128 | optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), 129 | eps=1e-08, weight_decay=0., amsgrad=True) 130 | 131 | # Data generator 132 | data_generator = DataGenerator( 133 | feature_hdf5_path=feature_hdf5_path, 134 | train_csv=train_csv, 135 | validate_csv=validate_csv, 136 | holdout_fold=holdout_fold, 137 | scalar=scalar, 138 | batch_size=batch_size) 139 | 140 | # Evaluator 141 | evaluator = Evaluator( 142 | model=model, 143 | data_generator=data_generator, 144 | subtask=subtask, 145 | cuda=cuda) 146 | 147 | # Statistics 148 | validate_statistics_container = StatisticsContainer(validate_statistics_path) 149 | 150 | train_bgn_time = time.time() 151 | iteration = 0 152 | 153 | # Train on mini batches 154 | for batch_data_dict in data_generator.generate_train(): 155 | 156 | # Evaluate 157 | #1800 158 | if iteration % 200 == 0: 159 | print(iteration) 160 | if iteration % 200 == 0 and iteration > ite_eva: 161 | logging.info('------------------------------------') 162 | logging.info('Iteration: {}'.format(iteration)) 163 | 164 | train_fin_time = time.time() 165 | 166 | for source in sources_to_evaluate: 167 | train_statistics = evaluator.evaluate( 168 | data_type='train', 169 | source=source, 170 | max_iteration=None, 171 | verbose=False) 172 | 173 | if holdout_fold != 'none': 174 | for source in sources_to_evaluate: 175 | validate_statistics = evaluator.evaluate( 176 | data_type='validate', 177 | source=source, 178 | max_iteration=None, 179 | verbose=False) 180 | 181 | validate_statistics_container.append_and_dump( 182 | iteration, source, validate_statistics) 183 | 184 | train_time = train_fin_time - train_bgn_time 185 | validate_time = time.time() - train_fin_time 186 | 187 | logging.info( 188 | 'Train time: {:.3f} s, validate time: {:.3f} s' 189 | ''.format(train_time, validate_time)) 190 | 191 | train_bgn_time = time.time() 192 | 193 | # Save model 194 | if iteration % 200 == 0 and iteration > ite_store: 195 | checkpoint = { 196 | 'iteration': iteration, 197 | 'model': model.state_dict(), 198 | 'optimizer': optimizer.state_dict()} 199 | 200 | checkpoint_path = os.path.join( 201 | checkpoints_dir, '{}_iterations.pth'.format(iteration)) 202 | 203 | torch.save(checkpoint, checkpoint_path) 204 | logging.info('Model saved to {}'.format(checkpoint_path)) 205 | 206 | # Reduce learning rate 207 | if reduce_lr and iteration % 200 == 0 and iteration > 0: 208 | for param_group in optimizer.param_groups: 209 | param_group['lr'] *= 0.91 210 | 211 | # Move data to GPU 212 | for key in batch_data_dict.keys(): 213 | if key in ['feature', 'feature_gamm', 'feature_mfcc', 'feature_panns', 'target']: 214 | batch_data_dict[key] = move_data_to_gpu(batch_data_dict[key], cuda) 215 | 216 | # Train 217 | # batch_output,batch_loss = model(batch_data_dict['feature'], batch_data_dict['feature_gamm'], batch_data_dict['feature_mfcc'], batch_data_dict['feature_panns']) 218 | # loss = loss_func(batch_output, batch_data_dict['target']) 219 | 220 | # Using Mixup 221 | model.train() 222 | mixed_x1, mixed_x2, mixed_x3, mixed_x4, y_a, y_b, lam = 
mixup_data(x1=batch_data_dict['feature'], x2=batch_data_dict['feature_gamm'], x3=batch_data_dict['feature_mfcc'], x4=batch_data_dict['feature_panns'], y=batch_data_dict['target'], alpha=0.2) 223 | batch_output,batch_loss = model(mixed_x1, mixed_x2, mixed_x3, mixed_x4) 224 | 225 | if batch_output.shape[1] == 10: # single scale models 226 | loss = mixup_criterion(loss_func, batch_output, y_a, y_b, lam) 227 | else: # multi scale models 228 | losses = [] 229 | for ite in range(batch_output.shape[1]-1): 230 | loss = mixup_criterion(loss_func, batch_output[:,ite,:], y_a, y_b, lam) 231 | losses.append(loss) 232 | loss = sum(losses) 233 | 234 | # Backward 235 | optimizer.zero_grad() 236 | loss.backward() 237 | optimizer.step() 238 | 239 | # Stop learning 240 | # 12000 for scratch 241 | if iteration == ite_train: 242 | break 243 | 244 | iteration += 1 245 | 246 | 247 | def inference_validation(args): 248 | '''Inference and calculate metrics on validation data. 249 | 250 | Args: 251 | dataset_dir: string, directory of dataset 252 | subtask: 'a' | 'b' | 'c', corresponds to 3 subtasks in DCASE2019 Task1 253 | data_type: 'development' 254 | workspace: string, directory of workspace 255 | model_type: string, e.g. 'Cnn_9layers' 256 | iteration: int 257 | batch_size: int 258 | cuda: bool 259 | mini_data: bool, set True for debugging on a small part of data 260 | visualize: bool 261 | ''' 262 | # Arugments & parameters 263 | dataset_dir = args.dataset_dir 264 | subtask = args.subtask 265 | data_type = args.data_type 266 | workspace = args.workspace 267 | model_type = args.model_type 268 | holdout_fold = args.holdout_fold 269 | iteration = args.iteration 270 | batch_size = args.batch_size 271 | cuda = args.cuda and torch.cuda.is_available() 272 | mini_data = args.mini_data 273 | visualize = args.visualize 274 | filename = args.filename 275 | 276 | mel_bins = config.mel_bins 277 | frames_per_second = config.frames_per_second 278 | 279 | sources = get_sources(subtask) 280 | in_domain_classes_num = len(config.labels) - 1 281 | 282 | # Paths 283 | if mini_data: 284 | prefix = 'minidata_' 285 | else: 286 | prefix = '' 287 | 288 | sub_dir = get_subdir(subtask, data_type) 289 | 290 | train_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 291 | 'fold1_train.csv') 292 | 293 | validate_csv = os.path.join(dataset_dir, sub_dir, 'evaluation_setup', 294 | 'fold1_evaluate.csv') 295 | 296 | feature_hdf5_path = os.path.join(workspace, 'features', 297 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 298 | '{}.h5'.format(sub_dir)) 299 | 300 | scalar_path = os.path.join(workspace, 'scalars', 301 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 302 | '{}.h5'.format(sub_dir)) 303 | 304 | checkpoint_path = os.path.join(workspace, 'checkpoints', filename, 305 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 306 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 307 | model_type, '{}_iterations.pth'.format(iteration)) 308 | 309 | logs_dir = os.path.join(workspace, 'logs', filename, args.mode, 310 | '{}logmel_{}frames_{}melbins'.format(prefix, frames_per_second, mel_bins), 311 | '{}'.format(sub_dir), 'holdout_fold={}'.format(holdout_fold), 312 | model_type) 313 | create_logging(logs_dir, 'w') 314 | logging.info(args) 315 | 316 | # Load scalar 317 | scalar = load_scalar(scalar_path) 318 | 319 | # Load model 320 | Model = eval(model_type) 321 | 322 | if subtask in ['a', 'b']: 323 | model = Model(in_domain_classes_num, 
activation='logsoftmax') 324 | loss_func = nll_loss 325 | 326 | elif subtask == 'c': 327 | model = Model(in_domain_classes_num, activation='sigmoid') 328 | loss_func = F.binary_cross_entropy 329 | 330 | checkpoint = torch.load(checkpoint_path) 331 | model.load_state_dict(checkpoint['model']) 332 | 333 | if cuda: 334 | model.cuda() 335 | 336 | # Data generator 337 | data_generator = DataGenerator( 338 | feature_hdf5_path=feature_hdf5_path, 339 | train_csv=train_csv, 340 | validate_csv=validate_csv, 341 | holdout_fold=holdout_fold, 342 | scalar=scalar, 343 | batch_size=batch_size) 344 | 345 | # Evaluator 346 | evaluator = Evaluator( 347 | model=model, 348 | data_generator=data_generator, 349 | subtask=subtask, 350 | cuda=cuda) 351 | 352 | if subtask in ['a', 'c']: 353 | evaluator.evaluate(data_type='validate', source='a', verbose=True) 354 | evaluator.evaluate(data_type='validate', source='b', verbose=True) 355 | evaluator.evaluate(data_type='validate', source='c', verbose=True) 356 | evaluator.evaluate(data_type='validate', source='s1', verbose=True) 357 | evaluator.evaluate(data_type='validate', source='s2', verbose=True) 358 | evaluator.evaluate(data_type='validate', source='s3', verbose=True) 359 | 360 | elif subtask == 'b': 361 | evaluator.evaluate(data_type='validate', source='a', verbose=True) 362 | evaluator.evaluate(data_type='validate', source='b', verbose=True) 363 | evaluator.evaluate(data_type='validate', source='c', verbose=True) 364 | 365 | # Visualize log mel spectrogram 366 | if visualize: 367 | evaluator.visualize(data_type='validate', source='a') 368 | 369 | 370 | 371 | if __name__ == '__main__': 372 | parser = argparse.ArgumentParser(description='Example of parser. ') 373 | subparsers = parser.add_subparsers(dest='mode') 374 | 375 | # Train 376 | parser_train = subparsers.add_parser('train') 377 | parser_train.add_argument('--dataset_dir', type=str, required=True, help='Directory of dataset.') 378 | parser_train.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.') 379 | parser_train.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Correspond to 3 subtasks in DCASE2019 Task1.') 380 | parser_train.add_argument('--data_type', type=str, choices=['development', 'evaluation'], required=True) 381 | parser_train.add_argument('--holdout_fold', type=str, choices=['1', 'none'], required=True, help='Set 1 for development and none for training on all data without validation.') 382 | parser_train.add_argument('--model_type', type=str, required=True, help='E.g., Cnn_9layers_AvgPooling.') 383 | parser_train.add_argument('--batch_size', type=int, required=True) 384 | parser_train.add_argument('--ite_train', type=int, required=True) 385 | parser_train.add_argument('--ite_eva', type=int, required=True) 386 | parser_train.add_argument('--ite_store', type=int, required=True) 387 | parser_train.add_argument('--cuda', action='store_true', default=False) 388 | parser_train.add_argument('--fixed', type=str, default=False) 389 | parser_train.add_argument('--finetune', type=str, default=False) 390 | parser_train.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.') 391 | 392 | # Inference validation data 393 | parser_inference_validation = subparsers.add_parser('inference_validation') 394 | parser_inference_validation.add_argument('--dataset_dir', type=str, required=True, help='Directory of dataset.') 395 | parser_inference_validation.add_argument('--workspace', 
type=str, required=True, help='Directory of your workspace.') 396 | parser_inference_validation.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Correspond to 3 subtasks in DCASE2019 Task1.') 397 | parser_inference_validation.add_argument('--data_type', type=str, choices=['development'], required=True) 398 | parser_inference_validation.add_argument('--holdout_fold', type=str, choices=['1'], required=True) 399 | parser_inference_validation.add_argument('--model_type', type=str, required=True, help='E.g., Cnn_9layers_AvgPooling.') 400 | parser_inference_validation.add_argument('--iteration', type=int, required=True, help='Load model of this iteration.') 401 | parser_inference_validation.add_argument('--batch_size', type=int, required=True) 402 | parser_inference_validation.add_argument('--cuda', action='store_true', default=False) 403 | parser_inference_validation.add_argument('--visualize', action='store_true', default=False, help='Visualize log mel spectrogram of different sound classes.') 404 | parser_inference_validation.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.') 405 | 406 | # Inference evaluation data 407 | parser_inference_validation = subparsers.add_parser('inference_evaluation') 408 | parser_inference_validation.add_argument('--workspace', type=str, required=True, help='Directory of your workspace.') 409 | parser_inference_validation.add_argument('--subtask', type=str, choices=['a', 'b', 'c'], required=True, help='Correspond to 3 subtasks in DCASE2019 Task1.') 410 | parser_inference_validation.add_argument('--data_type', type=str, choices=['leaderboard', 'evaluation'], required=True) 411 | parser_inference_validation.add_argument('--model_type', type=str, required=True, help='E.g., Cnn_9layers_AvgPooling.') 412 | parser_inference_validation.add_argument('--iteration', type=int, required=True, help='Load model of this iteration.') 413 | parser_inference_validation.add_argument('--batch_size', type=int, required=True) 414 | parser_inference_validation.add_argument('--cuda', action='store_true', default=False) 415 | parser_inference_validation.add_argument('--mini_data', action='store_true', default=False, help='Set True for debugging on a small part of data.') 416 | 417 | # Parse arguments 418 | args = parser.parse_args() 419 | args.filename = get_filename(__file__) 420 | 421 | if args.mode == 'train': 422 | train(args) 423 | 424 | elif args.mode == 'inference_validation': 425 | inference_validation(args) 426 | 427 | elif args.mode == 'inference_evaluation': 428 | inference_evaluation(args) 429 | 430 | else: 431 | raise Exception('Error argument!') 432 | -------------------------------------------------------------------------------- /pytorch/models.py: -------------------------------------------------------------------------------- 1 | import math 2 | from torch.autograd import Variable 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from nnAudio import Spectrogram as Spec 7 | from torchlibrosa.augmentation import SpecAugmentation 8 | import config 9 | 10 | def init_layer(layer, nonlinearity='leaky_relu'): 11 | """Initialize a Linear or Convolutional layer. """ 12 | nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity) 13 | 14 | if hasattr(layer, 'bias'): 15 | if layer.bias is not None: 16 | layer.bias.data.fill_(0.) 17 | 18 | 19 | def init_bn(bn): 20 | """Initialize a Batchnorm layer. """ 21 | 22 | bn.bias.data.fill_(0.) 
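    # The zeroed bias above, together with the zeroed running mean and unit
    # weight/variance set below, give the batch-norm layer a neutral starting
    # state at initialization.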
23 | bn.running_mean.data.fill_(0.) 24 | bn.weight.data.fill_(1.) 25 | bn.running_var.data.fill_(1.) 26 | 27 | def conv3x3(in_planes, out_planes, stride=1): 28 | """3x3 convolution with padding""" 29 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 30 | padding=1, bias=False) 31 | 32 | def conv1x1(in_planes, out_planes, stride=1): 33 | """1x1 convolution""" 34 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 35 | 36 | class ConvBlock(nn.Module): 37 | def __init__(self, in_channels, out_channels): 38 | 39 | super(ConvBlock, self).__init__() 40 | 41 | self.conv1 = nn.Conv2d(in_channels=in_channels, 42 | out_channels=out_channels, 43 | kernel_size=(3, 3), stride=(1, 1), 44 | padding=(1, 1), bias=False) 45 | 46 | self.conv2 = nn.Conv2d(in_channels=out_channels, 47 | out_channels=out_channels, 48 | kernel_size=(3, 3), stride=(1, 1), 49 | padding=(1, 1), bias=False) 50 | 51 | self.bn1 = nn.BatchNorm2d(out_channels) 52 | self.bn2 = nn.BatchNorm2d(out_channels) 53 | 54 | self.init_weights() 55 | 56 | def init_weights(self): 57 | 58 | init_layer(self.conv1) 59 | init_layer(self.conv2) 60 | init_bn(self.bn1) 61 | init_bn(self.bn2) 62 | 63 | def forward(self, input, pool_size=(2, 2), pool_type='avg'): 64 | 65 | x = input 66 | x = F.relu_(self.bn1(self.conv1(x))) 67 | x = F.relu_(self.bn2(self.conv2(x))) 68 | if pool_type == 'max': 69 | x = F.max_pool2d(x, kernel_size=pool_size) 70 | elif pool_type == 'avg': 71 | x = F.avg_pool2d(x, kernel_size=pool_size) 72 | else: 73 | raise Exception('Incorrect argument!') 74 | 75 | return x 76 | 77 | class Logmel_Cnn(nn.Module): 78 | 79 | def __init__(self, classes_num, activation): 80 | super(Logmel_Cnn, self).__init__() 81 | 82 | self.activation = activation 83 | self.spec_layer = Spec.MelSpectrogram(sr=config.sample_rate, n_fft=config.window_size, n_mels=config.mel_bins, hop_length=config.hop_size, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False) 84 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=5, freq_stripes_num=2) 85 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 86 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 87 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 88 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 89 | self.fc2 = nn.Linear(512, 512, bias=True) 90 | self.fc = nn.Linear(512, classes_num, bias=True) 91 | 92 | self.init_weights() 93 | 94 | def init_weights(self): 95 | 96 | init_layer(self.fc) 97 | init_layer(self.fc2) 98 | 99 | def forward(self, x1,x2,x3,x4): 100 | ''' 101 | Input: (batch_size, times_steps, freq_bins)''' 102 | 103 | x = x1 104 | x = self.spec_layer(x) 105 | x = 10*torch.log10(x) 106 | x = torch.clamp(x, min=-100.) 
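        # The two lines above convert the mel power spectrogram to decibels
        # (10*log10) and floor it at -100 dB; the clamp also maps the -inf
        # that log10(0) produces on silent frames back onto that floor.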
107 | x = x.transpose(1,2) 108 | x = x[:,None,:,:] 109 | #if self.training: 110 | #x = self.spec_augmenter(x) 111 | '''(batch_size, 1, times_steps, freq_bins)''' 112 | 113 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 114 | #x = F.dropout(x, p=0.2, training=self.training) 115 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 116 | #x = F.dropout(x, p=0.2, training=self.training) 117 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 118 | #x = F.dropout(x, p=0.2, training=self.training) 119 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 120 | #x = F.dropout(x, p=0.2, training=self.training) 121 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 122 | 123 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 124 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 125 | x = F.relu_(self.fc2(x)) 126 | x = F.dropout(x, p=0.5, training=self.training) 127 | x = self.fc(x) 128 | loss = torch.softmax(x,-1) 129 | if self.activation == 'logsoftmax': 130 | output = F.log_softmax(x, dim=-1) 131 | 132 | elif self.activation == 'sigmoid': 133 | output = torch.sigmoid(x) 134 | 135 | return output,loss 136 | 137 | class Cqt_Cnn(nn.Module): 138 | 139 | def __init__(self, classes_num, activation): 140 | super(Cqt_Cnn, self).__init__() 141 | 142 | self.activation = activation 143 | self.spec_layer = Spec.CQT(sr=config.sample_rate, hop_length=config.hop_size, fmin=220, fmax=None, n_bins=64, bins_per_octave=12, norm=1, window='hann', center=True, pad_mode='reflect') 144 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=5, freq_stripes_num=2) 145 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 146 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 147 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 148 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 149 | self.fc2 = nn.Linear(512, 512, bias=True) 150 | self.fc = nn.Linear(512, classes_num, bias=True) 151 | 152 | self.init_weights() 153 | 154 | def init_weights(self): 155 | 156 | init_layer(self.fc) 157 | init_layer(self.fc2) 158 | 159 | def forward(self, x1,x2,x3,x4): 160 | ''' 161 | Input: (batch_size, times_steps, freq_bins)''' 162 | 163 | x = x1 164 | x = self.spec_layer(x) 165 | x = 10*torch.log10(x) 166 | x = torch.clamp(x, min=-100.) 
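        # Shape note (assuming nnAudio returns (batch_size, freq_bins, time_steps)):
        # the transpose below gives (batch_size, time_steps, freq_bins), and the
        # None index then adds the channel axis that the conv blocks expect.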
167 | x = x.transpose(1,2) 168 | x = x[:,None,:,:] 169 | #if self.training: 170 | #x = self.spec_augmenter(x) 171 | '''(batch_size, 1, times_steps, freq_bins)''' 172 | 173 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 174 | # x = F.dropout(x, p=0.2, training=self.training) 175 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 176 | # x = F.dropout(x, p=0.2, training=self.training) 177 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 178 | # x = F.dropout(x, p=0.2, training=self.training) 179 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 180 | # x = F.dropout(x, p=0.2, training=self.training) 181 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 182 | 183 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 184 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 185 | x = F.relu_(self.fc2(x)) 186 | x = F.dropout(x, p=0.5, training=self.training) 187 | x = self.fc(x) 188 | loss = torch.softmax(x,-1) 189 | if self.activation == 'logsoftmax': 190 | output = F.log_softmax(x, dim=-1) 191 | 192 | elif self.activation == 'sigmoid': 193 | output = torch.sigmoid(x) 194 | 195 | return output,loss 196 | 197 | class Gamm_Cnn(nn.Module): 198 | 199 | def __init__(self, classes_num, activation): 200 | super(Gamm_Cnn, self).__init__() 201 | 202 | self.activation = activation 203 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=5, freq_stripes_num=2) 204 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 205 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 206 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 207 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 208 | self.fc2 = nn.Linear(512, 512, bias=True) 209 | self.fc = nn.Linear(512, classes_num, bias=True) 210 | 211 | self.init_weights() 212 | 213 | def init_weights(self): 214 | 215 | init_layer(self.fc) 216 | init_layer(self.fc2) 217 | 218 | def forward(self, x1,x2,x3,x4): 219 | ''' 220 | Input: (batch_size, times_steps, freq_bins)''' 221 | 222 | x = x2 223 | x = x[:,None,:,:] 224 | #if self.training: 225 | #x = self.spec_augmenter(x) 226 | '''(batch_size, 1, times_steps, freq_bins)''' 227 | 228 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 229 | # x = F.dropout(x, p=0.2, training=self.training) 230 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 231 | # x = F.dropout(x, p=0.2, training=self.training) 232 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 233 | # x = F.dropout(x, p=0.2, training=self.training) 234 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 235 | # x = F.dropout(x, p=0.2, training=self.training) 236 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 237 | 238 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 239 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 240 | x = F.relu_(self.fc2(x)) 241 | x = F.dropout(x, p=0.5, training=self.training) 242 | x = self.fc(x) 243 | loss = torch.softmax(x,-1) 244 | if self.activation == 'logsoftmax': 245 | output = F.log_softmax(x, dim=-1) 246 | 247 | elif self.activation == 'sigmoid': 248 | output = torch.sigmoid(x) 249 | 250 | return output,loss 251 | 252 | class Mfcc_Cnn(nn.Module): 253 | 254 | def __init__(self, classes_num, activation): 255 | super(Mfcc_Cnn, self).__init__() 256 | 257 | self.activation = activation 258 | #self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
freq_drop_width=5, freq_stripes_num=2) 259 | self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) 260 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 261 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 262 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 263 | self.fc2 = nn.Linear(512, 512, bias=True) 264 | self.fc = nn.Linear(512, classes_num, bias=True) 265 | 266 | self.init_weights() 267 | 268 | def init_weights(self): 269 | 270 | init_layer(self.fc) 271 | init_layer(self.fc2) 272 | 273 | def forward(self, x1,x2,x3,x4): 274 | ''' 275 | Input: (batch_size, times_steps, freq_bins)''' 276 | 277 | x = x3 278 | x = x[:,None,:,:] 279 | #if self.training: 280 | #x = self.spec_augmenter(x) 281 | '''(batch_size, 1, times_steps, freq_bins)''' 282 | 283 | x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg') 284 | # x = F.dropout(x, p=0.2, training=self.training) 285 | x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg') 286 | # x = F.dropout(x, p=0.2, training=self.training) 287 | x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') 288 | # x = F.dropout(x, p=0.2, training=self.training) 289 | x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') 290 | # x = F.dropout(x, p=0.2, training=self.training) 291 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 292 | 293 | x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) 294 | (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 295 | x = F.relu_(self.fc2(x)) 296 | x = F.dropout(x, p=0.5, training=self.training) 297 | x = self.fc(x) 298 | loss = torch.softmax(x,-1) 299 | if self.activation == 'logsoftmax': 300 | output = F.log_softmax(x, dim=-1) 301 | 302 | elif self.activation == 'sigmoid': 303 | output = torch.sigmoid(x) 304 | 305 | return output,loss 306 | 307 | class DCMR(nn.Module): 308 | 309 | def __init__(self, classes_num, activation): 310 | super(DCMR, self).__init__() 311 | 312 | self.activation = activation 313 | self.logmel_cnn = Logmel_Cnn(classes_num, activation) 314 | self.cqt_cnn = Cqt_Cnn(classes_num, activation) 315 | self.gamm_cnn = Gamm_Cnn(classes_num, activation) 316 | self.mfcc_cnn = Mfcc_Cnn(classes_num, activation) 317 | self.ite = 15000 # set your pre-trained model iteration 318 | self.model_path = 'workspace/checkpoints/main_eva/logmel_86frames_40melbins/TAU-urban-acoustic-scenes-2020-mobile-development/holdout_fold=1/' # set your model saved path. 
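        # DCMR late-fuses the four single-feature CNNs defined above:
        # init_weights() loads a '{model}/{ite}_iterations.pth' checkpoint for
        # each of them from model_path, so Logmel_Cnn, Cqt_Cnn, Gamm_Cnn and
        # Mfcc_Cnn must be trained before this model is instantiated.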
319 | self.init_weights() 320 | 321 | def init_weights(self): 322 | logmel_path = self.model_path +'Logmel_Cnn/'+str(self.ite)+'_iterations.pth' 323 | cqt_path = self.model_path +'Cqt_Cnn/'+str(self.ite)+'_iterations.pth' 324 | gamm_path = self.model_path +'Gamm_Cnn/'+str(self.ite)+'_iterations.pth' 325 | mfcc_path = self.model_path +'Mfcc_Cnn/'+str(self.ite)+'_iterations.pth' 326 | 327 | logmel_ch = torch.load(logmel_path) 328 | cqt_ch = torch.load(cqt_path) 329 | gamm_ch = torch.load(gamm_path) 330 | mfcc_ch = torch.load(mfcc_path) 331 | self.logmel_cnn.load_state_dict(logmel_ch['model']) 332 | self.cqt_cnn.load_state_dict(cqt_ch['model']) 333 | self.gamm_cnn.load_state_dict(gamm_ch['model']) 334 | self.mfcc_cnn.load_state_dict(mfcc_ch['model']) 335 | 336 | def forward(self, input1,input2,input3,input4): 337 | ''' 338 | Input: (batch_size, total_frames)''' 339 | 340 | x1,loss1 = self.logmel_cnn(input1,input2,input3,input4) 341 | x2,loss2 = self.cqt_cnn(input1,input2,input3,input4) 342 | x3,loss3 = self.gamm_cnn(input1,input2,input3,input4) 343 | x4,loss4 = self.mfcc_cnn(input1,input2,input3,input4) 344 | x = x1 + x2 + x3 + x4 345 | loss = (loss1+loss2+loss3+loss4)/4. 346 | return x,loss 347 | 348 | class Logmel_DCMF(nn.Module): 349 | 350 | def __init__(self, classes_num, activation): 351 | super(Logmel_DCMF, self).__init__() 352 | 353 | self.activation = activation 354 | self.spec_layer = Spec.MelSpectrogram(sr=config.sample_rate, n_fft=config.window_size, n_mels=config.mel_bins, hop_length=config.hop_size, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False) 355 | self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64) 356 | self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128) 357 | self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256) 358 | self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512) 359 | 360 | self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64) 361 | self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128) 362 | self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256) 363 | self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512) 364 | 365 | self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64) 366 | self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128) 367 | self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256) 368 | self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512) 369 | 370 | self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64) 371 | self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128) 372 | self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256) 373 | self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512) 374 | 375 | 376 | self.fc1_1 = nn.Linear(512, 512, bias=True) 377 | self.dropout1 = nn.Dropout(p=0.5) 378 | self.fc1_2 = nn.Linear(512, classes_num, bias=True) 379 | 380 | self.fc2_1 = nn.Linear(512, 512, bias=True) 381 | self.dropout2 = nn.Dropout(p=0.5) 382 | self.fc2_2 = nn.Linear(512, classes_num, bias=True) 383 | 384 | self.fc3_1 = nn.Linear(512, 512, bias=True) 385 | self.dropout3 = nn.Dropout(p=0.5) 386 | self.fc3_2 = nn.Linear(512, classes_num, bias=True) 387 | 388 | self.fc4_1 = nn.Linear(512, 512, bias=True) 389 | self.dropout4 = nn.Dropout(p=0.5) 390 | self.fc4_2 = nn.Linear(512, classes_num, bias=True) 391 | 392 | self.init_weights() 393 | 394 | def init_weights(self): 395 | 396 | init_layer(self.fc1_1) 397 | 
init_layer(self.fc1_2) 398 | init_layer(self.fc2_1) 399 | init_layer(self.fc2_2) 400 | init_layer(self.fc3_1) 401 | init_layer(self.fc3_2) 402 | init_layer(self.fc4_1) 403 | init_layer(self.fc4_2) 404 | 405 | def forward(self, x1,x2,x3,x4): 406 | ''' 407 | Input: (batch_size, scale_num, total_frames)''' 408 | outputs = [] 409 | 410 | x = x1 411 | x = self.spec_layer(x) 412 | x = 10*torch.log10(x) 413 | x = torch.clamp(x, min=-100.) 414 | x = x.transpose(1,2) 415 | x = x[:,None,:,:] 416 | '''(batch_size, 1, time_steps, freq_bins)''' 417 | temp = x.clone() 418 | count = 0. 419 | 420 | x = temp[:,:,:,0:16] 421 | x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg') 422 | x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg') 423 | x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg') 424 | x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg') 425 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 426 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 427 | x = F.relu_(self.fc1_1(x)) 428 | x = self.dropout1(x) 429 | x = self.fc1_2(x) 430 | x = x.view(x.shape[0], 1, x.shape[1]) 431 | co_output = x 432 | count += 1. 433 | outputs.append(x) 434 | 435 | 436 | x = temp[:,:,:,8:24] 437 | x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg') 438 | x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg') 439 | x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg') 440 | x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg') 441 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 442 | x = F.relu_(self.fc2_1(x)) 443 | x = self.dropout2(x) 444 | x = self.fc2_2(x) 445 | x = x.view(x.shape[0], 1, x.shape[1]) 446 | co_output = x + co_output 447 | count += 1. 448 | outputs.append(x) 449 | 450 | x = temp[:,:,:,16:32] 451 | x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg') 452 | x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg') 453 | x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg') 454 | x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg') 455 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 456 | x = F.relu_(self.fc3_1(x)) 457 | x = self.dropout3(x) 458 | x = self.fc3_2(x) 459 | x = x.view(x.shape[0], 1, x.shape[1]) 460 | co_output = x + co_output 461 | count += 1. 462 | outputs.append(x) 463 | 464 | x = temp[:,:,:,24:40] 465 | x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg') 466 | x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg') 467 | x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg') 468 | x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg') 469 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 470 | x = F.relu_(self.fc4_1(x)) 471 | x = self.dropout4(x) 472 | x = self.fc4_2(x) 473 | x = x.view(x.shape[0], 1, x.shape[1]) 474 | co_output = x + co_output 475 | count += 1. 
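            # The four slices above (0:16, 8:24, 16:32, 24:40) are 16-bin
            # frequency bands with 50% overlap that together cover all 40 mel
            # bins; each band gets its own conv stack and classifier head.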
476 | outputs.append(x) 477 | 478 | outputs.append(co_output/count) 479 | outputs = torch.cat((outputs), 1) 480 | loss = torch.softmax(outputs,-1) 481 | if self.activation == 'logsoftmax': 482 | output = F.log_softmax(outputs, dim=-1) 483 | 484 | elif self.activation == 'sigmoid': 485 | output = torch.sigmoid(outputs) 486 | 487 | return output,loss 488 | 489 | class Mfcc_DCMF(nn.Module): 490 | 491 | def __init__(self, classes_num, activation): 492 | super(Mfcc_DCMF, self).__init__() 493 | 494 | self.activation = activation 495 | 496 | self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64) 497 | self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128) 498 | self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256) 499 | self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512) 500 | 501 | self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64) 502 | self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128) 503 | self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256) 504 | self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512) 505 | 506 | self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64) 507 | self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128) 508 | self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256) 509 | self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512) 510 | 511 | self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64) 512 | self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128) 513 | self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256) 514 | self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512) 515 | 516 | 517 | self.fc1_1 = nn.Linear(512, 512, bias=True) 518 | self.dropout1 = nn.Dropout(p=0.5) 519 | self.fc1_2 = nn.Linear(512, classes_num, bias=True) 520 | 521 | self.fc2_1 = nn.Linear(512, 512, bias=True) 522 | self.dropout2 = nn.Dropout(p=0.5) 523 | self.fc2_2 = nn.Linear(512, classes_num, bias=True) 524 | 525 | self.fc3_1 = nn.Linear(512, 512, bias=True) 526 | self.dropout3 = nn.Dropout(p=0.5) 527 | self.fc3_2 = nn.Linear(512, classes_num, bias=True) 528 | 529 | self.fc4_1 = nn.Linear(512, 512, bias=True) 530 | self.dropout4 = nn.Dropout(p=0.5) 531 | self.fc4_2 = nn.Linear(512, classes_num, bias=True) 532 | 533 | self.init_weights() 534 | 535 | def init_weights(self): 536 | 537 | init_layer(self.fc1_1) 538 | init_layer(self.fc1_2) 539 | init_layer(self.fc2_1) 540 | init_layer(self.fc2_2) 541 | init_layer(self.fc3_1) 542 | init_layer(self.fc3_2) 543 | init_layer(self.fc4_1) 544 | init_layer(self.fc4_2) 545 | 546 | def forward(self, x1,x2,x3,x4): 547 | ''' 548 | Input: (batch_size, scale_num, total_frames)''' 549 | outputs = [] 550 | 551 | x = x3 552 | x = x[:,None,:,:] 553 | '''(batch_size, 1, times_steps, freq_bins)''' 554 | temp = x.clone() 555 | count = 0. 556 | 557 | x = temp[:,:,:,0:16] 558 | x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg') 559 | x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg') 560 | x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg') 561 | x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg') 562 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 563 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 564 | x = F.relu_(self.fc1_1(x)) 565 | x = self.dropout1(x) 566 | x = self.fc1_2(x) 567 | x = x.view(x.shape[0], 1, x.shape[1]) 568 | co_output = x 569 | count += 1. 
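            # As in Logmel_DCMF, each band contributes one (batch_size, 1,
            # classes_num) slice to outputs; the averaged co_output is appended
            # last, and main.py sums the mixup loss over every column except
            # that final averaged one.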
570 | outputs.append(x) 571 | 572 | 573 | x = temp[:,:,:,8:24] 574 | x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg') 575 | x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg') 576 | x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg') 577 | x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg') 578 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 579 | x = F.relu_(self.fc2_1(x)) 580 | x = self.dropout2(x) 581 | x = self.fc2_2(x) 582 | x = x.view(x.shape[0], 1, x.shape[1]) 583 | co_output = x + co_output 584 | count += 1. 585 | outputs.append(x) 586 | 587 | x = temp[:,:,:,16:32] 588 | x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg') 589 | x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg') 590 | x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg') 591 | x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg') 592 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 593 | x = F.relu_(self.fc3_1(x)) 594 | x = self.dropout3(x) 595 | x = self.fc3_2(x) 596 | x = x.view(x.shape[0], 1, x.shape[1]) 597 | co_output = x + co_output 598 | count += 1. 599 | outputs.append(x) 600 | 601 | x = temp[:,:,:,24:40] 602 | x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg') 603 | x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg') 604 | x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg') 605 | x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg') 606 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 607 | x = F.relu_(self.fc4_1(x)) 608 | x = self.dropout4(x) 609 | x = self.fc4_2(x) 610 | x = x.view(x.shape[0], 1, x.shape[1]) 611 | co_output = x + co_output 612 | count += 1. 613 | outputs.append(x) 614 | 615 | outputs.append(co_output/count) 616 | outputs = torch.cat((outputs), 1) 617 | loss = torch.softmax(outputs,-1) 618 | if self.activation == 'logsoftmax': 619 | output = F.log_softmax(outputs, dim=-1) 620 | 621 | elif self.activation == 'sigmoid': 622 | output = torch.sigmoid(outputs) 623 | 624 | return output,loss 625 | 626 | class Cqt_DCMF(nn.Module): 627 | 628 | def __init__(self, classes_num, activation): 629 | super(Cqt_DCMF, self).__init__() 630 | 631 | self.activation = activation 632 | self.spec_layer = Spec.CQT(sr=config.sample_rate, hop_length=config.hop_size, fmin=220, fmax=None, n_bins=64, bins_per_octave=12, norm=1, window='hann', center=True, pad_mode='reflect') 633 | self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64) 634 | self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128) 635 | self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256) 636 | self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512) 637 | 638 | self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64) 639 | self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128) 640 | self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256) 641 | self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512) 642 | 643 | self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64) 644 | self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128) 645 | self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256) 646 | self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512) 647 | 648 | self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64) 649 | self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128) 650 | self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256) 651 | 
self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512) 652 | 653 | self.conv_block5_1 = ConvBlock(in_channels=1, out_channels=64) 654 | self.conv_block5_2 = ConvBlock(in_channels=64, out_channels=128) 655 | self.conv_block5_3 = ConvBlock(in_channels=128, out_channels=256) 656 | self.conv_block5_4 = ConvBlock(in_channels=256, out_channels=512) 657 | 658 | self.conv_block6_1 = ConvBlock(in_channels=1, out_channels=64) 659 | self.conv_block6_2 = ConvBlock(in_channels=64, out_channels=128) 660 | self.conv_block6_3 = ConvBlock(in_channels=128, out_channels=256) 661 | self.conv_block6_4 = ConvBlock(in_channels=256, out_channels=512) 662 | 663 | self.conv_block7_1 = ConvBlock(in_channels=1, out_channels=64) 664 | self.conv_block7_2 = ConvBlock(in_channels=64, out_channels=128) 665 | self.conv_block7_3 = ConvBlock(in_channels=128, out_channels=256) 666 | self.conv_block7_4 = ConvBlock(in_channels=256, out_channels=512) 667 | 668 | 669 | self.fc1_1 = nn.Linear(512, 512, bias=True) 670 | self.dropout1 = nn.Dropout(p=0.5) 671 | self.fc1_2 = nn.Linear(512, classes_num, bias=True) 672 | 673 | self.fc2_1 = nn.Linear(512, 512, bias=True) 674 | self.dropout2 = nn.Dropout(p=0.5) 675 | self.fc2_2 = nn.Linear(512, classes_num, bias=True) 676 | 677 | self.fc3_1 = nn.Linear(512, 512, bias=True) 678 | self.dropout3 = nn.Dropout(p=0.5) 679 | self.fc3_2 = nn.Linear(512, classes_num, bias=True) 680 | 681 | self.fc4_1 = nn.Linear(512, 512, bias=True) 682 | self.dropout4 = nn.Dropout(p=0.5) 683 | self.fc4_2 = nn.Linear(512, classes_num, bias=True) 684 | 685 | self.fc5_1 = nn.Linear(512, 512, bias=True) 686 | self.dropout5 = nn.Dropout(p=0.5) 687 | self.fc5_2 = nn.Linear(512, classes_num, bias=True) 688 | 689 | self.fc6_1 = nn.Linear(512, 512, bias=True) 690 | self.dropout6 = nn.Dropout(p=0.5) 691 | self.fc6_2 = nn.Linear(512, classes_num, bias=True) 692 | 693 | self.fc7_1 = nn.Linear(512, 512, bias=True) 694 | self.dropout7 = nn.Dropout(p=0.5) 695 | self.fc7_2 = nn.Linear(512, classes_num, bias=True) 696 | 697 | self.init_weights() 698 | 699 | def init_weights(self): 700 | 701 | init_layer(self.fc1_1) 702 | init_layer(self.fc1_2) 703 | init_layer(self.fc2_1) 704 | init_layer(self.fc2_2) 705 | init_layer(self.fc3_1) 706 | init_layer(self.fc3_2) 707 | init_layer(self.fc4_1) 708 | init_layer(self.fc4_2) 709 | init_layer(self.fc5_1) 710 | init_layer(self.fc5_2) 711 | init_layer(self.fc6_1) 712 | init_layer(self.fc6_2) 713 | init_layer(self.fc7_1) 714 | init_layer(self.fc7_2) 715 | 716 | def forward(self, x1,x2,x3,x4): 717 | ''' 718 | Input: (batch_size, scale_num, total_frames)''' 719 | outputs = [] 720 | 721 | x = x1 722 | x = self.spec_layer(x) 723 | x = 10*torch.log10(x) 724 | x = torch.clamp(x, min=-100.) 725 | x = x.transpose(1,2) 726 | x = x[:,None,:,:] 727 | '''(batch_size, 1, time_steps, freq_bins)''' 728 | temp = x.clone() 729 | count = 0. 730 | 731 | x = temp[:,:,:,0:16] 732 | x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg') 733 | x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg') 734 | x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg') 735 | x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg') 736 | '''(batch_size, feature_maps, time_steps, freq_bins)''' 737 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 738 | x = F.relu_(self.fc1_1(x)) 739 | x = self.dropout1(x) 740 | x = self.fc1_2(x) 741 | x = x.view(x.shape[0], 1, x.shape[1]) 742 | co_output = x 743 | count += 1. 
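            # Cqt_DCMF widens the scheme to seven overlapping 16-bin bands
            # (0:16, 8:24, ..., 48:64) so that all 64 CQT bins are covered,
            # where the 40-bin log-mel/MFCC variants use four bands.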
744 | outputs.append(x) 745 | 746 | 747 | x = temp[:,:,:,8:24] 748 | x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg') 749 | x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg') 750 | x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg') 751 | x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg') 752 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 753 | x = F.relu_(self.fc2_1(x)) 754 | x = self.dropout2(x) 755 | x = self.fc2_2(x) 756 | x = x.view(x.shape[0], 1, x.shape[1]) 757 | co_output = x + co_output 758 | count += 1. 759 | outputs.append(x) 760 | 761 | x = temp[:,:,:,16:32] 762 | x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg') 763 | x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg') 764 | x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg') 765 | x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg') 766 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 767 | x = F.relu_(self.fc3_1(x)) 768 | x = self.dropout3(x) 769 | x = self.fc3_2(x) 770 | x = x.view(x.shape[0], 1, x.shape[1]) 771 | co_output = x + co_output 772 | count += 1. 773 | outputs.append(x) 774 | 775 | x = temp[:,:,:,24:40] 776 | x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg') 777 | x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg') 778 | x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg') 779 | x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg') 780 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 781 | x = F.relu_(self.fc4_1(x)) 782 | x = self.dropout4(x) 783 | x = self.fc4_2(x) 784 | x = x.view(x.shape[0], 1, x.shape[1]) 785 | co_output = x + co_output 786 | count += 1. 787 | outputs.append(x) 788 | 789 | x = temp[:,:,:,32:48] 790 | x = self.conv_block5_1(x, pool_size=(4, 2), pool_type='avg') 791 | x = self.conv_block5_2(x, pool_size=(4, 2), pool_type='avg') 792 | x = self.conv_block5_3(x, pool_size=(2, 2), pool_type='avg') 793 | x = self.conv_block5_4(x, pool_size=(2, 2), pool_type='avg') 794 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 795 | x = F.relu_(self.fc5_1(x)) 796 | x = self.dropout5(x) 797 | x = self.fc5_2(x) 798 | x = x.view(x.shape[0], 1, x.shape[1]) 799 | co_output = x + co_output 800 | count += 1. 801 | outputs.append(x) 802 | 803 | x = temp[:,:,:,40:56] 804 | x = self.conv_block6_1(x, pool_size=(4, 2), pool_type='avg') 805 | x = self.conv_block6_2(x, pool_size=(4, 2), pool_type='avg') 806 | x = self.conv_block6_3(x, pool_size=(2, 2), pool_type='avg') 807 | x = self.conv_block6_4(x, pool_size=(2, 2), pool_type='avg') 808 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 809 | x = F.relu_(self.fc6_1(x)) 810 | x = self.dropout6(x) 811 | x = self.fc6_2(x) 812 | x = x.view(x.shape[0], 1, x.shape[1]) 813 | co_output = x + co_output 814 | count += 1. 815 | outputs.append(x) 816 | 817 | x = temp[:,:,:,48:64] 818 | x = self.conv_block7_1(x, pool_size=(4, 2), pool_type='avg') 819 | x = self.conv_block7_2(x, pool_size=(4, 2), pool_type='avg') 820 | x = self.conv_block7_3(x, pool_size=(2, 2), pool_type='avg') 821 | x = self.conv_block7_4(x, pool_size=(2, 2), pool_type='avg') 822 | (x, _) = torch.max(x[:,:,:,0], dim=2) # (batch_size, feature_maps) 823 | x = F.relu_(self.fc7_1(x)) 824 | x = self.dropout7(x) 825 | x = self.fc7_2(x) 826 | x = x.view(x.shape[0], 1, x.shape[1]) 827 | co_output = x + co_output 828 | count += 1. 
829 |         outputs.append(x)
830 | 
831 |         outputs.append(co_output / count)
832 |         outputs = torch.cat(outputs, 1)
833 |         loss = torch.softmax(outputs, -1)  # per-band probabilities (despite the name, not a loss value)
834 |         if self.activation == 'logsoftmax':
835 |             output = F.log_softmax(outputs, dim=-1)
836 | 
837 |         elif self.activation == 'sigmoid':
838 |             output = torch.sigmoid(outputs)
839 | 
840 |         return output, loss
841 | 
842 | class Gamm_DCMF(nn.Module):
843 | 
844 |     def __init__(self, classes_num, activation, fixed=False):  # note: 'fixed' is not used in this class
845 |         super(Gamm_DCMF, self).__init__()
846 | 
847 |         self.activation = activation
848 |         self.conv_block1_1 = ConvBlock(in_channels=1, out_channels=64)
849 |         self.conv_block1_2 = ConvBlock(in_channels=64, out_channels=128)
850 |         self.conv_block1_3 = ConvBlock(in_channels=128, out_channels=256)
851 |         self.conv_block1_4 = ConvBlock(in_channels=256, out_channels=512)
852 | 
853 |         self.conv_block2_1 = ConvBlock(in_channels=1, out_channels=64)
854 |         self.conv_block2_2 = ConvBlock(in_channels=64, out_channels=128)
855 |         self.conv_block2_3 = ConvBlock(in_channels=128, out_channels=256)
856 |         self.conv_block2_4 = ConvBlock(in_channels=256, out_channels=512)
857 | 
858 |         self.conv_block3_1 = ConvBlock(in_channels=1, out_channels=64)
859 |         self.conv_block3_2 = ConvBlock(in_channels=64, out_channels=128)
860 |         self.conv_block3_3 = ConvBlock(in_channels=128, out_channels=256)
861 |         self.conv_block3_4 = ConvBlock(in_channels=256, out_channels=512)
862 | 
863 |         self.conv_block4_1 = ConvBlock(in_channels=1, out_channels=64)
864 |         self.conv_block4_2 = ConvBlock(in_channels=64, out_channels=128)
865 |         self.conv_block4_3 = ConvBlock(in_channels=128, out_channels=256)
866 |         self.conv_block4_4 = ConvBlock(in_channels=256, out_channels=512)
867 | 
868 |         self.conv_block5_1 = ConvBlock(in_channels=1, out_channels=64)
869 |         self.conv_block5_2 = ConvBlock(in_channels=64, out_channels=128)
870 |         self.conv_block5_3 = ConvBlock(in_channels=128, out_channels=256)
871 |         self.conv_block5_4 = ConvBlock(in_channels=256, out_channels=512)
872 | 
873 |         self.conv_block6_1 = ConvBlock(in_channels=1, out_channels=64)
874 |         self.conv_block6_2 = ConvBlock(in_channels=64, out_channels=128)
875 |         self.conv_block6_3 = ConvBlock(in_channels=128, out_channels=256)
876 |         self.conv_block6_4 = ConvBlock(in_channels=256, out_channels=512)
877 | 
878 |         self.conv_block7_1 = ConvBlock(in_channels=1, out_channels=64)
879 |         self.conv_block7_2 = ConvBlock(in_channels=64, out_channels=128)
880 |         self.conv_block7_3 = ConvBlock(in_channels=128, out_channels=256)
881 |         self.conv_block7_4 = ConvBlock(in_channels=256, out_channels=512)
882 | 
883 | 
884 |         self.fc1_1 = nn.Linear(512, 512, bias=True)
885 |         self.dropout1 = nn.Dropout(p=0.5)
886 |         self.fc1_2 = nn.Linear(512, classes_num, bias=True)
887 | 
888 |         self.fc2_1 = nn.Linear(512, 512, bias=True)
889 |         self.dropout2 = nn.Dropout(p=0.5)
890 |         self.fc2_2 = nn.Linear(512, classes_num, bias=True)
891 | 
892 |         self.fc3_1 = nn.Linear(512, 512, bias=True)
893 |         self.dropout3 = nn.Dropout(p=0.5)
894 |         self.fc3_2 = nn.Linear(512, classes_num, bias=True)
895 | 
896 |         self.fc4_1 = nn.Linear(512, 512, bias=True)
897 |         self.dropout4 = nn.Dropout(p=0.5)
898 |         self.fc4_2 = nn.Linear(512, classes_num, bias=True)
899 | 
900 |         self.fc5_1 = nn.Linear(512, 512, bias=True)
901 |         self.dropout5 = nn.Dropout(p=0.5)
902 |         self.fc5_2 = nn.Linear(512, classes_num, bias=True)
903 | 
904 |         self.fc6_1 = nn.Linear(512, 512, bias=True)
905 |         self.dropout6 = nn.Dropout(p=0.5)
906 |         self.fc6_2 = nn.Linear(512, classes_num, bias=True)
907 | 
908 |         self.fc7_1 = nn.Linear(512, 512, bias=True)
909 |         self.dropout7 = nn.Dropout(p=0.5)
910 |         self.fc7_2 = nn.Linear(512, classes_num, bias=True)
911 | 
912 |         self.init_weights()
913 | 
914 |     def init_weights(self):
915 | 
916 |         init_layer(self.fc1_1)
917 |         init_layer(self.fc1_2)
918 |         init_layer(self.fc2_1)
919 |         init_layer(self.fc2_2)
920 |         init_layer(self.fc3_1)
921 |         init_layer(self.fc3_2)
922 |         init_layer(self.fc4_1)
923 |         init_layer(self.fc4_2)
924 |         init_layer(self.fc5_1)
925 |         init_layer(self.fc5_2)
926 |         init_layer(self.fc6_1)
927 |         init_layer(self.fc6_2)
928 |         init_layer(self.fc7_1)
929 |         init_layer(self.fc7_2)
930 | 
931 |     def forward(self, x1, x2, x3, x4):
932 |         '''
933 |         x2: gammatonegram, (batch_size, time_steps, freq_bins); only x2 is used by this model.'''
934 |         outputs = []
935 | 
936 |         x = x2
937 |         x = x[:, None, :, :]
938 |         '''(batch_size, 1, time_steps, freq_bins)'''
939 |         temp = x.clone()
940 |         count = 0.
941 | 
942 |         x = temp[:, :, :, 0:16]
943 |         x = self.conv_block1_1(x, pool_size=(4, 2), pool_type='avg')
944 |         x = self.conv_block1_2(x, pool_size=(4, 2), pool_type='avg')
945 |         x = self.conv_block1_3(x, pool_size=(2, 2), pool_type='avg')
946 |         x = self.conv_block1_4(x, pool_size=(2, 2), pool_type='avg')
947 |         '''(batch_size, feature_maps, time_steps, freq_bins)'''
948 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
949 |         x = F.relu_(self.fc1_1(x))
950 |         x = self.dropout1(x)
951 |         x = self.fc1_2(x)
952 |         x = x.view(x.shape[0], 1, x.shape[1])
953 |         co_output = x
954 |         count += 1.
955 |         outputs.append(x)
956 | 
957 | 
958 |         x = temp[:, :, :, 8:24]
959 |         x = self.conv_block2_1(x, pool_size=(4, 2), pool_type='avg')
960 |         x = self.conv_block2_2(x, pool_size=(4, 2), pool_type='avg')
961 |         x = self.conv_block2_3(x, pool_size=(2, 2), pool_type='avg')
962 |         x = self.conv_block2_4(x, pool_size=(2, 2), pool_type='avg')
963 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
964 |         x = F.relu_(self.fc2_1(x))
965 |         x = self.dropout2(x)
966 |         x = self.fc2_2(x)
967 |         x = x.view(x.shape[0], 1, x.shape[1])
968 |         co_output = x + co_output
969 |         count += 1.
970 |         outputs.append(x)
971 | 
972 |         x = temp[:, :, :, 16:32]
973 |         x = self.conv_block3_1(x, pool_size=(4, 2), pool_type='avg')
974 |         x = self.conv_block3_2(x, pool_size=(4, 2), pool_type='avg')
975 |         x = self.conv_block3_3(x, pool_size=(2, 2), pool_type='avg')
976 |         x = self.conv_block3_4(x, pool_size=(2, 2), pool_type='avg')
977 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
978 |         x = F.relu_(self.fc3_1(x))
979 |         x = self.dropout3(x)
980 |         x = self.fc3_2(x)
981 |         x = x.view(x.shape[0], 1, x.shape[1])
982 |         co_output = x + co_output
983 |         count += 1.
984 |         outputs.append(x)
985 | 
986 |         x = temp[:, :, :, 24:40]
987 |         x = self.conv_block4_1(x, pool_size=(4, 2), pool_type='avg')
988 |         x = self.conv_block4_2(x, pool_size=(4, 2), pool_type='avg')
989 |         x = self.conv_block4_3(x, pool_size=(2, 2), pool_type='avg')
990 |         x = self.conv_block4_4(x, pool_size=(2, 2), pool_type='avg')
991 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
992 |         x = F.relu_(self.fc4_1(x))
993 |         x = self.dropout4(x)
994 |         x = self.fc4_2(x)
995 |         x = x.view(x.shape[0], 1, x.shape[1])
996 |         co_output = x + co_output
997 |         count += 1.
998 |         outputs.append(x)
999 | 
1000 |         x = temp[:, :, :, 32:48]
1001 |         x = self.conv_block5_1(x, pool_size=(4, 2), pool_type='avg')
1002 |         x = self.conv_block5_2(x, pool_size=(4, 2), pool_type='avg')
1003 |         x = self.conv_block5_3(x, pool_size=(2, 2), pool_type='avg')
1004 |         x = self.conv_block5_4(x, pool_size=(2, 2), pool_type='avg')
1005 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
1006 |         x = F.relu_(self.fc5_1(x))
1007 |         x = self.dropout5(x)
1008 |         x = self.fc5_2(x)
1009 |         x = x.view(x.shape[0], 1, x.shape[1])
1010 |         co_output = x + co_output
1011 |         count += 1.
1012 |         outputs.append(x)
1013 | 
1014 |         x = temp[:, :, :, 40:56]
1015 |         x = self.conv_block6_1(x, pool_size=(4, 2), pool_type='avg')
1016 |         x = self.conv_block6_2(x, pool_size=(4, 2), pool_type='avg')
1017 |         x = self.conv_block6_3(x, pool_size=(2, 2), pool_type='avg')
1018 |         x = self.conv_block6_4(x, pool_size=(2, 2), pool_type='avg')
1019 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
1020 |         x = F.relu_(self.fc6_1(x))
1021 |         x = self.dropout6(x)
1022 |         x = self.fc6_2(x)
1023 |         x = x.view(x.shape[0], 1, x.shape[1])
1024 |         co_output = x + co_output
1025 |         count += 1.
1026 |         outputs.append(x)
1027 | 
1028 |         x = temp[:, :, :, 48:64]
1029 |         x = self.conv_block7_1(x, pool_size=(4, 2), pool_type='avg')
1030 |         x = self.conv_block7_2(x, pool_size=(4, 2), pool_type='avg')
1031 |         x = self.conv_block7_3(x, pool_size=(2, 2), pool_type='avg')
1032 |         x = self.conv_block7_4(x, pool_size=(2, 2), pool_type='avg')
1033 |         (x, _) = torch.max(x[:, :, :, 0], dim=2)  # (batch_size, feature_maps)
1034 |         x = F.relu_(self.fc7_1(x))
1035 |         x = self.dropout7(x)
1036 |         x = self.fc7_2(x)
1037 |         x = x.view(x.shape[0], 1, x.shape[1])
1038 |         co_output = x + co_output
1039 |         count += 1.
1040 |         outputs.append(x)
1041 | 
1042 |         outputs.append(co_output / count)
1043 |         outputs = torch.cat(outputs, 1)
1044 |         loss = torch.softmax(outputs, -1)  # per-band probabilities (despite the name, not a loss value)
1045 |         if self.activation == 'logsoftmax':
1046 |             output = F.log_softmax(outputs, dim=-1)
1047 | 
1048 |         elif self.activation == 'sigmoid':
1049 |             output = torch.sigmoid(outputs)
1050 | 
1051 |         return output, loss
1052 | 
1053 | class Logmel_DCMT(nn.Module):
1054 | 
1055 |     def __init__(self, classes_num, activation):
1056 |         super(Logmel_DCMT, self).__init__()
1057 | 
1058 |         self.activation = activation
1059 |         self.spec_layer = Spec.MelSpectrogram(sr=config.sample_rate, n_fft=config.window_size, n_mels=config.mel_bins, hop_length=config.hop_size, window='hann', center=True, pad_mode='reflect', htk=False, fmin=0.0, fmax=None, norm=1, trainable_mel=False, trainable_STFT=False)
1060 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1061 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1062 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1063 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1064 |         self.fc1 = nn.Linear(512, 512, bias=True)
1065 |         self.dropout = nn.Dropout(p=0.5)
1066 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1067 | 
1068 |         self.init_weights()
1069 | 
1070 |     def init_weights(self):
1071 | 
1072 |         init_layer(self.fc1)
1073 |         init_layer(self.fc2)
1074 | 
1075 |     def forward(self, x1, x2, x3, x4):
1076 |         '''
1077 |         x1: waveform, (batch_size, total_samples); only x1 is used by this model.'''
1078 |         outputs = []
1079 | 
1080 |         # compute the log-mel spectrogram from the raw waveform x1
1081 |         x = self.spec_layer(x1)
1082 |         x = 10 * torch.log10(x)
1083 |         x = torch.clamp(x, min=-100.)
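        # The three lines above are a hand-rolled power-to-dB conversion:
        # nnAudio's MelSpectrogram appears to return a power spectrogram by
        # default, so silent frames would give log10(0) = -inf, and the clamp
        # floors them at -100 dB. An equivalent standalone helper (hypothetical,
        # not part of this repo) would be:
        #     def power_to_db(p, floor=-100.):
        #         return torch.clamp(10. * torch.log10(p), min=floor)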
1084 |         x = x.transpose(1, 2)
1085 |         x = x[:, None, :, :]
1086 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1087 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1088 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1089 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1090 |         x = torch.mean(x, dim=3)
1091 |         temp = x.clone()
1092 | 
1093 |         count = 0.
1094 |         x = temp[:, :, 0]
1095 |         x = F.relu_(self.fc1(x))
1096 |         x = self.dropout(x)
1097 |         x = self.fc2(x)
1098 |         x = x.view(x.shape[0], 1, x.shape[1])
1099 |         co_output = x
1100 |         count += 1.
1101 |         outputs.append(x)
1102 | 
1103 |         for i in range(1, temp.shape[2]):
1104 |             x = temp[:, :, i]
1105 |             x = F.relu_(self.fc1(x))
1106 |             x = self.dropout(x)
1107 |             x = self.fc2(x)
1108 |             x = x.view(x.shape[0], 1, x.shape[1])
1109 |             co_output = x + co_output
1110 |             count += 1.
1111 |             outputs.append(x)
1112 | 
1113 |         outputs.append(co_output / count)
1114 |         outputs = torch.cat(outputs, 1)
1115 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1116 |         if self.activation == 'logsoftmax':
1117 |             output = F.log_softmax(outputs, dim=-1)
1118 | 
1119 |         elif self.activation == 'sigmoid':
1120 |             output = torch.sigmoid(outputs)
1121 | 
1122 |         return output, loss
1123 | 
1124 | class Cqt_DCMT(nn.Module):
1125 | 
1126 |     def __init__(self, classes_num, activation):
1127 |         super(Cqt_DCMT, self).__init__()
1128 | 
1129 |         self.activation = activation
1130 |         self.spec_layer = Spec.CQT(sr=config.sample_rate, hop_length=config.hop_size, fmin=220, fmax=None, n_bins=64, bins_per_octave=12, norm=1, window='hann', center=True, pad_mode='reflect')
1131 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1132 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1133 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1134 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1135 |         self.fc1 = nn.Linear(512, 512, bias=True)
1136 |         self.dropout = nn.Dropout(p=0.5)
1137 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1138 | 
1139 |         self.init_weights()
1140 | 
1141 |     def init_weights(self):
1142 | 
1143 |         init_layer(self.fc1)
1144 |         init_layer(self.fc2)
1145 | 
1146 |     def forward(self, x1, x2, x3, x4):
1147 |         '''
1148 |         x1: waveform, (batch_size, total_samples); only x1 is used by this model.'''
1149 |         outputs = []
1150 | 
1151 |         # compute the log-power CQT spectrogram from the raw waveform x1
1152 |         x = self.spec_layer(x1)
1153 |         x = 10 * torch.log10(x)
1154 |         x = torch.clamp(x, min=-100.)
1155 |         x = x.transpose(1, 2)
1156 |         x = x[:, None, :, :]
1157 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1158 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1159 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1160 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1161 |         x = torch.mean(x, dim=3)
1162 |         temp = x.clone()
1163 | 
1164 |         count = 0.
1165 |         x = temp[:, :, 0]
1166 |         x = F.relu_(self.fc1(x))
1167 |         x = self.dropout(x)
1168 |         x = self.fc2(x)
1169 |         x = x.view(x.shape[0], 1, x.shape[1])
1170 |         co_output = x
1171 |         count += 1.
1172 |         outputs.append(x)
1173 | 
1174 |         for i in range(1, temp.shape[2]):
1175 |             x = temp[:, :, i]
1176 |             x = F.relu_(self.fc1(x))
1177 |             x = self.dropout(x)
1178 |             x = self.fc2(x)
1179 |             x = x.view(x.shape[0], 1, x.shape[1])
1180 |             co_output = x + co_output
1181 |             count += 1.
1182 |             outputs.append(x)
1183 | 
1184 |         outputs.append(co_output / count)
1185 |         outputs = torch.cat(outputs, 1)
1186 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1187 |         if self.activation == 'logsoftmax':
1188 |             output = F.log_softmax(outputs, dim=-1)
1189 | 
1190 |         elif self.activation == 'sigmoid':
1191 |             output = torch.sigmoid(outputs)
1192 | 
1193 |         return output, loss
1194 | 
1195 | class Gamm_DCMT(nn.Module):
1196 | 
1197 |     def __init__(self, classes_num, activation):
1198 |         super(Gamm_DCMT, self).__init__()
1199 | 
1200 |         self.activation = activation
1201 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1202 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1203 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1204 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1205 |         self.fc1 = nn.Linear(512, 512, bias=True)
1206 |         self.dropout = nn.Dropout(p=0.5)
1207 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1208 | 
1209 |         self.init_weights()
1210 | 
1211 |     def init_weights(self):
1212 | 
1213 |         init_layer(self.fc1)
1214 |         init_layer(self.fc2)
1215 | 
1216 |     def forward(self, x1, x2, x3, x4):
1217 |         '''
1218 |         x2: gammatonegram, (batch_size, time_steps, freq_bins); only x2 is used by this model.'''
1219 |         outputs = []
1220 | 
1221 |         x = x2
1222 |         x = x[:, None, :, :]
1223 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1224 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1225 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1226 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1227 |         x = torch.mean(x, dim=3)
1228 |         temp = x.clone()
1229 | 
1230 |         count = 0.
1231 |         x = temp[:, :, 0]
1232 |         x = F.relu_(self.fc1(x))
1233 |         x = self.dropout(x)
1234 |         x = self.fc2(x)
1235 |         x = x.view(x.shape[0], 1, x.shape[1])
1236 |         co_output = x
1237 |         count += 1.
1238 |         outputs.append(x)
1239 | 
1240 |         for i in range(1, temp.shape[2]):
1241 |             x = temp[:, :, i]
1242 |             x = F.relu_(self.fc1(x))
1243 |             x = self.dropout(x)
1244 |             x = self.fc2(x)
1245 |             x = x.view(x.shape[0], 1, x.shape[1])
1246 |             co_output = x + co_output
1247 |             count += 1.
1248 |             outputs.append(x)
1249 | 
1250 |         outputs.append(co_output / count)
1251 |         outputs = torch.cat(outputs, 1)
1252 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1253 |         if self.activation == 'logsoftmax':
1254 |             output = F.log_softmax(outputs, dim=-1)
1255 | 
1256 |         elif self.activation == 'sigmoid':
1257 |             output = torch.sigmoid(outputs)
1258 | 
1259 |         return output, loss
1260 | 
1261 | class Mfcc_DCMT(nn.Module):
1262 | 
1263 |     def __init__(self, classes_num, activation):
1264 |         super(Mfcc_DCMT, self).__init__()
1265 | 
1266 |         self.activation = activation
1267 |         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1268 |         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1269 |         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1270 |         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1271 |         self.fc1 = nn.Linear(512, 512, bias=True)
1272 |         self.dropout = nn.Dropout(p=0.5)
1273 |         self.fc2 = nn.Linear(512, classes_num, bias=True)
1274 | 
1275 |         self.init_weights()
1276 | 
1277 |     def init_weights(self):
1278 | 
1279 |         init_layer(self.fc1)
1280 |         init_layer(self.fc2)
1281 | 
1282 |     def forward(self, x1, x2, x3, x4):
1283 |         '''
1284 |         x3: MFCC, (batch_size, time_steps, n_mfcc); only x3 is used by this model.'''
1285 |         outputs = []
1286 | 
1287 |         x = x3
1288 |         x = x[:, None, :, :]
1289 |         x = self.conv_block1(x, pool_size=(4, 2), pool_type='avg')
1290 |         x = self.conv_block2(x, pool_size=(4, 2), pool_type='avg')
1291 |         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1292 |         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1293 |         x = torch.mean(x, dim=3)
1294 |         temp = x.clone()
1295 | 
1296 |         count = 0.
1297 |         x = temp[:, :, 0]
1298 |         x = F.relu_(self.fc1(x))
1299 |         x = self.dropout(x)
1300 |         x = self.fc2(x)
1301 |         x = x.view(x.shape[0], 1, x.shape[1])
1302 |         co_output = x
1303 |         count += 1.
1304 |         outputs.append(x)
1305 | 
1306 |         for i in range(1, temp.shape[2]):
1307 |             x = temp[:, :, i]
1308 |             x = F.relu_(self.fc1(x))
1309 |             x = self.dropout(x)
1310 |             x = self.fc2(x)
1311 |             x = x.view(x.shape[0], 1, x.shape[1])
1312 |             co_output = x + co_output
1313 |             count += 1.
1314 |             outputs.append(x)
1315 | 
1316 |         outputs.append(co_output / count)
1317 |         outputs = torch.cat(outputs, 1)
1318 |         loss = torch.softmax(outputs, -1)  # per-time-step probabilities (despite the name, not a loss value)
1319 |         if self.activation == 'logsoftmax':
1320 |             output = F.log_softmax(outputs, dim=-1)
1321 | 
1322 |         elif self.activation == 'sigmoid':
1323 |             output = torch.sigmoid(outputs)
1324 | 
1325 |         return output, loss
1326 | 
1327 | class DCMR_DCMT(nn.Module):
1328 | 
1329 |     def __init__(self, classes_num, activation):
1330 |         super(DCMR_DCMT, self).__init__()
1331 | 
1332 |         self.activation = activation
1333 |         self.model1 = Logmel_DCMT(classes_num, activation)
1334 |         self.model2 = Cqt_DCMT(classes_num, activation)
1335 |         self.model3 = Gamm_DCMT(classes_num, activation)
1336 |         self.model4 = Mfcc_DCMT(classes_num, activation)
1337 | 
1338 |         self.ite = 15000  # set your pre-trained model iteration
1339 |         self.model_path = 'workspace/checkpoints/main_eva/logmel_86frames_40melbins/TAU-urban-acoustic-scenes-2020-mobile-development/holdout_fold=1/'  # set your model saved path.
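        # init_weights() below expects the four single-feature DCMT models to
        # have been trained and saved already; each .pth checkpoint is assumed
        # to be a dict whose 'model' key holds the corresponding state_dict
        # (see the torch.load(...)['model'] calls).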
1340 |         self.init_weights()
1341 | 
1342 |     def init_weights(self):
1343 |         model_path1 = self.model_path + 'Logmel_DCMT/' + str(self.ite) + '_iterations.pth'
1344 |         model_path2 = self.model_path + 'Cqt_DCMT/' + str(self.ite) + '_iterations.pth'
1345 |         model_path3 = self.model_path + 'Gamm_DCMT/' + str(self.ite) + '_iterations.pth'
1346 |         model_path4 = self.model_path + 'Mfcc_DCMT/' + str(self.ite) + '_iterations.pth'
1347 | 
1348 |         model_ch1 = torch.load(model_path1)
1349 |         model_ch2 = torch.load(model_path2)
1350 |         model_ch3 = torch.load(model_path3)
1351 |         model_ch4 = torch.load(model_path4)
1352 | 
1353 |         self.model1.load_state_dict(model_ch1['model'])
1354 |         self.model2.load_state_dict(model_ch2['model'])
1355 |         self.model3.load_state_dict(model_ch3['model'])
1356 |         self.model4.load_state_dict(model_ch4['model'])
1357 | 
1358 | 
1359 |     def forward(self, input1, input2, input3, input4):
1360 |         '''
1361 |         input1 ... input4: the four feature streams, passed unchanged to every sub-model.'''
1362 | 
1363 |         x1, loss1 = self.model1(input1, input2, input3, input4)
1364 |         x2, loss2 = self.model2(input1, input2, input3, input4)
1365 |         x3, loss3 = self.model3(input1, input2, input3, input4)
1366 |         x4, loss4 = self.model4(input1, input2, input3, input4)
1367 | 
1368 |         x = x1[:, -1] + x2[:, -1] + x3[:, -1] + x4[:, -1]
1369 |         loss = (loss1[:, -1] + loss2[:, -1] + loss3[:, -1] + loss4[:, -1]) / 4.
1370 |         return x, loss
1371 | 
1372 | class DCMR_DCMF(nn.Module):
1373 | 
1374 |     def __init__(self, classes_num, activation):
1375 |         super(DCMR_DCMF, self).__init__()
1376 | 
1377 |         self.activation = activation
1378 |         self.model1 = Logmel_DCMF(classes_num, activation)
1379 |         self.model2 = Cqt_DCMF(classes_num, activation)
1380 |         self.model3 = Gamm_DCMF(classes_num, activation)
1381 |         self.model4 = Mfcc_DCMF(classes_num, activation)
1382 |         self.ite = 15000  # set your pre-trained model iteration
1383 |         self.model_path = 'workspace/checkpoints/main_eva/logmel_86frames_40melbins/TAU-urban-acoustic-scenes-2020-mobile-development/holdout_fold=1/'  # set your model saved path.
1384 |         self.init_weights()
1385 | 
1386 |     def init_weights(self):
1387 |         model_path1 = self.model_path + 'Logmel_DCMF/' + str(self.ite) + '_iterations.pth'
1388 |         model_path2 = self.model_path + 'Cqt_DCMF/' + str(self.ite) + '_iterations.pth'
1389 |         model_path3 = self.model_path + 'Gamm_DCMF/' + str(self.ite) + '_iterations.pth'
1390 |         model_path4 = self.model_path + 'Mfcc_DCMF/' + str(self.ite) + '_iterations.pth'
1391 | 
1392 |         model_ch1 = torch.load(model_path1)
1393 |         model_ch2 = torch.load(model_path2)
1394 |         model_ch3 = torch.load(model_path3)
1395 |         model_ch4 = torch.load(model_path4)
1396 | 
1397 |         self.model1.load_state_dict(model_ch1['model'])
1398 |         self.model2.load_state_dict(model_ch2['model'])
1399 |         self.model3.load_state_dict(model_ch3['model'])
1400 |         self.model4.load_state_dict(model_ch4['model'])
1401 | 
1402 | 
1403 |     def forward(self, input1, input2, input3, input4):
1404 |         '''
1405 |         input1 ... input4: the four feature streams, passed unchanged to every sub-model.'''
1406 | 
1407 |         x1, loss1 = self.model1(input1, input2, input3, input4)
1408 |         x2, loss2 = self.model2(input1, input2, input3, input4)
1409 |         x3, loss3 = self.model3(input1, input2, input3, input4)
1410 |         x4, loss4 = self.model4(input1, input2, input3, input4)
1411 | 
1412 |         x = x1[:, -1] + x2[:, -1] + x3[:, -1] + x4[:, -1]
1413 |         loss = (loss1[:, -1] + loss2[:, -1] + loss3[:, -1] + loss4[:, -1]) / 4.
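        # Late fusion: [:, -1] picks each sub-model's averaged co-output (the
        # last slice along dim 1), so the ensemble sums the four log-probability
        # vectors and averages the four probability vectors.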
1414 |         return x, loss
1415 | 
1416 | class DCMR_DCMF_DCMT(nn.Module):
1417 | 
1418 |     def __init__(self, classes_num, activation):
1419 |         super(DCMR_DCMF_DCMT, self).__init__()
1420 | 
1421 |         self.activation = activation
1422 |         self.model1 = DCMR_DCMT(classes_num, activation)
1423 |         self.model2 = DCMR_DCMF(classes_num, activation)
1424 | 
1425 |     def forward(self, input1, input2, input3, input4):
1426 |         '''
1427 |         input1 ... input4: the four feature streams, passed unchanged to both sub-ensembles.'''
1428 | 
1429 |         x1, loss1 = self.model1(input1, input2, input3, input4)
1430 |         x2, loss2 = self.model2(input1, input2, input3, input4)
1431 | 
1432 |         x = x1 + x2
1433 |         loss = (loss1 + loss2) / 2.
1434 | 
1435 |         return x, loss
1436 | 
--------------------------------------------------------------------------------
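
A minimal smoke-test sketch for the single-feature DCMT branch above (not part of the original repo). It assumes the script is run from inside pytorch/ with utils/ importable (models.py itself needs config and nnAudio's Spectrogram module), and that PyTorch and nnAudio are installed; the tensor shapes follow utils/config.py, and names such as batch_size are illustrative only.

import torch
from models import Logmel_DCMT   # path assumption: running inside pytorch/

classes_num = 11                                   # len(config.labels), incl. 'unknown'
model = Logmel_DCMT(classes_num, activation='logsoftmax')
model.eval()

batch_size = 2
x1 = torch.randn(batch_size, 441000)               # raw waveform: 10 s at 44.1 kHz
x2 = torch.randn(batch_size, 499, 64)              # gammatonegram (ignored by this model)
x3 = torch.randn(batch_size, 431, 40)              # MFCC (ignored by this model)

with torch.no_grad():
    output, prob = model(x1, x2, x3, None)         # x4 is never read

# output: (batch, pooled_time_steps + 1, classes_num) log-probabilities;
# output[:, -1] is the averaged co-output that the DCMR ensembles consume.
print(output.shape, prob.shape)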