├── src
│   ├── __init__.py
│   ├── make_downsample.sh
│   ├── general.py
│   ├── make_speed.sh
│   ├── ui.py
│   ├── sound_event_detection.py
│   ├── files.py
│   ├── features.py
│   ├── sound_event_detection_cnn.py
│   ├── features_cnn.py
│   └── evaluation.py
├── requirements.txt
├── run-cnn-pipeline.sh
├── .gitignore
├── README.md
├── task3_gmm_baseline.yaml
├── task3_cnn.yaml
├── task3_cnn.init.yaml
├── task3_cnn.augm.yaml
├── task3_gmm_baseline.py
└── task3_cnn.py

/src/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
scipy>=0.15.1
numpy>=1.9.2
scikit-learn>=0.16.1
pyyaml>=3.11
librosa>=0.4.0
keras>=1.0.1
h5py>=2.6
# NOTE: pip cannot set environment variables; export KERAS_BACKEND=theano
# in the shell before running the pipeline.
--------------------------------------------------------------------------------
/src/make_downsample.sh:
--------------------------------------------------------------------------------
#!/bin/bash

rm -rf data_16k
cp -rf data data_16k

for scene in home residential_area
do
    for file in `ls data/TUT-sound-events-2016-development/audio/$scene/*.wav`;
    do
        fbase=`basename $file`
        # downsample to 16 kHz mono, 24-bit, with peak normalization
        sox --norm -t wavpcm $file -r 16000 -b 24 -c 1 -t wavpcm data_16k/TUT-sound-events-2016-development/audio/$scene/$fbase
    done
done
--------------------------------------------------------------------------------
/run-cnn-pipeline.sh:
--------------------------------------------------------------------------------
#!/bin/bash

mkdir -p logs

echo "=== run baseline to prepare data folder and check that things work fine ==="
python task3_gmm_baseline.py &> logs/task3_gmm_baseline.log

echo "=== prepare 16kHz data ==="
bash src/make_downsample.sh

echo "=== train CNN model using the original train data ==="
cp -rf task3_cnn.init.yaml task3_cnn.yaml
python task3_cnn.py &> logs/task3_cnn.init.log

echo "=== train CNN using speed perturbed data ==="
bash src/make_speed.sh
cp -rf task3_cnn.augm.yaml task3_cnn.yaml
python task3_cnn.py &> logs/task3_cnn.augm.log
--------------------------------------------------------------------------------
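Since pip cannot execute `export`, the Theano backend has to be selected through the environment before Keras is first imported — either in the shell (`export KERAS_BACKEND=theano`) or from Python. A minimal sketch of the in-Python variant (not part of the repo):

```python
import os

# Must run before the first `import keras`, otherwise the default backend is used.
os.environ.setdefault('KERAS_BACKEND', 'theano')

import keras
print(keras.backend.backend())  # expected output: theano
```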
/src/general.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import hashlib
import json


def check_path(path):
    """Check that a path exists; create it if it does not.

    Parameters
    ----------
    path : str
        Path to be checked.

    Returns
    -------
    Nothing

    """

    if not os.path.isdir(path):
        os.makedirs(path)


def get_parameter_hash(params):
    """Get unique hash string (md5) for given parameter dict

    Parameters
    ----------
    params : dict
        Input parameters

    Returns
    -------
    md5_hash : str
        Unique hash for parameter dict

    """

    # Serialize with sorted keys so that logically equal dicts hash identically
    md5 = hashlib.md5()
    md5.update(json.dumps(params, sort_keys=True))
    return md5.hexdigest()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

data/
system/
.idea/
.*
!/.gitignore
--------------------------------------------------------------------------------
/src/make_speed.sh:
--------------------------------------------------------------------------------
#!/bin/bash

rm -rf data_speed_16k
cp -rf data_16k data_speed_16k

# speed perturbation
for scene in home residential_area;
do
    mkdir -p data_speed_16k/TUT-sound-events-2016-development/audio/${scene}_sp
    # remove any stale perturbed files from a previous run
    rm -rf data_speed_16k/TUT-sound-events-2016-development/audio/$scene/*_S*.wav
    ls data_speed_16k/TUT-sound-events-2016-development/audio/$scene/*.wav > /tmp/flist
    for wav in `grep $scene /tmp/flist`; do
        fbase=`basename $wav .wav`
        for speed in 0.8 0.9 1.1 1.2;
        do
            sox --norm -t wavpcm $wav -t wavpcm data_speed_16k/TUT-sound-events-2016-development/audio/${scene}_sp/${fbase}_S${speed}.wav speed $speed &
            sleep 0.5
        done
    done
done

# extend the train lists: rescale annotation timestamps by 1/speed for the perturbed copies
for file in `ls data_16k/TUT-sound-events-2016-development/evaluation_setup/*_train.txt`;
do
    fbase=`basename $file`
    cat $file | python -c "
import os,sys
for line in sys.stdin.readlines():
    line = line.strip()
    if len(line) == 0:
        continue
    tok = line.split('\t')
    folder = os.path.dirname(tok[0])+'_sp/'
    fname = ((tok[0]).split('/')[-1]).split('.')[0]
    print line
    for speed in [0.8, 0.9, 1.1, 1.2]:
        print folder+fname+'_S'+str(speed)+'.wav'+'\t'+tok[1]+'\t'+str(float(tok[2])/speed)+'\t'+str(float(tok[3])/speed)+'\t'+tok[-1]
" > data_speed_16k/TUT-sound-events-2016-development/evaluation_setup/$fbase
done
--------------------------------------------------------------------------------
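For reference, the label rescaling that make_speed.sh performs inline: `sox ... speed s` makes the audio s times faster, so every onset/offset in the training annotations is divided by s. The same transform as a standalone sketch (the example row is made up; the real script streams rows from the *_train.txt lists):

```python
import os

def perturbed_rows(row, speeds=(0.8, 0.9, 1.1, 1.2)):
    """Expand one tab-separated annotation row into one row per perturbed copy.

    Columns follow the script above: tok[0] audio path, tok[2]/tok[3]
    onset/offset in seconds, tok[-1] event label; the times are divided by the
    speed factor because the perturbed audio is shorter by that factor.
    """
    tok = row.split('\t')
    folder = os.path.dirname(tok[0]) + '_sp/'
    fname = tok[0].split('/')[-1].split('.')[0]
    rows = [row]  # keep the original row as well
    for speed in speeds:
        rows.append('\t'.join([folder + fname + '_S' + str(speed) + '.wav',
                               tok[1],
                               str(float(tok[2]) / speed),
                               str(float(tok[3]) / speed),
                               tok[-1]]))
    return rows

# hypothetical example row: path, scene, onset, offset, label
print('\n'.join(perturbed_rows('audio/home/a011.wav\thome\t1.20\t2.50\tdishes')))
```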
/README.md:
--------------------------------------------------------------------------------
# CNN-based DCASE 2016 sound event detection system

Sound event detection system submitted to the [DCASE 2016](http://www.cs.tut.fi/sgn/arg/dcase2016/task-sound-event-detection-in-real-life-audio) (Detection and Classification of Acoustic Scenes and Events) challenge.

A convolutional neural network detects and classifies polyphonic events from a long temporal context of filter-bank acoustic features. Training data are augmented via sox speed perturbation.

On the development data set the system achieves a segment-based error rate of 0.84 (a 7.7% relative improvement over the baseline) and an F-measure of 36.3% (55.1% relative improvement over the baseline system).

Technical details are described in the [challenge report](https://dcase.community/documents/challenge2016/technical_reports/DCASE2016_Gorin_3012.pdf). A detailed summary of results on the development and evaluation audio is also [available](https://dcase.community/challenge2016/task-sound-event-detection-in-real-life-audio-results).

## Basic usage

*run-cnn-pipeline.sh* - a complete, self-documented script for reproducing all the experiments; it runs the following steps:

* *task3_gmm_baseline.py* - baseline GMM system [provided](https://github.com/TUT-ARG/DCASE2016-baseline-system-python) by the organizers.

* *src/make_downsample.sh* - basic data preparation (downsampling to 16 kHz)

* *task3_cnn.py* - training and testing of the CNN-based system

* *src/make_speed.sh* - speed-perturbation data augmentation
--------------------------------------------------------------------------------
/task3_gmm_baseline.yaml:
--------------------------------------------------------------------------------
# ==========================================================
# Flow
# ==========================================================
flow:
  initialize: true
  extract_features: true
  feature_normalizer: true
  train_system: true
  test_system: true
  evaluate_system: true

# ==========================================================
# General
# ==========================================================
general:
  development_dataset: TUTSoundEvents_2016_DevelopmentSet
  challenge_dataset: TUTSoundEvents_2016_EvaluationSet

  overwrite: false  # Overwrite previously stored data

# ==========================================================
# Paths
# ==========================================================
path:
  data: data/

  base: system/baseline_dcase2016_task3/
  features: features/
  feature_normalizers: feature_normalizers/
  models: acoustic_models/
  results: evaluation_results/

  challenge_results: challenge_submission/task_3_sound_event_detection_in_real_life_audio/

# ==========================================================
# Feature extraction
# ==========================================================
features:
  fs: 44100
  win_length_seconds: 0.04
  hop_length_seconds: 0.02

  include_mfcc0: false
  include_delta: true
  include_acceleration: true

  mfcc:
    window: hamming_asymmetric  # [hann_asymmetric, hamming_asymmetric]
    n_mfcc: 20                  # Number of MFCC coefficients
    n_mels: 40                  # Number of MEL bands used
    n_fft: 2048                 # FFT length
    fmin: 0                     # Minimum frequency when constructing MEL bands
    fmax: 22050                 # Maximum frequency when constructing MEL bands
    htk: false                  # Switch for HTK-styled MEL-frequency equation

  mfcc_delta:
    width: 9

  mfcc_acceleration:
    width: 9

# ==========================================================
# Classifier
# ==========================================================
classifier:
  method: gmm                # The system supports only gmm
  parameters: !!null         # Parameters are copied from classifier_parameters based on defined method

classifier_parameters:
  gmm:
    n_components: 16         # Number of Gaussian components
    covariance_type: diag    # [diag|full] Diagonal or full covariance matrix
    random_state: 0
    thresh: !!null
    tol: 0.001
    min_covar: 0.001
    n_iter: 40
    n_init: 1
    params: wmc
    init_params: wmc

# ==========================================================
# Detector
# ==========================================================
detector:
  decision_threshold: 160.0
  smoothing_window_length: 1.0  # seconds
  minimum_event_length: 0.1     # seconds
  minimum_event_gap: 0.1        # seconds
--------------------------------------------------------------------------------
/task3_cnn.yaml:
--------------------------------------------------------------------------------
# ==========================================================
# Flow
# ==========================================================
flow:
  initialize: true
  extract_features: true
  feature_normalizer: true
  train_system: true
  test_system: true
  evaluate_system: true

# ==========================================================
# General
# ==========================================================
general:
  development_dataset: TUTSoundEvents_2016_DevelopmentSet
  challenge_dataset: TUTSoundEvents_2016_EvaluationSet

  overwrite: true  # Overwrite previously stored data

# ==========================================================
# Paths
# ==========================================================
path:
  data: data_16k
  base: system/task3_01_cnn/
  features: features/
  feature_normalizers: feature_normalizers/
  models: acoustic_models/
  results: evaluation_results/

  challenge_results: challenge_submission/task_3_sound_event_detection_in_real_life_audio/

# ==========================================================
# Feature extraction
# ==========================================================
features:
  fs: 16000
  win_length_seconds: 0.025
  hop_length_seconds: 0.010

  include_mfcc0: false
  include_delta: true
  include_acceleration: true
  cmvn: true

  mfcc:
    window: hamming_asymmetric  # [hann_asymmetric, hamming_asymmetric]
    n_mfcc: 60                  # Number of MFCC coefficients
    n_mels: 60                  # Number of MEL bands used
    n_fft: 1024                 # FFT length
    fmin: 0                     # Minimum frequency when constructing MEL bands
    fmax: 8000                  # Maximum frequency when constructing MEL bands
    htk: false                  # Switch for HTK-styled MEL-frequency equation
    ftype: mel_fbank            # mfcc, mel_fbank or log_spec

  mfcc_delta:
    width: 15

  mfcc_acceleration:
    width: 15

# ==========================================================
# Classifier
# ==========================================================
classifier:
  method: cnn                # gmm or cnn
  parameters: !!null         # Parameters are copied from classifier_parameters based on defined method
classifier_parameters:
  gmm:
    n_components: 2          # Number of Gaussian components
    covariance_type: diag    # [diag|full] Diagonal or full covariance matrix
    random_state: 0
    thresh: !!null
    tol: 0.001
    min_covar: 0.001
    n_iter: 40
    n_init: 1
    params: wmc
    init_params: wmc
  cnn:
    splice: 30
    step: 30
    epochs: 200
    batch_size: 200
    lr: 0.001
    class_weights: false

# ==========================================================
# Detector
# ==========================================================
detector:
  decision_threshold: 160.0
  smoothing_window_length: 1.0  # seconds
  minimum_event_length: 0.05    # seconds
  minimum_event_gap: 0.1        # seconds
  splice: 30                    # must be the same as in training
--------------------------------------------------------------------------------
/task3_cnn.init.yaml:
--------------------------------------------------------------------------------
# ==========================================================
# Flow
# ==========================================================
flow:
  initialize: true
  extract_features: true
  feature_normalizer: true
  train_system: true
  test_system: true
  evaluate_system: true

# ==========================================================
# General
# ==========================================================
general:
  development_dataset: TUTSoundEvents_2016_DevelopmentSet
  challenge_dataset: TUTSoundEvents_2016_EvaluationSet

  overwrite: true  # Overwrite previously stored data

# ==========================================================
# Paths
# ==========================================================
path:
  data: data_16k
  base: system/task3_01_cnn/
  features: features/
  feature_normalizers: feature_normalizers/
  models: acoustic_models/
  results: evaluation_results/

  challenge_results: challenge_submission/task_3_sound_event_detection_in_real_life_audio/

# ==========================================================
# Feature extraction
# ==========================================================
features:
  fs: 16000
  win_length_seconds: 0.025
  hop_length_seconds: 0.010

  include_mfcc0: false
  include_delta: true
  include_acceleration: true
  cmvn: true

  mfcc:
    window: hamming_asymmetric  # [hann_asymmetric, hamming_asymmetric]
    n_mfcc: 60                  # Number of MFCC coefficients
    n_mels: 60                  # Number of MEL bands used
    n_fft: 1024                 # FFT length
    fmin: 0                     # Minimum frequency when constructing MEL bands
    fmax: 8000                  # Maximum frequency when constructing MEL bands
    htk: false                  # Switch for HTK-styled MEL-frequency equation
    ftype: mel_fbank            # mfcc, mel_fbank or log_spec

  mfcc_delta:
    width: 15

  mfcc_acceleration:
    width: 15

# ==========================================================
# Classifier
# ==========================================================
classifier:
  method: cnn                # gmm or cnn
  parameters: !!null         # Parameters are copied from classifier_parameters based on defined method

classifier_parameters:
  gmm:
    n_components: 2          # Number of Gaussian components
    covariance_type: diag    # [diag|full] Diagonal or full covariance matrix
    random_state: 0
    thresh: !!null
    tol: 0.001
    min_covar: 0.001
    n_iter: 40
    n_init: 1
    params: wmc
    init_params: wmc
  cnn:
    splice: 30
    step: 30
    epochs: 200
    batch_size: 200
    lr: 0.001
    class_weights: false

# ==========================================================
# Detector
# ==========================================================
detector:
  decision_threshold: 160.0
  smoothing_window_length: 1.0  # seconds
  minimum_event_length: 0.05    # seconds
  minimum_event_gap: 0.1        # seconds
  splice: 30                    # must be the same as in training
--------------------------------------------------------------------------------
/task3_cnn.augm.yaml:
--------------------------------------------------------------------------------
# ==========================================================
# Flow
# ==========================================================
flow:
  initialize: true
  extract_features: true
  feature_normalizer: true
  train_system: true
  test_system: true
  evaluate_system: true

# ==========================================================
# General
# ==========================================================
general:
  development_dataset: TUTSoundEvents_2016_DevelopmentSet
  challenge_dataset: TUTSoundEvents_2016_EvaluationSet

  overwrite: true  # Overwrite previously stored data

# ==========================================================
# Paths
# ==========================================================
path:
  data: data_speed_16k
  base: system/task3_01_cnn_augm/
  features: features/
  feature_normalizers: feature_normalizers/
  models: acoustic_models/
  results: evaluation_results/

  challenge_results: challenge_submission/task_3_sound_event_detection_in_real_life_audio/

# ==========================================================
# Feature extraction
# ==========================================================
features:
  fs: 16000
  win_length_seconds: 0.025
  hop_length_seconds: 0.010

  include_mfcc0: false
  include_delta: true
  include_acceleration: true
  cmvn: true

  mfcc:
    window: hamming_asymmetric  # [hann_asymmetric, hamming_asymmetric]
    n_mfcc: 60                  # Number of MFCC coefficients
    n_mels: 60                  # Number of MEL bands used
    n_fft: 1024                 # FFT length
    fmin: 0                     # Minimum frequency when constructing MEL bands
    fmax: 8000                  # Maximum frequency when constructing MEL bands
    htk: false                  # Switch for HTK-styled MEL-frequency equation
    ftype: mel_fbank            # mfcc, mel_fbank or log_spec

  mfcc_delta:
    width: 15

  mfcc_acceleration:
    width: 15

# ==========================================================
# Classifier
# ==========================================================
classifier:
  method: cnn                # gmm or cnn
  parameters: !!null         # Parameters are copied from classifier_parameters based on defined method

classifier_parameters:
  gmm:
    n_components: 2          # Number of Gaussian components
    covariance_type: diag    # [diag|full] Diagonal or full covariance matrix
    random_state: 0
    thresh: !!null
    tol: 0.001
    min_covar: 0.001
    n_iter: 40
    n_init: 1
    params: wmc
    init_params: wmc
  cnn:
    splice: 30
    step: 30
    epochs: 200
    batch_size: 200
    lr: 0.001
    class_weights: false
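    # Note: at test time the detector builds CNN inputs of 2*splice+1 = 61
    # feature frames (see src/sound_event_detection_cnn.py), i.e. about 0.61 s
    # of context at hop_length_seconds: 0.010. 'step' presumably strides the
    # training patches; the training script itself is not shown in this dump.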
# ==========================================================
# Detector
# ==========================================================
detector:
  decision_threshold: 160.0
  smoothing_window_length: 1.0  # seconds
  minimum_event_length: 0.05    # seconds
  minimum_event_gap: 0.1        # seconds
  splice: 30                    # must be the same as in training
--------------------------------------------------------------------------------
/src/ui.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import itertools

spinner = itertools.cycle(["`", "*", ";", ","])


def title(text):
    """Prints title

    Parameters
    ----------
    text : str
        Title

    Returns
    -------
    Nothing

    """

    print "--------------------------------"
    print text
    print "--------------------------------"


def section_header(text):
    """Prints section header

    Parameters
    ----------
    text : str
        Section header

    Returns
    -------
    Nothing

    """

    print " "
    print text
    print "================================"


def foot():
    """Prints a footer line

    Parameters
    ----------
    Nothing

    Returns
    -------
    Nothing

    """

    print " [Done] "


def progress(title_text=None, fold=None, percentage=None, note=None, label=None):
    """Prints progress line

    Parameters
    ----------
    title_text : str or None
        Title

    fold : int > 0 [scalar] or None
        Fold number

    percentage : float [0-1] or None
        Progress percentage.

    note : str or None
        Note

    label : str or None
        Label

    Returns
    -------
    Nothing

    """

    if title_text is not None and fold is not None and percentage is not None and note is not None and label is None:
        print "  {:2s} {:20s} fold[{:1d}] [{:3.0f}%] [{:20s}] \r".format(spinner.next(), title_text, fold, percentage * 100, note),

    elif title_text is not None and fold is not None and percentage is None and note is not None and label is None:
        print "  {:2s} {:20s} fold[{:1d}] [{:20s}] \r".format(spinner.next(), title_text, fold, note),

    elif title_text is not None and fold is None and percentage is not None and note is not None and label is None:
        print "  {:2s} {:20s} [{:3.0f}%] [{:20s}] \r".format(spinner.next(), title_text, percentage * 100, note),

    elif title_text is not None and fold is None and percentage is not None and note is None and label is None:
        print "  {:2s} {:20s} [{:3.0f}%] \r".format(spinner.next(), title_text, percentage * 100),

    elif title_text is not None and fold is None and percentage is None and note is not None and label is None:
        print "  {:2s} {:20s} [{:20s}] \r".format(spinner.next(), title_text, note),

    elif title_text is not None and fold is None and percentage is None and note is not None and label is not None:
        print "  {:2s} {:20s} [{:20s}] [{:20s}] \r".format(spinner.next(), title_text, label, note),

    elif title_text is not None and fold is None and percentage is not None and note is not None and label is not None:
        print "  {:2s} {:20s} [{:20s}] [{:3.0f}%] [{:20s}] \r".format(spinner.next(), title_text, label, percentage * 100, note),

    elif title_text is not None and fold is not None and percentage is not None and note is not None and label is not None:
        print "  {:2s} {:20s} fold[{:1d}] [{:10s}] [{:3.0f}%] [{:20s}] \r".format(spinner.next(), title_text, fold, label, percentage * 100, note),

    elif title_text is not None and fold is not None and percentage is None and note is None and label is not None:
        print "  {:2s} {:20s} fold[{:1d}] [{:10s}] \r".format(spinner.next(), title_text, fold, label),

    sys.stdout.flush()
--------------------------------------------------------------------------------
/src/sound_event_detection.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy


def event_detection(feature_data, model_container, hop_length_seconds=0.01, smoothing_window_length_seconds=1.0, decision_threshold=0.0, minimum_event_length=0.1, minimum_event_gap=0.1):
    """Sound event detection

    Parameters
    ----------
    feature_data : numpy.ndarray [shape=(n_features, t)]
        Feature matrix

    model_container : dict
        Sound event model pairs [positive and negative] in dict

    hop_length_seconds : float > 0.0
        Feature hop length in seconds, used to convert feature index into time-stamp
        (Default value=0.01)

    smoothing_window_length_seconds : float > 0.0
        Accumulation window (look-back) length; likelihoods are accumulated within the window.
        (Default value=1.0)

    decision_threshold : float > 0.0
        Likelihood ratio threshold for making the decision.
        (Default value=0.0)

    minimum_event_length : float > 0.0
        Minimum event length in seconds; shorter events are filtered out from the output.
        (Default value=0.1)

    minimum_event_gap : float > 0.0
        Minimum allowed gap in seconds between events of the same event label class.
        (Default value=0.1)

    Returns
    -------
    results : list (event dicts)
        Detection result, event list

    """

    smoothing_window = int(smoothing_window_length_seconds / hop_length_seconds)

    results = []
    for event_label in model_container['models']:
        positive = model_container['models'][event_label]['positive'].score_samples(feature_data)[0]
        negative = model_container['models'][event_label]['negative'].score_samples(feature_data)[0]

        # Let's keep the system causal and use look-back while smoothing (accumulating) likelihoods
        for stop_id in range(0, feature_data.shape[0]):
            start_id = stop_id - smoothing_window
            if start_id < 0:
                start_id = 0
            positive[start_id] = sum(positive[start_id:stop_id])
            negative[start_id] = sum(negative[start_id:stop_id])

        likelihood_ratio = positive - negative
        event_activity = likelihood_ratio > decision_threshold

        # Find contiguous segments and convert frame-ids into times
        event_segments = contiguous_regions(event_activity) * hop_length_seconds

        # Post-process the event segments
        event_segments = postprocess_event_segments(event_segments=event_segments,
                                                    minimum_event_length=minimum_event_length,
                                                    minimum_event_gap=minimum_event_gap)

        for event in event_segments:
            results.append((event[0], event[1], event_label))

    return results


def contiguous_regions(activity_array):
    """Find contiguous regions from bool valued numpy.array.
    Transforms boolean values for each frame into pairs of onsets and offsets.

    Parameters
    ----------
    activity_array : numpy.array [shape=(t)]
        Event activity array, bool values

    Returns
    -------
    change_indices : numpy.ndarray [shape=(2, number of found changes)]
        Onset and offset indices pairs in matrix

    """

    # Find the changes in the activity_array
    change_indices = numpy.diff(activity_array).nonzero()[0]

    # Shift change_index with one, focus on frame after the change.
    change_indices += 1

    if activity_array[0]:
        # If the first element of activity_array is True add 0 at the beginning
        change_indices = numpy.r_[0, change_indices]

    if activity_array[-1]:
        # If the last element of activity_array is True, add the length of the array
        change_indices = numpy.r_[change_indices, activity_array.size]

    # Reshape the result into two columns
    return change_indices.reshape((-1, 2))


def postprocess_event_segments(event_segments, minimum_event_length=0.1, minimum_event_gap=0.1):
    """Post process event segment list. Makes sure that minimum event length and minimum event gap conditions are met.

    Parameters
    ----------
    event_segments : numpy.ndarray [shape=(2, number of event)]
        Event segments, first column has the onset, second has the offset.

    minimum_event_length : float > 0.0
        Minimum event length in seconds; shorter events are filtered out from the output.
        (Default value=0.1)

    minimum_event_gap : float > 0.0
        Minimum allowed gap in seconds between events of the same event label class.
        (Default value=0.1)

    Returns
    -------
    event_results : numpy.ndarray [shape=(2, number of event)]
        Postprocessed event segments

    """

    # 1. remove short events
    event_results_1 = []
    for event in event_segments:
        if event[1]-event[0] >= minimum_event_length:
            event_results_1.append((event[0], event[1]))

    if len(event_results_1):
        # 2. remove small gaps between events
        event_results_2 = []

        # Load first event into event buffer
        buffered_event_onset = event_results_1[0][0]
        buffered_event_offset = event_results_1[0][1]
        for i in range(1, len(event_results_1)):
            if event_results_1[i][0] - buffered_event_offset > minimum_event_gap:
                # The gap between the current event and the buffered one is bigger than the minimum event gap:
                # store the buffered event and replace it with the current one
                event_results_2.append((buffered_event_onset, buffered_event_offset))
                buffered_event_onset = event_results_1[i][0]
                buffered_event_offset = event_results_1[i][1]
            else:
                # The gap between the current event and the buffered one is smaller than the minimum event gap:
                # extend the buffered event until the current offset
                buffered_event_offset = event_results_1[i][1]

        # Store last event from buffer
        event_results_2.append((buffered_event_onset, buffered_event_offset))

        return event_results_2
    else:
        return event_results_1
--------------------------------------------------------------------------------
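contiguous_regions turns a boolean activity track into [onset, offset) frame-index pairs, which event_detection then scales by the hop length. A quick illustration (toy input; assumes the repo root is on PYTHONPATH and the pinned numpy from requirements.txt):

```python
import numpy
from src.sound_event_detection import contiguous_regions

activity = numpy.array([False, True, True, False, False, True, True, True])
print(contiguous_regions(activity))
# [[1 3]
#  [5 8]]  -> frames 1-2 and 5-7 are active; offsets are exclusive
```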
/src/files.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import wave
import numpy
import csv
import cPickle as pickle
import librosa
import yaml


def load_audio(filename, mono=True, fs=44100):
    """Load audio file into numpy array

    Supports 24-bit wav-format, and flac audio through librosa.

    Parameters
    ----------
    filename: str
        Path to audio file

    mono : bool
        In case of multi-channel audio, channels are averaged into single channel.
        (Default value=True)

    fs : int > 0 [scalar]
        Target sample rate, if input audio does not fulfil this, audio is resampled.
        (Default value=44100)

    Returns
    -------
    audio_data : numpy.ndarray [shape=(signal_length, channel)]
        Audio

    sample_rate : integer
        Sample rate

    """

    file_base, file_extension = os.path.splitext(filename)
    if file_extension == '.wav':
        audio_file = wave.open(filename)

        # Audio info
        sample_rate = audio_file.getframerate()
        sample_width = audio_file.getsampwidth()
        number_of_channels = audio_file.getnchannels()
        number_of_frames = audio_file.getnframes()

        # Read raw bytes
        data = audio_file.readframes(number_of_frames)
        audio_file.close()

        # Convert bytes based on sample_width
        num_samples, remainder = divmod(len(data), sample_width * number_of_channels)
        if remainder > 0:
            raise ValueError('The length of data is not a multiple of sample size * number of channels.')
        if sample_width > 4:
            raise ValueError('Sample size cannot be bigger than 4 bytes.')

        if sample_width == 3:
            # 24 bit audio: sign-extend each 3-byte sample into 4 bytes, then
            # interpret the groups as little-endian int32
            a = numpy.empty((num_samples, number_of_channels, 4), dtype=numpy.uint8)
            raw_bytes = numpy.fromstring(data, dtype=numpy.uint8)
            a[:, :, :sample_width] = raw_bytes.reshape(-1, number_of_channels, sample_width)
            a[:, :, sample_width:] = (a[:, :, sample_width - 1:sample_width] >> 7) * 255
            audio_data = a.view('<i4').reshape(a.shape[:2]).T

[... the rest of load_audio (scaling, mono mixdown, resampling, flac branch) and
the remaining file I/O helpers of files.py are missing from this dump ...]
--------------------------------------------------------------------------------
/src/features.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
import librosa
import scipy


def feature_extraction(y, fs=44100, statistics=True, include_mfcc0=True, include_delta=True,
                       include_acceleration=True, mfcc_params=None, delta_params=None, acceleration_params=None):
    """Feature extraction, MFCC based features

    Outputs features in dict, format:

        {
            'feat': feature_matrix [shape=(frame count, feature vector size)],
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }

    Parameters
    ----------
    y: numpy.array [shape=(signal_length, )]
        Audio

    fs: int > 0 [scalar]
        Sample rate
        (Default value=44100)

    statistics: bool
        Calculate feature statistics for extracted matrix
        (Default value=True)

    include_mfcc0: bool
        Include 0th MFCC coefficient into static coefficients.
        (Default value=True)

    include_delta: bool
        Include delta MFCC coefficients.
        (Default value=True)

    include_acceleration: bool
        Include acceleration MFCC coefficients.
        (Default value=True)

    mfcc_params: dict or None
        Parameters for extraction of static MFCC coefficients.

    delta_params: dict or None
        Parameters for extraction of delta MFCC coefficients.

    acceleration_params: dict or None
        Parameters for extraction of acceleration MFCC coefficients.

    Returns
    -------
    result: dict
        Feature dict

    """

    eps = numpy.spacing(1)

    # Windowing function
    if mfcc_params['window'] == 'hamming_asymmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hamming_symmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=True)
    elif mfcc_params['window'] == 'hann_asymmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hann_symmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=True)
    else:
        window = None

    # Calculate static coefficients
    power_spectrogram = numpy.abs(librosa.stft(y + eps,
                                               n_fft=mfcc_params['n_fft'],
                                               win_length=mfcc_params['win_length'],
                                               hop_length=mfcc_params['hop_length'],
                                               center=True,
                                               window=window))**2
    mel_basis = librosa.filters.mel(sr=fs,
                                    n_fft=mfcc_params['n_fft'],
                                    n_mels=mfcc_params['n_mels'],
                                    fmin=mfcc_params['fmin'],
                                    fmax=mfcc_params['fmax'],
                                    htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, power_spectrogram)
    mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum),
                                n_mfcc=mfcc_params['n_mfcc'])

    # Collect the feature matrix
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients
        mfcc_delta = librosa.feature.delta(mfcc, **delta_params)

        # Add delta coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (aka delta delta)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2, **acceleration_params)

        # Add acceleration coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))

    if not include_mfcc0:
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    feature_matrix = feature_matrix.T

    # Collect into data structure
    if statistics:
        return {
            'feat': feature_matrix,
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }
    else:
        return {
            'feat': feature_matrix}


class FeatureNormalizer(object):
    """Feature normalizer class

    Accumulates feature statistics

    Examples
    --------

    >>> normalizer = FeatureNormalizer()
    >>> for feature_data in training_items:
    >>>     normalizer.accumulate(feature_data['stat'])
    >>>
    >>> normalizer.finalize()

    >>> for feature_matrix in test_items:
    >>>     feature_matrix_normalized = normalizer.normalize(feature_matrix)
    >>>     # use the normalized features

    """
    def __init__(self, feature_matrix=None):
        """__init__ method.

        Parameters
        ----------
        feature_matrix : numpy.ndarray [shape=(frames, number of feature values)] or None
            Feature matrix to be used in the initialization

        """
        if feature_matrix is None:
            self.N = 0
            self.mean = 0
            self.S1 = 0
            self.S2 = 0
            self.std = 0
        else:
            self.mean = numpy.mean(feature_matrix, axis=0)
            self.std = numpy.std(feature_matrix, axis=0)
            self.N = feature_matrix.shape[0]
            self.S1 = numpy.sum(feature_matrix, axis=0)
            self.S2 = numpy.sum(feature_matrix ** 2, axis=0)
            self.finalize()

    def __enter__(self):
        # Initialize Normalization class and return it
        self.N = 0
        self.mean = 0
        self.S1 = 0
        self.S2 = 0
        self.std = 0
        return self

    def __exit__(self, type, value, traceback):
        # Finalize accumulated calculation
        self.finalize()

    def accumulate(self, stat):
        """Accumulate statistics

        Input is statistics dict, format:

            {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }

        Parameters
        ----------
        stat : dict
            Statistics dict

        Returns
        -------
        nothing

        """
        self.N += stat['N']
        self.mean += stat['mean']
        self.S1 += stat['S1']
        self.S2 += stat['S2']

    def finalize(self):
        """Finalize statistics calculation

        Accumulated values are used to get mean and std for the seen feature data.

        Parameters
        ----------
        nothing

        Returns
        -------
        nothing

        """

        # Finalize statistics
        self.mean = self.S1 / self.N
        self.std = numpy.sqrt((self.N * self.S2 - (self.S1 * self.S1)) / (self.N * (self.N - 1)))

        # Guard against degenerate (zero-variance) data: std = NaN => 0.0
        self.std = numpy.nan_to_num(self.std)

        self.mean = numpy.reshape(self.mean, [1, -1])
        self.std = numpy.reshape(self.std, [1, -1])

    def normalize(self, feature_matrix):
        """Normalize feature matrix with internal statistics of the class

        Parameters
        ----------
        feature_matrix : numpy.ndarray [shape=(frames, number of feature values)]
            Feature matrix to be normalized

        Returns
        -------
        feature_matrix : numpy.ndarray [shape=(frames, number of feature values)]
            Normalized feature matrix

        """

        return (feature_matrix - self.mean) / self.std
--------------------------------------------------------------------------------
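FeatureNormalizer never stores the features themselves: it accumulates N, S1 = Σx and S2 = Σx² per file, and finalize() recovers mean = S1/N and the sample standard deviation sqrt((N·S2 − S1²)/(N·(N−1))). A small sanity sketch against numpy (toy matrices; repo root assumed on PYTHONPATH):

```python
import numpy
from src.features import FeatureNormalizer

chunks = [numpy.random.rand(100, 3), numpy.random.rand(50, 3)]

normalizer = FeatureNormalizer()
for feat in chunks:
    # the same statistics dict that feature_extraction emits under 'stat'
    normalizer.accumulate({'N': feat.shape[0],
                           'mean': numpy.mean(feat, axis=0),
                           'S1': numpy.sum(feat, axis=0),
                           'S2': numpy.sum(feat ** 2, axis=0)})
normalizer.finalize()

all_feat = numpy.vstack(chunks)
assert numpy.allclose(normalizer.mean, all_feat.mean(axis=0))
assert numpy.allclose(normalizer.std, all_feat.std(axis=0, ddof=1))  # N-1 form
```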
/src/sound_event_detection_cnn.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
from keras.models import model_from_json


def event_detection(feature_data, model_container, hop_length_seconds=0.01, smoothing_window_length_seconds=1.0, decision_threshold=0.0, minimum_event_length=0.1, minimum_event_gap=0.1):
    """Sound event detection

    Parameters
    ----------
    feature_data : numpy.ndarray [shape=(n_features, t)]
        Feature matrix

    model_container : dict
        Sound event model pairs [positive and negative] in dict

    hop_length_seconds : float > 0.0
        Feature hop length in seconds, used to convert feature index into time-stamp
        (Default value=0.01)

    smoothing_window_length_seconds : float > 0.0
        Accumulation window (look-back) length; likelihoods are accumulated within the window.
        (Default value=1.0)

    decision_threshold : float > 0.0
        Likelihood ratio threshold for making the decision.
        (Default value=0.0)

    minimum_event_length : float > 0.0
        Minimum event length in seconds; shorter events are filtered out from the output.
        (Default value=0.1)

    minimum_event_gap : float > 0.0
        Minimum allowed gap in seconds between events of the same event label class.
        (Default value=0.1)

    Returns
    -------
    results : list (event dicts)
        Detection result, event list

    """

    smoothing_window = int(smoothing_window_length_seconds / hop_length_seconds)

    results = []
    for event_label in model_container['models']:
        positive = model_container['models'][event_label]['positive'].score_samples(feature_data)[0]
        negative = model_container['models'][event_label]['negative'].score_samples(feature_data)[0]

        # Let's keep the system causal and use look-back while smoothing (accumulating) likelihoods
        for stop_id in range(0, feature_data.shape[0]):
            start_id = stop_id - smoothing_window
            if start_id < 0:
                start_id = 0
            positive[start_id] = sum(positive[start_id:stop_id])
            negative[start_id] = sum(negative[start_id:stop_id])

        likelihood_ratio = positive - negative
        event_activity = likelihood_ratio > decision_threshold

        # Find contiguous segments and convert frame-ids into times
        event_segments = contiguous_regions(event_activity) * hop_length_seconds

        # Post-process the event segments
        event_segments = postprocess_event_segments(event_segments=event_segments,
                                                    minimum_event_length=minimum_event_length,
                                                    minimum_event_gap=minimum_event_gap)

        for event in event_segments:
            results.append((event[0], event[1], event_label))

    return results


def event_detection_cnn(feature_data, model_container, hop_length_seconds=0.01, smoothing_window_length_seconds=1.0, decision_threshold=0.0, minimum_event_length=0.1, minimum_event_gap=0.1, scene_label=None, splice=15):
    """Event detector for the CNN model

    Equivalent to the event_detection function, with
    additional parameters:

    scene_label : string
        'home' or 'residential_area'

    splice : int
        feature splicing (number of context frames on each side)
    """

    event_dic = model_container['event_dic']  # maps CNN output to events
    model = model_from_json(open(model_container['model_arch_file']).read())
    model.load_weights(model_container['model_weights_file'])
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    smoothing_window = int(smoothing_window_length_seconds / hop_length_seconds)

    results = []
    X = []

    step = 1
    wd = 2 * splice + 1  # context window length in frames
    for i in range(0, feature_data.shape[0] - wd, step):
        # Append a constant scene-indicator column: -1.0 for 'home', +1.0 otherwise
        if scene_label == 'home':
            X_seq = numpy.concatenate((feature_data[i: i + wd, :], numpy.full((wd, 1), -1.0)), axis=1)
        else:
            X_seq = numpy.concatenate((feature_data[i: i + wd, :], numpy.full((wd, 1), 1.0)), axis=1)
        X.append(X_seq)

    X = numpy.array(X)
    X = numpy.expand_dims(X, axis=1)

    predictions = model.predict(X)
    # Prepend zeros so that prediction indices line up with the patch centre frames
    predictions = numpy.vstack((numpy.zeros([splice, predictions.shape[1]]), predictions))
    event_predictions = numpy.zeros(predictions.shape)

    for stop_id in range(0, predictions.shape[0]):
        start_id = stop_id - smoothing_window
        if start_id < 0:
            start_id = 0
        predictions_pos = numpy.sum(predictions[start_id:stop_id] >= 0.5, axis=0)
        event_predictions[start_id, predictions_pos > 0.2*(stop_id - start_id)] = 1

    for event_label in model_container['test_dic']:
        if not event_label.startswith('[silence]'):
            event_activity = event_predictions[:, event_dic[event_label+scene_label]]
            event_segments = contiguous_regions(event_activity) * hop_length_seconds
            event_segments = postprocess_event_segments(event_segments=event_segments, minimum_event_length=minimum_event_length, minimum_event_gap=minimum_event_gap)
            for event in event_segments:
                results.append((event[0], event[1], event_label))

    return results


def contiguous_regions(activity_array):
    """Find contiguous regions from bool valued numpy.array.
    Transforms boolean values for each frame into pairs of onsets and offsets.

    Parameters
    ----------
    activity_array : numpy.array [shape=(t)]
        Event activity array, bool values

    Returns
    -------
    change_indices : numpy.ndarray [shape=(2, number of found changes)]
        Onset and offset indices pairs in matrix

    """

    # Find the changes in the activity_array
    change_indices = numpy.diff(activity_array).nonzero()[0]

    # Shift change_index with one, focus on frame after the change.
    change_indices += 1

    if activity_array[0]:
        # If the first element of activity_array is True add 0 at the beginning
        change_indices = numpy.r_[0, change_indices]

    if activity_array[-1]:
        # If the last element of activity_array is True, add the length of the array
        change_indices = numpy.r_[change_indices, activity_array.size]

    # Reshape the result into two columns
    return change_indices.reshape((-1, 2))


def postprocess_event_segments(event_segments, minimum_event_length=0.1, minimum_event_gap=0.1):
    """Post process event segment list. Makes sure that minimum event length and minimum event gap conditions are met.

    Parameters
    ----------
    event_segments : numpy.ndarray [shape=(2, number of event)]
        Event segments, first column has the onset, second has the offset.

    minimum_event_length : float > 0.0
        Minimum event length in seconds; shorter events are filtered out from the output.
        (Default value=0.1)

    minimum_event_gap : float > 0.0
        Minimum allowed gap in seconds between events of the same event label class.
        (Default value=0.1)

    Returns
    -------
    event_results : numpy.ndarray [shape=(2, number of event)]
        Postprocessed event segments

    """

    # 1. remove short events
    event_results_1 = []
    for event in event_segments:
        if event[1]-event[0] >= minimum_event_length:
            event_results_1.append((event[0], event[1]))

    if len(event_results_1):
        # 2. remove small gaps between events
        event_results_2 = []

        # Load first event into event buffer
        buffered_event_onset = event_results_1[0][0]
        buffered_event_offset = event_results_1[0][1]
        for i in range(1, len(event_results_1)):
            if event_results_1[i][0] - buffered_event_offset > minimum_event_gap:
                # The gap between the current event and the buffered one is bigger than the minimum event gap:
                # store the buffered event and replace it with the current one
                event_results_2.append((buffered_event_onset, buffered_event_offset))
                buffered_event_onset = event_results_1[i][0]
                buffered_event_offset = event_results_1[i][1]
            else:
                # The gap between the current event and the buffered one is smaller than the minimum event gap:
                # extend the buffered event until the current offset
                buffered_event_offset = event_results_1[i][1]

        # Store last event from buffer
        event_results_2.append((buffered_event_onset, buffered_event_offset))

        return event_results_2
    else:
        return event_results_1
--------------------------------------------------------------------------------
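For reference, the input construction used by event_detection_cnn above, in isolation: each CNN input is a (2*splice+1)-frame patch of the feature matrix with one extra column acting as a scene flag (-1.0 for 'home', +1.0 for 'residential_area'), reshaped into the single-channel 4-D layout Keras expects. Toy sizes below; the real feature dimensionality depends on the ftype settings:

```python
import numpy

splice = 30
wd = 2 * splice + 1                          # 61-frame context window
feature_data = numpy.random.rand(1000, 183)  # (frames, features), toy matrix
scene_flag = -1.0                            # 'home'; +1.0 for 'residential_area'

X = []
for i in range(0, feature_data.shape[0] - wd):
    patch = feature_data[i:i + wd, :]
    scene_column = numpy.full((wd, 1), scene_flag)
    X.append(numpy.concatenate((patch, scene_column), axis=1))

X = numpy.expand_dims(numpy.array(X), axis=1)
print(X.shape)  # (939, 1, 61, 184): patches x channels x frames x features
```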
/src/features_cnn.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy
import librosa
import scipy


def feature_extraction(y, fs=44100, statistics=True, include_mfcc0=True, include_delta=True,
                       include_acceleration=True, mfcc_params=None, delta_params=None, acceleration_params=None):
    """Feature extraction, MFCC based features

    Outputs features in dict, format:

        {
            'feat': feature_matrix [shape=(frame count, feature vector size)],
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }

    Parameters
    ----------
    y: numpy.array [shape=(signal_length, )]
        Audio

    fs: int > 0 [scalar]
        Sample rate
        (Default value=44100)

    statistics: bool
        Calculate feature statistics for extracted matrix
        (Default value=True)

    include_mfcc0: bool
        Include 0th MFCC coefficient into static coefficients.
        (Default value=True)

    include_delta: bool
        Include delta MFCC coefficients.
        (Default value=True)

    include_acceleration: bool
        Include acceleration MFCC coefficients.
        (Default value=True)

    mfcc_params: dict or None
        Parameters for extraction of static MFCC coefficients.

    delta_params: dict or None
        Parameters for extraction of delta MFCC coefficients.

    acceleration_params: dict or None
        Parameters for extraction of acceleration MFCC coefficients.

    Returns
    -------
    result: dict
        Feature dict

    """

    eps = numpy.spacing(1)

    # Windowing function
    if mfcc_params['window'] == 'hamming_asymmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hamming_symmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=True)
    elif mfcc_params['window'] == 'hann_asymmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hann_symmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=True)
    else:
        window = None

    # Calculate static coefficients (|STFT|^2 is a power spectrogram)
    power_spectrogram = numpy.abs(librosa.stft(y + eps,
                                               n_fft=mfcc_params['n_fft'],
                                               win_length=mfcc_params['win_length'],
                                               hop_length=mfcc_params['hop_length'],
                                               center=True,
                                               window=window))**2
    mel_basis = librosa.filters.mel(sr=fs,
                                    n_fft=mfcc_params['n_fft'],
                                    n_mels=mfcc_params['n_mels'],
                                    fmin=mfcc_params['fmin'],
                                    fmax=mfcc_params['fmax'],
                                    htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, power_spectrogram)

    if 'norm_frames' in mfcc_params and mfcc_params['norm_frames']:
        mel_spectrum = librosa.util.normalize(mel_spectrum, norm=1, axis=0)

    if 'ftype' not in mfcc_params:
        mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum), n_mfcc=mfcc_params['n_mfcc'])
    elif mfcc_params['ftype'] == 'log_spec':
        mfcc = librosa.logamplitude(power_spectrogram)
    elif mfcc_params['ftype'] == 'mel_fbank':
        # log mel filter bank energies plus a frame-wise log-energy row
        logenergy = numpy.log(numpy.sum(power_spectrogram, axis=0)).reshape(1, mel_spectrum.shape[1])
        mfcc = numpy.concatenate((librosa.logamplitude(mel_spectrum), logenergy), axis=0)
    elif mfcc_params['ftype'] == 'mfcc':
        mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum), n_mfcc=mfcc_params['n_mfcc'])
    else:
        raise ValueError('Unknown feature type [%s]' % mfcc_params['ftype'])

    # Collect the feature matrix
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients
        mfcc_delta = librosa.feature.delta(mfcc, **delta_params)
        # Add delta coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (aka delta-delta)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2, **acceleration_params)
        # Add acceleration coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))

    if not include_mfcc0 and ('ftype' not in mfcc_params or mfcc_params['ftype'] == 'mfcc'):
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    feature_matrix = feature_matrix.T

    # Collect into data structure
    if statistics:
        return {
            'feat': feature_matrix,
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }
    else:
        return {
            'feat': feature_matrix}


class FeatureNormalizer(object):
    """Feature normalizer class

    Accumulates feature statistics

    Examples
    --------

    >>> normalizer = FeatureNormalizer()
    >>> for feature_data in training_items:
    >>>     normalizer.accumulate(feature_data['stat'])
    >>>
    >>> normalizer.finalize()

    >>> for feature_matrix in test_items:
    >>>     feature_matrix_normalized = normalizer.normalize(feature_matrix)
    >>>     # use the normalized features

    """
    def __init__(self, feature_matrix=None):
        """__init__ method.

        Parameters
        ----------
        feature_matrix : numpy.ndarray [shape=(frames, number of feature values)] or None
            Feature matrix to be used in the initialization

        """
        if feature_matrix is None:
            self.N = 0
            self.mean = 0
            self.S1 = 0
            self.S2 = 0
            self.std = 0
        else:
            self.mean = numpy.mean(feature_matrix, axis=0)
            self.std = numpy.std(feature_matrix, axis=0)
            self.N = feature_matrix.shape[0]
            self.S1 = numpy.sum(feature_matrix, axis=0)
            self.S2 = numpy.sum(feature_matrix ** 2, axis=0)
            self.finalize()

    def __enter__(self):
        # Initialize Normalization class and return it
        self.N = 0
        self.mean = 0
        self.S1 = 0
        self.S2 = 0
        self.std = 0
        return self

    def __exit__(self, type, value, traceback):
        # Finalize accumulated calculation
        self.finalize()

    def accumulate(self, stat):
        """Accumulate statistics

        Input is statistics dict, format:

            {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }

        Parameters
        ----------
        stat : dict
            Statistics dict

        Returns
        -------
        nothing

        """
        self.N += stat['N']
        self.mean += stat['mean']
        self.S1 += stat['S1']
        self.S2 += stat['S2']

    def finalize(self):
        """Finalize statistics calculation

        Accumulated values are used to get mean and std for the seen feature data.

        Parameters
        ----------
        nothing

        Returns
        -------
        nothing

        """

        # Finalize statistics
        self.mean = self.S1 / self.N
        self.std = numpy.sqrt((self.N * self.S2 - (self.S1 * self.S1)) / (self.N * (self.N - 1)))

        # Guard against degenerate (zero-variance) data: std = NaN => 0.0
        self.std = numpy.nan_to_num(self.std)

        self.mean = numpy.reshape(self.mean, [1, -1])
        self.std = numpy.reshape(self.std, [1, -1])

    def normalize(self, feature_matrix):
        """Normalize feature matrix with internal statistics of the class

        Parameters
        ----------
        feature_matrix : numpy.ndarray [shape=(frames, number of feature values)]
            Feature matrix to be normalized

        Returns
        -------
        feature_matrix : numpy.ndarray [shape=(frames, number of feature values)]
            Normalized feature matrix

        """
        norm_global = (feature_matrix - self.mean) / self.std
        return norm_global
--------------------------------------------------------------------------------
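A minimal call of features_cnn.feature_extraction matching the task3_cnn.yaml settings: ftype mel_fbank yields 60 log mel energies plus a log-energy row, tripled by the delta and acceleration blocks. The sketch assumes the era-appropriate pinned libraries from requirements.txt (librosa.logamplitude was removed in later librosa releases) and uses noise as a stand-in signal:

```python
import numpy
from src.features_cnn import feature_extraction

y = numpy.random.randn(16000)  # 1 s of "audio" at 16 kHz
feats = feature_extraction(
    y, fs=16000, statistics=False, include_mfcc0=False,
    mfcc_params={'window': 'hamming_asymmetric', 'n_fft': 1024,
                 'win_length': 400, 'hop_length': 160,  # 25 ms / 10 ms at 16 kHz
                 'n_mels': 60, 'n_mfcc': 60, 'fmin': 0, 'fmax': 8000,
                 'htk': False, 'ftype': 'mel_fbank'},
    delta_params={'width': 15},
    acceleration_params={'width': 15})
print(feats['feat'].shape)  # (frames, 183): (60 mel bands + log-energy) * 3
```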
39 | 40 | Parameters 41 | ---------- 42 | class_list : list 43 | Evaluated scene labels in the list 44 | 45 | """ 46 | self.accuracies_per_class = None 47 | self.Nsys = None 48 | self.Nref = None 49 | self.class_list = class_list 50 | self.eps = numpy.spacing(1) 51 | 52 | def __enter__(self): 53 | return self 54 | 55 | def __exit__(self, type, value, traceback): 56 | return self.results() 57 | 58 | def accuracies(self, y_true, y_pred, labels): 59 | """Calculate accuracy 60 | 61 | Parameters 62 | ---------- 63 | y_true : numpy.array 64 | Ground truth array, list of scene labels 65 | 66 | y_pred : numpy.array 67 | System output array, list of scene labels 68 | 69 | labels : list 70 | list of scene labels 71 | 72 | Returns 73 | ------- 74 | array : numpy.array [shape=(number of scene labels,)] 75 | Accuracy per scene label class 76 | 77 | """ 78 | 79 | confusion_matrix = metrics.confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels).astype(float) 80 | return numpy.divide(numpy.diag(confusion_matrix), numpy.sum(confusion_matrix, 1)+self.eps) 81 | 82 | def evaluate(self, annotated_ground_truth, system_output): 83 | """Evaluate system output and annotated ground truth pair. 84 | 85 | Use results method to get results. 86 | 87 | Parameters 88 | ---------- 89 | annotated_ground_truth : numpy.array 90 | Ground truth array, list of scene labels 91 | 92 | system_output : numpy.array 93 | System output array, list of scene labels 94 | 95 | Returns 96 | ------- 97 | nothing 98 | 99 | """ 100 | 101 | accuracies_per_class = self.accuracies(y_pred=system_output, y_true=annotated_ground_truth, labels=self.class_list) 102 | 103 | if self.accuracies_per_class is None: 104 | self.accuracies_per_class = accuracies_per_class 105 | else: 106 | self.accuracies_per_class = numpy.vstack((self.accuracies_per_class, accuracies_per_class)) 107 | 108 | Nref = numpy.zeros(len(self.class_list)) 109 | Nsys = numpy.zeros(len(self.class_list)) 110 | 111 | for class_id, class_label in enumerate(self.class_list): 112 | for item in system_output: 113 | if item == class_label: 114 | Nsys[class_id] += 1 115 | 116 | for item in annotated_ground_truth: 117 | if item == class_label: 118 | Nref[class_id] += 1 119 | 120 | if self.Nref is None: 121 | self.Nref = Nref 122 | else: 123 | self.Nref = numpy.vstack((self.Nref, Nref)) 124 | 125 | if self.Nsys is None: 126 | self.Nsys = Nsys 127 | else: 128 | self.Nsys = numpy.vstack((self.Nsys, Nsys)) 129 | 130 | def results(self): 131 | """Get results 132 | 133 | Outputs results in dict, format: 134 | 135 | { 136 | 'class_wise_data': 137 | { 138 | 'office': { 139 | 'Nsys': 10, 140 | 'Nref': 7, 141 | }, 142 | } 143 | 'class_wise_accuracy': 144 | { 145 | 'office': 0.6, 146 | 'home': 0.4, 147 | } 148 | 'overall_accuracy': numpy.mean(self.accuracies_per_class) 149 | 'Nsys': 100, 150 | 'Nref': 100, 151 | } 152 | 153 | Parameters 154 | ---------- 155 | nothing 156 | 157 | Returns 158 | ------- 159 | results : dict 160 | Results dict 161 | 162 | """ 163 | 164 | results = { 165 | 'class_wise_data': {}, 166 | 'class_wise_accuracy': {}, 167 | 'overall_accuracy': numpy.mean(self.accuracies_per_class) 168 | } 169 | if len(self.Nsys.shape) == 2: 170 | results['Nsys'] = int(sum(sum(self.Nsys))) 171 | results['Nref'] = int(sum(sum(self.Nref))) 172 | else: 173 | results['Nsys'] = int(sum(self.Nsys)) 174 | results['Nref'] = int(sum(self.Nref)) 175 | 176 | for class_id, class_label in enumerate(self.class_list): 177 | if len(self.accuracies_per_class.shape) == 2: 178 | 
results['class_wise_accuracy'][class_label] = numpy.mean(self.accuracies_per_class[:, class_id])
179 |                 results['class_wise_data'][class_label] = {
180 |                     'Nsys': int(sum(self.Nsys[:, class_id])),
181 |                     'Nref': int(sum(self.Nref[:, class_id])),
182 |                 }
183 |             else:
184 |                 results['class_wise_accuracy'][class_label] = numpy.mean(self.accuracies_per_class[class_id])
185 |                 results['class_wise_data'][class_label] = {
186 |                     'Nsys': int(self.Nsys[class_id]),
187 |                     'Nref': int(self.Nref[class_id]),
188 |                 }
189 | 
190 |         return results
191 | 
192 | 
193 | class EventDetectionMetrics(object):
194 |     """Base class for sound event metric classes.
195 |     """
196 | 
197 |     def __init__(self, class_list):
198 |         """__init__ method.
199 | 
200 |         Parameters
201 |         ----------
202 |         class_list : list
203 |             List of class labels to be evaluated.
204 | 
205 |         """
206 | 
207 |         self.class_list = class_list
208 |         self.eps = numpy.spacing(1)
209 | 
210 |     def max_event_offset(self, data):
211 |         """Get maximum event offset from event list
212 | 
213 |         Parameters
214 |         ----------
215 |         data : list
216 |             Event list, list of event dicts
217 | 
218 |         Returns
219 |         -------
220 |         max_offset : float > 0
221 |             Maximum event offset
222 |         """
223 | 
224 |         max_offset = 0
225 |         for event in data:
226 |             if event['event_offset'] > max_offset:
227 |                 max_offset = event['event_offset']
228 |         return max_offset
229 | 
230 |     def list_to_roll(self, data, time_resolution=0.01):
231 |         """Convert event list into event roll.
232 |         Event roll is a binary matrix indicating event activity within time segments defined by time_resolution.
233 | 
234 |         Parameters
235 |         ----------
236 |         data : list
237 |             Event list, list of event dicts
238 | 
239 |         time_resolution : float > 0
240 |             Time resolution used when converting event into event roll.
241 | 
242 |         Returns
243 |         -------
244 |         event_roll : numpy.ndarray [shape=(math.ceil(data_length * 1 / time_resolution), number of classes)]
245 |             Event roll
246 |         """
247 | 
248 |         # Initialize
249 |         data_length = self.max_event_offset(data)
250 |         event_roll = numpy.zeros((int(math.ceil(data_length * 1 / time_resolution)), len(self.class_list)))
251 | 
252 |         # Fill-in event_roll
253 |         for event in data:
254 |             pos = self.class_list.index(event['event_label'].rstrip())
255 | 
256 |             onset = int(math.floor(event['event_onset'] * 1 / time_resolution))
257 |             offset = int(math.ceil(event['event_offset'] * 1 / time_resolution))
258 | 
259 |             event_roll[onset:offset, pos] = 1
260 | 
261 |         return event_roll
262 | 
263 | 
264 | class DCASE2016_EventDetection_SegmentBasedMetrics(EventDetectionMetrics):
265 |     """DCASE2016 Segment based metrics for sound event detection
266 | 
267 |     Supported metrics:
268 |     - Overall
269 |         - Error rate (ER), Substitutions (S), Insertions (I), Deletions (D)
270 |         - F-score (F1)
271 |     - Class-wise
272 |         - Error rate (ER), Insertions (I), Deletions (D)
273 |         - F-score (F1)
274 | 
275 |     Examples
276 |     --------
277 | 
278 |     >>> overall_metrics_per_scene = {}
279 |     >>> for scene_id, scene_label in enumerate(dataset.scene_labels):
280 |     >>>     dcase2016_segment_based_metric = DCASE2016_EventDetection_SegmentBasedMetrics(class_list=dataset.event_labels(scene_label=scene_label))
281 |     >>>     for fold in dataset.folds(mode=dataset_evaluation_mode):
282 |     >>>         results = []
283 |     >>>         result_filename = get_result_filename(fold=fold, scene_label=scene_label, path=result_path)
284 |     >>>
285 |     >>>         if os.path.isfile(result_filename):
286 |     >>>             with open(result_filename, 'rt') as f:
287 |     >>>                 for row in csv.reader(f, delimiter='\t'):
288 |     >>>                     results.append(row)
289 |     >>>
290 |     >>>         for
file_id, item in enumerate(dataset.test(fold,scene_label=scene_label)): 291 | >>> current_file_results = [] 292 | >>> for result_line in results: 293 | >>> if result_line[0] == dataset.absolute_to_relative(item['file']): 294 | >>> current_file_results.append( 295 | >>> {'file': result_line[0], 296 | >>> 'event_onset': float(result_line[1]), 297 | >>> 'event_offset': float(result_line[2]), 298 | >>> 'event_label': result_line[3] 299 | >>> } 300 | >>> ) 301 | >>> meta = dataset.file_meta(dataset.absolute_to_relative(item['file'])) 302 | >>> dcase2016_segment_based_metric.evaluate(system_output=current_file_results, annotated_ground_truth=meta) 303 | >>> overall_metrics_per_scene[scene_label]['segment_based_metrics'] = dcase2016_segment_based_metric.results() 304 | 305 | """ 306 | 307 | def __init__(self, class_list, time_resolution=1.0): 308 | """__init__ method. 309 | 310 | Parameters 311 | ---------- 312 | class_list : list 313 | List of class labels to be evaluated. 314 | 315 | time_resolution : float > 0 316 | Time resolution used when converting event into event roll. 317 | (Default value = 1.0) 318 | 319 | """ 320 | 321 | self.time_resolution = time_resolution 322 | 323 | self.overall = { 324 | 'Ntp': 0.0, 325 | 'Ntn': 0.0, 326 | 'Nfp': 0.0, 327 | 'Nfn': 0.0, 328 | 'Nref': 0.0, 329 | 'Nsys': 0.0, 330 | 'ER': 0.0, 331 | 'S': 0.0, 332 | 'D': 0.0, 333 | 'I': 0.0, 334 | } 335 | self.class_wise = {} 336 | 337 | for class_label in class_list: 338 | self.class_wise[class_label] = { 339 | 'Ntp': 0.0, 340 | 'Ntn': 0.0, 341 | 'Nfp': 0.0, 342 | 'Nfn': 0.0, 343 | 'Nref': 0.0, 344 | 'Nsys': 0.0, 345 | } 346 | 347 | EventDetectionMetrics.__init__(self, class_list=class_list) 348 | 349 | def __enter__(self): 350 | # Initialize class and return it 351 | return self 352 | 353 | def __exit__(self, type, value, traceback): 354 | # Finalize evaluation and return results 355 | return self.results() 356 | 357 | def evaluate(self, annotated_ground_truth, system_output): 358 | """Evaluate system output and annotated ground truth pair. 359 | 360 | Use results method to get results. 
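
        As a worked example of the per-segment quantities computed below: if a
        segment has Nref = 2 active reference classes, Nsys = 3 active system
        classes and Ntp = 1 correct, then S = min(2, 3) - 1 = 1 substitution,
        D = max(0, 2 - 3) = 0 deletions, I = max(0, 3 - 2) = 1 insertion, and
        the segment contributes ER = max(2, 3) - 1 = 2 (= S + D + I).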
361 | 
362 |         Parameters
363 |         ----------
364 |         annotated_ground_truth : list of dict
365 |             Ground truth event list, list of event dicts
366 | 
367 |         system_output : list of dict
368 |             System output event list, list of event dicts
369 | 
370 |         Returns
371 |         -------
372 |         nothing
373 | 
374 |         """
375 | 
376 |         # Convert event list into frame-based representation
377 |         system_event_roll = self.list_to_roll(data=system_output, time_resolution=self.time_resolution)
378 |         annotated_event_roll = self.list_to_roll(data=annotated_ground_truth, time_resolution=self.time_resolution)
379 | 
380 |         # Fix durations of both event_rolls to be equal
381 |         if annotated_event_roll.shape[0] > system_event_roll.shape[0]:
382 |             padding = numpy.zeros((annotated_event_roll.shape[0] - system_event_roll.shape[0], len(self.class_list)))
383 |             system_event_roll = numpy.vstack((system_event_roll, padding))
384 | 
385 |         if system_event_roll.shape[0] > annotated_event_roll.shape[0]:
386 |             padding = numpy.zeros((system_event_roll.shape[0] - annotated_event_roll.shape[0], len(self.class_list)))
387 |             annotated_event_roll = numpy.vstack((annotated_event_roll, padding))
388 | 
389 |         # Compute segment-based overall metrics
390 |         for segment_id in range(0, annotated_event_roll.shape[0]):
391 |             annotated_segment = annotated_event_roll[segment_id, :]
392 |             system_segment = system_event_roll[segment_id, :]
393 | 
394 |             Ntp = sum(system_segment + annotated_segment > 1)
395 |             Ntn = sum(system_segment + annotated_segment == 0)
396 |             Nfp = sum(system_segment - annotated_segment > 0)
397 |             Nfn = sum(annotated_segment - system_segment > 0)
398 | 
399 |             Nref = sum(annotated_segment)
400 |             Nsys = sum(system_segment)
401 | 
402 |             S = min(Nref, Nsys) - Ntp
403 |             D = max(0, Nref - Nsys)
404 |             I = max(0, Nsys - Nref)
405 |             ER = max(Nref, Nsys) - Ntp
406 | 
407 |             self.overall['Ntp'] += Ntp
408 |             self.overall['Ntn'] += Ntn
409 |             self.overall['Nfp'] += Nfp
410 |             self.overall['Nfn'] += Nfn
411 |             self.overall['Nref'] += Nref
412 |             self.overall['Nsys'] += Nsys
413 |             self.overall['S'] += S
414 |             self.overall['D'] += D
415 |             self.overall['I'] += I
416 |             self.overall['ER'] += ER
417 | 
418 |         for class_id, class_label in enumerate(self.class_list):
419 |             annotated_segment = annotated_event_roll[:, class_id]
420 |             system_segment = system_event_roll[:, class_id]
421 | 
422 |             Ntp = sum(system_segment + annotated_segment > 1)
423 |             Ntn = sum(system_segment + annotated_segment == 0)
424 |             Nfp = sum(system_segment - annotated_segment > 0)
425 |             Nfn = sum(annotated_segment - system_segment > 0)
426 | 
427 |             Nref = sum(annotated_segment)
428 |             Nsys = sum(system_segment)
429 | 
430 |             self.class_wise[class_label]['Ntp'] += Ntp
431 |             self.class_wise[class_label]['Ntn'] += Ntn
432 |             self.class_wise[class_label]['Nfp'] += Nfp
433 |             self.class_wise[class_label]['Nfn'] += Nfn
434 |             self.class_wise[class_label]['Nref'] += Nref
435 |             self.class_wise[class_label]['Nsys'] += Nsys
436 | 
437 |         return self
438 | 
439 |     def results(self):
440 |         """Get results
441 | 
442 |         Outputs results in dict, format:
443 | 
444 |             {
445 |                 'overall':
446 |                     {
447 |                         'Pre':
448 |                         'Rec':
449 |                         'F':
450 |                         'ER':
451 |                         'S':
452 |                         'D':
453 |                         'I':
454 |                     }
455 |                 'class_wise':
456 |                     {
457 |                         'office': {
458 |                             'Pre':
459 |                             'Rec':
460 |                             'F':
461 |                             'ER':
462 |                             'D':
463 |                             'I':
464 |                             'Nref':
465 |                             'Nsys':
466 |                             'Ntp':
467 |                             'Nfn':
468 |                             'Nfp':
469 |                         },
470 |                     }
471 |                 'class_wise_average':
472 |                     {
473 |                         'F':
474 |                         'ER':
475 |                     }
476 |             }
477 | 
478 |         Parameters
479 |         ----------
480 |         nothing
481 | 
482 |         Returns
483 |         -------
484 |         results
: dict 485 | Results dict 486 | 487 | """ 488 | 489 | results = {'overall': {}, 490 | 'class_wise': {}, 491 | 'class_wise_average': {}, 492 | } 493 | 494 | # Overall metrics 495 | results['overall']['Pre'] = self.overall['Ntp'] / (self.overall['Nsys'] + self.eps) 496 | results['overall']['Rec'] = self.overall['Ntp'] / self.overall['Nref'] 497 | results['overall']['F'] = 2 * ((results['overall']['Pre'] * results['overall']['Rec']) / (results['overall']['Pre'] + results['overall']['Rec'] + self.eps)) 498 | 499 | results['overall']['ER'] = self.overall['ER'] / self.overall['Nref'] 500 | results['overall']['S'] = self.overall['S'] / self.overall['Nref'] 501 | results['overall']['D'] = self.overall['D'] / self.overall['Nref'] 502 | results['overall']['I'] = self.overall['I'] / self.overall['Nref'] 503 | 504 | # Class-wise metrics 505 | class_wise_F = [] 506 | class_wise_ER = [] 507 | for class_id, class_label in enumerate(self.class_list): 508 | if class_label not in results['class_wise']: 509 | results['class_wise'][class_label] = {} 510 | results['class_wise'][class_label]['Pre'] = self.class_wise[class_label]['Ntp'] / (self.class_wise[class_label]['Nsys'] + self.eps) 511 | results['class_wise'][class_label]['Rec'] = self.class_wise[class_label]['Ntp'] / (self.class_wise[class_label]['Nref'] + self.eps) 512 | results['class_wise'][class_label]['F'] = 2 * ((results['class_wise'][class_label]['Pre'] * results['class_wise'][class_label]['Rec']) / (results['class_wise'][class_label]['Pre'] + results['class_wise'][class_label]['Rec'] + self.eps)) 513 | 514 | results['class_wise'][class_label]['ER'] = (self.class_wise[class_label]['Nfn'] + self.class_wise[class_label]['Nfp']) / (self.class_wise[class_label]['Nref'] + self.eps) 515 | results['class_wise'][class_label]['D'] = self.class_wise[class_label]['Nfn'] / (self.class_wise[class_label]['Nref'] + self.eps) 516 | results['class_wise'][class_label]['I'] = self.class_wise[class_label]['Nfp'] / (self.class_wise[class_label]['Nref'] + self.eps) 517 | 518 | results['class_wise'][class_label]['Nref'] = self.class_wise[class_label]['Nref'] 519 | results['class_wise'][class_label]['Nsys'] = self.class_wise[class_label]['Nsys'] 520 | results['class_wise'][class_label]['Ntp'] = self.class_wise[class_label]['Ntp'] 521 | results['class_wise'][class_label]['Nfn'] = self.class_wise[class_label]['Nfn'] 522 | results['class_wise'][class_label]['Nfp'] = self.class_wise[class_label]['Nfp'] 523 | 524 | class_wise_F.append(results['class_wise'][class_label]['F']) 525 | class_wise_ER.append(results['class_wise'][class_label]['ER']) 526 | 527 | results['class_wise_average']['F'] = numpy.mean(class_wise_F) 528 | results['class_wise_average']['ER'] = numpy.mean(class_wise_ER) 529 | 530 | return results 531 | 532 | 533 | class DCASE2016_EventDetection_EventBasedMetrics(EventDetectionMetrics): 534 | """DCASE2016 Event based metrics for sound event detection 535 | 536 | Supported metrics: 537 | - Overall 538 | - Error rate (ER), Substitutions (S), Insertions (I), Deletions (D) 539 | - F-score (F1) 540 | - Class-wise 541 | - Error rate (ER), Insertions (I), Deletions (D) 542 | - F-score (F1) 543 | 544 | Examples 545 | -------- 546 | 547 | >>> overall_metrics_per_scene = {} 548 | >>> for scene_id, scene_label in enumerate(dataset.scene_labels): 549 | >>> dcase2016_event_based_metric = DCASE2016_EventDetection_EventBasedMetrics(class_list=dataset.event_labels(scene_label=scene_label)) 550 | >>> for fold in dataset.folds(mode=dataset_evaluation_mode): 551 | >>> results = [] 
552 | >>> result_filename = get_result_filename(fold=fold, scene_label=scene_label, path=result_path) 553 | >>> 554 | >>> if os.path.isfile(result_filename): 555 | >>> with open(result_filename, 'rt') as f: 556 | >>> for row in csv.reader(f, delimiter='\t'): 557 | >>> results.append(row) 558 | >>> 559 | >>> for file_id, item in enumerate(dataset.test(fold,scene_label=scene_label)): 560 | >>> current_file_results = [] 561 | >>> for result_line in results: 562 | >>> if result_line[0] == dataset.absolute_to_relative(item['file']): 563 | >>> current_file_results.append( 564 | >>> {'file': result_line[0], 565 | >>> 'event_onset': float(result_line[1]), 566 | >>> 'event_offset': float(result_line[2]), 567 | >>> 'event_label': result_line[3] 568 | >>> } 569 | >>> ) 570 | >>> meta = dataset.file_meta(dataset.absolute_to_relative(item['file'])) 571 | >>> dcase2016_event_based_metric.evaluate(system_output=current_file_results, annotated_ground_truth=meta) 572 | >>> overall_metrics_per_scene[scene_label]['event_based_metrics'] = dcase2016_event_based_metric.results() 573 | 574 | """ 575 | 576 | def __init__(self, class_list, time_resolution=1.0, t_collar=0.2): 577 | """__init__ method. 578 | 579 | Parameters 580 | ---------- 581 | class_list : list 582 | List of class labels to be evaluated. 583 | 584 | time_resolution : float > 0 585 | Time resolution used when converting event into event roll. 586 | (Default value = 1.0) 587 | 588 | t_collar : float > 0 589 | Time collar for event onset and offset condition 590 | (Default value = 0.2) 591 | 592 | """ 593 | 594 | self.time_resolution = time_resolution 595 | self.t_collar = t_collar 596 | 597 | self.overall = { 598 | 'Nref': 0.0, 599 | 'Nsys': 0.0, 600 | 'Nsubs': 0.0, 601 | 'Ntp': 0.0, 602 | 'Nfp': 0.0, 603 | 'Nfn': 0.0, 604 | } 605 | self.class_wise = {} 606 | 607 | for class_label in class_list: 608 | self.class_wise[class_label] = { 609 | 'Nref': 0.0, 610 | 'Nsys': 0.0, 611 | 'Ntp': 0.0, 612 | 'Ntn': 0.0, 613 | 'Nfp': 0.0, 614 | 'Nfn': 0.0, 615 | } 616 | 617 | EventDetectionMetrics.__init__(self, class_list=class_list) 618 | 619 | def __enter__(self): 620 | # Initialize class and return it 621 | return self 622 | 623 | def __exit__(self, type, value, traceback): 624 | # Finalize evaluation and return results 625 | return self.results() 626 | 627 | def evaluate(self, annotated_ground_truth, system_output): 628 | """Evaluate system output and annotated ground truth pair. 629 | 630 | Use results method to get results. 
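
        As a worked example of the matching conditions below, using the event
        values from the main() example of this module: with t_collar = 0.2, a
        system event (onset 4.1 s, offset 4.7 s) and a same-class reference
        event (onset 4.2 s, offset 5.4 s) satisfy the onset condition
        (|4.1 - 4.2| = 0.1 <= 0.2) but fail the offset condition
        (|4.7 - 5.4| = 0.7 > max(0.2, 0.5 * 1.2) = 0.6), so the pair is not
        counted as a true positive.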
631 | 
632 |         Parameters
633 |         ----------
634 |         annotated_ground_truth : list of dict
635 |             Ground truth event list, list of event dicts
636 | 
637 |         system_output : list of dict
638 |             System output event list, list of event dicts
639 | 
640 |         Returns
641 |         -------
642 |         nothing
643 | 
644 |         """
645 | 
646 |         # Overall metrics
647 | 
648 |         # Total number of detected and reference events
649 |         Nsys = len(system_output)
650 |         Nref = len(annotated_ground_truth)
651 | 
652 |         sys_correct = numpy.zeros(Nsys, dtype=bool)
653 |         ref_correct = numpy.zeros(Nref, dtype=bool)
654 | 
655 |         # Number of correctly transcribed events, onset/offset within a t_collar range
656 |         for j in range(0, len(annotated_ground_truth)):
657 |             for i in range(0, len(system_output)):
658 |                 label_condition = annotated_ground_truth[j]['event_label'] == system_output[i]['event_label']
659 |                 onset_condition = self.onset_condition(annotated_event=annotated_ground_truth[j],
660 |                                                        system_event=system_output[i],
661 |                                                        t_collar=self.t_collar)
662 | 
663 |                 offset_condition = self.offset_condition(annotated_event=annotated_ground_truth[j],
664 |                                                          system_event=system_output[i],
665 |                                                          t_collar=self.t_collar)
666 | 
667 |                 if label_condition and onset_condition and offset_condition:
668 |                     ref_correct[j] = True
669 |                     sys_correct[i] = True
670 |                     break
671 | 
672 |         Ntp = numpy.sum(sys_correct)
673 | 
674 |         sys_leftover = numpy.nonzero(numpy.logical_not(sys_correct))[0]
675 |         ref_leftover = numpy.nonzero(numpy.logical_not(ref_correct))[0]
676 | 
677 |         # Substitutions
678 |         Nsubs = 0
679 |         for j in ref_leftover:
680 |             for i in sys_leftover:
681 |                 onset_condition = self.onset_condition(annotated_event=annotated_ground_truth[j],
682 |                                                        system_event=system_output[i],
683 |                                                        t_collar=self.t_collar)
684 | 
685 |                 offset_condition = self.offset_condition(annotated_event=annotated_ground_truth[j],
686 |                                                          system_event=system_output[i],
687 |                                                          t_collar=self.t_collar)
688 | 
689 |                 if onset_condition and offset_condition:
690 |                     Nsubs += 1
691 |                     break
692 | 
693 |         Nfp = Nsys - Ntp - Nsubs
694 |         Nfn = Nref - Ntp - Nsubs
695 | 
696 |         self.overall['Nref'] += Nref
697 |         self.overall['Nsys'] += Nsys
698 |         self.overall['Ntp'] += Ntp
699 |         self.overall['Nsubs'] += Nsubs
700 |         self.overall['Nfp'] += Nfp
701 |         self.overall['Nfn'] += Nfn
702 | 
703 |         # Class-wise metrics
704 |         for class_id, class_label in enumerate(self.class_list):
705 |             Nref = 0.0
706 |             Nsys = 0.0
707 |             Ntp = 0.0
708 | 
709 |             # Count event frequencies in the ground truth
710 |             for i in range(0, len(annotated_ground_truth)):
711 |                 if annotated_ground_truth[i]['event_label'] == class_label:
712 |                     Nref += 1
713 | 
714 |             # Count event frequencies in the system output
715 |             for i in range(0, len(system_output)):
716 |                 if system_output[i]['event_label'] == class_label:
717 |                     Nsys += 1
718 | 
719 |             for j in range(0, len(annotated_ground_truth)):
720 |                 for i in range(0, len(system_output)):
721 |                     if annotated_ground_truth[j]['event_label'] == class_label and system_output[i]['event_label'] == class_label:
722 |                         onset_condition = self.onset_condition(annotated_event=annotated_ground_truth[j],
723 |                                                                system_event=system_output[i],
724 |                                                                t_collar=self.t_collar)
725 | 
726 |                         offset_condition = self.offset_condition(annotated_event=annotated_ground_truth[j],
727 |                                                                  system_event=system_output[i],
728 |                                                                  t_collar=self.t_collar)
729 | 
730 |                         if onset_condition and offset_condition:
731 |                             Ntp += 1
732 |                             break
733 | 
734 |             Nfp = Nsys - Ntp
735 |             Nfn = Nref - Ntp
736 | 
737 |             self.class_wise[class_label]['Nref'] += Nref
738 |             self.class_wise[class_label]['Nsys'] += Nsys
739 | 
740 |             self.class_wise[class_label]['Ntp'] += Ntp
741 |             self.class_wise[class_label]['Nfp'] += Nfp
742 |             self.class_wise[class_label]['Nfn'] += Nfn
743 | 
744 |     def onset_condition(self, annotated_event, system_event, t_collar=0.200):
745 |         """Onset condition: checks whether the event pair fulfills the condition
746 | 
747 |         Condition:
748 | 
749 |         - event onsets are within t_collar of each other
750 | 
751 |         Parameters
752 |         ----------
753 |         annotated_event : dict
754 |             Event dict
755 | 
756 |         system_event : dict
757 |             Event dict
758 | 
759 |         t_collar : float > 0
760 |             Defines how close event onsets have to be in order to be considered a match. In seconds.
761 |             (Default value = 0.2)
762 | 
763 |         Returns
764 |         -------
765 |         result : bool
766 |             Condition result
767 | 
768 |         """
769 | 
770 |         return math.fabs(annotated_event['event_onset'] - system_event['event_onset']) <= t_collar
771 | 
772 |     def offset_condition(self, annotated_event, system_event, t_collar=0.200, percentage_of_length=0.5):
773 |         """Offset condition: checks whether the event pair fulfills the condition
774 | 
775 |         Condition:
776 | 
777 |         - event offsets are within t_collar of each other
778 |         or
779 |         - event offsets are within percentage_of_length * annotated event length of each other
780 | 
781 |         Parameters
782 |         ----------
783 |         annotated_event : dict
784 |             Event dict
785 | 
786 |         system_event : dict
787 |             Event dict
788 | 
789 |         t_collar : float > 0
790 |             Defines how close event offsets have to be in order to be considered a match. In seconds.
791 |             (Default value = 0.2)
792 | 
793 |         percentage_of_length : float [0-1]
794 |             Percentage of the annotated event length used as the offset tolerance. (Default value = 0.5)
795 | 
796 |         Returns
797 |         -------
798 |         result : bool
799 |             Condition result
800 | 
801 |         """
802 |         annotated_length = annotated_event['event_offset'] - annotated_event['event_onset']
803 |         return math.fabs(annotated_event['event_offset'] - system_event['event_offset']) <= max(t_collar, percentage_of_length * annotated_length)
804 | 
805 |     def results(self):
806 |         """Get results
807 | 
808 |         Outputs results in dict, format:
809 | 
810 |             {
811 |                 'overall':
812 |                     {
813 |                         'Pre':
814 |                         'Rec':
815 |                         'F':
816 |                         'ER':
817 |                         'S':
818 |                         'D':
819 |                         'I':
820 |                     }
821 |                 'class_wise':
822 |                     {
823 |                         'office': {
824 |                             'Pre':
825 |                             'Rec':
826 |                             'F':
827 |                             'ER':
828 |                             'D':
829 |                             'I':
830 |                             'Nref':
831 |                             'Nsys':
832 |                             'Ntp':
833 |                             'Nfn':
834 |                             'Nfp':
835 |                         },
836 |                     }
837 |                 'class_wise_average':
838 |                     {
839 |                         'F':
840 |                         'ER':
841 |                     }
842 |             }
843 | 
844 |         Parameters
845 |         ----------
846 |         nothing
847 | 
848 |         Returns
849 |         -------
850 |         results : dict
851 |             Results dict
852 | 
853 |         """
854 | 
855 |         results = {
856 |             'overall': {},
857 |             'class_wise': {},
858 |             'class_wise_average': {},
859 |         }
860 | 
861 |         # Overall metrics
862 |         results['overall']['Pre'] = self.overall['Ntp'] / (self.overall['Nsys'] + self.eps)
863 |         results['overall']['Rec'] = self.overall['Ntp'] / self.overall['Nref']
864 |         results['overall']['F'] = 2 * ((results['overall']['Pre'] * results['overall']['Rec']) / (results['overall']['Pre'] + results['overall']['Rec'] + self.eps))
865 | 
866 |         results['overall']['ER'] = (self.overall['Nfn'] + self.overall['Nfp'] + self.overall['Nsubs']) / self.overall['Nref']
867 |         results['overall']['S'] = self.overall['Nsubs'] / self.overall['Nref']
868 |         results['overall']['D'] = self.overall['Nfn'] / self.overall['Nref']
869 |         results['overall']['I'] = self.overall['Nfp'] / self.overall['Nref']
870 | 
871 |         # Class-wise metrics
872 |         class_wise_F = []
873 |         class_wise_ER = []
874 | 
875 |         for class_label in self.class_list:
876 |             if class_label not in
results['class_wise']:
877 |                 results['class_wise'][class_label] = {}
878 | 
879 |             results['class_wise'][class_label]['Pre'] = self.class_wise[class_label]['Ntp'] / (self.class_wise[class_label]['Nsys'] + self.eps)
880 |             results['class_wise'][class_label]['Rec'] = self.class_wise[class_label]['Ntp'] / (self.class_wise[class_label]['Nref'] + self.eps)
881 |             results['class_wise'][class_label]['F'] = 2 * ((results['class_wise'][class_label]['Pre'] * results['class_wise'][class_label]['Rec']) / (results['class_wise'][class_label]['Pre'] + results['class_wise'][class_label]['Rec'] + self.eps))
882 | 
883 |             results['class_wise'][class_label]['ER'] = (self.class_wise[class_label]['Nfn']+self.class_wise[class_label]['Nfp']) / (self.class_wise[class_label]['Nref'] + self.eps)
884 |             results['class_wise'][class_label]['D'] = self.class_wise[class_label]['Nfn'] / (self.class_wise[class_label]['Nref'] + self.eps)
885 |             results['class_wise'][class_label]['I'] = self.class_wise[class_label]['Nfp'] / (self.class_wise[class_label]['Nref'] + self.eps)
886 | 
887 |             results['class_wise'][class_label]['Nref'] = self.class_wise[class_label]['Nref']
888 |             results['class_wise'][class_label]['Nsys'] = self.class_wise[class_label]['Nsys']
889 |             results['class_wise'][class_label]['Ntp'] = self.class_wise[class_label]['Ntp']
890 |             results['class_wise'][class_label]['Nfn'] = self.class_wise[class_label]['Nfn']
891 |             results['class_wise'][class_label]['Nfp'] = self.class_wise[class_label]['Nfp']
892 | 
893 |             class_wise_F.append(results['class_wise'][class_label]['F'])
894 |             class_wise_ER.append(results['class_wise'][class_label]['ER'])
895 | 
896 |         # Class-wise average
897 |         results['class_wise_average']['F'] = numpy.mean(class_wise_F)
898 |         results['class_wise_average']['ER'] = numpy.mean(class_wise_ER)
899 | 
900 |         return results
901 | 
902 | 
903 | class DCASE2013_EventDetection_Metrics(EventDetectionMetrics):
904 |     """Legacy DCASE2013 metrics, converted from the provided Matlab implementation
905 | 
906 |     Supported metrics:
907 |     - Frame based
908 |         - F-score (F)
909 |         - AEER
910 |     - Event based
911 |         - Onset
912 |             - F-Score (F)
913 |             - AEER
914 |         - Onset-offset
915 |             - F-Score (F)
916 |             - AEER
917 |     - Class based
918 |         - Onset
919 |             - F-Score (F)
920 |             - AEER
921 |         - Onset-offset
922 |             - F-Score (F)
923 |             - AEER
924 |     """
925 | 
926 | 
927 | 
928 |     def frame_based(self, annotated_ground_truth, system_output, resolution=0.01):
929 |         # Convert event list into frame-based representation
930 |         system_event_roll = self.list_to_roll(data=system_output, time_resolution=resolution)
931 |         annotated_event_roll = self.list_to_roll(data=annotated_ground_truth, time_resolution=resolution)
932 | 
933 |         # Fix durations of both event_rolls to be equal
934 |         if annotated_event_roll.shape[0] > system_event_roll.shape[0]:
935 |             padding = numpy.zeros((annotated_event_roll.shape[0] - system_event_roll.shape[0], len(self.class_list)))
936 |             system_event_roll = numpy.vstack((system_event_roll, padding))
937 | 
938 |         if system_event_roll.shape[0] > annotated_event_roll.shape[0]:
939 |             padding = numpy.zeros((system_event_roll.shape[0] - annotated_event_roll.shape[0], len(self.class_list)))
940 |             annotated_event_roll = numpy.vstack((annotated_event_roll, padding))
941 | 
942 |         # Compute frame-based metrics
943 |         Nref = sum(sum(annotated_event_roll))
944 |         Ntot = sum(sum(system_event_roll))
945 |         Ntp = sum(sum(system_event_roll + annotated_event_roll > 1))
946 |         Nfp = sum(sum(system_event_roll - annotated_event_roll > 0))
947 |         Nfn =
sum(sum(annotated_event_roll - system_event_roll > 0)) 948 | Nsubs = min(Nfp, Nfn) 949 | 950 | eps = numpy.spacing(1) 951 | 952 | results = dict() 953 | results['Rec'] = Ntp / (Nref + eps) 954 | results['Pre'] = Ntp / (Ntot + eps) 955 | results['F'] = 2 * ((results['Pre'] * results['Rec']) / (results['Pre'] + results['Rec'] + eps)) 956 | results['AEER'] = (Nfn + Nfp + Nsubs) / (Nref + eps) 957 | 958 | return results 959 | 960 | def event_based(self, annotated_ground_truth, system_output): 961 | # Event-based evaluation for event detection task 962 | # outputFile: the output of the event detection system 963 | # GTFile: the ground truth list of events 964 | 965 | # Total number of detected and reference events 966 | Ntot = len(system_output) 967 | Nref = len(annotated_ground_truth) 968 | 969 | # Number of correctly transcribed events, onset within a +/-100 ms range 970 | Ncorr = 0 971 | NcorrOff = 0 972 | for j in range(0, len(annotated_ground_truth)): 973 | for i in range(0, len(system_output)): 974 | if annotated_ground_truth[j]['event_label'] == system_output[i]['event_label'] and (math.fabs(annotated_ground_truth[j]['event_onset'] - system_output[i]['event_onset']) <= 0.1): 975 | Ncorr += 1 976 | 977 | # If offset within a +/-100 ms range or within 50% of ground-truth event's duration 978 | if math.fabs(annotated_ground_truth[j]['event_offset'] - system_output[i]['event_offset']) <= max(0.1, 0.5 * (annotated_ground_truth[j]['event_offset'] - annotated_ground_truth[j]['event_onset'])): 979 | NcorrOff += 1 980 | 981 | break # In order to not evaluate duplicates 982 | 983 | # Compute onset-only event-based metrics 984 | eps = numpy.spacing(1) 985 | results = { 986 | 'onset': {}, 987 | 'onset-offset': {}, 988 | } 989 | 990 | Nfp = Ntot - Ncorr 991 | Nfn = Nref - Ncorr 992 | Nsubs = min(Nfp, Nfn) 993 | results['onset']['Rec'] = Ncorr / (Nref + eps) 994 | results['onset']['Pre'] = Ncorr / (Ntot + eps) 995 | results['onset']['F'] = 2 * ( 996 | (results['onset']['Pre'] * results['onset']['Rec']) / ( 997 | results['onset']['Pre'] + results['onset']['Rec'] + eps)) 998 | results['onset']['AEER'] = (Nfn + Nfp + Nsubs) / (Nref + eps) 999 | 1000 | # Compute onset-offset event-based metrics 1001 | NfpOff = Ntot - NcorrOff 1002 | NfnOff = Nref - NcorrOff 1003 | NsubsOff = min(NfpOff, NfnOff) 1004 | results['onset-offset']['Rec'] = NcorrOff / (Nref + eps) 1005 | results['onset-offset']['Pre'] = NcorrOff / (Ntot + eps) 1006 | results['onset-offset']['F'] = 2 * ((results['onset-offset']['Pre'] * results['onset-offset']['Rec']) / ( 1007 | results['onset-offset']['Pre'] + results['onset-offset']['Rec'] + eps)) 1008 | results['onset-offset']['AEER'] = (NfnOff + NfpOff + NsubsOff) / (Nref + eps) 1009 | 1010 | return results 1011 | 1012 | def class_based(self, annotated_ground_truth, system_output): 1013 | # Class-wise event-based evaluation for event detection task 1014 | # outputFile: the output of the event detection system 1015 | # GTFile: the ground truth list of events 1016 | 1017 | # Total number of detected and reference events per class 1018 | Ntot = numpy.zeros((len(self.class_list), 1)) 1019 | for event in system_output: 1020 | pos = self.class_list.index(event['event_label']) 1021 | Ntot[pos] += 1 1022 | 1023 | Nref = numpy.zeros((len(self.class_list), 1)) 1024 | for event in annotated_ground_truth: 1025 | pos = self.class_list.index(event['event_label']) 1026 | Nref[pos] += 1 1027 | 1028 | I = (Nref > 0).nonzero()[0] # index for classes present in ground-truth 1029 | 1030 | # Number of correctly 
transcribed events per class, onset within a +/-100 ms range 1031 | Ncorr = numpy.zeros((len(self.class_list), 1)) 1032 | NcorrOff = numpy.zeros((len(self.class_list), 1)) 1033 | 1034 | for j in range(0, len(annotated_ground_truth)): 1035 | for i in range(0, len(system_output)): 1036 | if annotated_ground_truth[j]['event_label'] == system_output[i]['event_label'] and ( 1037 | math.fabs( 1038 | annotated_ground_truth[j]['event_onset'] - system_output[i]['event_onset']) <= 0.1): 1039 | pos = self.class_list.index(system_output[i]['event_label']) 1040 | Ncorr[pos] += 1 1041 | 1042 | # If offset within a +/-100 ms range or within 50% of ground-truth event's duration 1043 | if math.fabs(annotated_ground_truth[j]['event_offset'] - system_output[i]['event_offset']) <= max( 1044 | 0.1, 0.5 * ( 1045 | annotated_ground_truth[j]['event_offset'] - annotated_ground_truth[j][ 1046 | 'event_onset'])): 1047 | pos = self.class_list.index(system_output[i]['event_label']) 1048 | NcorrOff[pos] += 1 1049 | 1050 | break # In order to not evaluate duplicates 1051 | 1052 | # Compute onset-only class-wise event-based metrics 1053 | eps = numpy.spacing(1) 1054 | results = { 1055 | 'onset': {}, 1056 | 'onset-offset': {}, 1057 | } 1058 | 1059 | Nfp = Ntot - Ncorr 1060 | Nfn = Nref - Ncorr 1061 | Nsubs = numpy.minimum(Nfp, Nfn) 1062 | tempRec = Ncorr[I] / (Nref[I] + eps) 1063 | tempPre = Ncorr[I] / (Ntot[I] + eps) 1064 | results['onset']['Rec'] = numpy.mean(tempRec) 1065 | results['onset']['Pre'] = numpy.mean(tempPre) 1066 | tempF = 2 * ((tempPre * tempRec) / (tempPre + tempRec + eps)) 1067 | results['onset']['F'] = numpy.mean(tempF) 1068 | tempAEER = (Nfn[I] + Nfp[I] + Nsubs[I]) / (Nref[I] + eps) 1069 | results['onset']['AEER'] = numpy.mean(tempAEER) 1070 | 1071 | # Compute onset-offset class-wise event-based metrics 1072 | NfpOff = Ntot - NcorrOff 1073 | NfnOff = Nref - NcorrOff 1074 | NsubsOff = numpy.minimum(NfpOff, NfnOff) 1075 | tempRecOff = NcorrOff[I] / (Nref[I] + eps) 1076 | tempPreOff = NcorrOff[I] / (Ntot[I] + eps) 1077 | results['onset-offset']['Rec'] = numpy.mean(tempRecOff) 1078 | results['onset-offset']['Pre'] = numpy.mean(tempPreOff) 1079 | tempFOff = 2 * ((tempPreOff * tempRecOff) / (tempPreOff + tempRecOff + eps)) 1080 | results['onset-offset']['F'] = numpy.mean(tempFOff) 1081 | tempAEEROff = (NfnOff[I] + NfpOff[I] + NsubsOff[I]) / (Nref[I] + eps) 1082 | results['onset-offset']['AEER'] = numpy.mean(tempAEEROff) 1083 | 1084 | return results 1085 | 1086 | 1087 | def main(argv): 1088 | # Examples to show usage and required data structures 1089 | class_list = ['class1', 'class2', 'class3'] 1090 | system_output = [ 1091 | { 1092 | 'event_label': 'class1', 1093 | 'event_onset': 0.1, 1094 | 'event_offset': 1.0 1095 | }, 1096 | { 1097 | 'event_label': 'class2', 1098 | 'event_onset': 4.1, 1099 | 'event_offset': 4.7 1100 | }, 1101 | { 1102 | 'event_label': 'class3', 1103 | 'event_onset': 5.5, 1104 | 'event_offset': 6.7 1105 | } 1106 | ] 1107 | annotated_groundtruth = [ 1108 | { 1109 | 'event_label': 'class1', 1110 | 'event_onset': 0.1, 1111 | 'event_offset': 1.0 1112 | }, 1113 | { 1114 | 'event_label': 'class2', 1115 | 'event_onset': 4.2, 1116 | 'event_offset': 5.4 1117 | }, 1118 | { 1119 | 'event_label': 'class3', 1120 | 'event_onset': 5.5, 1121 | 'event_offset': 6.7 1122 | } 1123 | ] 1124 | dcase2013metric = DCASE2013_EventDetection_Metrics(class_list=class_list) 1125 | 1126 | print 'DCASE2013' 1127 | print 'Frame-based:', dcase2013metric.frame_based(system_output=system_output, 1128 | 
annotated_ground_truth=annotated_groundtruth)
1129 |     print 'Event-based:', dcase2013metric.event_based(system_output=system_output,
1130 |                                                        annotated_ground_truth=annotated_groundtruth)
1131 |     print 'Class-based:', dcase2013metric.class_based(system_output=system_output,
1132 |                                                       annotated_ground_truth=annotated_groundtruth)
1133 | 
1134 |     dcase2016_metric = DCASE2016_EventDetection_SegmentBasedMetrics(class_list=class_list)
1135 |     print 'DCASE2016'
1136 |     print dcase2016_metric.evaluate(system_output=system_output, annotated_ground_truth=annotated_groundtruth).results()
1137 | 
1138 | 
1139 | if __name__ == "__main__":
1140 |     sys.exit(main(sys.argv))
1141 | 
--------------------------------------------------------------------------------
/task3_gmm_baseline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # DCASE 2016::Sound Event Detection in Real-life Audio / Baseline System
5 | 
6 | from src.ui import *
7 | from src.general import *
8 | from src.files import *
9 | 
10 | from src.features import *
11 | from src.sound_event_detection import *
12 | from src.dataset import *
13 | from src.evaluation import *
14 | 
15 | import numpy
16 | import csv
17 | import warnings
18 | import argparse
19 | import textwrap
20 | import math
21 | 
22 | from sklearn import mixture
23 | 
24 | __version_info__ = ('1', '0', '1')
25 | __version__ = '.'.join(__version_info__)
26 | 
27 | 
28 | def main(argv):
29 |     numpy.random.seed(123456)  # let's make randomization predictable
30 | 
31 |     parser = argparse.ArgumentParser(
32 |         prefix_chars='-+',
33 |         formatter_class=argparse.RawDescriptionHelpFormatter,
34 |         description=textwrap.dedent('''\
35 |             DCASE 2016
36 |             Task 3: Sound Event Detection in Real-life Audio
37 |             Baseline System
38 |             ---------------------------------------------
39 |                 Tampere University of Technology / Audio Research Group
40 |                 Author:  Toni Heittola ( toni.heittola@tut.fi )
41 | 
42 |             System description
43 |                 This is a baseline implementation for the DCASE 2016, task 3 - Sound event detection in real life audio.
44 |                 The system has a binary classifier for each included sound event class. The GMM classifier is trained with
45 |                 the positive and negative examples from the mixture signals, and classification is done between these
46 |                 two models as a likelihood ratio. Acoustic features are MFCC+Delta+Acceleration (MFCC0 omitted).
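                (Roughly: a frame is declared active for an event class when the
                log-likelihood under the positive model exceeds the log-likelihood under the
                negative model by a detector threshold; see src/sound_event_detection.py for
                the exact decision rule.)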
47 | 48 | ''')) 49 | 50 | parser.add_argument("-development", help="Use the system in the development mode", action='store_true', 51 | default=False, dest='development') 52 | parser.add_argument("-challenge", help="Use the system in the challenge mode", action='store_true', 53 | default=False, dest='challenge') 54 | 55 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) 56 | args = parser.parse_args() 57 | 58 | # Load parameters from config file 59 | parameter_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 60 | os.path.splitext(os.path.basename(__file__))[0]+'.yaml') 61 | params = load_parameters(parameter_file) 62 | params = process_parameters(params) 63 | make_folders(params) 64 | 65 | title("DCASE 2016::Sound Event Detection in Real-life Audio / Baseline System") 66 | 67 | # Check if mode is defined 68 | if not (args.development or args.challenge): 69 | args.development = True 70 | args.challenge = False 71 | 72 | dataset_evaluation_mode = 'folds' 73 | if args.development and not args.challenge: 74 | print "Running system in development mode" 75 | dataset_evaluation_mode = 'folds' 76 | elif not args.development and args.challenge: 77 | print "Running system in challenge mode" 78 | dataset_evaluation_mode = 'full' 79 | 80 | # Get dataset container class 81 | dataset = eval(params['general']['development_dataset'])(data_path=params['path']['data']) 82 | 83 | # Fetch data over internet and setup the data 84 | # ================================================== 85 | if params['flow']['initialize']: 86 | dataset.fetch() 87 | 88 | # Extract features for all audio files in the dataset 89 | # ================================================== 90 | if params['flow']['extract_features']: 91 | section_header('Feature extraction [Development data]') 92 | 93 | # Collect files from evaluation sets 94 | files = [] 95 | for fold in dataset.folds(mode=dataset_evaluation_mode): 96 | for item_id, item in enumerate(dataset.train(fold)): 97 | if item['file'] not in files: 98 | files.append(item['file']) 99 | for item_id, item in enumerate(dataset.test(fold)): 100 | if item['file'] not in files: 101 | files.append(item['file']) 102 | 103 | # Go through files and make sure all features are extracted 104 | do_feature_extraction(files=files, 105 | dataset=dataset, 106 | feature_path=params['path']['features'], 107 | params=params['features'], 108 | overwrite=params['general']['overwrite']) 109 | 110 | foot() 111 | 112 | # Prepare feature normalizers 113 | # ================================================== 114 | if params['flow']['feature_normalizer']: 115 | section_header('Feature normalizer [Development data]') 116 | 117 | do_feature_normalization(dataset=dataset, 118 | feature_normalizer_path=params['path']['feature_normalizers'], 119 | feature_path=params['path']['features'], 120 | dataset_evaluation_mode=dataset_evaluation_mode, 121 | overwrite=params['general']['overwrite']) 122 | 123 | foot() 124 | 125 | # System training 126 | # ================================================== 127 | if params['flow']['train_system']: 128 | section_header('System training [Development data]') 129 | 130 | do_system_training(dataset=dataset, 131 | model_path=params['path']['models'], 132 | feature_normalizer_path=params['path']['feature_normalizers'], 133 | feature_path=params['path']['features'], 134 | hop_length_seconds=params['features']['hop_length_seconds'], 135 | classifier_params=params['classifier']['parameters'], 136 | 
dataset_evaluation_mode=dataset_evaluation_mode, 137 | classifier_method=params['classifier']['method'], 138 | overwrite=params['general']['overwrite'] 139 | ) 140 | 141 | foot() 142 | 143 | # System evaluation in development mode 144 | if args.development and not args.challenge: 145 | 146 | # System testing 147 | # ================================================== 148 | if params['flow']['test_system']: 149 | section_header('System testing [Development data]') 150 | 151 | do_system_testing(dataset=dataset, 152 | result_path=params['path']['results'], 153 | feature_path=params['path']['features'], 154 | model_path=params['path']['models'], 155 | feature_params=params['features'], 156 | detector_params=params['detector'], 157 | dataset_evaluation_mode=dataset_evaluation_mode, 158 | classifier_method=params['classifier']['method'], 159 | overwrite=params['general']['overwrite'] 160 | ) 161 | foot() 162 | 163 | # System evaluation 164 | # ================================================== 165 | if params['flow']['evaluate_system']: 166 | section_header('System evaluation [Development data]') 167 | 168 | do_system_evaluation(dataset=dataset, 169 | dataset_evaluation_mode=dataset_evaluation_mode, 170 | result_path=params['path']['results']) 171 | 172 | foot() 173 | 174 | # System evaluation with challenge data 175 | elif not args.development and args.challenge: 176 | # Fetch data over internet and setup the data 177 | challenge_dataset = eval(params['general']['challenge_dataset'])(data_path=params['path']['data']) 178 | 179 | if params['flow']['initialize']: 180 | challenge_dataset.fetch() 181 | 182 | # System testing 183 | if params['flow']['test_system']: 184 | section_header('System testing [Challenge data]') 185 | 186 | do_system_testing(dataset=challenge_dataset, 187 | result_path=params['path']['challenge_results'], 188 | feature_path=params['path']['features'], 189 | model_path=params['path']['models'], 190 | feature_params=params['features'], 191 | detector_params=params['detector'], 192 | dataset_evaluation_mode=dataset_evaluation_mode, 193 | classifier_method=params['classifier']['method'], 194 | overwrite=True 195 | ) 196 | foot() 197 | 198 | print " " 199 | print "Your results for the challenge data are stored at ["+params['path']['challenge_results']+"]" 200 | print " " 201 | 202 | 203 | def process_parameters(params): 204 | """Parameter post-processing. 
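
    Expands the parameter hashes into storage paths so that each parameter
    combination gets its own directory tree, for example (hash values
    illustrative):

        <base>/features/<features_hash>/
        <base>/models/<features_hash>/<classifier_hash>/
        <base>/results/<features_hash>/<classifier_hash>/<detector_hash>/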
205 | 206 | Parameters 207 | ---------- 208 | params : dict 209 | parameters in dict 210 | 211 | Returns 212 | ------- 213 | params : dict 214 | processed parameters 215 | 216 | """ 217 | 218 | params['features']['mfcc']['win_length'] = int(params['features']['win_length_seconds'] * params['features']['fs']) 219 | params['features']['mfcc']['hop_length'] = int(params['features']['hop_length_seconds'] * params['features']['fs']) 220 | 221 | # Copy parameters for current classifier method 222 | params['classifier']['parameters'] = params['classifier_parameters'][params['classifier']['method']] 223 | 224 | # Hash 225 | params['features']['hash'] = get_parameter_hash(params['features']) 226 | params['classifier']['hash'] = get_parameter_hash(params['classifier']) 227 | params['detector']['hash'] = get_parameter_hash(params['detector']) 228 | 229 | # Paths 230 | params['path']['data'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), params['path']['data']) 231 | params['path']['base'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), params['path']['base']) 232 | 233 | # Features 234 | params['path']['features_'] = params['path']['features'] 235 | params['path']['features'] = os.path.join(params['path']['base'], 236 | params['path']['features'], 237 | params['features']['hash']) 238 | 239 | # Feature normalizers 240 | params['path']['feature_normalizers_'] = params['path']['feature_normalizers'] 241 | params['path']['feature_normalizers'] = os.path.join(params['path']['base'], 242 | params['path']['feature_normalizers'], 243 | params['features']['hash']) 244 | 245 | # Models 246 | # Save parameters into folders to help manual browsing of files. 247 | params['path']['models_'] = params['path']['models'] 248 | params['path']['models'] = os.path.join(params['path']['base'], 249 | params['path']['models'], 250 | params['features']['hash'], 251 | params['classifier']['hash']) 252 | 253 | # Results 254 | params['path']['results_'] = params['path']['results'] 255 | params['path']['results'] = os.path.join(params['path']['base'], 256 | params['path']['results'], 257 | params['features']['hash'], 258 | params['classifier']['hash'], 259 | params['detector']['hash']) 260 | return params 261 | 262 | 263 | def make_folders(params, parameter_filename='parameters.yaml'): 264 | """Create all needed folders, and saves parameters in yaml-file for easier manual browsing of data. 265 | 266 | Parameters 267 | ---------- 268 | params : dict 269 | parameters in dict 270 | 271 | parameter_filename : str 272 | filename to save parameters used to generate the folder name 273 | 274 | Returns 275 | ------- 276 | nothing 277 | 278 | """ 279 | 280 | # Check that target path exists, create if not 281 | check_path(params['path']['features']) 282 | check_path(params['path']['feature_normalizers']) 283 | check_path(params['path']['models']) 284 | check_path(params['path']['results']) 285 | 286 | # Save parameters into folders to help manual browsing of files. 
287 | 288 | # Features 289 | feature_parameter_filename = os.path.join(params['path']['features'], parameter_filename) 290 | if not os.path.isfile(feature_parameter_filename): 291 | save_parameters(feature_parameter_filename, params['features']) 292 | 293 | # Feature normalizers 294 | feature_normalizer_parameter_filename = os.path.join(params['path']['feature_normalizers'], parameter_filename) 295 | if not os.path.isfile(feature_normalizer_parameter_filename): 296 | save_parameters(feature_normalizer_parameter_filename, params['features']) 297 | 298 | # Models 299 | model_features_parameter_filename = os.path.join(params['path']['base'], 300 | params['path']['models_'], 301 | params['features']['hash'], 302 | parameter_filename) 303 | if not os.path.isfile(model_features_parameter_filename): 304 | save_parameters(model_features_parameter_filename, params['features']) 305 | 306 | model_models_parameter_filename = os.path.join(params['path']['base'], 307 | params['path']['models_'], 308 | params['features']['hash'], 309 | params['classifier']['hash'], 310 | parameter_filename) 311 | if not os.path.isfile(model_models_parameter_filename): 312 | save_parameters(model_models_parameter_filename, params['classifier']) 313 | 314 | # Results 315 | # Save parameters into folders to help manual browsing of files. 316 | result_features_parameter_filename = os.path.join(params['path']['base'], 317 | params['path']['results_'], 318 | params['features']['hash'], 319 | parameter_filename) 320 | if not os.path.isfile(result_features_parameter_filename): 321 | save_parameters(result_features_parameter_filename, params['features']) 322 | 323 | result_models_parameter_filename = os.path.join(params['path']['base'], 324 | params['path']['results_'], 325 | params['features']['hash'], 326 | params['classifier']['hash'], 327 | parameter_filename) 328 | if not os.path.isfile(result_models_parameter_filename): 329 | save_parameters(result_models_parameter_filename, params['classifier']) 330 | 331 | result_detector_parameter_filename = os.path.join(params['path']['base'], 332 | params['path']['results_'], 333 | params['features']['hash'], 334 | params['classifier']['hash'], 335 | params['detector']['hash'], 336 | parameter_filename) 337 | if not os.path.isfile(result_detector_parameter_filename): 338 | save_parameters(result_detector_parameter_filename, params['detector']) 339 | 340 | 341 | def get_feature_filename(audio_file, path, extension='cpickle'): 342 | """Get feature filename 343 | 344 | Parameters 345 | ---------- 346 | audio_file : str 347 | audio file name from which the features are extracted 348 | 349 | path : str 350 | feature path 351 | 352 | extension : str 353 | file extension 354 | (Default value='cpickle') 355 | 356 | Returns 357 | ------- 358 | feature_filename : str 359 | full feature filename 360 | 361 | """ 362 | 363 | return os.path.join(path, 'sequence_' + os.path.splitext(audio_file)[0] + '.' 
+ extension)
364 | 
365 | 
366 | def get_feature_normalizer_filename(fold, scene_label, path, extension='cpickle'):
367 |     """Get normalizer filename
368 | 
369 |     Parameters
370 |     ----------
371 |     fold : int >= 0
372 |         evaluation fold number
373 | 
374 |     scene_label : str
375 |         scene label
376 | 
377 |     path : str
378 |         normalizer path
379 | 
380 |     extension : str
381 |         file extension
382 |         (Default value='cpickle')
383 | 
384 |     Returns
385 |     -------
386 |     normalizer_filename : str
387 |         full normalizer filename
388 | 
389 |     """
390 | 
391 |     return os.path.join(path, 'scale_fold' + str(fold) + '_' + str(scene_label) + '.' + extension)
392 | 
393 | 
394 | def get_model_filename(fold, scene_label, path, extension='cpickle'):
395 |     """Get model filename
396 | 
397 |     Parameters
398 |     ----------
399 |     fold : int >= 0
400 |         evaluation fold number
401 | 
402 |     scene_label : str
403 |         scene label
404 | 
405 |     path : str
406 |         model path
407 | 
408 |     extension : str
409 |         file extension
410 |         (Default value='cpickle')
411 | 
412 |     Returns
413 |     -------
414 |     model_filename : str
415 |         full model filename
416 | 
417 |     """
418 | 
419 |     return os.path.join(path, 'model_fold' + str(fold) + '_' + str(scene_label) + '.' + extension)
420 | 
421 | 
422 | def get_result_filename(fold, scene_label, path, extension='txt'):
423 |     """Get result filename
424 | 
425 |     Parameters
426 |     ----------
427 |     fold : int >= 0
428 |         evaluation fold number
429 | 
430 |     scene_label : str
431 |         scene label
432 | 
433 |     path : str
434 |         result path
435 | 
436 |     extension : str
437 |         file extension
438 |         (Default value='txt')
439 | 
440 |     Returns
441 |     -------
442 |     result_filename : str
443 |         full result filename
444 | 
445 |     """
446 | 
447 |     if fold == 0:
448 |         return os.path.join(path, 'results_' + str(scene_label) + '.' + extension)
449 |     else:
450 |         return os.path.join(path, 'results_fold' + str(fold) + '_' + str(scene_label) + '.' + extension)
451 | 
452 | 
453 | def do_feature_extraction(files, dataset, feature_path, params, overwrite=False):
454 |     """Feature extraction
455 | 
456 |     Parameters
457 |     ----------
458 |     files : list
459 |         file list
460 | 
461 |     dataset : class
462 |         dataset class
463 | 
464 |     feature_path : str
465 |         path where the features are saved
466 | 
467 |     params : dict
468 |         parameter dict
469 | 
470 |     overwrite : bool
471 |         overwrite existing feature files
472 |         (Default value=False)
473 | 
474 |     Returns
475 |     -------
476 |     nothing
477 | 
478 |     Raises
479 |     -------
480 |     IOError
481 |         Audio file not found.
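
    Examples
    --------

    Typical call, mirroring the usage in main() (``files`` collected from the
    dataset train/test file lists):

    >>> do_feature_extraction(files=files,
    >>>                       dataset=dataset,
    >>>                       feature_path=params['path']['features'],
    >>>                       params=params['features'],
    >>>                       overwrite=params['general']['overwrite'])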
482 | 
483 |     """
484 | 
485 |     for file_id, audio_filename in enumerate(files):
486 |         # Get feature filename
487 |         current_feature_file = get_feature_filename(audio_file=os.path.split(audio_filename)[1], path=feature_path)
488 | 
489 |         progress(title_text='Extracting [sequences]',
490 |                  percentage=(float(file_id) / len(files)),
491 |                  note=os.path.split(audio_filename)[1])
492 | 
493 |         if not os.path.isfile(current_feature_file) or overwrite:
494 |             # Load audio
495 |             if os.path.isfile(dataset.relative_to_absolute_path(audio_filename)):
496 |                 y, fs = load_audio(filename=dataset.relative_to_absolute_path(audio_filename), mono=True, fs=params['fs'])
497 |             else:
498 |                 raise IOError("Audio file not found [%s]" % audio_filename)
499 | 
500 |             # Extract features
501 |             feature_data = feature_extraction(y=y,
502 |                                               fs=fs,
503 |                                               include_mfcc0=params['include_mfcc0'],
504 |                                               include_delta=params['include_delta'],
505 |                                               include_acceleration=params['include_acceleration'],
506 |                                               mfcc_params=params['mfcc'],
507 |                                               delta_params=params['mfcc_delta'],
508 |                                               acceleration_params=params['mfcc_acceleration'])
509 |             # Save
510 |             save_data(current_feature_file, feature_data)
511 | 
512 | 
513 | def do_feature_normalization(dataset, feature_normalizer_path, feature_path, dataset_evaluation_mode='folds', overwrite=False):
514 |     """Feature normalization
515 | 
516 |     Calculates normalization factors for each evaluation fold based on the training material available.
517 | 
518 |     Parameters
519 |     ----------
520 |     dataset : class
521 |         dataset class
522 | 
523 |     feature_normalizer_path : str
524 |         path where the feature normalizers are saved.
525 | 
526 |     feature_path : str
527 |         path where the features are saved.
528 | 
529 |     dataset_evaluation_mode : str ['folds', 'full']
530 |         evaluation mode, 'full' all material available is considered to belong to one fold.
531 |         (Default value='folds')
532 | 
533 |     overwrite : bool
534 |         overwrite existing normalizers
535 |         (Default value=False)
536 | 
537 |     Returns
538 |     -------
539 |     nothing
540 | 
541 |     Raises
542 |     -------
543 |     IOError
544 |         Feature file not found.
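
    Examples
    --------

    Typical call, mirroring the usage in main():

    >>> do_feature_normalization(dataset=dataset,
    >>>                          feature_normalizer_path=params['path']['feature_normalizers'],
    >>>                          feature_path=params['path']['features'],
    >>>                          dataset_evaluation_mode=dataset_evaluation_mode,
    >>>                          overwrite=params['general']['overwrite'])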
545 | 546 | """ 547 | 548 | for fold in dataset.folds(mode=dataset_evaluation_mode): 549 | for scene_id, scene_label in enumerate(dataset.scene_labels): 550 | current_normalizer_file = get_feature_normalizer_filename(fold=fold, scene_label=scene_label, path=feature_normalizer_path) 551 | 552 | if not os.path.isfile(current_normalizer_file) or overwrite: 553 | # Collect sequence files from scene class 554 | files = [] 555 | for item_id, item in enumerate(dataset.train(fold, scene_label=scene_label)): 556 | if item['file'] not in files: 557 | files.append(item['file']) 558 | 559 | file_count = len(files) 560 | 561 | # Initialize statistics 562 | normalizer = FeatureNormalizer() 563 | 564 | for file_id, audio_filename in enumerate(files): 565 | progress(title_text='Collecting data', 566 | fold=fold, 567 | percentage=(float(file_id) / file_count), 568 | note=os.path.split(audio_filename)[1]) 569 | 570 | # Load features 571 | feature_filename = get_feature_filename(audio_file=os.path.split(audio_filename)[1], path=feature_path) 572 | if os.path.isfile(feature_filename): 573 | feature_data = load_data(feature_filename)['stat'] 574 | else: 575 | raise IOError("Feature file not found [%s]" % audio_filename) 576 | 577 | # Accumulate statistics 578 | normalizer.accumulate(feature_data) 579 | 580 | # Calculate normalization factors 581 | normalizer.finalize() 582 | 583 | # Save 584 | save_data(current_normalizer_file, normalizer) 585 | 586 | 587 | def do_system_training(dataset, model_path, feature_normalizer_path, feature_path, hop_length_seconds, classifier_params, 588 | dataset_evaluation_mode='folds', classifier_method='gmm', overwrite=False): 589 | """System training 590 | 591 | Train a model pair for each sound event class, one for activity and one for inactivity. 592 | 593 | model container format: 594 | 595 | { 596 | 'normalizer': normalizer class 597 | 'models' : 598 | { 599 | 'mouse click' : 600 | { 601 | 'positive': mixture.GMM class, 602 | 'negative': mixture.GMM class 603 | } 604 | 'keyboard typing' : 605 | { 606 | 'positive': mixture.GMM class, 607 | 'negative': mixture.GMM class 608 | } 609 | ... 610 | } 611 | } 612 | 613 | Parameters 614 | ---------- 615 | dataset : class 616 | dataset class 617 | 618 | model_path : str 619 | path where the models are saved. 620 | 621 | feature_normalizer_path : str 622 | path where the feature normalizers are saved. 623 | 624 | feature_path : str 625 | path where the features are saved. 626 | 627 | hop_length_seconds : float > 0 628 | feature frame hop length in seconds 629 | 630 | classifier_params : dict 631 | parameter dict 632 | 633 | dataset_evaluation_mode : str ['folds', 'full'] 634 | evaluation mode, 'full' all material available is considered to belong to one fold. 635 | (Default value='folds') 636 | 637 | classifier_method : str ['gmm'] 638 | classifier method, currently only GMM supported 639 | (Default value='gmm') 640 | 641 | overwrite : bool 642 | overwrite existing models 643 | (Default value=False) 644 | 645 | Returns 646 | ------- 647 | nothing 648 | 649 | Raises 650 | ------- 651 | ValueError 652 | classifier_method is unknown. 653 | 654 | IOError 655 | Feature normalizer not found. 656 | Feature file not found. 
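
    Examples
    --------

    Typical call, mirroring the usage in main():

    >>> do_system_training(dataset=dataset,
    >>>                    model_path=params['path']['models'],
    >>>                    feature_normalizer_path=params['path']['feature_normalizers'],
    >>>                    feature_path=params['path']['features'],
    >>>                    hop_length_seconds=params['features']['hop_length_seconds'],
    >>>                    classifier_params=params['classifier']['parameters'],
    >>>                    dataset_evaluation_mode=dataset_evaluation_mode,
    >>>                    classifier_method=params['classifier']['method'],
    >>>                    overwrite=params['general']['overwrite'])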
657 | 658 | """ 659 | 660 | if classifier_method != 'gmm': 661 | raise ValueError("Unknown classifier method ["+classifier_method+"]") 662 | 663 | for fold in dataset.folds(mode=dataset_evaluation_mode): 664 | for scene_id, scene_label in enumerate(dataset.scene_labels): 665 | current_model_file = get_model_filename(fold=fold, scene_label=scene_label, path=model_path) 666 | if not os.path.isfile(current_model_file) or overwrite: 667 | 668 | # Load normalizer 669 | feature_normalizer_filename = get_feature_normalizer_filename(fold=fold, scene_label=scene_label, path=feature_normalizer_path) 670 | if os.path.isfile(feature_normalizer_filename): 671 | normalizer = load_data(feature_normalizer_filename) 672 | else: 673 | raise IOError("Feature normalizer not found [%s]" % feature_normalizer_filename) 674 | 675 | # Initialize model container 676 | model_container = {'normalizer': normalizer, 'models': {}} 677 | 678 | # Restructure training data in to structure[files][events] 679 | ann = {} 680 | for item_id, item in enumerate(dataset.train(fold=fold, scene_label=scene_label)): 681 | filename = os.path.split(item['file'])[1] 682 | if filename not in ann: 683 | ann[filename] = {} 684 | if item['event_label'] not in ann[filename]: 685 | ann[filename][item['event_label']] = [] 686 | ann[filename][item['event_label']].append((item['event_onset'], item['event_offset'])) 687 | 688 | # Collect training examples 689 | data_positive = {} 690 | data_negative = {} 691 | file_count = len(ann) 692 | for item_id, audio_filename in enumerate(ann): 693 | progress(title_text='Collecting data', 694 | fold=fold, 695 | percentage=(float(item_id) / file_count), 696 | note=scene_label+" / "+os.path.split(audio_filename)[1]) 697 | 698 | # Load features 699 | feature_filename = get_feature_filename(audio_file=audio_filename, path=feature_path) 700 | if os.path.isfile(feature_filename): 701 | feature_data = load_data(feature_filename)['feat'] 702 | else: 703 | raise IOError("Feature file not found [%s]" % feature_filename) 704 | 705 | # Normalize features 706 | feature_data = model_container['normalizer'].normalize(feature_data) 707 | 708 | for event_label in ann[audio_filename]: 709 | positive_mask = numpy.zeros((feature_data.shape[0]), dtype=bool) 710 | 711 | for event in ann[audio_filename][event_label]: 712 | start_frame = int(math.floor(event[0] / hop_length_seconds)) 713 | stop_frame = int(math.ceil(event[1] / hop_length_seconds)) 714 | 715 | if stop_frame > feature_data.shape[0]: 716 | stop_frame = feature_data.shape[0] 717 | 718 | positive_mask[start_frame:stop_frame] = True 719 | 720 | # Store positive examples 721 | if event_label not in data_positive: 722 | data_positive[event_label] = feature_data[positive_mask, :] 723 | else: 724 | data_positive[event_label] = numpy.vstack((data_positive[event_label], feature_data[positive_mask, :])) 725 | 726 | # Store negative examples 727 | if event_label not in data_negative: 728 | data_negative[event_label] = feature_data[~positive_mask, :] 729 | else: 730 | data_negative[event_label] = numpy.vstack((data_negative[event_label], feature_data[~positive_mask, :])) 731 | 732 | # Train models for each class 733 | for event_label in data_positive: 734 | progress(title_text='Train models', 735 | fold=fold, 736 | note=scene_label+" / "+event_label) 737 | if classifier_method == 'gmm': 738 | model_container['models'][event_label] = {} 739 | model_container['models'][event_label]['positive'] = mixture.GMM(**classifier_params).fit(data_positive[event_label]) 740 | 
model_container['models'][event_label]['negative'] = mixture.GMM(**classifier_params).fit(data_negative[event_label])
741 |                     else:
742 |                         raise ValueError("Unknown classifier method ["+classifier_method+"]")
743 | 
744 |                 # Save models
745 |                 save_data(current_model_file, model_container)
746 | 
747 | 
748 | def do_system_testing(dataset, result_path, feature_path, model_path, feature_params, detector_params,
749 |                       dataset_evaluation_mode='folds', classifier_method='gmm', overwrite=False):
750 |     """System testing.
751 | 
752 |     If extracted features are not found from disk, they are extracted but not saved.
753 | 
754 |     Parameters
755 |     ----------
756 |     dataset : class
757 |         dataset class
758 | 
759 |     result_path : str
760 |         path where the results are saved.
761 | 
762 |     feature_path : str
763 |         path where the features are saved.
764 | 
765 |     model_path : str
766 |         path where the models are saved.
767 | 
768 |     feature_params : dict
769 |         parameter dict
770 | 
771 |     dataset_evaluation_mode : str ['folds', 'full']
772 |         evaluation mode, 'full' all material available is considered to belong to one fold.
773 |         (Default value='folds')
774 | 
775 |     classifier_method : str ['gmm']
776 |         classifier method, currently only GMM supported
777 |         (Default value='gmm')
778 | 
779 |     overwrite : bool
780 |         overwrite existing models
781 |         (Default value=False)
782 | 
783 |     Returns
784 |     -------
785 |     nothing
786 | 
787 |     Raises
788 |     -------
789 |     ValueError
790 |         classifier_method is unknown.
791 | 
792 |     IOError
793 |         Model file not found.
794 |         Audio file not found.
795 | 
796 |     """
797 | 
798 |     if classifier_method != 'gmm':
799 |         raise ValueError("Unknown classifier method ["+classifier_method+"]")
800 | 
801 |     # Check that target path exists, create if not
802 |     check_path(result_path)
803 | 
804 |     for fold in dataset.folds(mode=dataset_evaluation_mode):
805 |         for scene_id, scene_label in enumerate(dataset.scene_labels):
806 |             current_result_file = get_result_filename(fold=fold, scene_label=scene_label, path=result_path)
807 | 
808 |             if not os.path.isfile(current_result_file) or overwrite:
809 |                 results = []
810 | 
811 |                 # Load class model container
812 |                 model_filename = get_model_filename(fold=fold, scene_label=scene_label, path=model_path)
813 |                 if os.path.isfile(model_filename):
814 |                     model_container = load_data(model_filename)
815 |                 else:
816 |                     raise IOError("Model file not found [%s]" % model_filename)
817 | 
818 |                 file_count = len(dataset.test(fold, scene_label=scene_label))
819 |                 for file_id, item in enumerate(dataset.test(fold=fold, scene_label=scene_label)):
820 |                     progress(title_text='Testing',
821 |                              fold=fold,
822 |                              percentage=(float(file_id) / file_count),
823 |                              note=scene_label+" / "+os.path.split(item['file'])[1])
824 | 
825 |                     # Load features
826 |                     feature_filename = get_feature_filename(audio_file=item['file'], path=feature_path)
827 | 
828 |                     if os.path.isfile(feature_filename):
829 |                         feature_data = load_data(feature_filename)['feat']
830 |                     else:
831 |                         # Load audio
832 |                         if os.path.isfile(dataset.relative_to_absolute_path(item['file'])):
833 |                             y, fs = load_audio(filename=dataset.relative_to_absolute_path(item['file']), mono=True, fs=feature_params['fs'])
834 |                         else:
835 |                             raise IOError("Audio file not found [%s]" % item['file'])
836 | 
837 |                         # Extract features
838 |                         feature_data = feature_extraction(y=y,
839 |                                                           fs=fs,
840 |                                                           include_mfcc0=feature_params['include_mfcc0'],
841 |                                                           include_delta=feature_params['include_delta'],
842 |                                                           include_acceleration=feature_params['include_acceleration'],
843 |                                                           mfcc_params=feature_params['mfcc'],
844 | 
delta_params=feature_params['mfcc_delta'], 845 | acceleration_params=feature_params['mfcc_acceleration'], 846 | statistics=False)['feat'] 847 | 848 | # Normalize features 849 | feature_data = model_container['normalizer'].normalize(feature_data) 850 | 851 | current_results = event_detection(feature_data=feature_data, 852 | model_container=model_container, 853 | hop_length_seconds=feature_params['hop_length_seconds'], 854 | smoothing_window_length_seconds=detector_params['smoothing_window_length'], 855 | decision_threshold=detector_params['decision_threshold'], 856 | minimum_event_length=detector_params['minimum_event_length'], 857 | minimum_event_gap=detector_params['minimum_event_gap']) 858 | 859 | # Store the result 860 | for event in current_results: 861 | results.append((dataset.absolute_to_relative(item['file']), event[0], event[1], event[2] )) 862 | 863 | # Save testing results 864 | with open(current_result_file, 'wt') as f: 865 | writer = csv.writer(f, delimiter='\t') 866 | for result_item in results: 867 | writer.writerow(result_item) 868 | 869 | 870 | def do_system_evaluation(dataset, result_path, dataset_evaluation_mode='folds'): 871 | """System evaluation. Testing outputs are collected and evaluated. Evaluation results are printed. 872 | 873 | Parameters 874 | ---------- 875 | dataset : class 876 | dataset class 877 | 878 | result_path : str 879 | path where the results are saved. 880 | 881 | dataset_evaluation_mode : str ['folds', 'full'] 882 | evaluation mode, 'full' all material available is considered to belong to one fold. 883 | (Default value='folds') 884 | 885 | Returns 886 | ------- 887 | nothing 888 | 889 | Raises 890 | ------- 891 | IOError 892 | Result file not found 893 | 894 | """ 895 | 896 | # Set warnings off, sklearn metrics will trigger warning for classes without 897 | # predicted samples in F1-scoring. This is just to keep printing clean. 
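event_detection, called in the testing loop above, is implemented in src/sound_event_detection.py (not shown in this dump). A minimal sketch of the frame-wise decision it is assumed to derive from each positive/negative GMM pair, before the smoothing-window and minimum event length/gap post-processing controlled by the detector parameters:

import numpy

def frame_activity_sketch(feature_data, model_container, decision_threshold):
    # sklearn's GMM.score() returns the per-frame log-likelihood, so the
    # difference below is a frame-wise log-likelihood ratio.
    activity = {}
    for event_label, models in model_container['models'].items():
        llr = models['positive'].score(feature_data) - models['negative'].score(feature_data)
        activity[event_label] = llr > decision_threshold  # boolean mask per frame
    return activity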
898 | warnings.simplefilter("ignore") 899 | 900 | overall_metrics_per_scene = {} 901 | 902 | for scene_id, scene_label in enumerate(dataset.scene_labels): 903 | if scene_label not in overall_metrics_per_scene: 904 | overall_metrics_per_scene[scene_label] = {} 905 | 906 | dcase2016_segment_based_metric = DCASE2016_EventDetection_SegmentBasedMetrics(class_list=dataset.event_labels(scene_label=scene_label)) 907 | dcase2016_event_based_metric = DCASE2016_EventDetection_EventBasedMetrics(class_list=dataset.event_labels(scene_label=scene_label)) 908 | 909 | for fold in dataset.folds(mode=dataset_evaluation_mode): 910 | results = [] 911 | result_filename = get_result_filename(fold=fold, scene_label=scene_label, path=result_path) 912 | 913 | if os.path.isfile(result_filename): 914 | with open(result_filename, 'rt') as f: 915 | for row in csv.reader(f, delimiter='\t'): 916 | results.append(row) 917 | else: 918 | raise IOError("Result file not found [%s]" % result_filename) 919 | 920 | for file_id, item in enumerate(dataset.test(fold, scene_label=scene_label)): 921 | current_file_results = [] 922 | for result_line in results: 923 | if len(result_line) != 0 and result_line[0] == dataset.absolute_to_relative(item['file']): 924 | current_file_results.append( 925 | {'file': result_line[0], 926 | 'event_onset': float(result_line[1]), 927 | 'event_offset': float(result_line[2]), 928 | 'event_label': result_line[3].rstrip() 929 | } 930 | ) 931 | meta = dataset.file_meta(dataset.absolute_to_relative(item['file'])) 932 | 933 | dcase2016_segment_based_metric.evaluate(system_output=current_file_results, annotated_ground_truth=meta) 934 | dcase2016_event_based_metric.evaluate(system_output=current_file_results, annotated_ground_truth=meta) 935 | 936 | overall_metrics_per_scene[scene_label]['segment_based_metrics'] = dcase2016_segment_based_metric.results() 937 | overall_metrics_per_scene[scene_label]['event_based_metrics'] = dcase2016_event_based_metric.results() 938 | 939 | print " Evaluation over %d folds" % dataset.fold_count 940 | print " " 941 | print " Results per scene " 942 | print " {:18s} | {:5s} | | {:39s} ".format('', 'Main', 'Secondary metrics') 943 | print " {:18s} | {:5s} | | {:38s} | {:14s} | {:14s} | {:14s} ".format('', '', 'Seg/Overall','Seg/Class', 'Event/Overall','Event/Class') 944 | print " {:18s} | {:5s} | | {:6s} : {:5s} : {:5s} : {:5s} : {:5s} | {:6s} : {:5s} | {:6s} : {:5s} | {:6s} : {:5s} |".format('Scene', 'ER', 'F1', 'ER', 'ER/S', 'ER/D', 'ER/I', 'F1', 'ER', 'F1', 'ER', 'F1', 'ER') 945 | print " -------------------+-------+ +--------+-------+-------+-------+-------+--------+-------+--------+-------+--------+-------+" 946 | averages = { 947 | 'segment_based_metrics': { 948 | 'overall': { 949 | 'ER': [], 950 | 'F': [], 951 | }, 952 | 'class_wise_average': { 953 | 'ER': [], 954 | 'F': [], 955 | } 956 | }, 957 | 'event_based_metrics': { 958 | 'overall': { 959 | 'ER': [], 960 | 'F': [], 961 | }, 962 | 'class_wise_average': { 963 | 'ER': [], 964 | 'F': [], 965 | } 966 | }, 967 | } 968 | for scene_id, scene_label in enumerate(dataset.scene_labels): 969 | print " {:18s} | {:5.2f} | | {:4.1f} % : {:5.2f} : {:5.2f} : {:5.2f} : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} |".format(scene_label, 970 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['ER'], 971 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['F'] * 100, 972 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['ER'], 973 | 
overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['S'], 974 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['D'], 975 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['I'], 976 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['F']*100, 977 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['ER'], 978 | overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['F']*100, 979 | overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['ER'], 980 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['F']*100, 981 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['ER'], 982 | ) 983 | averages['segment_based_metrics']['overall']['ER'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['ER']) 984 | averages['segment_based_metrics']['overall']['F'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['F']) 985 | averages['segment_based_metrics']['class_wise_average']['ER'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['ER']) 986 | averages['segment_based_metrics']['class_wise_average']['F'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['F']) 987 | averages['event_based_metrics']['overall']['ER'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['ER']) 988 | averages['event_based_metrics']['overall']['F'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['F']) 989 | averages['event_based_metrics']['class_wise_average']['ER'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['ER']) 990 | averages['event_based_metrics']['class_wise_average']['F'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['F']) 991 | 992 | print " -------------------+-------+ +--------+-------+-------+-------+-------+--------+-------+--------+-------+--------+-------+" 993 | print " {:18s} | {:5.2f} | | {:4.1f} % : {:5.2f} : {:21s} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} |".format('Average', 994 | numpy.mean(averages['segment_based_metrics']['overall']['ER']), 995 | numpy.mean(averages['segment_based_metrics']['overall']['F'])*100, 996 | numpy.mean(averages['segment_based_metrics']['overall']['ER']), 997 | ' ', 998 | numpy.mean(averages['segment_based_metrics']['class_wise_average']['F'])*100, 999 | numpy.mean(averages['segment_based_metrics']['class_wise_average']['ER']), 1000 | numpy.mean(averages['event_based_metrics']['overall']['F'])*100, 1001 | numpy.mean(averages['event_based_metrics']['overall']['ER']), 1002 | numpy.mean(averages['event_based_metrics']['class_wise_average']['F'])*100, 1003 | numpy.mean(averages['event_based_metrics']['class_wise_average']['ER']), 1004 | ) 1005 | 1006 | print " " 1007 | # Restore warnings to default settings 1008 | warnings.simplefilter("default") 1009 | print " Results per events " 1010 | 1011 | for scene_id, scene_label in enumerate(dataset.scene_labels): 1012 | print " " 1013 | print " "+scene_label.upper() 1014 | print " {:20s} | {:30s} | | {:15s} ".format('', 'Segment-based', 'Event-based') 1015 | print " {:20s} | {:5s} : {:5s} : {:6s} : {:5s} | | {:5s} : {:5s} : {:6s} : {:5s} |".format('Event', 'Nref', 'Nsys', 
'F1', 'ER', 'Nref', 'Nsys', 'F1', 'ER') 1016 | print " ---------------------+-------+-------+--------+-------+ +-------+-------+--------+-------+" 1017 | seg_Nref = 0 1018 | seg_Nsys = 0 1019 | 1020 | event_Nref = 0 1021 | event_Nsys = 0 1022 | for event_label in sorted(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise']): 1023 | print " {:20s} | {:5d} : {:5d} : {:4.1f} % : {:5.2f} | | {:5d} : {:5d} : {:4.1f} % : {:5.2f} |".format(event_label, 1024 | int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nref']), 1025 | int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nsys']), 1026 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['F']*100, 1027 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['ER'], 1028 | int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nref']), 1029 | int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nsys']), 1030 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['F']*100, 1031 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['ER']) 1032 | seg_Nref += int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nref']) 1033 | seg_Nsys += int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nsys']) 1034 | 1035 | event_Nref += int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nref']) 1036 | event_Nsys += int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nsys']) 1037 | print " ---------------------+-------+-------+--------+-------+ +-------+-------+--------+-------+" 1038 | print " {:20s} | {:5d} : {:5d} : {:14s} | | {:5d} : {:5d} : {:14s} |".format('Sum', 1039 | seg_Nref, 1040 | seg_Nsys, 1041 | '', 1042 | event_Nref, 1043 | event_Nsys, 1044 | '') 1045 | print " {:20s} | {:5s} {:5s} : {:4.1f} % : {:5.2f} | | {:5s} {:5s} : {:4.1f} % : {:5.2f} |".format('Average', 1046 | '', '', 1047 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['F']*100, 1048 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['ER'], 1049 | '', '', 1050 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['F']*100, 1051 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['ER']) 1052 | print " " 1053 | 1054 | if __name__ == "__main__": 1055 | try: 1056 | sys.exit(main(sys.argv)) 1057 | except (ValueError, IOError) as e: 1058 | sys.exit(e) -------------------------------------------------------------------------------- /task3_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # DCASE 2016::Sound Event Detection in Real-life Audio / CNN based system 5 | # Arseniy Gorin, AC Technologies LLC 6 | 7 | from src.ui import * 8 | from src.general import * 9 | from src.files import * 10 | 11 | from src.features_cnn import * 12 | from src.sound_event_detection_cnn import * 13 | from src.dataset_cnn import * 14 | from src.evaluation import * 15 | 16 | import numpy 17 | import csv 18 | import warnings 19 | import argparse 20 | import textwrap 21 | 
import math
22 | import os
23 | 
24 | import random
25 | 
26 | from keras.models import Sequential
27 | from keras.layers import Dense, Activation, Dropout, Flatten
28 | from keras.layers import Convolution2D, MaxPooling2D  # Keras 1.x API; the input_shape used below assumes Theano (channels first) image ordering, matching KERAS_BACKEND=theano in requirements.txt
29 | from keras.models import model_from_json
30 | from keras.callbacks import EarlyStopping, ModelCheckpoint
31 | from keras.optimizers import SGD, Adam
32 | from keras.regularizers import WeightRegularizer
33 | 
34 | from sklearn import mixture
35 | 
36 | __version_info__ = ('1', '0', '1')
37 | __version__ = '.'.join(__version_info__)
38 | 
39 | def main(argv):
40 |     numpy.random.seed(123456)  # let's make randomization predictable
41 | 
42 |     parser = argparse.ArgumentParser(
43 |         prefix_chars='-+',
44 |         formatter_class=argparse.RawDescriptionHelpFormatter,
45 |         description=textwrap.dedent('''\
46 |             DCASE 2016
47 |             Task 3: Sound Event Detection in Real-life Audio
48 |             CNN based system
49 |             ---------------------------------------------
50 |                 Tampere University of Technology / Audio Research Group
51 |                 Author:  Toni Heittola ( toni.heittola@tut.fi )
52 |                          Arseniy Gorin ( gorinars@yandex.ru )
53 | 
54 |             System description
55 |                 The CNN based system is built on top of the DCASE 2016 GMM baseline.
56 |                 A complete description is available in the technical report:
57 |                 http://www.cs.tut.fi/sgn/arg/dcase2016/documents/challenge_technical_reports/Task3/Gorin_2016_task3.pdf
58 | 
59 |         '''))
60 | 
61 |     parser.add_argument("-development", help="Use the system in the development mode", action='store_true',
62 |                         default=False, dest='development')
63 |     parser.add_argument("-challenge", help="Use the system in the challenge mode", action='store_true',
64 |                         default=False, dest='challenge')
65 | 
66 |     parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)
67 |     args = parser.parse_args()
68 | 
69 |     # Load parameters from config file
70 |     parameter_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
71 |                                   os.path.splitext(os.path.basename(__file__))[0]+'.yaml')
72 |     params = load_parameters(parameter_file)
73 |     params = process_parameters(params)
74 |     make_folders(params)
75 | 
76 |     title("DCASE 2016::Sound Event Detection in Real-life Audio / CNN based system")
77 | 
78 |     # Check if mode is defined
79 |     if not (args.development or args.challenge):
80 |         args.development = True
81 |         args.challenge = False
82 | 
83 |     dataset_evaluation_mode = 'folds'
84 |     if args.development and not args.challenge:
85 |         print "Running system in development mode"
86 |         dataset_evaluation_mode = 'folds'
87 |     elif not args.development and args.challenge:
88 |         print "Running system in challenge mode"
89 |         dataset_evaluation_mode = 'full'
90 | 
91 |     # Get dataset container class
92 |     dataset = eval(params['general']['development_dataset'])(data_path=params['path']['data'])
93 | 
94 |     # Fetch data over internet and setup the data
95 |     # ==================================================
96 |     if params['flow']['initialize']:
97 |         dataset.fetch()
98 | 
99 |     # Extract features for all audio files in the dataset
100 |     # ==================================================
101 |     if params['flow']['extract_features']:
102 |         section_header('Feature extraction [Development data]')
103 | 
104 |         # Collect files from evaluation sets
105 |         files = []
106 |         for fold in dataset.folds(mode=dataset_evaluation_mode):
107 |             for item_id, item in enumerate(dataset.train(fold)):
108 |                 if item['file'] not in files:
109 |                     files.append(item['file'])
110 |             for item_id, item in enumerate(dataset.test(fold)):
111 |                 if item['file'] not in 
files: 112 | files.append(item['file']) 113 | 114 | # Go through files and make sure all features are extracted 115 | do_feature_extraction(files=files, 116 | dataset=dataset, 117 | feature_path=params['path']['features'], 118 | params=params['features'], 119 | overwrite=params['general']['overwrite']) 120 | 121 | foot() 122 | 123 | # Prepare feature normalizers 124 | # ================================================== 125 | if params['flow']['feature_normalizer']: 126 | section_header('Feature normalizer [Development data]') 127 | 128 | do_feature_normalization(dataset=dataset, 129 | feature_normalizer_path=params['path']['feature_normalizers'], 130 | feature_path=params['path']['features'], 131 | dataset_evaluation_mode=dataset_evaluation_mode, 132 | overwrite=params['general']['overwrite']) 133 | 134 | foot() 135 | 136 | # System training 137 | # ================================================== 138 | if params['flow']['train_system']: 139 | section_header('System training [Development data]') 140 | 141 | do_system_training(dataset=dataset, 142 | model_path=params['path']['models'], 143 | feature_normalizer_path=params['path']['feature_normalizers'], 144 | feature_path=params['path']['features'], 145 | hop_length_seconds=params['features']['hop_length_seconds'], 146 | classifier_params=params['classifier']['parameters'], 147 | dataset_evaluation_mode=dataset_evaluation_mode, 148 | classifier_method=params['classifier']['method'], 149 | overwrite=params['general']['overwrite'] 150 | ) 151 | 152 | foot() 153 | 154 | # System evaluation in development mode 155 | if args.development and not args.challenge: 156 | 157 | # System testing 158 | # ================================================== 159 | if params['flow']['test_system']: 160 | section_header('System testing [Development data]') 161 | 162 | do_system_testing(dataset=dataset, 163 | result_path=params['path']['results'], 164 | feature_path=params['path']['features'], 165 | model_path=params['path']['models'], 166 | feature_params=params['features'], 167 | detector_params=params['detector'], 168 | dataset_evaluation_mode=dataset_evaluation_mode, 169 | classifier_method=params['classifier']['method'], 170 | overwrite=params['general']['overwrite'] 171 | ) 172 | foot() 173 | 174 | # System evaluation 175 | # ================================================== 176 | if params['flow']['evaluate_system']: 177 | section_header('System evaluation [Development data]') 178 | 179 | do_system_evaluation(dataset=dataset, 180 | dataset_evaluation_mode=dataset_evaluation_mode, 181 | result_path=params['path']['results']) 182 | 183 | foot() 184 | 185 | # System evaluation with challenge data 186 | elif not args.development and args.challenge: 187 | # Fetch data over internet and setup the data 188 | challenge_dataset = eval(params['general']['challenge_dataset'])(data_path=params['path']['data']) 189 | 190 | if params['flow']['initialize']: 191 | challenge_dataset.fetch() 192 | 193 | # System testing 194 | if params['flow']['test_system']: 195 | section_header('System testing [Challenge data]') 196 | 197 | do_system_testing(dataset=challenge_dataset, 198 | result_path=params['path']['challenge_results'], 199 | feature_path=params['path']['features'], 200 | model_path=params['path']['models'], 201 | feature_params=params['features'], 202 | detector_params=params['detector'], 203 | dataset_evaluation_mode=dataset_evaluation_mode, 204 | classifier_method=params['classifier']['method'], 205 | overwrite=True 206 | ) 207 | foot() 208 | 209 | print " " 
210 | print "Your results for the challenge data are stored at ["+params['path']['challenge_results']+"]" 211 | print " " 212 | 213 | def myGenerator(X_train,y_train,batch,sub_len=20000): 214 | """ generate subset of train data randomly """ 215 | while 1: 216 | idx = random.sample(range(X_train.shape[0]), sub_len) 217 | for i in range(0,len(idx),batch): 218 | yield X_train[idx[i:i+batch]], y_train[idx[i:i+batch]] 219 | 220 | 221 | def process_parameters(params): 222 | """Parameter post-processing. 223 | 224 | Parameters 225 | ---------- 226 | params : dict 227 | parameters in dict 228 | 229 | Returns 230 | ------- 231 | params : dict 232 | processed parameters 233 | 234 | """ 235 | 236 | params['features']['mfcc']['win_length'] = int(params['features']['win_length_seconds'] * params['features']['fs']) 237 | params['features']['mfcc']['hop_length'] = int(params['features']['hop_length_seconds'] * params['features']['fs']) 238 | 239 | # Copy parameters for current classifier method 240 | params['classifier']['parameters'] = params['classifier_parameters'][params['classifier']['method']] 241 | 242 | # Hash 243 | params['features']['hash'] = get_parameter_hash(params['features']) 244 | params['classifier']['hash'] = get_parameter_hash(params['classifier']) 245 | params['detector']['hash'] = get_parameter_hash(params['detector']) 246 | 247 | # Paths 248 | params['path']['data'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), params['path']['data']) 249 | params['path']['base'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), params['path']['base']) 250 | 251 | # Features 252 | params['path']['features_'] = params['path']['features'] 253 | params['path']['features'] = os.path.join(params['path']['base'], 254 | params['path']['features'], 255 | params['features']['hash']) 256 | 257 | # Feature normalizers 258 | params['path']['feature_normalizers_'] = params['path']['feature_normalizers'] 259 | params['path']['feature_normalizers'] = os.path.join(params['path']['base'], 260 | params['path']['feature_normalizers'], 261 | params['features']['hash']) 262 | 263 | # Models 264 | # Save parameters into folders to help manual browsing of files. 265 | params['path']['models_'] = params['path']['models'] 266 | params['path']['models'] = os.path.join(params['path']['base'], 267 | params['path']['models'], 268 | params['features']['hash'], 269 | params['classifier']['hash']) 270 | 271 | # Results 272 | params['path']['results_'] = params['path']['results'] 273 | params['path']['results'] = os.path.join(params['path']['base'], 274 | params['path']['results'], 275 | params['features']['hash'], 276 | params['classifier']['hash'], 277 | params['detector']['hash']) 278 | return params 279 | 280 | 281 | def make_folders(params, parameter_filename='parameters.yaml'): 282 | """Create all needed folders, and saves parameters in yaml-file for easier manual browsing of data. 283 | 284 | Parameters 285 | ---------- 286 | params : dict 287 | parameters in dict 288 | 289 | parameter_filename : str 290 | filename to save parameters used to generate the folder name 291 | 292 | Returns 293 | ------- 294 | nothing 295 | 296 | """ 297 | 298 | # Check that target path exists, create if not 299 | check_path(params['path']['features']) 300 | check_path(params['path']['feature_normalizers']) 301 | check_path(params['path']['models']) 302 | check_path(params['path']['results']) 303 | 304 | # Save parameters into folders to help manual browsing of files. 
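For orientation, the cache layout that process_parameters() above assembles and make_folders() creates: every pipeline stage writes under an md5 hash of the parameters that affect it, so changing, say, only the detector settings re-uses cached features and models. The leading directory names come from the yaml path section, which is not part of this dump:

    <base>/features/<features_hash>/
    <base>/feature_normalizers/<features_hash>/
    <base>/models/<features_hash>/<classifier_hash>/
    <base>/results/<features_hash>/<classifier_hash>/<detector_hash>/

Each hashed directory also receives a parameters.yaml copy of the dict that produced its hash, to help manual browsing.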
305 | 306 | # Features 307 | feature_parameter_filename = os.path.join(params['path']['features'], parameter_filename) 308 | if not os.path.isfile(feature_parameter_filename): 309 | save_parameters(feature_parameter_filename, params['features']) 310 | 311 | # Feature normalizers 312 | feature_normalizer_parameter_filename = os.path.join(params['path']['feature_normalizers'], parameter_filename) 313 | if not os.path.isfile(feature_normalizer_parameter_filename): 314 | save_parameters(feature_normalizer_parameter_filename, params['features']) 315 | 316 | # Models 317 | model_features_parameter_filename = os.path.join(params['path']['base'], 318 | params['path']['models_'], 319 | params['features']['hash'], 320 | parameter_filename) 321 | if not os.path.isfile(model_features_parameter_filename): 322 | save_parameters(model_features_parameter_filename, params['features']) 323 | 324 | model_models_parameter_filename = os.path.join(params['path']['base'], 325 | params['path']['models_'], 326 | params['features']['hash'], 327 | params['classifier']['hash'], 328 | parameter_filename) 329 | if not os.path.isfile(model_models_parameter_filename): 330 | save_parameters(model_models_parameter_filename, params['classifier']) 331 | 332 | # Results 333 | # Save parameters into folders to help manual browsing of files. 334 | result_features_parameter_filename = os.path.join(params['path']['base'], 335 | params['path']['results_'], 336 | params['features']['hash'], 337 | parameter_filename) 338 | if not os.path.isfile(result_features_parameter_filename): 339 | save_parameters(result_features_parameter_filename, params['features']) 340 | 341 | result_models_parameter_filename = os.path.join(params['path']['base'], 342 | params['path']['results_'], 343 | params['features']['hash'], 344 | params['classifier']['hash'], 345 | parameter_filename) 346 | if not os.path.isfile(result_models_parameter_filename): 347 | save_parameters(result_models_parameter_filename, params['classifier']) 348 | 349 | result_detector_parameter_filename = os.path.join(params['path']['base'], 350 | params['path']['results_'], 351 | params['features']['hash'], 352 | params['classifier']['hash'], 353 | params['detector']['hash'], 354 | parameter_filename) 355 | if not os.path.isfile(result_detector_parameter_filename): 356 | save_parameters(result_detector_parameter_filename, params['detector']) 357 | 358 | 359 | def get_feature_filename(audio_file, path, extension='cpickle'): 360 | """Get feature filename 361 | 362 | Parameters 363 | ---------- 364 | audio_file : str 365 | audio file name from which the features are extracted 366 | 367 | path : str 368 | feature path 369 | 370 | extension : str 371 | file extension 372 | (Default value='cpickle') 373 | 374 | Returns 375 | ------- 376 | feature_filename : str 377 | full feature filename 378 | 379 | """ 380 | 381 | return os.path.join(path, 'sequence_' + os.path.splitext(audio_file)[0] + '.' 
+ extension)
382 | 
383 | 
384 | def get_feature_normalizer_filename(fold, scene_label, path, extension='cpickle'):
385 |     """Get normalizer filename
386 | 
387 |     Parameters
388 |     ----------
389 |     fold : int >= 0
390 |         evaluation fold number
391 | 
392 |     scene_label : str
393 |         scene label
394 | 
395 |     path : str
396 |         normalizer path
397 | 
398 |     extension : str
399 |         file extension
400 |         (Default value='cpickle')
401 | 
402 |     Returns
403 |     -------
404 |     normalizer_filename : str
405 |         full normalizer filename
406 | 
407 |     """
408 | 
409 |     return os.path.join(path, 'scale_fold' + str(fold) + '_' + str(scene_label) + '.' + extension)
410 | 
411 | 
412 | def get_model_filename(fold, scene_label, path, extension='cpickle'):
413 |     """Get model filename
414 | 
415 |     Parameters
416 |     ----------
417 |     fold : int >= 0
418 |         evaluation fold number
419 | 
420 |     scene_label : str
421 |         scene label
422 | 
423 |     path : str
424 |         model path
425 | 
426 |     extension : str
427 |         file extension
428 |         (Default value='cpickle')
429 | 
430 |     Returns
431 |     -------
432 |     model_filename : str
433 |         full model filename
434 | 
435 |     """
436 | 
437 |     return os.path.join(path, 'model_fold' + str(fold) + '_' + str(scene_label) + '.' + extension)
438 | 
439 | 
440 | def get_result_filename(fold, scene_label, path, extension='txt'):
441 |     """Get result filename
442 | 
443 |     Parameters
444 |     ----------
445 |     fold : int >= 0
446 |         evaluation fold number
447 | 
448 |     scene_label : str
449 |         scene label
450 | 
451 |     path : str
452 |         result path
453 | 
454 |     extension : str
455 |         file extension
456 |         (Default value='txt')
457 | 
458 |     Returns
459 |     -------
460 |     result_filename : str
461 |         full result filename
462 | 
463 |     """
464 | 
465 |     if fold == 0:
466 |         return os.path.join(path, 'results_' + str(scene_label) + '.' + extension)
467 |     else:
468 |         return os.path.join(path, 'results_fold' + str(fold) + '_' + str(scene_label) + '.' + extension)
469 | 
470 | 
471 | def do_feature_extraction(files, dataset, feature_path, params, overwrite=False):
472 |     """Feature extraction
473 | 
474 |     Parameters
475 |     ----------
476 |     files : list
477 |         file list
478 | 
479 |     dataset : class
480 |         dataset class
481 | 
482 |     feature_path : str
483 |         path where the features are saved
484 | 
485 |     params : dict
486 |         parameter dict
487 | 
488 |     overwrite : bool
489 |         overwrite existing feature files
490 |         (Default value=False)
491 | 
492 |     Returns
493 |     -------
494 |     nothing
495 | 
496 |     Raises
497 |     -------
498 |     IOError
499 |         Audio file not found.
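Hypothetical calls illustrating the naming scheme the four helpers above implement (paths shortened):

#   get_feature_filename('a001.wav', 'features')        -> 'features/sequence_a001.cpickle'
#   get_feature_normalizer_filename(1, 'home', 'norm')  -> 'norm/scale_fold1_home.cpickle'
#   get_model_filename(1, 'home', 'models')             -> 'models/model_fold1_home.cpickle'
#   get_result_filename(1, 'home', 'results')           -> 'results/results_fold1_home.txt'
#   get_result_filename(0, 'home', 'results')           -> 'results/results_home.txt'
#       (fold 0 is presumably the no-fold case used in the 'full'/challenge mode)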
500 | 
501 |     """
502 | 
503 |     for file_id, audio_filename in enumerate(files):
504 |         # Get feature filename
505 |         current_feature_file = get_feature_filename(audio_file=os.path.split(audio_filename)[1], path=feature_path)
506 | 
507 |         progress(title_text='Extracting [sequences]',
508 |                  percentage=(float(file_id) / len(files)),
509 |                  note=os.path.split(audio_filename)[1])
510 | 
511 |         if not os.path.isfile(current_feature_file) or overwrite:
512 |             # Load audio
513 |             if os.path.isfile(dataset.relative_to_absolute_path(audio_filename)):
514 |                 y, fs = load_audio(filename=dataset.relative_to_absolute_path(audio_filename), mono=True, fs=params['fs'])
515 |             else:
516 |                 raise IOError("Audio file not found [%s]" % audio_filename)
517 | 
518 |             # Extract features
519 |             feature_data = feature_extraction(y=y,
520 |                                               fs=fs,
521 |                                               include_mfcc0=params['include_mfcc0'],
522 |                                               include_delta=params['include_delta'],
523 |                                               include_acceleration=params['include_acceleration'],
524 |                                               mfcc_params=params['mfcc'],
525 |                                               delta_params=params['mfcc_delta'],
526 |                                               acceleration_params=params['mfcc_acceleration'])
527 | 
528 |             if params['cmvn']:
529 |                 feature_data['feat'] = ( feature_data['feat'] - feature_data['stat']['mean'] ) / feature_data['stat']['std']
530 | 
531 |             # Save
532 |             save_data(current_feature_file, feature_data)
533 | 
534 | 
535 | def do_feature_normalization(dataset, feature_normalizer_path, feature_path, dataset_evaluation_mode='folds', overwrite=False):
536 |     """Feature normalization
537 | 
538 |     Calculates normalization factors for each evaluation fold based on the training material available.
539 | 
540 |     Parameters
541 |     ----------
542 |     dataset : class
543 |         dataset class
544 | 
545 |     feature_normalizer_path : str
546 |         path where the feature normalizers are saved.
547 | 
548 |     feature_path : str
549 |         path where the features are saved.
550 | 
551 |     dataset_evaluation_mode : str ['folds', 'full']
552 |         evaluation mode, 'full' all material available is considered to belong to one fold.
553 |         (Default value='folds')
554 | 
555 |     overwrite : bool
556 |         overwrite existing normalizers
557 |         (Default value=False)
558 | 
559 |     Returns
560 |     -------
561 |     nothing
562 | 
563 |     Raises
564 |     -------
565 |     IOError
566 |         Feature file not found.
567 | 568 | """ 569 | 570 | for fold in dataset.folds(mode=dataset_evaluation_mode): 571 | for scene_id, scene_label in enumerate(dataset.scene_labels): 572 | current_normalizer_file = get_feature_normalizer_filename(fold=fold, scene_label=scene_label, path=feature_normalizer_path) 573 | 574 | if not os.path.isfile(current_normalizer_file) or overwrite: 575 | # Collect sequence files from scene class 576 | files = [] 577 | for item_id, item in enumerate(dataset.train(fold, scene_label=scene_label)): 578 | if item['file'] not in files: 579 | files.append(item['file']) 580 | 581 | file_count = len(files) 582 | 583 | # Initialize statistics 584 | normalizer = FeatureNormalizer() 585 | 586 | for file_id, audio_filename in enumerate(files): 587 | progress(title_text='Collecting data', 588 | fold=fold, 589 | percentage=(float(file_id) / file_count), 590 | note=os.path.split(audio_filename)[1]) 591 | 592 | # Load features 593 | feature_filename = get_feature_filename(audio_file=os.path.split(audio_filename)[1], path=feature_path) 594 | if os.path.isfile(feature_filename): 595 | feature_data = load_data(feature_filename)['stat'] 596 | else: 597 | raise IOError("Feature file not found [%s]" % audio_filename) 598 | 599 | # Accumulate statistics 600 | normalizer.accumulate(feature_data) 601 | 602 | # Calculate normalization factors 603 | normalizer.finalize() 604 | 605 | # Save 606 | save_data(current_normalizer_file, normalizer) 607 | 608 | 609 | def do_system_training(dataset, model_path, feature_normalizer_path, feature_path, hop_length_seconds, classifier_params, 610 | dataset_evaluation_mode='folds', classifier_method='cnn', overwrite=False): 611 | """System training 612 | Trains GMM baseline or CNN model 613 | 614 | Parameters 615 | ---------- 616 | dataset : class 617 | dataset class 618 | 619 | model_path : str 620 | path where the models are saved. 621 | 622 | feature_normalizer_path : str 623 | path where the feature normalizers are saved. 624 | 625 | feature_path : str 626 | path where the features are saved. 627 | 628 | hop_length_seconds : float > 0 629 | feature frame hop length in seconds 630 | 631 | classifier_params : dict 632 | parameter dict 633 | 634 | dataset_evaluation_mode : str ['folds', 'full'] 635 | evaluation mode, 'full' all material available is considered to belong to one fold. 636 | (Default value='folds') 637 | 638 | classifier_method : str ['gmm','cnn'] 639 | classifier method, GMM and CNN are supported 640 | (Default value='cnn') 641 | 642 | overwrite : bool 643 | overwrite existing models 644 | (Default value=False) 645 | 646 | Returns 647 | ------- 648 | nothing 649 | 650 | Raises 651 | ------- 652 | ValueError 653 | classifier_method is unknown. 654 | 655 | IOError 656 | Feature normalizer not found. 657 | Feature file not found. 
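Before the training body below builds its label space, a worked example of the event_dic it constructs: keys are '<event_label><scene_label>' strings mapped to CNN output columns, with columns 0 and 1 reserved for the per-scene silence/background classes (the two event labels here are purely illustrative):

event_dic = {'[silence]home': 0, '[silence]residential_area': 1}
for event_label, scene_label in [('water tap running', 'home'),
                                 ('bird singing', 'residential_area')]:
    if event_label + scene_label not in event_dic:
        event_dic[event_label + scene_label] = len(event_dic)
# -> {'[silence]home': 0, '[silence]residential_area': 1,
#     'water tap runninghome': 2, 'bird singingresidential_area': 3}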
658 | 
659 |     """
660 | 
661 |     if classifier_method != 'gmm' and classifier_method != 'cnn':
662 |         raise ValueError("Unknown classifier method ["+classifier_method+"]")
663 | 
664 |     for fold in [4, 3, 2, 1]:  # folds hard-coded in reverse order for the 4-fold development setup (instead of dataset.folds())
665 |         X = []
666 |         y = []
667 |         # indices of the original data for fine-tuning
668 |         finetune_idx = []
669 | 
670 |         X_val = []
671 |         y_val = []
672 |         n_home = 0  # n classes in home scene (currently unused)
673 | 
674 | 
675 |         event_dic = {'[silence]home': 0, '[silence]residential_area': 1}
676 |         current_model_file = get_model_filename(fold=fold, scene_label='mixed', path=model_path)
677 |         dic_home = {}
678 |         dic_residential = {}
679 | 
680 |         for scene_id, scene_label in enumerate(dataset.scene_labels):
681 |             if not os.path.isfile(current_model_file) or overwrite:
682 |                 # Load normalizer
683 |                 feature_normalizer_filename = get_feature_normalizer_filename(fold=fold, scene_label=scene_label, path=feature_normalizer_path)
684 |                 if os.path.isfile(feature_normalizer_filename):
685 |                     normalizer = load_data(feature_normalizer_filename)
686 |                 else:
687 |                     raise IOError("Feature normalizer not found [%s]" % feature_normalizer_filename)
688 |                 # Initialize model container
689 |                 model_container = {'normalizer': normalizer, 'models': {}}
690 | 
691 |                 # Restructure training data into structure[files][events]
692 |                 ann = {}
693 |                 for item_id, item in enumerate(dataset.train(fold=fold, scene_label=scene_label)):
694 |                     filename = os.path.split(item['file'])[1]
695 |                     if filename not in ann:
696 |                         ann[filename] = {}
697 |                     if item['event_label'] not in ann[filename]:
698 |                         ann[filename][item['event_label']] = []
699 |                     ann[filename][item['event_label']].append((item['event_onset'], item['event_offset']))
700 |                     if item['event_label']+scene_label not in event_dic:
701 |                         event_dic[item['event_label']+scene_label] = len(event_dic.keys())
702 |                     if scene_label == 'home':
703 |                         dic_home[item['event_label']] = 1
704 |                     if scene_label == 'residential_area':
705 |                         dic_residential[item['event_label']] = 1
706 | 
707 |                 # Collect training examples
708 |                 file_count = len(ann)
709 |                 for item_id, audio_filename in enumerate(ann):
710 |                     progress(title_text='Collecting data',
711 |                              fold=fold,
712 |                              percentage=(float(item_id) / file_count),
713 |                              note=scene_label+" / "+os.path.split(audio_filename)[1])
714 | 
715 |                     # Load features
716 |                     feature_filename = get_feature_filename(audio_file=audio_filename, path=feature_path)
717 |                     if os.path.isfile(feature_filename):
718 |                         feature_data = load_data(feature_filename)['feat']
719 |                     else:
720 |                         raise IOError("Feature file not found [%s]" % feature_filename)
721 | 
722 |                     if classifier_method == 'cnn':
723 |                         file_frame_labels = numpy.zeros([feature_data.shape[0], 100])  # 100 columns = generous upper bound on the class count; trimmed to len(event_dic) before training
724 |                         for event_label in ann[audio_filename]:
725 |                             for event in ann[audio_filename][event_label]:
726 |                                 start_frame = int(math.floor(event[0] / hop_length_seconds))
727 |                                 stop_frame = int(math.ceil(event[1] / hop_length_seconds))
728 |                                 if stop_frame > feature_data.shape[0]:
729 |                                     stop_frame = feature_data.shape[0]
730 |                                 file_frame_labels[start_frame:stop_frame, event_dic[event_label+scene_label]] = 1
731 | 
732 | 
733 |                         # fill background 0 label (garbage)
734 |                         sil_frames = numpy.where(~file_frame_labels.any(axis=1))[0]
735 |                         file_frame_labels[sil_frames, 0] = 1
736 | 
737 |                         wd = 2 * classifier_params['splice'] + 1
738 |                         for i in range(0, feature_data.shape[0] - wd, classifier_params['step']):
739 |                             # add scene label as a feature:
740 |                             if scene_label == 'home':
741 |                                 X_seq = numpy.concatenate( (feature_data[i: i + wd,:], numpy.full((wd, 1), -1.0)) , axis=1)
742 | else: 743 | X_seq = numpy.concatenate( (feature_data[i: i + wd,:], numpy.full((wd, 1), 1.0)) , axis=1) 744 | y_seq = file_frame_labels[i + classifier_params['splice'] + 1,:] 745 | X.append(X_seq) 746 | y.append(y_seq) 747 | if not '_S' in audio_filename and not audio_filename.startswith('m'): 748 | finetune_idx.append(len(X) - 1) 749 | 750 | 751 | 752 | ######### reading dev data for early stopping ############# 753 | ann = {} 754 | for item_id, item in enumerate(dataset.dev(fold=fold, scene_label=scene_label)): 755 | filename = os.path.split(item['file'])[1] 756 | if filename not in ann: 757 | ann[filename] = {} 758 | if item['event_label'] not in ann[filename]: 759 | ann[filename][item['event_label']] = [] 760 | ann[filename][item['event_label']].append((item['event_onset'], item['event_offset'])) 761 | 762 | file_count = len(ann) 763 | for item_id, audio_filename in enumerate(ann): 764 | progress(title_text='Collecting data', 765 | fold=fold, 766 | percentage=(float(item_id) / file_count), 767 | note=scene_label+" / "+os.path.split(audio_filename)[1]) 768 | 769 | # Load features 770 | feature_filename = get_feature_filename(audio_file=audio_filename, path=feature_path) 771 | if os.path.isfile(feature_filename): 772 | feature_data = load_data(feature_filename)['feat'] 773 | else: 774 | raise IOError("Feature file not found [%s]" % feature_filename) 775 | 776 | if classifier_method == 'cnn': 777 | file_frame_labels = numpy.zeros([feature_data.shape[0],100]) 778 | for event_label in ann[audio_filename]: 779 | for event in ann[audio_filename][event_label]: 780 | start_frame = int(math.floor(event[0] / hop_length_seconds)) 781 | stop_frame = int(math.ceil(event[1] / hop_length_seconds)) 782 | if stop_frame > feature_data.shape[0]: 783 | stop_frame = feature_data.shape[0] 784 | file_frame_labels[start_frame:stop_frame, event_dic[event_label+scene_label]] = 1 785 | 786 | # fill background 0 label (garbage) 787 | sil_frames = numpy.where(~file_frame_labels.any(axis=1))[0] 788 | file_frame_labels[sil_frames,0] = 1 789 | 790 | wd = 2 * classifier_params['splice'] + 1 791 | for i in range(0, feature_data.shape[0] - wd, classifier_params['step']): 792 | #X_seq = feature_data[i: i + wd,:] 793 | if scene_label == 'home': 794 | X_seq = numpy.concatenate( (feature_data[i: i + wd,:], numpy.full((wd, 1), -1.0)) , axis=1) 795 | else: 796 | X_seq = numpy.concatenate( (feature_data[i: i + wd,:], numpy.full((wd, 1), 1.0)) , axis=1) 797 | 798 | y_seq = file_frame_labels[i + classifier_params['splice'] + 1,:] 799 | X_val.append(X_seq) 800 | y_val.append(y_seq) 801 | 802 | if classifier_method == 'cnn': 803 | print('Build model...') 804 | model_container['model_arch_file'] = current_model_file + '_arch.json' 805 | model_container['model_weights_file'] = current_model_file + '_weights.h5' 806 | model_container['event_dic'] = event_dic 807 | # 808 | output_dim = len(event_dic) 809 | # 810 | WR = WeightRegularizer(l2= 0.0001) 811 | # 812 | X_val = numpy.expand_dims(X_val, axis=1) 813 | model = Sequential() 814 | # 815 | model.add(Convolution2D(80, 6, 60, W_regularizer=WR, input_shape=(1, X[0].shape[0], X[0].shape[1]))) 816 | model.add(Activation('relu')) 817 | model.add(MaxPooling2D(pool_size=(3, 4), strides=(3,4))) 818 | model.add(Dropout(0.2)) 819 | # 820 | model.add(Convolution2D(80, 3, 1, W_regularizer=WR)) 821 | model.add(Activation('relu')) 822 | model.add(MaxPooling2D(pool_size=(1, 3), strides=(3,1))) 823 | # 824 | model.add(Flatten()) 825 | model.add(Dense(1024, W_regularizer=WR)) 826 | 
model.add(Activation('relu'))
827 |             model.add(Dropout(0.5))
828 |             #
829 |             model.add(Dense(1024, W_regularizer=WR))
830 |             model.add(Activation('relu'))
831 |             model.add(Dropout(0.5))
832 | 
833 |             model.add(Dense(output_dim))
834 |             model.add(Activation('sigmoid'))
835 | 
836 |             optimizer = Adam(lr=classifier_params['lr'], beta_1=0.9, beta_2=0.999, epsilon=1e-08)
837 |             model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
838 | 
839 |             earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')
840 |             checkpointer = ModelCheckpoint(monitor='val_loss', filepath=model_container['model_weights_file'], verbose=1, save_best_only=True)
841 | 
842 |             X_train = numpy.array(X)
843 |             X_train = numpy.expand_dims(X_train, axis=1)
844 |             y_train = numpy.array(y)
845 |             X_val = numpy.array(X_val)
846 |             y_val = numpy.array(y_val)
847 | 
848 |             # create the actual target matrix
849 |             y_train = y_train[:, 0:len(event_dic)]
850 |             y_val = y_val[:, 0:len(event_dic)]
851 | 
852 |             class_weights = None
853 |             # balancing class weights: weight_i = N / (n_classes * N_i), the same heuristic as sklearn's class_weight='balanced'
854 |             if classifier_params['class_weights']:
855 |                 class_weights = {}
856 |                 cwlist = float(y_train.shape[0]) / (y_train.shape[1] * numpy.sum(y_train, axis=0))
857 |                 for i in range(len(cwlist)):
858 |                     class_weights[i] = cwlist[i]
859 | 
860 |             model.fit(X_train, y_train,
861 |                       nb_epoch=classifier_params['epochs'], batch_size=classifier_params['batch_size'],
862 |                       callbacks=[earlyStopping, checkpointer], shuffle=True,
863 |                       validation_data=(X_val, y_val), verbose=1,
864 |                       class_weight=class_weights)
865 | 
866 |             print '== FINETUNING =='
867 |             json_string = model.to_json()
868 |             open(model_container['model_arch_file'], 'w').write(json_string)
869 |             model = model_from_json(open(model_container['model_arch_file']).read())
870 |             model.load_weights(model_container['model_weights_file'])
871 | 
872 |             earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')
873 |             checkpointer = ModelCheckpoint(monitor='val_loss', filepath=model_container['model_weights_file'], verbose=1, save_best_only=True)
874 |             optimizer = SGD(lr=classifier_params['lr'], decay=1e-6, momentum=0.9, nesterov=True)
875 |             model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
876 | 
877 | 
878 |             model.fit(X_train[finetune_idx], y_train[finetune_idx],
879 |                       nb_epoch=classifier_params['epochs'], batch_size=classifier_params['batch_size'],
880 |                       callbacks=[earlyStopping, checkpointer], shuffle=True,
881 |                       validation_data=(X_val, y_val), verbose=1,
882 |                       class_weight=class_weights)
883 | 
884 |             # save model
885 |             json_string = model.to_json()
886 |             open(model_container['model_arch_file'], 'w').write(json_string)
887 | 
888 |             # Save models
889 |             save_data(current_model_file, model_container)
890 | 
891 |             # save the same model for both the home and residential_area scenes
892 |             current_model_file = get_model_filename(fold=fold, scene_label='home', path=model_path)
893 |             model_container['test_dic'] = dic_home
894 |             save_data(current_model_file, model_container)
895 | 
896 |             current_model_file = get_model_filename(fold=fold, scene_label='residential_area', path=model_path)
897 |             model_container['test_dic'] = dic_residential
898 |             save_data(current_model_file, model_container)
899 | 
900 | 
901 | def do_system_testing(dataset, result_path, feature_path, model_path, feature_params, detector_params,
902 |                       dataset_evaluation_mode='folds', classifier_method='gmm', overwrite=False):
903 |     """System testing.
904 | 
905 |     If extracted features are not found from disk, they are extracted but not saved.
906 | 
907 |     Parameters
908 |     ----------
909 |     dataset : class
910 |         dataset class
911 | 
912 |     result_path : str
913 |         path where the results are saved.
914 | 
915 |     feature_path : str
916 |         path where the features are saved.
917 | 
918 |     model_path : str
919 |         path where the models are saved.
920 | 
921 |     feature_params : dict
922 |         parameter dict
923 | 
924 |     dataset_evaluation_mode : str ['folds', 'full']
925 |         evaluation mode, 'full' all material available is considered to belong to one fold.
926 |         (Default value='folds')
927 | 
928 |     classifier_method : str ['gmm', 'cnn']
929 |         classifier method, GMM and CNN are supported
930 |         (Default value='gmm')
931 | 
932 |     overwrite : bool
933 |         overwrite existing models
934 |         (Default value=False)
935 | 
936 |     Returns
937 |     -------
938 |     nothing
939 | 
940 |     Raises
941 |     -------
942 |     ValueError
943 |         classifier_method is unknown.
944 | 
945 |     IOError
946 |         Model file not found.
947 |         Audio file not found.
948 | 
949 |     """
950 | 
951 |     if classifier_method != 'gmm' and classifier_method != 'cnn':
952 |         raise ValueError("Unknown classifier method ["+classifier_method+"]")
953 | 
954 |     for fold in dataset.folds(mode=dataset_evaluation_mode):
955 |         for scene_id, scene_label in enumerate(dataset.scene_labels):
956 |             current_result_file = get_result_filename(fold=fold, scene_label=scene_label, path=result_path)
957 |             if not os.path.isfile(current_result_file) or overwrite:
958 |                 results = []
959 | 
960 |                 # Load class model container
961 |                 model_filename = get_model_filename(fold=fold, scene_label=scene_label, path=model_path)
962 |                 if os.path.isfile(model_filename):
963 |                     model_container = load_data(model_filename)
964 |                 else:
965 |                     raise IOError("Model file not found [%s]" % model_filename)
966 | 
967 |                 file_count = len(dataset.test(fold, scene_label=scene_label))
968 |                 for file_id, item in enumerate(dataset.test(fold=fold, scene_label=scene_label)):
969 |                     progress(title_text='Testing',
970 |                              fold=fold,
971 |                              percentage=(float(file_id) / file_count),
972 |                              note=scene_label+" / "+os.path.split(item['file'])[1])
973 | 
974 |                     # Load features
975 |                     feature_filename = get_feature_filename(audio_file=item['file'], path=feature_path)
976 | 
977 |                     if os.path.isfile(feature_filename):
978 |                         feature_data = load_data(feature_filename)['feat']
979 |                     else:
980 |                         # Load audio
981 |                         if os.path.isfile(dataset.relative_to_absolute_path(item['file'])):
982 |                             y, fs = load_audio(filename=dataset.relative_to_absolute_path(item['file']), mono=True, fs=feature_params['fs'])
983 |                         else:
984 |                             raise IOError("Audio file not found [%s]" % item['file'])
985 | 
986 |                         # Extract features
987 |                         feats_all = feature_extraction(y=y,
988 |                                                        fs=fs,
989 |                                                        include_mfcc0=feature_params['include_mfcc0'],
990 |                                                        include_delta=feature_params['include_delta'],
991 |                                                        include_acceleration=feature_params['include_acceleration'],
992 |                                                        mfcc_params=feature_params['mfcc'],
993 |                                                        delta_params=feature_params['mfcc_delta'],
994 |                                                        acceleration_params=feature_params['mfcc_acceleration'])
995 |                         feature_data = feats_all['feat']
996 | 
997 |                         if feature_params['cmvn']:
998 |                             feature_data = ( feature_data - feats_all['stat']['mean'] ) / feats_all['stat']['std']
999 | 
1000 |                     if classifier_method == 'cnn':
1001 |                         current_results = event_detection_cnn(feature_data=feature_data,
1002 |                                                               model_container=model_container,
1003 |                                                               hop_length_seconds=feature_params['hop_length_seconds'],
1004 |                                                               smoothing_window_length_seconds=detector_params['smoothing_window_length'],
1005 | decision_threshold=detector_params['decision_threshold'], 1006 | minimum_event_length=detector_params['minimum_event_length'], 1007 | minimum_event_gap=detector_params['minimum_event_gap'], 1008 | scene_label=scene_label, 1009 | splice=detector_params['splice']) 1010 | else: 1011 | current_results = event_detection(feature_data=feature_data, 1012 | model_container=model_container, 1013 | hop_length_seconds=feature_params['hop_length_seconds'], 1014 | smoothing_window_length_seconds=detector_params['smoothing_window_length'], 1015 | decision_threshold=detector_params['decision_threshold'], 1016 | minimum_event_length=detector_params['minimum_event_length'], 1017 | minimum_event_gap=detector_params['minimum_event_gap']) 1018 | # Store the result 1019 | for event in current_results: 1020 | results.append((dataset.absolute_to_relative(item['file']), event[0], event[1], event[2] )) 1021 | 1022 | # Save testing results 1023 | with open(current_result_file, 'wt') as f: 1024 | writer = csv.writer(f, delimiter='\t') 1025 | for result_item in results: 1026 | writer.writerow(result_item) 1027 | 1028 | 1029 | def do_system_evaluation(dataset, result_path, dataset_evaluation_mode='folds'): 1030 | """System evaluation. Testing outputs are collected and evaluated. Evaluation results are printed. 1031 | 1032 | Parameters 1033 | ---------- 1034 | dataset : class 1035 | dataset class 1036 | 1037 | result_path : str 1038 | path where the results are saved. 1039 | 1040 | dataset_evaluation_mode : str ['folds', 'full'] 1041 | evaluation mode, 'full' all material available is considered to belong to one fold. 1042 | (Default value='folds') 1043 | 1044 | Returns 1045 | ------- 1046 | nothing 1047 | 1048 | Raises 1049 | ------- 1050 | IOError 1051 | Result file not found 1052 | 1053 | """ 1054 | 1055 | # Set warnings off, sklearn metrics will trigger warning for classes without 1056 | # predicted samples in F1-scoring. This is just to keep printing clean. 
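As a reminder while reading the tables printed below: the DCASE 2016 error rate combines substitutions, deletions and insertions, normalized by the number of active reference events, and the ER/S, ER/D, ER/I columns are assumed to be its three components:

def error_rate_sketch(S, D, I, N):
    # S: substitutions, D: deletions, I: insertions,
    # N: number of active reference events in the evaluated segments
    return float(S + D + I) / N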
1057 | warnings.simplefilter("ignore") 1058 | 1059 | overall_metrics_per_scene = {} 1060 | 1061 | for scene_id, scene_label in enumerate(dataset.scene_labels): 1062 | if scene_label not in overall_metrics_per_scene: 1063 | overall_metrics_per_scene[scene_label] = {} 1064 | 1065 | dcase2016_segment_based_metric = DCASE2016_EventDetection_SegmentBasedMetrics(class_list=dataset.event_labels(scene_label=scene_label)) 1066 | dcase2016_event_based_metric = DCASE2016_EventDetection_EventBasedMetrics(class_list=dataset.event_labels(scene_label=scene_label)) 1067 | 1068 | for fold in dataset.folds(mode=dataset_evaluation_mode): 1069 | results = [] 1070 | result_filename = get_result_filename(fold=fold, scene_label=scene_label, path=result_path) 1071 | 1072 | if os.path.isfile(result_filename): 1073 | with open(result_filename, 'rt') as f: 1074 | for row in csv.reader(f, delimiter='\t'): 1075 | results.append(row) 1076 | else: 1077 | raise IOError("Result file not found [%s]" % result_filename) 1078 | 1079 | for file_id, item in enumerate(dataset.test(fold, scene_label=scene_label)): 1080 | current_file_results = [] 1081 | for result_line in results: 1082 | if len(result_line) != 0 and result_line[0] == dataset.absolute_to_relative(item['file']): 1083 | current_file_results.append( 1084 | {'file': result_line[0], 1085 | 'event_onset': float(result_line[1]), 1086 | 'event_offset': float(result_line[2]), 1087 | 'event_label': result_line[3].rstrip() 1088 | } 1089 | ) 1090 | meta = dataset.file_meta(dataset.absolute_to_relative(item['file'])) 1091 | 1092 | dcase2016_segment_based_metric.evaluate(system_output=current_file_results, annotated_ground_truth=meta) 1093 | dcase2016_event_based_metric.evaluate(system_output=current_file_results, annotated_ground_truth=meta) 1094 | 1095 | overall_metrics_per_scene[scene_label]['segment_based_metrics'] = dcase2016_segment_based_metric.results() 1096 | overall_metrics_per_scene[scene_label]['event_based_metrics'] = dcase2016_event_based_metric.results() 1097 | 1098 | print " Evaluation over %d folds" % dataset.fold_count 1099 | print " " 1100 | print " Results per scene " 1101 | print " {:18s} | {:5s} | | {:39s} ".format('', 'Main', 'Secondary metrics') 1102 | print " {:18s} | {:5s} | | {:38s} | {:14s} | {:14s} | {:14s} ".format('', '', 'Seg/Overall','Seg/Class', 'Event/Overall','Event/Class') 1103 | print " {:18s} | {:5s} | | {:6s} : {:5s} : {:5s} : {:5s} : {:5s} | {:6s} : {:5s} | {:6s} : {:5s} | {:6s} : {:5s} |".format('Scene', 'ER', 'F1', 'ER', 'ER/S', 'ER/D', 'ER/I', 'F1', 'ER', 'F1', 'ER', 'F1', 'ER') 1104 | print " -------------------+-------+ +--------+-------+-------+-------+-------+--------+-------+--------+-------+--------+-------+" 1105 | averages = { 1106 | 'segment_based_metrics': { 1107 | 'overall': { 1108 | 'ER': [], 1109 | 'F': [], 1110 | }, 1111 | 'class_wise_average': { 1112 | 'ER': [], 1113 | 'F': [], 1114 | } 1115 | }, 1116 | 'event_based_metrics': { 1117 | 'overall': { 1118 | 'ER': [], 1119 | 'F': [], 1120 | }, 1121 | 'class_wise_average': { 1122 | 'ER': [], 1123 | 'F': [], 1124 | } 1125 | }, 1126 | } 1127 | for scene_id, scene_label in enumerate(dataset.scene_labels): 1128 | print " {:18s} | {:5.2f} | | {:4.1f} % : {:5.2f} : {:5.2f} : {:5.2f} : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} |".format(scene_label, 1129 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['ER'], 1130 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['F'] * 100, 1131 | 

    print " Evaluation over %d folds" % dataset.fold_count
    print " "
    print " Results per scene "
    print " {:18s} | {:5s} | | {:39s} ".format('', 'Main', 'Secondary metrics')
    print " {:18s} | {:5s} | | {:38s} | {:14s} | {:14s} | {:14s} ".format('', '', 'Seg/Overall', 'Seg/Class', 'Event/Overall', 'Event/Class')
    print " {:18s} | {:5s} | | {:6s} : {:5s} : {:5s} : {:5s} : {:5s} | {:6s} : {:5s} | {:6s} : {:5s} | {:6s} : {:5s} |".format('Scene', 'ER', 'F1', 'ER', 'ER/S', 'ER/D', 'ER/I', 'F1', 'ER', 'F1', 'ER', 'F1', 'ER')
    print " -------------------+-------+ +--------+-------+-------+-------+-------+--------+-------+--------+-------+--------+-------+"

    averages = {
        'segment_based_metrics': {
            'overall': {
                'ER': [],
                'F': [],
            },
            'class_wise_average': {
                'ER': [],
                'F': [],
            }
        },
        'event_based_metrics': {
            'overall': {
                'ER': [],
                'F': [],
            },
            'class_wise_average': {
                'ER': [],
                'F': [],
            }
        },
    }

    for scene_id, scene_label in enumerate(dataset.scene_labels):
        print " {:18s} | {:5.2f} | | {:4.1f} % : {:5.2f} : {:5.2f} : {:5.2f} : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} |".format(
            scene_label,
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['ER'],
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['F'] * 100,
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['ER'],
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['S'],
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['D'],
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['I'],
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['F'] * 100,
            overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['ER'],
            overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['F'] * 100,
            overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['ER'],
            overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['F'] * 100,
            overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['ER'],
        )
        averages['segment_based_metrics']['overall']['ER'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['ER'])
        averages['segment_based_metrics']['overall']['F'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['overall']['F'])
        averages['segment_based_metrics']['class_wise_average']['ER'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['ER'])
        averages['segment_based_metrics']['class_wise_average']['F'].append(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['F'])
        averages['event_based_metrics']['overall']['ER'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['ER'])
        averages['event_based_metrics']['overall']['F'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['overall']['F'])
        averages['event_based_metrics']['class_wise_average']['ER'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['ER'])
        averages['event_based_metrics']['class_wise_average']['F'].append(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['F'])

    print " -------------------+-------+ +--------+-------+-------+-------+-------+--------+-------+--------+-------+--------+-------+"
    print " {:18s} | {:5.2f} | | {:4.1f} % : {:5.2f} : {:21s} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} | {:4.1f} % : {:5.2f} |".format(
        'Average',
        numpy.mean(averages['segment_based_metrics']['overall']['ER']),
        numpy.mean(averages['segment_based_metrics']['overall']['F']) * 100,
        numpy.mean(averages['segment_based_metrics']['overall']['ER']),
        ' ',
        numpy.mean(averages['segment_based_metrics']['class_wise_average']['F']) * 100,
        numpy.mean(averages['segment_based_metrics']['class_wise_average']['ER']),
        numpy.mean(averages['event_based_metrics']['overall']['F']) * 100,
        numpy.mean(averages['event_based_metrics']['overall']['ER']),
        numpy.mean(averages['event_based_metrics']['class_wise_average']['F']) * 100,
        numpy.mean(averages['event_based_metrics']['class_wise_average']['ER']),
    )

    print " "
    # Restore warnings to default settings
    warnings.simplefilter("default")
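    # The tables below break the same metrics down per event class for each
    # scene. Nref and Nsys count reference and system-output items for the
    # class (segments for the segment-based metrics, events for the
    # event-based metrics); the 'Average' row repeats the class-wise averages
    # shown in the per-scene table above.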
print " {:20s} | {:5s} : {:5s} : {:6s} : {:5s} | | {:5s} : {:5s} : {:6s} : {:5s} |".format('Event', 'Nref', 'Nsys', 'F1', 'ER', 'Nref', 'Nsys', 'F1', 'ER') 1175 | print " ---------------------+-------+-------+--------+-------+ +-------+-------+--------+-------+" 1176 | seg_Nref = 0 1177 | seg_Nsys = 0 1178 | 1179 | event_Nref = 0 1180 | event_Nsys = 0 1181 | for event_label in sorted(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise']): 1182 | print " {:20s} | {:5d} : {:5d} : {:4.1f} % : {:5.2f} | | {:5d} : {:5d} : {:4.1f} % : {:5.2f} |".format(event_label, 1183 | int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nref']), 1184 | int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nsys']), 1185 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['F']*100, 1186 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['ER'], 1187 | int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nref']), 1188 | int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nsys']), 1189 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['F']*100, 1190 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['ER']) 1191 | seg_Nref += int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nref']) 1192 | seg_Nsys += int(overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise'][event_label]['Nsys']) 1193 | 1194 | event_Nref += int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nref']) 1195 | event_Nsys += int(overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise'][event_label]['Nsys']) 1196 | print " ---------------------+-------+-------+--------+-------+ +-------+-------+--------+-------+" 1197 | print " {:20s} | {:5d} : {:5d} : {:14s} | | {:5d} : {:5d} : {:14s} |".format('Sum', 1198 | seg_Nref, 1199 | seg_Nsys, 1200 | '', 1201 | event_Nref, 1202 | event_Nsys, 1203 | '') 1204 | print " {:20s} | {:5s} {:5s} : {:4.1f} % : {:5.2f} | | {:5s} {:5s} : {:4.1f} % : {:5.2f} |".format('Average', 1205 | '', '', 1206 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['F']*100, 1207 | overall_metrics_per_scene[scene_label]['segment_based_metrics']['class_wise_average']['ER'], 1208 | '', '', 1209 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['F']*100, 1210 | overall_metrics_per_scene[scene_label]['event_based_metrics']['class_wise_average']['ER']) 1211 | print " " 1212 | 1213 | if __name__ == "__main__": 1214 | try: 1215 | sys.exit(main(sys.argv)) 1216 | except (ValueError, IOError) as e: 1217 | sys.exit(e) 1218 | --------------------------------------------------------------------------------